def __init__(self, splitter=None, tokenizer=None, feature_generators=None):
    if not splitter:
        splitter = NLTKSplitter()
    if not tokenizer:
        tokenizer = TmVarTokenizer()
    if feature_generators is None:
        feature_generators = [SimpleFeatureGenerator(), PorterStemFeatureGenerator(),
                              WindowFeatureGenerator((-3, -2, -1, 1, 2, 3), ['stem[0]'])]

    if isinstance(splitter, Splitter):
        self.splitter = splitter
    else:
        raise TypeError('not an instance that implements Splitter')

    if isinstance(tokenizer, Tokenizer):
        self.tokenizer = tokenizer
    else:
        raise TypeError('not an instance that implements Tokenizer')

    if hasattr(feature_generators, '__iter__'):
        for index, feature_generator in enumerate(feature_generators):
            if not isinstance(feature_generator, FeatureGenerator):
                raise TypeError('not an instance that implements FeatureGenerator at index {}'.format(index))
        self.feature_generators = feature_generators
    elif isinstance(feature_generators, FeatureGenerator):
        self.feature_generators = [feature_generators]
    else:
        raise TypeError('not an instance or iterable of instances that implements FeatureGenerator')
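# Usage sketch (assuming this constructor belongs to PrepareDatasetPipeline, the
# class referenced in get_prepare_pipeline_for_best_model_general below): all three
# arguments are optional and type-checked, so a wrong argument fails fast, and a
# single FeatureGenerator instance is wrapped into a list automatically.
pipeline = PrepareDatasetPipeline()  # NLTK splitting + TmVar tokenization + default features
pipeline = PrepareDatasetPipeline(tokenizer=TmVarTokenizer(),
                                  feature_generators=SimpleFeatureGenerator())
# PrepareDatasetPipeline(tokenizer="not a tokenizer")  # would raise TypeError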
def setUp(self):
    self.dataset = StringReader('some text ... (c.2708_2711delTTAG, p.V903GfsX905) ... text').read()
    NLTKSplitter().split(self.dataset)
    TmVarTokenizer().tokenize(self.dataset)
    part = list(self.dataset.parts())[0]
    part.annotations.append(Entity(STUB_ENTITY_CLASS_ID, 15, 'c.2708_2711delTTAG'))
    part.annotations.append(Entity(STUB_ENTITY_CLASS_ID, 35, 'p.V903GfsX905'))
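# The hard-coded entity offsets above can be verified directly: in the source
# string, 'c.2708_2711delTTAG' starts at index 15 and is 18 characters long, so
# the following ', ' puts 'p.V903GfsX905' at index 35.
text = 'some text ... (c.2708_2711delTTAG, p.V903GfsX905) ... text'
assert text.index('c.2708_2711delTTAG') == 15
assert text.index('p.V903GfsX905') == 35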
def test_generate_patterns_245(self):
    dataset = StringReader('token c.A436C token').read()
    NLTKSplitter().split(dataset)
    TmVarTokenizer().tokenize(dataset)
    TmVarDictionaryFeatureGenerator().generate(dataset)

    # Compare with != rather than `is not`: identity checks against string
    # literals are unreliable (they depend on interning) and raise a
    # SyntaxWarning on recent Python versions.
    token_features = [
        {key: value for key, value in token.features.items() if value != 'O'}
        for token in dataset.tokens()]

    self.assertEqual(token_features[0], {})
    self.assertEqual(token_features[1], {'pattern4[0]': 'B', 'pattern2[0]': 'B'})
    self.assertEqual(token_features[2], {'pattern4[0]': 'I', 'pattern2[0]': 'I'})
    self.assertEqual(token_features[3], {'pattern4[0]': 'I', 'pattern2[0]': 'I', 'pattern5[0]': 'B'})
    self.assertEqual(token_features[4], {'pattern4[0]': 'I', 'pattern2[0]': 'I', 'pattern5[0]': 'I'})
    self.assertEqual(token_features[5], {'pattern4[0]': 'E', 'pattern2[0]': 'I', 'pattern5[0]': 'E'})
    self.assertEqual(token_features[6], {})
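# Minimal illustration of the pitfall fixed above: `is` tests object identity,
# not equality. CPython happens to cache one-character strings such as 'O',
# which is why the original filter appeared to work, but the guarantee does not
# extend to strings built at runtime.
a = 'pattern4'
b = 'pattern' + str(4)   # equal contents, but a freshly built object
print(a == b)            # True: value equality, what the feature filter needs
print(a is b)            # False on CPython: identity is not equality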
def test_tag(self):
    # TODO question: is this the proper way, with predicts_classes?
    GNormPlusGeneTagger().tag(self.data, uniprot=True)
    NLTKSplitter().split(self.data)
    TmVarTokenizer().tokenize(self.data)
    StubSameSentenceRelationExtractor(PRO_CLASS_ID, MUT_CLASS_ID, PRO_REL_MUT_CLASS_ID).annotate(self.data)

    self.assertEqual(len([x for x in self.data.annotations() if x.class_id == PRO_CLASS_ID]), 0)
    self.assertEqual(len([x for x in self.data.annotations() if x.class_id == MUT_CLASS_ID]), 2)
    self.assertEqual(len([x for x in self.data.relations() if x.class_id == PRO_REL_MUT_CLASS_ID]), 0)

    self.data.purge_false_relationships()
    self.assertEqual(len([x for x in self.data.relations() if x.class_id == PRO_REL_MUT_CLASS_ID]), 0)

    # Removing one mutation annotation should leave the relation count at zero
    # after purging, since there were no relations to begin with.
    del self.data.documents['15878741'].parts['abstract'].annotations[0]
    self.assertEqual(len([x for x in self.data.annotations() if x.class_id == MUT_CLASS_ID]), 1)
    self.data.purge_false_relationships()
    self.assertEqual(len([x for x in self.data.relations() if x.class_id == PRO_REL_MUT_CLASS_ID]), 0)
def setup_class(cls):
    # create a sample dataset to test: 15 tokens in 2 sentences
    cls.dataset = Dataset()
    doc_id1 = Document()
    doc_id1.parts['p1'] = Part('this is some sample text. it contains this c.2708_2711delTTAG mutation.')
    doc_id1.parts['p1'].sentences_ = ['this is some sample text.',
                                      'it contains this c.2708_2711delTTAG mutation.']
    cls.dataset.documents['doc_id1'] = doc_id1
    cls.tokenizer = TmVarTokenizer()
    cls.tokenizer.tokenize(cls.dataset)
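# Inspection sketch for sanity-checking the token count claimed in the comment
# above (assuming, as elsewhere in nalaf, that part.sentences holds the
# tokenized sentences after tokenize() runs):
for part in cls.dataset.parts():
    for sentence in part.sentences:
        print(len(sentence), sentence)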
def get_prepare_pipeline_for_best_model_general(
        use_windows=True, we_params=None, dictionaries_paths=None,
        hdfs_url=None, hdfs_user=None, dictionaries_stop_words=None):
    """
    Helper method that returns an instance of PrepareDatasetPipeline which uses
    the best configuration for predicting any-domain mentions.
    If `we_params` is an empty dict, no word-embedding features are applied.

    :returns: nalaf.structures.dataset_pipelines.PrepareDatasetPipeline
    """
    # MAYBE ml-performance: use a more general-domain tokenizer such as NLTK's
    tokenizer = TmVarTokenizer()

    default_we_params = {'additive': None, 'multiplicative': None, 'location': None}
    we_params = default_we_params if we_params is None else we_params

    generators = [
        SpacyLemmatizer(),
        SpacyPosTagger(),
        SentenceMarkerFeatureGenerator(),
        TmVarFeatureGenerator(get_mutation_features=False)
    ]

    windows_include = []

    if dictionaries_paths:
        # Accept a comma-separated string as well as a list of paths
        if type(dictionaries_paths) is str:
            dictionaries_paths = [x.strip() for x in dictionaries_paths.split(",")]

        dics_feat_generators = DictionaryFeatureGenerator.construct_all_from_paths(
            dictionaries_paths=dictionaries_paths,
            string_tokenizer=tokenizer.tokenize_string,
            case_sensitive=False,
            hdfs_url=hdfs_url,
            hdfs_user=hdfs_user,
            stop_words=dictionaries_stop_words)

        generators.extend(dics_feat_generators)

        for dic in dics_feat_generators:
            windows_include.append(dic.key + "[0]")

    if use_windows:
        windows_include.extend(['stem[0]', 'pos[0]'])
        f = WindowFeatureGenerator(template=(-2, -1, 1, 2), include_list=windows_include)
        generators.append(f)

    if we_params:
        generators.append(get_word_embeddings_feature_generator(
            we_params['location'], we_params['additive'], we_params['multiplicative']))

    return PrepareDatasetPipeline(tokenizer=tokenizer, feature_generators=generators)
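# Usage sketch: pass an empty dict to disable word embeddings (it is falsy, so
# the `if we_params:` branch is skipped), and feed dictionary paths as a
# comma-separated string. The paths below are hypothetical placeholders.
pipeline = get_prepare_pipeline_for_best_model_general(
    use_windows=True,
    we_params={},
    dictionaries_paths="/data/dics/genes.tsv, /data/dics/diseases.tsv")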
def setUpClass(cls):
    # create a sample dataset to test
    cls.dataset = Dataset()
    doc_id1 = Document()
    doc_id1.parts['t1'] = Part('This title blows your mind')

    text = str(
        'This magic only exists in your dreams. To become reality, you have to work at it. '
        'Thr is only available with the residue threonine and a mutation, '
        'though things can change positions '
        'when adding some more replacements. Between me being sorry '
        'and you being an insertion.')
    doc_id1.parts['p1'] = Part(text.replace('\n', ''))

    cls.dataset.documents['doc_id1'] = doc_id1
    NLTKSplitter().split(cls.dataset)
    TmVarTokenizer().tokenize(cls.dataset)

    cls.feature = NLMentionFeatureGenerator(thr=4)
    cls.feature.generate(dataset=cls.dataset)
from nala.utils.corpora import get_corpus
from nalaf.preprocessing.spliters import NLTKSplitter
from nalaf.preprocessing.tokenizers import TmVarTokenizer
from nalaf.features.embeddings import BinarizedWordEmbeddingsFeatureGenerator

data = get_corpus('nala_training_1')
NLTKSplitter().split(data)
TmVarTokenizer().tokenize(data)

BinarizedWordEmbeddingsFeatureGenerator(
    '/home/abojchevski/projects/nala/nala/data/word_embeddings_2016-03-28/word_embeddings.model'
).generate(data)

for token in data.tokens():
    print(token.features, token.end)
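# For a whole corpus the loop above prints one line per token; a bounded peek
# (a sketch using itertools.islice, which works on any iterable) is usually
# more practical when eyeballing the generated features:
from itertools import islice

for token in islice(data.tokens(), 20):
    print(token.features, token.end)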
def __init__(self):
    self.data = get_corpus('IDP4+')
    NLTKSplitter().split(self.data)
    TmVarTokenizer().tokenize(self.data)
def __init__(self, entity1_class, entity2_class, rel_type, sentence_distance=0,
             selected_features_file=None, feature_generators=None, pipeline=None,
             use_predicted_entities=False, execute_pipeline=True, model=None,
             **model_params):

    super().__init__(entity1_class, entity2_class, rel_type)

    self.sentence_distance = sentence_distance
    edge_generator = SentenceDistanceEdgeGenerator(
        entity1_class, entity2_class, rel_type,
        distance=self.sentence_distance,
        use_gold=not use_predicted_entities,
        use_pred=use_predicted_entities,
    )

    if selected_features_file:
        self.feature_set = FeatureDictionary(is_locked=False)
        selected_features = unpickle_beautified_file(selected_features_file)
        # sort to make the order of feature insertion deterministic
        for selected in sorted(selected_features):
            self.feature_set[selected] = len(self.feature_set)
        self.feature_set.is_locked = True
    else:
        self.feature_set = None

    if pipeline:
        feature_generators = pipeline.feature_generators
    elif feature_generators is not None:
        # Trick: if [], this will use the pipeline's default generators
        feature_generators = feature_generators
    else:
        feature_generators = self.feature_generators()

    self.pipeline = pipeline if pipeline \
        else RelationExtractionPipeline(
            entity1_class, entity2_class, rel_type,
            tokenizer=TmVarTokenizer(),
            edge_generator=edge_generator,
            feature_set=self.feature_set,
            feature_generators=feature_generators)

    assert feature_generators == self.pipeline.feature_generators or feature_generators == [], \
        str((feature_generators, self.pipeline.feature_generators))

    self.execute_pipeline = execute_pipeline

    # With the following two settings we try to force the model to always give
    # the same results between runs and avoid slight variations due to
    # different random-generator initializations
    if not model_params.get("tol"):
        # As of 2017-Feb-7, the default in SVC is 1e-3:
        # http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
        model_params["tol"] = 1e-5
    if not model_params.get("random_state"):
        # TODO set with this
        model_params["random_state"] = 2727

    self.model = model if model else SklSVM(**model_params)
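# Sketch of the determinism defaults applied above: model_params only gains
# "tol" / "random_state" when the caller did not set them. Note that because
# the guard is truthiness-based, falsy values like 0 would also be overwritten.
# (The "C" key below is illustrative, not from the source.)
params = {"C": 1.0}
if not params.get("tol"):
    params["tol"] = 1e-5          # tighter than sklearn SVC's 1e-3 default
if not params.get("random_state"):
    params["random_state"] = 2727
print(params)  # {'C': 1.0, 'tol': 1e-05, 'random_state': 2727}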