def test_case_3(self):
    data = get_tagged_texts_as_pd(self.folders, '../../../data/datasets/gmb-2.2.0')
    data = filtrations(data, with_dots=True)
    self.assertEqual(len(data), 780339)
def test_case_5(self):
    data = get_tagged_texts_as_pd(self.folders, '../../../data/datasets/gmb-2.2.0')
    data = filtrations(data, with_dots=True)
    bio_ner_tags = iob3bio(data.ner_tag.values.tolist())
    self.assertEqual(len(bio_ner_tags), len(data))
@classmethod
def setUpClass(cls):
    folders = filter_by_subcorpus('../../../data/datasets/gmb-2.2.0',
                                  'subcorpus: Voice of America')
    data = get_tagged_texts_as_pd(folders, '../../../data/datasets/gmb-2.2.0')
    data = filtrations(data, with_dots=True)
    data.ner_tag = iob3bio(data.ner_tag.values)
    data = additional_features(df=data)
    # Use every column as a feature except the target and the GMB semantic annotations.
    cls.features = data.columns.values.tolist()
    cls.features.remove('ner_tag')
    cls.features.remove('word_net_sense_number')
    cls.features.remove('verb_net_roles')
    cls.features.remove('semantic_relation')
    cls.features.remove('animacy_tag')
    cls.features.remove('super_tag')
    cls.features.remove('lambda_dsr')
    X, y = SentenceExtractor(features=cls.features,
                             target='ner_tag').fit_transform(data)
    cls.X_train, cls.X_test, cls.y_train, cls.y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)
    # Drop empty sentences from both splits.
    cls.X_train = [sentence for sentence in cls.X_train if len(sentence) > 0]
    cls.y_train = [sentence.tolist() for sentence in cls.y_train if len(sentence) > 0]
    cls.X_test = [sentence for sentence in cls.X_test if len(sentence) > 0]
    cls.y_test = [sentence.tolist() for sentence in cls.y_test if len(sentence) > 0]
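# Hypothetical companion check (not part of the original suite): it only verifies
# that the GMB semantic-annotation columns dropped in setUpClass above really stay
# out of the feature list; the column names are taken from those removals.
def test_semantic_columns_excluded(self):
    for column in ('ner_tag', 'word_net_sense_number', 'verb_net_roles',
                   'semantic_relation', 'animacy_tag', 'super_tag', 'lambda_dsr'):
        self.assertNotIn(column, self.features)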
@classmethod
def setUpClass(cls):
    folders = filter_by_subcorpus('../../../data/datasets/gmb-2.2.0',
                                  'subcorpus: Voice of America')
    data = get_tagged_texts_as_pd(folders, '../../../data/datasets/gmb-2.2.0')
    data = filtrations(data, with_dots=True)
    data.ner_tag = iob3bio(data.ner_tag.values)
    cls.X, cls.y = SentenceExtractor(
        features=['token', 'pos_tag', 'lemma'],
        target='ner_tag'
    ).fit_transform(data)
@classmethod
def setUpClass(cls):
    folders = filter_by_subcorpus('../../../data/datasets/gmb-2.2.0',
                                  'subcorpus: Voice of America')
    data = get_tagged_texts_as_pd(folders, '../../../data/datasets/gmb-2.2.0')
    data = filtrations(data, with_dots=True)
    data.ner_tag = iob3bio(data.ner_tag.values)
    data = additional_features(df=data)
    # Surface-level features for the current, previous and next token.
    cls.features = [
        'token', 'lemma', 'pos_tag', 'is_title', 'contains_digits',
        'word_len', 'suffix', 'prefix',
        'prev_pos_tag', 'prev_is_title', 'prev_contains_digits',
        'prev_word_len', 'prev_suffix', 'prev_prefix',
        'next_pos_tag', 'next_is_title', 'next_contains_digits',
        'next_word_len', 'next_suffix', 'next_prefix'
    ]
    X, y = SentenceExtractor(features=cls.features,
                             target='ner_tag').fit_transform(data)
    cls.X_train, cls.X_test, cls.y_train, cls.y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)
    # Drop empty sentences from both splits.
    cls.X_train = [sentence for sentence in cls.X_train if len(sentence) > 0]
    cls.y_train = [sentence.tolist() for sentence in cls.y_train if len(sentence) > 0]
    cls.X_test = [sentence for sentence in cls.X_test if len(sentence) > 0]
    cls.y_test = [sentence.tolist() for sentence in cls.y_test if len(sentence) > 0]
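# Hypothetical sanity test (not part of the original suite), assuming the fixtures
# built above and that SentenceExtractor yields equal-length feature and tag
# sequences per sentence (which test_case_1 below checks): after dropping empty
# sentences, the train/test splits should stay sentence-aligned.
def test_splits_stay_aligned(self):
    self.assertEqual(len(self.X_train), len(self.y_train))
    self.assertEqual(len(self.X_test), len(self.y_test))
    for features, tags in zip(self.X_train, self.y_train):
        self.assertEqual(len(features), len(tags))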
def test_case_1(self):
    folders = filter_by_subcorpus('../../../data/datasets/gmb-2.2.0',
                                  'subcorpus: Voice of America')
    data = get_tagged_texts_as_pd(folders, '../../../data/datasets/gmb-2.2.0')
    data = filtrations(data, with_dots=True)
    X, y = SentenceExtractor(features=['token', 'pos_tag', 'lemma'],
                             target='ner_tag').fit_transform(data)
    # Every extracted sentence must have as many feature rows as tags.
    feature_sentence_lengths = list(map(len, X))
    tag_sentence_lengths = list(map(len, y))
    self.assertTrue(
        all(len_features == len_tags
            for len_features, len_tags in zip(feature_sentence_lengths,
                                              tag_sentence_lengths)))
@classmethod
def setUpClass(cls):
    folders = filter_by_subcorpus('../../../data/datasets/gmb-2.2.0',
                                  'subcorpus: Voice of America')
    data = get_tagged_texts_as_pd(folders, '../../../data/datasets/gmb-2.2.0')
    data = filtrations(data, with_dots=True)
    data.ner_tag = iob3bio(data.ner_tag.values)
    cls.features = ['token', 'pos_tag', 'lemma']
    X, y = SentenceExtractor(features=cls.features,
                             target='ner_tag').fit_transform(data)
    cls.X_train, cls.X_test, cls.y_train, cls.y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)
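# Hypothetical smoke test (not part of the original suite), assuming iob3bio emits
# standard BIO labels as strings: every tag in the training split should be 'O'
# or carry a 'B-'/'I-' prefix.
def test_bio_scheme(self):
    for sentence_tags in self.y_train:
        for tag in sentence_tags:
            self.assertTrue(tag == 'O' or tag.startswith(('B-', 'I-')))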
def test_case_2(self):
    data = get_tagged_texts_as_pd(self.folders, '../../../data/datasets/gmb-2.2.0')
    self.assertEqual(len(data), 1231279)