Example #1
    def test_case_3(self):
        data = get_tagged_texts_as_pd(self.folders,
                                      '../../../data/datasets/gmb-2.2.0')

        data = filtrations(data, with_dots=True)

        # expected row count after filtrations with with_dots=True
        self.assertEqual(780339, len(data))
Example #2
    def test_case_5(self):
        data = get_tagged_texts_as_pd(self.folders,
                                      '../../../data/datasets/gmb-2.2.0')

        data = filtrations(data, with_dots=True)

        # the BIO conversion must yield exactly one tag per token
        bio_ner_tags = iob3bio(data.ner_tag.values.tolist())

        self.assertEqual(len(bio_ner_tags), len(data))
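For context: iob3bio itself is not shown in these excerpts. Below is a minimal sketch of what such a conversion typically does, assuming the standard IOB1-to-BIO promotion where every entity must open with a B- tag; the name iob_to_bio and the tag shapes are illustrative assumptions, not the project's actual implementation.

from typing import List


def iob_to_bio(tags: List[str]) -> List[str]:
    """Hypothetical stand-in for iob3bio: promote chunk-initial 'I-' tags
    to 'B-' so every entity starts with 'B-' (IOB1 -> BIO/IOB2)."""
    bio = []
    for i, tag in enumerate(tags):
        starts_chunk = tag.startswith('I-') and (
            i == 0 or tags[i - 1] == 'O' or tags[i - 1][2:] != tag[2:])
        bio.append('B-' + tag[2:] if starts_chunk else tag)
    return bio


# One tag in, one tag out, which is exactly what test_case_5 asserts:
print(iob_to_bio(['I-geo', 'I-geo', 'O', 'I-per']))
# ['B-geo', 'I-geo', 'O', 'B-per']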
Example #3
    @classmethod
    def setUpClass(cls):
        folders = filter_by_subcorpus('../../../data/datasets/gmb-2.2.0',
                                      'subcorpus: Voice of America')

        data = get_tagged_texts_as_pd(folders,
                                      '../../../data/datasets/gmb-2.2.0')

        data = filtrations(data, with_dots=True)

        data.ner_tag = iob3bio(data.ner_tag.values)

        data = additional_features(df=data)

        # use every generated column as a feature, except the target and
        # the semantic annotation columns
        cls.features = data.columns.values.tolist()

        for column in ('ner_tag', 'word_net_sense_number', 'verb_net_roles',
                       'semantic_relation', 'animacy_tag', 'super_tag',
                       'lambda_dsr'):
            cls.features.remove(column)

        X, y = SentenceExtractor(features=cls.features,
                                 target='ner_tag').fit_transform(data)

        cls.X_train, cls.X_test, cls.y_train, cls.y_test = train_test_split(
            X, y, test_size=0.33, random_state=42)

        # drop empty sentences so sequence models never see zero-length inputs
        cls.X_train = [
            sentence for sentence in cls.X_train if len(sentence) > 0
        ]

        cls.y_train = [
            sentence.tolist() for sentence in cls.y_train if len(sentence) > 0
        ]

        cls.X_test = [sentence for sentence in cls.X_test if len(sentence) > 0]

        cls.y_test = [
            sentence.tolist() for sentence in cls.y_test if len(sentence) > 0
        ]
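Note how this fixture derives its feature list top-down: it starts from every column that additional_features produces and removes the target (ner_tag) plus the semantic annotation columns. The empty-sentence filtering after the split is presumably needed because filtrations can leave sentences with no remaining tokens.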
Example #4
    @classmethod
    def setUpClass(cls):
        folders = filter_by_subcorpus('../../../data/datasets/gmb-2.2.0', 'subcorpus: Voice of America')

        data = get_tagged_texts_as_pd(folders, '../../../data/datasets/gmb-2.2.0')

        data = filtrations(data, with_dots=True)

        data.ner_tag = iob3bio(data.ner_tag.values)

        cls.X, cls.y = SentenceExtractor(
            features=[
                'token',
                'pos_tag',
                'lemma'
            ],
            target='ner_tag'
        ).fit_transform(data)
Example #5
    @classmethod
    def setUpClass(cls):
        folders = filter_by_subcorpus('../../../data/datasets/gmb-2.2.0', 'subcorpus: Voice of America')

        data = get_tagged_texts_as_pd(folders, '../../../data/datasets/gmb-2.2.0')

        data = filtrations(data, with_dots=True)

        data.ner_tag = iob3bio(data.ner_tag.values)

        data = additional_features(df=data)

        # token-level features, plus the same attributes computed for the
        # previous and the next token
        cls.features = [
            'token',
            'lemma',
            'pos_tag',
            'is_title',
            'contains_digits',
            'word_len',
            'suffix',
            'prefix',
            'prev_pos_tag',
            'prev_is_title',
            'prev_contains_digits',
            'prev_word_len',
            'prev_suffix',
            'prev_prefix',
            'next_pos_tag',
            'next_is_title',
            'next_contains_digits',
            'next_word_len',
            'next_suffix',
            'next_prefix'
        ]

        X, y = SentenceExtractor(features=cls.features, target='ner_tag').fit_transform(data)

        cls.X_train, cls.X_test, cls.y_train, cls.y_test = train_test_split(X, y, test_size=0.33, random_state=42)

        # drop empty sentences from both splits
        cls.X_train = [sentence for sentence in cls.X_train if len(sentence) > 0]

        cls.y_train = [sentence.tolist() for sentence in cls.y_train if len(sentence) > 0]

        cls.X_test = [sentence for sentence in cls.X_test if len(sentence) > 0]

        cls.y_test = [sentence.tolist() for sentence in cls.y_test if len(sentence) > 0]
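Example #5 builds the same kind of fixture as Example #3, but enumerates its features explicitly instead of deriving them from the generated columns; each base attribute (pos_tag, is_title, contains_digits, word_len, suffix, prefix) also appears in prev_ and next_ variants for the neighboring tokens.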
Example #6
    def test_case_1(self):
        folders = filter_by_subcorpus('../../../data/datasets/gmb-2.2.0',
                                      'subcorpus: Voice of America')

        data = get_tagged_texts_as_pd(folders,
                                      '../../../data/datasets/gmb-2.2.0')
        data = filtrations(data, with_dots=True)

        X, y = SentenceExtractor(features=['token', 'pos_tag', 'lemma'],
                                 target='ner_tag').fit_transform(data)

        lemma_sentence_lengths = list(map(len, X))

        tag_sentence_lengths = list(map(len, y))

        # every feature sequence must stay aligned with its tag sequence
        self.assertTrue(
            all(len_lemmas == len_tags for len_lemmas, len_tags in zip(
                lemma_sentence_lengths, tag_sentence_lengths)))
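This is the alignment property the other fixtures rely on: SentenceExtractor returns parallel per-sentence sequences, so for every sentence the feature sequence in X and the tag sequence in y have the same length.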
Example #7
    @classmethod
    def setUpClass(cls):
        folders = filter_by_subcorpus('../../../data/datasets/gmb-2.2.0',
                                      'subcorpus: Voice of America')

        data = get_tagged_texts_as_pd(folders,
                                      '../../../data/datasets/gmb-2.2.0')

        data = filtrations(data, with_dots=True)

        data.ner_tag = iob3bio(data.ner_tag.values)

        cls.features = ['token', 'pos_tag', 'lemma']

        X, y = SentenceExtractor(features=cls.features,
                                 target='ner_tag').fit_transform(data)

        cls.X_train, cls.X_test, cls.y_train, cls.y_test = train_test_split(
            X, y, test_size=0.33, random_state=42)
Example #8
    def test_case_2(self):
        data = get_tagged_texts_as_pd(self.folders,
                                      '../../../data/datasets/gmb-2.2.0')

        # full row count before filtrations; test_case_3 expects 780339 after
        self.assertEqual(1231279, len(data))