Example #1
    def test_distance_2(self):
        edge_generator = SentenceDistanceEdgeGenerator(
            STUB_ENTITY_CLASS_ID_1,
            STUB_ENTITY_CLASS_ID_2,
            STUB_RELATION_CLASS_ID_2,
            distance=2)
        edge_generator.generate(self.dataset)
        num_edges = len(list(self.dataset.edges()))

        self.assertEqual(
            num_edges, 1,
            "\n" + "\n".join(str(e) for e in self.dataset.edges()))
Example #2
    def __init__(self,
                 class1,
                 class2,
                 rel_type,
                 parser=None,
                 splitter=None,
                 tokenizer=None,
                 edge_generator=None,
                 feature_set=None,
                 feature_generators=None):
        self.class1 = class1
        self.class2 = class2
        self.rel_type = rel_type

        nlp = None  # remains None when a parser is passed in; checked below for the tokenizer default
        if not parser:
            nlp = get_spacy_nlp_english(load_parser=True)
            parser = SpacyParser(nlp)

        self.parser = parser

        if not splitter:
            # if nlp:  # Spacy parser is used, which includes a sentence splitter
            #     splitter = GenericSplitter(lambda string: (sent.text for sent in nlp(string).sents))
            # else:
            #     splitter = NLTK_SPLITTER
            splitter = NLTK_SPLITTER

        self.splitter = splitter

        if not tokenizer:
            if nlp:  # Spacy parser is used, which includes a tokenizer
                tokenizer = GenericTokenizer(
                    lambda string: (tok.text for tok in nlp.tokenizer(string)))
            else:
                tokenizer = NLTK_TOKENIZER

        self.tokenizer = tokenizer

        self.edge_generator = SentenceDistanceEdgeGenerator(
            self.class1, self.class2, self.rel_type,
            distance=0) if edge_generator is None else edge_generator

        self.feature_set = FeatureDictionary() if feature_set is None else feature_set

        self.feature_generators = (
            self._verify_feature_generators(feature_generators)
            if feature_generators
            else [SentenceFeatureGenerator(f_counts_individual=1)])
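This is the constructor of RelationExtractionPipeline (shown with its full class context in Example #7). It wires the defaults: a spaCy-based parser, the NLTK sentence splitter, spaCy's tokenizer when the spaCy pipeline was created (otherwise NLTK's), a same-sentence (distance=0) SentenceDistanceEdgeGenerator as the default edge generator, an empty FeatureDictionary, and a single SentenceFeatureGenerator as the only default feature generator.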
Example #3
def test_baseline_D1(corpus_percentage):
    corpus = read_corpus("LocText", corpus_percentage)

    if corpus_percentage == 1.0:
        EXPECTED_F = 0.6421
    else:
        EXPECTED_F = None

    edge_generator = SentenceDistanceEdgeGenerator(PRO_ID,
                                                   LOC_ID,
                                                   REL_PRO_LOC_ID,
                                                   distance=1)
    annotator_gen_fun = (
        lambda _: StubRelationExtractor(edge_generator).annotate)

    evaluations = Evaluations.cross_validate(annotator_gen_fun,
                                             corpus,
                                             EVALUATOR,
                                             k_num_folds=5,
                                             use_validation_set=True)
    rel_evaluation = evaluations(REL_PRO_LOC_ID).compute(strictness="exact")

    print(rel_evaluation)
    print(evaluations)
    if EXPECTED_F is not None:
        assert math.isclose(rel_evaluation.f_measure,
                            EXPECTED_F,
                            abs_tol=0.001 * 1.1), rel_evaluation.f_measure

    return evaluations
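This baseline predicts a relation for every candidate edge at sentence distance 1 (Example #8's counts confirm that StubRelationExtractor turns each generated edge into a predicted relation). Five-fold cross-validation with a validation set is expected to reach an exact-match F-measure of about 0.6421 on the full LocText corpus; for a partial corpus there is no expectation, hence the None guard around the assertion.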
Example #4
def test_baseline_D0_D1(corpus_percentage):
    corpus = read_corpus("LocText", corpus_percentage)

    if corpus_percentage == 1.0:
        EXPECTED_F = 0.7060
    else:
        EXPECTED_F = None

    edge_generator = CombinatorEdgeGenerator(
        SentenceDistanceEdgeGenerator(PRO_ID,
                                      LOC_ID,
                                      REL_PRO_LOC_ID,
                                      distance=0,
                                      rewrite_edges=False),
        SentenceDistanceEdgeGenerator(PRO_ID,
                                      LOC_ID,
                                      REL_PRO_LOC_ID,
                                      distance=1,
                                      rewrite_edges=False),  # Recall: 88.52
        # SentenceDistanceEdgeGenerator(PRO_ID, LOC_ID, REL_PRO_LOC_ID, distance=2, rewrite_edges=False),
        # SentenceDistanceEdgeGenerator(PRO_ID, LOC_ID, REL_PRO_LOC_ID, distance=3, rewrite_edges=False),  #
        # SentenceDistanceEdgeGenerator(PRO_ID, LOC_ID, REL_PRO_LOC_ID, distance=4, rewrite_edges=False),
        # SentenceDistanceEdgeGenerator(PRO_ID, LOC_ID, REL_PRO_LOC_ID, distance=5, rewrite_edges=False),  #
        # SentenceDistanceEdgeGenerator(PRO_ID, LOC_ID, REL_PRO_LOC_ID, distance=6, rewrite_edges=False),  # Recall: 99.70
    )

    annotator_gen_fun = (
        lambda _: StubRelationExtractor(edge_generator).annotate)

    evaluations = Evaluations.cross_validate(annotator_gen_fun,
                                             corpus,
                                             EVALUATOR,
                                             k_num_folds=5,
                                             use_validation_set=True)
    rel_evaluation = evaluations(REL_PRO_LOC_ID).compute(strictness="exact")

    print(rel_evaluation)
    print(evaluations)
    if EXPECTED_F is not None:
        assert math.isclose(rel_evaluation.f_measure,
                            EXPECTED_F,
                            abs_tol=0.001 * 1.1), rel_evaluation.f_measure

    return rel_evaluation
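Here two generators are chained: rewrite_edges=False lets the distance-0 and distance-1 edges accumulate instead of replacing one another, so the combined D0+D1 baseline trades precision for recall (the inline comment records a recall of 88.52 at this cutoff, and 99.70 when the commented-out generators through distance 6 are enabled).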
Example #5
    def __init__(self,
                 entity1_class,
                 entity2_class,
                 relation_type,
                 use_gold=True,
                 use_pred=True):
        edge_generator = SentenceDistanceEdgeGenerator(entity1_class,
                                                       entity2_class,
                                                       relation_type,
                                                       distance=None,
                                                       use_gold=use_gold,
                                                       use_pred=use_pred)
        super().__init__(edge_generator)
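With distance=None the generator presumably places no restriction on sentence distance, pairing entities of the two classes at any separation; this wrapper only fixes which annotations to draw from (use_gold/use_pred) and delegates everything else to its superclass.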
Example #6
    def test_Stub_D0_plus_D1_RelationExtractor(self):

        dataset = TestTaggers.get_test_dataset()

        edge_generator_1 = SentenceDistanceEdgeGenerator(STUB_E_ID_1,
                                                         STUB_E_ID_2,
                                                         STUB_R_ID_1,
                                                         distance=0,
                                                         rewrite_edges=False)
        edge_generator_2 = SentenceDistanceEdgeGenerator(STUB_E_ID_1,
                                                         STUB_E_ID_2,
                                                         STUB_R_ID_1,
                                                         distance=1,
                                                         rewrite_edges=False)
        edge_generator = CombinatorEdgeGenerator(edge_generator_1,
                                                 edge_generator_2)
        annotator = StubRelationExtractor(edge_generator)

        annotator.annotate(dataset)
        # Assert that indeed 4 sentences were considered
        assert 4 == len(list(dataset.sentences())), str(
            list(dataset.sentences()))

        # print("actu_rels", list(dataset.relations()))
        # print("edges", list(dataset.edges()))
        # print("pred_rels", list(dataset.predicted_relations()))

        evaluator = DocumentLevelRelationEvaluator(rel_type=STUB_R_ID_1)

        evals = evaluator.evaluate(dataset)
        evaluation = evals(STUB_R_ID_1)
        self.assertEqual(evaluation.tp, 3)
        self.assertEqual(evaluation.fn, 0)
        self.assertEqual(evaluation.fp, 2)
        computation = evals(STUB_R_ID_1).compute(strictness="exact")
        self.assertEqual(computation.f_measure, 0.7499999999999999)
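The asserted F-measure follows directly from the counts: precision = tp / (tp + fp) = 3/5 = 0.6, recall = tp / (tp + fn) = 3/3 = 1.0, so F1 = 2 * 0.6 * 1.0 / (0.6 + 1.0) = 0.75, which floating-point arithmetic renders as 0.7499999999999999.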
Example #7
class RelationExtractionPipeline:
    """
    Prepares an instance of a dataset by executing the pipeline modules in a fixed order
    (splitter, tokenizer, parser, edge generator), finally running each feature generator
    in the order they were provided.

    :param class1: the class of entity1
    :type class1: str
    :param class2: the class of entity2
    :type class2: str
    :param rel_type: the relation type between the two entities
    :type rel_type: str
    :param feature_set: the feature_set of the original training data
    :type feature_set: FeatureDictionary
    :param feature_generators: one or more modules responsible for generating features
    :type feature_generators: collections.Iterable[FeatureGenerator]
    """
    def __init__(self,
                 class1,
                 class2,
                 rel_type,
                 parser=None,
                 splitter=None,
                 tokenizer=None,
                 edge_generator=None,
                 feature_set=None,
                 feature_generators=None):
        self.class1 = class1
        self.class2 = class2
        self.rel_type = rel_type

        nlp = None  # remains None when a parser is passed in; checked below for the tokenizer default
        if not parser:
            nlp = get_spacy_nlp_english(load_parser=True)
            parser = SpacyParser(nlp)

        self.parser = parser

        if not splitter:
            # if nlp:  # Spacy parser is used, which includes a sentence splitter
            #     splitter = GenericSplitter(lambda string: (sent.text for sent in nlp(string).sents))
            # else:
            #     splitter = NLTK_SPLITTER
            splitter = NLTK_SPLITTER

        self.splitter = splitter

        if not tokenizer:
            if nlp:  # Spacy parser is used, which includes a tokenizer
                tokenizer = GenericTokenizer(
                    lambda string: (tok.text for tok in nlp.tokenizer(string)))
            else:
                tokenizer = NLTK_TOKENIZER

        self.tokenizer = tokenizer

        self.edge_generator = SentenceDistanceEdgeGenerator(
            self.class1, self.class2, self.rel_type,
            distance=0) if edge_generator is None else edge_generator

        self.feature_set = FeatureDictionary() if feature_set is None else feature_set

        self.feature_generators = (
            self._verify_feature_generators(feature_generators)
            if feature_generators
            else [SentenceFeatureGenerator(f_counts_individual=1)])

    def execute(self, dataset, only_features=False):
        # Note: the order splitter -> tokenizer -> parser -> edge generator is important
        # Note: we could avoid the re-splitting & re-tokenization (see c3d320f08ed8893460d5a68b1b5c87aab6ea0c27),
        #   yet that may later create unforeseen problems, and re-doing it has no significant impact on running time

        start = time.time()

        if not only_features:
            self.splitter.split(dataset)
            self.tokenizer.tokenize(dataset)
            self.parser.parse(dataset)
            # Note: percolate_tokens_to_entities should run before the edge generator due to sentence adjustments
            self.edge_generator.generate(dataset)

        # The labels are always re-generated
        dataset.label_edges()

        for feature_generator in self.feature_generators:
            feature_generator.generate(dataset,
                                       self.feature_set,
                                       use_gold=self.edge_generator.use_gold,
                                       use_pred=self.edge_generator.use_pred)

        end = time.time()
        print_debug(
            "Relation pipeline (only_features: {}), running time: {}".format(
                only_features, str(end - start)))

    def _verify_feature_generators(self, feature_generators):
        if hasattr(feature_generators, '__iter__'):
            for index, feature_generator in enumerate(feature_generators):
                if not isinstance(feature_generator, FeatureGenerator):
                    raise TypeError(
                        'not an instance that implements FeatureGenerator at index {}'
                        .format(index))

            return feature_generators

        elif isinstance(feature_generators, FeatureGenerator):
            return [feature_generators]

        else:
            raise TypeError(
                'not an instance or iterable of instances that implements FeatureGenerator'
            )
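A minimal usage sketch of the pipeline above. The entity/relation class ids ("e_1", "e_2", "r_5") and the dataset object are hypothetical placeholders for a nalaf-style Dataset that has already been read; only the call pattern is grounded in the class definition:

# Hypothetical ids and dataset; only the call pattern comes from the class above.
pipeline = RelationExtractionPipeline(class1="e_1", class2="e_2", rel_type="r_5")

# Full run: split, tokenize, parse, generate candidate edges, label them, featurize.
pipeline.execute(dataset)

# Re-generate only the labels and features for the already-generated edges.
pipeline.execute(dataset, only_features=True)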
Example #8
def test_same_stats():

    original = read_corpus("LocText_v0", corpus_percentage=1.0)
    newone = read_corpus("LocText", corpus_percentage=1.0)

    # Verification
    original.validate_entity_offsets()
    newone.validate_entity_offsets()

    # Basic
    assert 100 == len(original) == len(newone)
    assert len(list(original.entities())) == len(list(newone.entities())) and len(list(original.entities())) > 0
    assert 0 == len(list(original.predicted_entities())) == len(list(newone.predicted_entities()))
    assert len(list(original.relations())) == len(list(newone.relations())) and len(list(original.relations())) > 0
    assert 0 == len(list(original.predicted_relations())) == len(list(newone.predicted_relations()))

    # Elaborated
    edge_generator_d0 = SentenceDistanceEdgeGenerator(PRO_ID, LOC_ID, REL_PRO_LOC_ID, distance=0)
    annotator = StubRelationExtractor(edge_generator_d0)

    annotator.annotate(original)
    annotator.annotate(newone)

    assert len(list(original.edges())) > 0 and (len(list(original.edges())) == len(list(newone.edges())) == len(list(newone.predicted_relations())))
    num_d0 = len(list(newone.predicted_relations()))

    edge_generator_d1 = SentenceDistanceEdgeGenerator(PRO_ID, LOC_ID, REL_PRO_LOC_ID, distance=1)
    annotator = StubRelationExtractor(edge_generator_d1)

    annotator.annotate(original)
    annotator.annotate(newone)

    assert len(list(original.edges())) > 0 and (len(list(original.edges())) == len(list(newone.edges())) == (- num_d0 + len(list(newone.predicted_relations()))))

    # Normalizations
    assert all(len(e.norms) == 0 for e in original.entities())

    count_normalizations = 0

    for e in newone.entities():
        if str(e.class_id) != "e_4":
            print(e.norms)

            assert len(e.norms) == 1, e
            norm_id = next(iter(e.norms.values()))

            assert type(norm_id) is str or e.class_id == "e_1" and norm_id is None, e   # do not write arrays, only comma-separated strings
            assert norm_id is None or ' ' not in norm_id, e   # We cannot have stuff like 'GO:0005811 lipid droplet' -- let's have only the GO id

            if e.class_id in ['e_2', 'e_3']:
                assert norm_id != '', e
                assert ',' not in norm_id, e

            if e.class_id == 'e_2':
                assert norm_id.startswith("GO:")

        count_normalizations += 1

    assert count_normalizations == len(list(newone.entities())) == len(list(original.entities()))

    # Document based

    for docid, original_document in original.documents.items():
        newone_document = newone.documents[docid]

        original_count_entities = sum(1 for _ in original_document.entities())
        newone_count_entities = sum(1 for _ in newone_document.entities())

        assert original_count_entities == newone_count_entities, docid

        original_count_relations = sum(1 for _ in original_document.relations())
        newone_count_relations = sum(1 for _ in newone_document.relations())

        assert original_count_relations == newone_count_relations, docid
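The arithmetic in the distance-1 check accounts for accumulation: annotate keeps adding to predicted_relations() across passes, while the edges themselves are presumably rewritten by each generator (the default, unlike the rewrite_edges=False runs in Example #4), so the number of distance-1 edges equals the total predicted relations minus the num_d0 counted after the first pass, hence the "- num_d0 + len(...)" term.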
Example #9
    def __init__(self,
                 entity1_class,
                 entity2_class,
                 rel_type,
                 sentence_distance=0,
                 selected_features_file=None,
                 feature_generators=None,
                 pipeline=None,
                 use_predicted_entities=False,
                 execute_pipeline=True,
                 model=None,
                 **model_params):

        super().__init__(entity1_class, entity2_class, rel_type)

        self.sentence_distance = sentence_distance
        edge_generator = SentenceDistanceEdgeGenerator(
            entity1_class,
            entity2_class,
            rel_type,
            distance=self.sentence_distance,
            use_gold=not use_predicted_entities,
            use_pred=use_predicted_entities,
        )

        if selected_features_file:
            self.feature_set = FeatureDictionary(is_locked=False)
            selected_features = unpickle_beautified_file(
                selected_features_file)
            # sort to make the order of feature insertion deterministic
            for selected in sorted(selected_features):
                self.feature_set[selected] = len(self.feature_set)
            self.feature_set.is_locked = True

        else:
            self.feature_set = None

        if pipeline:
            feature_generators = pipeline.feature_generators
        elif feature_generators is not None:  # Trick: if [], this will use pipeline's default generators
            feature_generators = feature_generators
        else:
            feature_generators = self.feature_generators()

        self.pipeline = pipeline if pipeline \
            else RelationExtractionPipeline(
                entity1_class, entity2_class, rel_type,
                tokenizer=TmVarTokenizer(),
                edge_generator=edge_generator,
                feature_set=self.feature_set,
                feature_generators=feature_generators)

        assert feature_generators == self.pipeline.feature_generators or feature_generators == [], str((feature_generators, self.pipeline.feature_generators))

        self.execute_pipeline = execute_pipeline

        # With the following two settings we try to force the model to always give the same results between runs
        # and avoid slight variations due to different random-generator initializations

        if not model_params.get("tol"):
            # As of 2017-Feb-7, default in SVC is 1e-3: http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
            model_params["tol"] = 1e-5

        if not model_params.get("random_state"):
            # TODO set with this
            model_params["random_state"] = 2727
            pass

        self.model = model if model else SklSVM(**model_params)
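The two defaults above are reproducibility guards for the underlying scikit-learn SVC: a tolerance tightened from the documented 1e-3 default to 1e-5, and a fixed random_state of 2727, so that repeated trainings of the SklSVM wrapper produce identical results instead of varying with random initialization.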