Example #1
def test_number_of_tagged_entities():
    if run_tests:
        # Two identical copies of the corpus: one to be tagged in parts, one as a whole
        dataset_parts = read_corpus("LocText", corpus_percentage=1.0)
        dataset_whole = read_corpus("LocText", corpus_percentage=1.0)
        assert 0 == num_predicted_entities(
            dataset_parts) == num_predicted_entities(dataset_whole)

        TAGGER_SEND_PARTS.annotate(dataset_parts)
        TAGGER_SEND_WHOLE.annotate(dataset_whole)

        for docid, document in dataset_parts.documents.items():
            if docid in [
                    "23543752",  # Special analysis as it contains unicode chracters (μ)
                    "23150645"  # also unicode and annotation of reductase where reductase as string alone is not tagged
            ]:
                for e in document.predicted_entities():
                    print(e)
                print()

        num_preds_with_parts = num_predicted_entities(dataset_parts)
        num_preds_with_whole = num_predicted_entities(dataset_whole)

        print("Numbers, real:", len(list(dataset_parts.entities())),
              "vs. pred: ", num_preds_with_parts, num_preds_with_whole)

        # Equality holds only if all tagger entity types are explicitly given; otherwise, tagging with parts yields fewer predicted entities
        assert num_preds_with_parts == num_preds_with_whole

        for pred_part, pred_whole in zip(dataset_parts.predicted_entities(),
                                         dataset_whole.predicted_entities()):
            assert (pred_part == pred_whole)
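The helper num_predicted_entities is not shown in this listing; a minimal sketch consistent with its use above, assuming Dataset.predicted_entities() yields one object per predicted entity:

def num_predicted_entities(dataset):
    # Hypothetical helper: count every tagger-predicted entity in the dataset
    return sum(1 for _ in dataset.predicted_entities())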
Example #2
def _test_LocText(corpus_percentage,
                  model,
                  EXPECTED_F=None,
                  predict_entities=None,
                  EXPECTED_F_SE=0.001):
    # Note: EXPECTED_F=None makes the test fail for not-yet-verified evaluations
    # Note: the real standard errors are around ~0.0027-0.0095; the default is deliberately smaller to make the tests stricter

    assert corpus_percentage in [
        TEST_MIN_CORPUS_PERCENTAGE, 1.0
    ], "corpus_percentage must == {} or 1.0. You gave: {}".format(
        str(TEST_MIN_CORPUS_PERCENTAGE), str(corpus_percentage))

    corpus = read_corpus("LocText", corpus_percentage, predict_entities)

    args = [
        '--model', model, '--corpus_percentage',
        str(corpus_percentage), '--evaluation_level',
        str(EVALUATION_LEVEL)
    ]
    if predict_entities:
        args += ['--predict_entities', predict_entities]

    rel_evaluation = evaluate_with_argv(args)

    print("LocText " + model, rel_evaluation)
    assert math.isclose(rel_evaluation.f_measure,
                        EXPECTED_F,
                        abs_tol=EXPECTED_F_SE * 1.1)

    return rel_evaluation
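For reference, a hedged usage sketch of _test_LocText; the model name and expected F-measure below are illustrative placeholders, not values confirmed by this listing:

# Hypothetical invocation (model name and EXPECTED_F are placeholders)
_test_LocText(corpus_percentage=1.0, model="SS", EXPECTED_F=0.70)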
Example #3
def test_baseline_D1(corpus_percentage):
    corpus = read_corpus("LocText", corpus_percentage)

    if corpus_percentage == 1.0:
        EXPECTED_F = 0.6421
    else:
        EXPECTED_F = None

    edge_generator = SentenceDistanceEdgeGenerator(PRO_ID,
                                                   LOC_ID,
                                                   REL_PRO_LOC_ID,
                                                   distance=1)
    annotator_gen_fun = (
        lambda _: StubRelationExtractor(edge_generator).annotate)

    evaluations = Evaluations.cross_validate(annotator_gen_fun,
                                             corpus,
                                             EVALUATOR,
                                             k_num_folds=5,
                                             use_validation_set=True)
    rel_evaluation = evaluations(REL_PRO_LOC_ID).compute(strictness="exact")

    print(rel_evaluation)
    print(evaluations)
    assert math.isclose(rel_evaluation.f_measure,
                        EXPECTED_F,
                        abs_tol=0.001 * 1.1), rel_evaluation.f_measure

    return evaluations
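The assertion accepts any F-measure within abs_tol = 0.001 * 1.1 = 0.0011 of the expected value; a quick illustration with the standard-library math.isclose:

import math

assert math.isclose(0.6430, 0.6421, abs_tol=0.0011)      # |diff| = 0.0009 -> passes
assert not math.isclose(0.6440, 0.6421, abs_tol=0.0011)  # |diff| = 0.0019 -> fails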
Example #4
def test_baseline_full(corpus_percentage):
    if corpus_percentage == 1.0:
        EXPECTED_F = 0.5050
    else:
        EXPECTED_F = None

    corpus = read_corpus("LocText",
                         corpus_percentage,
                         predict_entities="9606,3702,4932")

    annotator_gen_fun = (
        lambda _: StubSameSentenceRelationExtractor(
            PRO_ID, LOC_ID, REL_PRO_LOC_ID, use_gold=False, use_pred=True
        ).annotate)

    evaluations = Evaluations.cross_validate(annotator_gen_fun,
                                             corpus,
                                             EVALUATOR,
                                             k_num_folds=5,
                                             use_validation_set=True)
    rel_evaluation = evaluations(REL_PRO_LOC_ID).compute(strictness="exact")

    print(evaluations)
    assert math.isclose(rel_evaluation.f_measure,
                        EXPECTED_F,
                        abs_tol=0.001 * 1.1), rel_evaluation.f_measure

    return evaluations
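Evaluations.cross_validate apparently calls annotator_gen_fun once per fold with the training data; the stub extractors need no training, which is why these tests pass one-argument lambdas that ignore it. An equivalent factory, as a sketch (name hypothetical):

def make_stub_annotator_gen_fun(annotator):
    # The training fold is ignored because the stub requires no fitting
    return lambda train_fold: annotator.annotate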
Example #5
def test_get_evaluation_result_of_corpus(evaluation_level):
    """
    Evaluates the performance of corpus entities [e_1 (Protein), e_2 (Localization) and e_3 (Organism)]
    [precision, recall and f-measure]
    :param corpus:
    :return:
    """

    # Gets both annotation and pred_annotation entities.
    corpus = read_corpus("LocText",
                         corpus_percentage=1.0,
                         predict_entities="9606,3702,4932")

    (mention_evaluator,
     entity_evaluator) = _get_entity_evaluator(evaluation_level)

    print()
    print("EVALUATION LEVEL:", evaluation_level)
    print()

    # print("-----------------------------------------------------------------------------------")
    # print("MentionLevelEvaluator")
    # print(mention_evaluator.evaluate(corpus))
    # print("-----------------------------------------------------------------------------------")
    # print()
    print()
    print("-----------------------------------------------------------------------------------")
    print("EntityEvaluator")
    print(entity_evaluator.evaluate(corpus))
    print("-----------------------------------------------------------------------------------")
Example #6
def test_num_of_normalization_in_new_file():

    old_dataset = read_corpus("LocText_v1", corpus_percentage=1.0)
    new_dataset = read_corpus("LocText_v2", corpus_percentage=1.0)

    # First of all, the counts of entities & relations remain equal (and of course there are 0 predictions)
    assert (len(list(old_dataset.entities())) == len(
        list(new_dataset.entities())))
    assert (len(list(old_dataset.predicted_entities())) == len(
        list(new_dataset.predicted_entities())) == 0)
    assert (len(list(old_dataset.relations())) == len(
        list(new_dataset.relations())))
    assert (len(list(old_dataset.predicted_relations())) == len(
        list(new_dataset.predicted_relations())) == 0)

    # Now test the new number of normalizations
    print("Actually obtained number of newly normalized IDs: ",
          num_normalizations(new_dataset) - num_normalizations(old_dataset))

    # 8 is the number of newly normalized records from Tanya, marked [Greens] in the new file.
    assert num_normalizations(
        new_dataset) == num_normalizations(old_dataset) + 8
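num_normalizations is not shown in this listing; a plausible sketch, assuming each entity carries a norms mapping as in the test_same_stats example below:

def num_normalizations(dataset):
    # Hypothetical helper: count the normalization IDs attached to gold entities
    return sum(len(e.norms) for e in dataset.entities())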
Example #7
def test_baseline_D0_D1(corpus_percentage):
    corpus = read_corpus("LocText", corpus_percentage)

    if corpus_percentage == 1.0:
        EXPECTED_F = 0.7060
    else:
        EXPECTED_F = None

    edge_generator = CombinatorEdgeGenerator(
        SentenceDistanceEdgeGenerator(PRO_ID,
                                      LOC_ID,
                                      REL_PRO_LOC_ID,
                                      distance=0,
                                      rewrite_edges=False),
        SentenceDistanceEdgeGenerator(PRO_ID,
                                      LOC_ID,
                                      REL_PRO_LOC_ID,
                                      distance=1,
                                      rewrite_edges=False),  # Recall: 88.52
        # SentenceDistanceEdgeGenerator(PRO_ID, LOC_ID, REL_PRO_LOC_ID, distance=2, rewrite_edges=False),
        # SentenceDistanceEdgeGenerator(PRO_ID, LOC_ID, REL_PRO_LOC_ID, distance=3, rewrite_edges=False),  #
        # SentenceDistanceEdgeGenerator(PRO_ID, LOC_ID, REL_PRO_LOC_ID, distance=4, rewrite_edges=False),
        # SentenceDistanceEdgeGenerator(PRO_ID, LOC_ID, REL_PRO_LOC_ID, distance=5, rewrite_edges=False),  #
        # SentenceDistanceEdgeGenerator(PRO_ID, LOC_ID, REL_PRO_LOC_ID, distance=6, rewrite_edges=False),  # Recall: 99.70
    )

    annotator_gen_fun = (
        lambda _: StubRelationExtractor(edge_generator).annotate)

    evaluations = Evaluations.cross_validate(annotator_gen_fun,
                                             corpus,
                                             EVALUATOR,
                                             k_num_folds=5,
                                             use_validation_set=True)
    rel_evaluation = evaluations(REL_PRO_LOC_ID).compute(strictness="exact")

    print(rel_evaluation)
    print(evaluations)
    assert math.isclose(rel_evaluation.f_measure,
                        EXPECTED_F,
                        abs_tol=0.001 * 1.1), rel_evaluation.f_measure

    return rel_evaluation
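CombinatorEdgeGenerator's implementation is not shown; a minimal sketch of what such a combinator might do, assuming each wrapped generator exposes a generate(dataset) method (method name assumed, not confirmed by this listing):

class SimpleCombinatorEdgeGenerator:
    """Hypothetical stand-in: run each wrapped generator in turn."""

    def __init__(self, *generators):
        # rewrite_edges=False on the sub-generators lets their edges accumulate
        self.generators = generators

    def generate(self, dataset):
        for generator in self.generators:
            generator.generate(dataset)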
Example #8
File: util.py Project: wangjs/LocText
def get_model_and_data(sentence_distance, predict_entities):
    corpus = read_corpus("LocText", predict_entities=predict_entities)

    # TODO: specific parameters like C=1 or even the `linear` kernel are debatable -- maybe I should change that
    annotator = LocTextDXModelRelationExtractor(
        PRO_ID,
        LOC_ID,
        REL_PRO_LOC_ID,
        sentence_distance,
        use_predicted_entities=len(predict_entities) > 0,
        preprocess=True,
        kernel='linear',
        C=1)
    annotator.pipeline.execute(corpus)
    X, y, groups = annotator.model.write_vector_instances(
        corpus, annotator.pipeline.feature_set)
    X = annotator.model.preprocess.fit_transform(X)

    return (annotator, X, y, groups)
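A hedged usage example for get_model_and_data; the sentence distance is illustrative, while the taxonomy-ID string matches the one used elsewhere in this listing:

# Hypothetical call: build the SVM annotator plus its vectorized instances
annotator, X, y, groups = get_model_and_data(
    sentence_distance=0, predict_entities="9606,3702,4932")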
Example #9
def _test(corpus_percentage, entity_map_fun, relation_accept_fun,
          expected_sum_perct_d0_d1, expected_nums, expected_percts):
    corpus = read_corpus("LocText", corpus_percentage)

    # Note: the predictor will already split & tokenize the corpus. See the implementation for details
    StubSamePartRelationExtractor(PRO_ID, LOC_ID,
                                  REL_PRO_LOC_ID).annotate(corpus)

    (counter_nums, counter_percts) = corpus.compute_stats_relations_distances(
        REL_PRO_LOC_ID, entity_map_fun, relation_accept_fun)

    print()
    print("# Documents", len(corpus))
    print("# Uniq Rels", sum(counter_nums.values()))
    print("  ", counter_nums)
    print("  ", counter_percts)

    assert expected_nums == counter_nums
    assert math.isclose(expected_sum_perct_d0_d1,
                        (counter_percts['D0'] + counter_percts['D1']),
                        abs_tol=0.01)
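A hedged sketch of how _test might be invoked, assuming the distance counters are keyed 'D0', 'D1', ... as printed above; every argument below is an illustrative placeholder:

from collections import Counter

_test(corpus_percentage=1.0,
      entity_map_fun=None,            # placeholder; a real mapping function is expected
      relation_accept_fun=None,       # placeholder; a real predicate is expected
      expected_sum_perct_d0_d1=0.88,  # placeholder value
      expected_nums=Counter({'D0': 300, 'D1': 80}),  # placeholder counts
      expected_percts=None)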
Example #10
def test_same_stats():

    original = read_corpus("LocText_v0", corpus_percentage=1.0)
    newone = read_corpus("LocText", corpus_percentage=1.0)

    # Verification
    original.validate_entity_offsets()
    newone.validate_entity_offsets()

    # Basic
    assert 100 == len(original) == len(newone)
    assert len(list(original.entities())) == len(list(newone.entities())) and len(list(original.entities())) > 0
    assert 0 == len(list(original.predicted_entities())) == len(list(newone.predicted_entities()))
    assert len(list(original.relations())) == len(list(newone.relations())) and len(list(original.relations())) > 0
    assert 0 == len(list(original.predicted_relations())) == len(list(newone.predicted_relations()))

    # Elaborated
    edge_generator_d0 = SentenceDistanceEdgeGenerator(PRO_ID, LOC_ID, REL_PRO_LOC_ID, distance=0)
    annotator = StubRelationExtractor(edge_generator_d0)

    annotator.annotate(original)
    annotator.annotate(newone)

    assert len(list(original.edges())) > 0 and (len(list(original.edges())) == len(list(newone.edges())) == len(list(newone.predicted_relations())))
    num_d0 = len(list(newone.predicted_relations()))

    edge_generator_d1 = SentenceDistanceEdgeGenerator(PRO_ID, LOC_ID, REL_PRO_LOC_ID, distance=1)
    annotator = StubRelationExtractor(edge_generator_d1)

    annotator.annotate(original)
    annotator.annotate(newone)

    assert len(list(original.edges())) > 0 and (
        len(list(original.edges()))
        == len(list(newone.edges()))
        == len(list(newone.predicted_relations())) - num_d0)

    # Normalizations
    assert all(len(e.norms) == 0 for e in original.entities())

    count_normalizations = 0

    for e in newone.entities():
        if str(e.class_id) != "e_4":
            print(e.norms)

            assert len(e.norms) == 1, e
            norm_id = next(iter(e.norms.values()))

            assert type(norm_id) is str or e.class_id == "e_1" and norm_id is None, e   # do not write arrays, only comma-separated strings
            assert norm_id is None or ' ' not in norm_id, e   # We cannot have stuff like 'GO:0005811 lipid droplet' -- let's have only the GO id

            if e.class_id in ['e_2', 'e_3']:
                assert norm_id != '', e
                assert ',' not in norm_id, e

            if e.class_id == 'e_2':
                assert norm_id.startswith("GO:")

        count_normalizations += 1  # incremented for every entity (including e_4), so the final check verifies all entities were visited

    assert count_normalizations == len(list(newone.entities())) == len(list(original.entities()))

    # Document based

    for docid, original_document in original.documents.items():
        newone_document = newone.documents[docid]

        original_count_entities = sum(1 for _ in original_document.entities())
        newone_count_entities = sum(1 for _ in newone_document.entities())

        assert original_count_entities == newone_count_entities, docid

        original_count_relations = sum(1 for _ in original_document.relations())
        newone_count_relations = sum(1 for _ in newone_document.relations())

        assert original_count_relations == newone_count_relations, docid
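For reference, the entity-norm shapes that the assertions above imply; the dictionary keys and ID values are illustrative only:

# e_1 (Protein): one comma-separated string of IDs, possibly None
protein_norms = {'n_7': 'P04637,Q9NQB0'}
# e_2 (Localization): exactly one GO ID, no spaces, no commas, non-empty
localization_norms = {'n_4': 'GO:0005737'}
# e_3 (Organism): exactly one taxonomy ID, no commas, non-empty
organism_norms = {'n_8': '9606'}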
Example #11
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.decomposition import PCA

from nalaf.learning.lib.sklsvm import SklSVM
from nalaf.structures.data import Dataset

from loctext.learning.train import read_corpus
from loctext.util import PRO_ID, LOC_ID, ORG_ID, REL_PRO_LOC_ID, repo_path
from loctext.learning.annotators import LocTextDXModelRelationExtractor

import matplotlib.pyplot as plt

print(__doc__)

corpus = read_corpus("LocText")
locTextModel = LocTextDXModelRelationExtractor(PRO_ID, LOC_ID, REL_PRO_LOC_ID)
locTextModel.pipeline.execute(corpus)
X, y = SklSVM._convert_edges_to_SVC_instances(
    corpus, locTextModel.pipeline.feature_set)


def pca_plot():
    X_copy = X.toarray()
    pca_2d = PCA(n_components=2).fit_transform(X_copy)

    # Scatter-plot negatives in red and positives in blue; the positive branch
    # and the display calls below complete the truncated snippet (assumed, not source)
    for instance_i in range(0, pca_2d.shape[0]):
        if y[instance_i] < 0:
            neg = plt.scatter(pca_2d[instance_i, 0],
                              pca_2d[instance_i, 1],
                              c='r')
        else:
            pos = plt.scatter(pca_2d[instance_i, 0],
                              pca_2d[instance_i, 1],
                              c='b')

    plt.legend([neg, pos], ["negative", "positive"])
    plt.show()