Example #1
File: getIAA.py Project: marilenaoita/nala
def benchmark_nala(member1, member2):
    itrs = []

    # Read the IAA iterations in blocks so that the plain documents are not deleted by the AnnJsonAnnotationReader (delete_incomplete_docs=False)
    for itr in IterationRound.all():
        if itr.is_IAA():
            dataset = itr.read(read_annotations=False)
            AnnJsonAnnotationReader(
                os.path.join(itr.path, "reviewed", member1),
                read_only_class_id=MUT_CLASS_ID,
                delete_incomplete_docs=False).annotate(dataset)
            AnnJsonAnnotationReader(
                os.path.join(itr.path, "reviewed", member2),
                read_only_class_id=MUT_CLASS_ID,
                delete_incomplete_docs=False,
                is_predicted=True).annotate(dataset)
            itrs.append(dataset)
            dataset = None

    # Then merge the IAA iterations
    all_itrs_dataset = Dataset()
    for itr_dataset in itrs:
        all_itrs_dataset.extend_dataset(itr_dataset)

    ExclusiveNLDefiner().define(all_itrs_dataset)

    evaluation = MentionLevelEvaluator(subclass_analysis=True).evaluate(all_itrs_dataset)

    return (all_itrs_dataset, evaluation)
Example #2
def get_corpora(names, only_class_id=None):
    dataset = Dataset()

    for name in names.split(','):
        dataset.extend_dataset(get_corpus(name, only_class_id=only_class_id))

    return dataset
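
For context on how this is called: a comma-separated string of corpus names is merged into a single Dataset. The call below is illustrative only; it assumes MUT_CLASS_ID and the get_corpus helper (Example #3) are available in this module's environment, and that the names follow the naming convention get_corpus parses.

# Illustrative call (assumes the nala environment; names follow "<corpus>_<split>")
merged = get_corpora("nala_training,tmVar_test", only_class_id=MUT_CLASS_ID)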
Example #3
def get_corpus(name, only_class_id=None):
    """
    :rtype: nalaf.structures.data.Dataset
    """
    assert only_class_id == MUT_CLASS_ID, "(corpus name: {}) The class_id to read (only) is always assumed to be `{}`".format(
        name, MUT_CLASS_ID)

    parts = name.split("_")
    training = test = random = False
    until_iteration = None

    if len(parts) > 1:
        name = parts[0]
        typ = parts[1]
        training = (typ == "training")
        test = (typ == "test")
        random = (typ == "random")
        until_iteration = int(parts[2]) if len(parts) > 2 else None

        if until_iteration:
            assert name == "nala", "Iteration subsets are currently supported only with nala (training or test/indexed, not random/discoveries)"

    if name == "tmVar":
        if not (training or test):
            fn = 'corpus.txt'
        elif training:
            fn = 'train.PubTator.txt'
        elif test:
            fn = 'test.PubTator.txt'

        entirecorpusfile = os.path.join(__corpora_folder, 'tmvar', fn)
        return TmVarReader(entirecorpusfile, MUT_CLASS_ID).read()

    if name == "MF":
        ret = Dataset()

        if not (training or test):
            training = test = True

        if training:
            fn = 'devo_set.txt'
            entirecorpusfile = os.path.join(__corpora_folder, 'mutationfinder',
                                            'cleaned corpus', fn)
            ret.extend_dataset(MutationFinderReader(entirecorpusfile).read())

        if test:
            fn = 'test_set.txt'
            entirecorpusfile = os.path.join(__corpora_folder, 'mutationfinder',
                                            'cleaned corpus', fn)
            ret.extend_dataset(MutationFinderReader(entirecorpusfile).read())

        return ret

    if name == "SETH":
        # this is the implementation with everything in a single part
        # ret = SETHReader(os.path.join(__corpora_folder, 'seth', 'corpus.txt')).read()
        # annreader = SETHAnnotationReader(os.path.join(__corpora_folder, 'seth', 'annotations'))
        # annreader.annotate(ret)

        # alternative implementation with abstract and title in separate parts
        ann_folder = os.path.join(__corpora_folder, 'seth', 'annotations')
        pmids = [
            file[:-4] for file in os.listdir(ann_folder)
            if file.endswith('.ann')
        ]
        ret = PMIDReader(pmids).read()
        DownloadedSETHAnnotationReader(ann_folder,
                                       mut_class_id=MUT_CLASS_ID,
                                       gene_class_id=None).annotate(ret)

        return ret

    if name == "LEAP-FS":
        corpus_file = os.path.join(__corpora_folder,
                                   'ProteinResidueFullTextCorpus',
                                   'ProteinResidueFullText.tsv')
        ret = ProteinResidueCorpusPartialReader(corpus_file,
                                                mut_class_id=MUT_CLASS_ID,
                                                residue_class_id=None).read()

        return ret

    elif name == "IDP4":
        return Iteration.read_IDP4()

    elif name == "nala":
        if training:
            return Iteration.read_nala_training(until_iteration)
        elif test:
            return Iteration.read_nala_test(number_iterations=until_iteration)
        elif random:
            return Iteration.read_nala_random()
        else:
            return Iteration.read_nala()

    elif name == "IDP4+":
        if training:
            return Iteration.read_IDP4Plus_training(until_iteration)
        elif test:
            return Iteration.read_IDP4Plus_test()
        else:
            return Iteration.read_IDP4Plus()

    elif name == "Var":
        folder = os.path.join(__corpora_folder, 'variome', 'data')
        return VerspoorReader(folder,
                              mut_class_id=MUT_CLASS_ID,
                              gene_class_id=None).read()

    elif name == "Var120":
        folder = os.path.join(__corpora_folder, 'variome_120',
                              'annotations_mutations_explicit')
        return VerspoorReader(folder,
                              mut_class_id=MUT_CLASS_ID,
                              gene_class_id=None).read()

    elif name == "OSIRIS":
        file = os.path.join(__corpora_folder, 'osiris', 'OSIRIScorpusv01.xml')
        return OSIRISReader(file, MUT_CLASS_ID).read()

    elif name in ALL_CORPORA:
        return Dataset()
        # raise NotImplementedError("My bad, not implemented: " + name)

    else:
        raise Exception("Do not recognize given corpus name: " + name)
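
As a side note on the corpus-name convention handled above: a name may carry an optional split ("training", "test", "random") and an iteration cutoff. The snippet below is a standalone sketch (plain string handling, no nala imports) that mirrors the splitting logic:

# Standalone sketch of the "<corpus>_<split>[_<until_iteration>]" convention parsed above
name = "nala_training_5"
parts = name.split("_")                                      # ["nala", "training", "5"]
corpus, split = parts[0], parts[1]                           # "nala", "training"
until_iteration = int(parts[2]) if len(parts) > 2 else None  # 5
print(corpus, split, until_iteration)                        # nala training 5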
Example #4
File: getIAA.py Project: marilenaoita/nala
total_dataset = Dataset()
individual_evaluations = []

#product_members_pairs = combinations(members, 2)
product_members_pairs = [
    (m1, m2) for (m1, m2) in product(members, members) if m1 != m2]

for member1, member2 in product_members_pairs:
    (dataset, evaluation) = benchmark(member1, member2)

    individual_evaluations.append(evaluation)

    if not show_only_total_results:
        print(member1, member2)
        print("  -> Overlapping: ", repr(dataset))
        print(evaluation)
        print("")

    total_dataset.extend_dataset(dataset)
    dataset = None
    evaluation = None

total_evaluation = Evaluations.merge(individual_evaluations,
                                     are_disjoint_evaluations=False)

print()
print()
print("_Total_ overlapping: ", total_dataset.__repr__(),
      ", num members pairs: ", len(individual_evaluations))
print()
print(total_evaluation)
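
A note on the pairing above: product with a self-pair filter keeps both orderings of every annotator pair, whereas the commented-out combinations(members, 2) would keep each pair only once. The ordering matters because in benchmark_nala (Example #1, from the same getIAA.py) the second member's annotations are read with is_predicted=True, so (m1, m2) and (m2, m1) are different comparisons. A small standalone comparison:

from itertools import combinations, product

members = ["ann1", "ann2", "ann3"]  # hypothetical annotator names

ordered = [(a, b) for a, b in product(members, members) if a != b]
unordered = list(combinations(members, 2))

print(len(ordered))    # 6 -- every pair is evaluated in both directions
print(len(unordered))  # 3 -- each pair would be evaluated only once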