예제 #1
0
    def test_simple_records(self):
        """Alignment of hand-crafted records matches the expected grouping."""
        extractions = {
            'ext1': [Reference(title='Matt', year=2011),
                     Reference(title='Erick', year=2013)],
            'ext2': [Reference(title='Matt', year=2011)],
            'ext3': [Reference(title='John', year=2010),
                     Reference(title='Eric', year=2013)],
        }

        # Each aligned item is a list of [extractor_name, reference] pairs:
        # references judged to refer to the same cited work are grouped.
        expected = [
            [['ext1', Reference(title='Matt', year=2011)],
             ['ext2', Reference(title='Matt', year=2011)]],
            [['ext1', Reference(title='Erick', year=2013)],
             ['ext3', Reference(title='Eric', year=2013)]],
            [['ext3', Reference(title='John', year=2010)]],
        ]

        calculated = align.align_records(extractions)
        for expected_item, calculated_item in zip(expected, calculated):
            self.assertDictEqual(dict(expected_item), dict(calculated_item))
예제 #2
0
def merge_records(records: Dict[str, List[Reference]],
                  extractor_priors: list = EXTRACTORS) \
        -> Tuple[List[Reference], float]:
    """
    Merge extracted references into a single authoritative set of references.

    Takes a list of reference metadata records (each formatted according to the
    schema) and reconciles them with each other to form one primary record for
    each item. First step is to match the lists against each other using
    similarity measures. Then, for each individual record we combine the
    possible fields and augment them with possible external information to form
    a single record.

    Parameters
    ----------
    records : dict
        The reference records from multiple extraction services/lookup
        services. Keys are extractor names, values are lists of references
        (dict). E.g. ``{"cermine": [references], "grobid": [references]}``.
    extractor_priors : list
        Represents prior level of trust in field output for each extractor.

    Returns
    -------
    tuple
        Two-tuple of the authoritative reference metadata (a list, one item
        per cited reference) and a quality score (float), as produced by
        :func:`normalize.filter_records`.

    Raises
    ------
    RuntimeError
        If any pipeline stage (alignment, validation, arbitration, or
        filtering) fails. The original exception is chained as the cause.
    """
    n_extractions = len(records)
    # Normalize each extractor's output before attempting to align; bind to a
    # new name rather than shadowing the ``records`` parameter.
    normalized = {extractor: normalize.normalize_records(extraction)
                  for extractor, extraction in records.items()}
    try:
        aligned_records = align.align_records(normalized)
    except Exception as e:
        raise RuntimeError('Alignment failed: %s' % e) from e

    try:
        aligned_probabilities = beliefs.validate(aligned_records)
    except Exception as e:
        raise RuntimeError('Validation failed: %s' % e) from e

    try:
        arbitrated_records = arbitrate.arbitrate_all(aligned_records,
                                                     aligned_probabilities,
                                                     extractor_priors,
                                                     n_extractions)
    except Exception as e:
        raise RuntimeError('Arbitration failed: %s' % e) from e

    try:
        final_records = normalize.filter_records(arbitrated_records)
    except Exception as e:
        raise RuntimeError('Filtering failed: %s' % e) from e
    return final_records
예제 #3
0
def _similarity_list(value_a: list, value_b: list) -> float:
    """Similarity of two lists, based on values (without regard to order).

    The two lists are aligned against each other; aligned items present in
    only one of the lists contribute a score of 0. Returns 0. when there is
    nothing to compare (e.g. both lists empty), instead of letting ``mean``
    raise ``StatisticsError`` on an empty sequence.
    """
    aligned = align_records({'a': value_a, 'b': value_b})
    scores = []
    for item in aligned:
        # An item matched in only one of the two lists has no counterpart
        # to compare against.
        if len(item) != 2:
            scores.append(0.)
            continue
        pairing: dict = dict(item)
        scores.append(_similarity(pairing['a'], pairing['b']))
    # Guard: mean([]) raises statistics.StatisticsError.
    return mean(scores) if scores else 0.
예제 #4
0
    def test_extraction(self):
        """Run the full extraction/merge pipeline on a sample PDF.

        NOTE(review): this exercises every stage end-to-end but makes no
        assertions; it only verifies that no stage raises.
        """
        from references.process import extract
        from references.services import refextract, cermine, grobid
        from references.process.merge import align, arbitrate, priors, beliefs, \
            normalize
        pdf_path = 'evaluation/pdfs/0801.0012.pdf'
        document_id = '0801.0012'

        # Run each extractor over the same document (dict literal evaluates
        # values in order: cermine, grobid, refextract).
        extractions = {
            'cermine': cermine.extract_references(pdf_path),
            'grobid': grobid.extract_references(pdf_path),
            'refextract': refextract.extract_references(pdf_path),
        }

        # Normalize each extractor's raw output before alignment.
        extractions = {
            name: normalize.normalize_records(raw_records)
            for name, raw_records in extractions.items()
        }

        # Merge pipeline: align -> validate -> arbitrate -> filter.
        aligned_records = align.align_records(extractions)
        aligned_probabilities = beliefs.validate(aligned_records)
        arbitrated_records = arbitrate.arbitrate_all(aligned_records,
                                                     aligned_probabilities,
                                                     priors.EXTRACTORS, 3)
        final_records, score = normalize.filter_records(arbitrated_records)
예제 #5
0
        raw = [row for row in csv.reader(f)]

    # Build one dict per CSV row, keyed by the header row (raw[0]); rows whose
    # column count differs from the header are dropped.
    # NOTE(review): the header row itself also passes the length filter and
    # becomes a {name: name} dict; its 'pdf' value presumably fails the
    # os.path.exists() check below -- confirm.
    referenceCounts = [{k: row[i] for i, k in enumerate(raw[0])}
                       for row in raw if len(row) == len(raw[0])]

    for row in referenceCounts:

        # Skip entries whose PDF is not present on disk.
        full_path = os.path.join(basepath, row['pdf'])
        if not os.path.exists(full_path):
            continue
        document_id = row['pdf'][:-4]  # filename minus its 4-char extension
        print('Extracting %s' % document_id)

        # Run all extractors, reporting each extractor's reference count
        # against the expected count from the CSV ('N' column).
        extractions = extract.extract(full_path, document_id)
        for extractor, refs in extractions.items():
            print(extractor, len(refs), row['N'])

        N_extractions = len(extractions)
        aligned_records = align.align_records(extractions)

        print('aligned', len(aligned_records), row['N'])

        # Merge pipeline: validate -> arbitrate -> filter; mirrors the
        # merge_records pipeline in the merge module.
        aligned_probabilities = beliefs.validate(aligned_records)
        arbitrated_records = arbitrate.arbitrate_all(aligned_records,
                                                     aligned_probabilities,
                                                     priors.EXTRACTORS,
                                                     N_extractions)
        final_records, score = normalize.filter_records(arbitrated_records)
        print('final', len(final_records), row['N'], score)
        print('--')