def test_simple_records(self):
    """Regression test for alignment with fake records."""
    # 'Erick' vs 'Eric' exercises inexact title matching across extractors.
    docs = {
        'ext1': [
            Reference(title='Matt', year=2011),
            Reference(title='Erick', year=2013),
        ],
        'ext2': [
            Reference(title='Matt', year=2011),
        ],
        'ext3': [
            Reference(title='John', year=2010),
            Reference(title='Eric', year=2013),
        ]
    }
    # Expected output: one group per distinct cited reference, each group
    # a list of [extractor, reference] pairs.
    aligned_answer = [
        [["ext1", Reference(title='Matt', year=2011)],
         ["ext2", Reference(title='Matt', year=2011)]],
        [["ext1", Reference(title='Erick', year=2013)],
         ["ext3", Reference(title='Eric', year=2013)]],
        [["ext3", Reference(title='John', year=2010)]]
    ]
    aligned_calc = align.align_records(docs)
    for ref_ans, ref_calc in zip(aligned_answer, aligned_calc):
        self.assertDictEqual(dict(ref_ans), dict(ref_calc))
def merge_records(records: Dict[str, List[Reference]],
                  extractor_priors: list = EXTRACTORS) \
        -> Tuple[List[Reference], float]:
    """
    Merge extracted references into a single authoritative set of references.

    Takes reference metadata records from multiple extractors (each formatted
    according to the schema) and reconciles them with each other to form one
    primary record for each item. The first step is to match the lists
    against each other using similarity measures. Then, for each individual
    record, we combine the possible fields and augment them with possible
    external information to form a single record.

    Parameters
    ----------
    records : dict
        The reference records from multiple extraction/lookup services.
        Keys are extractor names, values are lists of references (dict).
        E.g. ``{"cermine": [references], "grobid": [references]}``.
    extractor_priors : list
        Represents prior level of trust in field output for each extractor.

    Returns
    -------
    list
        Authoritative reference metadata. Each item represents a single
        cited reference (``dict``).
    float
        Quality score for the final set of references.
    """
    N_extractions = len(records)
    records = {extractor: normalize.normalize_records(extraction)
               for extractor, extraction in records.items()}

    # Each stage is wrapped separately so that a failure can be attributed
    # to a specific step of the merge pipeline.
    try:
        aligned_records = align.align_records(records)
    except Exception as e:
        raise RuntimeError('Alignment failed: %s' % e) from e

    try:
        aligned_probabilities = beliefs.validate(aligned_records)
    except Exception as e:
        raise RuntimeError('Validation failed: %s' % e) from e

    try:
        arbitrated_records = arbitrate.arbitrate_all(aligned_records,
                                                     aligned_probabilities,
                                                     extractor_priors,
                                                     N_extractions)
    except Exception as e:
        raise RuntimeError('Arbitration failed: %s' % e) from e

    try:
        final_records, score = normalize.filter_records(arbitrated_records)
    except Exception as e:
        raise RuntimeError('Filtering failed: %s' % e) from e
    return final_records, score
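For orientation, a minimal usage sketch for ``merge_records``. The extractor names and reference fields below are made up for illustration, and ``Reference`` is assumed to accept keyword fields as in the alignment test above.

records = {
    'cermine': [Reference(title='Matt', year=2011)],
    'grobid': [Reference(title='Matt', year=2011),
               Reference(title='Eric', year=2013)],
}
final_records, score = merge_records(records)    # default EXTRACTORS priors
for ref in final_records:
    print(ref)    # one arbitrated record per distinct cited reference
print('score:', score)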
def _similarity_list(value_a: list, value_b: list) -> float:
    """Similarity of two lists, based on values (without regard to order)."""
    aligned = align_records({'a': value_a, 'b': value_b})
    scores = []
    for item in aligned:
        # A group with fewer than two members means the value was present
        # in only one of the lists; it contributes zero similarity.
        if len(item) != 2:
            scores.append(0.)
            continue
        _item: dict = {k: v for k, v in item}
        scores.append(_similarity(_item['a'], _item['b']))
    if not scores:    # Both lists were empty; nothing to compare.
        return 0.
    return mean(scores)
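A small sketch of how ``_similarity_list`` composes with alignment, reusing the fake records from the regression test above. The exact value depends on ``_similarity``, but any unmatched item contributes zero to the mean.

a = [Reference(title='Matt', year=2011), Reference(title='John', year=2010)]
b = [Reference(title='Matt', year=2011)]
# 'Matt' aligns across both lists and is scored with _similarity();
# 'John' appears only in `a`, forming a singleton group that scores 0.,
# so the mean is pulled below the matched pair's similarity.
score = _similarity_list(a, b)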
def test_extraction(self):
    from references.process import extract
    from references.services import refextract, cermine, grobid
    from references.process.merge import align, arbitrate, priors, beliefs, \
        normalize

    pdf_path = 'evaluation/pdfs/0801.0012.pdf'
    document_id = '0801.0012'

    extractions = {}
    extractions['cermine'] = cermine.extract_references(pdf_path)
    extractions['grobid'] = grobid.extract_references(pdf_path)
    extractions['refextract'] = refextract.extract_references(pdf_path)
    # with open('data/0801.0012.cermine.json', 'w') as f:
    #     json.dump(extractions['cermine'], f, indent=4,
    #               default=decimal_default)
    # with open('data/0801.0012.grobid.json', 'w') as f:
    #     json.dump(extractions['grobid'], f, indent=4,
    #               default=decimal_default)
    # with open('data/0801.0012.refextract.json', 'w') as f:
    #     json.dump(extractions['refextract'], f, indent=4,
    #               default=decimal_default)

    extractions = {
        extractor: normalize.normalize_records(extracted)
        for extractor, extracted in extractions.items()
    }
    # with open('data/0801.0012.normalized.json', 'w') as f:
    #     json.dump(extractions, f, indent=4, default=decimal_default)

    aligned_records = align.align_records(extractions)
    # with open('data/0801.0012.aligned.json', 'w') as f:
    #     json.dump(aligned_records, f, indent=4, default=decimal_default)

    aligned_probabilities = beliefs.validate(aligned_records)
    # with open('data/0801.0012.probabilities.json', 'w') as f:
    #     json.dump(aligned_probabilities, f, indent=4,
    #               default=decimal_default)

    arbitrated_records = arbitrate.arbitrate_all(aligned_records,
                                                 aligned_probabilities,
                                                 priors.EXTRACTORS, 3)
    # with open('data/0801.0012.arbitrated.json', 'w') as f:
    #     json.dump(arbitrated_records, f, indent=4, default=decimal_default)

    final_records, score = normalize.filter_records(arbitrated_records)
raw = [row for row in csv.reader(f)]
# Build one dict per data row, keyed by the header row; skip the header
# itself and any malformed rows.
referenceCounts = [{k: row[i] for i, k in enumerate(raw[0])}
                   for row in raw[1:] if len(row) == len(raw[0])]
for row in referenceCounts:
    full_path = os.path.join(basepath, row['pdf'])
    if not os.path.exists(full_path):
        continue
    document_id = row['pdf'][:-4]    # strip the '.pdf' extension
    print('Extracting %s' % document_id)
    extractions = extract.extract(full_path, document_id)
    # Compare per-extractor counts against the ground-truth count in 'N'.
    for extractor, refs in extractions.items():
        print(extractor, len(refs), row['N'])

    N_extractions = len(extractions)
    aligned_records = align.align_records(extractions)
    print('aligned', len(aligned_records), row['N'])
    aligned_probabilities = beliefs.validate(aligned_records)
    arbitrated_records = arbitrate.arbitrate_all(aligned_records,
                                                 aligned_probabilities,
                                                 priors.EXTRACTORS,
                                                 N_extractions)
    final_records, score = normalize.filter_records(arbitrated_records)
    print('final', len(final_records), row['N'], score)
    print('--')