def test_simple_records(self):
    """Regression test for alignment with fake records."""
    # 'Erick' vs 'Eric' exercises inexact title matching across extractors.
    docs = {
        'ext1': [
            Reference(title='Matt', year=2011),
            Reference(title='Erick', year=2013),
        ],
        'ext2': [
            Reference(title='Matt', year=2011),
        ],
        'ext3': [
            Reference(title='John', year=2010),
            Reference(title='Eric', year=2013),
        ]
    }
    # Expected output: one group per distinct cited reference, each group
    # a list of [extractor, reference] pairs.
    aligned_answer = [
        [["ext1", Reference(title='Matt', year=2011)],
         ["ext2", Reference(title='Matt', year=2011)]],
        [["ext1", Reference(title='Erick', year=2013)],
         ["ext3", Reference(title='Eric', year=2013)]],
        [["ext3", Reference(title='John', year=2010)]]
    ]
    aligned_calc = align.align_records(docs)
    for ref_ans, ref_calc in zip(aligned_answer, aligned_calc):
        self.assertDictEqual(dict(ref_ans), dict(ref_calc))
def merge_records(records: Dict[str, List[Reference]],
                  extractor_priors: list = EXTRACTORS) \
        -> Tuple[List[Reference], float]:
    """
    Merge extracted references into a single authoritative set of references.

    Takes reference metadata records from multiple extractors (each formatted
    according to the schema) and reconciles them with each other to form one
    primary record for each item. The first step is to match the lists
    against each other using similarity measures. Then, for each individual
    record, we combine the possible fields and augment them with possible
    external information to form a single record.

    Parameters
    ----------
    records : dict
        The reference records from multiple extraction/lookup services.
        Keys are extractor names, values are lists of references (dict).
        E.g. ``{"cermine": [references], "grobid": [references]}``.
    extractor_priors : list
        Represents prior level of trust in field output for each extractor.

    Returns
    -------
    list
        Authoritative reference metadata. Each item represents a single
        cited reference (``dict``).
    float
        Quality score for the final set of references.
    """
    N_extractions = len(records)
    records = {extractor: normalize.normalize_records(extraction)
               for extractor, extraction in records.items()}

    # Each stage is wrapped separately so that a failure can be attributed
    # to a specific step of the merge pipeline.
    try:
        aligned_records = align.align_records(records)
    except Exception as e:
        raise RuntimeError('Alignment failed: %s' % e) from e

    try:
        aligned_probabilities = beliefs.validate(aligned_records)
    except Exception as e:
        raise RuntimeError('Validation failed: %s' % e) from e

    try:
        arbitrated_records = arbitrate.arbitrate_all(aligned_records,
                                                     aligned_probabilities,
                                                     extractor_priors,
                                                     N_extractions)
    except Exception as e:
        raise RuntimeError('Arbitration failed: %s' % e) from e

    try:
        final_records, score = normalize.filter_records(arbitrated_records)
    except Exception as e:
        raise RuntimeError('Filtering failed: %s' % e) from e
    return final_records, score
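For orientation, a minimal usage sketch for ``merge_records``. The extractor names and reference fields below are made up for illustration, and ``Reference`` is assumed to accept keyword fields as in the alignment test above.

records = {
    'cermine': [Reference(title='Matt', year=2011)],
    'grobid': [Reference(title='Matt', year=2011),
               Reference(title='Eric', year=2013)],
}
final_records, score = merge_records(records)    # default EXTRACTORS priors
for ref in final_records:
    print(ref)    # one arbitrated record per distinct cited reference
print('score:', score)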
def _similarity_list(value_a: list, value_b: list) -> float:
    """Similarity of two lists, based on values (without regard to order)."""
    aligned = align_records({'a': value_a, 'b': value_b})
    scores = []
    for item in aligned:
        # A group with fewer than two members means the value was present
        # in only one of the lists; it contributes zero similarity.
        if len(item) != 2:
            scores.append(0.)
            continue
        _item: dict = {k: v for k, v in item}
        scores.append(_similarity(_item['a'], _item['b']))
    if not scores:    # Both lists were empty; nothing to compare.
        return 0.
    return mean(scores)
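A small sketch of how ``_similarity_list`` composes with alignment, reusing the fake records from the regression test above. The exact value depends on ``_similarity``, but any unmatched item contributes zero to the mean.

a = [Reference(title='Matt', year=2011), Reference(title='John', year=2010)]
b = [Reference(title='Matt', year=2011)]
# 'Matt' aligns across both lists and is scored with _similarity();
# 'John' appears only in `a`, forming a singleton group that scores 0.,
# so the mean is pulled below the matched pair's similarity.
score = _similarity_list(a, b)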
def test_extraction(self):
    from references.process import extract
    from references.services import refextract, cermine, grobid
    from references.process.merge import align, arbitrate, priors, beliefs, \
        normalize

    pdf_path = 'evaluation/pdfs/0801.0012.pdf'
    document_id = '0801.0012'

    extractions = {}
    extractions['cermine'] = cermine.extract_references(pdf_path)
    extractions['grobid'] = grobid.extract_references(pdf_path)
    extractions['refextract'] = refextract.extract_references(pdf_path)
    # with open('data/0801.0012.cermine.json', 'w') as f:
    #     json.dump(extractions['cermine'], f, indent=4,
    #               default=decimal_default)
    # with open('data/0801.0012.grobid.json', 'w') as f:
    #     json.dump(extractions['grobid'], f, indent=4,
    #               default=decimal_default)
    # with open('data/0801.0012.refextract.json', 'w') as f:
    #     json.dump(extractions['refextract'], f, indent=4,
    #               default=decimal_default)

    extractions = {
        extractor: normalize.normalize_records(extracted)
        for extractor, extracted in extractions.items()
    }
    # with open('data/0801.0012.normalized.json', 'w') as f:
    #     json.dump(extractions, f, indent=4, default=decimal_default)

    aligned_records = align.align_records(extractions)
    # with open('data/0801.0012.aligned.json', 'w') as f:
    #     json.dump(aligned_records, f, indent=4, default=decimal_default)

    aligned_probabilities = beliefs.validate(aligned_records)
    # with open('data/0801.0012.probabilities.json', 'w') as f:
    #     json.dump(aligned_probabilities, f, indent=4,
    #               default=decimal_default)

    arbitrated_records = arbitrate.arbitrate_all(aligned_records,
                                                 aligned_probabilities,
                                                 priors.EXTRACTORS, 3)
    # with open('data/0801.0012.arbitrated.json', 'w') as f:
    #     json.dump(arbitrated_records, f, indent=4, default=decimal_default)

    final_records, score = normalize.filter_records(arbitrated_records)
raw = [row for row in csv.reader(f)]
# Build one dict per data row, keyed by the header row; skip the header
# itself and any malformed rows.
referenceCounts = [{k: row[i] for i, k in enumerate(raw[0])}
                   for row in raw[1:] if len(row) == len(raw[0])]
for row in referenceCounts:
    full_path = os.path.join(basepath, row['pdf'])
    if not os.path.exists(full_path):
        continue
    document_id = row['pdf'][:-4]    # strip the '.pdf' extension
    print('Extracting %s' % document_id)
    extractions = extract.extract(full_path, document_id)
    # Compare per-extractor counts against the ground-truth count in 'N'.
    for extractor, refs in extractions.items():
        print(extractor, len(refs), row['N'])

    N_extractions = len(extractions)
    aligned_records = align.align_records(extractions)
    print('aligned', len(aligned_records), row['N'])
    aligned_probabilities = beliefs.validate(aligned_records)
    arbitrated_records = arbitrate.arbitrate_all(aligned_records,
                                                 aligned_probabilities,
                                                 priors.EXTRACTORS,
                                                 N_extractions)
    final_records, score = normalize.filter_records(arbitrated_records)
    print('final', len(final_records), row['N'], score)
    print('--')