def refextract_text():
    """Run refextract on a piece of text.

    Reads ``text`` from the JSON request body.  When the
    ``FEATURE_FLAG_ENABLE_REFEXTRACT_SERVICE`` config flag is set,
    extraction is delegated to the remote refextract service; otherwise
    the local ``extract_references_from_string`` is used.  The extracted
    references are deduplicated, mapped to the INSPIRE schema and
    matched against existing records.

    Returns:
        A JSON response with the matched references, or a JSON error
        message with HTTP status 500 when the remote service fails.
    """
    if current_app.config.get("FEATURE_FLAG_ENABLE_REFEXTRACT_SERVICE"):
        headers = {
            "Content-Type": "application/json",
            "Accept": "application/json",
        }
        data = {
            "journal_kb_data": create_journal_dict(),
            "text": request.json["text"],
        }
        response = requests.post(
            f"{current_app.config['REFEXTRACT_SERVICE_URL']}/extract_references_from_text",
            headers=headers,
            data=orjson.dumps(data),
        )
        if response.status_code != 200:
            # BUG FIX: the status code must be the second element of the
            # view's return tuple.  Passing 500 as a second argument to
            # jsonify() serializes it into the response body and still
            # returns HTTP 200 to the client.
            return jsonify({"message": "Can not extract references"}), 500
        extracted_references = response.json()["extracted_references"]
    else:
        extracted_references = extract_references_from_string(
            request.json["text"],
            override_kbs_files={"journals": create_journal_dict()},
            reference_format="{title},{volume},{page}",
        )
    deduplicated_extracted_references = dedupe_list(extracted_references)
    references = map_refextract_to_schema(deduplicated_extracted_references)
    match_result = match_references(references)
    return jsonify(match_result.get("matched_references"))
def extract_references_from_text(text, source=None, custom_kbs_file=None):
    """Extract references from ``text`` and return them in INSPIRE format.

    Args:
        text: raw reference text to process with refextract.
        source: optional source tag forwarded to the schema mapper.
        custom_kbs_file: accepted for interface compatibility; not read
            by this implementation.

    Returns:
        The extracted references mapped to the INSPIRE schema.
    """
    raw_references = extract_references_from_string(
        text,
        override_kbs_files=get_refextract_kbs_path(),
        reference_format=u'{title},{volume},{page}',
    )
    return map_refextract_to_schema(raw_references, source=source)
def refextract_text():
    """Run refextract on a piece of text.

    Extracts references from the ``text`` field of the JSON request
    body and returns them mapped to the INSPIRE schema as JSON.
    """
    raw_refs = extract_references_from_string(
        request.json['text'],
        override_kbs_files=get_refextract_kbs_path(),
        reference_format=u'{title},{volume},{page}')
    return jsonify(map_refextract_to_schema(raw_refs))
def extract_references_from_text(text, source=None, custom_kbs_file=None):
    """Extract references from ``text`` and return them in INSPIRE format.

    The refextract knowledge-base files are materialized on disk for the
    duration of the extraction via ``local_refextract_kbs_path``.

    Args:
        text: raw reference text to process with refextract.
        source: optional source tag forwarded to the schema mapper.
        custom_kbs_file: accepted for interface compatibility; not read
            by this implementation.

    Returns:
        The extracted references mapped to the INSPIRE schema.
    """
    with local_refextract_kbs_path() as kbs_path:
        raw_references = extract_references_from_string(
            text,
            override_kbs_files=kbs_path,
            reference_format=u'{title},{volume},{page}',
        )
    return map_refextract_to_schema(raw_references, source=source)
def refextract_text():
    """Run refextract on a piece of text.

    Extracts references from the ``text`` field of the JSON request
    body, maps them to the INSPIRE schema, matches them against
    existing records, and returns the matched references as JSON.
    """
    raw_refs = extract_references_from_string(
        request.json["text"],
        override_kbs_files={"journals": create_journal_dict()},
        reference_format="{title},{volume},{page}",
    )
    mapped_refs = map_refextract_to_schema(raw_refs)
    matched = match_references(mapped_refs)
    return jsonify(matched.get("matched_references"))
def refextract_text():
    """Run refextract on a piece of text.

    Extracts references from the ``text`` field of the JSON request
    body, using knowledge-base files materialized on disk for the
    duration of the extraction, and returns the INSPIRE-schema
    references as JSON.
    """
    with local_refextract_kbs_path() as kbs_path:
        raw_refs = extract_references_from_string(
            request.json['text'],
            override_kbs_files=kbs_path,
            reference_format=u'{title},{volume},{page}'
        )
    return jsonify(map_refextract_to_schema(raw_refs))
def refextract_text():
    """Run refextract on a piece of text.

    Extracts references from the ``text`` field of the JSON request
    body (with knowledge bases materialized on disk via
    ``local_refextract_kbs_path``), maps them to the INSPIRE schema,
    matches them against existing records, and returns the matched
    references as JSON.
    """
    with local_refextract_kbs_path() as kbs_path:
        raw_refs = extract_references_from_string(
            request.json["text"],
            override_kbs_files=kbs_path,
            reference_format="{title},{volume},{page}",
        )
    mapped_refs = map_refextract_to_schema(raw_refs)
    matched = match_references(mapped_refs)
    return jsonify(matched.get("matched_references"))
def get_references(rl):
    """Convert a JATS ``<ref-list>`` soup element into MARC-style references.

    Walks every ``<ref>`` child of *rl* (a BeautifulSoup element —
    presumably a parsed JATS reference list; verify against caller) and
    builds, for each, a list of ``(subfield_code, value)`` tuples:
    'x' raw reference text, 'r' arXiv id, 'a' DOI, 'o' reference label,
    '0' INSPIRE record id.

    Returns:
        list: one list of subfield tuples per extracted reference.
    """
    refs = []
    # convert individual references
    for ref in rl.find_all('ref'):
        (lt, refno) = ('', '')
        # reference label, e.g. "[12]" -> lt = "12"
        for label in ref.find_all('label'):
            lt = label.text.strip()
            lt = re.sub('\W', '', lt)
            # NOTE(review): \W stripping above removes any '[' before this
            # check, so the else branch appears to always run — confirm.
            if re.search('\[', lt):
                refno = '%s ' % (lt)
            else:
                refno = '[%s] ' % (lt)
        # --- journal and preprint citations ---
        for mc in ref.find_all(
                'element-citation',
                attrs={'publication-type': ['journal', 'preprint']}):
            (title, authors, pbn, doi, arxiv) = ('', [], '', '', '')
            # authors: "Givenname Surname"
            for nametag in mc.find_all('name'):
                name = ''
                for gn in nametag.find_all('given-names'):
                    name = gn.text.strip()
                for sn in nametag.find_all('surname'):
                    name += ' ' + sn.text.strip()
                authors.append(name)
            # title (cleanformulas presumably flattens inline math — confirm)
            for at in mc.find_all('article-title'):
                # title = at.text.strip()
                title = cleanformulas(at)
            # pubnote assembled as "Journal vol, No. issue (year) fpage-lpage"
            for source in mc.find_all('source'):
                pbn = source.text.strip()
            for volume in mc.find_all('volume'):
                pbn += ' ' + volume.text.strip()
            for issue in mc.find_all('issue'):
                pbn += ', No. ' + issue.text.strip()
            for year in mc.find_all('year'):
                pbn += ' (%s) ' % (year.text.strip())
            for fpage in mc.find_all('fpage'):
                pbn += ' ' + fpage.text.strip()
            for lpage in mc.find_all('lpage'):
                pbn += '-' + lpage.text.strip()
            for fpage in mc.find_all('elocation-id'):
                pbn += ' ' + fpage.text.strip()
            # run refextract on the pubnote to normalize the journal name
            repbn = extract_references_from_string(
                pbn,
                override_kbs_files={
                    'journals':
                    '/opt/invenio/etc/docextract/journal-titles-inspire.kb'
                },
                reference_format="{title},{volume},{page}")
            if repbn:
                if 'journal_reference' in repbn[0].keys():
                    # print ' [refextract] normalize "%s" to "%s"' % (pbn, repbn[0]['journal_reference'])
                    pbn = repbn[0]['journal_reference']
                else:
                    # no normalized journal reference: fall back to free text
                    for comment in mc.find_all('comment'):
                        pbn = comment.text.strip()
            # DOI
            for pi in mc.find_all('pub-id', attrs={'pub-id-type': 'doi'}):
                doi = pi.text.strip()
            # arXiv (new-style ids get the "arXiv:" prefix)
            for el in mc.find_all('ext-link',
                                  attrs={'ext-link-type': 'arxiv'}):
                arxiv = el.text.strip()
                if re.search('^\d\d\d\d\.\d\d\d\d', arxiv):
                    arxiv = 'arXiv:' + arxiv
            # assemble the subfield list for this citation
            if doi:
                reference = [('x', refno + '%s: %s, %s, DOI: %s' %
                              (', '.join(authors), title, pbn, doi))]
                if arxiv:
                    reference.append(('r', arxiv))
                reference.append(('a', 'doi:' + doi))
                if lt:
                    # 'o' subfield: numeric-only reference label
                    reference.append(('o', re.sub('\D', '', lt)))
            else:
                reference = [('x', refno + '%s: %s, %s' %
                              (', '.join(authors), title, pbn))]
                if arxiv:
                    reference.append(('r', arxiv))
            refs.append(reference)
        # --- book and conference-proceedings citations ---
        for mc in ref.find_all(
                'element-citation',
                attrs={'publication-type': ['confproc', 'book']}):
            (atitle, btitle, editors, authors, pbn, bpbn, doi) = \
                ('', '', [], [], '', '', '')
            # authors/editors, split by person-group-type
            for pg in mc.find_all('person-group'):
                for nametag in mc.find_all('name'):
                    name = ''
                    for gn in nametag.find_all('given-names'):
                        name = gn.text.strip()
                    for sn in nametag.find_all('surname'):
                        name += ' ' + sn.text.strip()
                    if pg['person-group-type'] == 'author':
                        authors.append(name)
                    elif pg['person-group-type'] == 'editor':
                        editors.append(name)
            # article title (chapter/contribution inside the book)
            for at in mc.find_all('article-title'):
                atitle = cleanformulas(at)
                # atitle = at.text.strip()
            # book / conference title
            for source in mc.find_all('source'):
                btitle = cleanformulas(source)
                # btitle = source.text.strip()
            for source in mc.find_all('conf-name'):
                btitle += ' ' + cleanformulas(source)
            # book pubnote: "Publisher, Location, Year"
            for publishername in mc.find_all('publisher-name'):
                bpbn = publishername.text.strip() + ', '
            for publisherloc in mc.find_all('publisher-loc'):
                bpbn += publisherloc.text.strip() + ', '
            for year in mc.find_all('year'):
                bpbn += year.text.strip()
            # page range
            for fpage in mc.find_all('fpage'):
                pbn += ' ' + fpage.text.strip()
            for lpage in mc.find_all('lpage'):
                pbn += '-' + lpage.text.strip()
            # assemble: chapter-in-book when an article title exists
            if atitle:
                refs.append([('x', refno + '%s: %s, pages %s in: %s: %s, %s' %
                              (', '.join(authors), atitle, pbn,
                               ', '.join(editors), btitle, bpbn))])
            else:
                refs.append([('x', refno + '%s: %s, %s' %
                              (', '.join(authors), btitle, bpbn))])
        # --- everything else (mixed citations) ---
        for mc in ref.find_all('mixed-citation',
                               attrs={'publication-type': 'other'}):
            (doi, recid, arxiv) = ('', '', '')
            # INSPIRE links: resolve record ids and strip the link elements
            inspirelink = ''
            for el in mc.find_all('ext-link',
                                  attrs={'ext-link-type': 'uri'}):
                if el.has_attr('xlink:href'):
                    link = el['xlink:href']
                    if re.search('inspirehep.net.*IRN', link):
                        # legacy SPIRES IRN: look up the current record id
                        irn = re.sub('.*\D', '', link)
                        # inspire2
                        for recid in search_pattern(p='970__a:SPIRES-' + irn):
                            # inspire2
                            inspirelink += \
                                ', https://old.inspirehep.net/record/%i' % (
                                    recid)
                        # inspire2
                        el.decompose()
                    elif re.search('inspirehep.net.*recid', link):
                        recid = re.sub('.*\D', '', link)
                        inspirelink += \
                            ', https://old.inspirehep.net/record/%s' % (recid)
                        el.decompose()
                    elif re.search('inspirehep.net', link):
                        el.decompose()
                    elif re.search('arxiv.org', link):
                        # strip spaces and surrounding brackets from the id
                        arxiv = re.sub(' ', '', el.text.strip())
                        arxiv = re.sub('^\[', '', arxiv)
                        arxiv = re.sub('(\d)\]$', r'\1', arxiv)
                        if re.search('^\d{4}\.\d', arxiv):
                            arxiv = 'arXiv:' + arxiv
                        elif re.search('ar[xX]iv\:[a-z\-]+\/\d', arxiv):
                            # old-style id: drop the "arXiv:" prefix
                            arxiv = arxiv[6:]
                        el.decompose()
            # missing spaces? pad bold text so words do not run together
            for bold in mc.find_all('bold'):
                bt = bold.text.strip()
                bold.replace_with(' %s ' % (bt))
            # DOI: replace the element with inline ", DOI: ..." text
            for pi in mc.find_all('pub-id', attrs={'pub-id-type': 'doi'}):
                doi = pi.text.strip()
                pi.replace_with(', DOI: %s' % (doi))
            # assemble the subfield list for this citation
            reference = [('x', refno + cleanformulas(mc))]
            # reference = [('x', refno + mc.text.strip())]
            if doi:
                reference.append(('a', 'doi:' + doi))
            if recid:
                reference.append(('0', str(recid)))
            if arxiv:
                reference.append(('r', arxiv))
            if doi or recid or arxiv:
                if lt:
                    reference.append(('o', re.sub('\D', '', lt)))
            refs.append(reference)
    return refs
def refextract_url():
    """Run refextract on a URL.

    Extracts references from the document at the ``url`` field of the
    JSON request body and returns them mapped to the INSPIRE schema
    as JSON.
    """
    raw_refs = extract_references_from_string(request.json['url'])
    return jsonify(map_refextract_to_schema(raw_refs))
def refextract_text():
    """Run refextract on a piece of text.

    Extracts references from the ``text`` field of the JSON request
    body and returns them mapped to the INSPIRE schema as JSON.
    """
    raw_refs = extract_references_from_string(request.json['text'])
    return jsonify(map_refextract_to_schema(raw_refs))
def get_references(url, clean='jacow'):
    """Download a PDF, extract its reference section and run refextract.

    Caches intermediate artifacts under ``tmppath``: the downloaded PDF,
    the pdftotext output (``<name>.txt``) and the cleaned reference text
    (``<name>_clean.txt``).  If the cleaned file already exists it is
    reused and the download/clean pipeline is skipped.

    NOTE(review): this is Python-2-era code — ``lines.sort(cmp=...)``,
    ``str``/``unicode`` decode round-trips and ``codecs.EncodedFile``
    do not work unchanged on Python 3; confirm the target interpreter.

    Args:
        url: URL of the PDF to fetch (fetched with wget via os.system).
        clean: cleaning strategy: 'jacow', 'moriond', 'linebreaks', or
            anything else for the generic reference-section extractor.

    Returns:
        list: one list of MARC ``(subfield_code, value)`` tuples per
        extracted reference, each tagged ``('9', 'refextract')``.
    """
    from refextract import extract_references_from_string
    filename = url.split('/')[-1]
    if os.path.isfile('%s/%s_clean.txt' % (tmppath, filename[:-4])):
        # cached cleaned text exists: reuse it
        controlfile = codecs.EncodedFile(
            codecs.open('%s/%s_clean.txt' % (tmppath, filename[:-4])), 'utf8')
        fulltext = controlfile.read()
        fulltext = fulltext.decode("utf-8")
        controlfile.close()
    else:
        # download and convert the PDF if the text dump is not cached
        if not os.path.isfile('%s/%s.txt' % (tmppath, filename[:-4])):
            if not os.path.isfile('%s/%s' % (tmppath, filename)):
                os.system('wget -q -O %s%s %s' % (tmppath, filename, url))
            os.system('/usr/bin/pdftotext %s%s' % (tmppath, filename))
        infile = codecs.EncodedFile(
            codecs.open('%s/%s.txt' % (tmppath, filename[:-4])), 'utf8')
        fulltext = infile.readlines()
        fulltext = [line.decode("utf-8") for line in fulltext]
        # pick the publisher-specific cleaning strategy
        if clean == 'jacow':
            fulltext = clean_fulltext_jacow(fulltext, verbose=1)
        elif clean == 'moriond':
            fulltext = clean_fulltext_moriond(fulltext)
        elif clean == 'linebreaks':
            fulltext = '\n'.join(fulltext) + '\n'
            fulltext = clean_linebreaks(fulltext)
        else:
            fulltext = '\n'.join(fulltext) + '\n'
            fulltext = get_reference_section(fulltext)
        infile.close()
        if '[2]' in fulltext:
            # numbered references: sort by label and log gaps/unnumbered
            # lines to a per-file .log under publisherdatapath
            lines = fulltext.split('\n')
            lines.sort(cmp=by_number)  # NOTE(review): Python 2 only
            fulltext = '\n'.join(lines)
            last_number = 0
            errors = ''
            for line in lines:
                number = re_number.search(line)
                if number:
                    this_number = int(number.group(1))
                    if not this_number - last_number == 1:
                        errors += '%s: [%s] followed by [%s]\n' % (
                            filename[:-4], last_number, this_number)
                    last_number = this_number
                elif last_number:
                    errors += '%s: No number for %s\n' % (filename[:-4],
                                                          line[:30])
            if errors:
                reflog_file = open('%s/%s.log' % (publisherdatapath,
                                                  filename[:-4]),
                                   mode='wb')
                reflog_file.write(errors)
                reflog_file.close()
        # cache the cleaned text for the next run
        controlfile = codecs.EncodedFile(
            codecs.open('%s/%s_clean.txt' % (tmppath, filename[:-4]),
                        mode='wb'), 'utf8')
        controlfile.write(fulltext.encode("utf-8"))
        controlfile.close()
    refs = extract_references_from_string(
        fulltext,
        is_only_references=False,
        override_kbs_files={
            'journals':
            '/opt/invenio/etc/docextract/journal-titles-inspire.kb'
        },
        reference_format="{title},{volume},{page}")
    references = []
    # mappings for references in JSON to MARC subfield codes
    mappings = {
        'doi': 'a',
        'collaborations': 'c',
        'document_type': 'd',
        'author': 'h',
        'isbn': 'i',
        'texkey': 'k',
        'misc': 'm',
        'journal_issue': 'n',
        'label': 'o',
        'linemarker': 'o',
        'reportnumber': 'r',
        'journal_reference': 's',
        'title': 't',
        'urls': 'u',
        'url': 'u',
        'raw_ref': 'x',
        # 'journal_title': None,
        # 'journal_volume': None,
        # 'journal_page': None,
        # 'journal_year': None,
        # 'publisher': None,
        'year': 'y'
    }
    # every reference is tagged with provenance subfield ('9', 'refextract');
    # keys without a mapping are silently dropped
    for ref in refs:
        entryaslist = [('9', 'refextract')]
        for key in ref.keys():
            if key in mappings:
                for entry in ref[key]:
                    entryaslist.append((mappings[key], entry))
        # else:
        #     print 'no mapping for', key
        references.append(entryaslist)
    return references