def extract_references(self, req, form):
    """References extraction demo page.

    Lets authors submit a PDF upload, an arXiv id, a URL or plain text
    and preview the references our extraction pipeline finds.
    """
    user_info = collect_user_info(req)

    def _submitted(field):
        # Non-empty value of a POSTed field, or None when absent/blank.
        if field in form and form[field].value:
            return form[field].value
        return None

    # Exactly one input source is honoured, in priority order.
    references_xml = None
    pdf_data = _submitted('pdf')
    arxiv_id = _submitted('arxiv')
    raw_url = _submitted('url')
    raw_txt = _submitted('txt')
    if pdf_data:
        references_xml = extract_from_pdf_string(pdf_data)
    elif arxiv_id:
        references_xml = extract_references_from_url_xml(
            make_arxiv_url(arxiv_id=arxiv_id))
    elif raw_url:
        references_xml = extract_references_from_url_xml(raw_url)
    elif raw_txt:
        references_xml = extract_references_from_string_xml(raw_txt)

    if references_xml:
        # Hide the upload link and show the formatted references.
        out = """
        <style type="text/css">
            #referenceinp_link {
                display: none;
            }
        </style>
        """
        out += format_record(0, 'hdref',
                             xml_record=references_xml.encode('utf-8'),
                             user_info=user_info)
    else:
        # Nothing uploaded yet: show the submission form.
        out = self.extract_references_template()

    # Render the page (including header, footer)
    return page(title='References Extractor',
                body=out,
                uid=user_info['uid'],
                req=req)
def extract(self, req, form):
    """References extraction demo page.

    Authors can submit a PDF upload, an arXiv id, a URL or plain text
    against our references extraction process and preview the result.
    """
    user_info = collect_user_info(req)

    # Honour exactly one of the POSTed inputs, in priority order.
    references_xml = None
    if 'pdf' in form and form['pdf'].value:
        references_xml = extract_from_pdf_string(form['pdf'].value.strip())
    elif 'arxiv' in form and form['arxiv'].value:
        arxiv_url = make_arxiv_url(arxiv_id=form['arxiv'].value.strip())
        references_xml = extract_references_from_url_xml(arxiv_url)
    elif 'url' in form and form['url'].value:
        target = form['url'].value.strip()
        try:
            references_xml = extract_references_from_url_xml(target)
        except (FullTextNotAvailable, ConnectionError, HTTPError, Timeout):
            # Document unreachable or unusable: fall back to the form.
            references_xml = None
    elif 'txt' in form and form['txt'].value:
        # Tolerate invalid bytes in user-pasted text.
        pasted = form['txt'].value.decode('utf-8', 'ignore')
        references_xml = extract_references_from_string_xml(pasted)

    if references_xml:
        references_html = format_record(0, 'hdref',
                                        xml_record=references_xml,
                                        user_info=user_info)
        out = docextract_templates.tmpl_web_result(references_html)
    else:
        # Nothing (successfully) submitted yet: show the upload form.
        out = docextract_templates.tmpl_web_form()

    # Render the page (including header, footer)
    return page(title='References Extractor',
                body=out,
                uid=user_info['uid'],
                req=req)
def extract_references_pdf_url(self, req, form):
    """Extract references from the PDF behind the submitted URL.

    Returns the extracted references as XML, or a short error string
    when the URL is missing or invalid. Requires a logged-in user.
    """
    check_login(req)
    if 'url' in form:
        target = form['url'].value
        if check_url(target):
            return extract_references_from_url_xml(target)
        return 'Invalid URL specified'
    return 'No URL specified'
def extract(self, req, form):
    """References extraction demo page.

    Lets authors run a PDF upload, an arXiv id, a URL or plain text
    through our references extraction process and view the result.
    """
    user_info = collect_user_info(req)

    def _value_of(name):
        # Non-empty POSTed value for *name*, or None.
        if name in form and form[name].value:
            return form[name].value
        return None

    # First matching input wins.
    references_xml = None
    pdf_bytes = _value_of('pdf')
    arxiv_id = _value_of('arxiv')
    doc_url = _value_of('url')
    raw_text = _value_of('txt')
    if pdf_bytes:
        references_xml = extract_from_pdf_string(pdf_bytes)
    elif arxiv_id:
        references_xml = extract_references_from_url_xml(
            make_arxiv_url(arxiv_id=arxiv_id))
    elif doc_url:
        references_xml = extract_references_from_url_xml(doc_url)
    elif raw_text:
        # Drop invalid bytes from user-pasted text.
        references_xml = extract_references_from_string_xml(
            raw_text.decode('utf-8', 'ignore'))

    if references_xml:
        references_html = format_record(0, 'hdref',
                                        xml_record=references_xml,
                                        user_info=user_info)
        out = docextract_templates.tmpl_web_result(references_html)
    else:
        # No input yet: display the submission form.
        out = docextract_templates.tmpl_web_form()

    # Render the page (including header, footer)
    return page(title='References Extractor',
                body=out,
                uid=user_info['uid'],
                req=req)
def replace_references(recid, uid=None, txt=None, url=None):
    """Rebuild the references of a record and return the updated MARCXML.

    The stored record itself is NOT modified; only the MARCXML of the
    cached document with its 999 fields replaced is returned.

    Parameters:
    * recid: id of the record whose references are replaced
    * uid: user id owning the edit cache for this record
    * txt: optional raw references in text form
    * url: optional URL of a document to extract references from

    Returns the updated MARCXML string, or None when extraction
    produced no references to add.
    """
    # Choose the extraction source: explicit text, then URL, then the
    # record's own fulltext.
    if txt is not None:
        refs_xml = extract_references_from_string_xml(
            txt, is_only_references=True)
    elif url is not None:
        refs_xml = extract_references_from_url_xml(url)
    else:
        refs_xml = extract_references_from_record_xml(recid)
    parsed_refs = create_record(refs_xml.encode('utf-8'))

    # Only the cached record structure is needed from the cache tuple.
    cache = get_cache_file_contents(recid, uid)
    record = cache[2]

    out_xml = None
    new_references = record_get_field_instances(
        parsed_refs[0], tag='999', ind1='C', ind2='5')
    extraction_status = record_get_field_instances(
        parsed_refs[0], tag='999', ind1='C', ind2='6')
    if new_references:
        # Swap every existing 999 field for the freshly extracted ones,
        # keeping the refextract status fields alongside them.
        record_delete_fields(record, '999')
        record_add_fields(record, '999', new_references)
        record_add_fields(record, '999', extraction_status)
        out_xml = record_xml_output(record)

    return out_xml
def replace_references(recid, uid=None, txt=None, url=None):
    """Re-extract a record's references and return the updated MARCXML.

    The record itself is not updated; the MARCXML of the cached
    document with replaced 999 fields is returned instead.

    Parameters:
    * recid: id of the record to process
    * uid: user id owning the edit cache for this record
    * txt: optional references supplied as plain text
    * url: optional URL of a document to extract references from

    Returns the updated MARCXML string, or None when no references
    were extracted.
    """
    # Extraction source priority: supplied text, then URL, then the
    # record's own fulltext.
    if txt is not None:
        refs_xml = extract_references_from_string_xml(
            txt, is_only_references=True)
    elif url is not None:
        refs_xml = extract_references_from_url_xml(url)
    else:
        refs_xml = extract_references_from_record_xml(recid)
    parsed_refs = create_record(refs_xml)

    # Only the cached record structure is used from the cache tuple.
    cache = get_cache_contents(recid, uid)
    record = cache[2]

    out_xml = None
    new_references = record_get_field_instances(
        parsed_refs[0], tag='999', ind1='C', ind2='5')
    extraction_status = record_get_field_instances(
        parsed_refs[0], tag='999', ind1='C', ind2='6')
    if new_references:
        # Replace all existing 999 fields with the new references plus
        # the refextract status fields.
        record_delete_fields(record, '999')
        record_add_fields(record, '999', new_references)
        record_add_fields(record, '999', extraction_status)
        out_xml = record_xml_output(record)

    return out_xml
def extract(self, req, form):
    """References and plot extraction demo page.

    Authors can submit a PDF upload, an arXiv id, a URL or plain text;
    the page shows the extracted references and, for the PDF and arXiv
    inputs, the plots extracted from the document (copied into a shared
    web-visible directory and rendered with their captions).
    """
    user_info = collect_user_info(req)
    plots = None            # HTML fragment describing plot-extraction progress
    list_image_names = []   # basenames of extracted plot images (arXiv branch only)
    list_caption = []       # captions parallel to list_image_names
    plots_dir = os.path.join(CFG_PREFIX, "var/www/img/plots/")
    # unique folder name
    # Handle the 3 POST parameters
    # NOTE(review): four inputs (pdf/arxiv/url/txt) are actually handled,
    # and plots_dir is shared and wiped per request — concurrent requests
    # would clobber each other's images. TODO confirm intended.
    if 'pdf' in form and form['pdf'].value:
        pdf = form['pdf'].value
        references_xml = extract_from_pdf_string(pdf)
        # Re-read the upload and persist it to a temp file so the external
        # plot extractor can be pointed at a path.
        pdf_string = form['pdf'].file.read()
        pdf = safe_mkstemp('extract.pdf')
        f = open(pdf, 'w')
        f.write(pdf_string)
        f.close()
        plots = 'File pdf: ' + str(pdf) + '<br />'
        (exit_code, output_buffer, stderr_output_buffer) = \
            run_shell_command(CFG_PDFPLOTEXTRACTOR_PATH + ' ' + pdf)
        # The extractor writes its findings next to the input PDF.
        plotextracted_pdf_path = pdf + ".extracted/extracted.json"
        code, figures, extracted = merging_articles(None, plotextracted_pdf_path)
        id_fulltext = ""
        marc_path = create_MARCXML(figures, id_fulltext, code, extracted,
                                   write_file=True)
        plots += marc_path + '<br />'
        f = open(marc_path, 'r')
        record_xml = f.read()
        f.close()
        #plots_dir = "/opt/invenio/var/www/img/plots/"
        # Start from an empty web-visible plots directory.
        if os.path.exists(plots_dir):
            shutil.rmtree(plots_dir)
        os.mkdir(plots_dir)
        re_list = REGEXP_RECORD.findall(record_xml)
        for r in re_list:
            re_subfield = REGEXP_SUBFIELD_A.findall(r)
            for index, image_path in enumerate(re_subfield):
                # Only the first image of each record is copied.
                if index == 0:
                    run_shell_command('cp ' + image_path + ' ' + plots_dir)
        # NOTE(review): unlike the arxiv branch, list_image_names/list_caption
        # are never filled here, so the rendering loop below would hit
        # list_image_names.index(fname) with no match — verify this branch.
    elif 'arxiv' in form and form['arxiv'].value:
        plots = ""
        url_pdf = make_arxiv_url(arxiv_id=form['arxiv'].value)
        references_xml = extract_references_from_url_xml(url_pdf)
        # Plots come from the LaTeX source tarball plus the rendered PDF.
        url_tarball = make_arxiv_tar_url(arxiv_id=form['arxiv'].value)
        plotextracted_xml_path, plotextracted_pdf_path = \
            extract_plots_from_latex_and_pdf(url_tarball, url_pdf)
        plots += 'TAR: ' + plotextracted_xml_path + '<br />'
        plots += 'PDF: ' + plotextracted_pdf_path + '<br />'
        '''
        code, figures, extracted = merging_latex_pdf(plotextracted_xml_path,
                                                     None, "", )
        id_fulltext = ""
        marc_path = create_MARCXML(figures, id_fulltext,
                                   code, extracted, write_file=True)
        '''
        dest_dir = os.path.join(CFG_TMPDIR, 'textmining')
        # Working directory may already exist from a previous run.
        try:
            os.mkdir(dest_dir)
        except OSError:
            pass
        code, message, figures, marc_path = \
            merging_latex_pdf(plotextracted_xml_path, "", "", dest_dir)
        plots += 'OUTPUT: ' + marc_path + '<br />'
        f = open(marc_path, 'r')
        record_xml = f.read()
        f.close()
        # Start from an empty web-visible plots directory.
        if os.path.exists(plots_dir):
            shutil.rmtree(plots_dir)
        os.mkdir(plots_dir)
        re_list = REGEXP_RECORD.findall(record_xml)
        for r in re_list:
            re_subfield = REGEXP_SUBFIELD_A.findall(r)
            re_subfield_caption = REGEXP_SUBFIELD_D.findall(r)
            for index, image_path in enumerate(re_subfield):
                # Only the first image per record is copied, but every
                # image name/caption pair is remembered for rendering.
                if index == 0:
                    run_shell_command('cp ' + image_path + ' ' + plots_dir)
                list_image_names.append(os.path.split(image_path)[1])
                list_caption.append(re_subfield_caption[index])
    elif 'url' in form and form['url'].value:
        url = form['url'].value
        references_xml = extract_references_from_url_xml(url)
        plots = "ME3"
    elif 'txt' in form and form['txt'].value:
        txt = form['txt'].value
        references_xml = extract_references_from_string_xml(txt)
    else:
        references_xml = None
    # If we have not uploaded anything yet
    # Display the form that allows us to do so
    if not references_xml:
        out = self.extract_references_template()
    else:
        # Hide the upload link on the results view.
        out = """
        <style type="text/css">
            #referenceinp_link {
                display: none;
            }
            /*img.plot {
                width: 250px;
                height: 250px;
            }*/
        </style>
        """
        out += format_record(0, 'hdref',
                             xml_record=references_xml.encode('utf-8'),
                             user_info=user_info)
        if plots:
            out += "<h2>Plots</h2>"
            out += plots
            dirList = os.listdir(plots_dir)
            for i, fname in enumerate(dirList):
                out += '<h3>Figure ' + str(i+1) + '</h3> <p><img src="/img/plots/' + fname + '" class="plot"></p>'
                # Look up the caption recorded for this image name.
                index = list_image_names.index(fname)
                out += '<p>' + list_caption[index] + '</p>'
    # Render the page (including header, footer)
    return page(title='Document Extractor', body=out, uid=user_info['uid'],
                req=req)