def extract_one(config, pdf_path):
    """Extract references from one file.

    :param config: options object; only ``treat_as_reference_section`` is
        read here.
    :param pdf_path: path of the file to process.
    :return: whatever the underlying ``extract_references_from_*_xml``
        helper returns (presumably the references as XML -- confirm
        against those helpers).
    :raises IOError: if ``pdf_path`` cannot be opened (first branch).
    :raises UnicodeDecodeError: if the file is not valid UTF-8 (first
        branch).
    """
    if config.treat_as_reference_section:
        # The whole file is taken to be the reference section: read it as
        # UTF-8 text and hand the decoded string straight to the extractor.
        # Reading in binary mode and decoding once behaves the same on
        # Python 2 and Python 3, and the ``with`` block guarantees the
        # handle is closed even if read/decode raises.
        with open(pdf_path, 'rb') as docfile:
            docbody = docfile.read().decode('utf-8')
        out = extract_references_from_string_xml(docbody)
    else:
        # Otherwise let the file-based extractor work from the path itself.
        write_message("* processing pdffile: %s" % pdf_path, verbose=2)
        out = extract_references_from_file_xml(pdf_path)
    return out
def extract_from_pdf_string(pdf):
    """Extract references from a PDF held in memory as a string.

    The extractor used here takes a file path, so the content is first
    spooled to a named temporary file under ``CFG_TMPSHAREDDIR`` and that
    file's name is passed on. The temporary file is closed -- and thereby
    removed -- when the extraction finishes, whether or not it succeeded.

    :param pdf: raw content of the PDF document.
    :return: the result of ``extract_references_from_file_xml`` for the
        temporary file.
    """
    with NamedTemporaryFile(prefix='docextract-pdf',
                            dir=CFG_TMPSHAREDDIR) as spool:
        spool.write(pdf)
        # Flush buffered data so the extractor sees the full content when
        # it opens the file by name.
        spool.flush()
        extracted = extract_references_from_file_xml(spool.name)
    return extracted