Exemplo n.º 1
0
def extract_one(config, pdf_path):
    """Extract references from one file.

    :param config: options object; only its ``treat_as_reference_section``
        attribute is read here.
    :param pdf_path: path of the file to process.
    :return: the result of ``extract_references_from_string_xml`` when
        ``config.treat_as_reference_section`` is true, otherwise the result
        of ``extract_references_from_file_xml``.
    """
    if config.treat_as_reference_section:
        # The file content is decoded as UTF-8 text and handed over as a
        # string instead of being processed through its path.
        # Binary mode makes read() return bytes, so .decode() is valid on
        # both Python 2 and Python 3; the with-block guarantees the handle
        # is closed even if reading or decoding fails.
        with open(pdf_path, 'rb') as ref_file:
            docbody = ref_file.read().decode('utf-8')
        out = extract_references_from_string_xml(docbody)
    else:
        write_message("* processing pdffile: %s" % pdf_path, verbose=2)
        out = extract_references_from_file_xml(pdf_path)

    return out
Exemplo n.º 2
0
def extract_one(config, pdf_path):
    """Extract references from one file.

    :param config: options object; only its ``treat_as_reference_section``
        attribute is read here.
    :param pdf_path: path of the file to process.
    :return: the value produced by ``extract_references_from_string_xml``
        (when ``config.treat_as_reference_section`` is true) or by
        ``extract_references_from_file_xml`` (otherwise).
    """
    if not config.treat_as_reference_section:
        write_message("* processing pdffile: %s" % pdf_path, verbose=2)
        return extract_references_from_file_xml(pdf_path)

    # Read the file as raw bytes and decode explicitly: text-mode read()
    # returns str on Python 3, which has no .decode(). The with-block
    # closes the handle deterministically instead of leaking it.
    with open(pdf_path, 'rb') as ref_file:
        docbody = ref_file.read().decode('utf-8')
    return extract_references_from_string_xml(docbody)
def extract_from_pdf_string(pdf):
    """Extract references from a pdf held in a string.

    The content of ``pdf`` is written to a temporary file created under
    CFG_TMPSHAREDDIR, and the path of that file is passed to
    ``extract_references_from_file_xml``.
    A temporary file is needed because pdf2text has to be run on it.
    """
    # Leaving the with-block closes the temporary file, which also deletes it.
    with NamedTemporaryFile(prefix='docextract-pdf',
                            dir=CFG_TMPSHAREDDIR) as pdf_file:
        pdf_file.write(pdf)
        # Make sure the data is on disk before the file is used via its path.
        pdf_file.flush()
        return extract_references_from_file_xml(pdf_file.name)
def extract_from_pdf_string(pdf):
    """Run reference extraction on pdf data supplied as a string.

    ``pdf`` is saved to a temporary file inside CFG_TMPSHAREDDIR and the
    file name is given to ``extract_references_from_file_xml``.
    The temporary file exists because pdf2text must be run on a file.
    """
    temp_pdf = NamedTemporaryFile(prefix='docextract-pdf', dir=CFG_TMPSHAREDDIR)
    # The context manager closes the file on exit; closing also deletes it.
    with temp_pdf:
        temp_pdf.write(pdf)
        # Flush so the written data is visible when the file is opened by name.
        temp_pdf.flush()
        extracted = extract_references_from_file_xml(temp_pdf.name)

    return extracted