Пример #1
0
def pdfrefs(in_fp):
    """Extract the numbered reference list from a PDF file.

    ``in_fp`` is handed unchanged to ``pdf2xml``; the resulting tree's
    ``text`` elements are scanned in document order. Everything up to the
    first element whose text contains "references" (case-insensitive) is
    ignored. After that, an element whose text starts with ``[N] `` opens
    reference ``N`` and every following non-matching element is treated as a
    continuation line of the most recently opened reference.

    Returns:
        dict mapping the reference number (int) to the reference text (str),
        with continuation lines joined by single spaces. Empty if no
        reference entries were found.
    """
    xml = pdf2xml(in_fp)

    # Raw string: '\[' and '\d' are invalid escapes in a normal literal.
    ref_re = re.compile(r'\[(\d+)\] (.*)')
    in_ref = False
    refs = {}
    key = None

    for t in xml.iter('text'):
        if not t.text:
            continue

        if not in_ref:
            # Only look for the heading while still outside the section;
            # once inside, an entry that merely mentions the word
            # "references" must be kept rather than skipped.
            if 'references' in t.text.lower():
                in_ref = True
            continue

        ref_m = ref_re.match(t.text)

        if ref_m:
            key = int(ref_m.group(1))
            refs[key] = [ref_m.group(2)]
        elif key is not None:
            # Continuation of the current entry. Lines seen before the first
            # "[N]" entry have nothing to attach to and are dropped (this
            # used to raise KeyError on refs[None]).
            refs[key].append(t.text)

    return {k: ' '.join(v) for k, v in refs.items()}
Пример #2
0
def pdftitle(in_fp):
    """Guess the title of a PDF as the page-1 text set in the largest font.

    ``in_fp`` is handed unchanged to ``pdf2xml``. The ``fontspec`` elements
    of the page numbered 1 are inspected and the one with the largest
    ``size`` is selected (the last one wins on ties). All page-1 text using
    that font is concatenated and its whitespace is normalised.

    NOTE(review): relies on ``pdf2xml`` returning an lxml tree (``.xpath``
    is used) -- confirm against the helper.

    Returns:
        The title as a single-line string, or ``''`` when page 1 declares no
        fonts or has no text in the selected font.
    """
    xml = pdf2xml(in_fp)

    fontspecs = xml.findall("./page[@number='1']/fontspec")
    if not fontspecs:
        # Nothing to choose from; avoid querying for font='None'.
        return ''

    # max() keeps the first maximal item, so scan in reverse to preserve the
    # "last fontspec wins on equal size" rule.
    largest = max(reversed(fontspecs), key=lambda f: int(f.attrib['size']))
    font_id = int(largest.attrib['id'])

    # Restrict to page 1: the candidate fonts were collected from page 1
    # only, and the same font id may be used by unrelated text on later
    # pages, which must not leak into the title.
    titles = xml.xpath(
        "./page[@number='1']//text[@font='{0}']"
        "/descendant-or-self::text()".format(font_id)
    )
    title = ' '.join(titles).strip()

    # Collapse runs of whitespace (including newlines) into single spaces.
    return re.sub(r'\s+', ' ', title)