Example #1
import logging
import os

# metautil, extract_pdf_text, HARDCODED_DOCS, group_name, todoke_type and
# update_meta are helpers defined elsewhere in the original module.
def check_pdf(pdf_dir, pdf_file):
    pdfpath = os.path.join(pdf_dir, pdf_file)
    metapath = metautil.meta_path(pdfpath)
    meta = metautil.get_meta(pdfpath)

    if meta is None:
        logging.warning('skip %s with no metadata' % pdfpath)
        return

    if meta.get('pagetype') == 'kanpo':
        # No problem, just not the type of doc we process
        return

    if meta.get('pagetype') != 'summary':
        logging.warning('skip %s with page type %s (expect summary)' % (pdfpath, meta.get('pagetype')))
        return

    if meta.get('srctitle') != u'政治資金規正法に基づく届出' or meta.get('docsettype') != u'報道資料':
        # No problem, just not the type of doc we process, or maybe already processed
        return

    pdftext = extract_pdf_text(pdfpath).decode('utf-8')

    # Some docs are really broken :(
    if pdf_file in HARDCODED_DOCS:
        gname = HARDCODED_DOCS[pdf_file]['gname']
        doctype = HARDCODED_DOCS[pdf_file]['doctype']
    elif '(cid:' in pdftext:
        logging.warning('%s contains unknown characters' % pdf_file)
        return
    else:
        lines = pdftext.splitlines()
        lines = [x.strip() for x in lines]
        gname = group_name(lines, pdf_file)
        if gname is None:
            logging.info("Couldn't determine a group for %s" % pdf_file)
            return
        doctype = todoke_type(lines, pdf_file)
        if doctype is None:
            logging.info("Couldn't determine a doctype for %s" % pdf_file)
            return
    assert gname is not None and doctype is not None
    update_meta(metapath, meta, gname, doctype)
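
A minimal driver sketch for the checker above (an assumption, not part of the original module): it wires check_pdf to a directory of downloaded PDFs; the command-line handling and the .pdf suffix filter are illustrative.

if __name__ == '__main__':
    import sys
    logging.basicConfig(level=logging.INFO)
    pdf_dir = sys.argv[1]  # assumed: directory holding the downloaded PDFs
    for pdf_file in sorted(os.listdir(pdf_dir)):
        if pdf_file.endswith('.pdf'):
            check_pdf(pdf_dir, pdf_file)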
Example #2
import logging
import os
import re
import subprocess

# metautil, MULTIPART_RE, _group_cache, get_or_make_group, get_or_make_docset
# and make_doc are helpers defined elsewhere in the original module.
def check_pdf(s, pdf_path, pdf_root, api_root, docs_by_url, nodefer, groupsonly):
    relative_path = pdf_path[len(pdf_root):]
    meta = metautil.get_meta(pdf_path)

    if meta is None:
        logging.warn("Skip %s which has no metadata!" % relative_path)
        return

    if 'title' not in meta or 'url' not in meta or 'srcurl' not in meta:
        logging.warn("Invalid metadata for %s!" % relative_path)
        return

    url = meta['url']
    if url in docs_by_url:
        # Verify the contents; the record should come from the same source.
        record = docs_by_url[url]
        if record['srcurl'] != meta['srcurl']:
            logging.warning('Different sources for %s at %s and in db: %s vs %s'
                            % (url, relative_path, record['srcurl'], meta['srcurl']))
        return
    else:
        gname = meta['srctitle']

        note = None

        notepart = MULTIPART_RE.search(gname)
        # logging.info(u"Group %s notepart %s." % (gname, notepart))
        if notepart is not None:
            notepart = notepart.groups()
            gname = notepart[0]
            note = notepart[1]

        if note:
            note = note.strip()
        gname = gname.strip()

        if 'grouptype' not in meta and gname not in _group_cache:
            if nodefer:
                logging.info(u"Recording %s as unknown." % (gname,))
                meta['grouptype'] = u'不明'
            else:
                logging.info(u"Defer %s (%s) to get more group data" % (relative_path, gname))
                return
        gtype = meta.get('grouptype')
        # Normalize group-type variants; some names carry a の or a qualifier.
        gtype_aliases = {
            u'政党の本部': u'政党本部',
            u'政党の支部': u'政党支部',
            u'総括文書(支部分)': u'政党支部',
            u'資金管理団体(国会議員関係政治団体を除く。)': u'資金管理団体',
            u'国会議員関係政治団体(政党の支部を除く。)': u'国会議員関係政治団体',
        }
        gtype = gtype_aliases.get(gtype, gtype)
        if gtype == u'政党':
            # This could be either honbu (headquarters) or shibu (branch).
            gtype = None

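        # The scraped title may carry a parent group name after a tab.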
        title = meta['title']
        title_parts = title.split('\t')
        parent = None
        if len(title_parts) == 2:
            parent = title_parts[1].strip()
            if len(parent) <= 1:
                parent = None

        group = get_or_make_group(s, api_root, gname, gtype, parent)
        if group is None:
            # Something went wrong, e.g. an unknown group type.
            return

        docdir, docfname = os.path.split(relative_path)
        docset = get_or_make_docset(s, api_root, title, meta['docsettype'], docdir)

        if groupsonly:
            return

        # Collect pdf stats: file size and page count (via pdfinfo).
        fsize = os.stat(pdf_path).st_size
        pagesre = re.compile(r'Pages:\s+(\d+)')
        p1 = subprocess.Popen(['pdfinfo', pdf_path], stdout=subprocess.PIPE)
        (stdoutdata, stderrdata) = p1.communicate()
        pagecount = 0
        m = pagesre.search(stdoutdata)
        if m:
            pagecount = int(m.groups()[0])
        else:
            logging.warning('Could not parse a page count for %s' % relative_path)

        # Finally, create the document record.
        document = make_doc(s, api_root, docset['id'], meta['year'],
                            group['id'], docfname, meta['url'], meta['srcurl'],
                            fsize, pagecount, note)
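
A hedged usage sketch for this variant, assuming s is an HTTP session (e.g. requests.Session) and docs_by_url maps already-ingested documents by URL; PDF_ROOT, API_ROOT and the directory walk below are illustrative assumptions, not part of the original module.

if __name__ == '__main__':
    import requests  # assumption: s is a requests-style session
    logging.basicConfig(level=logging.INFO)
    PDF_ROOT = '/data/pdfs/'                 # hypothetical mirror root
    API_ROOT = 'http://localhost:8000/api/'  # hypothetical API endpoint
    s = requests.Session()
    docs_by_url = {}  # would normally be preloaded from the API
    for dirpath, dirnames, filenames in os.walk(PDF_ROOT):
        for fname in filenames:
            if fname.endswith('.pdf'):
                check_pdf(s, os.path.join(dirpath, fname), PDF_ROOT, API_ROOT,
                          docs_by_url, nodefer=False, groupsonly=False)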