def cache_pdf(session, url, srcurl, site_base_url, ptype, title, srctitle,
              grouptype, year, docsettype, more_meta=None):
    """Fetch a PDF (with caching) and write a ``key,value`` metadata file beside it.

    The PDF is cached under ``pdf_cache_dir`` at a path mirroring the URL's
    path below ``site_base_url``.  Metadata is written one ``key,value`` line
    per field, with text values UTF-8 encoded.

    Args:
        session: HTTP session, passed through to ``save``.
        url: PDF URL; normalised against ``srcurl`` before use.
        srcurl: URL of the page the PDF was linked from.
        site_base_url: prefix stripped from ``url`` to build the cache path.
        ptype: page type, recorded under the ``pagetype`` key.
        title: document title.
        srctitle: title of the source page.
        grouptype: group classification; may be None/empty.
        year: year string for the document.
        docsettype: document-set classification.
        more_meta: optional iterable of extra ``(key, value)`` metadata pairs.

    Raises:
        Whatever ``save`` or the file writes raise; on any failure after the
        metadata file was opened, the partial file is removed before
        re-raising.
    """
    url = normalise(url, srcurl)
    if url in _seen:
        # Each PDF is handled at most once per run.
        logging.warning("ALREADY SEEN PDF %s" % url)
        return
    # Mirror the URL path (relative to the site root) under the cache dir.
    pdf_cache_file = url[len(site_base_url):]
    if pdf_cache_file.startswith('/'):
        pdf_cache_file = pdf_cache_file[1:]
    pdf_cache_file = os.path.join(pdf_cache_dir, pdf_cache_file)
    pdf_dir = os.path.dirname(pdf_cache_file)
    if not os.path.isdir(pdf_dir):
        os.makedirs(pdf_dir)
    meta_file = metautil.meta_path(pdf_cache_file)
    if (not _redo_meta) and os.path.exists(pdf_cache_file) and os.path.exists(meta_file):
        logging.debug("already have pdf and meta for %s" % url)
        return
    try:
        meta = None
        if not _redo_meta:
            # Only download when we are not merely regenerating metadata.
            save(session, url, pdf_cache_file)
        meta = open(meta_file, 'wb')
        meta.write('url,%s\n' % url)
        meta.write('srcurl,%s\n' % srcurl)
        meta.write('title,%s\n' % title.encode('utf-8'))
        meta.write('srctitle,%s\n' % srctitle.encode('utf-8'))
        meta.write('pagetype,%s\n' % ptype.encode('utf-8'))
        # FIX: the conditional used to bind to the entire write() argument,
        # so a falsy grouptype silently wrote nothing at all.  Now an empty
        # grouptype still produces a "grouptype," row, consistent with the
        # other always-written keys.
        meta.write('grouptype,%s\n' % (grouptype.encode('utf-8') if grouptype else ''))
        meta.write('docsettype,%s\n' % docsettype.encode('utf-8'))
        meta.write('year,%s\n' % year.encode('utf-8'))
        meta.write('fetched,%s\n' % str(datetime.datetime.now()))
        if more_meta is not None:
            for k, v in more_meta:
                meta.write('%s,%s\n' % (k, v.encode('utf-8')))
        meta.close()
    except BaseException:
        # Never leave a half-written metadata file behind; close, unlink
        # (the file must exist if `meta` was opened) and re-raise.
        if meta:
            meta.close()
            os.unlink(meta_file)
        raise
def check_pdf(pdf_dir, pdf_file):
    """Classify a cached summary PDF and update its metadata file.

    Skips documents with no metadata, ``kanpo`` pages, non-``summary`` pages,
    and documents that are not political-funds notifications (matched by
    exact ``srctitle``/``docsettype`` strings).  For the remainder, the group
    name and notification type are taken from the ``HARDCODED_DOCS`` override
    table (for known-broken PDFs) or parsed from the extracted text, then
    recorded via ``update_meta``.

    Args:
        pdf_dir: directory containing the cached PDF.
        pdf_file: filename of the PDF within ``pdf_dir``.
    """
    pdfpath = os.path.join(pdf_dir, pdf_file)
    metapath = metautil.meta_path(pdfpath)
    meta = metautil.get_meta(pdfpath)
    if meta is None:
        logging.warning('skip %s with no metadata' % pdfpath)
        return
    if meta.get('pagetype') == 'kanpo':
        # No problem, just not the type of doc we process
        return
    if meta.get('pagetype') != 'summary':
        logging.warning('skip %s with page type %s (expect summary)' % (pdfpath, meta.get('pagetype')))
        return
    if meta.get('srctitle') != u'政治資金規正法に基づく届出' or meta.get('docsettype') != u'報道資料':
        # No problem, just not the type of doc we process, or maybe already processed
        return
    pdftext = extract_pdf_text(pdfpath).decode('utf-8')
    # Some docs are really borken :(
    if pdf_file in HARDCODED_DOCS:
        # Hoisted the repeated table lookup into one access.
        override = HARDCODED_DOCS[pdf_file]
        gname = override['gname']
        doctype = override['doctype']
    elif '(cid:' in pdftext:
        # pdfminer emits (cid:NNN) for glyphs it cannot map to text.
        logging.warning('%s contains unknown characters' % pdf_file)
        return
    else:
        lines = pdftext.splitlines()
        lines = [x.strip() for x in lines]
        gname = group_name(lines, pdf_file)
        if gname is None:
            logging.info('Couldn\'t decide a group for %s' % pdf_file)
            return
        doctype = todoke_type(lines, pdf_file)
        if doctype is None:
            logging.info('Couldn\'t decide a doctype for %s' % pdf_file)
            return
    # Every path reaching here produced both values.
    assert gname is not None and doctype is not None
    update_meta(metapath, meta, gname, doctype)