def clean_headers(xd):
    """Tidy up the headers of *xd* in place.

    Performed in this order:

    1. Delete a fixed list of unwanted headers; for every other header whose
       lowercased name is not in ``xdfile.HEADER_ORDER``, emit a warning.
    2. Pass Author and Editor through ``clean_author``.  When Author is empty
       and Creator is set, Creator's value is used as the author and the
       Creator header is removed.  A cleaned value that differs from the
       input is stored under the header name plus ``CLEAN_SUFFIX``.
    3. Pass Title through ``clean_title`` and store a differing result the
       same way.
    4. Determine a Date: the existing Date header, else a date parsed from
       ``xd.filename``, else whatever ``find_date`` returns for the Copyright
       header.  A truthy result is written to the Date header.

    Errors raised while parsing the filename date are reported through
    ``utils.error`` and re-raised only when ``args.debug`` is set.
    """
    # remove known unwanted header fields, log unknown headers
    unwanted = ["Source", "Identifier", "Acquired", "Issued", "Category"]
    # Iterate over a snapshot: set_header(name, None) may alter xd.headers.
    for name in tuple(xd.headers.keys()):
        if name in unwanted:
            xd.set_header(name, None)
        elif name.lower() not in xdfile.HEADER_ORDER:
            utils.warn("%s: '%s' header not known: '%s'" % (xd.filename, name, xd.headers[name]))

    # clean Author and Editor headers
    raw_author = xd.get_header("Author") or ""
    if not raw_author and xd.get_header("Creator"):
        # No Author given: promote Creator to author and drop the Creator header.
        raw_author = xd.get_header("Creator")
        xd.set_header("Creator", None)

    raw_editor = xd.get_header("Editor") or ""

    cleaned_author, cleaned_editor = clean_author(raw_author, raw_editor)

    # Only record a cleaned value when cleaning actually changed something.
    for label, before, after in (
        ("Author", raw_author, cleaned_author),
        ("Editor", raw_editor, cleaned_editor),
    ):
        if after != before:
            xd.set_header(label + CLEAN_SUFFIX, after)

    # clean Title header
    raw_title = xd.get_header("Title") or ""
    cleaned_title = clean_title(raw_title)
    if cleaned_title != raw_title:
        xd.set_header("Title" + CLEAN_SUFFIX, cleaned_title)

    # create Date header
    date_text = xd.get_header("Date")

    ## try getting Date from filename
    if not date_text:
        try:
            parsed = utils.parse_date_from_filename(xd.filename)
            if parsed:
                date_text = parsed.strftime("%Y-%m-%d")
        except Exception as e:
            # Best effort: report the problem, and only abort in debug mode.
            utils.error(str(e))
            if args.debug:
                raise

    ## try getting Date from copyright
    if not date_text:
        date_text = find_date(xd.get_header("Copyright") or "")

    if date_text:
        xd.set_header("Date", date_text)
def deduce_set_seqnum(xd):
    """Set a Date or a Number header on *xd*, deduced from its filename.

    The base name of ``xd.filename`` (the ``base`` attribute of
    ``utils.parse_pathname``'s result) is checked for a date first.  When one
    is found, it is written to the Date header as ``YYYY-MM-DD``, the same
    textual form ``clean_headers`` writes.  Otherwise the first run of digits
    found anywhere in the full ``xd.filename`` is written to the Number header
    as an int.  When neither is found, no header is touched.
    """
    # look to filename
    base = utils.parse_pathname(xd.filename).base

    # check for date
    dt = utils.parse_date_from_filename(base)  # datetime object (falsy when no date found)
    if dt:
        # Format explicitly instead of handing over the raw object, so the
        # stored value does not depend on how set_header stringifies it.
        xd.set_header("Date", dt.strftime("%Y-%m-%d"))
        return

    # check for number in full path (eltana dir had number)
    m = re.search(r'(\d+)', xd.filename)
    if m:
        xd.set_header("Number", int(m.group(1)))