def _get_parser(date, doctype='grant'):
    """
    Given a [date], returns the class of parser needed to parse it.

    The handlers are loaded from 'process.cfg' through get_xml_handlers for
    the given [doctype]. Every key other than 'default' is indexed as a
    (start, end) pair; the handler stored under the first pair whose
    inclusive range contains [date] is returned. If no pair matches, the
    handler stored under 'default' is returned.
    """
    xmlhandlers = get_xml_handlers('process.cfg', doctype)
    # Iterate the mapping directly: this yields the keys on both Python 2
    # and Python 3, whereas iterkeys() only exists on Python 2.
    for daterange in xmlhandlers:
        # 'default' is the fallback entry, not a (start, end) range. Skipping
        # it avoids comparing [date] against the characters 'd' and 'e',
        # which is meaningless and can raise TypeError for non-string dates.
        if daterange == 'default':
            continue
        if daterange[0] <= date <= daterange[1]:
            return xmlhandlers[daterange]
    return xmlhandlers['default']
import re import mmap import contextlib import itertools import sys import lib.handlers.grant_handler as grant_handler import lib.patSQL as patSQL import lib.argconfig_parse as argconfig_parse from lib.config_parser import get_xml_handlers xmlclasses = [patSQL.AssigneeXML, patSQL.CitationXML, patSQL.ClassXML, \ patSQL.InventorXML, patSQL.PatentXML, patSQL.PatdescXML, \ patSQL.LawyerXML, patSQL.ScirefXML, patSQL.UsreldocXML] regex = re.compile(r"""([<][?]xml version.*?[>]\s*[<][!]DOCTYPE\s+([A-Za-z-]+)\s+.*?/\2[>])""", re.S+re.I) xmlhandlers = get_xml_handlers('process.cfg') def list_files(patentroot, xmlregex): """ Returns listing of all files within patentroot whose filenames match xmlregex """ files = [patentroot+'/'+fi for fi in os.listdir(patentroot) \ if re.search(xmlregex, fi, re.I) != None] if not files: logging.error("No files matching {0} found in {1}".format(XMLREGEX,PATENTROOT)) sys.exit(1) return files def _get_date(filename, dateformat='ipg%y%m%d.xml'): """