def test_parse_patent(self):
    """Parse the single-grant fixture and check how many results come back.

    The fixture file is located with ``parse.list_files``, read with
    ``parse.parse_files`` and then handed to ``parse.parse_patents``.  The
    test expects the number of parsed results to equal the number of grants
    multiplied by the number of entries in ``xmlclasses``.
    """
    testdir = os.path.join(basedir, './fixtures/xml')
    xmlregex = r'ipg120327.one.xml'
    filelist = parse.list_files(testdir, xmlregex)
    # Materialise both results so len() can be taken on them.
    grant_list = list(parse.parse_files(filelist))
    parsed_grants = list(parse.parse_patents(grant_list))
    # assertEqual reports both counts on failure; assertTrue(a == b) would
    # only say "False is not true".
    self.assertEqual(len(parsed_grants), len(grant_list) * len(xmlclasses))
def test_list_files(self):
    """Check that ``parse.list_files`` returns exactly the one fixture file.

    The result must be a list with a single entry, every entry must be a
    ``str``, and every entry must point at a path that exists on disk.
    """
    testdir = os.path.join(basedir, './fixtures/xml')
    xmlregex = r'ipg120327.one.xml'
    files = parse.list_files(testdir, xmlregex)
    self.assertTrue(isinstance(files, list))
    # assertEqual shows the actual length when the check fails.
    self.assertEqual(len(files), 1)
    # The previous all(filter(isinstance-check, files)) only looked at the
    # entries that had already passed the isinstance check, so a non-string
    # entry could never make it fail.  Test every entry directly instead.
    self.assertTrue(all(isinstance(x, str) for x in files))
    self.assertTrue(all(os.path.exists(x) for x in files))
def test_parse_patent(self):
    """Parse the single-grant fixture through the parallel parser.

    The fixture is located with ``parse.list_files`` (directory list,
    patent root, filename regex), read with ``parse.parallel_parse`` and
    handed to ``parse.parse_patent``.  The test expects the number of parsed
    results to equal the number of grants multiplied by the number of
    entries in ``xmlclasses``.
    """
    patentroot = '.'
    testdir = [os.path.join(basedir, './fixtures/xml')]
    xmlregex = r'ipg120327.one.xml'
    filelist = parse.list_files(testdir, patentroot, xmlregex)
    grant_list = parse.parallel_parse(filelist)
    parsed_grants = parse.parse_patent(grant_list)
    # list() is needed before len(); assertEqual reports both counts on
    # failure, which assertTrue(a == b) does not.
    self.assertEqual(len(list(parsed_grants)),
                     len(grant_list) * len(xmlclasses))
# NOTE(review): this fragment starts mid-routine.  `urls`, `s`,
# `parse_config` and the should_process_* flags are defined outside this
# view, and the first statement presumably sits under an
# `if should_process_grants:` guard -- confirm against the full file.
# Queue the grant downloads for the configured years.
urls += generate_download_list(parse_config['years'], 'grant')
if should_process_applications:
    # Queue the application downloads for the same years.
    urls += generate_download_list(parse_config['years'], 'application')
# Create the download directory when one is configured and it is missing.
downloaddir = parse_config['downloaddir']
if downloaddir and not os.path.exists(downloaddir):
    os.makedirs(downloaddir)
print 'Downloading files at {0}'.format(str(datetime.datetime.today()))
download_files(urls)
print 'Downloaded files:',parse_config['years']
# `f - s` is printed as the elapsed time; `s` is presumably the start
# timestamp taken earlier -- confirm.
f = datetime.datetime.now()
print 'Finished downloading in {0}'.format(str(f-s))
# find files
print "Starting parse on {0} on directory {1}".format(str(datetime.datetime.today()),parse_config['datadir'])
if should_process_grants:
    # List the files in datadir selected by the grant regex and parse them.
    files = parse.list_files(parse_config['datadir'],parse_config['grantregex'])
    print 'Running grant parse...'
    run_parse(files, 'grant')
    f = datetime.datetime.now()
    # NOTE(review): the "Found ..." line is printed only after run_parse
    # has returned -- confirm this ordering is intended.
    print "Found {2} files matching {0} in directory {1}"\
        .format(parse_config['grantregex'], parse_config['datadir'], len(files))
if should_process_applications:
    # Same steps for applications, using the application regex.
    files = parse.list_files(parse_config['datadir'],parse_config['applicationregex'])
    print 'Running application parse...'
    run_parse(files, 'application')
    f = datetime.datetime.now()
    print "Found {2} files matching {0} in directory {1}"\
        .format(parse_config['applicationregex'], parse_config['datadir'], len(files))
# `f` holds the timestamp taken after the last parse that ran, or the
# post-download timestamp when neither flag was set.
print 'Finished parsing in {0}'.format(str(f-s))
# run extra phases if needed, then move output files
import unittest
sys.path.append('..')
from couch_patent import *
sys.path.append('../lib/')
from patXML import *
import parse

# Module-level fixtures shared by the tests below: the raw fixture document
# wrapped in XMLPatentBase, and the same fixture run through the
# list_files -> parallel_parse -> parse_patent pipeline.
basedir = os.path.join(os.curdir, '../test')
testdir = os.path.join(basedir, 'fixtures/xml/')
# Read the fixture through a context manager so the file handle is closed
# as soon as the contents are loaded instead of being left open.
with open(os.path.join(testdir, 'ipg120327.one.xml')) as _fixture_file:
    testfile = XMLPatentBase(_fixture_file.read())
patentroot = '.'
xmlregex = r'ipg120327.one.xml'
filelist = parse.list_files([testdir], patentroot, xmlregex)
grant_list = parse.parallel_parse(filelist)
# Materialised so individual parsed grants can be indexed by the tests.
parsed_grants = list(parse.parse_patent(grant_list))


class TestCouchPatent(unittest.TestCase):
    """Tests for couch_patent run against the single-grant XML fixture."""

    def setUp(self):
        """Fail early when the module-level fixture document is falsy."""
        self.assertTrue(testfile)

    def test_get_doc_metadata(self):
        """
        Tests that get_metadata returns a dict for the first parsed grant
        of the fixture file
        """
        metadata = get_metadata(parsed_grants[0])
        self.assertTrue(isinstance(metadata, dict))
# NOTE(review): this fragment starts mid-routine.  `urls`, `s`,
# `parse_config` and the should_process_* flags are defined outside this
# view -- confirm their setup against the full file.
if should_process_applications:
    # Queue the application downloads for the configured years.
    urls += generate_download_list(parse_config['years'], 'application')
# Create the download directory when one is configured and it is missing.
downloaddir = parse_config['downloaddir']
if downloaddir and not os.path.exists(downloaddir):
    os.makedirs(downloaddir)
print 'Downloading files at {0}'.format(str(datetime.datetime.today()))
download_files(urls)
print 'Downloaded files:', parse_config['years']
# `f - s` is printed as the elapsed time; `s` is presumably the start
# timestamp taken earlier -- confirm.
f = datetime.datetime.now()
print 'Finished downloading in {0}'.format(str(f - s))
# find files
print "Starting parse on {0} on directory {1}".format(
    str(datetime.datetime.today()), parse_config['datadir'])
if should_process_grants:
    # List the files in datadir selected by the grant regex and parse them.
    files = parse.list_files(parse_config['datadir'],
                             parse_config['grantregex'])
    print 'Running grant parse...'
    run_parse(files, 'grant')
    f = datetime.datetime.now()
    # NOTE(review): the "Found ..." line is printed only after run_parse
    # has returned -- confirm this ordering is intended.
    print "Found {2} files matching {0} in directory {1}"\
        .format(parse_config['grantregex'], parse_config['datadir'], len(files))
if should_process_applications:
    # Same steps for applications, using the application regex.
    files = parse.list_files(parse_config['datadir'],
                             parse_config['applicationregex'])
    print 'Running application parse...'
    run_parse(files, 'application')
    f = datetime.datetime.now()
    print "Found {2} files matching {0} in directory {1}"\
        .format(parse_config['applicationregex'], parse_config['datadir'], len(files))
# `f` holds the timestamp taken after the last parse that ran, or the
# post-download timestamp when neither flag was set.
print 'Finished parsing in {0}'.format(str(f - s))
# download the files to be parsed urls = generate_download_list(parse_config['years']) dview.scatter('urls', urls) # check download directory downloaddir = parse_config['downloaddir'] if downloaddir and not os.path.exists(downloaddir): os.makedirs(downloaddir) dview['downloaddir'] = parse_config['downloaddir'] dview.apply(download_files) print 'Downloaded files:',parse_config['years'] f = datetime.datetime.now() print 'Finished downloading in {0}'.format(str(f-s)) # find files print "Starting parse on {0} on directory {1}".format(str(datetime.datetime.today()),parse_config['datadir']) files = parse.list_files(parse_config['datadir'],parse_config['dataregex']) dview.scatter('files',files) print "Found {2} files matching {0} in directory {1}".format(parse_config['dataregex'], parse_config['datadir'], len(files)) # run parse and commit SQL print 'Running parse...' inserts = list(itertools.chain.from_iterable(dview.apply(run_parse))) parse.commit_tables(inserts) f = datetime.datetime.now() print 'Finished parsing in {0}'.format(str(f-s)) # run extra phases if needed, then move output files run_clean(process_config) run_consolidate(process_config) parse.move_tables(process_config['outputdir'])
import unittest
sys.path.append('..')
from couch_patent import *
sys.path.append('../lib/')
from patXML import *
import parse

# Shared fixtures, built once at import time: the fixture XML wrapped in
# XMLPatentBase plus the output of running that same file through
# parse.list_files, parse.parallel_parse and parse.parse_patent.
basedir = os.path.join(os.curdir, '../test')
testdir = os.path.join(basedir, 'fixtures/xml/')
# The original open(...).read() never closed the file; the with-block
# releases the handle once the contents have been read.
with open(os.path.join(testdir, 'ipg120327.one.xml')) as _fixture_file:
    testfile = XMLPatentBase(_fixture_file.read())
patentroot = '.'
xmlregex = r'ipg120327.one.xml'
filelist = parse.list_files([testdir], patentroot, xmlregex)
grant_list = parse.parallel_parse(filelist)
# Turned into a list so tests can index into the parsed grants.
parsed_grants = list(parse.parse_patent(grant_list))


class TestCouchPatent(unittest.TestCase):
    """Exercises couch_patent helpers using the one-grant XML fixture."""

    def setUp(self):
        """Assert that the module-level fixture document is truthy."""
        self.assertTrue(testfile)

    def test_get_doc_metadata(self):
        """
        Tests that get_metadata produces a dict when given the first
        parsed grant from the fixture
        """
        metadata = get_metadata(parsed_grants[0])
        self.assertTrue(isinstance(metadata, dict))