# Decide which files to parse.
#   * explicit targets (several paths, or one path that is not a directory)
#     are used exactly as given;
#   * otherwise every *.xml file of the chosen directory is taken in sorted
#     order -- the single directory target, or 'assign_files' by default.
n_targets = len(args.target)
explicit_files = n_targets > 1 or (
    n_targets == 1 and not os.path.isdir(args.target[0])
)
if explicit_files:
    file_list = args.target
else:
    targ_dir = args.target[0] if n_targets == 1 else 'assign_files'
    file_list = sorted(glob.glob('%s/*.xml' % targ_dir))

# Parse file by file.  A failure inside one file is reported with its
# traceback and the run carries on with the next file.
for fpath in file_list:
    fdir, fname = os.path.split(fpath)
    print('Parsing %s' % fname)

    # Snapshot the module-level counters so per-file deltas can be reported.
    # NOTE(review): i / o / p are presumably advanced by parse_gen3 -- confirm.
    i0, o0, p0 = i, o, p

    try:
        parse_gen3(fpath)
    except Exception:
        print('EXCEPTION OCCURRED!')
        print_exc()

    found = (i - i0, o - o0, p - p0)
    print('Found %d records, %d dropped, %d patents' % found)
    print('Total %d records, %d dropped, %d patents' % (i, o, p))
    print()

# Commit whatever the chunker still holds, then release the connection.
chunker.commit()
con.close()
# Parse every grant file, choosing the parser from the file-name convention:
#   *.dat  -> generation 1 (parse_grants_gen1)
#   pgb*   -> generation 2 (parse_grants_gen2)
#   ipgb*  -> generation 3 (parse_grants_gen3)
#
# The whole loop runs under try/finally: the 'Unknown format' error below is
# raised outside the per-file try, so without the finally it would abort the
# run before the chunkers are committed and the connection is closed.  The
# exception itself still propagates unchanged.
try:
    for fpath in file_list:
        (fdir, fname) = os.path.split(fpath)
        if fname.endswith('.dat'):
            gen = 1
            parser = parse_grants_gen1
        elif fname.startswith('pgb'):
            gen = 2
            parser = parse_grants_gen2
        elif fname.startswith('ipgb'):
            gen = 3
            parser = parse_grants_gen3
        else:
            raise Exception('Unknown format')

        print('Parsing %s, gen = %d' % (fname, gen))

        # Counter value before this file, used for the per-file delta below.
        # NOTE(review): i is presumably advanced by store_patent -- confirm.
        i0 = i

        try:
            parser(fpath, store_patent)
        except Exception:
            # Best effort: report the failure and move on to the next file.
            print('EXCEPTION OCCURRED!')
            print_exc()

        print('Found %d patents, %d total' % (i - i0, i))
        print()
finally:
    # commit to db and close; the inner finally guarantees the connection is
    # closed even if one of the commits fails.
    try:
        pat_chunker.commit()
        ipc_chunker.commit()
        cit_chunker.commit()
    finally:
        con.close()
print("Reached limit.") break (fdir, fname) = os.path.split(fpath) if fname.endswith('.dat'): gen = 1 parser = parse_grants_gen1 elif fname.startswith('pgb'): gen = 2 parser = parse_grants_gen2 elif fname.startswith('ipgb'): gen = 3 parser = parse_grants_gen3 else: raise (Exception('Unknown format')) print('Parsing %s, gen = %d' % (fname, gen)) i0 = i try: parser(fpath, store_patent) except Exception as e: print('EXCEPTION OCCURRED!') print_exc() print('Found %d patents, %d total' % (i - i0, i)) print() # commit to db and close pat_chunker.commit() ipc_chunker.commit() cit_chunker.commit() con.close()
return True # collect files if len(args.target) == 0 or (len(args.target) == 1 and os.path.isdir(args.target[0])): targ_dir = 'assign_files' if len(args.target) == 0 else args.target[0] file_list = sorted(glob.glob('%s/*.xml' % targ_dir)) else: file_list = args.target # do robust parsing for fpath in file_list: (fdir, fname) = os.path.split(fpath) print('Parsing %s' % fname) i0 = i o0 = o p0 = p try: parse_gen3(fpath) except Exception as e: print('EXCEPTION OCCURRED!') print_exc() print('Found %d records, %d dropped, %d patents' % (i-i0, o-o0, p-p0)) print('Total %d records, %d dropped, %d patents' % (i, o, p)) print() # clear out the rest chunker.commit() con.close()