p.set_file(results) if first: p.print_csv_titles() first = False if p.is_valid(): counter += 1 if p.has_warnings(): print("%s WARNINGS %s" % (i, p.get_warnings()), file=warnings) else: print("%s ERROR %s " % (i, p.errors)) print("[%s] %s %s\n" % (i, elem, p.errors), file=errors) p.print_csv() pr_time = (time() - txp) if first: Patent.print_empty_titles(results) Patent.print_zip_info(results) print("TIMES:\n\tPatent %s\n\tPrint %s" % (p_time, pr_time)) print("Total printed: %s\nTotal patents: %s" % (counter, len(patents))) if counter == len(patents): # os.remove(file) pass def unzip_patent(patent_zip): patent_xml = patent_zip.replace("zip", "xml") zf = zipfile.ZipFile(patent_zip) data = zf.read(os.path.basename(patent_xml)) return data
" data is shown below\n%s" % (i, exc, elem), file=errors) p.set_file(results) if first: p.print_csv_titles() first = False if p.is_valid(): p.print_csv() counter += 1 if p.has_warnings(): print("%s WARNINGS %s" % (i, p.get_warnings()), file=warnings) else: print("%s ERROR %s " % (i, p.errors)) print("[%s] %s %s\n" % (i, elem, p.errors), file=errors) if first: Patent.print_empty_titles(results) Patent.print_zip_info(results) print("Total printed: %s\nTotal patents: %s" % (counter, len(patents)-1)) return (counter == (len(patents) - 1)) if __name__ == "__main__": if os.path.exists(PATH): shutil.rmtree(PATH) os.makedirs(PATH) os.makedirs(ERROR_PATH) os.makedirs(WARN_PATH) # os.makedirs(RES_PATH) pfm = PatentsFileManager().get_patent_generator()
def process_zip(data):
    """Split a patent-assignment XML dump into records and write them as CSV.

    The text from ``<patent-assignments`` to ``</patent-assignments>`` is cut
    out of *data* and split on ``<patent-assignment>``; each chunk is wrapped
    in a ``Patent``.  What remains (header + trailer) is parsed with
    BeautifulSoup to read the DTD version, production date, action key code
    and transaction date, which are handed to ``Patent.set_zip_info``.

    Three files are written under ``test_results`` (the directory must
    already exist), named after the transaction date: ``ad<date>.csv`` for
    the rows, ``errors<date>.txt`` for records failing ``is_valid()`` and
    ``warnings<date>.txt`` for warnings.  Timing figures go to stdout.

    Processing stops as soon as ``counter`` (which starts at 1) reaches 5,
    i.e. after four valid records.
    NOTE(review): this looks like a debugging cap - confirm before relying
    on this function for full runs.

    :param data: the whole XML document as one string.
    """
    data = data.replace('\n', '')
    print("Processing zip...")
    total = 0.0
    process = 0.0
    printing = 0.0

    usp = "<patent-assignments"
    usp_c = "</patent-assignments>"
    # The opening tag is only searched for in the first 1% of the document.
    # Floor division keeps the bound an integer on Python 2 and Python 3.
    index_1 = data.find(usp, 0, len(data) // 100)
    index_2 = data.find(usp_c) + len(usp_c)

    head = data[:index_1]
    # NOTE(review): this slice stops one character before the end of data,
    # so the final character of the document is dropped - confirm intended.
    tail = data[index_2:len(data) - 1]
    t1 = time()
    s = BeautifulSoup(head + tail, "lxml")
    t2 = time()
    print("Soup is ready...Time: %s" % (t2 - t1))

    # index_2 already points past the closing tag, so this slice reaches
    # len(usp_c) characters into the trailer; that only affects the last
    # chunk, which the loop below never visits.
    patents = data[index_1:index_2 + len(usp_c)].split("<patent-assignment>")
    patents = ["<patent-assignment>" + chunk for chunk in patents]

    root = s("us-patent-assignments")[0]
    dtd = root["dtd-version"]
    date_produced = root["date-produced"]
    ak = s("action-key-code")[0].string
    d = s("transaction-date")[0]("date")[0].string

    out_dir = "test_results"
    # Context managers guarantee the three output files are flushed and
    # closed, including when a record raises half-way through.
    with open(os.path.join(out_dir, 'ad%s.csv' % d), "w+") as results, \
            open(os.path.join(out_dir, 'errors%s.txt' % d), "w+") as errors, \
            open(os.path.join(out_dir, 'warnings%s.txt' % d), "w+") as warn_log:
        t3 = time()
        print("DTD %s, DP %s, AK %s, D %s" % (dtd, date_produced, ak, d))
        print("Time gathering zip info: %s" % (t3 - t2))

        Patent.set_zip_info(dtd, date_produced, ak, d)
        first = True
        t4 = time()
        counter = 1
        # Chunk 0 is the text before the first record and is skipped.
        # NOTE(review): the last chunk is skipped too, although it appears
        # to hold the final record plus the closing tag - confirm intended.
        for i, elem in enumerate(patents[1:-1], 1):
            if counter >= 5:
                # Cap reached: nothing further would be processed anyway.
                break
            t_x_1 = time()
            p = Patent(elem)
            t_x_2 = time()
            process += (t_x_2 - t_x_1)
            p.set_file(results)
            if first:
                # The CSV header is emitted once, by the first record.
                p.print_csv_titles()
                first = False
            if p.is_valid():
                tv1 = time()
                p.print_csv()
                counter += 1
                tv2 = time()
                # NOTE(review): true for every counter that is NOT a
                # multiple of 10000; "== 0" may have been intended.
                if counter % 10000:
                    print("Printing %s" % (tv2 - tv1), end="")
                    print("%s OK" % i)
                printing += (tv2 - tv1)
                if p.has_warnings():
                    print(p.get_warnings(), file=warn_log)
            else:
                print("\n%s Not valid %s " % (i, p.errors))
                print("[%s] %s %s\n" % (i, elem, p.errors), file=errors)
            t5 = time()
            total += (t5 - t4)
            t4 = t5

        if first:
            # No record was processed at all: still emit a header and the
            # zip-level information.
            # NOTE(review): the source's indentation is lost; both calls are
            # assumed to belong to this branch - confirm.
            Patent.print_empty_titles(results)
            Patent.print_zip_info(results)

    # counter starts at 1, so these averages divide by (valid records + 1).
    print("Processing %s\nPrinting %s\nTotal %s, count: %s"
          % (process / counter, printing / counter, total / counter, counter))