def extract_XML2_grant(raw_data, args_array):
    #
    # Data documentation on the fields in XML2 Grant data can be found
    # in the /documents/data_descriptions/PatentGrantSGMLv19-Documentation.pdf file
    #

    # Start timer
    start_time = time.time()
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Pass the url_link and format into local variables
    url_link = args_array['url_link']
    uspto_xml_format = args_array['uspto_xml_format']

    # Define all arrays needed to hold the data
    processed_grant = []
    processed_applicant = []
    processed_examiner = []
    processed_assignee = []
    processed_agent = []
    processed_inventor = []
    processed_usclass = []
    processed_intclass = []
    processed_gracit = []
    processed_forpatcit = []
    processed_nonpatcit = []
    processed_foreignpriority = []

    # Pass the raw data into Element tree xml object
    try:
        document_root = ET.fromstring(raw_data)
    except ET.ParseError as e:
        print_xml = raw_data.split("\n")
        for num, line in enumerate(print_xml, start=1):
            print(str(num) + ' : ' + line)
        logger.error("Character Entity prevented ET from parsing XML in file: " + url_link)
        traceback.print_exc()
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        logger.error("Exception: " + str(exc_type) + " in Filename: " + str(fname) + " on Line: " + str(exc_tb.tb_lineno) + " Traceback: " + traceback.format_exc())

    # SDOBI is the bibliographic data
    r = document_root.find('SDOBI')
    if r is not None:

        # B100 Document Identification
        for B100 in r.findall('B100'):
            try:
                document_id = USPTOSanitizer.return_element_text(B100.find('B110')).strip()
                document_id = USPTOSanitizer.fix_patent_number(document_id)[:20]
            except:
                document_id = None
                logger.error("No Patent Number was found for: " + url_link)
            try:
                kind = USPTOSanitizer.return_element_text(B100.find('B130')).strip()[:2]
                app_type = USPTOSanitizer.return_xml2_app_type(args_array, kind).strip()
            except:
                kind = None
            # PATENT ISSUE DATE
            try:
                pub_date = USPTOSanitizer.return_formatted_date(
                    USPTOSanitizer.return_element_text(B100.find('B140')), args_array, document_id)
            except:
                pub_date = None
            # B190 is Publishing Country or Organization.
            # This is always US in Red Book Patent Grant documents and
            # this field is not stored or used.
            try:
                pub_country = USPTOSanitizer.return_element_text(B100.find('B190')).strip()
            except:
                pub_country = None

        # B200 is Domestic Filing Data
        for B200 in r.findall('B200'):
            # TODO: find this in XML2 applications
            app_country = None
            # Application number
            try:
                app_no = USPTOSanitizer.return_element_text(B200.find('B210')).strip()[:20]
            except:
                app_no = None
            # Application Date
            try:
                app_date = USPTOSanitizer.return_formatted_date(
                    USPTOSanitizer.return_element_text(B200.find('B220')), args_array, document_id)
            except:
                app_date = None
            # Series Code
            try:
                series_code = USPTOSanitizer.return_element_text(B200.find('B211US')).strip()[:2]
            except:
                series_code = None

        # Collect the Grant Length
        try:
            grant_length = USPTOSanitizer.return_element_text(r.find("B400").find("B472").find("B474")).strip()
        except:
            grant_length = None

        # Collect Technical information
        # such as classification and references
        # TODO: don't need the loop here
        for B500 in r.findall('B500'):

            # US Classification
            for B520 in B500.findall('B520'):
                position = 1
                # B521 USCLASS
                for B521 in B520.findall('B521'):
                    # Reset the class vars
                    n_class = None
                    n_section = None
                    n_subclass = None
                    # Collect class vars
                    n_class_info = USPTOSanitizer.return_element_text(B521)
                    n_class_main, n_subclass = USPTOSanitizer.return_class(n_class_info)
                    n_class_main = n_class_main.strip()[:5]
                    n_subclass = n_subclass.strip()[:15]
                    # Append SQL data into dictionary to be written later
                    processed_usclass.append({
                        "table_name": "uspto.USCLASS_G",
                        "GrantID": document_id,
                        "Position": position,
                        "Class": n_class_main,
                        "SubClass": n_subclass,
                        "FileName": args_array['file_name']
                    })
                    #print(processed_usclass)
                    position += 1
                # B522 USCLASS FURTHER
                for B522 in B520.findall('B522'):
                    n_class_info = USPTOSanitizer.return_element_text(B522)
                    n_class_main, n_subclass = USPTOSanitizer.return_class(n_class_info)
                    n_class_main = n_class_main.strip()[:5]
                    n_subclass = n_subclass.strip()[:15]
                    # Append SQL data into dictionary to be written later
                    processed_usclass.append({
                        "table_name": "uspto.USCLASS_G",
                        "GrantID": document_id,
                        "Position": position,
                        "Class": n_class_main,
                        "SubClass": n_subclass,
                        "FileName": args_array['file_name']
                    })
                    position += 1

            # B510 International Class data
            # TODO: check if I need to set all variables to empty or can just leave as null
            # TODO: check if classification is parsed correctly
            for B510 in B500.findall('B510'):
                #logger.warning("International Classification found in XML2: " + args_array['url_link'] + " document: " + str(document_id))
                # Reset position
                position = 1
                # B511 Main Class
                for B511 in B510.findall('B511'):
                    i_section = None
                    i_class = None
                    i_subclass = None
                    i_class_mgr = None
                    i_class_sgr = None
                    i_malformed = None
                    int_class = USPTOSanitizer.return_element_text(B511)
                    # Int Class is: section / class / subclass, then main group / subgroup
                    if (len(int_class.split()) > 1):
                        sec_1, sec_2 = int_class.split()
                        sec_1 = sec_1.strip()[:15]
                        # Remove the Section from first character
                        i_section = sec_1[0]
                        i_class = sec_1[1:3]
                        i_subclass = sec_1[-1]
                        i_class_mgr = sec_2.strip()[:-2]
                        i_class_sgr = sec_2.strip()[-2:]
                    else:
                        int_class = int_class.strip()[:15]
                        i_section = int_class[0]
                        i_class = int_class[1:]
                        i_subclass = int_class[-1]
                        i_malformed = 1
                    # Append SQL data into dictionary to be written later
                    processed_intclass.append({
                        "table_name": "uspto.INTCLASS_G",
                        "GrantID": document_id,
                        "Position": position,
                        "Section": i_section,
                        "Class": i_class,
                        "SubClass": i_subclass,
                        "MainGroup": i_class_mgr,
                        "SubGroup": i_class_sgr,
                        "Malformed": i_malformed,
                        "FileName": args_array['file_name']
                    })
                    #print(processed_intclass)
                    position += 1
                # B512 Further International Class
                for B512 in B510.findall('B512'):
                    i_section = None
                    i_class = None
                    i_subclass = None
                    i_class_mgr = None
                    i_class_sgr = None
                    i_malformed = None
                    int_class = USPTOSanitizer.return_element_text(B512)
                    # Split class into class and group
                    if (len(int_class.split()) > 1):
                        sec_1, sec_2 = int_class.split()
                        sec_1 = sec_1.strip()[:15]
                        # Remove the Section from first character
                        i_section = sec_1[0]
                        i_class = sec_1[1:3]
                        i_subclass = sec_1[-1]
                        i_class_mgr = sec_2.strip()[:-2]
                        i_class_sgr = sec_2.strip()[-2:]
                    else:
                        # TODO: Is this correct??
                        int_class = int_class.strip()[:15]
                        i_section = int_class[0]
                        i_class = int_class[1:]
                        i_subclass = int_class[-1]
                        i_malformed = 1
                    # Append SQL data into dictionary to be written later
                    processed_intclass.append({
                        "table_name": "uspto.INTCLASS_G",
                        "GrantID": document_id,
                        "Position": position,
                        "Section": i_section,
                        "Class": i_class,
                        "SubClass": i_subclass,
                        "MainGroup": i_class_mgr,
                        "SubGroup": i_class_sgr,
                        "Malformed": i_malformed,
                        "FileName": args_array['file_name']
                    })
                    #print(processed_intclass)
                    position += 1

            # B540 Collect Title
            for B540 in B500.findall('B540'):
                try:
                    title = USPTOSanitizer.strip_for_csv(USPTOSanitizer.return_element_text(B540)[:500])
                except:
                    title = None

            # Patent Citations
            for B560 in B500.findall('B560'):
                # Reset position counter for all citations loop
                position = 1
                # B561 is Patent Citation
                for B561 in B560.findall('B561'):
                    # TODO: find out how to do PCIT, DOC without loop. Only B561 needs loop
                    pcit = B561.find('PCIT')
                    # Determine if the patent is US or not
                    # TODO: needs a better check; what does a non-US patent look like?
                    # If all patents have PARTY-US then perhaps a database call to check the country of origin
                    # would still allow separating into GRACIT and FORPATCIT_G
                    #if PCIT.find("PARTY-US") == True:
                    #    #print("CITATION COUNTRY US")
                    #    citation_country = "US"
                    #else:
                    #    citation_country = "NON-US"
                    #    logger.warning("NON US patent found")
                    #citation_country = "US"
                    # Declare items in case they are not found
                    citation_name = None
                    citation_city = None
                    citation_state = None
                    citation_country = None
                    doc = pcit.find('DOC')
                    if doc is not None:
                        try:
                            citation_document_number = USPTOSanitizer.return_element_text(doc.find('DNUM')).strip()[:15]
                        except:
                            citation_document_number = None
                        try:
                            pct_kind = USPTOSanitizer.return_element_text(doc.find('KIND')).strip()[:10]
                        except:
                            pct_kind = None
                        try:
                            citation_date = USPTOSanitizer.return_formatted_date(
                                USPTOSanitizer.return_element_text(doc.find('DATE')), args_array, document_id)
                        except:
                            citation_date = None
                    prt = pcit.find('PARTY-US')
                    if prt is not None:
                        try:
                            citation_name = USPTOSanitizer.return_element_text(prt.find("NAM").find("SNM")).strip()[:100]
                        except:
                            citation_name = None
                        # Citation Address info
                        try:
                            citation_city = USPTOSanitizer.return_element_text(prt.find('ADR').find('CITY')).strip()[:100]
                        except:
                            citation_city = None
                        try:
                            citation_state = USPTOSanitizer.return_element_text(prt.find('ADR').find('STATE')).strip()[:3]
                        except:
                            citation_state = None
                        # Citation country
                        try:
                            citation_country = USPTOSanitizer.return_element_text(prt.find("ADR").find('CTRY')).strip()[:3]
                        except:
                            try:
                                # If state is a US state, set country to US
                                if USPTOSanitizer.is_US_state(citation_state):
                                    citation_country = "US"
                                else:
                                    citation_country = None
                            except:
                                citation_country = None
                    # Parse citation category
                    # (list(B561) replaces the getchildren() call removed in Python 3.9)
                    if (len(list(B561)) > 1):
                        try:
                            citation_category = list(B561)[1].tag.replace("\n", "").replace("\r", "").upper()
                        except:
                            citation_category = None
                    else:
                        citation_category = None
                    # TODO: be aware that there may be something crazy in the
                    # citation document number
                    if pct_kind != None:
                        # Append SQL data into dictionary to be written later
                        processed_gracit.append({
                            "table_name": "uspto.GRACIT_G",
                            "GrantID": document_id,
                            "Position": position,
                            "CitedID": citation_document_number,
                            "Kind": pct_kind,
                            "Name": citation_name,
                            "Date": citation_date,
                            "Country": citation_country,
                            "Category": citation_category,
                            "FileName": args_array['file_name']
                        })
                        #print(processed_gracit)
                        position += 1
                    else:
                        # Append SQL data into dictionary to be written later
                        processed_forpatcit.append({
                            "table_name": "uspto.FORPATCIT_G",
                            "GrantID": document_id,
                            "Position": position,
                            "CitedID": citation_document_number,
                            "Kind": pct_kind,
                            "Name": citation_name,
                            "Date": citation_date,
                            "Country": citation_country,
                            "Category": citation_category,
                            "FileName": args_array['file_name']
                        })
                        #print(processed_forpatcit)
                        position += 1

                # Reset position counter for non-patent citations loop
                position = 1
                # Non-patent Literature Citations
                for B562 in B560.findall('B562'):
                    NCIT = B562.find('NCIT')
                    if NCIT is not None:
                        # Sometimes there will be '<i>', '<sup>', etc. in the
                        # reference string; we need to remove it
                        non_patent_citation_text = USPTOSanitizer.return_element_text(NCIT)
                        non_patent_citation_text = re.sub('<[^>]+>', '', non_patent_citation_text)
                    else:
                        non_patent_citation_text = None
                    # Parse citation category into code
                    if (len(list(B562)) > 1):
                        try:
                            ncitation_category = list(B562)[1].tag.replace("\n", "").replace("\r", "").upper()
                        except:
                            ncitation_category = None
                    else:
                        ncitation_category = None
                    # Append SQL data into dictionary to be written later
                    processed_nonpatcit.append({
                        "table_name": "uspto.NONPATCIT_G",
                        "GrantID": document_id,
                        "Position": position,
                        "Citation": non_patent_citation_text,
                        "Category": ncitation_category,
                        "FileName": args_array['file_name']
                    })
                    #print(processed_nonpatcit)
                    position += 1

            # Collect number of claims
            for B570 in B500.findall('B570'):
                try:
                    claims_num = USPTOSanitizer.return_element_text(B570.find('B577')).strip()
                except:
                    claims_num = None

            # Collect number of drawings and figures
            for B590 in B500.findall('B590'):
                for B595 in B590.findall('B595'):
                    try:
                        number_of_drawings = USPTOSanitizer.return_element_text(B595).strip()
                        number_of_drawings = number_of_drawings.split("/")[0]
                    except:
                        number_of_drawings = None
                for B596 in B590.findall('B596'):
                    try:
                        number_of_figures = USPTOSanitizer.return_element_text(B596).strip()
                    except:
                        number_of_figures = None

            # TODO: B582 find out what it is. Looks like patent classifications but it's all alone in the XML

        # B700 is Parties
        # TODO: find the applicant data and append to array
        for B700 in r.findall('B700'):

            # B720 Inventor
            for B720 in B700.findall('B720'):
                # Reset position for inventors
                position = 1
                # Collect inventor information
                for B721 in B720.findall('B721'):
                    for i in B721.findall('PARTY-US'):
                        # Inventor Name
                        try:
                            inventor_first_name = USPTOSanitizer.return_element_text(i.find('NAM').find('FNM')).strip()[:100]
                        except:
                            inventor_first_name = None
                        try:
                            inventor_last_name = USPTOSanitizer.return_element_text(i.find('NAM').find('SNM')).strip()[:100]
                        except:
                            inventor_last_name = None
                        # Inventor Address info
                        try:
                            inventor_city = USPTOSanitizer.return_element_text(i.find('ADR').find('CITY')).strip()[:100]
                        except:
                            inventor_city = None
                        try:
                            inventor_state = USPTOSanitizer.return_element_text(i.find('ADR').find('STATE')).strip()[:3]
                        except:
                            inventor_state = None
                        # Inventor country
                        try:
                            inventor_country = USPTOSanitizer.return_element_text(i.find("ADR").find('CTRY')).strip()[:3]
                        except:
                            try:
                                # If state is a US state, set country to US
                                if USPTOSanitizer.is_US_state(inventor_state):
                                    inventor_country = "US"
                                else:
                                    inventor_country = None
                            except:
                                inventor_country = None
                        inventor_nationality = None
                        inventor_residence = None
                        # Append SQL data into dictionary to be written later
                        processed_inventor.append({
                            "table_name": "uspto.INVENTOR_G",
                            "GrantID": document_id,
                            "Position": position,
                            "FirstName": inventor_first_name,
                            "LastName": inventor_last_name,
                            "City": inventor_city,
                            "State": inventor_state,
                            "Country": inventor_country,
                            "Nationality": inventor_nationality,
                            "Residence": inventor_residence,
                            "FileName": args_array['file_name']
                        })
                        #print(processed_inventor)
                        position += 1

            # B730 Assignee
            # TODO: check if finding child of child is working
            # Reset position for assignees
            position = 1
            for B730 in B700.findall('B730'):
                for B731 in B730.findall('B731'):
                    for x in B731.findall('PARTY-US'):
                        try:
                            asn_orgname = USPTOSanitizer.return_element_text(x.find('NAM').find("ONM")).strip()[:500]
                        except:
                            asn_orgname = None
                        asn_role = None
                        try:
                            asn_city = USPTOSanitizer.return_element_text(x.find("ADR").find('CITY')).strip()[:100]
                        except:
                            asn_city = None
                        try:
                            asn_state = USPTOSanitizer.return_element_text(x.find("ADR").find('STATE')).strip()[:30]
                        except:
                            asn_state = None
                        # Assignee country
                        try:
                            asn_country = USPTOSanitizer.return_element_text(x.find("ADR").find('CTRY')).strip()[:3]
                        except:
                            try:
                                # Fix country if country missing
                                if USPTOSanitizer.is_US_state(asn_state):
                                    asn_country = "US"
                                else:
                                    asn_country = None
                            except:
                                asn_country = None
                        # Append SQL data into dictionary to be written later
                        processed_assignee.append({
                            "table_name": "uspto.ASSIGNEE_G",
                            "GrantID": document_id,
                            "Position": position,
                            "OrgName": asn_orgname,
                            "Role": asn_role,
                            "City": asn_city,
                            "State": asn_state,
                            "Country": asn_country,
                            "FileName": args_array['file_name']
                        })
                        #print(processed_assignee)
                        position += 1

            # B740 is Legal Agent / Attorney
            for B740 in B700.findall('B740'):
                # Reset position for agents
                position = 1
                for B741 in B740.findall('B741'):
                    for x in B741.findall('PARTY-US'):
                        # Attorney Organization
                        try:
                            agent_orgname = USPTOSanitizer.return_element_text(x.find('NAM').find("ONM")).strip()[:300]
                        except:
                            agent_orgname = None
                        # Attorney Name (FNM is the given name, SNM the surname)
                        try:
                            agent_first_name = USPTOSanitizer.return_element_text(x.find('NAM').find('FNM')).strip()[:100]
                        except:
                            agent_first_name = None
                        try:
                            agent_last_name = USPTOSanitizer.return_element_text(x.find('NAM').find('SNM')).strip()[:100]
                        except:
                            agent_last_name = None
                        # Attorney Address information
                        try:
                            agent_city = USPTOSanitizer.return_element_text(x.find("ADR").find('CITY')).strip()[:100]
                        except:
                            agent_city = None
                        try:
                            agent_state = USPTOSanitizer.return_element_text(x.find("ADR").find('STATE')).strip()[:30]
                        except:
                            agent_state = None
                        # Agent country
                        try:
                            agent_country = USPTOSanitizer.return_element_text(x.find("ADR").find('CTRY')).strip()[:3]
                        except:
                            try:
                                # Fix country if missing
                                if USPTOSanitizer.is_US_state(agent_state):
                                    agent_country = "US"
                                else:
                                    agent_country = None
                            except:
                                agent_country = None
                        # Append SQL data into dictionary to be written later
                        processed_agent.append({
                            "table_name": "uspto.AGENT_G",
                            "GrantID": document_id,
                            "Position": position,
                            "OrgName": agent_orgname,
                            "LastName": agent_last_name,
                            "FirstName": agent_first_name,
                            "Country": agent_country,
                            "FileName": args_array['file_name']
                        })
                        #print(processed_agent)
                        position += 1

            # B745 Examiner
            for B745 in B700.findall('B745'):
                position = 1
                # Primary Examiner
                for B746 in B745.findall('B746'):
                    for x in B746.findall('PARTY-US'):
                        try:
                            examiner_last_name = USPTOSanitizer.return_element_text(x.find('NAM').find('SNM')).strip()[:50]
                        except:
                            examiner_last_name = None
                        try:
                            examiner_first_name = USPTOSanitizer.return_element_text(x.find('NAM').find('FNM')).strip()[:50]
                        except:
                            examiner_first_name = None
                        try:
                            examiner_department = USPTOSanitizer.return_element_text(B745.find('B748US')).strip()[:50]
                        except:
                            examiner_department = None
                        # Append SQL data into dictionary to be written later
                        processed_examiner.append({
                            "table_name": "uspto.EXAMINER_G",
                            "GrantID": document_id,
                            "Position": position,
                            "LastName": examiner_last_name,
                            "FirstName": examiner_first_name,
                            "Department": examiner_department,
                            "FileName": args_array['file_name']
                        })
                        #print(processed_examiner)
                        position += 1
                # Assistant Examiner
                for B747 in B745.findall('B747'):
                    for x in B747.findall('PARTY-US'):
                        try:
                            examiner_last_name = USPTOSanitizer.return_element_text(x.find('NAM').find('SNM')).strip()[:50]
                        except:
                            examiner_last_name = None
                        try:
                            examiner_first_name = USPTOSanitizer.return_element_text(x.find('NAM').find('FNM')).strip()[:50]
                        except:
                            examiner_first_name = None
                        try:
                            examiner_department = USPTOSanitizer.return_element_text(B745.find('B748US')).strip()[:50]
                        except:
                            examiner_department = None
                        # Append SQL data into dictionary to be written later
                        processed_examiner.append({
                            "table_name": "uspto.EXAMINER_G",
                            "GrantID": document_id,
                            "Position": position,
                            "LastName": examiner_last_name,
                            "FirstName": examiner_first_name,
                            "Department": examiner_department,
                            "FileName": args_array['file_name']
                        })
                        #print(processed_examiner)
                        position += 1

        # B300 Foreign priority data
        position = 1
        for B300 in r.findall('B300'):
            # Country
            try:
                pc_country = USPTOSanitizer.return_element_text(B300.find('B330').find('CTRY')).strip()[:5]
            except:
                pc_country = None
            # Priority filing date
            try:
                pc_date = USPTOSanitizer.return_formatted_date(
                    USPTOSanitizer.return_element_text(B300.find('B320').find('DATE')).strip()[:45],
                    args_array, document_id)
            except:
                pc_date = None
            # Priority document number
            try:
                pc_doc_num = USPTOSanitizer.return_element_text(B300.find('B310').find('DNUM')).strip()[:45]
            except:
                pc_doc_num = None
            # Set the fields that are not in gXML2
            pc_kind = None
            # Append SQL data into dictionary to be written later
            processed_foreignpriority.append({
                "table_name": "uspto.FOREIGNPRIORITY_G",
                "GrantID": document_id,
                "Position": position,
                "Kind": pc_kind,
                "Country": pc_country,
                "DocumentID": pc_doc_num,
                "PriorityDate": pc_date,
                "FileName": args_array['file_name']
            })
            #print(processed_foreignpriority)
            position += 1

    # Collect Abstract from data
    try:
        a_elem = document_root.find('SDOAB')
        if a_elem is not None:
            abstract = USPTOSanitizer.strip_for_csv(USPTOSanitizer.return_element_text(a_elem))
        else:
            abstract = None
    except Exception as e:
        abstract = None
        #traceback.print_exc()
        #logger.error("Exception while extracting abstract from " + str(document_id) + ": " + traceback.format_exc())
    #print(abstract)

    # Collect detailed description from DETDESC
    try:
        d_elem = document_root.find('SDODE').find('DETDESC')
        if d_elem is not None:
            description = USPTOSanitizer.strip_for_csv(' '.join(d_elem.itertext()))
        else:
            description = None
    except Exception as e:
        description = None
        #traceback.print_exc()
        #logger.error("Exception while extracting description from " + str(document_id) + ": " + traceback.format_exc())
    #print(description)

    # Collect claims from data
    try:
        c_elem = document_root.find('SDOCL')
        if c_elem is not None:
            claims = USPTOSanitizer.strip_for_csv(' '.join(c_elem.itertext()))
            #claims = USPTOSanitizer.strip_for_csv(USPTOSanitizer.return_element_text(c_elem))
        else:
            claims = None
    except Exception as e:
        claims = None
        #traceback.print_exc()
        #logger.error("Exception while extracting claims from " + str(document_id) + ": " + traceback.format_exc())
    #print(claims)

    # Append SQL data into dictionary to be written later
    processed_grant.append({
        "table_name": "uspto.GRANT",
        "GrantID": document_id,
        "Title": title,
        "Claims": claims,
        "Description": description,
        "IssueDate": pub_date,
        "Kind": kind,
        "GrantLength": grant_length,
        "USSeriesCode": series_code,
        "Abstract": abstract,
        "ClaimsNum": claims_num,
        "DrawingsNum": number_of_drawings,
        "FiguresNum": number_of_figures,
        "ApplicationID": app_no,
        "FileDate": app_date,
        "AppType": app_type,
        "FileName": args_array['file_name']
    })
    #print(processed_grant)

    # Return a dictionary of the processed_ data arrays
    return {
        "processed_grant": processed_grant,
        "processed_applicant": processed_applicant,
        "processed_examiner": processed_examiner,
        "processed_assignee": processed_assignee,
        "processed_agent": processed_agent,
        "processed_inventor": processed_inventor,
        "processed_usclass": processed_usclass,
        "processed_intclass": processed_intclass,
        "processed_gracit": processed_gracit,
        "processed_forpatcit": processed_forpatcit,
        "processed_nonpatcit": processed_nonpatcit,
        "processed_foreignpriority": processed_foreignpriority
    }
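
# The example below is a usage sketch, not part of the parser: it shows how
# extract_XML2_grant() might be driven for a single raw XML document. The file
# path and URL are hypothetical; the args_array keys mirror the ones this
# module actually reads ('url_link', 'uspto_xml_format', 'file_name').
def _example_run_extract_XML2_grant(xml_path="pftaps_sample.xml"):
    # Read one pre-split PATDOC document from disk (hypothetical sample file)
    with open(xml_path, "r") as handle:
        raw_data = handle.read()
    args_array = {
        "url_link": "https://bulkdata.uspto.gov/example/pftaps19760106_wk01.zip",
        "uspto_xml_format": "gXML2",
        "file_name": "pftaps19760106_wk01",
    }
    # Each key in the returned dictionary maps to a list of row-dictionaries
    # tagged with their destination table name
    tables = extract_XML2_grant(raw_data, args_array)
    for table, rows in tables.items():
        print(table + ": " + str(len(rows)) + " row(s)")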
def extract_XML1_application(raw_data, args_array):

    # Set process start time
    start_time = time.time()
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Pass the url_link and format into local variables
    url_link = args_array['url_link']
    uspto_xml_format = args_array['uspto_xml_format']

    # Define required arrays
    processed_application = []
    processed_foreignpriority = []
    processed_assignee = []
    processed_agent = []
    processed_inventor = []
    processed_usclass = []
    processed_intclass = []
    processed_cpcclass = []

    # Pass the xml into Element tree object
    document_root = ET.fromstring(raw_data)
    r = document_root.find('subdoc-bibliographic-information')

    # Get and fix the document_id data
    di = r.find('document-id')
    if di is not None:
        # This document ID is NOT the application number
        try:
            document_id = di.findtext('doc-number').strip()
        except:
            document_id = None
            logger.error("No Patent Number was found for: " + url_link)
        try:
            kind = di.findtext('kind-code').strip()[:2]
            app_type = USPTOSanitizer.return_xml2_app_type(args_array, kind).strip()
        except:
            kind = None
            app_type = None
        try:
            pub_date = USPTOSanitizer.return_formatted_date(di.findtext('document-date'), args_array, document_id)
        except:
            pub_date = None

    # Get application filing data
    ar = r.find('domestic-filing-data')
    if ar is not None:
        try:
            app_no = ar.find('application-number').findtext('doc-number').strip()[:20]
        except:
            app_no = None
        try:
            app_date = USPTOSanitizer.return_formatted_date(ar.findtext('filing-date'), args_array, document_id)
        except:
            app_date = None
        try:
            series_code = ar.findtext('application-number-series-code').strip()[:2]
        except:
            series_code = None

    # Get technical information
    ti = r.find('technical-information')
    if ti is not None:

        # Get invention title
        try:
            title = USPTOSanitizer.strip_for_csv(ti.findtext('title-of-invention')[:500])
        except:
            title = None

        # Get international classification data
        ic = ti.find('classification-ipc')
        if ic is not None:
            # Init position
            position = 1
            # Process the primary international class
            icm = ic.find('classification-ipc-primary')
            if icm is not None:
                #print(icm.findtext('ipc'))
                # Clear variable values
                i_class_sec = None
                i_class = None
                i_subclass = None
                i_class_mgr = None
                i_class_sgr = None
                i_malformed = None
                try:
                    i_class_sec, i_class, i_subclass, i_class_mgr, i_class_sgr = \
                        USPTOSanitizer.return_international_class_XML1_application(icm.findtext('ipc'))
                    i_class_sec = i_class_sec.strip()[:15]
                    i_class = i_class.strip()[:15]
                    i_subclass = i_subclass.strip()[:15]
                    i_class_mgr = i_class_mgr.strip()[:15]
                    i_class_sgr = i_class_sgr.strip()[:15]
                except Exception as e:
                    traceback.print_exc()
                    i_class_sec = None
                    i_class = None
                    i_subclass = None
                    i_class_mgr = None
                    i_class_sgr = None
                    i_malformed = 1
                    logger.warning("Malformed international class found in application ID: " + document_id + " in file: " + url_link)
                # Append SQL data into dictionary to be written later
                processed_intclass.append({
                    "table_name": "uspto.INTCLASS_A",
                    "ApplicationID": app_no,
                    "Position": position,
                    "Section": i_class_sec,
                    "Class": i_class,
                    "SubClass": i_subclass,
                    "MainGroup": i_class_mgr,
                    "SubGroup": i_class_sgr,
                    "Malformed": i_malformed,
                    "FileName": args_array['file_name']
                })
                #print(processed_intclass)
                position += 1

            # Process any secondary international classes
            ics = ic.findall('classification-ipc-secondary')
            if ics is not None:
                for ics_item in ics:
                    # Clear variable values
                    i_class_sec = None
                    i_class = None
                    i_subclass = None
                    i_class_mgr = None
                    i_class_sgr = None
                    i_malformed = None
                    try:
                        i_class_sec, i_class, i_subclass, i_class_mgr, i_class_sgr = \
                            USPTOSanitizer.return_international_class_XML1_application(ics_item.findtext('ipc'))
                        i_class_sec = i_class_sec.strip()[:15]
                        i_class = i_class.strip()[:15]
                        i_subclass = i_subclass.strip()[:15]
                        i_class_mgr = i_class_mgr.strip()[:15]
                        i_class_sgr = i_class_sgr.strip()[:15]
                    except Exception as e:
                        traceback.print_exc()
                        i_class_sec = None
                        i_class = None
                        i_subclass = None
                        i_class_mgr = None
                        i_class_sgr = None
                        i_malformed = 1
                        logger.warning("Malformed international class found in application ID: " + document_id + " in file: " + url_link)
                    # Append SQL data into dictionary to be written later
                    processed_intclass.append({
                        "table_name": "uspto.INTCLASS_A",
                        "ApplicationID": app_no,
                        "Position": position,
                        "Section": i_class_sec,
                        "Class": i_class,
                        "SubClass": i_subclass,
                        "MainGroup": i_class_mgr,
                        "SubGroup": i_class_sgr,
                        "Malformed": i_malformed,
                        "FileName": args_array['file_name']
                    })
                    #print(processed_intclass)
                    position += 1

        # Get US classification data
        nc = ti.find('classification-us')
        nc_position = 1
        if nc is not None:
            uspc = nc.find('classification-us-primary').find('uspc')
            if uspc is not None:
                n_class_main = None
                n_subclass = None
                try:
                    n_class_main = uspc.findtext('class').strip()[:5]
                except:
                    n_class_main = None
                try:
                    n_subclass = uspc.findtext('subclass').strip()[:15]
                except:
                    n_subclass = None
                # Append SQL data into dictionary to be written later
                processed_usclass.append({
                    "table_name": "uspto.USCLASS_A",
                    "ApplicationID": app_no,
                    "Position": nc_position,
                    "Class": n_class_main,
                    "SubClass": n_subclass,
                    "FileName": args_array['file_name']
                })
                #print(processed_usclass)
                nc_position += 1
            # Collect all secondary US classes
            ncs = nc.findall('classification-us-secondary')
            for ncs_item in ncs:
                n_class_main = None
                n_subclass = None
                uspc = ncs_item.find('uspc')
                if uspc is not None:
                    try:
                        n_class_main = uspc.findtext('class').strip()[:5]
                    except:
                        n_class_main = None
                    try:
                        n_subclass = uspc.findtext('subclass').strip()[:15]
                    except:
                        n_subclass = None
                    # Append SQL data into dictionary to be written later
                    processed_usclass.append({
                        "table_name": "uspto.USCLASS_A",
                        "ApplicationID": app_no,
                        "Position": nc_position,
                        "Class": n_class_main,
                        "SubClass": n_subclass,
                        "FileName": args_array['file_name']
                    })
                    #print(processed_usclass)
                    nc_position += 1

    # Get priority claims
    pc_position = 1
    pc_kind = None
    for pc in r.findall('foreign-priority-data'):
        try:
            pc_country = pc.findtext('country-code').strip()[:100]
        except:
            pc_country = None
        try:
            pc_doc_num = pc.find('priority-application-number').findtext('doc-number').strip()[:100]
        except:
            pc_doc_num = None
        try:
            pc_date = USPTOSanitizer.return_formatted_date(pc.findtext('filing-date'), args_array, document_id)
        except:
            pc_date = None
        # Append SQL data into dictionary to be written later
        processed_foreignpriority.append({
            "table_name": "uspto.FOREIGNPRIORITY_A",
            "ApplicationID": app_no,
            "Position": pc_position,
            "Kind": pc_kind,
            "Country": pc_country,
            "DocumentID": pc_doc_num,
            "PriorityDate": pc_date,
            "FileName": args_array['file_name']
        })
        #print(processed_foreignpriority)
        pc_position += 1

    # Get inventor data
    invs = r.find('inventors')
    if invs is not None:
        # Init position
        inv_position = 1
        for inventor in invs.findall('first-named-inventor'):
            n = inventor.find('name')
            try:
                inventor_first_name = n.findtext('given-name').strip()[:100]
            except:
                inventor_first_name = None
            try:
                inventor_last_name = n.findtext('family-name').strip()[:100]
            except:
                inventor_last_name = None
            # Get the residence tag
            res = inventor.find('residence')
            if res is not None:
                residence_us = res.find('residence-us')
                if residence_us is not None:
                    try:
                        inventor_city = residence_us.findtext('city').strip()[:100]
                    except:
                        inventor_city = None
                    try:
                        inventor_state = residence_us.findtext('state').strip()[:100]
                    except:
                        inventor_state = None
                    try:
                        inventor_country = residence_us.findtext('country-code').strip()[:100]
                    except:
                        inventor_country = None
                residence_non_us = res.find('residence-non-us')
                if residence_non_us is not None:
                    try:
                        inventor_city = residence_non_us.findtext('city').strip()[:100]
                    except:
                        inventor_city = None
                    try:
                        inventor_state = residence_non_us.findtext('state').strip()[:100]
                    except:
                        inventor_state = None
                    try:
                        inventor_country = residence_non_us.findtext('country-code').strip()[:100]
                    except:
                        inventor_country = None
            # Append SQL data into dictionary to be written later
            processed_inventor.append({
                "table_name": "uspto.INVENTOR_A",
                "ApplicationID": app_no,
                "Position": inv_position,
                "FirstName": inventor_first_name,
                "LastName": inventor_last_name,
                "City": inventor_city,
                "State": inventor_state,
                "Country": inventor_country,
                "FileName": args_array['file_name']
            })
            #print(processed_inventor)
            inv_position += 1

        # For all secondary inventors
        for inv in invs.findall('inventor'):
            if inv is not None:
                n = inv.find('name')
                if n is not None:
                    try:
                        inventor_first_name = n.findtext('given-name').strip()[:100]
                    except:
                        inventor_first_name = None
                    try:
                        inventor_last_name = n.findtext('family-name').strip()[:100]
                    except:
                        inventor_last_name = None
                res = inv.find('residence')
                if res is not None:
                    residence_us = res.find('residence-us')
                    if residence_us is not None:
                        try:
                            inventor_city = residence_us.findtext('city').strip()[:100]
                        except:
                            inventor_city = None
                        try:
                            inventor_state = residence_us.findtext('state').strip()[:100]
                        except:
                            inventor_state = None
                        try:
                            inventor_country = residence_us.findtext('country-code').strip()[:100]
                        except:
                            inventor_country = None
                    residence_non_us = res.find('residence-non-us')
                    if residence_non_us is not None:
                        try:
                            inventor_city = residence_non_us.findtext('city').strip()[:100]
                        except:
                            inventor_city = None
                        try:
                            inventor_state = residence_non_us.findtext('state').strip()[:100]
                        except:
                            inventor_state = None
                        try:
                            inventor_country = residence_non_us.findtext('country-code').strip()[:100]
                        except:
                            inventor_country = None
                # Append SQL data into dictionary to be written later
                processed_inventor.append({
                    "table_name": "uspto.INVENTOR_A",
                    "ApplicationID": app_no,
                    "Position": inv_position,
                    "FirstName": inventor_first_name,
                    "LastName": inventor_last_name,
                    "City": inventor_city,
                    "State": inventor_state,
                    "Country": inventor_country,
                    "FileName": args_array['file_name']
                })
                #print(processed_inventor)
                inv_position += 1

    # Get assignee data
    # Init position
    asn_position = 1
    for asn in r.findall('assignee'):
        try:
            asn_role = asn.findtext('assignee-type').strip()[:100]
        except:
            asn_role = None
        try:
            asn_orgname = asn.findtext('organization-name').strip()[:300]
        except:
            asn_orgname = None
        adr_elem = asn.find('address')
        try:
            asn_city = adr_elem.findtext('city').strip()[:100]
        except:
            asn_city = None
        try:
            asn_state = adr_elem.findtext('state').strip()[:100]
        except:
            asn_state = None
        try:
            asn_country = adr_elem.find('country').findtext('country-code').strip()[:100]
        except:
            asn_country = None
        if asn_country is None:
            if USPTOSanitizer.is_US_state(asn_state):
                asn_country = "US"
        # These have not been found in XML1,
        # but a full XML parse should be done
        asn_firstname = None
        asn_lastname = None
        # Append SQL data into dictionary to be written later
        processed_assignee.append({
            "table_name": "uspto.ASSIGNEE_A",
            "ApplicationID": app_no,
            "Position": asn_position,
            "OrgName": asn_orgname,
            "FirstName": asn_firstname,
            "LastName": asn_lastname,
            "Role": asn_role,
            "City": asn_city,
            "State": asn_state,
            "Country": asn_country,
            "FileName": args_array['file_name']
        })
        #print(processed_assignee)
        asn_position += 1

    # Find the agent element
    agn = r.find('correspondence-address')
    # Init position
    agn_position = 1
    if agn is not None:
        try:
            agent_orgname = agn.findtext('name-1').strip()
        except:
            agent_orgname = None
        try:
            agent_orgname_2 = agn.findtext('name-2').strip()
        except:
            agent_orgname_2 = None
        # Combine Orgname 1 and 2 and shorten if needed
        if agent_orgname is not None and agent_orgname_2 is not None:
            agent_orgname = USPTOSanitizer.strip_for_csv(agent_orgname + " " + agent_orgname_2)[:300]
        # Get the address element
        addr_elem = agn.find('address')
        if addr_elem is not None:
            try:
                try:
                    agent_addr_1 = addr_elem.findtext('address-1').strip()[:100]
                except:
                    agent_addr_1 = ""
                try:
                    agent_addr_2 = addr_elem.findtext('address-2').strip()[:100]
                except:
                    agent_addr_2 = ""
                agent_address = USPTOSanitizer.strip_for_csv(agent_addr_1 + agent_addr_2)
            except:
                agent_address = None
            try:
                agent_city = addr_elem.findtext('city').strip()[:50]
            except:
                agent_city = None
            try:
                agent_state = addr_elem.findtext('state').strip()[:3]
            except:
                agent_state = None
            try:
                agent_country = addr_elem.find('country').findtext('country-code').strip()[:3]
            except:
                if USPTOSanitizer.is_US_state(agent_state):
                    agent_country = "US"
                else:
                    agent_country = None
        # Append SQL data into dictionary to be written later
        processed_agent.append({
            "table_name": "uspto.AGENT_A",
            "ApplicationID": app_no,
            "Position": agn_position,
            "OrgName": agent_orgname,
            "Address": agent_address,
            "City": agent_city,
            "State": agent_state,
            "Country": agent_country,
            "FileName": args_array['file_name']
        })
        #print(processed_agent)
        agn_position += 1

    # Find the abstract of the application
    try:
        abstract = USPTOSanitizer.strip_for_csv(
            USPTOSanitizer.return_element_text(document_root.find('subdoc-abstract')))
    except:
        abstract = None

    # Find the description
    try:
        description = ""
        d_elem = document_root.find('subdoc-description')
        if d_elem is not None:
            description += USPTOSanitizer.strip_for_csv(' '.join(d_elem.itertext()))
        else:
            description = None
    except Exception as e:
        description = None
        #traceback.print_exc()
        #logger.error("Exception while extracting description from " + str(app_no))
    #print(description)

    # Find the claims
    try:
        claims = ""
        c_elem = document_root.find('subdoc-claims')
        if c_elem is not None:
            claims += USPTOSanitizer.strip_for_csv(' '.join(c_elem.itertext()))
        else:
            claims = None
    except Exception as e:
        claims = None
        #traceback.print_exc()
        #logger.error("Exception while extracting claims from " + str(app_no))
    #print(claims)

    # Find the number of claims
    try:
        number_of_claims = 0
        for clms in c_elem.findall('claim'):
            number_of_claims += 1
    except Exception as e:
        number_of_claims = None
        #traceback.print_exc()
        #logger.error("Exception while extracting number of claims from " + str(app_no))
    #print(number_of_claims)

    # Find the number of drawings and figures
    try:
        number_of_figures = 0
        number_of_drawings = 0
        drw_elem = document_root.find('subdoc-drawings')
        if drw_elem is not None:
            for fg in drw_elem.findall('figure'):
                img_type = fg.find('image').attrib['ti'].strip()
                if img_type == "DR":
                    number_of_drawings += 1
                elif img_type == "FG":
                    number_of_figures += 1
        else:
            number_of_figures = None
            number_of_drawings = None
    except Exception as e:
        number_of_figures = None
        number_of_drawings = None
        #traceback.print_exc()
        #logger.error("Exception while extracting figures and drawings num " + str(app_no))
    #print(number_of_figures)
    #print(number_of_drawings)

    # Append SQL data into dictionary to be written later
    processed_application.append({
        "table_name": "uspto.APPLICATION",
        "ApplicationID": app_no,
        "PublicationID": document_id,
        "AppType": app_type,
        "Title": title,
        "FileDate": app_date,
        "PublishDate": pub_date,
        "Kind": kind,
        "USSeriesCode": series_code,
        "Abstract": abstract,
        "ClaimsNum": number_of_claims,
        "DrawingsNum": number_of_drawings,
        "FiguresNum": number_of_figures,
        "Description": description,
        "Claims": claims,
        "FileName": args_array['file_name']
    })
    #print(processed_application)

    # Return a dictionary of the processed_ data arrays
    return {
        "processed_application": processed_application,
        "processed_foreignpriority": processed_foreignpriority,
        "processed_assignee": processed_assignee,
        "processed_agent": processed_agent,
        "processed_inventor": processed_inventor,
        "processed_usclass": processed_usclass,
        "processed_intclass": processed_intclass,
        "processed_cpcclass": processed_cpcclass
    }
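
# Illustration only: this module assumes
# USPTOSanitizer.return_international_class_XML1_application() splits an IPC
# string such as "B65D 081/02" into five parts. The helper below is a minimal
# local sketch of that assumed contract, not the sanitizer's implementation.
def _example_split_ipc(ipc_text):
    # "B65D 081/02" -> section "B", class "65", subclass "D",
    # main group "081", subgroup "02"
    code, _, group = ipc_text.strip().partition(" ")
    section = code[0]
    i_class = code[1:3]
    i_subclass = code[3:]
    main_group, _, sub_group = group.partition("/")
    return section, i_class, i_subclass, main_group, sub_group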
def extract_XML2_grant_tag_counts(args_array):

    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Extract the XML file from the ZIP file
    xml_file_contents = USPTOProcessZipFile.extract_xml_file_from_zip(args_array)

    # If xml_file_contents is None or False, then return immediately
    if xml_file_contents is None or xml_file_contents is False:
        return False

    # Declare a dictionary to use in counting tags
    # NOTE: CPCCLASS_G and APPLICANT_G are not available in XML2 Grant files
    tags_dict = {
        "GRANT": ["<PATDOC"],
        "INTCLASS_G": ["<B510"],
        "USCLASS_G": ["<B521", "<B522"],
        "INVENTOR_G": ["<B721"],
        "AGENT_G": ["<B740"],
        "ASSIGNEE_G": ["<B730"],
        "NONPATCIT_G": ["<B562"],
        "EXAMINER_G": ["<B746", "<B747"],
        "FOREIGNPRIORITY_G": ["<B310"]
    }

    # Declare a dictionary to hold counts by table
    counts_dict = {
        "file_name": args_array['file_name'],
        "GRANT": 0,
        "INTCLASS_G": 0,
        "CPCCLASS_G": 0,
        "USCLASS_G": 0,
        "INVENTOR_G": 0,
        "AGENT_G": 0,
        "ASSIGNEE_G": 0,
        "APPLICANT_G": 0,
        "NONPATCIT_G": 0,
        "EXAMINER_G": 0,
        "GRACIT_G": 0,
        "FORPATCIT_G": 0,
        "FOREIGNPRIORITY_G": 0
    }

    # Print to stdout and log
    print("-- Starting the XML2 grant tag counting process for contents of: " + args_array['file_name'] + ". Time Started: " + time.strftime("%c"))
    logger.info("Starting the XML2 grant tag counting process for contents of: " + args_array['file_name'] + ". Time Started: " + time.strftime("%c"))

    # Loop through the file contents line by line
    for line in xml_file_contents:
        # Decode the line from byte-object
        line = USPTOSanitizer.decode_line(line)
        # Loop through tags_dict items and look for XML tags
        for table, tag in tags_dict.items():
            item_found = False
            # If a list of tags is provided
            if isinstance(tag, list):
                for item in tag:
                    # Look for field tag
                    if item in line:
                        item_found = True
            if item_found:
                # Increment the count for the appropriate table
                counts_dict[table] += 1

    # Count the items that cannot be counted by tags alone;
    # these tags need to be XML parsed.
    # Create variables needed to parse the file
    xml_string = ''
    patent_xml_started = False
    # Loop through all lines in the xml file
    for line in xml_file_contents:
        # Decode the line from byte-object
        line = USPTOSanitizer.decode_line(line)
        # This identifies the start of a well-formed XML segment for patent
        # grant bibliographic information
        if "<PATDOC" in line:
            patent_xml_started = True
            xml_string += "<PATDOC>"
        # This identifies the end of a well-formed XML segment for a single
        # patent grant's bibliographic information
        elif "</PATDOC" in line:
            patent_xml_started = False
            xml_string += "</PATDOC>"
            #print(xml_string)
            # Pass the raw_data data into Element Tree
            try:
                document_root = ET.fromstring(xml_string)
                # SDOBI is the bibliographic data
                r = document_root.find('SDOBI')
                # Patent Citations
                B500 = r.find('B500')
                if B500 is not None:
                    for B560 in B500.findall('B560'):
                        # B561 is Patent Citation
                        for B561 in B560.findall('B561'):
                            try:
                                pcit = B561.find('PCIT').find('DOC')
                            except:
                                pcit = None
                            if pcit is not None:
                                prt = pcit.find('PARTY-US')
                                try:
                                    citation_state = USPTOSanitizer.return_element_text(prt.find('ADR').find('STATE')).strip()[:3]
                                except:
                                    citation_state = None
                                try:
                                    citation_country = USPTOSanitizer.return_element_text(prt.find("ADR").find('CTRY')).strip()[:3]
                                except:
                                    try:
                                        # If state is a US state, set country to US
                                        if USPTOSanitizer.is_US_state(citation_state):
                                            citation_country = "US"
                                        else:
                                            citation_country = None
                                    except:
                                        citation_country = None
                                if citation_country == "US" or citation_country is None:
                                    counts_dict['GRACIT_G'] += 1
                                elif citation_country is not None:
                                    counts_dict['FORPATCIT_G'] += 1
                # Reset the xml string
                xml_string = ''
            except ET.ParseError as e:
                print_xml = xml_string.split("\n")
                for num, line in enumerate(print_xml, start=1):
                    #print(str(num) + ' : ' + line)
                    logger.error(str(num) + ' : ' + line)
                logger.error("Character Entity prevented ET from parsing XML in file: " + args_array['file_name'])
                traceback.print_exc()
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                logger.error("Exception: " + str(exc_type) + " in Filename: " + str(fname) + " on Line: " + str(exc_tb.tb_lineno) + " Traceback: " + traceback.format_exc())
        # This is used to append lines of the file while inside a single patent grant
        elif patent_xml_started:
            # Check which type of encoding should be used to fix the line string
            xml_string += USPTOSanitizer.replace_old_html_characters(line)

    # Print to stdout and log
    print("-- Finished the XML2 grant tag counting process for contents of: " + args_array['file_name'] + ". Time Finished: " + time.strftime("%c"))
    logger.info("Finished the XML2 grant tag counting process for contents of: " + args_array['file_name'] + ". Time Finished: " + time.strftime("%c"))

    # Return the dictionary of counts for found tags
    if args_array['stdout_level'] == 1:
        pprint(counts_dict)
    return counts_dict
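
# Minimal sketch of the substring-count strategy used above: every table is
# mapped to one or more opening-tag markers, and a line increments a table's
# count when any of its markers appears in that line. The sample lines in the
# usage comment below are hypothetical.
def _example_count_tags(lines, tags_dict):
    counts = {table: 0 for table in tags_dict}
    for line in lines:
        for table, tags in tags_dict.items():
            # Count each line at most once per table, as the loop above does
            if any(tag in line for tag in tags):
                counts[table] += 1
    return counts

# Example:
#   _example_count_tags(["<PATDOC>", "<B521>037/2</B521>", "<B522>037/9</B522>"],
#                       {"GRANT": ["<PATDOC"], "USCLASS_G": ["<B521", "<B522"]})
#   -> {"GRANT": 1, "USCLASS_G": 2}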
def extract_XML2_grant(raw_data, args_array):
    #
    # Data documentation on the fields in XML2 Grant data can be found
    # in the /documents/data_descriptions/PatentGrantSGMLv19-Documentation.pdf file
    #

    # Import logger
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Pass the url_link and format into local variables
    url_link = args_array['url_link']
    uspto_xml_format = args_array['uspto_xml_format']
    #print(raw_data)

    # Define all arrays needed to hold the data
    processed_grant = []
    processed_applicant = []
    processed_examiner = []
    processed_assignee = []
    processed_agent = []
    processed_inventor = []
    processed_usclass = []
    processed_intclass = []
    processed_gracit = []
    processed_forpatcit = []
    processed_nonpatcit = []
    processed_foreignpriority = []

    # Start timer
    start_time = time.time()

    try:
        # Pass the raw data into Element tree xml object
        patent_root = ET.fromstring(raw_data)
    except ET.ParseError as e:
        print_xml = raw_data.split("\n")
        for num, line in enumerate(print_xml, start=1):
            print(str(num) + ' : ' + line)
        logger.error("Character Entity prevented ET from parsing XML in file: " + url_link)
        # Print traceback
        traceback.print_exc()
        # Print exception information to file
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        logger.error("Exception: " + str(exc_type) + " in Filename: " + str(fname) + " on Line: " + str(exc_tb.tb_lineno) + " Traceback: " + traceback.format_exc())

    # Start the parsing process for XML
    for r in patent_root.findall('SDOBI'):

        # Collect main grant document data
        for B100 in r.findall('B100'):
            try:
                document_id = USPTOSanitizer.return_element_text(B100.find('B110'))
                document_id = USPTOSanitizer.fix_patent_number(document_id)[:20]
            except:
                document_id = None
                logger.error("No Patent Number was found for: " + url_link)
            try:
                kind = USPTOSanitizer.return_element_text(B100.find('B130'))[:2]
                app_type = USPTOSanitizer.return_xml2_app_type(args_array, kind)
            except:
                kind = None
            try:
                # PATENT ISSUE DATE
                pub_date = USPTOSanitizer.return_formatted_date(
                    USPTOSanitizer.return_element_text(B100.find('B140')), args_array, document_id)
            except:
                pub_date = None
            try:
                # PATENT APPLICANT COUNTRY??
                pub_country = USPTOSanitizer.return_element_text(B100.find('B190'))
            except:
                pub_country = None

        # Collect application data in the document
        for B200 in r.findall('B200'):
            # TODO: find this in XML2 applications
            app_country = None
            try:
                # Application number
                app_no = USPTOSanitizer.return_element_text(B200.find('B210'))[:20]
            except:
                app_no = None
            try:
                # Application date
                app_date = USPTOSanitizer.return_formatted_date(
                    USPTOSanitizer.return_element_text(B200.find('B220')), args_array, document_id)
            except:
                app_date = None
            try:
                series_code = USPTOSanitizer.return_element_text(B200.find('B211US'))[:2]
            except:
                series_code = None

        # Collect the grant length
        grant_length = USPTOSanitizer.return_element_text(r.find("B474"))

        # Collect US classification
        for B500 in r.findall('B500'):

            # US Classification
            for B520 in B500.findall('B520'):
                position = 1
                # USCLASS
                for B521 in B520.findall('B521'):
                    n_class_info = USPTOSanitizer.return_element_text(B521)
                    n_class_main, n_subclass = USPTOSanitizer.return_class(n_class_info)
                    n_class_main = n_class_main[:5]
                    n_subclass = n_subclass[:15]
                    # Append SQL data into dictionary to be written later
                    processed_usclass.append({
                        "table_name": "uspto.USCLASS_G",
                        "GrantID": document_id,
                        "Position": position,
                        "Class": n_class_main,
                        "SubClass": n_subclass,
                        "FileName": args_array['file_name']
                    })
                    position += 1
                # USCLASS FURTHER
                for B522 in B520.findall('B522'):
                    n_class_info = USPTOSanitizer.return_element_text(B522)
                    n_class_main, n_subclass = USPTOSanitizer.return_class(n_class_info)
                    n_class_main = n_class_main[:5]
                    n_subclass = n_subclass[:15]
                    # Append SQL data into dictionary to be written later
                    processed_usclass.append({
                        "table_name": "uspto.USCLASS_G",
                        "GrantID": document_id,
                        "Position": position,
                        "Class": n_class_main,
                        "SubClass": n_subclass,
                        "FileName": args_array['file_name']
                    })
                    position += 1

            # Collect International Class data
            # TODO: check if I need to set all variables to empty or can just leave as null
            # TODO: check if classification is parsed correctly
            for B510 in B500.findall('B510'):
                # INTERNATIONAL CLASS
                #logger.warning("International Classification found in XML2: " + args_array['url_link'] + " document: " + str(document_id))
                # Reset position
                position = 1
                # MAIN CLASS
                for B511 in B510.findall('B511'):
                    i_class_version_date = None
                    i_class_action_date = None
                    i_class_gnr = None
                    i_class_level = None
                    i_class_sec = None
                    int_class = USPTOSanitizer.return_element_text(B511)
                    # TODO: check international classification and rewrite this parsing piece.
                    if (len(int_class.split()) > 1):
                        i_class, i_subclass = int_class.split()
                        i_class = i_class[:15]
                        i_subclass = i_subclass[:15]
                    else:
                        i_class = int_class[:15]
                        i_subclass = None
                    i_class_mgr = None
                    i_class_sgr = None
                    i_class_sps = None
                    i_class_val = None
                    i_class_status = None
                    i_class_ds = None
                    # Append SQL data into dictionary to be written later
                    processed_intclass.append({
                        "table_name": "uspto.INTCLASS_G",
                        "GrantID": document_id,
                        "Position": position,
                        "Section": i_class_sec,
                        "Class": i_class,
                        "SubClass": i_subclass,
                        "MainGroup": i_class_mgr,
                        "SubGroup": i_class_sgr,
                        "FileName": args_array['file_name']
                    })
                    position += 1
                # INTERNATIONAL CLASS FURTHER
                for B512 in B510.findall('B512'):
                    i_class_version_date = None
                    i_class_action_date = None
                    i_class_gnr = None
                    i_class_level = None
                    i_class_sec = None
                    int_class = USPTOSanitizer.return_element_text(B512)
                    # TODO: splitting int class does not include possible multiple subclasses
                    if (len(int_class.split()) > 1):
                        i_class = int_class.split()[0][:15]
                        i_subclass = int_class.split()[1][:15]
                    else:
                        i_class = int_class[:15]
                        i_subclass = None
                    i_class_mgr = None
                    i_class_sgr = None
                    i_class_sps = None
                    i_class_val = None
                    i_class_status = None
                    i_class_ds = None
                    # Append SQL data into dictionary to be written later
                    processed_intclass.append({
                        "table_name": "uspto.INTCLASS_G",
                        "GrantID": document_id,
                        "Position": position,
                        "Section": i_class_sec,
                        "Class": i_class,
                        "SubClass": i_subclass,
                        "MainGroup": i_class_mgr,
                        "SubGroup": i_class_sgr,
                        "FileName": args_array['file_name']
                    })
                    position += 1

            # Collect Title
            for B540 in B500.findall('B540'):
                try:
                    title = USPTOSanitizer.return_element_text(B540)[:500]
                except:
                    title = None

            # Collect Citations
            for B560 in B500.findall('B560'):
                # CITATIONS
                # Reset position counter for all citations loop
                position = 1
                # PATCIT
                for B561 in B560.findall('B561'):
                    # TODO: find out how to do PCIT, DOC without loop. Only B561 needs loop
                    PCIT = B561.find('PCIT')
                    # Determine if the patent is US or not
                    # TODO: needs a better check; what does a non-US patent look like?
                    # If all patents have PARTY-US then perhaps a database call to check the country of origin
                    # would still allow separating into GRACIT and FORPATCIT_G
                    #if PCIT.find("PARTY-US") == True:
                    #    #print("CITATION COUNTRY US")
                    #    citation_country = "US"
                    #else:
                    #    citation_country = "NON-US"
                    #    logger.warning("NON US patent found")
                    citation_country = "US"
                    DOC = PCIT.find('DOC')
                    try:
                        citation_document_number = USPTOSanitizer.return_element_text(DOC.find('DNUM'))[:15]
                    except:
                        citation_document_number = None
                    try:
                        pct_kind = USPTOSanitizer.return_element_text(DOC.find('KIND'))[:10]
                    except:
                        pct_kind = None
                    try:
                        citation_date = USPTOSanitizer.return_formatted_date(
                            USPTOSanitizer.return_element_text(DOC.find('DATE')), args_array, document_id)
                    except:
                        citation_date = None
                    try:
                        citation_name = USPTOSanitizer.return_element_text(PCIT.find('PARTY-US'))[:100]
                    except:
                        citation_name = None
                    # Parse citation category
                    # (list(B561) replaces the getchildren() call removed in Python 3.9)
                    if (len(list(B561)) > 1):
                        citation_category = list(B561)[1].tag.replace("\n", "").replace("\r", "")
                        #print(type(citation_category))
                        # TODO: check that the citation category tag matches correctly
                        #print("Citation Category = " + citation_category + " Length: " + str(len(citation_category)))
                        if "CITED-BY-EXAMINER" in citation_category:
                            citation_category = 1
                        elif "CITED-BY-OTHER" in citation_category:
                            citation_category = 2
                        else:
                            citation_category = 0
                            logger.warning("Cited by unknown type")
                    else:
                        citation_category = None
                    # TODO: be aware that there may be something crazy in the citation document number
                    if citation_country == "US":
                        # Append SQL data into dictionary to be written later
                        processed_gracit.append({
                            "table_name": "uspto.GRACIT_G",
                            "GrantID": document_id,
                            "Position": position,
                            "CitedID": citation_document_number,
                            "Kind": pct_kind,
                            "Name": citation_name,
                            "Date": citation_date,
                            "Country": citation_country,
                            "Category": citation_category,
                            "FileName": args_array['file_name']
                        })
                        position += 1
                    else:
                        # Append SQL data into dictionary to be written later
                        processed_forpatcit.append({
                            "table_name": "uspto.FORPATCIT_G",
                            "GrantID": document_id,
                            "Position": position,
                            "CitedID": citation_document_number,
                            "Kind": pct_kind,
                            "Name": citation_name,
                            "Date": citation_date,
                            "Country": citation_country,
                            "Category": citation_category,
                            "FileName": args_array['file_name']
                        })
                        position += 1

                # Reset position counter for non-patent citations loop
                position = 1
                # NON-PATENT LITERATURE
                for B562 in B560.findall('B562'):
                    for NCIT in B562.findall('NCIT'):
                        # Sometimes there will be '<i>', '<sup>', etc. in the
                        # reference string; we need to remove it
                        non_patent_citation_text = USPTOSanitizer.return_element_text(NCIT)
                        non_patent_citation_text = re.sub('<[^>]+>', '', non_patent_citation_text)
                        # Parse citation category into code
                        ncitation_category = ET.tostring(NCIT)
                        if (len(list(B562)) > 1):
                            ncitation_category = list(B562)[1].tag.replace("\n", "").replace("\r", "")
                            #print(type(ncitation_category))
                            #print("Non patent citation category " + ncitation_category)
                            if "CITED-BY-EXAMINER" in ncitation_category:
                                ncitation_category = 1
                            elif "CITED-BY-OTHER" in ncitation_category:
                                ncitation_category = 2
                            else:
                                ncitation_category = 0
                        # Append SQL data into dictionary to be written later
                        processed_nonpatcit.append({
                            "table_name": "uspto.NONPATCIT_G",
                            "GrantID": document_id,
                            "Position": position,
                            "Citation": non_patent_citation_text,
                            "Category": ncitation_category,
                            "FileName": args_array['file_name']
                        })
                        position += 1

            # Collect number of claims
            for B570 in B500.findall('B570'):
                try:
                    claims_num = USPTOSanitizer.return_element_text(B570.find('B577'))
                except:
                    claims_num = None

            # Collect number of drawings and figures
            for B590 in B500.findall('B590'):
                for B595 in B590.findall('B595'):
                    try:
                        number_of_drawings = USPTOSanitizer.return_element_text(B595)
                        number_of_drawings = number_of_drawings.split("/")[0]
                    except:
                        number_of_drawings = None
                for B596 in B590.findall('B596'):
                    try:
                        number_of_figures = USPTOSanitizer.return_element_text(B596)
                    except:
                        number_of_figures = None

            # TODO: B582 find out what it is. Looks like patent classifications but it's all alone in the XML

        # Collect party information
        # TODO: find the applicant data and append to array
        for B700 in r.findall('B700'):
            # PARTIES
            # Collect inventor data
            for B720 in B700.findall('B720'):
                # INVENTOR
                # Reset position for inventors
                position = 1
                # Collect inventor information
                for B721 in B720.findall('B721'):
                    for i in B721.findall('PARTY-US'):
                        itSequence = position
                        try:
                            inventor_first_name = USPTOSanitizer.return_element_text(i.find('NAM').find('FNM'))[:100]
                        except:
                            inventor_first_name = None
                        try:
                            inventor_last_name = USPTOSanitizer.return_element_text(i.find('NAM').find('SNM'))[:100]
                        except:
                            inventor_last_name = None
                        try:
                            inventor_city = USPTOSanitizer.return_element_text(i.find('ADR').find('CITY'))[:100]
                        except:
                            inventor_city = None
                        try:
                            inventor_state = USPTOSanitizer.return_element_text(i.find('ADR').find('STATE'))[:3]
                        except:
                            inventor_state = None
                        # Inventor country (note: the loop variable is i, not x)
                        try:
                            inventor_country = USPTOSanitizer.return_element_text(i.find("ADR").find('CTRY'))[:3]
                        except:
                            try:
                                if USPTOSanitizer.is_US_state(inventor_state):
                                    inventor_country = "US"
                                else:
                                    inventor_country = None
                            except:
                                inventor_country = None
                        inventor_nationality = None
                        inventor_residence = None
                        # Append SQL data into dictionary to be written later
                        processed_inventor.append({
                            "table_name": "uspto.INVENTOR_G",
                            "GrantID": document_id,
                            "Position": position,
                            "FirstName": inventor_first_name,
                            "LastName": inventor_last_name,
                            "City": inventor_city,
                            "State": inventor_state,
                            "Country": inventor_country,
                            "Nationality": inventor_nationality,
                            "Residence": inventor_residence,
                            "FileName": args_array['file_name']
                        })
                        position += 1

            # Collect Assignee data
            # TODO: check if finding child of child is working
            # Reset position for assignees
            position = 1
            for B730 in B700.findall('B730'):
                for B731 in B730.findall('B731'):
                    for x in B731.findall('PARTY-US'):
                        try:
                            asn_orgname = USPTOSanitizer.return_element_text(x.find('NAM').find("ONM"))[:500]
                        except:
                            asn_orgname = None
                        asn_role = None
                        try:
                            asn_city = USPTOSanitizer.return_element_text(x.find("ADR").find('CITY'))[:100]
                        except:
                            asn_city = None
                        try:
                            asn_state = USPTOSanitizer.return_element_text(x.find("ADR").find('STATE'))[:30]
                        except:
                            asn_state = None
                        # Assignee country
                        try:
                            asn_country = USPTOSanitizer.return_element_text(x.find("ADR").find('CTRY'))[:3]
                        except:
                            try:
                                if USPTOSanitizer.is_US_state(asn_state):
                                    asn_country = "US"
                                else:
                                    asn_country = None
                            except:
                                asn_country = None
                        # Append SQL data into dictionary to be written later
                        processed_assignee.append({
                            "table_name": "uspto.ASSIGNEE_G",
                            "GrantID": document_id,
                            "Position": position,
                            "OrgName": asn_orgname,
                            "Role": asn_role,
                            "City": asn_city,
                            "State": asn_state,
                            "Country": asn_country,
                            "FileName": args_array['file_name']
                        })
                        # Increment the position placement
                        position += 1

            # Collect agent data
            for B740 in B700.findall('B740'):
                # Reset position for agents
                position = 1
                for B741 in B740.findall('B741'):
                    for x in B741.findall('PARTY-US'):
                        try:
                            agent_orgname = USPTOSanitizer.return_element_text(x.find('NAM').find("ONM"))[:300]
                        except:
                            agent_orgname = None
                        # Agent name (FNM is the given name, SNM the surname)
                        try:
                            agent_first_name = USPTOSanitizer.return_element_text(x.find('NAM').find('FNM'))[:100]
                        except:
                            agent_first_name = None
                        try:
                            agent_last_name = USPTOSanitizer.return_element_text(x.find('NAM').find('SNM'))[:100]
                        except:
                            agent_last_name = None
                        # Attorney Address information
                        try:
                            agent_city = USPTOSanitizer.return_element_text(x.find("ADR").find('CITY'))[:100]
                        except:
                            agent_city = None
                        try:
                            agent_state = USPTOSanitizer.return_element_text(x.find("ADR").find('STATE'))[:30]
                        except:
                            agent_state = None
                        # Agent country
                        try:
                            agent_country = USPTOSanitizer.return_element_text(x.find("ADR").find('CTRY'))[:3]
                        except:
                            try:
                                if USPTOSanitizer.is_US_state(agent_state):
                                    agent_country = "US"
                                else:
                                    agent_country = None
                            except:
                                agent_country = None
                        # Append SQL data into dictionary to be written later
                        processed_agent.append({
                            "table_name": "uspto.AGENT_G",
                            "GrantID": document_id,
                            "Position": position,
                            "OrgName": agent_orgname,
                            "LastName": agent_last_name,
                            "FirstName": agent_first_name,
                            "Country": agent_country,
                            "FileName": args_array['file_name']
                        })
                        position += 1

            # Collect examiner data
            for B745 in B700.findall('B745'):
                position = 1
                # Primary Examiner
                for B746 in B745.findall('B746'):
                    for x in B746.findall('PARTY-US'):
                        try:
                            examiner_last_name = USPTOSanitizer.return_element_text(x.find('NAM').find('SNM'))[:50]
                        except:
                            examiner_last_name = None
                        try:
                            examiner_first_name = USPTOSanitizer.return_element_text(x.find('NAM').find('FNM'))[:50]
                        except:
                            examiner_first_name = None
                        # TODO: find out if B748US is the department
                        examiner_department = None
                        # Append SQL data into dictionary to be written later
                        processed_examiner.append({
                            "table_name": "uspto.EXAMINER_G",
                            "GrantID": document_id,
                            "Position": position,
                            "LastName": examiner_last_name,
                            "FirstName": examiner_first_name,
                            "Department": examiner_department,
                            "FileName": args_array['file_name']
                        })
                        position += 1
                # Assistant Examiner
                for B747 in B745.findall('B747'):
                    for x in B747.findall('PARTY-US'):
                        try:
                            examiner_last_name = USPTOSanitizer.return_element_text(x.find('NAM').find('SNM'))[:50]
                        except:
                            examiner_last_name = None
                        try:
                            examiner_first_name = USPTOSanitizer.return_element_text(x.find('NAM').find('FNM'))[:50]
                        except:
                            examiner_first_name = None
                        # TODO: find out if B748US is the department
                        examiner_department = None
                        # Append SQL data into dictionary to be written later
                        processed_examiner.append({
                            "table_name": "uspto.EXAMINER_G",
                            "GrantID": document_id,
                            "Position": position,
                            "LastName": examiner_last_name,
                            "FirstName": examiner_first_name,
                            "Department": examiner_department,
                            "FileName": args_array['file_name']
                        })
                        position += 1

        # Collect foreign priority data
        position = 1
        for B300 in r.findall('B300'):
            # Country
            try:
                pc_country = USPTOSanitizer.return_element_text(B300.find('B330').find('CTRY'))[:5]
            except:
                pc_country = None
            # Priority filing date
            try:
                pc_date = USPTOSanitizer.return_formatted_date(
                    USPTOSanitizer.return_element_text(B300.find('B320').find('DATE'))[:45],
                    args_array, document_id)
            except:
                pc_date = None
            # Priority document number
            try:
                pc_doc_num = USPTOSanitizer.return_element_text(B300.find('B310').find('DNUM'))[:45]
            except:
                pc_doc_num = None
            # Set the fields that are not in gXML2
            pc_kind = None
            # Append SQL data into dictionary to be written later
            processed_foreignpriority.append({
                "table_name": "uspto.FOREIGNPRIORITY_G",
                "GrantID": document_id,
                "Position": position,
                "Kind": pc_kind,
                "Country": pc_country,
                "DocumentID": pc_doc_num,
                "PriorityDate": pc_date,
                "FileName": args_array['file_name']
            })
            #print(processed_foreignpriority)
            # Increment Position
            position += 1

    # Collect Abstract from data
    try:
        abstr = patent_root.find('SDOAB')
        abstract = USPTOSanitizer.return_element_text(abstr).strip()
        #print(abstract)
    except:
        abstract = None

    # Collect claims from data
    try:
        cl = patent_root.find('SDOCL')
        claims = USPTOSanitizer.return_element_text(cl)
        #print(claims)
    except:
        traceback.print_exc()
        claims = None

    # Append SQL data into dictionary to be written later
    processed_grant.append({
        "table_name": "uspto.GRANT",
        "GrantID": document_id,
        "Title": title,
        "IssueDate": pub_date,
        "Kind": kind,
        "GrantLength": grant_length,
        "USSeriesCode": series_code,
        "Abstract": abstract,
        "ClaimsNum": claims_num,
        "DrawingsNum": number_of_drawings,
        "FiguresNum": number_of_figures,
        "ApplicationID": app_no,
        "Claims": claims,
        "FileDate": app_date,
        "AppType": app_type,
        "FileName": args_array['file_name']
    })

    # Return a dictionary of the processed_ data arrays
    return {
        "processed_grant": processed_grant,
        "processed_applicant": processed_applicant,
        "processed_examiner": processed_examiner,
        "processed_assignee": processed_assignee,
        "processed_agent": processed_agent,
        "processed_inventor": processed_inventor,
        "processed_usclass": processed_usclass,
        "processed_intclass": processed_intclass,
        "processed_gracit": processed_gracit,
        "processed_forpatcit": processed_forpatcit,
        "processed_nonpatcit": processed_nonpatcit,
        "processed_foreignpriority": processed_foreignpriority
    }