# Assumed third-party imports (the module's original import block is not shown);
# the XML parser aliases `et`/`etree` and the project helpers fh, th, xmlh and
# the *_parser functions are imported elsewhere in the original source.
import re
from io import BytesIO, TextIOWrapper

import pandas as pd
from tqdm import tqdm


def clean_patents(source_path, destination_path, mixed_destination_path, csv_eu_path):
    """ clean_patents """
    patent_data = pd.DataFrame(columns=['filename', 'applicant', 'citations'])
    csv_dataframe = xmlh.load_index_csv(csv_eu_path)
    for index, path in enumerate(source_path):
        files = fh.get_list_files(path, 'xml')
        for i in tqdm(range(len(files))):
            path_filename = files[i]
            try:
                # print("file: ", path_filename)
                parsed_xml = et.parse(path_filename)
                patent_document = parsed_xml.getroot()
                # root attributes: country, date, doc_n, dtd_ver, file, id, kind, lang, status
                attributes = patent_document.attrib
                lang = attributes['lang'].upper()
                kind = attributes['kind'].upper()
                if lang == 'EN':
                    filename = attributes['file']
                    region = filename[:2].upper()
                    if region == 'EP':
                        eu_parser(patent_document, attributes, filename, kind,
                                  destination_path[index], mixed_destination_path[index],
                                  csv_dataframe, patent_data)
                    elif region == 'US':
                        us_parser(patent_document, filename, kind, destination_path[index])
            except Exception:
                print("WARNING! Check out the patent:", path_filename)
                continue
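# Hedged usage sketch (not from the original module): clean_patents expects the
# three path arguments to be parallel lists, one entry per corpus folder, plus
# the path of the EU index csv. All folder names below are placeholders.
#
# clean_patents(source_path=['data/raw/2006/', 'data/raw/2007/'],
#               destination_path=['data/clean/2006/', 'data/clean/2007/'],
#               mixed_destination_path=['data/mixed/2006/', 'data/mixed/2007/'],
#               csv_eu_path='data/index/eu_index.csv')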
def load_path_data(source_path, destination_path):
    print("searching for patents ...")
    patents = []
    for index, path in enumerate(source_path):
        patent_data = pd.DataFrame(columns=['file_path'])
        for file_path in fh.get_list_files(path, 'xml'):
            patent_data.loc[patent_data.shape[0] + 1] = [file_path]
        patents.append((patent_data, destination_path[index], path))
    return patents
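# Illustrative call (assumed folder names): each returned tuple pairs a
# DataFrame of xml file paths with its destination and source folders.
#
# for patent_data, destination, source in load_path_data(['data/raw/2006/'],
#                                                        ['data/out/2006/']):
#     print("%s -> %s (%d files)" % (source, destination, len(patent_data)))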
def handle_complete_args(source_path, folder_level):
    source_path = fh.link_paths(source_path, '*')
    source_path = fh.get_list_files(source_path, None)
    source_path = [path + '/' for path in source_path]
    print("source path: %s" % source_path)
    folder_level = int(folder_level)
    print("folder destination level: %s" % folder_level)
    # a trailing '/' was just appended, so '.xml/' means a loose xml file was passed
    if len(source_path) == 0 or source_path[-1].endswith('.xml/'):
        return source_path_warnings()
    return source_path, folder_level
def handle_partial_args(source_path):
    source_path = fh.link_paths(source_path, '*')
    source_path = fh.get_list_files(source_path, None)
    source_path = [path + '/' for path in source_path]
    print("source path: %s" % source_path)
    if len(source_path) == 0 or source_path[-1].endswith('.xml/'):
        return th.source_path_warnings()
    # derive the destination level from the depth of the first source folder
    folder_level = source_path[0].count('/') - 1
    print("folder destination level: %s" % folder_level)
    return source_path, folder_level
def load_data_2(source_path):
    """ load_data_2 """
    print('### reading patents ###')
    data_frame = pd.DataFrame(columns=['abstract', 'claim', 'description', 'classification'])
    classifications_df = pd.DataFrame(columns=['class', 'count'])
    patent_ids = []
    for path in source_path:
        for patent_index, path_filename in enumerate(fh.get_list_files(path, 'txt')):
            patent_ids.append(th.get_patent_id(path_filename))
            with open(path_filename, "r") as file:
                kind = get_txt_text(file, 1).strip()
                if kind == 'A1':
                    classcode = get_txt_text(file, 1).strip()
                    applicant = get_txt_text(file, 1).strip()
                    abstract = get_txt_text(file, 1).strip()
                    citations = get_txt_text(file, 1).strip()
                    fill_dataframe(data_frame, classifications_df, classcode,
                                   abstract, None, None, patent_index)
                elif kind == 'B1':
                    classcode = get_txt_text(file, 1).strip()
                    id_respective_document = get_txt_text(file, 1).strip()
                    # abstract = get_txt_text(file, 1).strip()
                    claim = get_txt_text(file, 1).strip()
                    description = get_txt_text(file, 1).strip()
                    # citations = get_txt_text(file, 1).strip()
                    fill_dataframe(data_frame, classifications_df, classcode,
                                   None, claim, description, patent_index)
                else:
                    if kind == 'A1B1':
                        print("eu_mix_patent")
                    else:
                        print("us_patent")
                    classcode = get_txt_text(file, 1).strip()
                    applicant = get_txt_text(file, 1).strip()
                    abstract = get_txt_text(file, 1).strip()
                    claim = get_txt_text(file, 1).strip()
                    description = get_txt_text(file, 1).strip()
                    citations = get_txt_text(file, 1).strip()
                    fill_dataframe(data_frame, classifications_df, classcode,
                                   abstract, claim, description, patent_index)
    data_frame['id'] = data_frame.index
    classifications_df.sort_values(by=['count'], ascending=False, inplace=True, kind='quicksort')
    return patent_ids, data_frame, classifications_df
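# get_txt_text is defined elsewhere in the project. Judging only from how
# load_data_2 consumes it, a minimal sketch could read the next n lines of the
# open file; this is an assumption about its behavior, not the original code.
#
# def get_txt_text(file, n_lines):
#     return ' '.join(file.readline() for _ in range(n_lines))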
def explore_patents(source_path, destination_path):
    """ explore_patents """
    patent_index = pd.DataFrame(columns=['file', 'country', 'kind', 'date', 'path'])
    for index, path in enumerate(source_path):
        for path_filename in fh.get_list_files(path, 'xml'):
            try:
                with open(path_filename, "rb") as f:
                    patent_file = TextIOWrapper(BytesIO(f.read()))
                # the publication year is taken from the four-digit group in the folder path
                patents = get_patents(patent_file.read(),
                                      int(re.search("([0-9]{4})", path).group()))
                results = [process_patent(patents[i][0], patents[i][1],
                                          destination_path[index], path, patent_index)
                           for i in tqdm(range(len(patents)))]
            except Exception:
                print("WARNING! Check out the file:", path_filename)
                continue
    xmlh.write_index(patent_index, script_key)
def apply_method_for_reading_patents_1_2(data_frame, classifications_df, path):
    return [apply_method_for_reading_patents_2(data_frame, classifications_df, patent_index, path_filename)
            for patent_index, path_filename in enumerate(fh.get_list_files(path, 'txt'))]
def handle_path_patent(data_frame, classifications_df, path):
    return [handle_patent_file(data_frame, classifications_df, path_filename)
            for path_filename in fh.get_list_files(path, 'txt')]
def explore_patents(source_path, destination_path):
    """ explore_patents """
    patent_data = pd.DataFrame(columns=['file', 'country', 'kind', 'date', 'path'])
    for index, path in enumerate(source_path):
        files = fh.get_list_files(path, 'xml')
        for i in tqdm(range(len(files))):
            path_filename = files[i]
            try:
                parsed_xml = etree.parse(path_filename)
                patent_document = parsed_xml.getroot()
                # root attributes: country, date, doc_n, dtd_ver, file, id, kind, lang, status
                attributes = patent_document.attrib
                required = ('lang', 'kind', 'file', 'date-publ', 'country')
                if all(key in attributes for key in required):
                    lang = attributes['lang'].upper()
                    kind = attributes['kind'].upper()
                    file = attributes['file']
                    date = attributes['date-publ']
                    region = attributes['country']
                    country = get_alternative_country(patent_document)
                    classcode = get_alternative_classcode(patent_document)
                    if lang == 'EN' and classcode != "":
                        filename = th.get_eu_filename(path_filename)  # LLDDDDDDDDLLLD.xml
                        if kind == 'A1':  # bibliography - index
                            applicant, abstract, citations = a1_parser(patent_document)
                            # add the A1 to the index only if its abstract is not empty
                            # (and the respective B1 is not already in the dataset)
                            if abstract != "":
                                xmlh.write_eu_a1_xml_patent(destination_path[index], filename, lang,
                                                            kind, classcode, applicant, abstract, citations)
                                filenumber = get_file_number(file)
                                patent_data.loc[patent_data.shape[0] + 1] = [region + filenumber,
                                                                             country, kind, date, path]
                        elif kind == 'B1':  # text - data
                            id_respective_document = get_id_document(region, file, kind)
                            # membership must be tested against the column values:
                            # `in` on a pandas Series checks the index, not the data
                            if id_respective_document[:-4] in patent_data['file'].values:
                                # the A1 already holds the abstract, no need to save it here
                                description, claim, abstract, citations = b1_parser(patent_document, False)
                            else:
                                # no A1 found, so the B1 must also carry the patent information
                                description, claim, abstract, citations = b1_parser(patent_document, True)
                            xmlh.write_eu_b1_xml_patent(destination_path[index], filename, lang, kind,
                                                        id_respective_document, classcode, abstract,
                                                        claim, description, citations)
                            filenumber = get_file_number(file)
                            patent_data.loc[patent_data.shape[0] + 1] = [region + filenumber,
                                                                         country, kind, date, path]
                else:
                    print("WARNING! outlier - no patent document:", path_filename)
            except Exception:
                print("WARNING! Check out the patent:", path_filename)
                continue
    xmlh.write_index(patent_data, script_key)
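# Hedged usage sketch (placeholder folder names): explore_patents walks the
# parallel source/destination lists and finally persists the in-memory index
# through xmlh.write_index.
#
# explore_patents(['data/epo/2006/'], ['data/parsed/2006/'])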