def tuple_clean(query_output): """ Cleans, formats, outputs the query data. Collects summary statistics per country: number of records, number of nonblank address lines and average number of coauthors and ipc codes per country. Args: query_output: tuple of unformated person_appln tuples Returns: Files of cleaned person_appln rows written out by country. File of summary statistics written out one row per country. """ auth_patent_n = len(query_output) addresses_n = 0 coauths = list() ipc = list() name_clean_time = time.time() names = [q[2] for q in query_output] names = psCleanup.name_clean(names, psCleanup.name_address_dict_list) names_ids = [psCleanup.get_legal_ids(n, psCleanup.legal_regex) for n in names] addresses = [q[3] for q in query_output] addresses = psCleanup.name_clean(addresses, psCleanup.name_address_dict_list) coauthors = [q[5] for q in query_output] coauthors = psCleanup.name_clean(coauthors, psCleanup.coauth_dict_list) name_clean_finish = time.time() print 'Cleaning time per name + address' print (name_clean_finish - name_clean_time) / float(len(names)) for idx, record in enumerate(query_output): clean_time_start = time.time() ## Unpack the tuple appln_id, person_id, person_name, person_address, person_ctry_code, \ coauth, ipc_codes = record ## Separate out the authors and ipcs for cleaning coauthors_split = coauthors[idx].split('**') ipc_split = ipc_codes.split('**') ## Drop the co-author that is this author clean_coauthors = [name for name in coauthors_split if name != person_name] ## Generate some summary statistics addresses_n += len(person_address) > 0 coauths.append(len(clean_coauthors)) ipc.append(len(ipc_split)) appln_id = str(appln_id) person_id = str(person_id) ## Clean the person name, then break out the ## legal identifiers preclean_time = time.time() ## print preclean_time - clean_time_start # raw_name = psCleanup.name_clean([person_name])[0] clean_name, firm_legal_ids = names_ids[idx] # intermediate_clean_time = time.time() # print intermediate_clean_time - clean_time_start clean_ipcs = psCleanup.ipc_clean(ipc_split) # intermediate_clean_time_2 = time.time() # print intermediate_clean_time_2 - intermediate_clean_time coauthors_final = psCleanup.get_max(clean_coauthors) ipc_codes_final = psCleanup.get_max(clean_ipcs) legal_ids_final = psCleanup.get_max([firm_legal_ids]) clean_time_end = time.time() print appln_id, person_id, clean_name, legal_ids_final, addresses[idx], person_ctry_code, coauthors_final, ipc_codes_final print 'Record clean time:' print clean_time_end - clean_time_start # filename = outpathname + record[4]+'_out' # with open(filename, 'a') as tabfile: # cleanwriter = csv.writer(tabfile, delimiter ='\t') # cleanwriter.writerow(appln_id, # person_id, # clean_name, # addresses[idx], # legal_ids_final, # person_ctry_code, # coauthors_final, # ipc_codes_final, # year # ) # coauth_mean = numpy.mean(coauths) # ipc_mean = numpy.mean(ipc) # with open(outpathname+'summary_stats', 'a') as csvfile: # statswriter = csv.writer(csvfile) # statswriter.writerow([year, auth_patent_n, addresses_n, coauth_mean, ipc_mean]) return None
] name_output['person_name'], name_output['firm_legal_id'] = zip(*names_ids) name_output['person_address'] = psCleanup.name_clean( name_output['person_address'], psCleanup.name_address_dict_list) print time.strftime('%c', time.localtime()) print 'Names clean' ## ID the coauthors and join coauthor_list = [] for appln_id, person_id in zip(name_output['appln_id'], name_output['person_id']): coauthors = name_output['person_name'][ (name_output['appln_id'] == appln_id) & (name_output['person_id'] != person_id)] coauthor_list.append(psCleanup.get_max(coauthors)) name_output['coauthors'] = coauthor_list print time.strftime('%c', time.localtime()) print 'Coauthors aggregated' ipc_list = [ ipc_output.xs(appln_id) if appln_id in ipc_output.index else '' for appln_id in name_output['appln_id'] ] ## Clean and join the IPC codes ipc_split = [] for ipc in ipc_list: if len(ipc) > 0: ipc_split.append(ipc[0].split('**')) else:
names_ids = [psCleanup.get_legal_ids(n, psCleanup.legal_regex) for n in names] name_output['person_name'], name_output['firm_legal_id'] = zip(*names_ids) name_output['person_address'] = psCleanup.name_clean(name_output['person_address'], psCleanup.name_address_dict_list ) print time.strftime('%c', time.localtime()) print 'Names clean' ## ID the coauthors and join coauthor_list = [] for appln_id, person_id in zip(name_output['appln_id'], name_output['person_id']): coauthors = name_output['person_name'][(name_output['appln_id'] == appln_id) & (name_output['person_id'] != person_id) ] coauthor_list.append(psCleanup.get_max(coauthors)) name_output['coauthors'] = coauthor_list print time.strftime('%c', time.localtime()) print 'Coauthors aggregated' ipc_list = [ipc_output.xs(appln_id) if appln_id in ipc_output.index else '' for appln_id in name_output['appln_id'] ] ## Clean and join the IPC codes ipc_split = [] for ipc in ipc_list: if len(ipc) > 0: ipc_split.append(ipc[0].split('**')) else: