    cat_counts['year'] = patent_year
    df_counts = pd.DataFrame(cat_counts)
    grouped_counts = df_counts.groupby('year')
    summed_counts = grouped_counts.agg(sum)
    summed_counts['country'] = country
    return summed_counts

country_files = os.listdir('./data')
country_files = [f for f in country_files if 'tsv' in f]

## Read in and group the green IPC codes per the WIPO definition
green_ipcs = pd.read_csv('./data/ipc_green_inventory_tags_8dig.csv')

## Clean the IPC codes to match those in the PATSTAT output
ipc_clean = psCleanup.ipc_clean(green_ipcs['ipc'])
green_ipcs['ipc'] = ipc_clean
del ipc_clean

## Categorize the IPC codes at the top level
green_energy_cats = {}
for idx, d in enumerate(green_ipcs.l1):
    if d in green_energy_cats:
        green_energy_cats[d].append(green_ipcs.ipc[idx])
    else:
        green_energy_cats[d] = [green_ipcs.ipc[idx]]

## Translate the IPC codes into regexes for searching
## (an illustrative tagging sketch is included below)
cat_regex = psCleanup.make_regex(green_energy_cats)

## Clean the data
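
## Illustrative sketch only, not part of the original pipeline: assuming
## psCleanup.make_regex returns a mapping from each top-level category to a
## compiled regular expression, a record's IPC string could be tagged with
## its green categories as below. The function name is hypothetical.
def tag_green_categories(ipc_string, category_regexes):
    """Return the top-level green categories whose regex matches ipc_string."""
    return [cat for cat, regex in category_regexes.items()
            if regex.search(ipc_string)]

## Example (hypothetical IPC string):
# tag_green_categories('F03D  11/00', cat_regex)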
print time.strftime('%c', time.localtime())
print 'Coauthors aggregated'

ipc_list = [ipc_output.xs(appln_id) if appln_id in ipc_output.index else ''
            for appln_id in name_output['appln_id']]

## Clean and join the IPC codes
ipc_split = []
for ipc in ipc_list:
    if len(ipc) > 0:
        ipc_split.append(ipc[0].split('**'))
    else:
        ipc_split.append('')

clean_ipc = [psCleanup.ipc_clean(ipc) for ipc in ipc_split]
name_output['ipc_codes'] = [psCleanup.get_max(ipc) for ipc in clean_ipc]

## Write out files by country-year
name_clean_finish = time.time()
print 'Cleaning time per name + address + ipc code'
print (name_clean_finish - name_clean_time) / float(len(name_output))

end_time = time.time()
elapsed_time = end_time - start_time
n_records = str(len(name_output))
print 'Time elapsed for ' + year + ' and ' + n_records + ' records: ' + \
    str(numpy.round(elapsed_time, 0))

grouped_country = name_output.groupby('person_ctry_code')
for country, group in grouped_country:
def tuple_clean(query_output):
    """
    Cleans, formats, and outputs the query data. Collects summary statistics
    per country: number of records, number of non-blank address lines, and
    average number of coauthors and IPC codes per country.

    Args:
        query_output: tuple of unformatted person_appln tuples

    Returns:
        Files of cleaned person_appln rows written out by country.
        File of summary statistics written out, one row per country.
    """
    auth_patent_n = len(query_output)
    addresses_n = 0
    coauths = list()
    ipc = list()

    name_clean_time = time.time()
    names = [q[2] for q in query_output]
    names = psCleanup.name_clean(names, psCleanup.name_address_dict_list)
    names_ids = [psCleanup.get_legal_ids(n, psCleanup.legal_regex) for n in names]
    addresses = [q[3] for q in query_output]
    addresses = psCleanup.name_clean(addresses, psCleanup.name_address_dict_list)
    coauthors = [q[5] for q in query_output]
    coauthors = psCleanup.name_clean(coauthors, psCleanup.coauth_dict_list)
    name_clean_finish = time.time()

    print 'Cleaning time per name + address'
    print (name_clean_finish - name_clean_time) / float(len(names))

    for idx, record in enumerate(query_output):
        clean_time_start = time.time()

        ## Unpack the tuple
        appln_id, person_id, person_name, person_address, person_ctry_code, \
            coauth, ipc_codes = record

        ## Separate out the coauthors and IPCs for cleaning
        coauthors_split = coauthors[idx].split('**')
        ipc_split = ipc_codes.split('**')

        ## Drop the coauthor that is this author
        clean_coauthors = [name for name in coauthors_split if name != person_name]

        ## Generate some summary statistics
        addresses_n += len(person_address) > 0
        coauths.append(len(clean_coauthors))
        ipc.append(len(ipc_split))

        appln_id = str(appln_id)
        person_id = str(person_id)

        ## Clean the person name, then break out the legal identifiers
        preclean_time = time.time()
        ## print preclean_time - clean_time_start
        # raw_name = psCleanup.name_clean([person_name])[0]
        clean_name, firm_legal_ids = names_ids[idx]
        # intermediate_clean_time = time.time()
        # print intermediate_clean_time - clean_time_start
        clean_ipcs = psCleanup.ipc_clean(ipc_split)
        # intermediate_clean_time_2 = time.time()
        # print intermediate_clean_time_2 - intermediate_clean_time
        coauthors_final = psCleanup.get_max(clean_coauthors)
        ipc_codes_final = psCleanup.get_max(clean_ipcs)
        legal_ids_final = psCleanup.get_max([firm_legal_ids])
        clean_time_end = time.time()

        print appln_id, person_id, clean_name, legal_ids_final, \
            addresses[idx], person_ctry_code, coauthors_final, ipc_codes_final
        print 'Record clean time:'
        print clean_time_end - clean_time_start

        # filename = outpathname + record[4] + '_out'
        # with open(filename, 'a') as tabfile:
        #     cleanwriter = csv.writer(tabfile, delimiter='\t')
        #     cleanwriter.writerow([appln_id,
        #                           person_id,
        #                           clean_name,
        #                           addresses[idx],
        #                           legal_ids_final,
        #                           person_ctry_code,
        #                           coauthors_final,
        #                           ipc_codes_final,
        #                           year
        #                           ])

    # coauth_mean = numpy.mean(coauths)
    # ipc_mean = numpy.mean(ipc)
    # with open(outpathname + 'summary_stats', 'a') as csvfile:
    #     statswriter = csv.writer(csvfile)
    #     statswriter.writerow([year, auth_patent_n, addresses_n, coauth_mean, ipc_mean])

    return None
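
## Illustrative call of tuple_clean (hypothetical data, not taken from
## PATSTAT): each row of query_output is expected to unpack as
## (appln_id, person_id, person_name, person_address, person_ctry_code,
## coauthors, ipc_codes), with '**' separating multiple coauthors and
## IPC codes. Uncomment to try it against a local psCleanup setup.
# example_rows = [(1, 10, 'example inventor gmbh', 'some street 1, berlin', 'DE',
#                  'example inventor gmbh**second coauthor',
#                  'F03D  11/00**H01L  31/042')]
# tuple_clean(example_rows)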