    cat_counts['year'] = patent_year
    df_counts = pd.DataFrame(cat_counts)
    grouped_counts = df_counts.groupby('year')
    summed_counts = grouped_counts.agg(sum)
    summed_counts['country'] = country
    return summed_counts
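
## For reference: the aggregation above turns per-patent category counts
## into per-year totals. With toy, made-up inputs:
##   pd.DataFrame({'SOLAR': [1, 0, 2], 'WIND': [0, 1, 1],
##                 'year': [1990, 1990, 1991]}).groupby('year').agg(sum)
## yields one row per year with each category column summed.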

## List the per-country .tsv data files
country_files = os.listdir('./data')
country_files = [f for f in country_files if f.endswith('.tsv')]


## Read in and group the green IPC codes per the WIPO definition
green_ipcs = pd.read_csv('./data/ipc_green_inventory_tags_8dig.csv')

## Clean the IPC codes to match those in the PATSTAT output
green_ipcs['ipc'] = psCleanup.ipc_clean(green_ipcs['ipc'])

## Categorize the IPC codes at their top (l1) level
green_energy_cats = {}
for idx, d in enumerate(green_ipcs.l1):
    if d in green_energy_cats:
        green_energy_cats[d].append(green_ipcs.ipc[idx])
    else:
        green_energy_cats[d] = [green_ipcs.ipc[idx]]
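## green_energy_cats now maps each top-level WIPO label to its list of
## IPC codes, e.g. (illustrative labels and codes only):
##   {'SOLAR ENERGY': ['H01L  31/04', ...], 'WIND ENERGY': ['F03D  11/00', ...]}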

## Translate the IPC codes into regexes for searching
cat_regex = psCleanup.make_regex(green_energy_cats)
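
## make_regex's internals are not shown in this excerpt; a minimal stand-in
## with the same apparent contract (category -> compiled pattern matching any
## of that category's IPC codes) might look like this sketch:
import re

def _make_regex_sketch(cat_dict):
    ## One alternation per category; escape each code, since IPC strings
    ## contain '/' and spaces.
    return dict(
        (cat, re.compile('|'.join(re.escape(c) for c in codes)))
        for cat, codes in cat_dict.items()
    )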

## Clean the data
Example #3
    print time.strftime('%c', time.localtime())
    print 'Coauthors aggregated'


    ipc_list = [
        ipc_output.xs(appln_id) if appln_id in ipc_output.index else ''
        for appln_id in name_output['appln_id']
    ]
    ## Clean and join the IPC codes
    ipc_split = []
    for ipc in ipc_list:
        if len(ipc) > 0:
            ipc_split.append(ipc[0].split('**'))
        else:
            ipc_split.append('')

    clean_ipc = [psCleanup.ipc_clean(ipc) for ipc in ipc_split]
    name_output['ipc_codes'] = [psCleanup.get_max(ipc) for ipc in clean_ipc]

    ## Write out files by country-year
    name_clean_finish = time.time()
    print 'Cleaning time per name + address + ipc code'
    print (name_clean_finish - name_clean_time) / float(len(name_output))

    end_time = time.time()
    elapsed_time = end_time - start_time
    n_records = str(len(name_output))
    print 'Time elapsed for ' + year + ' and ' + n_records + ' records: ' + str(
        numpy.round(elapsed_time, 0))

    grouped_country = name_output.groupby('person_ctry_code')

    for country, group in grouped_country:
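        ## Hypothetical completion; the excerpt is truncated here. The
        ## 'Write out files by country-year' comment above suggests one
        ## tab-separated file per country group; the path and naming
        ## scheme below are assumptions.
        group.to_csv('./output/%s_%s.tsv' % (country, year),
                     sep='\t', index=False)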
Example #4
def tuple_clean(query_output):
    """
    Cleans, formats, outputs the query data.
    Collects summary statistics per country: number of records,
        number of nonblank address lines and average number of coauthors
        and ipc codes per country.
    
    Args:
        query_output: tuple of unformated person_appln tuples
    Returns:
        Files of cleaned person_appln rows written out by country.
        File of summary statistics written out one row per country.
    """

    auth_patent_n = len(query_output)
    addresses_n = 0
    coauths = list()
    ipc = list()

    name_clean_time = time.time()
    names = [q[2] for q in query_output]
    names = psCleanup.name_clean(names, psCleanup.name_address_dict_list)
    names_ids = [psCleanup.get_legal_ids(n, psCleanup.legal_regex) for n in names]
    
    addresses = [q[3] for q in query_output]
    addresses = psCleanup.name_clean(addresses, psCleanup.name_address_dict_list)

    coauthors = [q[5] for q in query_output]
    coauthors = psCleanup.name_clean(coauthors, psCleanup.coauth_dict_list)

    name_clean_finish = time.time()

    print 'Cleaning time per name + address'
    print (name_clean_finish - name_clean_time) / float(len(names))


    for idx, record in enumerate(query_output):

        clean_time_start = time.time()
        ## Unpack the tuple
        appln_id, person_id, person_name, person_address, person_ctry_code, \
                  coauth, ipc_codes = record
        
        ## Separate out the authors and ipcs for cleaning
        coauthors_split = coauthors[idx].split('**')
        ipc_split = ipc_codes.split('**')

        ## Drop the co-author that is this author
        clean_coauthors = [name for name in coauthors_split if name != person_name]

        ## Generate some summary statistics
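        ## (len(person_address) > 0 is a bool, so summing it counts the
        ##  records with a nonblank address)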
        addresses_n += len(person_address) > 0
        coauths.append(len(clean_coauthors))
        ipc.append(len(ipc_split))

        
        appln_id = str(appln_id)
        person_id = str(person_id)

        ## Clean the person name, then break out the legal identifiers
        clean_name, firm_legal_ids = names_ids[idx]
        clean_ipcs = psCleanup.ipc_clean(ipc_split)
        
        coauthors_final = psCleanup.get_max(clean_coauthors)
        ipc_codes_final = psCleanup.get_max(clean_ipcs)
        legal_ids_final = psCleanup.get_max([firm_legal_ids])
        clean_time_end = time.time()
        
        print appln_id, person_id, clean_name, legal_ids_final, \
            addresses[idx], person_ctry_code, coauthors_final, ipc_codes_final

        
        print 'Record clean time:'
        print clean_time_end - clean_time_start
        
    #     filename = outpathname + record[4] + '_out'

    #     with open(filename, 'a') as tabfile:
    #         cleanwriter = csv.writer(tabfile, delimiter='\t')
    #         cleanwriter.writerow([appln_id,
    #                               person_id,
    #                               clean_name,
    #                               addresses[idx],
    #                               legal_ids_final,
    #                               person_ctry_code,
    #                               coauthors_final,
    #                               ipc_codes_final,
    #                               year
    #                               ])

    # coauth_mean = numpy.mean(coauths) 
    # ipc_mean = numpy.mean(ipc)

    # with open(outpathname+'summary_stats', 'a') as csvfile:
    #     statswriter = csv.writer(csvfile)
    #     statswriter.writerow([year, auth_patent_n, addresses_n, coauth_mean, ipc_mean])       

    return None
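
## For reference, a minimal, self-contained illustration of the record shape
## tuple_clean expects (field order taken from the tuple unpacking above);
## all values below are made up:
_example_record = (
    12345,                       # appln_id
    67890,                       # person_id
    'ACME SOLAR GMBH',           # person_name
    '1 MAIN ST BERLIN',          # person_address
    'DE',                        # person_ctry_code
    'ACME SOLAR GMBH**J SMITH',  # coauth, '**'-delimited
    'H01L  31/18**F03D  11/00',  # ipc_codes, '**'-delimited
)
## tuple_clean([_example_record]) would clean and print this single record.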