def generate_brand_counts_csv():
    #count how many times each brand appears in the stock master and save to csv
    try:
        stock_master = file_utils.read_csv("combined_stock_master_withbrands.csv")
        brands = count_field(stock_master, "Brand")
        file_utils.save_csv("brand_counts.csv", brands, fieldnames=["Brand", "Count"])
    except FileNotFoundError:
        print("Warning: combined_stock_master_withbrands.csv was not found. Brand data will not be used.")
def generate_top_category_string_csv():
    #map each top category name to its search string and save to csv
    tcs = file_utils.top_category_names()
    rows = []
    for s in tcs:
        print(s)
        row = {"Top Category Name": s, "Top Category String": top_category_to_string(s)}
        rows.append(row)
    file_utils.save_csv("top_category_strings.csv", rows)
def generate_top_category_files(column_name):
    #split the unspsc codes file into one csv per top category
    file_utils.mkdir("top_category_files")
    rows = file_utils.read_csv('unspsc_codes_v3.csv')
    #group rows by their value in column_name
    tcs = {}
    for row in rows:
        tcs.setdefault(row[column_name], []).append(row)
    for tc in tcs:
        filename = "top_category_files/" + tc + ".csv"
        print("Saving " + filename)
        file_utils.save_csv(filename, tcs[tc])
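count_field is project-local and not shown; a minimal sketch of what it presumably does, tallying one column of the csv rows (the name and return shape are assumptions inferred from the call above):

from collections import Counter

def count_field(rows, field):
    #tally how often each value of `field` appears across the rows
    counts = Counter(row[field] for row in rows if row.get(field))
    #one {field, "Count"} dict per distinct value, most frequent first
    return [{field: value, "Count": count} for value, count in counts.most_common()]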
Example #4
def save_groundtruth(post, comments, outfile):
    print("Saving groundtruth as", outfile + "_groundtruth.csv")

    #convert ground_truth from given format to eval format
    truth_events = []
    #include post
    truth_events.append({
        'rootID': "t3_" + post['id_h'],
        'nodeID': "t3_" + post['id_h'],
        'parentID': "t3_" + post['id_h']
    })
    #and all comments, sorted by time
    for comment in sorted(comments.values(), key=lambda k: k['created_utc']):
        truth_events.append({
            'rootID': comment['link_id_h'],
            'nodeID': "t1_" + comment['id_h'],
            'parentID': comment['parent_id_h']
        })

    #save ground-truth of this cascade
    file_utils.save_csv(truth_events,
                        outfile + "_groundtruth.csv",
                        fields=['rootID', 'nodeID', 'parentID'])
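A hypothetical call, using the Reddit fullname conventions the function relies on (t3_ prefixes submissions, t1_ prefixes comments; the _h fields are presumably hashed ids):

post = {'id_h': 'abc123'}
comments = {
    'c1': {'id_h': 'def456', 'link_id_h': 't3_abc123',
           'parent_id_h': 't3_abc123', 'created_utc': 1500000000},
    'c2': {'id_h': 'ghi789', 'link_id_h': 't3_abc123',
           'parent_id_h': 't1_def456', 'created_utc': 1500000060},
}
save_groundtruth(post, comments, "example_post")
#writes example_post_groundtruth.csv, root event first, comments in time order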
Example #5
    #for now, only output if doing a single post
    if not batch:
        #save groundtruth cascade to csv
        functions_paper_model.save_groundtruth(sim_post, post_comments,
                                               outfile)

        #save sim results to json - all simulated events plus some simulation parameters
        functions_paper_model.save_sim_json(group, sim_post_id, random_post,
                                            time_observed, min_node_quality,
                                            max_nodes, estimate_initial_params,
                                            sim_events, outfile)

        #save sim results to second output file - csv, one event per row, columns 'rootID', 'nodeID', and 'parentID' for now
        print("Saving results to", outfile + ".csv...")
        file_utils.save_csv(sim_events,
                            outfile + ".csv",
                            fields=['rootID', 'nodeID', 'parentID'])
        print("")

    #EVAL

    #compute tree edit distance between ground-truth and simulated cascades
    dist, update_count, update_time, insert_count, insert_time, remove_count, remove_time, match_count = functions_paper_model.eval_trees(
        sim_tree, sim_post, post_comments)
    if not batch:
        print("Tree edit distance:", dist)
        print("   update:", update_count, update_time)
        print("   insert:", insert_count, insert_time)
        print("   remove:", remove_count, remove_time)
        print("   match:", match_count)
Example #6
    parser.add_argument("-s", "--skip_preprocessing", help="If set, skip preprocessing steps. This will slow down the processing.", action="store_true")

    args = parser.parse_args()

    stock_master = file_utils.read_csv(args.filename, add_ids=args.add_ids)
    level = args.level
    top_categories_to_check_count = args.num_to_check
    output = args.output
    jac = args.jaccard
    topn = args.matches
    parallel = not args.no_parallel
    skip_preprocessing = args.skip_preprocessing

    stime = time.time()

    if not output:
        stock_master = pandas.DataFrame(stock_master)
        df = add_commodities_to_dataframe(stock_master)
        print(df)
    else:
        rows = add_commodities_to_stocks(stock_master, level + " Name", top_categories_to_check_count, jac, topn, parallel, skip_preprocessing)
        try:
            file_utils.save_csv(output, rows, fieldnames=FIELDNAMES)
        except ValueError:
            print("Warning: row dictionaries contain keys not in fieldnames. Ignoring fieldnames...")
            file_utils.save_csv(output, rows)

    etime = time.time()
    ttime = etime-stime
    print('Time = ', ttime, 's')
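The ValueError fallback mirrors how csv.DictWriter behaves underneath: with the default extrasaction="raise", rows holding keys missing from fieldnames raise ValueError (file_utils.save_csv is presumably a thin wrapper). A self-contained illustration:

import csv
import io

rows = [{"a": 1, "b": 2, "extra": 3}]
buf = io.StringIO()
writer = csv.DictWriter(buf, fieldnames=["a", "b"])  #default extrasaction="raise"
writer.writeheader()
try:
    writer.writerows(rows)
except ValueError:
    #fall back to writing every key the rows actually contain
    buf = io.StringIO()
    writer = csv.DictWriter(buf, fieldnames=sorted(rows[0]))
    writer.writeheader()
    writer.writerows(rows)
print(buf.getvalue())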
Example #7
    #snippet begins mid-function; assumed guard (requires `import os`): reuse prior output if it exists
    if output_file and os.path.exists(output_file):
        old_rows = file_utils.read_csv(output_file)
    else:
        old_rows = []

    ndf = pandas.DataFrame(sites_rows)
    odf = pandas.DataFrame(old_rows)
    all_columns = ndf.columns.union(odf.columns)
    ndf = ndf.reindex(columns=all_columns, fill_value="-1")
    odf = odf.reindex(columns=all_columns, fill_value="-1")
    df = pandas.concat([ndf, odf]).reset_index(drop=True)

    if output_file:
        matches_df = match_sites_dataframe(df,
                                           matches_json=matches_json,
                                           top_n=top_n)
        matches_df = matches_df.sort_values(
            by=["Stock & Site", "Match Stock & Site"])
        result_rows = matches_df.to_dict("records")
        file_utils.save_csv(output_file,
                            result_rows,
                            fieldnames=OUTPUT_FIELDNAMES)
    else:
        matches_df = match_sites_dataframe(df, top_n=top_n)
    with pandas.option_context('display.max_rows', None, 'display.max_columns',
                               None):  # more options can be specified also
        print(matches_df.head(n=10))

    etime = time.time()
    ttime = etime - stime
    print('Time = ', ttime, 's')
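The reindex-to-a-union step above is what keeps the concat free of NaN holes; a tiny standalone demonstration (column names here are illustrative):

import pandas

ndf = pandas.DataFrame([{"Stock & Site": "AAA|X", "Brand": "Acme"}])
odf = pandas.DataFrame([{"Stock & Site": "BBB|Y", "Score": "0.9"}])
cols = ndf.columns.union(odf.columns)
#columns missing from either frame are created and filled with "-1"
df = pandas.concat([ndf.reindex(columns=cols, fill_value="-1"),
                    odf.reindex(columns=cols, fill_value="-1")]).reset_index(drop=True)
print(df)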
Example #8
                #ks-test those distributions - if sim list is non-empty
                if len(sim_timestamps) != 0:
                    D, p_val = stats.ks_2samp(true_timestamps, sim_timestamps)
                else:
                    D = None
                    p_val = None

                #add results to running total
                #count true comments within the first `time` hours (timestamps
                #are in minutes); the rand_tree baseline has no observation window
                if model != "rand_tree":
                    observed_count = len([t for t in true_timestamps if t < time * 60])
                else:
                    observed_count = None
                res = {
                    'model': model,
                    'post': post_id,
                    'run': run,
                    'true_count': len(true_timestamps),
                    "sim_count": len(sim_timestamps),
                    'observed_count': observed_count,
                    'time_observed': time,
                    'D': D,
                    'p_val': p_val
                }
                ks_test_res.append(res)

#save ks-test results to file
file_utils.save_csv(ks_test_res, ks_test_res_filename % subreddit, [
    'model', 'post', 'run', 'true_count', 'sim_count', 'observed_count',
    'time_observed', 'D', 'p_val'
])
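stats here is scipy.stats; ks_2samp compares the empirical distributions of true and simulated comment times. A minimal standalone example with made-up timestamps:

from scipy import stats

true_timestamps = [1, 3, 7, 12, 30, 55]   #minutes after the post
sim_timestamps = [2, 4, 8, 15, 28, 60]

#D is the largest gap between the two empirical CDFs; a large p_val means
#the simulated timing distribution is consistent with the observed one
D, p_val = stats.ks_2samp(true_timestamps, sim_timestamps)
print(D, p_val)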
Example #9
        res = {'model': model, 'time_observed': time, 'D': D, 'p_val': p_val}
        ks_test_res.append(res)

        #add lifetime list for this time to running list
        output_lists.append(all_sim_lifetime[time])
        if model != "rand_tree":
            output_fields.append("%s_%dh_sim" % (model, time))
        else:
            output_fields.append("%s_%sh_sim" % (model, "all"))

#save all lists to file
file_utils.multi_lists_to_csv(output_lists, output_fields,
                              lifetime_lists_filename)

#save ks-test results to file
file_utils.save_csv(ks_test_res, ks_test_res_filename,
                    ['model', 'time_observed', 'D', 'p_val'])

#below is the old version, which ran only one model per subreddit
#the new version (above) aggregates all the subreddits together and runs each model
'''
#process each subreddit
for subreddit in subreddits:
	#load true cascade lifetime data
	data = file_utils.load_csv_pandas(true_stats_filename % subreddit)
	#pull lifetime list
	true_lifetime = list(data['lifetime(minutes)'])

	#add true data to output lists
	output_lists.append(true_lifetime)
	output_fields.append("%s_true" % subreddit)