def generate_brand_counts_csv():
    try:
        stock_master = file_utils.read_csv("combined_stock_master_withbrands.csv")
        brands = count_field(stock_master, "Brand")
        file_utils.save_csv("brand_counts.csv", brands, fieldnames=["Brand", "Count"])
    except FileNotFoundError:
        print("Warning: files brand_counts.csv and/or combined_stock_master_withbrands.csv were not found. Brand data will not be used.")
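#count_field is not defined in this snippet; below is a minimal, hypothetical sketch,
#assuming it tallies the values of one column across the row dicts returned by file_utils.read_csv
from collections import Counter

def count_field(rows, field):
    counts = Counter(row[field] for row in rows if row.get(field))
    #one output row per distinct value, matching the ["Brand", "Count"] fieldnames used above
    return [{field: value, "Count": count} for value, count in counts.most_common()]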
def generate_top_category_string_csv():
    tcs = file_utils.top_category_names()
    rows = []
    for s in tcs:
        print(s)
        row = {"Top Category Name": s, "Top Category String": top_category_to_string(s)}
        rows.append(row)
    file_utils.save_csv("top_category_strings.csv", rows)
def generate_top_category_files(column_name):
    file_utils.mkdir("top_category_files")
    rows = file_utils.read_csv('unspsc_codes_v3.csv')
    tcs = {}
    for row in rows:
        if row[column_name] not in tcs:
            tcs[row[column_name]] = []
        tcs[row[column_name]].append(row)
    for tc in tcs:
        filename = "top_category_files/" + tc + ".csv"
        print("Saving " + filename)
        file_utils.save_csv(filename, tcs[tc])
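#hypothetical usage: split the UNSPSC code list into one CSV per top-level category;
#"Segment Name" is an assumed column name in unspsc_codes_v3.csv and may differ in the actual file
generate_top_category_files("Segment Name")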
def save_groundtruth(post, comments, outfile):
    print("Saving groundtruth as", outfile + "_groundtruth.csv")

    #convert ground_truth from given format to eval format
    truth_events = []
    #include post
    truth_events.append({
        'rootID': "t3_" + post['id_h'],
        'nodeID': "t3_" + post['id_h'],
        'parentID': "t3_" + post['id_h']
    })
    #and all comments, sorted by time
    for comment in sorted(comments.values(), key=lambda k: k['created_utc']):
        truth_events.append({
            'rootID': comment['link_id_h'],
            'nodeID': "t1_" + comment['id_h'],
            'parentID': comment['parent_id_h']
        })

    #save ground-truth of this cascade
    file_utils.save_csv(truth_events, outfile + "_groundtruth.csv", fields=['rootID', 'nodeID', 'parentID'])
#for now, only output if doing a single post
if batch == False:
    #save groundtruth cascade to csv
    functions_paper_model.save_groundtruth(sim_post, post_comments, outfile)

    #save sim results to json - all simulated events plus some simulation parameters
    functions_paper_model.save_sim_json(group, sim_post_id, random_post, time_observed, min_node_quality, max_nodes, estimate_initial_params, sim_events, outfile)

    #save sim results to second output file - csv, one event per row, columns 'rootID', 'nodeID', and 'parentID' for now
    print("Saving results to", outfile + ".csv...")
    file_utils.save_csv(sim_events, outfile + ".csv", fields=['rootID', 'nodeID', 'parentID'])
    print("")

#EVAL

#compute tree edit distance between ground-truth and simulated cascades
dist, update_count, update_time, insert_count, insert_time, remove_count, remove_time, match_count = functions_paper_model.eval_trees(sim_tree, sim_post, post_comments)

if batch == False:
    print("Tree edit distance:", dist)
    print("   update:", update_count, update_time)
    print("   insert:", insert_count, insert_time)
    print("   remove:", remove_count, remove_time)
    print("   match:", match_count)
parser.add_argument("-s", "--skip_preprocessing", help="If set, skip preprocessing steps. This will slow down the processing.", action="store_true") args = parser.parse_args() stock_master = file_utils.read_csv(args.filename, add_ids=args.add_ids) level = args.level top_categories_to_check_count = args.num_to_check output = args.output jac = args.jaccard topn = args.matches parallel = not args.no_parallel skip_preprocessing = args.skip_preprocessing stime = time.time() if not output: stock_master = pandas.DataFrame(stock_master) df = add_commodities_to_dataframe(stock_master) print(df) else: rows = add_commodities_to_stocks(stock_master, level+" Name", top_categories_to_check_count, jac, topn, parallel, skip_preprocessing) try: file_utils.save_csv(output, rows, fieldnames=FIELDNAMES) except ValueError: print("Warning: row dictionaries contain keys not in fieldnames. Ignoring fieldnames...") file_utils.save_csv(output, rows) etime = time.time() ttime = etime-stime print('Time = ', ttime, 's')
    old_rows = file_utils.read_csv(output_file)
else:
    old_rows = []

ndf = pandas.DataFrame(sites_rows)
odf = pandas.DataFrame(old_rows)
all_columns = ndf.columns.union(odf.columns)
ndf = ndf.reindex(columns=all_columns, fill_value="-1")
odf = odf.reindex(columns=all_columns, fill_value="-1")
df = pandas.concat([ndf, odf]).reset_index(drop=True)

if output_file:
    matches_df = match_sites_dataframe(df, matches_json=matches_json, top_n=top_n)
    matches_df = matches_df.sort_values(by=["Stock & Site", "Match Stock & Site"])
    result_rows = matches_df.to_dict("records")
    file_utils.save_csv(output_file, result_rows, fieldnames=OUTPUT_FIELDNAMES)
else:
    matches_df = match_sites_dataframe(df, top_n=top_n)
    with pandas.option_context('display.max_rows', None, 'display.max_columns', None):  #more options can be specified also
        print(matches_df.head(n=10))

etime = time.time()
ttime = etime - stime
print('Time = ', ttime, 's')
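#standalone illustration (not part of the script above) of the column-alignment step:
#the union of both frames' columns is taken, then each frame is reindexed so missing
#columns are filled with "-1" before concatenation; the example rows are made up
import pandas

new = pandas.DataFrame([{"Stock & Site": "A|1", "Score": 0.9}])
old = pandas.DataFrame([{"Stock & Site": "B|2", "Match Stock & Site": "A|1"}])

all_columns = new.columns.union(old.columns)
new = new.reindex(columns=all_columns, fill_value="-1")
old = old.reindex(columns=all_columns, fill_value="-1")
print(pandas.concat([new, old]).reset_index(drop=True))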
#ks-test those distributions - if sim list is non-empty
if len(sim_timestamps) != 0:
    D, p_val = stats.ks_2samp(true_timestamps, sim_timestamps)
else:
    D = None
    p_val = None

#add results to running total
observed_count = len([timestamp for timestamp in true_timestamps if timestamp < time * 60]) if model != "rand_tree" else None
res = {'model': model, 'post': post_id, 'run': run, 'true_count': len(true_timestamps), 'sim_count': len(sim_timestamps), 'observed_count': observed_count, 'time_observed': time, 'D': D, 'p_val': p_val}
ks_test_res.append(res)

#save ks-test results to file
file_utils.save_csv(ks_test_res, ks_test_res_filename % subreddit, ['model', 'post', 'run', 'true_count', 'sim_count', 'observed_count', 'time_observed', 'D', 'p_val'])
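#standalone illustration of the test used above: scipy's two-sample Kolmogorov-Smirnov
#test compares two samples and unpacks into the statistic D and a p-value;
#the timestamp lists here are made up for demonstration only
from scipy import stats

true_timestamps = [1, 3, 7, 12, 30, 55, 90]   #minutes after post
sim_timestamps = [2, 4, 8, 15, 40, 70]

#D is the maximum distance between the two empirical CDFs; a small p_val suggests
#the simulated and true comment-time distributions differ
D, p_val = stats.ks_2samp(true_timestamps, sim_timestamps)
print("D =", D, "p_val =", p_val)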
res = {'model': model, 'time_observed': time, 'D': D, 'p_val': p_val}
ks_test_res.append(res)

#add lifetime list for this time to running list
output_lists.append(all_sim_lifetime[time])
output_fields.append("%s_%dh_sim" % (model, time) if model != "rand_tree" else "%s_%sh_sim" % (model, "all"))

#save all lists to file
file_utils.multi_lists_to_csv(output_lists, output_fields, lifetime_lists_filename)

#save ks-test results to file
file_utils.save_csv(ks_test_res, ks_test_res_filename, ['model', 'time_observed', 'D', 'p_val'])

#this is all the old version, which is done for only the model, per subreddit
#new version (above) aggregates all the subreddits together, and does each model
'''
#process each subreddit
for subreddit in subreddits:
    #load true cascade lifetime data
    data = file_utils.load_csv_pandas(true_stats_filename % subreddit)
    #pull lifetime list
    true_lifetime = list(data['lifetime(minutes)'])

    #add true data to output lists
    output_lists.append(true_lifetime)
    output_fields.append("%s_true" % subreddit)