def ret_rec(household, rep):
    # print out every 100000 returns?
    # if rep.total_responses % 100000 == 0:
    #     print(rep.total_responses)
    if oo.record_active_summary:
        # add household to the daily summary and running totals of responses
        for key, value in rep.active_summary.items():
            value[str(getattr(household, key))][math.floor(rep.env.now / 24)] += 1
        for key, value in rep.active_totals.items():
            value[str(getattr(household, key))] += 1

    if oo.record_active_paper_summary and not household.digital:
        for key, value in rep.active_paper_summary.items():
            value[str(getattr(household, key))][math.floor(rep.env.now / 24)] += 1
        for key, value in rep.active_paper_totals.items():
            value[str(getattr(household, key))] += 1

    household.return_received = True

    if oo.record_return_received:
        rep.output_data['Return_received'].append(
            oo.generic_output(rep.reps, household.district.district, household.la,
                              household.lsoa, household.digital, household.hh_type,
                              household.hh_id, rep.env.now))

    # currently every return is counted as a response as soon as it is received -
    # this may need to change
    household.responded = True
    rep.total_responses += 1
    household.district.total_responses += 1

    # check size of output data - if over an amount, size or length, write to file?
    if oo.record_responded:
        rep.output_data['Responded'].append(
            oo.generic_output(rep.reps, household.district.district, household.la,
                              household.lsoa, household.digital, household.hh_type,
                              household.hh_id, rep.env.now))

    # checks size of output and writes it to file if too large
    if h.dict_size(rep.output_data) > rep.max_output_file_size:
        h.write_output(rep.output_data, rep.output_path, rep.run)

    yield rep.env.timeout(0)
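# The `h` helpers used above (and in create_districts below) are not shown in
# this snippet. A minimal sketch of the assumed behaviour - a rough size
# estimate of the buffered output and a flush of each output category to its
# own CSV; the names and file layout are assumptions, not the project's code:
import csv
import os
import sys


def dict_size(output_data):
    # rough in-memory size, in bytes, of the buffered output lists
    return sum(sys.getsizeof(rows) for rows in output_data.values())


def write_output(output_data, output_path, run):
    # append each output category to its own CSV file, then clear the buffer
    os.makedirs(output_path, exist_ok=True)
    for category, rows in output_data.items():
        if not rows:
            continue
        file_path = os.path.join(output_path, '%s_%s.csv' % (category, run))
        with open(file_path, 'a', newline='') as f:
            csv.writer(f).writerows(rows)
        rows.clear()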
# imports assumed by this snippet
import cPickle

import joblib  # or: from sklearn.externals import joblib in older scikit-learn

import helper
import user_database


def main():
    users = user_database.UserDatabase()
    test_tweets = helper.extract_tweets(users, max_users=-1, max_tweets=1,
                                        min_tweets=1, test_sample=True)
    print(len(test_tweets))

    # load the n-gram vocabulary built during training
    with open('ngrams.pkl', 'rb') as fid:
        ngrams_vars = cPickle.load(fid)
    ngrams = ngrams_vars["ngrams"]
    useful_ngrams = ngrams_vars["useful_ngrams"]

    # build the test feature matrix from the training vocabulary
    helper.extract_ngrams(test_tweets, ngrams, for_cv_tweets=True)
    test_vars = helper.build_Xy(test_tweets, useful_ngrams)
    X_test = test_vars["X"]
    y_test = test_vars["y"]

    # score the model selected during cross-validation on the held-out test set
    logistic = joblib.load("Results/Log_Testing/Logit_C0.1.joblib.pkl")
    print('Test Logistic Regression score: %f' % logistic.score(X_test, y_test))
    y_test_pred = logistic.predict(X_test)
    helper.write_output("Results/Log_Testing/TestUsers.csv", y_test, y_test_pred,
                        test_tweets, users)
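# helper.build_Xy is not shown in this snippet. A minimal sketch of what it is
# assumed to return: a bag-of-n-grams feature matrix X and a label vector y.
# The tweet attributes used below (ngram_counts, label) are assumptions for
# illustration only.
import numpy as np


def build_Xy(tweets, useful_ngrams):
    ngram_index = {ngram: j for j, ngram in enumerate(useful_ngrams)}
    X = np.zeros((len(tweets), len(useful_ngrams)))
    y = np.zeros(len(tweets))
    for i, tweet in enumerate(tweets):
        for ngram, count in tweet.ngram_counts.items():  # assumed attribute
            if ngram in ngram_index:
                X[i, ngram_index[ngram]] = count
        y[i] = tweet.label  # assumed attribute
    return {"X": X, "y": y}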
# imports assumed by this snippet
import helper
from sklearn.naive_bayes import MultinomialNB


def naive_bayes(X_train, y_train, X_cv, y_cv, train_tweets, cv_tweets, filename):
    # tune the smoothing parameter alpha on the cross-validation set;
    # `users` is expected to be defined at module level
    alphas = [0.01, 0.1, 1, 10, 100]
    best_cv_score = 0.
    best_alpha = 0
    for alpha in alphas:
        print("Doing fit")
        mnb = MultinomialNB(alpha=alpha)
        mnb.fit(X_train, y_train)
        y_cv_pred = mnb.predict(X_cv)
        cv_score = mnb.score(X_cv, y_cv)
        print('CV Multinomial NB score: %(score)f for alpha = %(alpha)f'
              % {"score": cv_score, "alpha": alpha})
        if cv_score > best_cv_score:
            best_cv_score = cv_score
            best_alpha = alpha
        helper.write_output(filename + "_alpha" + str(alpha) + ".csv",
                            y_cv, y_cv_pred, cv_tweets, users)
    # note: best_alpha is tracked but only the best score is returned
    return best_cv_score
def create_districts(self):
    co_number = 0
    list_of_districts = sorted(list(self.input_data['districts'].keys()))
    for distr in list_of_districts:
        # checks size of output and writes it to file if too large
        if h.dict_size(self.output_data) >= self.max_output_file_size:
            h.write_output(self.output_data, self.output_path, self.run)

        self.districts.append(district.District(self, distr))
        try:
            co_number += self.input_data['districts'][distr][
                "census officer"]["standard"]["number"]
        except KeyError as e:
            warning_detail = "no CO for run " + str(self.run) + " in create districts"
            if oo.record_warnings:
                self.output_data['Warnings'].append(
                    oo.warnings(self.reps, e, warning_detail))
            co_number = 0
# imports assumed by this snippet
import helper
import joblib  # or: from sklearn.externals import joblib in older scikit-learn
from sklearn import linear_model


def logistic_regression(X_train, y_train, X_cv, y_cv, train_tweets, cv_tweets,
                        filename):
    # tune the regularisation strength C on the cross-validation set;
    # `users` is expected to be defined at module level
    C_regs = [0.03, 0.1, 0.3, 1.0, 3.0]
    # C_regs = [0.1]
    best_cv_score = 0.
    best_C = 0
    for C_reg in C_regs:
        print("Doing fit")
        logistic = linear_model.LogisticRegression(penalty="l2", C=C_reg)
        logistic.fit(X_train, y_train)
        cv_score = logistic.score(X_cv, y_cv)
        print('Train LogisticRegression score: %f' % logistic.score(X_train, y_train))
        print('CV LogisticRegression score: %(score)f for C_reg = %(C)f'
              % {"score": cv_score, "C": C_reg})
        y_cv_pred = logistic.predict(X_cv)
        if cv_score > best_cv_score:
            best_cv_score = cv_score
            best_C = C_reg
        # predictions and the fitted model are saved for every C, not just the best
        helper.write_output(filename + "_C" + str(C_reg) + ".csv",
                            y_cv, y_cv_pred, cv_tweets, users)
        joblib.dump(logistic, filename + "_C" + str(C_reg) + ".joblib.pkl",
                    compress=9)
    # note: best_C is tracked but only the best score is returned
    return best_cv_score
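# A hedged usage sketch of the two tuning helpers above. The variable names and
# paths follow main() and are assumptions about the surrounding training script:
#
#     train_vars = helper.build_Xy(train_tweets, useful_ngrams)
#     cv_vars = helper.build_Xy(cv_tweets, useful_ngrams)
#     best_nb = naive_bayes(train_vars["X"], train_vars["y"],
#                           cv_vars["X"], cv_vars["y"],
#                           train_tweets, cv_tweets, "Results/NB_Testing/MNB")
#     best_log = logistic_regression(train_vars["X"], train_vars["y"],
#                                    cv_vars["X"], cv_vars["y"],
#                                    train_tweets, cv_tweets,
#                                    "Results/Log_Testing/Logit")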
def start_run(run_input, seeds, out_path):
    max_output_file_size = 100000000

    start_date = dt.date(*map(int, run_input['start_date'].split(',')))
    end_date = dt.date(*map(int, run_input['end_date'].split(',')))
    sim_hours = (end_date - start_date).total_seconds() / 3600
    census_date = dt.date(*map(int, run_input['census_date'].split(',')))
    census_day = (census_date - start_date).days

    # write key dates/info to a csv for later use in post processing
    temp_list = [{'start_date': start_date,
                  'end_date': end_date,
                  'census_date': census_date,
                  'census_day': census_day,
                  'sim_hours': sim_hours}]

    # create the day columns used to hold summary data
    days = int(sim_hours / 24) + 1
    day_cols = []
    for day in range(0, days):
        day_cols.append(day)

    # generate lists of codes for reference from the input file where appropriate
    dig_list = ['False', 'True']
    la_list = []
    lsoa_list = []
    hh_type_list = []

    # cycle through the input data so that only the household types, lsoa and la
    # that actually exist appear in the output
    for d_key, d_value in run_input['districts'].items():
        for hh_key, hh_value in d_value['households'].items():
            if hh_key not in hh_type_list:
                hh_type_list.append(hh_key)
            for la_key, la_value in hh_value['cca_makeup'].items():
                if la_key not in la_list:
                    la_list.append(la_key)
                for lsoa_key, lsoa_value in la_value.items():
                    if lsoa_key not in lsoa_list:
                        lsoa_list.append(lsoa_key)

    """
    passive_summary records a daily summary of returns that occur without any intervention
    passive_totals records a simple total of returns without intervention
    active_summary records all returns, including those due to the interventions
    active_totals records a simple total of all returns
    active_paper_summary records paper returns over time
    active_paper_totals records the total of paper returns
    visit_summary records all visits over time, by the level in the key
    visit_totals records a simple sum of total visits, by the level in the key
    time_summary records time spent on task by the census officers
    time_totals records the sum of total time on task
    paper_summary records a summary of paper given out over time
    paper_totals records the sum of paper given out

    The summaries and totals are at the levels given in the keys.
    """
    passive_summary = {}
    passive_totals = {}
    active_summary = {}
    active_totals = {}
    active_paper_summary = {}
    active_paper_totals = {}
    visit_totals = {}
    visit_summary = {}
    time_totals = {}
    time_summary = {}
    paper_totals = {}
    paper_summary = {}

    if oo.record_passive_summary:
        passive_summary = {'la': {la: [0] * days for la in la_list},
                           'lsoa': {lsoa: [0] * days for lsoa in lsoa_list},
                           'digital': {dig: [0] * days for dig in dig_list},
                           'hh_type': {hh: [0] * days for hh in hh_type_list}}
        passive_totals = {'lsoa': {lsoa: 0 for lsoa in lsoa_list},
                          'la': {la: 0 for la in la_list}}

    if oo.record_active_summary:
        active_summary = {'la': {la: [0] * days for la in la_list},
                          'digital': {dig: [0] * days for dig in dig_list},
                          'hh_type': {hh: [0] * days for hh in hh_type_list}}
        active_totals = {'lsoa': {lsoa: 0 for lsoa in lsoa_list},
                         'la': {la: 0 for la in la_list}}

    if oo.record_active_paper_summary:
        active_paper_summary = {'la': {la: [0] * days for la in la_list},
                                'hh_type': {hh: [0] * days for hh in hh_type_list}}
        active_paper_totals = {'lsoa': {lsoa: 0 for lsoa in lsoa_list},
                               'la': {la: 0 for la in la_list}}

    if oo.record_visit_summary:
        visit_totals = {'la': {la: 0 for la in la_list}}
        visit_summary = {'la': {la: [0] * days for la in la_list},
                         'hh_type': {hh: [0] * days for hh in hh_type_list}}

    if oo.record_time_summary:
        time_totals = {'la': {la: 0 for la in la_list}}
        time_summary = {'la': {la: [0] * days for la in la_list},
                        'lsoa': {lsoa: [0] * days for lsoa in lsoa_list}}

    if oo.record_paper_summary:
        paper_totals = {'la': {la: 0 for la in la_list}}
        paper_summary = {'la': {la: [0] * days for la in la_list},
                         'hh_type': {hh: [0] * days for hh in hh_type_list}}

    # l is assumed to be a module-level lock guarding the shared key info file
    l.acquire()
    if oo.record_key_info:
        if not os.path.isdir(os.path.join(out_path, 'key info')):
            os.mkdir(os.path.join(out_path, 'key info'))
        if not os.path.isfile(os.path.join(out_path, 'key info',
                                           run_input['run_id'] + ".csv")):
            pd.DataFrame(temp_list).to_csv(
                os.path.join(out_path, 'key info', run_input['run_id'] + ".csv"))
        else:
            pd.DataFrame(temp_list).to_csv(
                os.path.join(out_path, 'key info', run_input['run_id'] + ".csv"),
                mode='a', header=False)
    l.release()

    output_data = defaultdict(list)

    rnd = random.Random()
    rnd.seed(str(seeds))

    # define simpy env for the current rep
    env = simpy.Environment()

    # initialise the replication
    initialise.Rep(env, run_input, output_data, passive_summary, passive_totals,
                   active_summary, active_totals, active_paper_summary,
                   active_paper_totals, visit_summary, visit_totals,
                   time_summary, time_totals, paper_summary, paper_totals,
                   rnd, sim_hours, start_date, census_day, out_path,
                   max_output_file_size)

    # and run it
    env.run(until=sim_hours)

    # write the output to csv files
    hp.write_output(output_data, out_path, run_input['run_id'])

    # write summary data to csv files in a defined folder in the main outputs
    summary_path = os.path.join(out_path, 'summary')
    c_run = run_input['run_id']
    c_rep = run_input['rep_id']

    if oo.record_passive_summary:
        hp.output_summary(summary_path, passive_summary, 'passive_summary', c_run, c_rep)
        hp.output_summary(summary_path, passive_totals, 'passive_totals', c_run, c_rep)

    if oo.record_active_summary:
        hp.output_summary(summary_path, active_summary, 'active_summary', c_run, c_rep)
        hp.output_summary(summary_path, active_totals, 'active_totals', c_run, c_rep)

    if oo.record_active_paper_summary:
        hp.output_summary(summary_path, active_paper_summary, 'active_paper_summary', c_run, c_rep)
        hp.output_summary(summary_path, active_paper_totals, 'active_paper_totals', c_run, c_rep)

    if oo.record_visit_summary:
        hp.output_summary(summary_path, visit_summary, 'visit_summary', c_run, c_rep)
        hp.output_summary(summary_path, visit_totals, 'visit_totals', c_run, c_rep)

    if oo.record_time_summary:
        hp.output_summary(summary_path, time_summary, 'time_summary', c_run, c_rep)
        hp.output_summary(summary_path, time_totals, 'time_totals', c_run, c_rep)

    if oo.record_paper_summary:
        hp.output_summary(summary_path, paper_summary, 'paper_summary', c_run, c_rep)
        hp.output_summary(summary_path, paper_totals, 'paper_totals', c_run, c_rep)
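# hp.output_summary is not shown in this snippet. A minimal sketch, assuming the
# summaries are nested dicts of {level: {code: [count per day]}} and the totals
# are {level: {code: count}}, as built in start_run; the CSV layout here is an
# assumption:
import csv
import os


def output_summary(summary_path, summary, name, run, rep):
    # append one row per (level, code), with daily counts or a single total
    os.makedirs(summary_path, exist_ok=True)
    with open(os.path.join(summary_path, name + '.csv'), 'a', newline='') as f:
        writer = csv.writer(f)
        for level, codes in summary.items():
            for code, counts in codes.items():
                if isinstance(counts, list):
                    writer.writerow([run, rep, level, code] + counts)
                else:
                    writer.writerow([run, rep, level, code, counts])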
        desc('state_off_count')).take(top_n_states)
    top_state_off_list = [row['VEH_LIC_STATE_ID'] for row in top_states_with_hig_offnces]
    intd_df2 = intd_df1.where(col('VEH_LIC_STATE_ID').isin(top_state_off_list))

    # rank vehicle makes by distinct crashes within the top offending states
    wspec = Window.orderBy(desc('veh_make_count'))
    intd_df3 = intd_df2.groupBy('VEH_MAKE_ID').agg(
        countDistinct('CRASH_ID').alias('veh_make_count')).orderBy(
        desc('veh_make_count'))
    intd_df4 = intd_df3.withColumn('rank', dense_rank().over(wspec))
    top_veh_make = intd_df4.filter(intd_df4['rank'] <= top_n_veh_make).collect()
    vehicle_list = [row['VEH_MAKE_ID'] for row in top_veh_make]
    return list(enumerate(vehicle_list, start=1))


df_charges, df_damages, df_units, df_endorse, df_prim_person, df_restrict = create_tables(folder_path)

count_crashes_male = num_crashes_gender_accidents(df_prim_person, prop['ANALYSIS_1_VAR_1'],
                                                  prop['ANALYSIS_1_VAR_2'])
helper.write_output(prop, count_crashes_male, 'analysis_1')

count_two_wheelers = num_two_whlrs_bkd_for_crash(df_units, eval(prop['ANALYSIS_2']))
helper.write_output(prop, count_two_wheelers, 'analysis_2')

state = state_with_highest_accidents_gender(df_prim_person, prop['ANALYSIS_3'])
helper.write_output(prop, state, 'analysis_3')

veh_list = veh_make_highest_injury(df_units, prop['ANALYSIS_4_VAR_1'], prop['ANALYSIS_4_VAR_2'])
helper.write_output(prop, veh_list, 'analysis_4')

final_df = top_ethnic_group_veh_bdy_style(df_units, df_prim_person)
helper.write_output(prop, final_df, 'analysis_5')

top_zip_list = top_zip_crashes_alcohol(df_units, df_prim_person, prop['ANALYSIS_6'])
helper.write_output(prop, top_zip_list, 'analysis_6')
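# helper.write_output is not shown here. A minimal sketch, assuming prop maps
# each analysis to an output location (the 'OUTPUT_PATH_' key prefix is a
# hypothetical convention, not the project's actual config):
def write_output(prop, result, analysis_name):
    out_path = prop['OUTPUT_PATH_' + analysis_name.upper()]  # hypothetical key
    if hasattr(result, 'write'):
        # Spark DataFrame results are written out as headed CSV
        result.write.mode('overwrite').option('header', True).csv(out_path)
    else:
        # scalar or list results are written as plain text
        with open(out_path, 'w') as f:
            f.write(str(result))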
def main():
    filename = sys.argv[1]
    data = parse_input(filename)
    num_pizzas, pizzas = longest_subarray_less_k(data["maximum_slices"], data["sizes"])
    write_output(num_pizzas, pizzas)
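# longest_subarray_less_k is not shown in this snippet. The task is to pick a
# subset of pizzas whose slice total is as large as possible without exceeding
# maximum_slices. A minimal subset-sum sketch of that selection, offered as an
# assumption about what the helper does, not its actual implementation:
def pick_pizzas(maximum_slices, sizes):
    # reachable[s] holds one list of pizza indices summing to exactly s slices
    reachable = [None] * (maximum_slices + 1)
    reachable[0] = []
    for i, size in enumerate(sizes):
        # iterate s downwards so each pizza is used at most once
        for s in range(maximum_slices, size - 1, -1):
            if reachable[s] is None and reachable[s - size] is not None:
                reachable[s] = reachable[s - size] + [i]
    best = max(s for s in range(maximum_slices + 1) if reachable[s] is not None)
    return len(reachable[best]), reachable[best]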
==========
Original file size: %s bytes
Minified file size: %s bytes
Removed: %s%% bytes
""" % (input_size, output_size, removed)


if __name__ == "__main__":
    if len(sys.argv) > 1:
        try:
            filename = sys.argv[1]
            if os.path.getsize(filename) == 0:
                print("C'mon, don't bug me! Empty file?!")
                exit(-1)
            input = read_input(filename)
            output = minifyme(input)
            output_filename = "%s.min.js" % sys.argv[1][:-3]
            write_output(output_filename, output)
            print_statistics(input, output)
            print("File %s written." % output_filename)
        except Exception as e:
            print("Something went wrong.")
            print(e)
    else:
        print("Usage: minifyme file.js")
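# minifyme itself is not shown in this snippet. A naive sketch of what a
# minimal version could do (an illustration under stated assumptions, not the
# tool's actual algorithm; it ignores comment markers inside string literals):
import re


def minifyme_sketch(source):
    # strip /* ... */ block comments and // line comments
    source = re.sub(r'/\*.*?\*/', '', source, flags=re.S)
    source = re.sub(r'//[^\n]*', '', source)
    # drop leading/trailing whitespace and blank lines
    lines = (line.strip() for line in source.splitlines())
    return '\n'.join(line for line in lines if line)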
import sys

from helper import write_output

# parse input parameters
input_file = sys.argv[1]
top_10_occupation_output_file = sys.argv[2]
top_10_state_output_file = sys.argv[3]

# create the two desired output text files
write_output(input_file, top_10_occupation_output_file, 'occupation')
write_output(input_file, top_10_state_output_file, 'state')
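# The imported write_output is not shown. A minimal sketch, assuming a
# semicolon-delimited H-1B input file and a "top 10 certified applications"
# output per group; the column names below are assumptions about the schema:
import csv
from collections import Counter


def write_output_sketch(input_file, output_file, group_by):
    # count certified applications per occupation or per worksite state
    column = 'SOC_NAME' if group_by == 'occupation' else 'WORKSITE_STATE'
    counts = Counter()
    with open(input_file, newline='') as f:
        for row in csv.DictReader(f, delimiter=';'):
            if row.get('CASE_STATUS') == 'CERTIFIED':
                counts[row.get(column, '')] += 1
    total = sum(counts.values()) or 1
    with open(output_file, 'w') as f:
        f.write('TOP_%s;NUMBER_CERTIFIED_APPLICATIONS;PERCENTAGE\n' % group_by.upper())
        for name, n in counts.most_common(10):
            f.write('%s;%d;%.1f%%\n' % (name, n, 100.0 * n / total))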