Example #1
def ret_rec(household, rep):
    # debug hook: uncomment to log progress every 100000 returns
    # if rep.total_responses % 100000 == 0:
    #     print(rep.total_responses)

    if oo.record_active_summary:
        # add household to summary of responses
        for key, value in rep.active_summary.items():
            value[str(getattr(household,
                              key))][math.floor(rep.env.now / 24)] += 1

        for key, value in rep.active_totals.items():
            value[str(getattr(household, key))] += 1

    if oo.record_active_paper_summary and not household.digital:

        for key, value in rep.active_paper_summary.items():
            value[str(getattr(household,
                              key))][math.floor(rep.env.now / 24)] += 1

        for key, value in rep.active_paper_totals.items():
            value[str(getattr(household, key))] += 1

    household.return_received = True
    if oo.record_return_received:
        rep.output_data['Return_received'].append(
            oo.generic_output(rep.reps, household.district.district,
                              household.la, household.lsoa, household.digital,
                              household.hh_type, household.hh_id, rep.env.now))
    # currently every return is counted as a response as soon as it is received; this may need to change
    household.responded = True
    rep.total_responses += 1
    household.district.total_responses += 1

    if oo.record_responded:
        rep.output_data['Responded'].append(
            oo.generic_output(rep.reps, household.district.district,
                              household.la, household.lsoa, household.digital,
                              household.hh_type, household.hh_id, rep.env.now))

    # if the accumulated output has grown too large, flush it to file
    if h.dict_size(rep.output_data) > rep.max_output_file_size:
        h.write_output(rep.output_data, rep.output_path, rep.run)

    # zero-delay timeout so simpy treats this function as a process
    yield rep.env.timeout(0)
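
A generator like this only runs when registered with simpy. A minimal, self-contained sketch of that scheduling, with a hypothetical stub standing in for the real household/rep objects:

import simpy

def ret_rec_stub(env):
    # stand-in for ret_rec: do the bookkeeping, then yield a
    # zero-delay timeout so simpy treats this as a process event
    print("return received at hour", env.now)
    yield env.timeout(0)

env = simpy.Environment()
env.process(ret_rec_stub(env))  # register the generator as a process
env.run()                       # prints: return received at hour 0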
Example #2
def main():

    users = user_database.UserDatabase()
    test_tweets = helper.extract_tweets(users, max_users=-1, max_tweets=1,
                                        min_tweets=1, test_sample=True)
    print(len(test_tweets))

    # pickles must be read in binary mode
    with open('ngrams.pkl', 'rb') as fid:
        ngrams_vars = pickle.load(fid)
        ngrams = ngrams_vars["ngrams"]
        useful_ngrams = ngrams_vars["useful_ngrams"]

    helper.extract_ngrams(test_tweets, ngrams, for_cv_tweets=True)
    test_vars = helper.build_Xy(test_tweets, useful_ngrams)
    X_test = test_vars["X"]
    y_test = test_vars["y"]

    logistic = joblib.load("Results/Log_Testing/Logit_C0.1.joblib.pkl")
    print('Test Logistic Regression score: %f' % logistic.score(X_test, y_test))
    y_test_pred = logistic.predict(X_test)
    helper.write_output("Results/Log_Testing/TestUsers.csv", y_test, y_test_pred, test_tweets, users)
Example #3
def naive_bayes(X_train, y_train, X_cv, y_cv, train_tweets, cv_tweets, filename):
    alphas = [0.01, 0.1, 1, 10, 100]
    best_cv_score = 0.
    best_alpha = 0

    for alpha in alphas:
        print("Doing fit")
        mnb = MultinomialNB(alpha=alpha)
        mnb.fit(X_train, y_train)

        y_cv_pred = mnb.predict(X_cv)
        cv_score = mnb.score(X_cv, y_cv)
        print('CV Multinomial NB score: %(score)f for alpha = %(alpha)f' % {"score": cv_score, "alpha": alpha})

        if cv_score > best_cv_score:
            best_cv_score = cv_score
            best_alpha = alpha

        # note: `users` is not a parameter here, so it must exist at module level
        helper.write_output(filename + "_alpha" + str(alpha) + ".csv", y_cv, y_cv_pred, cv_tweets, users)

    return best_cv_score
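
The loop above is a manual grid search over alpha. If scikit-learn's built-in cross-validation were acceptable, roughly the same sweep could be written with GridSearchCV; the synthetic count matrix below is only a stand-in for the real n-gram features:

import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

# tiny synthetic count matrix standing in for the n-gram features
rng = np.random.RandomState(0)
X = rng.randint(0, 5, size=(40, 6))
y = rng.randint(0, 2, size=40)

search = GridSearchCV(MultinomialNB(), {"alpha": [0.01, 0.1, 1, 10, 100]}, cv=5)
search.fit(X, y)
print(search.best_params_, search.best_score_)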
Example #4
    def create_districts(self):

        co_number = 0

        list_of_districts = sorted(list(self.input_data['districts'].keys()))

        for distr in list_of_districts:

            # checks size of output file and writes to file if too large
            if h.dict_size(self.output_data) >= self.max_output_file_size:
                h.write_output(self.output_data, self.output_path, self.run)

            self.districts.append(district.District(self, distr))

            try:
                co_number += self.input_data['districts'][distr][
                    "census officer"]["standard"]["number"]
            except KeyError as e:
                warning_detail = "no CO for run {} in create districts".format(self.run)
                if oo.record_warnings:
                    self.output_data['Warnings'].append(
                        oo.warnings(self.reps, e, warning_detail))
                co_number = 0
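
The try/except above guards a deeply nested config lookup: districts without a "census officer" entry raise KeyError and contribute zero. A self-contained sketch of the same guarded accumulation, with made-up config literals:

def standard_co_count(district_cfg):
    # return the configured number of standard census officers,
    # or 0 when any of the nested keys is missing
    try:
        return district_cfg["census officer"]["standard"]["number"]
    except KeyError:
        return 0

districts = {
    "D1": {"census officer": {"standard": {"number": 4}}},
    "D2": {},  # no census officers configured
}
print(sum(standard_co_count(cfg) for cfg in districts.values()))  # 4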
Example #5
def logistic_regression(X_train, y_train, X_cv, y_cv, train_tweets, cv_tweets, filename):
    C_regs = [0.03, 0.1, 0.3, 1.0, 3.0]
    best_cv_score = 0.
    best_C = 0

    for C_reg in C_regs:
        print("Doing fit")
        logistic = linear_model.LogisticRegression(penalty="l2", C=C_reg)
        logistic.fit(X_train, y_train)

        cv_score = logistic.score(X_cv, y_cv)
        print('Train LogisticRegression score: %f' % logistic.score(X_train, y_train))
        print('CV LogisticRegression score: %(score)f for C_reg = %(C)f' % {"score": cv_score, "C": C_reg})
        y_cv_pred = logistic.predict(X_cv)

        if cv_score > best_cv_score:
            best_cv_score = cv_score
            best_C = C_reg

        # as in naive_bayes above, `users` must exist at module level
        helper.write_output(filename + "_C" + str(C_reg) + ".csv", y_cv, y_cv_pred, cv_tweets, users)
        joblib.dump(logistic, filename + "_C" + str(C_reg) + ".joblib.pkl", compress=9)

    return best_cv_score
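
The joblib.dump call above pairs with the joblib.load in example #2's main(). A minimal round trip, assuming the standalone joblib package and toy data in place of the real features:

import joblib
import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0, 0, 1, 1])

model = LogisticRegression(penalty="l2", C=0.1).fit(X, y)
joblib.dump(model, "Logit_C0.1.joblib.pkl", compress=9)

restored = joblib.load("Logit_C0.1.joblib.pkl")
print(restored.score(X, y))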
Example #6
def start_run(run_input, seeds, out_path):

    max_output_file_size = 100000000  # flush output to file beyond this size
    start_date = dt.date(*map(int, run_input['start_date'].split(',')))
    end_date = dt.date(*map(int, run_input['end_date'].split(',')))
    sim_hours = (end_date - start_date).total_seconds()/3600
    census_date = dt.date(*map(int, run_input['census_date'].split(',')))
    census_day = (census_date - start_date).days

    # write key dates/info to a csv for later use in post processing
    temp_list = [{'start_date': start_date,
                  'end_date': end_date,
                  'census_date': census_date,
                  'census_day': census_day,
                  'sim_hours': sim_hours}]

    # number of days the simulation covers, used for the daily summary columns
    days = int(sim_hours / 24) + 1
    day_cols = list(range(days))

    # generate list of codes for reference from input file where appropriate
    dig_list = ['False', 'True']
    la_list = []
    lsoa_list = []
    hh_type_list = []

    # cycle through the input data so the output only includes the household types, LSOAs and LAs that actually occur
    for d_key, d_value in run_input['districts'].items():
        for hh_key, hh_value in d_value['households'].items():
            if hh_key not in hh_type_list:
                hh_type_list.append(hh_key)
            for la_key, la_value in hh_value['cca_makeup'].items():
                if la_key not in la_list:
                    la_list.append(la_key)
                for lsoa_key, lsoa_value in la_value.items():
                    if lsoa_key not in lsoa_list:
                        lsoa_list.append(lsoa_key)

    """
    passive_summary records a daily summary of returns that occur without any intervention
    passive_totals records a simple total of returns without intervention

    active_summary records all returns including those due to the interventions
    active_totals records a simple total of all returns

    active_paper_summary paper returns over time
    active_paper_totals total paper returns

    visit_summary records all visits over time by the level in the key
    visit_totals records simple sum of total visits by level in key

    time_summary records time spent on task by the census officers
    time_totals sum of total time on task

    paper_summary summary of paper given out over time
    paper_totals sum of paper given out

    the summaries and totals are at the levels in the keys.
    """

    passive_summary = {}
    passive_totals = {}
    active_summary = {}
    active_totals = {}
    active_paper_summary = {}
    active_paper_totals = {}
    visit_totals = {}
    visit_summary = {}
    time_totals = {}
    time_summary = {}
    paper_totals = {}
    paper_summary = {}

    if oo.record_passive_summary:

        passive_summary = {'la': {la: [0] * days for la in la_list},
                           'lsoa': {lsoa: [0] * days for lsoa in lsoa_list},
                           'digital': {dig: [0] * days for dig in dig_list},
                           'hh_type': {hh: [0] * days for hh in hh_type_list}
                           }

        passive_totals = {'lsoa': {lsoa: 0 for lsoa in lsoa_list},
                          'la': {la: 0 for la in la_list}
                          }

    if oo.record_active_summary:

        active_summary = {'la': {la: [0] * days for la in la_list},
                          'digital': {dig: [0] * days for dig in dig_list},
                          'hh_type': {hh: [0] * days for hh in hh_type_list}
                          }

        active_totals = {'lsoa': {lsoa: 0 for lsoa in lsoa_list},
                         'la': {la: 0 for la in la_list}
                         }

    if oo.record_active_paper_summary:

        active_paper_summary = {'la': {la: [0] * days for la in la_list},
                                'hh_type': {hh: [0] * days for hh in hh_type_list}
                                }

        active_paper_totals = {'lsoa': {lsoa: 0 for lsoa in lsoa_list},
                               'la': {la: 0 for la in la_list}
                               }

    if oo.record_visit_summary:

        visit_totals = {'la': {la: 0 for la in la_list}}

        visit_summary = {'la': {la: [0] * days for la in la_list},
                         'hh_type': {hh: [0] * days for hh in hh_type_list}}

    if oo.record_time_summary:

        time_totals = {'la': {la: 0 for la in la_list}}

        time_summary = {'la': {la: [0] * days for la in la_list},
                        'lsoa': {lsoa: [0] * days for lsoa in lsoa_list}}

    if oo.record_paper_summary:

        paper_totals = {'la': {la: 0 for la in la_list}}

        paper_summary = {'la': {la: [0] * days for la in la_list},
                         'hh_type': {hh: [0] * days for hh in hh_type_list}}

    l.acquire()

    if oo.record_key_info:
        if not os.path.isdir(os.path.join(out_path, 'key info')):
            os.mkdir(os.path.join(out_path, 'key info'))

        key_info_file = os.path.join(out_path, 'key info', run_input['run_id'] + ".csv")
        if not os.path.isfile(key_info_file):
            pd.DataFrame(temp_list).to_csv(key_info_file)
        else:
            pd.DataFrame(temp_list).to_csv(key_info_file, mode='a', header=False)
    l.release()

    output_data = defaultdict(list)

    rnd = random.Random()
    rnd.seed(str(seeds))

    # define simpy env for current rep
    env = simpy.Environment()

    # initialise replication
    initialise.Rep(env,
                   run_input,
                   output_data,
                   passive_summary,
                   passive_totals,
                   active_summary,
                   active_totals,
                   active_paper_summary,
                   active_paper_totals,
                   visit_summary,
                   visit_totals,
                   time_summary,
                   time_totals,
                   paper_summary,
                   paper_totals,
                   rnd,
                   sim_hours,
                   start_date,
                   census_day,
                   out_path,
                   max_output_file_size)

    # and run it
    env.run(until=sim_hours)

    # write the output to csv files
    hp.write_output(output_data, out_path, run_input['run_id'])

    # write summary data to csv to defined folder in main outputs
    summary_path = os.path.join(out_path, 'summary')

    c_run = run_input['run_id']
    c_rep = run_input['rep_id']

    if oo.record_passive_summary:

        hp.output_summary(summary_path, passive_summary, 'passive_summary', c_run, c_rep)
        hp.output_summary(summary_path, passive_totals, 'passive_totals', c_run, c_rep)

    if oo.record_active_summary:

        hp.output_summary(summary_path, active_summary, 'active_summary', c_run, c_rep)
        hp.output_summary(summary_path, active_totals, 'active_totals', c_run, c_rep)

    if oo.record_active_paper_summary:
        hp.output_summary(summary_path, active_paper_summary, 'active_paper_summary', c_run, c_rep)
        hp.output_summary(summary_path, active_paper_totals, 'active_paper_totals', c_run, c_rep)

    if oo.record_visit_summary:

        hp.output_summary(summary_path, visit_summary, 'visit_summary', c_run, c_rep)
        hp.output_summary(summary_path, visit_totals, 'visit_totals', c_run, c_rep)

    if oo.record_time_summary:

        hp.output_summary(summary_path, time_summary, 'time_summary', c_run, c_rep)
        hp.output_summary(summary_path, time_totals, 'time_totals', c_run, c_rep)

    if oo.record_paper_summary:

        hp.output_summary(summary_path, paper_summary, 'paper_summary', c_run, c_rep)
        hp.output_summary(summary_path, paper_totals, 'paper_totals', c_run, c_rep)
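
The summary dictionaries built here are what ret_rec in example #1 increments: for each level (la, lsoa, digital, hh_type) there is one per-day counter list per code, indexed by math.floor(env.now / 24). A tiny sketch with hypothetical LA codes:

import math

days = 3
la_list = ["E07000001", "E07000002"]  # hypothetical LA codes
active_summary = {'la': {la: [0] * days for la in la_list}}

env_now = 26.5  # simulated hours; hour 26.5 falls on day 1
active_summary['la']["E07000001"][math.floor(env_now / 24)] += 1
print(active_summary)  # {'la': {'E07000001': [0, 1, 0], 'E07000002': [0, 0, 0]}}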
Example #7
        desc('state_off_count')).take(top_n_states)
    top_state_off_list = [row['VEH_LIC_STATE_ID'] for row in top_states_with_hig_offnces]
    intd_df2 = intd_df1.where(col('VEH_LIC_STATE_ID').isin(top_state_off_list))
    wspec = Window.orderBy(desc('veh_make_count'))
    intd_df3 = intd_df2.groupBy('VEH_MAKE_ID').agg(countDistinct('CRASH_ID').alias('veh_make_count')).orderBy(
        desc('veh_make_count'))
    intd_df4 = intd_df3.withColumn('rank', dense_rank().over(wspec))
    top_veh_make = intd_df4.filter(intd_df4['rank'] <= top_n_veh_make).collect()
    vehicle_list = [row['VEH_MAKE_ID'] for row in top_veh_make]
    return list(enumerate(vehicle_list, start=1))


df_charges, df_damages, df_units, df_endorse, df_prim_person, df_restrict = create_tables(folder_path)

count_crashes_male = num_crashes_gender_accidents(df_prim_person, prop['ANALYSIS_1_VAR_1'], prop['ANALYSIS_1_VAR_2'])
helper.write_output(prop, count_crashes_male, 'analysis_1')

# eval() turns the config string into a Python object; ast.literal_eval would be safer for plain literals
count_two_wheelers = num_two_whlrs_bkd_for_crash(df_units, eval(prop['ANALYSIS_2']))
helper.write_output(prop, count_two_wheelers, 'analysis_2')

state = state_with_highest_accidents_gender(df_prim_person, prop['ANALYSIS_3'])
helper.write_output(prop, state, 'analysis_3')

veh_list = veh_make_highest_injury(df_units, prop['ANALYSIS_4_VAR_1'], prop['ANALYSIS_4_VAR_2'])
helper.write_output(prop, veh_list, 'analysis_4')

final_df = top_ethnic_group_veh_bdy_style(df_units, df_prim_person)
helper.write_output(prop, final_df, 'analysis_5')

top_zip_list = top_zip_crashes_alcohol(df_units, df_prim_person, prop['ANALYSIS_6'])
helper.write_output(prop, top_zip_list, 'analysis_6')
Example #8
def main():
    filename = sys.argv[1]
    data = parse_input(filename)
    num_pizzas, pizzas = longest_subarray_less_k(data["maximum_slices"],
                                                 data["sizes"])
    write_output(num_pizzas, pizzas)
Example #9
==========    
    Original file size: %s bytes  
    Minified file size: %s bytes

    Removed: %s%% bytes
""" % (input_size, output_size, removed)


if __name__ == "__main__":
    if len(sys.argv) > 1:
        try:
            filename = sys.argv[1]
            if os.path.getsize(filename) == 0:
                print("C'mon, don't bug me! Empty file?!")
                exit(-1)

            source = read_input(filename)  # avoid shadowing the builtin input()

            output = minifyme(source)

            output_filename = "%s.min.js" % sys.argv[1][:-3]
            write_output(output_filename, output)

            print_statistics(source, output)
            print("File %s written." % output_filename)
        except Exception as e:
            print("Something went wrong.")
            print(e)
    else:
        print("Usage: minifyme file.js")
Example #10
import sys
from helper import write_output

# read the input and output file paths from the command line
input_file = sys.argv[1]
top_10_occupation_output_file = sys.argv[2]
top_10_state_output_file = sys.argv[3]

# create the two output text files
write_output(input_file, top_10_occupation_output_file, 'occupation')
write_output(input_file, top_10_state_output_file, 'state')