def report_main():
    import os
    import shutil

    data = load_data()
    dirname = "Report"
    path = os.getcwd()
    make_dir(dirname, path)
    print("Tolong tunggu sebentar...")  # "Please wait a moment..."
    wb_recap = load_sheet('format/Format_3.xlsx')
    wb_solo = load_sheet('format/Format_3.xlsx')
    rekapfolder = "Process"
    rekapfile = "Rekap Evaluasi"
    organisasi = "BPMU"
    tahun = "2018/2019"
    wb = load_sheet(rekapfolder + '/' + rekapfile + '.xlsx', True)
    init = 0
    for cat in data:
        make_dir(cat, path + '/' + dirname)
        for y in data[cat]:
            for nim in y:
                wb.active = wb[nim]
                wb_recap.active = init
                wb_recap.copy_worksheet(wb_recap.active)
                wb_recap.active.title = nim[:9]
                # fillidentity(wb_recap, data[cat][0][nim][0]['nama'], nim, organisasi, data[cat][0][nim][0]['jabatan'], tahun)
                # fillscore(wb_recap, wb[nim])
                # fillidentity(wb_solo, data[cat][0][nim][0]['nama'], nim, organisasi, data[cat][0][nim][0]['jabatan'], tahun)
                # fillscore(wb_solo, wb[nim])
                wb_solo.save(path + '/' + dirname + "/" + cat + '/' + nim + ".xlsx")
                init += 1
    wb_recap.save(path + '/' + dirname + "/Recap Laporan Evaluasi.xlsx")
    return

def get_data():
    x_i = []
    for row in load_data():
        x_i.append(row)
        if len(x_i) > 400:
            # yield a (1, 400, 5) window and the following row as a (1, 5) target
            yield np.array(x_i[:400]).reshape(-1, 400, 5), np.array(x_i[400]).reshape(-1, 5)
            x_i.pop(-1)

def main(num_epochs=NUM_EPOCHS):
    # l_in = lasagne.layers.InputLayer((BATCH_SIZE, 64, 1, 8, 512), x, 'input_layer')
    l_in = lasagne.layers.InputLayer((BATCH_SIZE, sli, 1, sli_l, 512))
    l_forward_1 = lasagne.layers.LSTMLayer(
        l_in, N_HIDDEN, grad_clipping=GRAD_CLIP,
        nonlinearity=lasagne.nonlinearities.tanh)
    l_forward_slice = lasagne.layers.SliceLayer(l_forward_1, -1, 1)
    l_out = lasagne.layers.DenseLayer(
        l_forward_slice, num_units=vocab_size,
        W=lasagne.init.GlorotUniform(),
        nonlinearity=lasagne.nonlinearities.softmax)

    target_values = T.ivector('target_output')
    network_output = lasagne.layers.get_output(l_out)
    cost = T.nnet.categorical_crossentropy(network_output, target_values).mean()
    all_params = lasagne.layers.get_all_params(l_out, trainable=True)
    updates = lasagne.updates.adagrad(cost, all_params, LEARNING_RATE)

    train = theano.function([l_in.input_var, target_values], cost,
                            updates=updates, allow_input_downcast=True)
    compute_cost = theano.function([l_in.input_var, target_values], cost,
                                   allow_input_downcast=True)
    get_out = theano.function([l_in.input_var], lasagne.layers.get_output(l_out),
                              allow_input_downcast=True)
    probs = theano.function([l_in.input_var], network_output,
                            allow_input_downcast=True)

    for n in range(1000):  # was xrange, which is Python 2 only
        inp_t, inp_v, output_t, output_v = load_data()
        x, x_v, y, y_v = gen_data()
        avg_cost = 0
        avg_cost += train(x, y)
        val_output = get_out(x_v)
        val_predictions = np.argmax(val_output, axis=1)
        # print(val_predictions)
        # print(y_v)
        accuracy = np.mean(val_predictions == y_v)
        print(accuracy)
        print(avg_cost)

def make_recap_sheet():
    data = load_data()
    sheet = load_sheet('format/Format_2.xlsx')
    active_sheet_idx = 0
    for cat in data:
        for _ in data[cat]:
            for nim in _:
                sheet.copy_worksheet(sheet.active)
                active_sheet_idx += 1
                sheet.active = active_sheet_idx
                sheet.active.title = nim
                sheet.active["B1"].value = data[cat][0][nim][0]['nama']
                sheet.active["B2"].value = nim
                sheet.active["B3"].value = data[cat][0][nim][0]['jabatan']
    sheet.active = 0
    return sheet

def main(cfg):
    # parse config
    DATA_FOLDER = path.Path(cfg["DATA"]["DatasetPath"])
    MODEL_PATH = path.Path(cfg["MODEL"]["FilePath"])

    # do something with data
    # X = pd.read_csv(f'{DATA_FOLDER}/{cfg["DATA"]["UsersFile"]}')
    x, y = load_data('train', cfg)
    train_x, val_x = split(x)
    train_y, val_y = split(y)

    model = SimpleModel()
    model.fit(train_x, train_y, val_x, val_y)
    joblib.dump(model, MODEL_PATH)
    logging.info("model was trained")

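# Hedged sketch: main(cfg) above only reads cfg["DATA"]["DatasetPath"], cfg["MODEL"]["FilePath"]
# and (in the commented-out line) cfg["DATA"]["UsersFile"]. A minimal config covering those keys
# could look like the dict below; the paths are placeholders, not the project's real files, and
# load_data('train', cfg) may require additional entries.
example_cfg = {
    "DATA": {"DatasetPath": "data/", "UsersFile": "users.csv"},
    "MODEL": {"FilePath": "model.joblib"},
}
# main(example_cfg)
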
def main(cfg):
    # parse config
    DATA_FOLDER = path.Path(cfg["DATA"]["DatasetPath"])
    USER_ID = cfg["COLUMNS"]["USER_ID"]
    PREDICTION = cfg["COLUMNS"]["PREDICTION"]
    MODEL_PATH = path.Path(cfg["MODEL"]["FilePath"])
    SUBMISSION_FILE = path.Path(cfg["SUBMISSION"]["FilePath"])

    # do something with data
    # X = pd.read_csv(f'{DATA_FOLDER}/{cfg["DATA"]["UsersFile"]}')
    x, client_ids = load_data('test', cfg)

    model = joblib.load(MODEL_PATH)
    preds = model.predict(x)
    preds = np.round(preds.flatten()).astype(int)

    sub = pd.DataFrame.from_dict({
        'client_id': client_ids.tolist(),
        'target': preds.tolist()
    })
    sub.to_csv(SUBMISSION_FILE, index=False)

def main_process():
    import os
    import shutil

    data = load_data()
    dirname = "Process"
    path = os.getcwd()
    make_dir(dirname, path)
    print("Tolong tunggu sebentar...")  # "Please wait a moment..."
    try:
        wb_recap = make_recap_sheet()
        for cat in data:
            for y in data[cat]:
                for nim in y:
                    wb = load_sheet("Data Evaluasi/" + nim + ".xlsx", 1)
                    for row in range(5, wb.active.max_row, 7):
                        if wb.active["C" + str(row)].value != nim and wb.active["C" + str(row)].value is not None:
                            wb_recap.active = wb_recap[wb.active["C" + str(row)].value]
                            # find the first empty row in column E, starting at row 3
                            init = 3
                            while wb_recap.active["E" + str(init)].value is not None:
                                init += 1
                            wb_recap.active["E" + str(init)].value = data[cat][0][nim][0]['nama']
                            wb_recap.active["F" + str(init)].value = nim
                            wb_recap.active["G" + str(init)].value = data[cat][0][nim][0]['jabatan']
                            wb_recap.active["H" + str(init)].value = wb.active["F" + str(row + 5)].value
                            wb_recap.active["I" + str(init)].value = wb.active["G" + str(row + 5)].value
                            wb_recap.active["J" + str(init)].value = wb.active["H" + str(row + 5)].value
                            wb_recap.active["K" + str(init)].value = wb.active["K" + str(row + 5)].value
                            wb_recap.active["L" + str(init)].value = wb.active["N" + str(row + 5)].value
                            wb_recap.active["M" + str(init)].value = wb.active["S" + str(row + 5)].value
                            wb_recap.active["N" + str(init)].value = wb.active["W" + str(row + 5)].value
                            wb_recap.active["O" + str(init)].value = wb.active["AA" + str(row + 5)].value
    except Exception as e:
        print(e)
    wb_recap.active = 0
    wb_recap[wb_recap.active.title].sheet_state = "hidden"
    wb_recap.save(path + '/' + dirname + "/Rekap Evaluasi.xlsx")
    return

# x = T.tensor4()
N_HIDDEN = 100
LEARNING_RATE = .001
GRAD_CLIP = 100
NUM_EPOCHS = 20
BATCH_SIZE = 200
vocab_size = 9
inp_t, inp_v, output_t, output_v = load_data()
sli_l = 8
sli = 64
# y = T.ivector()


def gen_data():
    xx = np.zeros((BATCH_SIZE, 512, 512))
    # shuffle inputs and targets with the same permutation
    rng_state = np.random.get_state()
    np.random.shuffle(inp_t)
    np.random.set_state(rng_state)
    np.random.shuffle(output_t)
    y = output_t[0:BATCH_SIZE]
    xx = inp_t[0:BATCH_SIZE, :, :]
    y_v = output_v
    # NOTE: as captured here, the function assigns xx, y and y_v but does not return them,
    # although main() unpacks four values from gen_data()

def run_temp_model(outcome, path_to_data, path_to_result_folder, n_samples=1000):
    if not os.path.exists(path_to_result_folder):
        os.makedirs(path_to_result_folder)

    # # get temp stuff
    # df = pd.read_csv(path_to_data)
    # annual_temps = []
    # daily_temps = []
    # for annual_temp in np.arange(df.meanTempDegree.min(), df.meanTempDegree.max() + 1, 1):
    #     at_dt_temps = np.arange(df.loc[df.meanTempDegree == annual_temp, 'dailyTempCat'].min(),
    #                             df.loc[df.meanTempDegree == annual_temp, 'dailyTempCat'].max() + 0.1, 0.1)
    #     annual_temps += [np.repeat(annual_temp, at_dt_temps.size)]
    #     daily_temps += [at_dt_temps]
    # annual_temps = np.hstack(annual_temps)
    # daily_temps = np.hstack(daily_temps)
    # del df

    # load data
    # -------------------------------------------------------------------------
    tdata = process.load_data(path_to_data, outcome)
    tdata = actions.mtslice.adjust_mean(tdata)
    with open(path_to_result_folder + "/" + outcome + "_tdata.pkl", 'wb') as fwrite:
        pickle.dump(tdata, fwrite, -1)
    tdata_agg = actions.mtslice.aggregate_mtslice(tdata)
    tdata_agg = actions.mtslice.adjust_agg_std(tdata_agg)
    with open(path_to_result_folder + "/" + outcome + "_tdata_agg.pkl", 'wb') as fwrite:
        pickle.dump(tdata_agg, fwrite, -1)

    # fit the mean surface
    # -------------------------------------------------------------------------
    linear_no_mono = ('inj' in outcome)
    surface_result = actions.surface.fit_surface(tdata_agg, linear_no_mono=linear_no_mono)
    with open(path_to_result_folder + "/" + outcome + "_surface_result.pkl", 'wb') as fwrite:
        pickle.dump(surface_result, fwrite, -1)

    # fit the study structure in the residual
    # -------------------------------------------------------------------------
    trend_result, tdata_residual = actions.mtslice.fit_trend(tdata, surface_result, inlier_pct=0.95)
    with open(path_to_result_folder + "/" + outcome + "_trend_result.pkl", 'wb') as fwrite:
        pickle.dump(trend_result, fwrite, -1)
    with open(path_to_result_folder + "/" + outcome + "_tdata_residual.pkl", 'wb') as fwrite:
        pickle.dump(tdata_residual, fwrite, -1)

    # predict surface with UI
    # -------------------------------------------------------------------------
    annual_temps, daily_temps = utils.create_grid_points_alt(np.unique(tdata_agg.mean_temp), 0.1, tdata)
    curve_samples = process.sample_surface(
        mt=annual_temps,
        dt=daily_temps,
        num_samples=n_samples,
        surface_result=surface_result,
        trend_result=trend_result,
        include_re=True
    )
    curve_samples_df = pd.DataFrame(
        np.vstack([annual_temps, daily_temps, curve_samples]).T,
        columns=['annual_temperature', 'daily_temperature'] + [f'draw_{i}' for i in range(n_samples)]
    )
    curve_samples_df.to_csv(
        path_to_result_folder + "/" + outcome + "_curve_samples.csv",
        index=False
    )

    evidence_score = score.scorelator(curve_samples_df, trend_result, tdata, outcome, path_to_result_folder)
    evidence_score.to_csv(
        path_to_result_folder + "/" + outcome + "_score.csv",
        index=False
    )
    del curve_samples, curve_samples_df

    # plot the result
    # -------------------------------------------------------------------------
    # 3D surface and the level plot
    actions.surface.plot_surface(tdata_agg, surface_result)
    plt.savefig(path_to_result_folder + "/" + outcome + "_surface.pdf", bbox_inches="tight")

    # plot uncertainty for each mean temp (can be a subset of this)
    plt.figure(figsize=(8, 6))
    for mt in trend_result.mean_temp:
        fig, ax = plt.subplots(1, 1, figsize=(8, 5))
        viz.plot_slice_uncertainty(mt, tdata, surface_result, trend_result,
                                   ylim=[-1.0, 1.0], ax=ax)
        ax.set_xlabel("daily temperature")
        ax.set_title(outcome + " at mean temperature %i" % mt)
        fig.savefig(path_to_result_folder + "/" + outcome + "_slice_%i.pdf" % mt, bbox_inches="tight")
        plt.close(fig)

t1 = time.time()
print(wd_model.get_available_gpus())  # returns a list such as ['/device:GPU:0', '/device:GPU:1']

# LOAD DATA
print('*-' * 40, 'LOAD DATA')
making_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/max_order_xt/'
link_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/max_170_link_sqe_for_order/'
cross_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/for_0714_cross_sqe_for_order/'
head_link_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/max_head_link_data_clear/'
win_order_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/win_order_xw/'
pre_arrival_sqe_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/sqe_arrival_for_link/'
data_for_driver_xw = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/data_for_driver_xw/'
downstream_status_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/downstream_status_for_order/'
data, mk_cols_list, link_cols_list, cross_cols_list = process.load_data(
    making_data_dir, link_data_dir, cross_data_dir, head_link_dir,
    win_order_data_dir, pre_arrival_sqe_dir, data_for_driver_xw,
    downstream_status_dir)

# PROCESSING DATA
print('*-' * 40, 'PROCESSING DATA')
train_data, val_data = process.processing_data(data, mk_cols_list, link_cols_list,
                                               cross_cols_list, WIDE_COLS)
del data
gc.collect()
# print(train_data.columns.tolist())

# PROCESSING INPUTS
print('*-' * 40, 'PROCESSING INPUTS')
# SAVE LIST
a = np.array(mk_cols_list)

def main():
    random.seed(240480)

    if use_preprocessed_data:
        print('load preprocessed data')
        df_train = pd.read_csv('data/train_processed.csv')
        df_test = pd.read_csv('data/test_processed.csv')
    else:
        df_train, df_test = load_data()

    print('configure data for training')
    id_test = df_test['id']
    y_train = df_train['relevance'].values
    X_train = df_train[:]
    X_test = df_test[:]

    print('construct model')
    # TF-IDF vectoriser - converts docs to a tf-idf feature matrix.
    tfidf = TfidfVectorizer(ngram_range=(1, 1), stop_words='english')
    # truncated singular value decomposition - dimensionality reduction.
    tsvd = TruncatedSVD(n_components=10, random_state=240480)
    # random forest
    rfr = RandomForestRegressor(n_estimators=500, n_jobs=-1, random_state=240480, verbose=1)

    # TODO: get these features to include some cosine similarity measure between search term and other fields!
    # think we need to first fit the tf-idf vectoriser to each of title, description, brand
    # and then insert into the pipeline to generate 3x features of search term against the respective vocabs
    # potentially just include similarity scores as features. or maybe RF will handle this on its own...

    # pipeline:
    # 1. build feature unions [cust_txt_col (to extract column) -> tfidf -> tsvd]
    # 2. pass to random forest.
    clf = Pipeline([
        ('union', FeatureUnion(
            transformer_list=[
                ('cst', cust_regression_vals()),
                ('txt1', Pipeline([('s1', cust_txt_col(key='search_term')), ('tfidf1', tfidf), ('tsvd1', tsvd)])),
                ('txt2', Pipeline([('s2', cust_txt_col(key='product_title')), ('tfidf2', tfidf), ('tsvd2', tsvd)])),
                ('txt3', Pipeline([('s3', cust_txt_col(key='product_description')), ('tfidf3', tfidf), ('tsvd3', tsvd)])),
                ('txt4', Pipeline([('s4', cust_txt_col(key='brand')), ('tfidf4', tfidf), ('tsvd4', tsvd)]))
            ],
            transformer_weights={
                'cst': 1.0,
                'txt1': 0.5,
                'txt2': 0.25,
                'txt3': 0.0,
                'txt4': 0.5
            },
            n_jobs=-1
        )),
        ('rfr', rfr)])

    print('run grid search')
    # TODO: search over relative weightings of transformer features?
    param_grid = {'rfr__max_features': [10], 'rfr__max_depth': [20]}
    RMSE = make_scorer(fmean_squared_error, greater_is_better=False)
    model = grid_search.GridSearchCV(estimator=clf, param_grid=param_grid, cv=2, scoring=RMSE)
    model.fit(X_train, y_train)

    print("Best parameters found by grid search:")
    print(model.best_params_)
    print("Best CV score:")
    print(model.best_score_)

    print('run predictions')
    y_pred = model.predict(X_test)

    print('save submission file')
    pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('submission.csv', index=False)

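# Hedged sketch: `fmean_squared_error` is referenced above but not defined in this snippet.
# A common definition for a scorer used with make_scorer(greater_is_better=False) is the
# root-mean-squared error; this is an assumption, not necessarily the author's helper.
from sklearn.metrics import mean_squared_error
import numpy as np


def fmean_squared_error(ground_truth, predictions):
    # RMSE between true relevance scores and predictions
    return np.sqrt(mean_squared_error(ground_truth, predictions))
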
import process as p
from sklearn import datasets, svm, metrics, utils
from sklearn.ensemble import RandomForestClassifier

dataset = p.load_data("./pot.csv", "./targets.csv")
print("hello there :)")

# clf = svm.SVC()
clf = RandomForestClassifier(max_depth=5, random_state=0)

print("100% of the data is {}.".format(len(dataset.data)))
# Get 4/5
split_index = len(dataset.data) // 5 * 4
print("80% of the data is {}.".format(split_index))

train_data = dataset.data[:split_index]
test_data = dataset.data[split_index:]
train_target = dataset.target[:split_index]
test_target = dataset.target[split_index:]

print(train_target.shape)  # was a Python 2 print statement

clf.fit(train_data, train_target)
out = clf.predict(test_data[0:])
print(out)  # was a Python 2 print statement

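# Hedged follow-up: the snippet above imports sklearn's `metrics` but never scores the
# predictions. A minimal sketch of evaluating the held-out 20% split, reusing the same
# variable names, could be:
print(metrics.accuracy_score(test_target, out))
print(metrics.classification_report(test_target, out))
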
import pickle

import numpy as np

from process import load_data

unique_base_classes = set(load_data('FC100_train.pickle')['labels'])

np.random.seed(seed=42)
base_test_image_indices = {}
for cl in sorted(unique_base_classes):
    base_test_image_indices[cl] = sorted(
        np.random.choice(a=list(range(1, 601)), size=100, replace=False))
    # print(cl, base_test_image_indices[cl])

with open('base_test_indices.pickle', 'wb') as file:
    pickle.dump(obj=base_test_image_indices, file=file)

head_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/max_head_link_data_clear/'
win_order_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/win_order_xw/'
# pre_arrival_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/final_pre_arrival_data/'
arrival_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/max_link_sqe_for_order_arrival/'
zsl_arrival_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/zsl_arrival/'
arrival_sqe_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/max_170_lk_arrival_sqe_for_order/'
# h_s_for_link_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/max_hightmp_slice_for_link_eb/'
pre_arrival_sqe_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/sqe_arrival_for_link/'
zsl_link_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/zsl_train_link/'

data, mk_cols_list, link_cols_list, cross_cols_list = process.load_data(
    making_data_dir, link_data_dir, cross_data_dir, link_data_other_dir,
    head_data_dir, win_order_data_dir, pre_arrival_sqe_dir, zsl_link_data_dir,
    # pre_arrival_data_dir,
    # h_s_for_link_dir,
    arrival_data_dir, zsl_arrival_data_dir, arrival_sqe_data_dir)

# fd = dcn_model.FeatureDictionary(data, numeric_cols=NUMERIC_COLS, ignore_cols=IGNORE_COLS,
#                                  cate_cols=CATEGORICAL_COLS)

# PROCESSING DATA
data['date_time'] = data['date_time'].astype(int)
print("type(data['date_time']):", data['date_time'].dtype)
data = data[data['date_time'] != 20200901]
print('Here train_test_split..................')
# all_train_data, _ = train_test_split(all_train_data, test_size=0.9, random_state=42)