def main():
    # Get label encoder
    lb = LabelBinarizer()
    lbenc = lb.fit(utils.get_classes())

    # Get train data
    X_train, y_train, train_filenames = utils.get_train(
        '../input/train', list(lbenc.classes_), img_width, img_height)

    # Create and train model
    model = train(X_train, y_train, epochs=100, batch_size=32)
    print("+++++++++++++++++++++++++++++++++++++++++++")

    # Load model ...
    #model = load_model('../models/' + 'model2_f0.86/' + 'model2-64-0.341.h5')

    # Get test data
    X_test, X_test_id = utils.get_test('../input/test', img_width, img_height)

    # Predict on test data
    preds = model.predict(X_test, verbose=1)

    # Create submission
    utils.create_submission(lbenc.inverse_transform(preds), X_test_id,
                            output_path="../submissions/", filename=modelname,
                            isSubmission=True)
    utils.to_csv_ens(lbenc.inverse_transform(preds), preds, X_test_id,
                     utils.get_classes(), output_path="../submissions/",
                     filename=modelname)

    print('Finished.')
def eval():
    # Get classes
    lb = LabelBinarizer()
    lbenc = lb.fit(utils.get_classes())

    # Load model
    model = load_model('../models/' + 'model.h5')

    # Get test data
    X_test, X_test_id = utils.get_test('../input/test', img_width, img_height)

    # Predict on test data
    preds = model.predict(X_test, verbose=1)

    # Create ensembling file
    df_csv = utils.to_csv_ens(lbenc.inverse_transform(preds), preds, X_test_id,
                              utils.get_classes(), output_path="../submissions/",
                              filename=modelname)

    # Create submission file
    subm = utils.create_submission(lbenc.inverse_transform(preds), X_test_id,
                                   output_path="../submissions/",
                                   filename=modelname, isSubmission=True)
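# A minimal entry-point sketch (not part of the original file): it assumes the
# module-level globals img_width, img_height and modelname used above are
# defined elsewhere, and simply chooses between training a new model (main)
# and re-scoring a saved one (eval). The RUN_TRAINING flag is hypothetical.
if __name__ == '__main__':
    RUN_TRAINING = True
    if RUN_TRAINING:
        main()
    else:
        eval()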
def read_vals(objective, graph_files):
    tool_files = ["/".join(path) for path in graph_files
                  if utils.get_tool(utils.get_fn(path)) == tool]

    # Extract times
    name_to_n = utils.key_functions[objective]
    time_pairs = [(name_to_n(utils.get_test(utils.get_fn(path.split("/")))),
                   utils.read_times(in_dir + "/" + path))
                  for path in tool_files]

    # Sort values
    times_sorted = sorted(time_pairs, key=lambda pair: pair[0])
    n_vals = list(map(lambda pair: pair[0], times_sorted))
    t_objective_vals = list(map(lambda pair: pair[1][0], times_sorted))
    t_jacobian_vals = list(map(lambda pair: pair[1][1], times_sorted))

    return (n_vals, t_objective_vals, t_jacobian_vals)
def get_test_instance(self):
    if os.path.exists('../DataSets/mydata/test_txtid.pkl') and os.path.exists('../DataSets/mydata/test_instances.pkl'):
        test_txtid_pkl = open('../DataSets/mydata/test_txtid.pkl', 'rb')
        test_txtid = pickle.load(test_txtid_pkl)
        test_instances_pkl = open('../DataSets/mydata/test_instances.pkl', 'rb')
        test_instances = pickle.load(test_instances_pkl)
    else:
        test_txtid, test_instances = get_test()

    test_paper_list = self.load_data(test_instances)
    test_tag_list, test_id_list, test_p1_list, test_p2_list, test_y_list = self.test_padding(
        test_paper_list, self.word2id_dic, self.max_sequence_length, self.position_max)
    self.testb_dataset = (test_txtid, test_tag_list, test_id_list,
                          test_p1_list, test_p2_list, test_y_list)

    return self.testb_dataset
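# A hypothetical helper (not in the original): when the pickles are missing and
# get_test() has to rebuild test_txtid / test_instances, they could be cached
# for the next run. The file names mirror the ones checked above; the original
# project may well write these pickles elsewhere.
def _cache_test_instance(test_txtid, test_instances):
    import pickle
    with open('../DataSets/mydata/test_txtid.pkl', 'wb') as f:
        pickle.dump(test_txtid, f)
    with open('../DataSets/mydata/test_instances.pkl', 'wb') as f:
        pickle.dump(test_instances, f)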
def read_vals(objective, graph_files, tool, in_dir):
    '''Extracts data for files of the specified tool.'''

    def get_violations(file_name):
        '''Extracts jacobian calculation correctness.'''
        folder, fn = os.path.split(file_name)
        correctness_file_name = os.path.join(
            in_dir, folder, fn.replace(TIMES_SUBSTRING, CORRECTNESS_SUBSTRING))

        if not os.path.isfile(correctness_file_name):
            print(f"WARNING: correctness file {correctness_file_name} "
                  "doesn't exist\n")
            return False

        try:
            with open(correctness_file_name, "r", encoding="ascii") as cf:
                correctness_data = json.load(cf)
            return correctness_data["ViolationsHappened"]
        except Exception as e:
            print(f"WARNING: correctness file {correctness_file_name} parsing "
                  f"failed.\nError message:{e.args}\n")
            return False

    tool_files = [
        os.path.join(*path) for path in graph_files
        if utils.get_tool_from_path(path) == tool
    ]

    if has_manual(tool):
        violation_info = [False for file in tool_files]
    else:
        violation_info = [get_violations(file) for file in tool_files]

    # Extract times
    name_to_n = utils.key_functions[objective]
    info = [(name_to_n(utils.get_test(utils.get_fn(path.split("/")))),
             utils.read_times(os.path.join(in_dir, path)),
             violation)
            for (path, violation) in zip(tool_files, violation_info)]

    # Sort values
    info_sorted = sorted(info, key=lambda t: t[0])
    n_vals = list(map(lambda t: t[0], info_sorted))
    t_objective_vals = list(map(lambda t: t[1][0], info_sorted))
    t_jacobian_vals = list(map(lambda t: t[1][1], info_sorted))
    violation_vals = list(map(lambda t: t[2], info_sorted))

    return (n_vals, t_objective_vals, t_jacobian_vals, violation_vals)
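# A minimal usage sketch, not part of the original module. The objective name,
# tool name, path components and results directory below are placeholders and
# must point at real benchmark output files for the call to succeed; it assumes
# graph_files is a list of per-file path components, as used above.
if __name__ == '__main__':
    example_graph_files = [["gmm", "Tapenade", "gmm_d2_K5_times_Tapenade.txt"]]
    n_vals, t_obj, t_jac, violations = read_vals(
        "gmm", example_graph_files, "Tapenade", "/path/to/results")
    print(n_vals, t_obj, t_jac, violations)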
def get_vocab(dset):
    return set(w for sent in dset for w in sent['ws'])

def get_contexts(sent, c):
    ws = (['<s>']*c) + sent['ws'] + (['</s>']*c)
    contexts = []
    for i, w in enumerate(sent['ws']):
        wi = i + c
        if sent['ii'][i]:
            contexts.append(' '.join([w for w in ws[wi-c:wi] + ['___'] + ws[wi+1:wi+c+1]]))
    return contexts

if __name__ == '__main__':
    trn = get_dset()
    tst = get_test()
    print map(len, map(get_tagged_vocab, [trn,tst]))
    print 'tagged vocab size trn {} tst {}'.format(*map(len, map(get_tagged_vocab, [trn,tst])))
    print 'all vocab size trn {} tst {}'.format(*map(len, map(get_vocab, [trn,tst])))

    vtrn, vtst = map(get_tagged_vocab, [trn,tst])
    print 'tagged vtst diff: {:.2f}'.format(len(vtst.difference(vtrn)) / len(vtst))

    vtrn, vtst = map(get_vocab, [trn,tst])
    print 'all vtst diff: {:.2f}'.format(len(vtst.difference(vtrn)) / len(vtst))

    precnt = Counter(w[:j] for sent in trn for w, lbl in zip(sent['ws'], sent['ls'])
                     for j in range(3,5) if lbl == 1 and len(w) > j)
    sufcnt = Counter(w[-j:] for sent in trn for w, lbl in zip(sent['ws'], sent['ls'])
                     for j in range(3,5) if lbl == 1 and len(w) > j)
    print 'most common prefixes:', precnt.most_common(100)
    print 'most common suffixes:', sufcnt.most_common(100)
def submission(model, sampling_method, data_dir, results_dir, device='cpu', verbose=True):
    if verbose:
        print("Using device: {}".format(device))
        print("Reading train data in...")

    if model == 'lgbm':
        X_train, Y_train, feature_labels = get_train(data_dir, one_hot=False)
    else:
        X_train, Y_train, feature_labels = get_train(data_dir)

    X_test = get_test(data_dir)
    train_ids, test_ids = get_ids(data_dir)
    country_names = get_country_names(data_dir)

    if verbose:
        print("Successfully loaded data")

    lgbm_params = {
        'task': 'train',
        'objective': 'multiclass',
        'num_class': 12,
        'num_leaves': 31,
        'learning_rate': 0.3,
        'lambda_l2': 1.0,
        'feature_fraction': 0.9,
        'min_child_weight': 1.0,
        'device': device,
        'gpu_device_id': 0,
        'gpu_platform_id': 0,
        'max_bin': 63,
        'verbose': 0
    }

    if device == 'cpu':
        xgb_params = {
            "objective": "multi:softprob",
            "num_class": 12,
            "tree_method": "hist",
            "colsample_bytree": 0.9,
            "n_jobs": 2,
            "silent": 1
        }
    else:
        xgb_params = {
            "objective": "multi:softprob",
            "num_class": 12,
            "tree_method": "gpu_hist",
            "colsample_bytree": 0.9,
            "gpu_id": 0,
            "max_bin": 16,
            "silent": 1
        }

    if verbose:
        print("{} sampling process started...".format(sampling_method))

    curr_time = time.time()
    if sampling_method == "adasyn":
        X_train_resampled, Y_train_resampled = ADASYN().fit_sample(X_train, Y_train)
    elif sampling_method == "smote":
        X_train_resampled, Y_train_resampled = SMOTE().fit_sample(X_train, Y_train)
    elif sampling_method == "random":
        X_train_resampled, Y_train_resampled = RandomOverSampler().fit_sample(X_train, Y_train)
    elif sampling_method == "smoteenn":
        X_train_resampled, Y_train_resampled = SMOTEENN().fit_sample(X_train, Y_train)
    else:
        X_train_resampled, Y_train_resampled = X_train, Y_train

    if verbose:
        print("Oversampling completed")
        print("Time Taken: {:.2f}".format(time.time() - curr_time))
        print("Size of Oversampled data: {}".format(X_train_resampled.shape))
        print("{} selected for classification".format(model))

    curr_time = time.time()
    if model == 'lgbm':
        categorical_feature = [
            'age_bucket', 'gender', 'signup_method', 'signup_flow', 'language',
            'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked',
            'signup_app', 'first_device_type', 'first_browser'
        ]
        lgb_train = lgb.Dataset(data=X_train_resampled,
                                label=Y_train_resampled,
                                feature_name=feature_labels,
                                categorical_feature=categorical_feature)
        clf = lgb.train(lgbm_params, lgb_train, num_boost_round=30)
        print("Time taken: {:.2f}".format(time.time() - curr_time))
        Y_probs = clf.predict(X_test)
        # Indices of the five most likely destination countries per user
        order = np.argsort(-Y_probs, axis=1)[:, :5]
    else:
        X_train_xgb = xgb.DMatrix(X_train_resampled, Y_train_resampled,
                                  feature_names=feature_labels)
        X_test_xgb = xgb.DMatrix(X_test, feature_names=feature_labels)
        clf = xgb.train(xgb_params, X_train_xgb, 30)
        print("Time taken: {:.2f}".format(time.time() - curr_time))
        Y_probs = clf.predict(X_test_xgb)
        # Indices of the five most likely destination countries per user
        order = np.argsort(-Y_probs, axis=1)[:, :5]

    print("Generating submission csv...")
    with open(os.path.join(results_dir, 'submission_{}.csv'.format(model)), 'w') as f:
        writer = csv.writer(f, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(['id', 'country'])
        for i in range(len(test_ids)):
            for k in range(5):
                writer.writerow([test_ids[i], country_names[order[i, k]]])

    print("Finished.")
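# A hypothetical invocation, not part of the original file; the directory
# paths are placeholders and would need to point at the prepared competition
# data and an existing results folder. Any model string other than 'lgbm'
# routes to the xgboost branch above.
if __name__ == '__main__':
    submission(model='xgb', sampling_method='smote',
               data_dir='../data', results_dir='../results',
               device='cpu', verbose=True)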
    pred_labels = []
    for sent, pred in zip(dset, preds):
        gold_labels.extend([sent['ls'][ii] for ii, interested in enumerate(sent['ii']) if interested])
        # pred_labels.extend([pred[ii] for ii, interested in enumerate(sent['ii']) if interested])
        pred_labels.extend(pred)
    logging.debug(tabulate(confusion_matrix(np.array(gold_labels), np.array(pred_labels)), headers=[0,1]))
    p, r, f = evaluate_system.evaluateIdentifier(gold_labels, pred_labels)
    return p, r, f

if __name__ == '__main__':
    parser = get_arg_parser()
    args = vars(parser.parse_args())
    setup_logger(args)
    logging.debug(tabulate([OrderedDict((k,v) for k,v in sorted(args.iteritems()))], headers='keys'))

    if args['testf']:
        trn = utils.get_dset(args['data'])
        tst = utils.get_test()
        ytrn, ytst = fit_predict(trn, tst, args, Emb(trn+tst))
        with open(args['testf'], 'w') as out:
            out.write('\n'.join([str(y) for y in ytst]))
    else:
        dset = utils.get_dset(args['data'])
        if args['sample'] > 0:
            random.seed(0)
            dset = random.sample(dset, args['sample'])
        xvalidate(dset, args, Emb(dset))
# create initializations for the three different test problems
# for every problem: create 100*5 random X values, distributed on the given slices
# for every problem: create corresponding responses
# save the data to be used in the torch and GPy implementation
from utils import get_test
import numpy as np
import torch
import pandas as pd

torch.manual_seed(42)
np.random.seed(42)

problems = ['branin', 'eggholder', 'camel']

for problem in problems:
    testfunction, slices, scalers, hyperparameters = get_test(problem)

    # create 500 points on [0, 1]
    X1 = np.random.rand(500, 1)
    X2 = np.random.choice(slices.numpy(), (500, 1))  # slices are already scaled
    X = scalers[0].inverse_transform(
        torch.tensor(np.concatenate((X1, X2), axis=1)))  # scale back for evaluation
    y = testfunction(torch.tensor(X))

    # write to csv
    df = pd.DataFrame(np.concatenate((X, y.reshape(-1, 1)), axis=1),
                      columns=['X1', 'X2', 'y'])
    df.to_csv('initialization_' + problem + '.csv', index=False)
def min_max_optimization():
    np.random.seed(42)
    torch.manual_seed(32)
    iterations_list = [20, 20, 100]

    for problem_idx, problem in enumerate(['branin', 'camel', 'eggholder']):
        testfunction, slices, scalers, hyperparameters = get_test(problem)
        n_init = 5
        jj = 0

        # load data
        df_read = pd.read_csv('initialization_' + problem + '.csv')
        print("read data")

        for initialization in range(int(df_read.shape[0] / n_init)):
            X = np.array(df_read.iloc[(initialization * n_init):(initialization * n_init + n_init), 0:2])
            z = np.array(df_read.iloc[(initialization * n_init):(initialization * n_init + n_init), 2]).reshape(-1, 1)

            # scale
            X_scaled = torch.tensor(scalers[0].transform(X), dtype=torch.float32)
            z_scaled = torch.tensor(scalers[1].transform(z), dtype=torch.float32)

            # run the optimization
            iterations = iterations_list[problem_idx]
            model = build_models(X_scaled, z_scaled, hyperparameters)
            model = model.eval()
            results = torch.zeros((1, 7))
            print("started optimization")

            for i in range(iterations):
                new_candidate, min_max_location, current_min_max = thompson_sampling_acquisition(
                    model, slices)
                current_min_max_unscaled = torch.tensor(
                    scalers[1].inverse_transform(
                        current_min_max.detach().numpy().reshape(1, 1)))
                min_max_location_unscaled = torch.tensor(
                    scalers[0].inverse_transform(
                        min_max_location.detach().numpy()))
                new_candidate_unscaled = torch.tensor(
                    scalers[0].inverse_transform(
                        new_candidate.detach().numpy()))
                new_function_value = testfunction(
                    new_candidate_unscaled.reshape(1, -1))

                # update the model
                model = model.condition_on_observations(
                    new_candidate,
                    torch.tensor(scalers[1].transform(
                        new_function_value.numpy().reshape(-1, 1))))

                print('new candidate:', new_candidate_unscaled)
                print('min max location:', min_max_location_unscaled)
                print('current min max:', current_min_max_unscaled)
                print('iteration ', i)

                results[0, 0] = i
                results[0, 1:3] = new_candidate_unscaled
                results[0, 3:5] = min_max_location_unscaled
                results[0, 5] = current_min_max_unscaled
                results[0, 6] = initialization

                df = pd.DataFrame(results.detach().numpy(), columns=[
                    'i', 'x_cand0', 'x_cand1', 'min_max0', 'min_max1',
                    'min_max_val', 'init'
                ])
                df['problem'] = problem
                if jj == 0:
                    df.to_csv(problem + '_results_thompson.csv', index=False)
                else:
                    df.to_csv(problem + '_results_thompson.csv', mode='a',
                              header=False, index=False)
                jj += 1

            print('finished the optimization')
    best_valid_mae = model.mae_valid
    best_epoch_trial = model.early_stop_epoch
    print('trial valid rmse of best epoch:', model.best_rmse)
    print('trial valid r2 of best epoch:', model.r2_valid)
    print('trial valid mae of best epoch:', model.mae_valid)

    with open('params_dict.txt', 'w', encoding="utf8") as outfile:
        json.dump(trials_dict, outfile)

    return best_valid_rmse, best_valid_r_2, best_valid_mae, best_epoch_trial

if __name__ == '__main__':
    train, validation = get_data(True, 1, 1)
    test = get_test()

    # SGD
    best_valid_rmse_sgd, best_valid_r_2_sgd, best_valid_mae_sgd, best_epoch = \
        hyper_param_tuning('SGD', SGD_HYPER_PARAMS)
    print('best SGD model rmse:', best_valid_rmse_sgd)
    print('best SGD model r2:', best_valid_r_2_sgd)
    print('best SGD model mae:', best_valid_mae_sgd)

    final_model = SGD()
    final_model.fit_early_stop(train, validation, best_epoch)
    test['pred'] = test.apply(lambda row: final_model.predict(row[USER_COL], row[ITEM_COL]), axis=1)
    test[[USER_COL, ITEM_COL]] = test[[USER_COL, ITEM_COL]].apply(lambda col: col + 1)
print("Ensemble : ", mean_square_error(target_val, mean_pred_data)) """ Create a file of the ensemble prediction in the corresponding format for a submission on kaggle """ all_preds_han = [] for i in range(len(target_idx)): idx = target_idx[i] nb_model = 1 list_pred = [] # each model begin to predict on the documents test for i in range(len(type_loss)): name_doc = list_name_documents[i] docs_val = get_test(name_doc=name_doc, idx_target=idx, config=config) if type_loss[i] == "mse": custom_loss = "mean_squared_error" elif type_loss[i] == "higher": custom_loss = mse_asymetric_higher elif type_loss[i] == "lower": custom_loss = mse_asymetric_lower if type_model[i] == 1: model = get_model_1(docs_train=docs_val, config=config, name_embeddings=list_embeddings[i], custom_loss=custom_loss) elif type_model[i] == 2: model = get_model_2(docs_train=docs_val,