def compute_dtw(data, dataarray, w):
    click.echo('--- Compute DTW ---')
    timeseries, timeseries_label = load_labelled(dataarray)
    timeserie_1 = load_test(data)
    click.echo(' - data      : %s ' % data)
    click.echo(' - dataarray : %s ' % dataarray)
    click.echo(' - w         : %d ' % w)
    click.echo('\nRunning...')
    unsorted_dtws = get_distances(timeserie_1[0],
                                  data_array=timeseries,
                                  max_warping_window=w)
    # Save plots
    dtw_plots(unsorted_dtws)
    click.echo('Done. Plots have been saved.')
    click.echo('Choose a maximum number for labelling good and bad data based on DTW values.')
    click.echo('Check the plots to make a better decision.')
    click.echo(' Example: If the value for Good is 150, all data with DTW 0-150 will be labelled "Good".')
    # Enter limit for 'Good'
    good_value = raw_input(' > Enter a value for "Good" (Ex: 150) : ')
    # Enter limit for 'Bad'
    bad_value = raw_input(' > Enter a value for "Bad" (Ex: 350) : ')
    # Print and save results to CSV
    fileName = raw_input(' > Enter a file name (add .csv at the end) : ')
    label_dtws(unsorted_dtws, int(good_value), int(bad_value), fileName)
    click.echo('\nDone.')
def predict(k, w, train, test):
    click.echo('--- Predicting a label ---')
    # click.echo('Predicting with k=%d and w=%d.' % (k, w))
    train_data, train_label = load_labelled(train)
    test_data = load_test(test)
    click.echo(' - k     : %d ' % k)
    click.echo(' - w     : %d ' % w)
    click.echo(' - train : %s ' % train)
    click.echo(' - test  : %s ' % test)
    click.echo('\nRunning...')
    model = KnnDtw(k_neighbours=k, max_warping_window=w)
    model.fit(train_data, train_label)
    predicted_label, probability = model.predict(test_data)
    click.echo('\nPredicted label : %s ' % str(predicted_label))
    click.echo('\nDone.')
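# Note: KnnDtw and get_distances themselves are not shown in these snippets.
# For context, below is a minimal sketch of a DTW distance with a bounded
# warping window of the kind `max_warping_window` suggests. The function name,
# cost function and windowing scheme are illustrative assumptions, not this
# project's actual implementation.
import numpy as np

def dtw_distance(s, t, max_warping_window=10):
    """Dynamic time warping between two 1-D sequences, Sakoe-Chiba style band."""
    n, m = len(s), len(t)
    cost = np.full((n, m), np.inf)
    cost[0, 0] = abs(s[0] - t[0])
    for i in range(1, n):
        cost[i, 0] = cost[i - 1, 0] + abs(s[i] - t[0])
    for j in range(1, m):
        cost[0, j] = cost[0, j - 1] + abs(s[0] - t[j])
    for i in range(1, n):
        # only fill cells within the warping band around the diagonal
        for j in range(max(1, i - max_warping_window),
                       min(m, i + max_warping_window)):
            cost[i, j] = abs(s[i] - t[j]) + min(cost[i - 1, j],      # insertion
                                                cost[i, j - 1],      # deletion
                                                cost[i - 1, j - 1])  # match
    # cost[-1, -1] can be inf if the band is too narrow for len(s) != len(t)
    return cost[-1, -1]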
print "Best average score =", best[0] print "Average threshold =", threshold print "Best params =", params print "Save fold predictions for stacking..." decisions = best[5] for i, d in enumerate(decisions): np.save("stack/%s-fold%d.npy" % (prefix, i), decisions[i]) # Retrain on the training set print "Retrain on the full training set..." clf = Classifier(**params) w = rescale(w) w = rebalance(y, w) try: clf.fit(X, y, sample_weight=w) except: clf.fit(X, y) print "Save test predictions for stacking..." X_test, _, _, ids = load_test() #X_test = tf.transform(X_test.astype(np.float32)) d = clf.predict_proba(X_test)[:, 0] d = d.flatten() np.save("stack/%s-test.npy" % prefix, d)
utils.start(__file__)
#==============================================================================
PREF = 'f332_'

KEY = 'SK_ID_CURR'

ins_start = 0  # 1~277
ins_end = 1    # 1~277

os.system(f'rm ../feature/t*_{PREF}*')

# =============================================================================
#
# =============================================================================
train = utils.load_train([KEY])
test = utils.load_test([KEY])

# =============================================================================
#
# =============================================================================
def aggregate(args):
    path, pref = args

    df = utils.read_pickles(path)
    df = df[df['NUM_INSTALMENT_NUMBER'].between(ins_start, ins_end)]
    del df['SK_ID_PREV']

    df_agg = df.groupby(KEY).agg({**utils_agg.ins_num_aggregations})
    df_agg.columns = pd.Index([e[0] + "_" + e[1] for e in df_agg.columns.tolist()])
        li = only_target1(c)
        feature[f'{PREF}_{c}'] = (df[c].isin(li)) * 1
    feature[f'{PREF}_sum'] = feature.sum(1)

    feature.iloc[:200000].to_pickle(f'../data/train_{PREF}.pkl')
    feature.iloc[200000:].reset_index(drop=True).to_pickle(f'../data/test_{PREF}.pkl')

    return

# =============================================================================
# main
# =============================================================================
if __name__ == "__main__":
    utils.start(__file__)

    tr = utils.load_train().drop(['ID_code', 'target'], axis=1)
    y_train = utils.load_target()['target']
    te = utils.load_test().drop(['ID_code'], axis=1)

    tr0 = tr[y_train == 0]
    tr1 = tr[y_train == 1]

    trte = pd.concat([tr, te], ignore_index=True)[tr.columns]

    fe(trte)

    utils.end(__file__)
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] * self.word2weight[w]
                     for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])


etree_w2v = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)),
    ("extra trees", ExtraTreesClassifier(n_estimators=200))])
etree_w2v_tfidf = Pipeline([
    ("word2vec vectorizer", TfidfEmbeddingVectorizer(w2v)),
    ("extra trees", ExtraTreesClassifier(n_estimators=200))])

res = utils.load_train()
etree_w2v.fit(res[0], res[1])

test = utils.load_test()
preds = etree_w2v.predict(test[0])
print(metrics.classification_report(test[1], preds))
print(metrics.confusion_matrix(test[1], preds))
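# MeanEmbeddingVectorizer, used in the first pipeline above, is not shown in
# this snippet. A minimal sketch of what such a transformer typically looks
# like (an assumption modelled on the TF-IDF variant above, not the file's
# actual definition):
import numpy as np

class MeanEmbeddingVectorizer(object):
    """Averages the word2vec vectors of the words in each tokenised document."""
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # embedding dimensionality (assumes a non-empty {word: vector} mapping)
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])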
loop = 1

param = {
    'max_depth': 15,
    'eta': 0.1,
    'colsample_bytree': 0.6,
    'subsample': 0.5,
    'silent': 1,
    # 'scale_pos_weight': 1.707,  # neg/pos
    'eval_metric': 'auc',
    'objective': 'binary:logistic'
}

train = utils.load_train(file_in=file_in)
test = utils.load_test(file_in=file_in)

#==============================================================================
# logloss NO sampling
#==============================================================================
def get_valid_col(col):
    return [c for c in col
            if c.count(',') > 0 or c.count('[') > 0
            or c.count(']') > 0 or c.count('>') > 0]

col = ['qid1', 'qid2', 'question1', 'question2', 'is_duplicate']

y_train = train.is_duplicate
train_sub = train[['id', 'is_duplicate']]
def classify(train, examples):
    cv_res = {
        "PP": 0,
        "PN": 0,
        "NP": 0,
        "NN": 0,
        "contradictory": 0,
    }
    plus = train["plus"]
    minus = train["minus"]
    l = len(examples)
    i = 0
    for elem in examples:
        i += 1
        print "%i/%i" % (i, l)
        result = check_hypothesis(plus, minus, elem)
        cv_res[result] += 1
    return cv_res


if __name__ == "__main__":
    index = int(sys.argv[1])
    train = utils.load_train(index)
    test = utils.load_test(index)
    res = classify(train, test)
    print res
    print utils.summary(res)
    models.append(model)
    model.save_model('../model/xgb{}.model'.format(i))

train_col = dtrain.feature_names
del train, dtrain, y_train
gc.collect()

imp = ex.getImp(models)
imp.to_csv('../output/imp-{}.csv'.format(date), index=0)

#==============================================================================
# test
#==============================================================================
test1 = utils.load_test(file_in, file_remove)
col = ['test_id', 'question1', 'question2']
sub = test1[col]
test1.drop(col, axis=1, inplace=1)

if is_mirror:
    print('q1_to_q2!')
    test2 = utils.q1_to_q2(test1)
    dtest1 = xgb.DMatrix(test1[train_col])
    dtest2 = xgb.DMatrix(test2[train_col])
    del test1, test2
    gc.collect()

sub['is_duplicate'] = 0
def get_ppr10():
    p = utils.load_test()
    ppr10 = simple_extract(p)
    return ppr10[0]
def gae_for(args, iter='0.txt'):
    # print("Using {} dataset".format(args.ds))
    # adj_cd, features = load_data(args.ds)

    'Load features!'
    if args.ds.startswith('tf'):
        if args.labels == 'y':
            adj_cd, adj_dd, features, tags_nodes = my_load_data_tfidf_semi(args.wmd)
        else:
            adj_cd, adj_dd, features = my_load_data_tfidf(args.wmd)
    else:
        if args.labels == 'y':
            adj_cd, adj_dd, features, tags_nodes = my_load_data_p2v_semi(args.wmd)
        else:
            adj_cd, adj_dd, features = my_load_data_p2v(args.wmd)
        # adj_cd, adj_dd, features = my_load_data_p2v()

    'Load test adjacency matrix'
    adj_test = load_test()
    # adj_test = load_test_10_percent(iter)

    n_nodes, feat_dim = features.shape

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig_cd = adj_cd
    'do again for adj_dd'
    adj_orig_dd = adj_dd

    adj_train_cd, train_edges, val_edges, val_edges_false = mask_train_edges(adj_cd)
    'do again for adj_dd'
    adj_train_dd, train_edges_dd, _, _ = mask_train_edges(adj_dd)
    # adj_cd = adj_train_cd
    adj_dd = adj_train_dd
    test_edges, test_edges_false = make_test_edges(adj_train_cd, adj_test)

    # Some preprocessing: calculate norm
    adj_norm_cd = preprocess_graph(adj_cd)

    'For loss function: add diag values'
    adj_label_cd = adj_train_cd + sp.eye(adj_train_cd.shape[0])
    adj_label_cd = torch.FloatTensor(adj_label_cd.toarray())

    pos_weight_cd = float(adj_cd.shape[0] * adj_cd.shape[0] - adj_cd.sum()) / adj_cd.sum()
    norm_cd = adj_cd.shape[0] * adj_cd.shape[0] / float(
        (adj_cd.shape[0] * adj_cd.shape[0] - adj_cd.sum()) * 2)

    'do it again for adj_dd'
    adj_norm_dd = preprocess_graph(adj_dd)
    adj_label_dd = adj_train_dd + sp.eye(adj_train_dd.shape[0])
    adj_label_dd = torch.FloatTensor(adj_label_dd.toarray())
    pos_weight_dd = float(adj_dd.shape[0] * adj_dd.shape[0] - adj_dd.sum()) / adj_dd.sum()
    norm_dd = adj_dd.shape[0] * adj_dd.shape[0] / float(
        (adj_dd.shape[0] * adj_dd.shape[0] - adj_dd.sum()) * 2)

    if args.labels == 'y':
        model = GCNModelVAE_Semi(feat_dim, args.hidden1, args.hidden2,
                                 args.dropout, args.class_dim)
    else:
        model = GCNModelVAE(feat_dim, args.hidden1, args.hidden2, args.dropout)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    print('Now start training...')
    hidden_emb = None
    for epoch in range(args.epochs):
        t = time.time()
        model.train()
        optimizer.zero_grad()
        # import pdb; pdb.set_trace
        if args.labels == 'y':
            recovered, mu, logvar, pred_nodes = model(features, [adj_norm_cd, adj_norm_dd])
            loss = loss_function_relation_semi(preds=recovered,
                                               labels=(adj_label_cd, adj_label_dd),
                                               mu=mu, logvar=logvar,
                                               n_nodes=n_nodes,
                                               norm=(norm_cd, norm_dd),
                                               pos_weight=(pos_weight_cd, pos_weight_dd),
                                               pred_nodes=pred_nodes,
                                               tags_nodes=tags_nodes)
        else:
            recovered, mu, logvar = model(features, [adj_norm_cd, adj_norm_dd])
            loss = loss_function_relation(preds=recovered,
                                          labels=(adj_label_cd, adj_label_dd),
                                          mu=mu, logvar=logvar,
                                          n_nodes=n_nodes,
                                          norm=(norm_cd, norm_dd),
                                          pos_weight=(pos_weight_cd, pos_weight_dd))
        loss.backward()
        cur_loss = loss.item()
        optimizer.step()

        hidden_emb = mu.data.numpy()
        acc_curr, p, r, f1, map_curr, roc_curr = my_eval(
            hidden_emb, (adj_orig_cd, adj_orig_dd), val_edges, val_edges_false)

        print("Epoch:", '%04d' % (epoch + 1),
              "train_loss=", "{:.5f}".format(cur_loss),
              "val_ap=", "{:.5f}".format(map_curr),
              "val_ac=", "{:.5f}".format(acc_curr),
              "time=", "{:.5f}".format(time.time() - t))

    print("Optimization Finished!")

    acc_score, p, r, f1, map_score, roc_score = my_eval_test(
        hidden_emb, (adj_orig_cd, adj_orig_dd), test_edges, test_edges_false)
    # print('Test ROC score: ' + str(roc_score))
    # print('Test AP score: ' + str(ap_score))
print ("Test accuracy ", "{:.5f}".format(acc_score)) # print ('P {:.5f}, R {:.5f}, F1 {:.5f}'.format(p,r,f1)) print('Acc, P, R, F1, MAP, AUC') print('{:5f},{:5f},{:5f},{:5f},{:5f},{:5f}'.format(acc_score, p, r, f1, map_score, roc_score)) return acc_score, p, r, f1, map_score, roc_score
""" import pandas as pd import os import utils utils.start(__file__) # ============================================================================= folders = ['../feature_prev', '../feature_prev_unused'] for fol in folders: os.system(f'rm -rf {fol}') os.system(f'mkdir {fol}') train = utils.load_train(['SK_ID_CURR', 'TARGET']) test = utils.load_test(['SK_ID_CURR']) prev = utils.read_pickles('../data/previous_application') prev_train = pd.merge(prev, train, on='SK_ID_CURR', how='inner') prev_test = pd.merge(prev, test, on='SK_ID_CURR', how='inner') utils.to_pickles(prev_train, '../data/prev_train', utils.SPLIT_SIZE) utils.to_pickles(prev_test, '../data/prev_test', utils.SPLIT_SIZE) utils.to_pickles(prev_train[['TARGET']], '../data/prev_label', utils.SPLIT_SIZE) """ prev_train = utils.read_pickles('../data/prev_train') prev_test = utils.read_pickles('../data/prev_test')
from alpaca import Alpaca
from utils import load_test, split_df, TimeSeriesResampler, confusion_matrix
import time
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd

if __name__ == '__main__':
    X, y = load_test()

    # Length of timeseries for resampler and cnn
    sz = 230
    # Number of channels for cnn
    num_channels = X.shape[-1]
    # Number of classes for cnn
    num_classes = np.unique(y).shape[0]
    classes = np.array(["0", "1", "2", "3", "4", "?"])

    repetitions = 1
    results = []
    outliers = np.empty((0, 230 * 3 + 5))

    for r in range(repetitions):
        print("Repetition #", r)
        X, y = shuffle(X, y, random_state=r)
        # Turn y to numpy array
        y = np.array(y)
def predict():
    model = RL(istrained=True, name='rl_noun')
    te_X, _ = load_test(size='_t')
    te_X = norm4d_per_sample(te_X)
    return model.predict(te_X)
import numpy as np
from utils import load_train, load_train_small, load_valid, load_test
from run_knn import run_knn
import matplotlib.pyplot as plt

"""
CSC 2515 - Assignment 1
Tausif Sharif

Notes:
    - Runs the run_knn.py functions from here
    - Will show and save relevant plots
"""

trainInputs, trainTargets = load_train()
smallInputs, smallTargets = load_train_small()
validInputs, validTargets = load_valid()
testInputs, testTargets = load_test()

kList = [1, 3, 5, 7, 9]
classRates = range(0, len(kList))
classRatesT = range(0, len(kList))

listCount = 0
for k in kList:
    correctCount = 0
    validLables = run_knn(k, trainInputs, trainTargets, validInputs)

    for i in xrange(len(validLables)):
        if validLables[i] == validTargets[i]:
            correctCount += 1

    classRates[listCount] = (correctCount / float(len(validLables)))
    listCount += 1
import utils as utl
import tensorflow as tf
import numpy as np

cuisine_list, ingredients_list, xs, ys = utl.load_train('vector')
ts, ids = utl.load_test(ingredients_list)

cuisine_count = len(cuisine_list)
ingredients_count = len(ingredients_list)

x = tf.placeholder(tf.float32, [None, ingredients_count])
W = tf.Variable(tf.zeros([ingredients_count, cuisine_count]))
b = tf.Variable(tf.zeros([cuisine_count]))
y = tf.nn.softmax(tf.matmul(x, W) + b)
y_ = tf.placeholder(tf.float32, [None, cuisine_count])

t = tf.placeholder(tf.float32, [None, ingredients_count])
p = tf.nn.softmax(tf.matmul(t, W) + b)

cross_entropy = -tf.reduce_sum(y_ * tf.log(y))
train_step = tf.train.GradientDescentOptimizer(0.001).minimize(cross_entropy)
# train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

init = tf.initialize_all_variables()

sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
sess.run(init)
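# The training loop itself is not part of this snippet. A minimal sketch of
# what feeding batches into train_step typically looks like in this TF1 setup
# (the batch size and slicing are illustrative assumptions, not the
# repository's real batching code):
batch_size = 100
for step in range(1000):
    start = (step * batch_size) % len(xs)
    batch_xs = xs[start:start + batch_size]
    batch_ys = ys[start:start + batch_size]
    sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})

# score the test vectors loaded by utl.load_test and take the argmax cuisine
pred = sess.run(p, feed_dict={t: ts})
predicted = np.argmax(pred, axis=1)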
                       'date_time_dow', 'date_time_hour', 'date_time_month', 'srch_ci_dow', 'srch_ci_month', \
                       'srch_co_dow', 'srch_co_month', 'booking_window', 'length_of_stay']], \
                       site_name_encoding, posa_continent_encoding, user_location_country_encoding, user_location_region_encoding, \
                       channel_encoding, srch_destination_type_id_encoding, hotel_continent_encoding, hotel_country_encoding], axis=1)

train_is_booking_features.to_csv(utils.processed_data_path +
                                 '_'.join(['train_is_booking_baseline', 'year', utils.train_year]) + '.csv',
                                 header=True, index=False)

del train_is_booking

#############################################################
####################    test dataset    ####################
#############################################################
test = utils.load_test('baseline')

print 'generate test time features...'
time_features_enricher(test)

print 'generate test one hot encoding features...'
site_name_encoding, posa_continent_encoding, user_location_country_encoding, user_location_region_encoding, \
channel_encoding, srch_destination_type_id_encoding, hotel_continent_encoding, hotel_country_encoding = \
    gen_all_top_one_hot_encoding_columns(test)

print 'fill test na features...'
fill_na_features(test)

print 'concat all test baseline features...'
test_features = pd.concat([test[['date_time', 'orig_destination_distance', \
                                 'is_mobile', 'is_package', 'srch_adults_cnt', 'srch_children_cnt', 'srch_rm_cnt', \
def predict():
    model = PlainCNN(istrained=True)
    te_X, _ = load_test(size='_t')
    te_X = norm4d_per_sample(te_X)
    return model.predict(te_X)
    'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE',
    'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE',
    'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE',
    # 'EMERGENCYSTATE_MODE'
]

train = utils.load_train(categorical_features)
test = utils.load_test(categorical_features)

le = LabelEncoder()
for c in categorical_features:
    train[c].fillna('na dayo', inplace=True)
    test[c].fillna('na dayo', inplace=True)
    le.fit(train[c].append(test[c]))
    train[c] = le.transform(train[c])
    test[c] = le.transform(test[c])

utils.to_feature(train.add_prefix(PREF), '../feature/train')
utils.to_feature(test.add_prefix(PREF), '../feature/test')

#==============================================================================
utils.end(__file__)
import pandas as pd
import os
import utils

utils.start(__file__)
#==============================================================================
PREF = 'f110_'

KEY = 'SK_ID_CURR'

os.system(f'rm ../feature/t*_{PREF}*')

# =============================================================================
# load
# =============================================================================
train = utils.load_train(['SK_ID_CURR']).set_index('SK_ID_CURR')
test = utils.load_test(['SK_ID_CURR']).set_index('SK_ID_CURR')

prev = utils.read_pickles('../data/previous_application', ['SK_ID_CURR', 'SK_ID_PREV'])

# =============================================================================
# prev
# =============================================================================
gr = prev.groupby('SK_ID_CURR')

train['SK_ID_PREV_min'] = gr.SK_ID_PREV.min()
train['SK_ID_PREV_mean'] = gr.SK_ID_PREV.mean()
train['SK_ID_PREV_max'] = gr.SK_ID_PREV.max()
train['SK_ID_PREV_median'] = gr.SK_ID_PREV.median()
train['SK_ID_PREV_std'] = gr.SK_ID_PREV.std()
train['SK_ID_PREV_std-d-mean'] = train['SK_ID_PREV_std'] / train['SK_ID_PREV_mean']
def _conditional_change(dictionary, target, value):
    dictionary[target] = test.get(value, dictionary[target])

def _conditional_remove(dictionary, key):
    if key in test:
        if not test[key]:
            dictionary.pop(key)

_conditional_change(settings["input"], "shape", "shape")
_conditional_change(settings["input"], "type", "input_type")
_conditional_change(settings["normalize"], "means", "means")
_conditional_change(settings["normalize"], "stddevs", "stddevs")
_conditional_change(settings["return"]["result"], "operations", "operations")
_conditional_change(settings["return"]["result"], "arguments", "arguments")
_conditional_change(settings["return"]["result"], "type", "return_type")
_conditional_change(settings["return"]["result"], "item", "item")

# Remove if no output or result should be returned
_conditional_remove(settings, "normalize")
_conditional_remove(settings["return"], "output")
_conditional_remove(settings["return"], "result")

if __name__ == "__main__":
    args = utils.parse_args()
    test = utils.load_test(args)
    settings = load_settings()
    imput_arguments(settings, test)
    save(settings)
""" Created on Feb 27 2017 Author: Weiping Song """ import sys import numpy as np import argparse import tensorflow as tf from model import GRU4Rec from utils import load_test unfold_max = 20 cut_off = 20 test_x, test_y, n_items = load_test(unfold_max) class Args(): is_training = False layers = 1 rnn_size = 100 n_epochs = 10 batch_size = 50 keep_prob = 1 learning_rate = 0.002 decay = 0.98 decay_steps = 1e3 * 5 sigma = 0.0005 init_as_normal = False grad_cap = 0
# Leo Woiceshyn, Student Number 998082159, for CSC2515 Assignment 1
import numpy as np
import utils as ut
import matplotlib.pyplot as plt
from run_knn import run_knn

# Load data
train_data, train_labels = ut.load_train()
valid_data, valid_labels = ut.load_valid()
test_data, test_labels = ut.load_test()

# Create empty arrays for accuracy values
validation_accuracies = []
test_accuracies = []

# List for k
k_values = [1, 3, 5, 7, 9]

# Validation Set
for k in k_values:
    correct_predictions = 0
    total_predictions = 0
    predicted_valid_labels = run_knn(k, train_data, train_labels, valid_data)
    # Iterate through the predicted labels and compare them to the true labels
    # to determine validation accuracy
    for index, value in enumerate(predicted_valid_labels):
        if predicted_valid_labels[index] == valid_labels[index]:
            correct_predictions += 1
        total_predictions += 1
if FLAGS.is_train:
    train_data, valid_data = load_data(FLAGS.train_data)
    model = Model(FLAGS)
    model.print_parameters()

    if tf.train.get_checkpoint_state(FLAGS.train_dir):
        model.saver.restore(sess, tf.train.latest_checkpoint(FLAGS.train_dir))
    else:
        sess.run(tf.global_variables_initializer())

    epoch = 0
    pre_loss = 1000000.0
    while epoch < FLAGS.epoch:
        # train data
        start_time = time.time()
        train_acc, train_loss = train(sess, model, train_data, FLAGS.batch_size, trainable=True)
        epoch_time = time.time() - start_time
        lr = model.learning_rate.eval()
        print("epoch %d time: %.4f seconds, learning_rate: %.6f\n train loss: %.6f, train accuracy: %.6f"
              % (epoch, epoch_time, lr, train_loss, train_acc))

        valid_acc, valid_loss = train(sess, model, valid_data, FLAGS.batch_size, trainable=False)
        print("valid loss: %.6f, valid accuracy: %.6f" % (valid_loss, valid_acc))

        if valid_loss < pre_loss:
            pre_loss = valid_loss
            model.saver.save(sess, '%s/ckp' % FLAGS.train_dir, global_step=epoch)
            sess.run(model.learning_rate_decay_op)
        epoch += 1
else:
    model = Model(FLAGS)
    model.saver.restore(sess, tf.train.latest_checkpoint(FLAGS.train_dir))
    test_data = load_test(FLAGS.test_data)
    get_test_label(sess, model, test_data)
    # 'CODE_GENDER',
    # 'FLAG_OWN_CAR',
    # 'FLAG_OWN_REALTY',
    'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE',
    'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE',
    'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE',
    # 'EMERGENCYSTATE_MODE'
]

# =============================================================================
#
# =============================================================================
train = utils.load_train().drop(['SK_ID_CURR', 'TARGET'] + categorical_features, axis=1)
test = utils.load_test().drop(['SK_ID_CURR'] + categorical_features, axis=1)

utils.to_feature(train.add_prefix(PREF), '../feature/train')
utils.to_feature(test.add_prefix(PREF), '../feature/test')

#==============================================================================
utils.end(__file__)
import os
import pandas as pd
import utils

utils.start(__file__)
#==============================================================================
PREF = 'f705_'

KEY = 'SK_ID_CURR'

os.system(f'rm ../feature/t*_{PREF}*')

# =============================================================================
# load
# =============================================================================
train = utils.load_train([KEY]).set_index(KEY)
test = utils.load_test([KEY]).set_index(KEY)

prev_train = pd.read_feather('../data/prev_train_imputation_f705.f')
prev_test = pd.read_feather('../data/prev_test_imputation_f705.f')

# =============================================================================
# feature
# =============================================================================
# train
gr = prev_train.groupby(KEY)
train['prev_y_min'] = gr.y_pred.min()
train['prev_y_mean'] = gr.y_pred.mean()
train['prev_y_max'] = gr.y_pred.max()
train['prev_y_var'] = gr.y_pred.var()
train['prev_y_median'] = gr.y_pred.median()
train['prev_y_q25'] = gr.y_pred.quantile(.25)
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
import utils as utl

cuisine_list, ingredients_list, x, y = utl.load_train('number')
classifier = OneVsRestClassifier(LinearSVC(random_state=0)).fit(x, y)

p = classifier.predict(x)
precision = 0
for i in range(len(y)):
    if y[i] == p[i]:
        precision += 1
accuracy = (1.0 * precision) / len(y)
print('Training Set Accuracy:', accuracy)

t, ids = utl.load_test(ingredients_list)
p = classifier.predict(t)
utl.save_result('sk_svm', cuisine_list, p, ids, 'number')
import numpy as np
import pandas as pd
import sys
import os
from sklearn.externals import joblib

scriptpath = os.path.dirname(os.path.realpath(sys.argv[0])) + '/../'
sys.path.append(os.path.abspath(scriptpath))
import utils

test = utils.load_test('group_by')

def predict_group_by_model(group_by_field):
    """
    Use group by model to predict the top 5 hotel clusters for the test data
    :param group_by_field: group by field to get the related group by model
    :return: the dataframe with the submission format according to the model
    """
    group_by_model = joblib.load(utils.model_path +
                                 '_'.join(['top', str(utils.k), 'cw', str(utils.click_weight),
                                           'group', group_by_field, 'year', utils.train_year]) + '.pkl')
    merged_df = test.merge(group_by_model, how='left', left_on=group_by_field, right_index=True)
    merged_df.reset_index(inplace=True)
    result = merged_df[['index', 'hotel_cluster']]
    result.columns = ['id', 'hotel_cluster']
    return result

print 'predict with orig_destination_distance model...'
result = predict_group_by_model('orig_destination_distance')

print 'predict with srch_destination_id model...'
def svm(training_file, development_file, test_file, counts):
    twords, tlabels_true = hs.load_file(training_file)
    dwords, dlabels_true = hs.load_file(development_file)
    test_words = utils.load_test(test_file)

    ## Length
    tlength_feature = hs.length_feature(twords)
    tlength_normalized, tl_mean, tl_std = utils.normalize(tlength_feature)
    dlength_feature = hs.length_feature(dwords)
    dlength_normalized = utils.normalize_with_params(dlength_feature, tl_mean, tl_std)

    ## Frequency
    tfrequency_feature = hs.frequency_feature(twords, counts)
    tfrequency_normalized, tf_mean, tf_std = utils.normalize(tfrequency_feature)
    dfrequency_feature = hs.frequency_feature(dwords, counts)
    dfrequency_normalized = utils.normalize_with_params(dfrequency_feature, tf_mean, tf_std)

    ## Syllables
    tsyllables_feature = features.syllables_feature(twords)
    tsyllables_normalized, tsy_mean, tsy_std = utils.normalize(tsyllables_feature)
    dsyllables_feature = features.syllables_feature(dwords)
    dsyllables_normalized = utils.normalize_with_params(dsyllables_feature, tsy_mean, tsy_std)

    ## Vowels
    tvowels_feature = features.vowels_feature(twords)
    tvowels_normalized, tv_mean, tv_std = utils.normalize(tvowels_feature)
    dvowels_feature = features.vowels_feature(dwords)
    dvowels_normalized = utils.normalize_with_params(dvowels_feature, tv_mean, tv_std)

    ## Consonants
    tconsonant_feature = features.consonants_feature(twords)
    tconsonant_normalized, tc_mean, tc_std = utils.normalize(tconsonant_feature)
    dconsonant_feature = features.consonants_feature(dwords)
    dconsonant_normalized = utils.normalize_with_params(dconsonant_feature, tc_mean, tc_std)

    ## Senses
    tsenses_feature = features.senses_feature(twords)
    tsenses_normalized, tse_mean, tse_std = utils.normalize(tsenses_feature)
    dsenses_feature = features.senses_feature(dwords)
    dsenses_normalized = utils.normalize_with_params(dsenses_feature, tse_mean, tse_std)

    ## Hypernyms
    thypernyms_feature = features.hypernyms_feature(twords)
    thypernyms_normalized, th_mean, th_std = utils.normalize(thypernyms_feature)
    dhypernyms_feature = features.hypernyms_feature(dwords)
    dhypernyms_normalized = utils.normalize_with_params(dhypernyms_feature, th_mean, th_std)

    x_train = np.column_stack((tlength_normalized, tfrequency_normalized,
                               tsyllables_normalized, tsenses_normalized))
    y = tlabels_true
    x_dev = np.column_stack((dlength_normalized, dfrequency_normalized,
                             dsyllables_normalized, dsenses_normalized))

    clf = SVC(C=48, cache_size=200, class_weight=None, coef0=0.0,
              decision_function_shape='ovr', degree=3, gamma='scale',
              kernel='rbf', max_iter=-1, probability=False, random_state=None,
              shrinking=True, tol=0.001, verbose=False)
    clf.fit(x_train, y)
    y_pred = clf.predict(x_dev)

    daccuracy = hs.get_accuracy(y_pred, dlabels_true)
    dprecision = hs.get_precision(y_pred, dlabels_true)
    drecall = hs.get_recall(y_pred, dlabels_true)
    dfscore = hs.get_fscore(y_pred, dlabels_true)

    # Test Set
    # test_length_feature = hs.length_feature(test_words)
    # test_frequency_feature = hs.frequency_feature(test_words, counts)
    # test_syllables_feature = features.syllables_feature(test_words)
    # test_senses_feature = features.senses_feature(test_words)
    #
    # test_length_normalized = utils.normalize_with_params(test_length_feature, tl_mean, tl_std)
    # test_frequency_normalized = utils.normalize_with_params(test_frequency_feature, tf_mean, tf_std)
    # test_syllables_normalized = utils.normalize_with_params(test_syllables_feature, tsy_mean, tsy_std)
    # test_senses_normalized = utils.normalize_with_params(test_senses_feature, tse_mean, tse_std)
    #
    # x_test = np.column_stack((test_length_normalized, test_frequency_normalized,
    #                           test_syllables_normalized, test_senses_normalized))
    # y_pred_test = clf.predict(x_test)
    #
    # f = open('test_labels.txt', 'w')
    # for item in y_pred_test:
    #     print(item, file=f)
    # f.close()

    # training_performance = (tprecision, trecall, tfscore)
    development_performance = (daccuracy, dprecision, drecall, dfscore)
    return development_performance
def random_forest(training_file, development_file, test_file, counts):
    twords, tlabels_true = hs.load_file(training_file)
    dwords, dlabels_true = hs.load_file(development_file)
    test_words = utils.load_test(test_file)

    ## Length
    tlength_feature = hs.length_feature(twords)
    tlength_normalized, tl_mean, tl_std = utils.normalize(tlength_feature)
    dlength_feature = hs.length_feature(dwords)
    dlength_normalized = utils.normalize_with_params(dlength_feature, tl_mean, tl_std)

    ## Frequency
    tfrequency_feature = hs.frequency_feature(twords, counts)
    tfrequency_normalized, tf_mean, tf_std = utils.normalize(tfrequency_feature)
    dfrequency_feature = hs.frequency_feature(dwords, counts)
    dfrequency_normalized = utils.normalize_with_params(dfrequency_feature, tf_mean, tf_std)

    ## Syllables
    tsyllables_feature = features.syllables_feature(twords)
    tsyllables_normalized, tsy_mean, tsy_std = utils.normalize(tsyllables_feature)
    dsyllables_feature = features.syllables_feature(dwords)
    dsyllables_normalized = utils.normalize_with_params(dsyllables_feature, tsy_mean, tsy_std)

    ## Vowels
    tvowels_feature = features.vowels_feature(twords)
    tvowels_normalized, tv_mean, tv_std = utils.normalize(tvowels_feature)
    dvowels_feature = features.vowels_feature(dwords)
    dvowels_normalized = utils.normalize_with_params(dvowels_feature, tv_mean, tv_std)

    ## Consonants
    tconsonant_feature = features.consonants_feature(twords)
    tconsonant_normalized, tc_mean, tc_std = utils.normalize(tconsonant_feature)
    dconsonant_feature = features.consonants_feature(dwords)
    dconsonant_normalized = utils.normalize_with_params(dconsonant_feature, tc_mean, tc_std)

    ## Senses
    tsenses_feature = features.senses_feature(twords)
    tsenses_normalized, tse_mean, tse_std = utils.normalize(tsenses_feature)
    dsenses_feature = features.senses_feature(dwords)
    dsenses_normalized = utils.normalize_with_params(dsenses_feature, tse_mean, tse_std)

    ## Hypernyms
    thypernyms_feature = features.hypernyms_feature(twords)
    thypernyms_normalized, th_mean, th_std = utils.normalize(thypernyms_feature)
    dhypernyms_feature = features.hypernyms_feature(dwords)
    dhypernyms_normalized = utils.normalize_with_params(dhypernyms_feature, th_mean, th_std)

    x_train = np.column_stack(
        (tlength_normalized, tfrequency_normalized, tsyllables_normalized,
         tsenses_normalized, thypernyms_normalized))
    y = tlabels_true
    x_dev = np.column_stack(
        (dlength_normalized, dfrequency_normalized, dsyllables_normalized,
         dsenses_normalized, dhypernyms_normalized))

    clf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                                 max_depth=7, max_features=3, max_leaf_nodes=None,
                                 min_impurity_decrease=0.0, min_impurity_split=None,
                                 min_samples_leaf=8, min_samples_split=50,
                                 min_weight_fraction_leaf=0.0, n_estimators=70,
                                 n_jobs=None, oob_score=False, random_state=0,
                                 verbose=0, warm_start=False)
    clf.fit(x_train, y)
    y_pred = clf.predict(x_dev)

    daccuracy = hs.get_accuracy(y_pred, dlabels_true)
    dprecision = hs.get_precision(y_pred, dlabels_true)
    drecall = hs.get_recall(y_pred, dlabels_true)
    dfscore = hs.get_fscore(y_pred, dlabels_true)

    # Test Set
    test_length_feature = hs.length_feature(test_words)
    test_frequency_feature = hs.frequency_feature(test_words, counts)
    test_syllables_feature = features.syllables_feature(test_words)
    test_vowels_feature = features.vowels_feature(test_words)
    test_consonants_feature = features.consonants_feature(test_words)
    test_senses_feature = features.senses_feature(test_words)
    test_hypernyms_feature = features.hypernyms_feature(test_words)

    test_length_normalized = utils.normalize_with_params(
        test_length_feature, tl_mean, tl_std)
    test_frequency_normalized = utils.normalize_with_params(
        test_frequency_feature, tf_mean, tf_std)
    test_syllables_normalized = utils.normalize_with_params(
        test_syllables_feature, tsy_mean, tsy_std)
    test_vowels_normalized = utils.normalize_with_params(
        test_vowels_feature, tv_mean, tv_std)
    test_consonants_normalized = utils.normalize_with_params(
        test_consonants_feature, tc_mean, tc_std)
    test_senses_normalized = utils.normalize_with_params(
        test_senses_feature, tse_mean, tse_std)
    test_hypernyms_normalized = utils.normalize_with_params(
        test_hypernyms_feature, th_mean, th_std)

    x_test = np.column_stack(
        (test_length_normalized, test_frequency_normalized,
         test_syllables_normalized, test_senses_normalized,
         test_hypernyms_normalized))
    y_pred_test = clf.predict(x_test)

    f = open('test_labels.txt', 'w')
    for item in y_pred_test:
        print(item, file=f)
    f.close()

    # training_performance = (tprecision, trecall, tfscore)
    development_performance = (daccuracy, dprecision, drecall, dfscore)
    return development_performance
def evaluate_nonepisode(data, config, model, loss_fn, eval):
    x_te, y_te, te_len, te_mask, text_te = utils.load_test(data, eval)
    x_te, y_te, te_len, te_mask, text_te = utils.shuffle_data(
        x_te, y_te, te_len, te_mask, text_te)
    y_te_ind = utils.create_index(y_te)
    reverse_dict = data['reverse_dict']

    num_class = np.unique(y_te)
    num_test_query = config['num_query_per_class'] * num_class.shape[0]

    x_support, y_support, x_len_support, support_m, support_text = utils.load_support(data, False)
    y_support_ind = utils.create_index(y_support)

    test_batch = int(math.ceil(x_te.shape[0] / float(num_test_query)))
    total_prediction = np.array([], dtype=np.int64)
    total_y_test = np.array([], dtype=np.int64)
    cum_loss = 0.0
    kl_loss = torch.nn.KLDivLoss(reduction='batchmean').to(config['device'])

    with torch.no_grad():
        for batch in range(test_batch):
            support_feature, support_class, support_len, support_ind, support_mask = utils.init_support_query(
                config['num_samples_per_class'], x_te.shape[1], num_class.shape[0])
            query_feature, query_class, query_len, query_ind, query_mask = utils.init_support_query(
                config['num_query_per_class'], x_te.shape[1], num_class.shape[0])

            begin_index = batch * (num_test_query)
            end_index = min((batch + 1) * num_test_query, x_te.shape[0])
            query_feature = x_te[begin_index:end_index]
            query_len = te_len[begin_index:end_index]
            query_class = y_te[begin_index:end_index]
            query_mask = te_mask[begin_index:end_index]
            query_text = text_te[begin_index:end_index]

            support_idx = 0
            num_class = np.unique(y_support)
            for counter in range(num_class.shape[0]):
                class_index = np.where(y_support == num_class[counter])[0]
                old_support_idx = support_idx
                support_idx = support_idx + config['num_samples_per_class']
                support_feature[old_support_idx:support_idx] = x_support[class_index]
                support_class[old_support_idx:support_idx] = y_support[class_index]
                support_len[old_support_idx:support_idx] = x_len_support[class_index]
                support_mask[old_support_idx:support_idx] = support_m[class_index]
                support_text[old_support_idx:support_idx] = support_text[class_index]

            cs = np.unique(query_class)
            # Obtain indexes
            q_ind_key = {}
            s_ind_key = {}
            for i in range(len(cs)):
                q_index = np.where(query_class == cs[i])[0]
                s_index = np.where(support_class == cs[i])[0]
                q_ind_key[cs[i]] = q_index
                s_ind_key[cs[i]] = s_index
            # Change values
            for i in range(len(cs)):
                query_class[q_ind_key[cs[i]]] = i
                support_class[s_ind_key[cs[i]]] = i

            support_ind = utils.create_index(support_class)
            query_ind = utils.create_index(query_class)

            support_feature, support_id, support_ind, support_len, support_mask = convert_to_tensor(
                support_feature, support_class, support_ind, support_len, support_mask, config['device'])
            query_feature, query_id, query_ind, query_len, query_mask = convert_to_tensor(
                query_feature, query_class, query_ind, query_len, query_mask, config['device'])

            prediction, _, support_attn, query_attn, _, _ = model.forward(
                support_feature, support_len, support_mask,
                query_feature, query_len, query_mask)

            pred = np.argmax(prediction.cpu().detach().numpy(), 1)
            total_prediction = np.concatenate((total_prediction, pred))
            total_y_test = np.concatenate((total_y_test, query_class))

    acc = accuracy_score(total_y_test, total_prediction)
    cnf = confusion_matrix(total_y_test, total_prediction)
    print("Confusion matrix:")
    print(cnf)
    return acc
    X_fold = np.hstack((X_fold, X_pred))

    all_X.append(X_fold)
    all_y.append(y_fold)
    all_w.append(w_fold)

X = np.vstack(all_X)
y = np.concatenate(all_y)
w = np.concatenate(all_w)

clf = Classifier(**params)
w = rescale(w)
w = rebalance(y, w)

try:
    clf.fit(X, y, sample_weight=w)
except:
    clf.fit(X, y)

# And make a submission
print "Making submission..."
X_test, _, _, _ = load_test()
X_pred = load_predictions("stack/*-test.npy")
X_test = np.hstack((X_test, X_pred))
make_submission(clf, threshold, "output-stacking.csv", X_test=X_test)

import IPython; IPython.embed()
# main
# =============================================================================
if __name__ == "__main__":
    utils.start(__file__)

    # train
    tr = utils.load_train(['object_id'])
    df = pd.read_pickle('../FROM_MYTEAM/LCfit_feature_allSN_i_train_v3_20181215.pkl.gz')
    df = pd.merge(tr, df, on='object_id', how='left')
    df.reset_index(drop=True, inplace=True)
    get_feature(df)

    del df['object_id']
    df.add_prefix(PREF + '_').to_pickle(f'../data/train_{PREF}.pkl')

    # test
    te = utils.load_test(['object_id'])
    df = pd.read_pickle('../FROM_MYTEAM/LCfit_feature_allSN_i_test_v3_20181215.pkl.gz')
    df = pd.merge(te, df, on='object_id', how='left')
    df.reset_index(drop=True, inplace=True)
    get_feature(df)

    del df['object_id']
    df = df.add_prefix(PREF + '_')
    utils.to_pkl_gzip(df, f'../data/test_{PREF}.pkl')

    utils.end(__file__)
import lasagne
import numpy as np
import utils
from lasagne.layers import *
from nolearn.lasagne import NeuralNet
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import cross_validation
from nolearn.lasagne import TrainSplit
from nolearn.lasagne import objective
from lasagne.nonlinearities import softmax
from lasagne.updates import momentum

X, y = utils.load_train()
X_test = utils.load_test()

X = X.reshape([X.shape[0], 3, 32, 32])
y = np.array(y, dtype="int32")
X_test = X_test.reshape([X_test.shape[0], 3, 32, 32])

layers = [
    # layer dealing with the input data
    (InputLayer, {'shape': (None, 3, 32, 32)}),

    # first stage of our convolutional layers
    # second stage of our convolutional layers
    (Conv2DLayer, {'pad': 2, 'num_filters': 32, 'filter_size': 5, 'W': lasagne.init.Normal(std=0.01)}),
    (ParametricRectifierLayer, {'alpha': lasagne.init.Constant(0)}),
    (Pool2DLayer, {'pool_size': 2, 'stride': 2, 'mode': 'max'}),
'''
X = T.tensor4('X')
Y = T.ivector('y')

# set up theano functions to generate output by feeding data through network,
# any test outputs should be deterministic
output_layer = ResNet_FullPre(X, n=5)
output_test = lasagne.layers.get_output(output_layer, deterministic=True)

# set up training and prediction functions
predict_proba = theano.function(inputs=[X], outputs=output_test)

'''
Load data and make predictions
'''
# load data
X_test, X_test_id = load_test(cache=True)

nn_count = 1
for ensb in range(19):
    # load network weights
    f = gzip.open('data/weights/resnet32_fullpre_' + str(nn_count) + '.pklz', 'rb')
    all_params = pickle.load(f)
    f.close()
    helper.set_all_param_values(output_layer, all_params)

    '''
    # make regular predictions
    predictions = []
    for j in range((X_test.shape[0] + BATCHSIZE - 1) // BATCHSIZE):
        sl = slice(j * BATCHSIZE, (j + 1) * BATCHSIZE)
        X_batch = X_test[sl]
result = utils.fill_all_top_5(train_is_booking, result, 'hotel_market', 'train')

print 'hotel clusters to ranking features...'
new_result = result.apply(lambda row: hotel_clusters_to_ranking_features(row), axis=1)
new_result.columns = ['_'.join(['hotel_cluster', str(hotel_cluster_id), 'rank'])
                      for hotel_cluster_id in range(100)]
new_result = pd.concat([train_is_booking['date_time'], new_result], axis=1)
new_result.to_csv(utils.processed_data_path +
                  '_'.join(['train_is_booking_group_by', 'top', str(utils.k), 'cw',
                            str(utils.click_weight), 'year', utils.train_year]) + '.csv',
                  header=True, index=False)

del train_is_booking

#############################################################
####################    test dataset    ####################
#############################################################
test = utils.load_test('group_by')

print 'generate top k hotel clusters with orig_destination_distance model...'
result = gen_top_k_hotel_cluster(test, 'orig_destination_distance')

print 'generate top k hotel clusters with srch_destination_id model...'
result = utils.fill_all_top_5(test, result, 'srch_destination_id')

print 'generate top k hotel clusters with user_id model...'
result = utils.fill_all_top_5(test, result, 'user_id')

print 'generate top k hotel clusters with hotel_market model...'
result = utils.fill_all_top_5(test, result, 'hotel_market')

print 'hotel clusters to ranking features...'
new_result = result.apply(lambda row: hotel_clusters_to_ranking_features(row), axis=1)
new_result.columns = ['_'.join(['hotel_cluster', str(hotel_cluster_id), 'rank'])
                      for hotel_cluster_id in range(100)]
new_result.to_csv(utils.processed_data_path +
                  '_'.join(['test_groupb_by', 'top', str(utils.k), 'cw',
                            str(utils.click_weight), 'year', utils.train_year]) +
from keras import metrics
from keras.models import load_model
import tensorflow as tf
import numpy as np
from utils import load_test, PSNR
import scipy.misc
import argparse

parser = argparse.ArgumentParser(description='Test function')
parser.add_argument('--test', metavar='test', type=str, help='test directory')
parser.add_argument('--network', metavar='network', type=str, help='network weight')
args = parser.parse_args()

val_in, val_out = load_test(directory=args.test)

model = load_model(args.network)
prediction = model.predict(val_in, batch_size=1, verbose=1)

Result = PSNR(val_out, prediction)
sess = tf.Session()
RR = sess.run(Result)
print(RR)

for img_count in range(prediction.shape[0]):
    img_in = val_in[img_count, :, :, :]
    img_out = prediction[img_count, :, :, :]
    img_gt = val_out[img_count, :, :, :]
    scipy.misc.imsave('./Result/LR' + '{0:03d}'.format(img_count) + '.png',