def __init__(self):
    self.img_path = './data/images'
    self.anno_path = './data/annotations'
    self.ft_path = './feature_maps/'
    self.model_path = './checkpoint/'
    self.model_name = 'segmentation.ckpt-285'
    self.model = os.path.join(self.model_path, self.model_name)

    # Parameters
    self.depth = 7
    self.classes = 1
    self.img_size = 32

    # Placeholders
    self.x = tf.placeholder(tf.float32, shape=[None, None, None, self.depth], name='input')
    self.y_true = tf.placeholder(tf.float32, shape=[None, None, None, self.classes], name='y_true')
    self.rate = tf.placeholder(tf.float32, name='dropout_rate')
    self.is_training = tf.placeholder(tf.bool, shape=())

    # Build network
    self.y01 = cvmodel.build_model(input=self.x, drop_rate=0, is_training=False)

    # Calculate loss + f1
    self.cost_reg, self.f1_vec, self.recall, \
        self.precision, self.specificity, self.accuracy = utils.loss(
            logits=[self.y01], labels=self.y_true, classes_weights=[2.])

    # Open session and restore model
    self.sess = tf.Session()
    self.sess.run(tf.global_variables_initializer())
    self.saver = tf.train.Saver()
    self.saver.restore(self.sess, self.model)

    # Load data
    self.img_names = utils.load_train(path=self.img_path)
    self.anno_names = utils.load_train(path=self.anno_path)
    self.imgs_ = utils.get_image_array(self.img_names, self.img_size)
    self.annos_ = utils.get_annotation_array(self.anno_names, self.img_size)
    n = self.imgs_.shape[0]
    print('\nNumber of images:', n)

    # Get number of trainable variables
    v_nb = np.sum([np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()])
    print('Number of trainable variables:', v_nb)
def knn():
    train_data, train_labels = load_train()
    # for validation
    valid_data, valid_labels = load_valid()
    # for test
    # valid_data, valid_labels = load_test()
    values = [1, 3, 5, 7, 9]
    ratio = []
    for k in values:
        c = 0
        prediction_labels = run_knn(k, train_data, train_labels, valid_data)
        for i in range(len(valid_labels)):
            if valid_labels[i] == prediction_labels[i]:
                c += 1
        ratio.append(float(c) / len(prediction_labels))
    plt.plot(values, ratio)
    # for validation
    plt.axis([1, 9, 0.81, 0.87])
    # for test
    # plt.axis([1, 9, 0.87, 0.95])
    plt.show()
def train():
    tr_X, tr_y = load_train(size='_t')
    tr_X = norm4d_per_sample(tr_X)
    te_X, te_y = load_test(size='_t')
    te_X = norm4d_per_sample(te_X)
    model = PlainCNN(istrained=False, args=(0.01, 0.1, 0.9))
    model.train(tr_X, tr_y, te_X, te_y)
def load_train(is_gabor):
    tr_identity, tr_labels, tr_images = utils.load_train()
    pc_tr_identity = reshape_labels(tr_identity)
    pc_tr_labels = reshape_labels(tr_labels)
    pc_tr_images = reshape_images(tr_images, is_gabor)
    return pc_tr_identity, pc_tr_labels, pc_tr_images
def run_logistic_regression():
    train_inputs, train_targets = load_train()
    valid_inputs, valid_targets = load_valid()

    # TODO: initialize parameters
    parameters = {
        'learning_rate': 0.01,
        'weight_regularization': 0,
        'num_iterations': 10
    }

    # logistic regression weights
    dimension = 28 * 28
    z = np.ones([dimension + 1, 1], int)
    z = z / 100.0
    #weight = np.matrix('1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1')
    for i in range(0, 28 * 28):
        if i % 2 == 1:
            z[i] = 0
    weights = z
    #weights = 1,1,2,1

    # Verify that your logistic function produces the right gradient.
    # diff should be very close to 0.
    #run_check_grad(parameters)

    # Begin learning with gradient descent
    for t in range(parameters['num_iterations']):
        # TODO: you will need to modify this loop to create plots, etc.

        # find the negative log likelihood and derivatives w.r.t. weights
        f, df, frac_correct_train = logistic(weights, train_inputs, train_targets, parameters)
        _, _, frac_correct_valid = logistic(weights, valid_inputs, valid_targets, parameters)

        if np.isnan(f) or np.isinf(f):
            raise ValueError("nan/inf error")

        # update parameters
        for i in range(weights.shape[0]):
            weights[i] = weights[i] + parameters['learning_rate'] * (df[i] - 0.001 * weights[i])

        # print some stats
        print("ITERATION:{:4d} LOGL:{:4.2f} "
              "TRAIN FRAC:{:2.2f} VALID FRAC:{:2.2f}".format(
                  t + 1, f, frac_correct_train * 100, frac_correct_valid * 100))
def train():
    tr_X, tr_y = load_train(size='_t')
    tr_X = norm4d_per_sample(tr_X)
    tr_y = one_hot(tr_y, 2)
    te_X, te_y = load_test(size='_t')
    te_y = one_hot(te_y, 2)
    te_X = norm4d_per_sample(te_X)
    model = DeepCNN('vgg')
    model.train(tr_X, tr_y, te_X, te_y)
def train():
    tr_X, tr_y = load_train(size='_t')
    tr_X = norm4d_per_sample(tr_X)
    tr_y = one_hot(tr_y, 2)
    te_X, te_y = load_test(size='_t')
    te_y = one_hot(te_y, 2)
    te_X = norm4d_per_sample(te_X)
    model = RL(istrained=False, name='rl_noun')
    model.train(tr_X, tr_y, te_X, te_y)
def main(args):
    srcnn = SRCNN(
        image_size=args.image_size,
        c_dim=args.c_dim,
        is_training=True,
        learning_rate=args.learning_rate,
        batch_size=args.batch_size,
        epochs=args.epochs)
    X_train, Y_train = load_train(image_size=args.image_size,
                                  stride=args.stride,
                                  scale=args.scale)
    srcnn.train(X_train, Y_train)
def run_logistic_regression(hyperparameters):
    # TODO: specify training data
    train_inputs, train_targets = load_train()
    valid_inputs, valid_targets = load_valid()

    # N is number of examples; M is the number of features per example.
    N, M = train_inputs.shape

    # Logistic regression weights
    # TODO: Initialize to random weights here.
    #weights = np.random.normal(0, 0.2, (train_inputs.shape[1] + 1, 1))
    weights = np.zeros(785).reshape((785, 1))

    # Verify that your logistic function produces the right gradient.
    # diff should be very close to 0.
    run_check_grad(hyperparameters)

    # Begin learning with gradient descent
    logging = np.zeros((hyperparameters['num_iterations'], 5))
    for t in range(hyperparameters['num_iterations']):

        # Find the negative log likelihood and its derivatives w.r.t. the weights.
        f, df, predictions = logistic(weights, train_inputs, train_targets, hyperparameters)

        # Evaluate the prediction.
        cross_entropy_train, frac_correct_train = evaluate(train_targets, predictions)

        if np.isnan(f) or np.isinf(f):
            raise ValueError("nan/inf error")

        # update parameters
        weights = weights - hyperparameters['learning_rate'] * df / N

        # Make a prediction on the valid_inputs.
        predictions_valid = logistic_predict(weights, valid_inputs)

        # Evaluate the prediction.
        cross_entropy_valid, frac_correct_valid = evaluate(valid_targets, predictions_valid)

        # print some stats
        print("ITERATION:{:4d} TRAIN NLOGL:{:4.2f} TRAIN CE:{:.6f} "
              "TRAIN FRAC:{:2.2f} VALID CE:{:.6f} VALID FRAC:{:2.2f}".format(
                  t + 1, f / N, cross_entropy_train, frac_correct_train * 100,
                  cross_entropy_valid, frac_correct_valid * 100))

        logging[t] = [f / N, cross_entropy_train, frac_correct_train * 100,
                      cross_entropy_valid, frac_correct_valid * 100]
    return logging
def kfold(classification_algorithm, k):
    res = {"accuracy": 0, "precision": 0, "recall": 0, "f1": 0}
    for i in range(1, k + 1):
        validation = utils.load_train(i)
        validation = validation["plus"] + validation["minus"]
        train = {"plus": [], "minus": []}
        for j in range(1, k + 1):
            if j != i:
                extension = utils.load_train(j)
                train["plus"].extend(extension["plus"])
                train["minus"].extend(extension["minus"])
        classification = classification_algorithm(train, validation)
        res["accuracy"] += utils.accuracy(classification)
        res["precision"] += utils.precision(classification)
        res["recall"] += utils.recall(classification)
        res["f1"] += utils.F1_score(classification)
    # average each metric over the k folds
    for metric in res:
        res[metric] /= k
    print(res)
    return res
def main(): x_train, y_train = load_train('train.csv') x_train = x_train[:20, :] x_train = x_train.reshape(x_train.shape[0], 1, 48, 48, 1) emotion_classifier = load_model('./model_cnn.h5') input_img = emotion_classifier.input for idx in tqdm(range(20)): val_proba = emotion_classifier.predict(x_train[idx]) pred = val_proba.argmax(axis=-1) target = K.mean(emotion_classifier.output[:, pred]) grads = K.gradients(target, input_img)[0] fn = K.function([input_img, K.learning_phase()], [grads]) grads_value = fn([x_train[idx], 0]) heatmap = np.array(grads_value).reshape(48, 48) s = np.sort(heatmap, axis = None) clip_rate = 0.1 clip_size = int(len(s) * clip_rate) heatmap = np.clip(heatmap, s[clip_size], s[len(s) - clip_size]) heatmap = abs(heatmap - np.mean(heatmap)) heatmap = (heatmap - np.mean(heatmap))/np.std(heatmap) heatmap = (heatmap - heatmap.min())/ heatmap.ptp() thres = 0.5 origin = x_train[idx].reshape(48, 48)*255 see = x_train[idx].reshape(48, 48) see[np.where(heatmap <= thres)] = np.mean(see) see *= 255 plt.figure() plt.imshow(heatmap, cmap=plt.cm.jet) plt.colorbar() plt.tight_layout() fig = plt.gcf() plt.draw() fig.savefig(os.path.join(cmap_dir, '{}.png'.format(idx)), dpi=100) plt.figure() plt.imshow(see, cmap='gray') plt.colorbar() plt.tight_layout() fig = plt.gcf() plt.draw() fig.savefig(os.path.join(partial_see_dir, '{}.png'.format(idx)), dpi=100) plt.figure() plt.imshow(origin, cmap='gray') plt.tight_layout() fig = plt.gcf() plt.draw() fig.savefig(os.path.join(origin_dir, '{}.png'.format(idx)), dpi=100)
def main():
    filter_dir = './img/'
    if not os.path.isdir(filter_dir):
        os.mkdir(filter_dir)
    filter_dir = './img/filter/'
    if not os.path.isdir(filter_dir):
        os.mkdir(filter_dir)

    emotion_classifier = load_model('./model_cnn.h5')
    layer_dict = dict([layer.name, layer] for layer in emotion_classifier.layers[1:])
    input_img = emotion_classifier.input
    name_ls = ['activation_1']
    collect_layers = [
        K.function([input_img, K.learning_phase()], [layer_dict[name].output])
        for name in name_ls
    ]

    x_train, y_train = load_train('train.csv')
    x_train = x_train.reshape(x_train.shape[0], 1, 48, 48, 1)
    choose_id = 2044
    photo = x_train[choose_id]
    for cnt, fn in enumerate(collect_layers):
        im = fn([photo, 0])  # get the output of that layer
        fig = plt.figure(figsize=(14, 8))
        nb_filter = im[0].shape[3]
        for i in range(nb_filter):
            ax = fig.add_subplot(nb_filter // 16, 16, i + 1)
            ax.imshow(im[0][0, :, :, i], cmap='YlGnBu')
            plt.xticks(np.array([]))
            plt.yticks(np.array([]))
            plt.tight_layout()
        fig.suptitle('Output of layer{} (Given image{})'.format(cnt, choose_id))
        img_path = filter_dir
        if not os.path.isdir(img_path):
            os.mkdir(img_path)
        fig.savefig(os.path.join(img_path, 'layer{}'.format(cnt)))
def data_generator(dataset, shuffle=True, augment=True, batch_size=32): b = 0 image_index = -1 image_ids = np.copy(dataset.image_ids()) #print(image_ids) error_count = 0 while True: try: #print(b) image_index = (image_index + 1) % len(image_ids) if shuffle and image_index == 0: np.random.shuffle(image_ids) image_id = image_ids[image_index] image_concat = utils.load_train(dataset, image_id, augment=augment) label = dataset.load_label(image_id) if b == 0: batch_images = np.zeros((batch_size, 448, 224, 3), dtype=np.float32) batch_actions = np.zeros((batch_size, 1), dtype=np.int32) batch_images[b] = image_concat batch_actions[b] = label b += 1 if b >= batch_size: #inputs=[batch_images, batch_actions] inputs = batch_images outputs = batch_actions yield inputs, outputs b = 0 except (GeneratorExit, KeyboardInterrupt): raise except: # Log it and skip the image logging.exception("Error processing image {}".format( dataset.image_info[image_id])) error_count += 1 if error_count > 5: raise
import numpy as np
import pandas as pd
import sys
import os
from sklearn.externals import joblib

scriptpath = os.path.dirname(os.path.realpath(sys.argv[0])) + '/../'
sys.path.append(os.path.abspath(scriptpath))
import utils

train = utils.load_train('group_by')
train_2013 = train[train.date_time < '2014-01-01 00:00:00']
train_2014 = train[train.date_time >= '2014-01-01 00:00:00']


def top_k_relevence(group, topk=utils.k):
    """
    Order and get the topk hotel clusters by the relevance score in desc order
    :param group: the aggregate group with hotel cluster relevance scores
    :param topk: the top k value
    :return: the topk hotel clusters for the aggregate group
    """
    idx = group.relevance.nlargest(topk).index
    top_k_relevence = group.hotel_cluster[idx].values
    return np.array_str(top_k_relevence)[1:-1]


def gen_top_k_group_by_model(group_by_field, click_weight=utils.click_weight, year='all'):
def run_logistic_regression(hyperparameters):
    # specify training data
    xIn = False
    while xIn == False:
        x = input('Training Set LARGE or SMALL? ')
        print(x)
        if x == 'LARGE':
            print("HELLO")
            train_inputs, train_targets = load_train()
            xIn = True
        elif x == 'SMALL':
            print("hello")
            train_inputs, train_targets = load_train_small()
            xIn = True
        else:
            print("Please input LARGE or SMALL")

    valid_inputs, valid_targets = load_valid()
    test_inputs, test_targets = load_test()

    # N is number of examples; M is the number of features per example.
    N, M = train_inputs.shape
    print("N:", N, " M:", M)

    # Logistic regression weights
    # Initialize to random weights here.
    weights = np.random.normal(0, 0.001, (M + 1, 1))

    # Verify that your logistic function produces the right gradient.
    # diff should be very close to 0.
    run_check_grad(hyperparameters)

    # Begin learning with gradient descent
    logging = np.zeros((hyperparameters['num_iterations'], 5))
    for t in range(hyperparameters['num_iterations']):

        # Find the negative log likelihood and its derivatives w.r.t. the weights.
        f, df, predictions = logistic(weights, train_inputs, train_targets, hyperparameters)

        # Evaluate the prediction.
        cross_entropy_train, frac_correct_train = evaluate(train_targets, predictions)

        if np.isnan(f) or np.isinf(f):
            raise ValueError("nan/inf error")

        # update parameters
        weights = weights - hyperparameters['learning_rate'] * df / N

        # Make a prediction on the valid_inputs.
        predictions_valid = logistic_predict(weights, valid_inputs)

        # Evaluate the prediction.
        cross_entropy_valid, frac_correct_valid = evaluate(valid_targets, predictions_valid)

        # print some stats
        print("ITERATION:{:4d} TRAIN NLOGL:{:4.2f} TRAIN CE:{:.6f} "
              "TRAIN FRAC:{:2.2f} VALID CE:{:.6f} VALID FRAC:{:2.2f}".format(
                  t + 1, f / N, cross_entropy_train, frac_correct_train * 100,
                  cross_entropy_valid, frac_correct_valid * 100))

        logging[t] = [f / N, cross_entropy_train, frac_correct_train * 100,
                      cross_entropy_valid, frac_correct_valid * 100]
    return logging
import pandas as pd
import numpy as np
import utils

path_to_data = ''
data, meta = utils.load_train(path_to_data)
objects = meta['object_id'].values

for obj in objects:
    df = data.loc[data['object_id'] == obj]
    # NOTE: the second argument is assumed; the source reads "355feature", which is not valid Python
    arr = utils.conv_preprocess_data(df, 355)
    print(pd.DataFrame(arr[0][0]))
    break
# -*- coding: utf-8 -*-
from utils import load_train, load_valid
from run_knn import run_knn

(train_inputs, train_targets) = load_train()
(valid_inputs, valid_targets) = load_valid()

for k in [1, 3, 5, 7, 9]:
    print(run_knn(k, train_inputs, train_targets, valid_inputs))
'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', ] # ============================================================================= # # ============================================================================= skf = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=SEED) train = utils.load_train(categorical_features+['TARGET']).fillna('na dayo') test = utils.load_test(categorical_features).fillna('na dayo') col = [] cat_comb = list(combinations(categorical_features, 2)) for c1,c2 in cat_comb: train[f'{c1}-{c2}'] = train[c1] + train[c2] test[f'{c1}-{c2}'] = test[c1] + test[c2] col.append( f'{c1}-{c2}' ) # ============================================================================= # cardinality check # ============================================================================= train['fold'] = 0 for i,(train_index, test_index) in enumerate(skf.split(train, train.TARGET)):
# # ============================================================================= prev = utils.read_pickles('../data/previous_application') base = prev[[KEY]].drop_duplicates().set_index(KEY) gr = prev.groupby(KEY) gr_app = prev[prev['NAME_CONTRACT_STATUS'] == 'Approved'].groupby(KEY) gr_ref = prev[prev['NAME_CONTRACT_STATUS'] == 'Refused'].groupby(KEY) gr_act = prev[prev['active'] == 1].groupby(KEY) gr_cmp = prev[prev['completed'] == 1].groupby(KEY) col = [ 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_CREDIT-d-AMT_ANNUITY', 'DAYS_BIRTH' ] train = utils.load_train([KEY] + col) test = utils.load_test([KEY] + col) train.AMT_ANNUITY.fillna(0, inplace=True) test.AMT_ANNUITY.fillna(0, inplace=True) train.columns = [KEY] + ['app_' + c for c in train.columns[1:]] test.columns = [KEY] + ['app_' + c for c in test.columns[1:]] col_init = train.columns.tolist() # ============================================================================= # feature # ============================================================================= # size
""" Created on Feb 26 2017 Author: Weiping Song """ import os, sys import tensorflow as tf import numpy as np import argparse, random from model import GRU4Rec from utils import load_train, load_valid unfold_max = 20 error_during_training = False train_x, train_y, n_items = load_train(unfold_max) valid_x, valid_y, _ = load_valid(unfold_max) class Args(): is_training = True layers = 1 rnn_size = 100 n_epochs = 10 batch_size = 50 keep_prob = 1 learning_rate = 0.001 decay = 0.98 decay_steps = 2 * 1e3 sigma = 0.0001 init_as_normal = False
import numpy as np
import theano
import lasagne
import utils
from lasagne.layers import *
from nolearn.lasagne import NeuralNet
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import cross_validation
from nolearn.lasagne import TrainSplit
from nolearn.lasagne import objective
from lasagne.nonlinearities import softmax
from lasagne.updates import momentum

X, y = utils.load_train()
X_test = utils.load_test()
X = X.reshape([X.shape[0], 3, 32, 32])
y = np.array(y, dtype="int32")
X_test = X_test.reshape([X_test.shape[0], 3, 32, 32])

layers = [
    # layer dealing with the input data
    (InputLayer, {'shape': (None, 3, 32, 32)}),

    # first stage of our convolutional layers
    # second stage of our convolutional layers
    (Conv2DLayer, {'pad': 2, 'num_filters': 32, 'filter_size': 5,
                   'W': lasagne.init.Normal(std=0.01)}),
    (ParametricRectifierLayer, {'alpha': lasagne.init.Constant(0)}),
    (Pool2DLayer, {'pool_size': 2, 'stride': 2, 'mode': 'max'}),
np.reshape(Theta[:hidden_layer_size * (input_layer_size + 1)], (hidden_layer_size, input_layer_size + 1), order='F')) Theta2 = np.matrix( np.reshape(Theta[hidden_layer_size * (input_layer_size + 1):], (num_labels, hidden_layer_size + 1), order='F')) p = fnx.predict(Theta1, Theta2, X) precision = 0 for i in range(len(y)): if y[i] == p[i]: precision += 1 print('Training Set Accuracy:', (1.0 * precision) / len(y)) return Theta1, Theta2 if __name__ == '__main__': cuisine_list, ingredients_list, X, y = utl.load_train('number') ingredients_count = len(ingredients_list) cuisines_count = len(cuisine_list) Theta1, Theta2 = train_nn(ingredients_count, ingredients_count//16, cuisines_count, X, y) T, ids = utl.load_test(ingredients_list) p = fnx.predict(Theta1, Theta2, T) utl.save_result('nn', cuisine_list, p, ids)
    merged_df.reset_index(inplace=True)
    if type == 'test':
        result = merged_df[['index', 'hotel_cluster']]
        result.columns = ['id', 'hotel_cluster']
    elif type == 'train':
        result = merged_df[['index', 'hotel_cluster_y']]
        result.columns = ['id', 'hotel_cluster']
    return result


#############################################################
#################### train dataset ####################
#############################################################
train = utils.load_train('group_by')
train_is_booking = train[train.is_booking == 1]
train_is_booking.reset_index(inplace=True)
del train

print('generate top k hotel clusters with orig_destination_distance model...')
result = gen_top_k_hotel_cluster(train_is_booking, 'orig_destination_distance', 'train')

print('generate top k hotel clusters with srch_destination_id model...')
result = utils.fill_all_top_5(train_is_booking, result, 'srch_destination_id', 'train')

print('generate top k hotel clusters with user_id model...')
result = utils.fill_all_top_5(train_is_booking, result, 'user_id', 'train')

print('generate top k hotel clusters with hotel_market model...')
result = utils.fill_all_top_5(train_is_booking, result, 'hotel_market', 'train')

print('hotel clusters to ranking features...')
new_result = result.apply(lambda row: hotel_clusters_to_ranking_features(row), axis=1)
new_result.columns = ['_'.join(['hotel_cluster', str(hotel_cluster_id), 'rank'])
                      for hotel_cluster_id in range(100)]
def main(): # ================ # time managment # # ================ program_st = time.time() # ===================================== # bert classification logging handler # # ===================================== logging_filename = f"../logs/bertclf_{args.corpus_name}.log" logging.basicConfig(level=logging.INFO, filename=logging_filename, filemode="w") console = logging.StreamHandler() console.setLevel(logging.INFO) formatter = logging.Formatter("%(levelname)s: %(message)s") console.setFormatter(formatter) logging.getLogger('').addHandler(console) # ======================= # predefined parameters # # ======================= cv = args.cross_validation num_labels = 3 batch_size = args.batch_size epochs = args.epochs learning_rate = args.learning_rate max_length = args.max_length if args.domain_adaption: if args.model == "german": if args.domain_adaption_alternative_path: model_name = '../corpora/domain-adaption/german-alternative/' else: model_name = '../corpora/domain-adaption/german/' elif args.model == "rede": if args.domain_adaption_alternative_path: model_name = '../corpora/domain-adaption/redewiedergabe-alternative/' else: model_name = '../corpora/domain-adaption/redewiedergabe/' elif args.model == "test": model_name = '../corpora/domain-adaption/test/' else: logging.warning( f"Couldn't find a model with the name '{args.model}'.") else: if args.model == "german": model_name = 'bert-base-german-dbmdz-cased' elif args.model == "rede": model_name = 'redewiedergabe/bert-base-historical-german-rw-cased' else: logging.warning( f"Couldn't find a model with the name '{args.model}'.") cv_acc_dict = defaultdict(list) year_cv_dict = {} poet_cv_dict = {} class_name1 = "epoch_year" class_name2 = "epoch_poet" text_name = "poem" false_clf_dict = {class_name1: {}, class_name2: {}} # ================ # classification # # ================ # ======================= # use GPU, if available # # ======================= if torch.cuda.is_available(): device = torch.device("cuda") logging.info( f'There are {torch.cuda.device_count()} GPU(s) available.') logging.info(f'Used GPU: {torch.cuda.get_device_name(0)}') else: logging.info('No GPU available, using the CPU instead.') device = torch.device("cpu") for i in range(1, cv + 1): if args.corpus_name == "poet": train_data = utils.load_train("../corpora/train_epochpoet", cv, i, "epochpoet") test_data = pd.read_csv( f"../corpora/train_epochpoet/epochpoet{i}.csv") elif args.corpus_name == "year": train_data = utils.load_train("../corpora/train_epochyear", cv, i, "epochyear") test_data = pd.read_csv( f"../corpora/train_epochyear/epochyear{i}.csv") elif args.corpus_name == "poeta": train_data = utils.load_train( "../corpora/train_epochpoetalternative", cv, i, "epochpoetalternative") test_data = pd.read_csv( f"../corpora/train_epochpoetalternative/epochpoetalternative{i}.csv" ) else: logging.warning( f"Couldn't find a corpus with the name '{args.corpus_name}'.") for class_name in [class_name1, class_name2]: # tmp lists and result dicts # input_ids = [] attention_masks = [] texts = train_data[text_name].values encoder = LabelEncoder() labels = encoder.fit_transform(train_data[class_name].values) encoder_mapping = dict( zip(encoder.transform(encoder.classes_), encoder.classes_)) # ============== # tokenization # # ============== tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=False) for sent in texts: encoded_dict = tokenizer.encode_plus( sent, add_special_tokens=True, max_length=args.max_length, pad_to_max_length=True, return_attention_mask=True, 
return_tensors='pt') input_ids.append(encoded_dict['input_ids']) attention_masks.append(encoded_dict['attention_mask']) input_ids = torch.cat(input_ids, dim=0) attention_masks = torch.cat(attention_masks, dim=0) labels = torch.tensor(labels) # ================= # train val split # # ================= dataset = TensorDataset(input_ids, attention_masks, labels) train_size = int(0.9 * len(dataset)) val_size = len(dataset) - train_size train_dataset, val_dataset = random_split(dataset, [train_size, val_size]) # ============ # DataLoader # # ============ train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size) val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size) # ======== # # Training # # ======== # model = BertForSequenceClassification.from_pretrained( model_name, num_labels=num_labels, output_attentions=False, output_hidden_states=False).cuda() optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8) total_steps = len(train_dataloader) * epochs scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=total_steps) training_stats = [] total_t0 = time.time() validation_losses = {} for epoch_i in range(0, epochs): print("") print('======== Epoch {:} / {:} ========'.format( epoch_i + 1, epochs)) print('Now Training.') t0 = time.time() total_train_loss = 0 model.train() for step, batch in enumerate(train_dataloader): if step % 50 == 0 and not step == 0: elapsed = utils.format_time(time.time() - t0) print('Batch {:>5,} of {:>5,}. Elapsed: {:}.'. format(step, len(train_dataloader), elapsed)) b_input_ids = batch[0].to(device) b_input_mask = batch[1].to(device) b_labels = batch[2].to(device) model.zero_grad() loss, logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels) total_train_loss += loss.item() loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) optimizer.step() scheduler.step() # average loss (all batches) avg_train_loss = total_train_loss / len(train_dataloader) training_time = utils.format_time(time.time() - t0) print("") print( " Average training loss: {0:.2f}".format(avg_train_loss)) print(" Training epoch took: {:}".format(training_time)) # ========== # # Validation # # ========== # print("") print("Now Validating.") t0 = time.time() model.eval() total_eval_accuracy = 0 total_eval_loss = 0 nb_eval_steps = 0 for batch in val_dataloader: b_input_ids = batch[0].to(device) b_input_mask = batch[1].to(device) b_labels = batch[2].to(device) with torch.no_grad(): (loss, logits) = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels) # validation loss. 
total_eval_loss += loss.item() # Move logits and labels to CPU logits = logits.detach().cpu().numpy() label_ids = b_labels.to('cpu').numpy() total_eval_accuracy += utils.flat_f1(label_ids, logits) # final validation accuracy / loss avg_val_accuracy = total_eval_accuracy / len(val_dataloader) print( " Validation Accuracy: {0:.2f}".format(avg_val_accuracy)) avg_val_loss = total_eval_loss / len(val_dataloader) validation_time = utils.format_time(time.time() - t0) print(" Validation Loss: {0:.2f}".format(avg_val_loss)) print(" Validation took: {:}".format(validation_time)) training_stats.append({ 'epoch': epoch_i + 1, 'train_loss': avg_train_loss, 'val_loss': avg_val_loss, 'val_acc': avg_val_accuracy, 'train_time': training_time, 'val_time': validation_time }) current_epoch = f"epoch{epoch_i + 1}" validation_losses[current_epoch] = avg_val_loss # ================ # Early Stopping # # ================ if utils.early_stopping(validation_losses, patience=2): logging.info( f"Stopping epoch run early (Epoch {epoch_i}).") break logging.info(f"Training for {class_name} done.") logging.info("Training took {:} (h:mm:ss) \n".format( utils.format_time(time.time() - total_t0))) print("--------------------------------\n") # ========= # Testing # # ========= test_input_ids = [] test_attention_masks = [] X_test = test_data[text_name].values y_test = LabelEncoder().fit_transform(test_data[class_name].values) for sent in X_test: encoded_dict = tokenizer.encode_plus( sent, add_special_tokens=True, max_length=args.max_length, pad_to_max_length=True, return_attention_mask=True, return_tensors='pt') test_input_ids.append(encoded_dict['input_ids']) test_attention_masks.append(encoded_dict['attention_mask']) test_input_ids = torch.cat(test_input_ids, dim=0) test_attention_masks = torch.cat(test_attention_masks, dim=0) labels = torch.tensor(y_test) prediction_data = TensorDataset(test_input_ids, test_attention_masks, labels) prediction_sampler = SequentialSampler(prediction_data) prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size) model.eval() predictions, true_labels = [], [] for batch in prediction_dataloader: # Add batch to GPU batch = tuple(t.to(device) for t in batch) # Unpack the inputs from our dataloader b_input_ids, b_input_mask, b_labels = batch with torch.no_grad(): outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) logits = outputs[0] # Move logits and labels to CPU logits = logits.detach().cpu().numpy() label_ids = b_labels.to('cpu').numpy() # Store predictions and true labels predictions.append(logits) true_labels.append(label_ids) flat_predictions = np.concatenate(predictions, axis=0) flat_predictions = np.argmax(flat_predictions, axis=1).flatten() flat_true_labels = np.concatenate(true_labels, axis=0) if args.save_misclassification: logging.info("Saving misclassifications.") test_pid = test_data["pid"].values false_classifications = { "Jahrhundertwende": { "Naturalismus": [], "Expressionismus": [] }, "Naturalismus": { "Jahrhundertwende": [], "Expressionismus": [] }, "Expressionismus": { "Naturalismus": [], "Jahrhundertwende": [] } } for idx, (t, p) in enumerate( zip(flat_true_labels, flat_predictions)): if t != p: false_classifications[encoder_mapping[t]][ encoder_mapping[p]].append(int(test_pid[idx])) false_clf_dict[class_name][i] = false_classifications test_score = f1_score(flat_true_labels, flat_predictions, average="macro") classes = test_data[class_name].drop_duplicates().tolist() if args.save_confusion_matrices: 
logging.info("Saving confusion matrices.") cm = confusion_matrix(flat_true_labels, flat_predictions) cm_df = pd.DataFrame(cm, index=classes, columns=classes) if args.domain_adaption: cm_name = f"{args.corpus_name}c_{class_name}_da_{args.model}" else: cm_name = f"{args.corpus_name}c_{class_name}_{args.model}" if args.save_date: cm_name += f"({datetime.now():%d.%m.%y}_{datetime.now():%H:%M})" cm_df.to_csv( f"../results/bert/confusion_matrices/cm{i}_{cm_name}.csv") stats = pd.DataFrame(data=training_stats) cv_acc_dict[class_name].append(test_score) if class_name == "epoch_year": year_cv_dict[f"cv{i}"] = training_stats elif class_name == "epoch_poet": poet_cv_dict[f"cv{i}"] = training_stats else: logging.info(f"The class {class_name} does not exist.") logging.info(f"Testing for {class_name} done.") logging.info(f"CV Test F1-Score: {test_score} (run: {i}/{cv}).") logging.info("Testing took {:} (h:mm:ss) \n".format( utils.format_time(time.time() - total_t0))) print("--------------------------------\n") logging.info(f"Training for run {i}/{cv} completed.") logging.info("Training run took {:} (h:mm:ss)".format( utils.format_time(time.time() - total_t0))) print("________________________________") print("________________________________\n") # ================ # saving results # # ================ result_path = "../results/bert/" logging.info(f"Writing results to '{result_path}'.") if args.domain_adaption: output_name = f"{args.corpus_name}c_da_{args.model}" else: output_name = f"{args.corpus_name}c_{args.model}" if args.save_date: output_name += f"({datetime.now():%d.%m.%y}_{datetime.now():%H:%M})" with open(f'{result_path}cv_{output_name}.json', 'w') as f: json.dump(cv_acc_dict, f) with open(f'{result_path}eyear_{output_name}.json', 'w') as f: json.dump(year_cv_dict, f) with open(f'{result_path}epoet_{output_name}.json', 'w') as f: json.dump(poet_cv_dict, f) if args.save_misclassification: mis_output_path = f'{result_path}/misclassifications/pid_{output_name}' with open(f'{mis_output_path}.json', 'w') as f: json.dump(false_clf_dict, f) program_duration = float(time.time() - program_st) logging.info(f"Total duration: {int(program_duration)/60} minute(s).")
p = (10**i) for c in tqdm(df.columns): s = (df[c] * p * 2 + 1) // 2 / p # round di = s.value_counts().to_dict() feature[f'{PREF}_{c}_r{i}'] = s.map(di) tr_ = feature.iloc[:200000] output(tr_, 'train') te_ = feature.iloc[200000:].reset_index(drop=True) output(te_, 'test') return # ============================================================================= # main # ============================================================================= if __name__ == "__main__": utils.start(__file__) tr = utils.load_train().drop(['ID_code', 'target'], axis=1) te = utils.load_test().drop(['ID_code'], axis=1) te = te.drop(np.load('../data/fake_index.npy')) trte = pd.concat([tr, te], ignore_index=True)[tr.columns] fe(trte) utils.end(__file__)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 16 19:55:54 2018

@author: kazuki.onodera
"""
import numpy as np
import pandas as pd
import utils, os

os.system('rm -rf ../sample')
os.system('mkdir ../sample')

tr = utils.load_train()
log = pd.read_feather('../data/train_log.f')

oids = tr.sample(999).object_id.tolist()

tr_ = tr[tr.object_id.isin(oids)]
log_ = log[log.object_id.isin(oids)]

tr_.to_csv('../sample/tr.csv', index=False)
log_.to_csv('../sample/tr_log.csv', index=False)
os.system(f'rm ../data/t*_{PREF}*') os.system(f'rm ../feature/t*_{PREF}*') #def mk_feats(df): # df['hostgal_specz-m-hostgal_photoz'] = df['hostgal_specz'] - df['hostgal_photoz'] # df['hostgal_specz-d-hostgal_photoz'] = df['hostgal_specz'] / df['hostgal_photoz'] # df['hostgal_photoz-d-hostgal_photoz_err'] = df['hostgal_photoz'] / df['hostgal_photoz_err'] # df['hostgal_specz-d-hostgal_photoz_err'] = df['hostgal_specz'] / df['hostgal_photoz_err'] # return # ============================================================================= # main # ============================================================================= if __name__ == "__main__": utils.start(__file__) train = utils.load_train().drop(['object_id', 'target'], axis=1) train.add_prefix(PREF+'_').to_pickle(f'../data/train_{PREF}.pkl') train_aug = pd.read_pickle('../data/train_aug.pkl').drop(['object_id', 'object_id_bk', 'target'], axis=1) train_aug.add_prefix(PREF+'_').to_pickle(f'../data/train_aug_{PREF}.pkl') test = utils.load_test().drop(['object_id'], axis=1) test.loc[test.hostgal_photoz==0, 'hostgal_specz'] = 0 test = test.add_prefix(PREF+'_') test.to_pickle(f'../data/test_{PREF}.pkl') utils.save_test_features(test) utils.end(__file__)
import utils as utl
import tensorflow as tf
import numpy as np

cuisine_list, ingredients_list, xs, ys = utl.load_train('vector')
ts, ids = utl.load_test(ingredients_list)
cuisine_count = len(cuisine_list)
ingredients_count = len(ingredients_list)

x = tf.placeholder(tf.float32, [None, ingredients_count])
W = tf.Variable(tf.zeros([ingredients_count, cuisine_count]))
b = tf.Variable(tf.zeros([cuisine_count]))
y = tf.nn.softmax(tf.matmul(x, W) + b)
y_ = tf.placeholder(tf.float32, [None, cuisine_count])

t = tf.placeholder(tf.float32, [None, ingredients_count])
p = tf.nn.softmax(tf.matmul(t, W) + b)

cross_entropy = -tf.reduce_sum(y_ * tf.log(y))
train_step = tf.train.GradientDescentOptimizer(0.001).minimize(cross_entropy)
# train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

init = tf.initialize_all_variables()
sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
sess.run(init)
import glob

import numpy as np

from functools import partial

from sklearn.cross_validation import train_test_split
from sklearn.grid_search import ParameterGrid
from sklearn.externals.joblib import Parallel, delayed

from utils import load_train, load_test
from utils import find_threshold
from utils import rescale, rebalance
from utils import make_submission


def load_predictions(pattern):
    return np.column_stack([np.load(f) for f in sorted(glob.glob(pattern))])


# Load training data
X, y, w, _ = load_train()

# Tune stacker
print("Optimize parameters in 5-CV...")

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier

#Classifier = partial(BaggingClassifier, base_estimator=GradientBoostingClassifier(n_estimators=500, learning_rate=0.025, max_depth=4, max_features=None, min_samples_leaf=250))
#grid = ParameterGrid({"n_estimators": [24], "max_features": [1.0, 0.9, 0.8], "n_jobs": [24]})

Classifier = GradientBoostingClassifier
grid = ParameterGrid({"n_estimators": [500],
                      "max_features": [None, 0.95, 0.9],
                      "learning_rate": [0.0225, 0.025, 0.0275],
                      "max_depth": [4],
                      "min_samples_leaf": [250]})
n_jobs = 24


def _parallel_eval(Classifier, params, X, y, w, n_repeat=5, verbose=1):
for suf in suffix_list: col = [c for c in col_init if c.endswith(suf)] df[f'{suf}_min'] = df[col].min(1) df[f'{suf}_mean'] = df[col].mean(1) df[f'{suf}_max'] = df[col].max(1) df[f'{suf}_std'] = df[col].std(1) return # ============================================================================= # main # ============================================================================= if __name__ == "__main__": utils.start(__file__) # train tr = utils.load_train(['object_id']) df = pd.read_pickle('../FROM_MYTEAM/LCfit_feature_allSN_r_train_v3_20181215.pkl.gz') df = pd.merge(tr, df, on='object_id', how='left') df.reset_index(drop=True, inplace=True) get_feature(df) del df['object_id'] df.add_prefix(PREF+'_').to_pickle(f'../data/train_{PREF}.pkl') # test te = utils.load_test(['object_id']) df = pd.read_pickle('../FROM_MYTEAM/LCfit_feature_allSN_r_test_v3_20181215.pkl.gz') df = pd.merge(te, df, on='object_id', how='left') df.reset_index(drop=True, inplace=True) get_feature(df)
import utils

#path_to_data = '/courses/cs342/Assignment2/'
path_to_data = ''

train, train_meta = utils.load_train(path_to_data)
g_train, eg_train, g_meta, eg_meta, g_target, eg_target = utils.gal_split_data(
    train, train_meta, True)

g_features = utils.feature_engineering(g_train, g_meta)
g_wtable, g_labels, g_classes, g_target_map = utils.preprocess_target(g_target)
g_features = utils.standardize_data(g_features)
utils.train_mlp(g_features, g_wtable, g_labels, g_classes, g_target_map, True)

eg_features = utils.feature_engineering(eg_train, eg_meta)
eg_wtable, eg_labels, eg_classes, eg_target_map = utils.preprocess_target(eg_target)
eg_features = utils.standardize_data(eg_features)
utils.train_mlp(eg_features, eg_wtable, eg_labels, eg_classes, eg_target_map, False)
def classify(train, examples):
    cv_res = {
        "PP": 0,
        "PN": 0,
        "NP": 0,
        "NN": 0,
        "contradictory": 0,
    }
    plus = train["plus"]
    minus = train["minus"]
    l = len(examples)
    i = 0
    for elem in examples:
        i += 1
        print("%i/%i" % (i, l))
        result = check_hypothesis(plus, minus, elem)
        cv_res[result] += 1
    return cv_res


if __name__ == "__main__":
    index = int(sys.argv[1])
    train = utils.load_train(index)
    test = utils.load_test(index)
    res = classify(train, test)
    print(res)
    print(utils.summary(res))
    return site_name_encoding, posa_continent_encoding, user_location_country_encoding, \
        user_location_region_encoding, channel_encoding, srch_destination_type_id_encoding, \
        hotel_continent_encoding, hotel_country_encoding


def fill_na_features(dataset):
    """
    Fill the remaining missing values
    :param dataset: train/test dataset
    """
    dataset.fillna(-1, inplace=True)


#############################################################
#################### train dataset ####################
#############################################################
train = utils.load_train('baseline')
train_is_booking = train[train.is_booking == 1]
train_is_booking.reset_index(inplace=True)
train_is_booking.is_copy = False
del train

print('generate train time features...')
time_features_enricher(train_is_booking)

print('generate train one hot encoding features...')
site_name_encoding, posa_continent_encoding, user_location_country_encoding, \
    user_location_region_encoding, channel_encoding, srch_destination_type_id_encoding, \
    hotel_continent_encoding, hotel_country_encoding = \
    gen_all_top_one_hot_encoding_columns(train_is_booking)

print('fill train na features...')
# =============================================================================
# NAME_CONTRACT_STATUS
# =============================================================================
ct1 = pd.crosstab(pos[KEY], pos['NAME_CONTRACT_STATUS']).add_suffix('_cnt')
ct2 = pd.crosstab(pos[KEY], pos['NAME_CONTRACT_STATUS'], normalize='index').add_suffix('_nrm')
base = pd.concat([base, ct1, ct2], axis=1)

# TODO: DPD

# =============================================================================
# merge
# =============================================================================
base.reset_index(inplace=True)

train = utils.load_train([KEY])
test = utils.load_test([KEY])

train_ = pd.merge(train, base, on=KEY, how='left').drop(KEY, axis=1)
utils.to_feature(train_.add_prefix(PREF), '../feature/train')

test_ = pd.merge(test, base, on=KEY, how='left').drop(KEY, axis=1)
utils.to_feature(test_.add_prefix(PREF), '../feature/test')

#==============================================================================
utils.end(__file__)
""" dist = l2_distance(valid_data.T, train_data.T) nearest = np.argsort(dist, axis=1)[:,:k] train_labels = train_labels.reshape(-1) valid_labels = train_labels[nearest] # note this only works for binary labels valid_labels = (np.mean(valid_labels, axis=1) >= 0.5).astype(np.int) valid_labels = valid_labels.reshape(-1,1) return valid_labels if __name__ == '__main__': train_inputs, train_targets = utils.load_train() valid_inputs, valid_targets = utils.load_valid() test_inputs, test_targets = utils.load_test() set_k = [1,3,5,7,9] accuracy_valid_output = {} accuracy_test_output = {} length_valid = len(valid_inputs) length_test = len(test_inputs) for k in set_k: valid_outputs = run_knn(k, train_inputs, train_targets, valid_inputs) test_outputs = run_knn(k, train_inputs, train_targets, test_inputs)
""" Created on Sun Jun 3 05:56:27 2018 @author: Kazuki """ import numpy as np import pandas as pd #from sklearn.preprocessing import LabelEncoder import utils utils.start(__file__) #============================================================================== PREF = 'app_002_' train = utils.load_train().drop(['SK_ID_CURR', 'TARGET'], axis=1) test = utils.load_test().drop(['SK_ID_CURR'], axis=1) col_init = train.columns df = pd.concat([train, test], ignore_index=True) # ============================================================================= # features # ============================================================================= df['AMT_CREDIT-by-AMT_INCOME_TOTAL'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL'] df['AMT_INCOME_TOTAL-AMT_CREDIT'] = df['AMT_INCOME_TOTAL'] - df['AMT_CREDIT'] df['AMT_ANNUITY-by-AMT_INCOME_TOTAL'] = df['AMT_ANNUITY'] / df[ 'AMT_INCOME_TOTAL'] df['AMT_INCOME_TOTAL-AMT_ANNUITY'] = df['AMT_INCOME_TOTAL'] - df['AMT_ANNUITY']
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
import utils as utl

cuisine_list, ingredients_list, x, y = utl.load_train('number')

classifier = OneVsRestClassifier(LogisticRegression(C=1e6)).fit(x, y)

p = classifier.predict(x)
precision = 0
for i in range(len(y)):
    if y[i] == p[i]:
        precision += 1
accuracy = (1.0 * precision) / len(y)
print('Training Set Accuracy:', accuracy)

t, ids = utl.load_test(ingredients_list)
p = classifier.predict(t)
utl.save_result('sk_lr', cuisine_list, p, ids, 'number')
#hyper_parameter #
#######################################################################
import utils
from run_knn import run_knn
import plot_digits
import numpy as np
import matplotlib.pyplot as plt

if __name__ == "__main__":
    # loading the dataset
    train_data, train_labels = utils.load_train()
    # loading the validation set
    valid_data, valid_labels = utils.load_valid()
    # vector of each k
    K = np.array([1, 3, 5, 7, 9])
    # dictionary of results
    results = {}
    for k in K:
        # prediction
        prediction = run_knn(k, train_data, train_labels, valid_data)