def nb():
    """Train a multinomial Naive Bayes text classifier and persist it.

    Builds a bag-of-words -> TF-IDF -> MultinomialNB pipeline, fits it on
    the training corpus, and saves the fitted pipeline to config.SAVE_NB_PATH.
    """
    corpus = load_data.load_train_data()
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
    pipeline = Pipeline(steps)
    pipeline = pipeline.fit(corpus.get_data(), corpus.get_target())
    save_model(pipeline, config.SAVE_NB_PATH)
def predict():
    """Train DCModel over config.KFOLD train/dev folds and keep each fold's
    best-epoch result file.

    Side effects: shells out to `rm`/`cp` on ./Data/result/, and trains
    config.KFOLD models sequentially (each freed before the next fold).
    """
    word_weights, tag_weights = load_embedding()
    word_voc, tag_voc, label_voc = load_voc()
    # train data
    sentences, tags, labels = load_train_data(word_voc, tag_voc, label_voc)
    # Re-seed before every shuffle so the three parallel arrays are permuted
    # identically and sentence/tag/label alignment is preserved.
    seed = 137
    np.random.seed(seed)
    np.random.shuffle(sentences)
    np.random.seed(seed)
    np.random.shuffle(tags)
    np.random.seed(seed)
    np.random.shuffle(labels)
    # load test data (test labels are unknown here)
    sentences_test, tags_test = load_test_data(word_voc, tag_voc, label_voc)
    labels_test = None
    # clear result directory from any previous run
    command = 'rm ./Data/result/*'
    os.popen(command)
    # split into train / dev / test sets
    kf = KFold(n_splits=config.KFOLD)
    train_indices, dev_indices = [], []
    for train_index, dev_index in kf.split(labels):
        train_indices.append(train_index)
        dev_indices.append(dev_index)
    for num in range(config.KFOLD):
        train_index, dev_index = train_indices[num], dev_indices[num]
        sentences_train, sentences_dev = sentences[train_index], sentences[dev_index]
        tags_train, tags_dev = tags[train_index], tags[dev_index]
        labels_train, labels_dev = labels[train_index], labels[dev_index]
        # init model
        model = DCModel(
            config.MAX_LEN, word_weights, tag_weights,
            result_path='./Data/result/result.txt', label_voc=label_voc)
        # fit model
        model.fit(
            sentences_train, tags_train, labels_train, sentences_dev, tags_dev,
            labels_dev, sentences_test, tags_test, labels_test,
            config.BATCH_SIZE, config.NB_EPOCH, keep_prob=config.KEEP_PROB,
            word_keep_prob=config.WORD_KEEP_PROB, tag_keep_prob=config.TAG_KEEP_PROB)
        print(model.get_best_score())
        # Copy the best epoch's result file for this fold
        # (nb_epoch+1 suggests epoch file names are 1-based -- TODO confirm).
        [p_test, r_test, f_test], nb_epoch = model.get_best_score()
        command = 'cp ./Data/result/epoch_%d.csv ./Data/result/best_%d' % (nb_epoch+1, num)
        print(command)
        os.popen(command)
        print(p_test, r_test, f_test, '\n')
        # evaluate
        # result_path_k = result_path % k
        # p_test, r_test, f_test = model.evaluate(sentences_test, tags_test, positions_test,
        #                                         labels_test, simple_compute=False, ignore_label=IGNORE_LABEL,
        #                                         label_voc=relation_voc, result_path=result_path_k)
        # clear model to release the graph before the next fold
        model.clear_model()
        del model
def train_no_pruning(model, epochs, device):
    """Train `model` for `epochs` epochs with plain SGD (no pruning applied).

    Returns the plot data produced by the training loop.
    """
    loader = load_train_data(batch_size=4)
    loss_fn = nn.CrossEntropyLoss()
    sgd = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    return train(loader, epochs, model, sgd, loss_fn, device)
def load():
    """Load the training frame, drop id and blacklisted columns, and return
    (x_train, y_train, cv).
    """
    logger.info('load start')
    x_train, y_train, cv = load_train_data()
    logger.info('merges')
    # Drop every *_id column except the four join keys we still need.
    keep_ids = {'o_user_id', 'o_product_id', 'p_aisle_id', 'p_department_id'}
    id_cols = [c for c in x_train.columns.values
               if re.search('_id$', c) is not None and c not in keep_ids]
    logger.debug('id_cols {}'.format(id_cols))
    x_train.drop(id_cols, axis=1, inplace=True)
    # Remove the blacklisted features, sorted for deterministic column order.
    dropcols = sorted(set(x_train.columns.values.tolist()) & set(DROP_FEATURE))
    x_train.drop(dropcols, axis=1, inplace=True)
    logger.info('drop')
    gc.collect()
    return x_train, y_train, cv
def main():
    """Fit a gradient-descent linear regression, predict the test file, and
    plot the 3D result surface plus the training-cost curve."""
    train_file = "data_train.txt"
    test_file = "data_test.txt"
    epoches = 100
    # NOTE(review): extremely small learning rate -- presumably tuned for
    # unnormalized feature scales; confirm against the data files.
    alpha = 0.000000001
    data_array, label_array = load_train_data(train_file)
    test_array = load_test_data(test_file)
    data_matrix = np.mat(data_array)
    label_matrix = np.mat(label_array)
    test_matrix = np.mat(test_array)
    # train() returns the fitted parameters and the per-epoch cost history.
    theta, cost_vector = train(data_matrix, label_matrix, epoches, alpha)
    test_result = test(theta, test_matrix)
    print(theta)
    print(cost_vector[np.size(cost_vector)-1])  # final training cost
    print(test_matrix, test_result)
    # Plot Result: training samples (red) and test predictions (green) in 3D.
    m,n = np.shape(data_array)
    plot_x = []
    plot_y = []
    plot_z = []
    for i in range(m):
        plot_x.append(data_matrix[i,1])    # second column (x-axis, labeled 'Area' below)
        plot_y.append(data_matrix[i,n-1])  # last column (y-axis, labeled 'Rooms' below)
        plot_z.append(label_matrix[i,0])   # target (z-axis, labeled 'Price' below)
    test_m, test_n = np.shape(test_matrix)
    plot_testx = []
    plot_testy = []
    plot_testz = []
    for i in range(test_m):
        plot_testx.append(test_matrix[i,1])
        plot_testy.append(test_matrix[i,test_n-1])
        plot_testz.append(test_result[i,0])
    figure = plt.figure("Result")
    fig_plot = figure.add_subplot(111, projection='3d')
    fig_plot.scatter(plot_x, plot_y, plot_z, s=5, c='red', marker='s')  # plot 0
    fig_plot.scatter(plot_testx, plot_testy, plot_testz, s=30, c='green', marker='s')  # plot 0
    # Sample the fitted plane z = theta0 + theta1*x + theta2*y at random
    # (area, rooms) points to visualize the regression surface.
    x = np.random.randint(1000, 5000, size=[10000])
    y = np.random.randint(2, 5, size=[10000])
    z = theta[0,0] + theta[1,0] * x + theta[2,0] * y
    fig_plot.plot(x,y,z)
    fig_plot.set_title("The Result Linear Regression")
    fig_plot.set_xlabel('Area')
    fig_plot.set_ylabel('Rooms')
    fig_plot.set_zlabel('Price')
    # Plot Cost: learning curve over epochs 0..epoches.
    cost_fig = plt.figure("Cost")
    cost_plot = cost_fig.add_subplot(111)
    epoch = np.arange(0, epoches+1, 1)
    cost_plot.plot(epoch, cost_vector)
    plt.title("The Cost")
    plt.xlabel('Epoch')
    plt.ylabel('Cost')
    plt.show()
def load():
    """Load the training frame, prune id/blacklisted columns, left-join the
    extra reorder-count tables, persist column list and imputation means,
    impute, and return (x_train, y_train, cv).
    """
    logger.info('load start')
    x_train, y_train, cv = load_train_data()
    logger.info('merges')
    # Drop every *_id column except the four join keys still in use.
    keep_ids = {'o_user_id', 'o_product_id', 'p_aisle_id', 'p_department_id'}
    id_cols = [c for c in x_train.columns.values
               if re.search('_id$', c) is not None and c not in keep_ids]
    logger.debug('id_cols {}'.format(id_cols))
    x_train.drop(id_cols, axis=1, inplace=True)
    dropcols = sorted(set(x_train.columns.values.tolist()) & set(DROP_FEATURE))
    x_train.drop(dropcols, axis=1, inplace=True)
    logger.info('drop')
    # Left-join the auxiliary count tables, renaming each raw key to the
    # corresponding join key; merge order fixes the resulting column order.
    extra_tables = [
        ('user_reorder_item_num.csv', 'user_id', 'o_user_id'),
        ('item_reorder_user_num.csv', 'product_id', 'o_product_id'),
        ('item_reorder_user_num_train.csv', 'product_id', 'o_product_id'),
        ('item_reorder_train.csv', 'product_id', 'o_product_id'),
    ]
    for path, raw_key, join_key in extra_tables:
        extra = pd.read_csv(path).astype(np.float32).rename(columns={raw_key: join_key})
        x_train = x_train.merge(extra, how='left', on=join_key, copy=False)
    gc.collect()
    # x_train.replace([np.inf, -np.inf], np.nan, inplace=True)
    # Persist the column list and the imputation means so prediction-time
    # loading can reproduce the exact same preprocessing.
    usecols = x_train.columns.values
    with open(DIR + 'usecols.pkl', 'wb') as f:
        pickle.dump(usecols, f, -1)
    fillna_mean = x_train.mean()
    with open(DIR + 'fillna_mean.pkl', 'wb') as f:
        pickle.dump(fillna_mean, f, -1)
    x_train.fillna(fillna_mean, inplace=True)
    return x_train, y_train, cv
def iterative_pruning(model, iters, epochs, device):
    """Run `iters` rounds of train -> save curves -> prune 20% -> reinit.

    Args:
        model: network exposing `prune_net(pct)` and `reinit_net()`.
        iters: number of prune/reinit rounds.
        epochs: training epochs per round.
        device: torch device passed through to the training loop.

    Side effects: saves each round's training curves under plots/, named by
    the percentage of weights remaining (100 * 0.8**round).
    """
    trainloader = load_train_data(batch_size=4)
    criterion = nn.CrossEntropyLoss()
    # Fix: loop variable renamed from `iter`, which shadowed the builtin.
    for round_idx in tqdm(range(iters)):
        # Fresh optimizer each round so pruned/reinitialized weights do not
        # inherit stale momentum buffers.
        optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
        plot_data = train(trainloader, epochs, model, optimizer, criterion,
                          device)
        torch.save(plot_data, "plots/%.3f" % ((0.8 ** round_idx) * 100))
        model.prune_net(20)
        model.reinit_net()
def train():
    """Train the estimator, evaluate the final model, then (when every
    checkpoint was kept locally) evaluate each intermediate checkpoint and
    write the results to ./eval_ckpts.csv."""
    # Load input data and label
    # x: input; y: label
    x_train, y_train, x_eval, y_eval = load_train_data(in_height, in_width,
                                                       num_rows, train_ratio)
    model = build_model()
    # Define the input function for training
    input_fn_t = tf.estimator.inputs.numpy_input_fn(x={'file': x_train},
                                                    y=y_train,
                                                    batch_size=batch_size,
                                                    num_epochs=train_epoch,
                                                    shuffle=True)
    # Train the Model
    model.train(input_fn_t, steps=num_steps)
    # Define the input function for evaluating
    input_fn_e = tf.estimator.inputs.numpy_input_fn(x={'file': x_eval},
                                                    y=y_eval,
                                                    batch_size=batch_size,
                                                    shuffle=False)
    # Evaluate the Model
    e = model.evaluate(input_fn_e)
    total_steps = e['global_step']
    print('Evaluation Accuracy = ', e['accuracy'], "Loss = ", e['loss'],
          "global_step = ", total_steps)
    # Evaluate Checkpoints
    # all checkpoints have to be saved locally for evaluation, otherwise the evaluation is skipped
    # max_ckpt and ckpt_steps need to be properly tuned to enable checkpoints evaluation
    total_ckpts = total_steps // ckpt_steps
    print("Total number of checkpoints required = ", total_ckpts)
    if total_ckpts <= max_ckpt:
        # all checkpoints are saved
        eval_results = np.zeros((total_ckpts, 3))
        for i in range(total_ckpts):
            # Checkpoint files are named by global step; clamp to the final step.
            j = np.min([(i + 1) * ckpt_steps + 1, total_steps])
            ckpt_path = './model/model.ckpt-' + str(j)
            print(ckpt_path)
            e = model.evaluate(input_fn_e, checkpoint_path=ckpt_path)
            eval_results[i, :] = [j, e['accuracy'], e['loss']]
        df = pd.DataFrame(eval_results)
        header = ["step", "accuracy", "loss"]
        df.to_csv('./eval_ckpts.csv', header=header, index=None)
        print(
            "Checkpoints Evaluation is completed. The results can be found at ./eval_ckpts.csv"
        )
    else:
        print("Checkpoints Evaluation is skipped.")
def svm():
    """Train a linear SVM text classifier (SGD with hinge loss) and save it.

    Builds a bag-of-words -> TF-IDF -> SGDClassifier pipeline, fits it on the
    training corpus, and saves the fitted pipeline to config.SAVE_SVM_PATH.
    """
    corpus = load_data.load_train_data()
    sgd_clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                            n_iter=5, random_state=42)
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf-svm', sgd_clf),
    ])
    pipeline.fit(corpus.get_data(), corpus.get_target())
    save_model(pipeline, config.SAVE_SVM_PATH)
def load():
    """Load training features as a float32 ndarray plus labels and CV splitter.

    Drops *_id columns (except the four join keys) and blacklisted features,
    persists the used column list and imputation means, imputes with the
    column means, then clamps any remaining NaN/inf values.
    """
    logger.info('load start')
    x_train, y_train, cv = load_train_data()
    logger.info('merges')
    keep_ids = {'o_user_id', 'o_product_id', 'p_aisle_id', 'p_department_id'}
    id_cols = [c for c in x_train.columns.values
               if re.search('_id$', c) is not None and c not in keep_ids]
    logger.debug('id_cols {}'.format(id_cols))
    x_train.drop(id_cols, axis=1, inplace=True)
    dropcols = sorted(set(x_train.columns.values.tolist()) & set(DROP_FEATURE))
    x_train.drop(dropcols, axis=1, inplace=True)
    logger.info('drop')
    # Persist the surviving columns so prediction can mirror this selection.
    usecols = x_train.columns.values
    with open(DIR + 'usecols.pkl', 'wb') as f:
        pickle.dump(usecols, f, -1)
    gc.collect()
    # Persist the imputation means, then impute in place.
    fillna_mean = x_train.mean()
    with open(DIR + 'fillna_mean.pkl', 'wb') as f:
        pickle.dump(fillna_mean, f, -1)
    x_train.fillna(fillna_mean, inplace=True)
    x_train = x_train.values.astype(np.float32)
    logger.info('data end')
    gc.collect()
    # Columns whose mean was itself NaN stay NaN after fillna; clamp those
    # and any infinities to sentinel values.
    x_train[np.isnan(x_train)] = -100
    x_train[np.isinf(x_train)] = 999
    logger.info('load end {}'.format(x_train.shape))
    return x_train, y_train, cv
def get_datas(self):
    """Return shuffled train/test splits as (d_train, l_train, d_test, l_test).

    Labels are one-hot encoded to `self.n_classes`; the first 70% of samples
    form the training split and the remainder the test split, each shuffled
    via `self.shuffleing`.
    """
    samples, raw_labels = load_train_data()
    onehot = np_utils.to_categorical(raw_labels, self.n_classes)
    total = samples.shape[0]
    cut = int(0.7 * total)
    train_x, test_x = samples[:cut], samples[cut:]
    train_y, test_y = onehot[:cut], onehot[cut:]
    d_train, l_train = self.shuffleing(train_x, train_y)
    d_test, l_test = self.shuffleing(test_x, test_y)
    return d_train, l_train, d_test, l_test
def main():
    """Tune LightGBM hyperparameters, train the best models, and predict the
    test set; progress is logged to stderr (INFO) and a file (DEBUG)."""
    log_fmt = Formatter(
        '%(asctime)s %(name)s %(lineno)d [%(levelname)s][%(funcName)s] %(message)s '
    )
    handler = StreamHandler()
    handler.setLevel(INFO)
    handler.setFormatter(log_fmt)
    logger.addHandler(handler)
    handler = FileHandler(DIR + 'train_lgb_clf_hyperopt.py.log', 'a')
    handler.setLevel(DEBUG)
    handler.setFormatter(log_fmt)
    logger.setLevel(DEBUG)
    logger.addHandler(handler)
    logger.info('start')
    logger.info("start exploring best params")
    logger.info("start exploring best params without iteration")
    df_train = load_train_data()
    # Feature columns span 'ABC' through '2047'; target is Active_Nonactive.
    x_train = df_train.loc[:, 'ABC':'2047']
    y_train = df_train['Active_Nonactive'].values
    best_params = lgb_opt_params(x_train, y_train)
    logger.info("end exploring best params without iteration")
    logger.info("start optimizing iteration")
    # Optimize the boosting-iteration count separately, with params fixed.
    best_iter = opt_iter(x_train, y_train, best_params)
    logger.info("end optimizing iteration")
    logger.info("end exploring best params")
    logger.info("start best params train")
    best_model_No, cutoff = create_models(x_train, y_train, best_params,
                                          best_iter)
    logger.info("end best params train")
    logger.info("start predict unknown data(test data)")
    df_test = load_test_data().sort_values('Name')
    use_cols = x_train.columns.values
    # x_test = df_test[use_cols]
    # Predict over the concatenated train+test frame, restricted to the
    # training feature columns.
    df_all = pd.concat([df_train, df_test], axis=0,
                       sort=False).sort_values('Name')
    x_all = df_all[use_cols]
    predict_test(x_all, best_model_No, cutoff)
    logger.info("end predict unknown data(test data)")
    logger.info("end")
def validation(self, num_iterations):
    """Run `num_iterations` validation batches and print accuracy/loss each.

    Uses balanced random batches from the validation extractor; fetches
    y_pred/accuracy/cost only (no optimizer), so weights are not updated.
    Python 2 syntax (print statements).
    """
    # Only the validation extractor is needed; the train extractor is discarded.
    _, val_ext = load_train_data(self.batch_size, self.num_classes)
    start_time = time.time()
    for i in range(num_iterations):
        print "validating:", i
        #get batch
        x_batch, y_true_batch = val_ext.get_random_batch_balanced()
        #set feed_dict
        feed_dict_train = {self.x: x_batch, self.y_true: y_true_batch}
        #run session -- no train op in the fetch list, evaluation only
        _, acc, loss = self.sess.run(
            [self.y_pred, self.accuracy, self.cost],
            feed_dict=feed_dict_train)
        print 'acc: ', acc * 100, 'loss: ', loss
    end_time = time.time()
    time_dif = end_time - start_time
    print("Time usage: " + str(timedelta(seconds=int(round(time_dif)))))
def optimize(self, num_iterations, classes=3, save=False):
    """Train for `num_iterations` batches, optionally checkpointing.

    Args:
        num_iterations: number of training batches to run.
        classes: class count, used only in the checkpoint file name.
        save: when True, save every 500 iterations and once at the end.

    Advances `self.total_iterations` by `num_iterations`. Python 2 syntax.
    """
    train_ext, _ = load_train_data(self.batch_size, self.num_classes)
    checkpoint = "ckpt"
    logfile = "train_log"  # NOTE(review): not used below -- apparently dead
    # log/training report (mostly disabled; only checkpoint/loss keys kept)
    log = {}
    # log['start_time'] = time.ctime()
    # log['alpha'] = alpha
    # log['batch_size'] = batch_size
    # log['steps'] = steps
    log['checkpoint'] = checkpoint
    log['loss'] = []
    # One timestamped checkpoint directory per run.
    ckpt_dir = 'checkpoints/' + str(datetime.datetime.now()) + '/'
    if not os.path.exists(ckpt_dir):
        os.makedirs(ckpt_dir)
    start_time = time.time()
    # Iterate from the running total so step numbers continue across calls.
    for i in range(self.total_iterations,
                   self.total_iterations + num_iterations):
        print "training:", i, "/", self.total_iterations
        #get batch
        x_batch, y_true_batch = train_ext.get_random_batch_balanced()
        #set feed_dict
        feed_dict_train = {self.x: x_batch, self.y_true: y_true_batch}
        #run session -- optimizer in the fetch list, so this updates weights
        _, acc, loss = self.sess.run(
            [self.optimizer, self.accuracy, self.cost],
            feed_dict=feed_dict_train)
        print 'acc: ', acc * 100, 'loss: ', loss
        if save and i % 500 == 0:
            self.save(ckpt_dir + 'c-' + str(classes) + '-itt-' +
                      str(self.total_iterations))
    self.total_iterations += num_iterations
    end_time = time.time()
    time_dif = end_time - start_time
    print("Time usage: " + str(timedelta(seconds=int(round(time_dif)))))
    # save final model
    if save:
        self.save(ckpt_dir + 'c-' + str(classes) + '-itt-' +
                  str(self.total_iterations))
def post_predict(test_path, score_path, entity_path, alpha=0.75):
    """Resolve each mention to its highest-scoring candidate entity and
    compute accuracy against the gold labels.

    Multi-candidate mentions whose best score falls below `alpha` are mapped
    to 'cui-less'; single-candidate mentions are accepted unconditionally.
    Mispredictions are written to ../checkpoints/post_predict_result.txt.

    Returns the accuracy as a float.
    """
    candidate_dict = load_candidates2(score_path)
    test_data, all_data = load_train_data(test_path)
    entity_dict, _ = load_entity(entity_path)
    predict_dict = dict()
    for mention, candidates in candidate_dict.items():
        if len(candidates) == 1:
            only = candidates[0]
            predict_dict[mention] = (only[0], only[1])
            continue
        # Pick the candidate with the highest score (first wins on ties).
        best_id, best_name, best_score = candidates[0]
        for cand_id, cand_name, cand_score in candidates:
            if cand_score > best_score:
                best_id, best_name, best_score = cand_id, cand_name, cand_score
        if best_score < alpha:
            best_id, best_name = 'cui-less', 'cui-less'
        predict_dict[mention] = (best_id, best_name)
    acc_cnt = 0
    wrong_rows = []
    for doc_id, mention, label in all_data:
        # Normalize any casing of the cui-less label.
        if str.lower(label) == 'cui-less':
            label = 'cui-less'
        pred_label, pred_entity_name = predict_dict[mention]
        if pred_label == label:
            acc_cnt += 1
        else:
            entity_name = entity_dict[label][0] if label in entity_dict else 'None'
            wrong_rows.append(doc_id + '\t' + mention + '\t' + label + '\t' +
                              entity_name + '\t' + pred_label + '\t' +
                              pred_entity_name + '\n')
    with open('../checkpoints/post_predict_result.txt', 'w') as f:
        f.write(''.join(wrong_rows))
    return 1.0 * acc_cnt / len(all_data)
def model_selection_and_evaluation():
    """
    Test some candidate models with validation set, select highest scoring,
    train on full train + validation set, evaluate on test set
    :return: tuple: best model, list of feature sets it uses
    """
    # Load train and test sets
    train_df = load_data.load_train_data()
    test_df = load_data.load_test_data()
    # Carve a validation split out of the training data for model selection.
    inner_train_df, validation_df = utils.split_train_validation(train_df)
    # Score all candidates on the validation split and keep the winner.
    best_model, best_model_feats = model_selection(inner_train_df, validation_df)
    print('Best scoring model is: {}, Using feature sets: {}'.format(
        best_model.name, best_model_feats))
    # Refit on the full training data and report held-out test accuracy.
    test_set_evaluation(train_df, test_df, {best_model: best_model_feats})
    return best_model, best_model_feats
from sklearn.cross_validation import train_test_split from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.svm import LinearSVC from load_data import load_train_data, load_predict_data from sklearn.datasets.base import Bunch from sklearn.svm import SVC import jieba def jieba_tokenizer(x): return jieba.cut(x) train_data = load_train_data(r"cuhk.csv") def predict(n, x_test, y_test): #print(train_data) x_train, _, y_train, _ = train_test_split(train_data['data'], train_data['target'][n], test_size=0.5) #print(y_train) words_tfidf_vec = TfidfVectorizer(binary=False, tokenizer=jieba_tokenizer) X_train = words_tfidf_vec.fit_transform(x_train) print(train_data['types'][n]) clf = SVC().fit(X_train, y_train) # 测试样本数据调用的是transform接口
else: return True processed_texts = [] for line, l in zip(tweets_list, tweets_labels): if isEnglish(line): processed_texts.append((l, preprocessor(line))) # else: # print or not ? # print(line) os_name = get_os_name() if os_name == 'windows': file_dir = 'C:/Corpus/' elif os_name == 'ubuntu': file_dir = '/home/hs/Data/' else: return csv_save(processed_texts, file_dir + filename) if __name__ == '__main__': # from load_data import load_test_data # test_texts, test_labels =load_test_data() # preprocess_tweeets(test_texts, test_labels, 'preprocessed_test_data_nostem_359.csv') # exit() from load_data import load_train_data texts, labels = load_train_data() processed_texts = [] preprocess_tweeets(texts, labels, 'preprocessed_training_data_nostem_160000.csv')
'%(asctime)s %(name)s %(lineno)d [%(levelname)s][%(funcName)s] %(message)s ' ) handler = StreamHandler() handler.setLevel('INFO') handler.setFormatter(log_fmt) logger.addHandler(handler) handler = FileHandler(DIR + 'train.py.log', 'a') handler.setLevel(DEBUG) handler.setFormatter(log_fmt) logger.setLevel(DEBUG) logger.addHandler(handler) logger.info('start') df = load_train_data() x_train = df.drop('target', axis=1) y_train = df['target'].values use_cols = x_train.columns.values logger.debug('train columns: {} {}'.format(use_cols.shape, use_cols)) logger.info('data preparation end {}'.format(x_train.shape)) cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0) all_params = { 'C': [10**i for i in range(-1, 2)], 'fit_intercept': [True, False], 'penalty': ['l2', 'l1'],
from sklearn.model_selection import train_test_split from keras.callbacks import History from keras.metrics import categorical_accuracy from utils import convert_arrays_to_accuracy, TimeHistory import augmentation as aug import numpy as np # m is the number of examples to load from the training dataset. # Reducing m is especially useful when debugging to allow # rapid training runs. For the full dataset, use m=60000. m = 60000 epochs = 150 # n_transforms is the number of transformations to create for each image n_transforms = 10 X, y = load_train_data(m) # Ensure that we always use the same training and cross-validation sets # by always using 1 as the seed for the PRNG. np.random.seed(1) Xtr, Xval, ytr, yval = train_test_split(X, y, train_size=0.6, test_size=0.4) Xtr, ytr = aug.augment_dataset(Xtr, ytr, n_transforms, fixed_seeds=True) model = model_build_dense() model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=[categorical_accuracy]) my_hist = History() time_hist = TimeHistory() # Because we are feeding an already augmented dataset to model.fit, # the training categorical accuracy returned by the model will be
predict_labels = clf.predict(test) dump_picle(predict_labels, './data/predict_labels/predict_labels.p') logger.info('SVM classifier training complete, saved predict labels to pickle') return def logit(train_data, train_labels, test): log_state('Use logistic regression classifier') clf = linear_model.LogisticRegression(C=1e5) clf.fit(train_data, train_labels) predict_labels = clf.predict(test) dump_picle(predict_labels, './data/predict_labels/predict_labels.p') logger.info('MaxEnt classifier training complete, saved predict labels to pickle') return def kNN(train_data, train_labels, test): log_state('Use kNN classifier') clf = KNeighborsClassifier(n_neighbors=5) clf.fit(train_data, train_labels) predict_labels = clf.predict(test) dump_picle(predict_labels, './data/predict_labels/predict_labels.p') logger.info('kNN classifier training complete, saved predict labels to pickle') return if __name__ == "__main__": train_data = load_pickle('./data/transformed_data/transformed_train.p') test = load_pickle('./data/transformed_data/transformed_test.p') _, train_labels = load_train_data() mNB(train_data, train_labels, test)
import numpy as np ############################### # Untar data def untar_data(name, outdir='./data'): my_tar = tarfile.open('./Indoor-scene-recognition/' + name) my_tar.extractall(outdir) my_tar.close() # Uncomment to untar data # untar_data("indoorCVPR_09annotations.tar") # untar_data("indoorCVPR_09.tar") ############################### ############################### # Load data test_data = load_data.load_test_data() train_data = load_data.load_train_data() # Show the data print(test_data.shape) print(train_data.shape) train_i = np.random.choice(train_data.shape[0]) test_i = np.random.choice(test_data.shape[0]) cv2.imshow("example in train", train_data[train_i]) cv2.imshow("example in test", test_data[test_i]) cv2.waitKey(0) ###############################
df['description_' + str(i)] = trans_desc[:, i] title = vec.fit_transform(df['title']) title = np.array(title.todense(), dtype=np.float32) pca.fit(title) trans_title = pca.fit_transform(title) print(pca.explained_variance_ratio_) print(np.cumsum(pca.explained_variance_ratio_)) for i in range(19): df['title_' + str(i)] = trans_title[:, i] return df if __name__ == '__main__': logger.debug('start load train data') df_train = load_train_data() #df_train = df_train.iloc[:10000,:] X_train = df_train.drop(['deal_probability'], axis=1) y_train = df_train['deal_probability'] logger.debug('start load test data') X_test = load_test_data() logger.debug('start fill null') X_train = fill_null(X_train) X_test = fill_null(X_test) X_train["Weekday"] = X_train['activation_date'].dt.weekday X_train["Weekd of Year"] = X_train['activation_date'].dt.week X_train["Day of Month"] = X_train['activation_date'].dt.day X_test["Weekday"] = X_test['activation_date'].dt.weekday
# Fix: the module-level __author__ assignment was duplicated verbatim;
# keep a single assignment.
__author__ = 'NLP-PC'
import feature_generating
import classifiers
import analysis
from load_data import load_train_data
from load_data import load_test_data
from save_data import dump_picle
from vectorizers import TFIDF_estimator, anew_estimator
from analysis import analysis_result
from classifiers import mNB

# Train a multinomial Naive Bayes classifier on TF-IDF features of the
# Sentiment140 training set, predict the test set, and report the results.
print('Start')
vectorizer = TFIDF_estimator()
train_type = 'Sentiment140'
texts, train_labels = load_train_data(train_type)
transformed_train = vectorizer.fit_transform(texts)
testdata, true_labels = load_test_data()
# Transform test data with the vectorizer fitted on the training corpus.
transformed_test = vectorizer.transform(testdata)
predict = mNB(transformed_train, train_labels, transformed_test)
analysis_result(predict, true_labels)
"--train_or_predict", type=bool, default=True, help="train_or_predict") parser.add_argument("-l", "--layer1", type=int, default=1000) parser.add_argument("-ll", "--layer2", type=int, default=200) args = parser.parse_args() #################################################### ##Load data and set up training hyperparameters #################################################### DATA_DIR = './data/' pro_dir = os.path.join(DATA_DIR, 'pro_sg_tag') unique_sid = load_pro_data(os.path.join(pro_dir, 'unique_sid.txt')) n_tags = len(unique_sid) train_data = load_train_data(os.path.join(pro_dir, 'train.csv'), n_tags) vad_data_tr, vad_data_te = load_tr_te_data( os.path.join(pro_dir, 'validation_tr.csv'), os.path.join(pro_dir, 'validation_te.csv'), n_tags) test_data_tr, test_data_te = load_tr_te_data( os.path.join(pro_dir, 'test_tr.csv'), os.path.join(pro_dir, 'test_te.csv'), n_tags) N = train_data.shape[0] N_vad = vad_data_tr.shape[0] N_test = test_data_tr.shape[0] idxlist = list(range(N)) idxlist_vad = range(N_vad) idxlist_test = range(N_test)
log_fmt = Formatter('%(asctime)s %(name)s %(lineno)d [%(levelname)s][%(funcName)s] %(message)s ') handler = StreamHandler() handler.setLevel('INFO') handler.setFormatter(log_fmt) logger.addHandler(handler) handler = FileHandler(DIR + 'train.py.log', 'a') handler.setLevel(DEBUG) handler.setFormatter(log_fmt) logger.setLevel(DEBUG) logger.addHandler(handler) logger.info('start') df_train0 = load_train_data() df_test0 = load_test_data() logger.info('concat train and test datasets: {} {}'.format(df_train0.shape, df_test0.shape)) df_train0['train'] = 1 df_test0['train'] = 0 df = pd.concat([df_train0, df_test0], axis=0, sort=False) logger.info('Data preprocessing') # Drop PoolQC, MiscFeature, Alley and Fence features # because they have more than 80% of missing values. df = df.drop(['Alley','PoolQC','Fence','MiscFeature'],axis=1) object_columns_df = df.select_dtypes(include=['object'])
train_x.shape, train_y.shape)) test_x = nan_train_x.drop(nan_column, axis=1) logger.info('create test data from nan_train_x:{}'.format(test_x.shape)) lr = LinearRegression().fit(train_x, train_y) logger.info('lr fitted') test_y = lr.predict(test_x) logger.info('lr predicted:{}'.format(test_y.shape)) test_x['Age'] = test_y logger.info('test_x.shape:{} test_y.shape:{}'.format( test_x.shape, test_y.shape)) df_x = pd.concat([test_x, non_nan_train_x]) logger.info('df_temp.shape:{} non_nan_train_x.shape:{}'.format( test_x.shape, non_nan_train_x.shape)) return df_x if __name__ == '__main__': logger.info('enter') train_x, train_y = load_train_data() nan_train_x, non_nan_train_x = load_data_nan(train_x, 'Age') logger.info('load_data_nan loaded') logger.info('nan_train_x.shape:{}'.format(nan_train_x.shape)) logger.info('non_nan_train_x.shape:{}'.format(non_nan_train_x.shape)) df_x = nan_data_predict(nan_train_x, non_nan_train_x, 'Age') logger.info('result:{}'.format(df_x.shape)) logger.info('end')
def train(argv=None):
    """Train a multi-filter-size text CNN with the legacy TensorFlow 0.x API.

    Pools the pre-embedded train/test sets together, reshuffles with a fixed
    seed and holds out the first 1000 samples as the test set; builds a
    conv -> 1-max-pool -> dropout -> softmax graph; then runs the training
    loop with periodic evaluation, TensorBoard summaries, and manual
    learning-rate decay driven by validation loss.
    """
    # load data
    print("Loading data ... ")
    x_train, y_train = load_data.load_train_data()
    x_test, y_test = load_data.load_test_data()
    # concatenate and shuffle with a fixed seed so the re-split is reproducible
    x_sum = numpy.concatenate((x_train, x_test))
    y_sum = numpy.concatenate((y_train, y_test))
    numpy.random.seed(10)
    shuffle_indices = numpy.random.permutation(numpy.arange(len(y_sum)))
    x_shuffled = x_sum[shuffle_indices]
    y_shuffled = y_sum[shuffle_indices]
    # split to train and test: first 1000 shuffled samples are held out
    x_train = x_shuffled[1000:]
    y_train = y_shuffled[1000:]
    x_test = x_shuffled[:1000]
    y_test = y_shuffled[:1000]
    print(x_train.shape)
    print(x_test.shape)
    # expand (batch_size,MAX_SENTENCE_LENGTH,EMBEDDING_SIZE) to
    # (batch_size,MAX_SENTENCE_LENGTH,EMBEDDING_SIZE,1): single input channel
    x_train = numpy.expand_dims(x_train, -1)
    x_test = numpy.expand_dims(x_test, -1)
    # one conv branch per filter size, each with its own filter count
    filter_sizes = [2, 3, 4, 5]
    filter_numbers = [300, 200, 100, 50]
    # input
    # input is sentence
    train_data_node = tf.placeholder(tf.float32,
                                     shape=(None, max_document_length,
                                            EMBEDDING_SIZE, NUM_CHANNELS))
    train_labels_node = tf.placeholder(tf.float32, shape=(None, NUM_CLASSES))
    dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
    # full connected - softmax layer over the concatenated pooled features
    fc1_weights = tf.Variable(
        tf.truncated_normal([sum(filter_numbers), NUM_CLASSES],
                            stddev=0.1,
                            seed=SEED,
                            dtype=tf.float32))
    fc1_biases = tf.Variable(
        tf.constant(0.1, shape=[NUM_CLASSES], dtype=tf.float32))

    # model
    def model(data):
        """Forward pass: parallel convs -> 1-max pool -> concat -> dropout -> FC.

        NOTE(review): the `data` argument is never used; the graph reads the
        `train_data_node` placeholder directly -- confirm this is intended.
        """
        pooled_outputs = []
        for idx, filter_size in enumerate(filter_sizes):
            conv = conv2d(train_data_node, filter_numbers[idx], filter_size,
                          EMBEDDING_SIZE, name="kernel%d" % idx)
            # 1-max pooling,leave a tensor of shape[batch_size,1,1,num_filters]
            pool = tf.nn.max_pool(
                conv,
                ksize=[1, max_document_length - filter_size + 1, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID')
            pooled_outputs.append(tf.squeeze(pool))
        if len(filter_sizes) > 1:
            # legacy tf.concat argument order: (axis, values)
            cnn_output = tf.concat(1, pooled_outputs)
        else:
            cnn_output = pooled_outputs[0]
        # add dropout
        reshape = tf.nn.dropout(cnn_output, dropout_keep_prob)
        # fc1 layer
        fc1_output = tf.matmul(reshape, fc1_weights) + fc1_biases
        return fc1_output

    # Training computation
    logits = model(train_data_node)
    # NOTE(review): logits are clipped to [1e-10, 1.0] before the softmax
    # cross-entropy (positional legacy signature); clipping raw logits this
    # way looks like it was meant for probabilities -- verify against the TF
    # version in use.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            tf.clip_by_value(logits, 1e-10, 1.0), train_labels_node))
    # L2 regularization for the fully connected parameters.
    regularizers = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases))
    loss += 0.05 * regularizers
    tf.scalar_summary('loss', loss)
    # optimizer: Adam with a manually decayed learning-rate variable
    global_step = tf.Variable(0, name="global_step", trainable=False)
    learning_rate = tf.Variable(start_learning_rate, name="learning_rate")
    # learning_rate=tf.train.exponential_decay(start_learning_rate,global_step*BATCH_SIZE,train_size,0.9,staircase=True)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    grads_and_vars = optimizer.compute_gradients(loss)
    train_op = optimizer.apply_gradients(grads_and_vars,
                                         global_step=global_step)
    # Evaluate model
    train_predict = tf.argmax(logits, 1)
    train_label = tf.argmax(train_labels_node, 1)
    # train accuracy
    train_correct_pred = tf.equal(train_predict, train_label)
    train_accuracy = tf.reduce_mean(tf.cast(train_correct_pred, tf.float32))
    tf.scalar_summary('acc', train_accuracy)
    merged = tf.merge_all_summaries()

    def compute_index(y_label, y_predict):
        """Print accuracy/recall/F1 with macro, micro and weighted averaging."""
        # macro
        print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
            "macro", accuracy_score(y_label, y_predict),
            recall_score(y_label, y_predict, average='macro'),
            f1_score(y_label, y_predict, average='macro')))
        # micro
        print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
            "micro", accuracy_score(y_label, y_predict),
            recall_score(y_label, y_predict, average='micro'),
            f1_score(y_label, y_predict, average='micro')))
        # weighted
        print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
            "weighted", accuracy_score(y_label, y_predict),
            recall_score(y_label, y_predict, average='weighted'),
            f1_score(y_label, y_predict, average='weighted')))

    def dev_step(x_batch, y_batch, best_test_loss, sess):
        """Evaluate one batch with dropout disabled; maybe decay the LR.

        Returns the updated best validation loss.
        """
        feed_dict = {
            train_data_node: x_batch,
            train_labels_node: y_batch,
            dropout_keep_prob: 1.0
        }
        # Run the graph and fetch some of the nodes.
        # test dont apply train_op (train_op is update gradient).
        summary, step, losses, lr, acc, y_label, y_predict = sess.run(
            [
                merged, global_step, loss, learning_rate, train_accuracy,
                train_label, train_predict
            ],
            feed_dict=feed_dict)
        test_writer.add_summary(summary, step)
        time_str = datetime.datetime.now().isoformat()
        print("{}: step {}, loss {:g}, lr {:g} ,acc {:g}".format(
            time_str, step, losses, lr, acc))
        # print("{}: step {}, loss {:g} ,acc {:g}".format(time_str, step, losses,acc))
        # compute index
        compute_index(y_label, y_predict)
        new_best_test_loss = best_test_loss
        # decide if need to decay learning rate
        if (step % steps_each_check < 100) and (step > 100):
            loss_delta = (best_test_loss
                          if best_test_loss is not None else 0) - losses
            if best_test_loss is not None and loss_delta < decay_delta:
                print(
                    'validation loss did not improve enough, decay learning rate'
                )
                # Multiply the LR by the decay factor, floored at the minimum.
                current_learning_rate = min_learning_rate if lr * learning_rate_decay < min_learning_rate else lr * learning_rate_decay
                if current_learning_rate == min_learning_rate:
                    print('It is already the smallest learning rate.')
                sess.run(learning_rate.assign(current_learning_rate))
                print('new learning rate is: ', current_learning_rate)
            else:
                # update
                new_best_test_loss = losses
        return new_best_test_loss

    # run the training
    with tf.Session() as sess:
        train_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/train',
                                              sess.graph)
        test_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/test')
        tf.initialize_all_variables().run()
        print('Initialized!')
        # Generate batches
        batches = data_helpers.batch_iter(list(zip(x_train, y_train)),
                                          BATCH_SIZE, NUM_EPOCHS)
        # batch count
        batch_count = 0
        best_test_loss = None
        # Training loop. For each batch...
        for batch in batches:
            batch_count += 1
            if batch_count % EVAL_FREQUENCY == 0:
                # periodic evaluation on the held-out split
                print("\nEvaluation:")
                best_test_loss = dev_step(x_test, y_test, best_test_loss,
                                          sess)
                print("")
            else:
                if batch_count % META_FREQUENCY == 99:
                    # training step with full trace metadata for TensorBoard
                    x_batch, y_batch = zip(*batch)
                    feed_dict = {
                        train_data_node: x_batch,
                        train_labels_node: y_batch,
                        dropout_keep_prob: 0.5
                    }
                    # Run the graph and fetch some of the nodes.
                    # option
                    run_options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()
                    _, summary, step, losses, acc = sess.run(
                        [train_op, merged, global_step, loss, train_accuracy],
                        feed_dict=feed_dict,
                        options=run_options,
                        run_metadata=run_metadata)
                    train_writer.add_run_metadata(run_metadata,
                                                  'step%03d' % step)
                    train_writer.add_summary(summary, step)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g},acc {:g}".format(
                        time_str, step, losses, acc))
                else:
                    # plain training step
                    x_batch, y_batch = zip(*batch)
                    feed_dict = {
                        train_data_node: x_batch,
                        train_labels_node: y_batch,
                        dropout_keep_prob: 0.5
                    }
                    # Run the graph and fetch some of the nodes.
                    _, summary, step, losses, acc = sess.run(
                        [train_op, merged, global_step, loss, train_accuracy],
                        feed_dict=feed_dict)
                    train_writer.add_summary(summary, step)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, step, losses, acc))
        train_writer.close()
        test_writer.close()
# NOTE(review): imports deduplicated — the original imported VotingClassifier
# three times — and the sklearn names this script actually uses
# (train_test_split, svm, metrics) are now imported explicitly.
from sklearn import metrics, svm
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              GradientBoostingClassifier, VotingClassifier)
from sklearn.model_selection import GridSearchCV, train_test_split
import xgboost as xg

from load_data import load_test_data, load_train_data
from data_cleaning import clean_data

# Load and clean the training data.
data = clean_data(load_train_data())

# Split into training/testing sets, stratified on the target so both
# splits keep the same 'Survived' class balance.
train, test = train_test_split(
    data, test_size=0.3, random_state=0, stratify=data['Survived'])
# First column is the target; the rest are features — TODO confirm that
# clean_data keeps 'Survived' as the first column.
train_X = train[train.columns[1:]]
train_Y = train[train.columns[:1]]
test_X = test[test.columns[1:]]
test_Y = test[test.columns[:1]]
X = data[data.columns[1:]]
Y = data['Survived']

# Radial Support Vector Machine (rbf-SVM).
model = svm.SVC(kernel='rbf', C=1, gamma=0.1)
# .values.ravel() passes y as a 1-D array instead of a single-column
# DataFrame, avoiding sklearn's DataConversionWarning; the fit is identical.
model.fit(train_X, train_Y.values.ravel())
prediction1 = model.predict(test_X)
print('Accuracy for rbf SVM is ', metrics.accuracy_score(prediction1, test_Y))
min_params = xgb_gs(xg_params, xg_trn, trn_y, xg_val, val_y, wl=watchlist) model = xgb.train(min_params, xg_trn, num_boost_round=5000, evals=watchlist, early_stopping_rounds=100, verbose_eval=50) return model if __name__ == '__main__': logger.info('Start') train_df = load_train_data(nrows=100) logger.info('train load end {}'.format(train_df.shape)) test_df = load_test_data(nrows=100) logger.info('test load end {}'.format(test_df.shape)) # Labels train_y = train_df["deal_probability"].values test_id = test_df["item_id"].values # Feature Weekday train_df["activation_weekday"] = train_df["activation_date"].dt.weekday test_df["activation_weekday"] = test_df["activation_date"].dt.weekday # Label encode the categorical variables cat_vars = [
# Keyword arguments for the stemmed TF-IDF vectorizer, pulled from the
# globally configured `parameters` dict and the shared `preprocessor`.
vectorizer_param = {
    'preprocessor': preprocessor,
    'ngram_range': parameters['ngram_range'],
    'analyzer': 'word',
    'min_df': parameters['min_df'],
    'max_df': parameters['max_df'],
    'binary': parameters['TF_binary'],
    'norm': parameters['norm'],
    'sublinear_tf': parameters['sublinear_tf'],
    'max_features': parameters['max_features'],
}

if __name__ == "__main__":
    stemmed_unigrams = StemmedTfidfVectorizer(**vectorizer_param)
    # Alternative affective/punctuation extractors; only avg_strength is
    # combined into the union below.
    anew = anew_vectorizer()
    pct = punctuation_estimator()
    strength = strength_vectorizer()
    avg_strength = avg_affective_vectorizer()

    log_state('combine unigram and avg strength features')
    union = FeatureUnion([('unigram', stemmed_unigrams),
                          ('avg_strength', avg_strength)])
    # Other combinations previously tried (kept for reference):
    #   unigram + strength, unigram + anew, unigram + pct

    # Fit on the training corpus, then apply the same transform to test data.
    train_texts, _ = load_train_data('Sentiment140')
    train_matrix = union.fit_transform(train_texts)
    test_texts, _ = load_test_data()
    test_matrix = union.transform(test_texts)

    # Persist feature names and the transformed matrices for later stages.
    dump_picle(union.get_feature_names(), './data/features/feature_names.p')
    dump_picle(train_matrix, "./data/transformed_data/transformed_train.p")
    dump_picle(test_matrix, "./data/transformed_data/transformed_test.p")
df_tmp['feat_i'] = df_tmp['feat_i'] / df_tmp['feat_i'].sum() # for i in range(len(df_tmp.index)): for i in range(15): logger.debug('\t{0:20s} : {1:>10.6f}'.format( df_tmp.ix[i, 0], df_tmp.ix[i, 1])) return model if __name__ == '__main__': logger.info('Start') # temp1_df = load_train_data(nrows=ROW) # temp2_df = pd.read_csv('../input/city_population_wiki_v3.csv') # train_df = pd.merge(temp1_df, temp2_df, on='city', how='left') # del temp1_df, temp2_df train_df = load_train_data(nrows=ROW) logger.info('Train Data load end {}'.format(train_df.shape)) test_df = load_test_data(nrows=ROW) logger.info('test load end {}'.format(test_df.shape)) # test_df = load_period_train_data(nrows=ROW) # logger.info('period train load end {}'.format(test_df.shape)) # pr_test_df = load_period_test_data(nrows=ROW) # logger.info('period test load end {}'.format(pr_test_df.shape)) # test_df = load_train_act_data(nrows=ROW) # tmp_df = pd.read_csv(TRN_PRED_FILE, index_col=['item_id']) # trn_act_df = load_train_act_data(nrows=ROW) # trn_act_df = trn_act_df.join(tmp_df, how='left')
def train(): df = load_train_data( ) # .sample(10000000, random_state=42).reset_index(drop=True) logger.info('train data size {}'.format(df.shape)) cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=871) train, test = next(cv.split(df, df.is_attributed)) x_train = df.drop(['is_attributed', 'click_id'], axis=1).astype( np.float32) # .loc[train].reset_index(drop=True) y_train = df.is_attributed.astype(int) # .values[train] df = load_valid_data( ) # .sample(x_train.shape[0], random_state=42).reset_index(drop=True) logger.info('valid data size {}'.format(df.shape)) x_valid = df.drop(['is_attributed', 'click_id'], axis=1).astype( np.float32) # .loc[test].reset_index(drop=True) y_valid = df.is_attributed.astype(int) # .values[test] del df gc.collect() usecols = x_train.columns.values with open(DIR + 'usecols.pkl', 'wb') as f: pickle.dump(usecols, f, -1) cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=871) # {'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_bin': 255, 'max_depth': -1, 'metric': 'auc', 'min_child_weight': 20, 'min_split_gain': 0, 'num_leaves': 127, 'objective': 'binary', 'reg_alpha': 0, 'scale_pos_weight': 1, 'seed': 114, 'subsample': 1.0, 'subsample_freq': 1, 'verbose': -1} all_params = { 'min_child_weight': [20], 'subsample': [1], 'subsample_freq': [1], 'seed': [114], 'colsample_bytree': [0.9], 'learning_rate': [0.1], 'max_depth': [-1], 'min_split_gain': [0], 'reg_alpha': [0], 'max_bin': [255], 'num_leaves': [127], 'objective': ['binary'], 'metric': ['auc'], 'scale_pos_weight': [1], 'verbose': [-1], #'device': ['gpu'], 'drop': [None] + list(range(0, len(usecols))) } use_score = 0 min_score = (100, 100, 100) drop_cols = [] import copy for params in tqdm(list(ParameterGrid(all_params))): cnt = -1 list_score = [] list_score2 = [] list_best_iter = [] all_pred = np.zeros(y_train.shape[0]) if 1: cnt += 1 trn_x = x_train.copy() val_x = x_valid.copy() trn_y = y_train val_y = y_valid _params = copy.deepcopy(params) drop_idx = 
_params.pop('drop') if drop_idx is not None: drop_col = drop_cols + [usecols[drop_idx]] else: drop_col = [] params['drop'] = drop_col trn_x.drop(drop_col, axis=1, inplace=True) val_x.drop(drop_col, axis=1, inplace=True) cat_feat = CAT_FEAT cols = trn_x.columns.values.tolist() train_data = lgb.Dataset( trn_x.values.astype(np.float32), label=trn_y, # categorical_feature=cat_feat, feature_name=cols) test_data = lgb.Dataset( val_x.values.astype(np.float32), label=val_y, # categorical_feature=cat_feat, feature_name=cols) del trn_x gc.collect() clf = lgb.train( _params, train_data, 10, # params['n_estimators'], early_stopping_rounds=30, valid_sets=[test_data], # feval=cst_metric_xgb, # callbacks=[callback], verbose_eval=10) pred = clf.predict(val_x.values.astype(np.float32)) # all_pred[test] = pred _score2 = log_loss(val_y, pred) _score = -roc_auc_score(val_y, pred) logger.info(f'drop: {drop_col}') logger.info(' _score: %s' % _score) logger.info(' _score2: %s' % _score2) list_score.append(_score) list_score2.append(_score2) score = (np.mean(list_score), np.min(list_score), np.max(list_score)) score2 = (np.mean(list_score2), np.min(list_score2), np.max(list_score2)) if min_score[use_score] > score[use_score]: min_score = score min_params = params drop_cols = drop_col logger.info('best score: {} {}'.format(min_score[use_score], min_score)) logger.info('best params: {}'.format(min_params))