def plot_cm_roc(self, data_loader):
    print("*****************Calculating Confusion Matrix*****************")
    save_path = os.path.join(self.output_model_dir, self.config.model_name,
                             self.config.file_type, "data/")
    classes = np.arange(self.num_classes)

    probOutput, trueOutput, predictedOutput = self.test(data_loader)
    trueOutput = prepareData(trueOutput)
    predictedOutput = prepareData(predictedOutput)
    probOutput = prepareData(probOutput)
    one_hot_true_out = one_hot(trueOutput)

    normalized_confusion_matrix(trueOutput, predictedOutput, classes, save_path)
    plot_roc_curve(one_hot_true_out, probOutput, classes, save_path)

    if not self.config.debug:
        cm = Image.open(save_path + "confusion_matrix.png")
        roc = Image.open(save_path + "roc_curve.png")
        wandb.log({"Confusion Matrix": [wandb.Image(cm, caption="Confusion Matrix")]})
        wandb.log({"ROC Curve": [wandb.Image(roc, caption="ROC Curve")]})
        # wandb.sklearn.plot_confusion_matrix(trueOutput, predictedOutput, classes)
        # wandb.sklearn.plot_roc(one_hot_true_out, probOutput, classes)
    return None
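# The method above relies on helpers that are not shown. A minimal sketch of
# what a plot_roc_curve(one_hot_true, probs, classes, save_path) helper might
# look like, using sklearn and matplotlib; the name and signature are taken
# from the call site above, not from the authors' implementation:
import os
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

def plot_roc_curve(one_hot_true, probs, classes, save_path):
    """Plot one-vs-rest ROC curves for every class and save the figure."""
    plt.figure()
    for i, cls in enumerate(classes):
        fpr, tpr, _ = roc_curve(one_hot_true[:, i], probs[:, i])
        plt.plot(fpr, tpr, label="class {} (AUC = {:.3f})".format(cls, auc(fpr, tpr)))
    plt.plot([0, 1], [0, 1], linestyle="--", color="gray")  # chance diagonal
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend(loc="lower right")
    plt.savefig(os.path.join(save_path, "roc_curve.png"))
    plt.close()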
def _train_party_classifier(self, force: bool = False):
    """
    Trains a classifier to predict political party from moral relevance
    weight vectors.
    :param force: Trains and overwrites classifier even if already available.
    :return: Fitted party classifier.
    """
    pp_model_path = "data/party_predictor.pkl"
    pp_predictor = None

    # Build model predicting political party from moral value scores.
    if force or not os.path.isfile(pp_model_path):
        df = self._users_df.sample(frac=1)
        df.mv_scores = df.mv_scores.values / df.num_words.values
        df.loc[df.party == "Libertarians", "party"] = "Republican Party"
        class_names = ["Republican Party", "Democratic Party"]

        x = np.asarray([np.asarray(v) for v in df.mv_scores.values])
        le = preprocessing.LabelEncoder()
        le.fit(class_names)
        y = le.transform(df.party.values)

        for train_index, test_index in StratifiedShuffleSplit(
                n_splits=1, test_size=0.5).split(x, y):
            x_train, x_test = x[train_index], x[test_index]
            y_train, y_test = y[train_index], y[test_index]

            pp_predictor = xgb.XGBClassifier(objective='binary:logistic',
                                             colsample_bytree=0.7,
                                             learning_rate=0.05,
                                             n_estimators=6000,
                                             n_jobs=0,
                                             nthread=0)
            pp_predictor.fit(x_train, y_train)
            pickle.dump(pp_predictor, open(pp_model_path, "wb"))

            y_pred = pp_predictor.predict(x_test)
            print(classification_report(y_test, y_pred, target_names=class_names))

            utils.plot_precision_recall_curve(y_test, y_pred)
            utils.plot_roc_curve(y_test, y_pred, 2)
            utils.plot_confusion_matrix(
                y_test, y_pred, ["Republican Party", "Democratic Party"],
                title="Confusion Matrix")

        # scores = cross_val_score(pp_predictor, x, y, cv=20, scoring='f1_macro')
        # print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    # Load previously built model.
    else:
        pp_predictor = pickle.load(open(pp_model_path, "rb"))

    return pp_predictor
def test():
    test_data = get_test_data()
    x = test_data[0]
    y = test_data[1]

    # Recreate the model.
    model = DeepSEA()
    model.compile(optimizer=tf.keras.optimizers.SGD(momentum=0.9),
                  loss=tf.keras.losses.BinaryCrossentropy())
    model.build(input_shape=(None, 1000, 4))
    model.summary()

    # Load the weights of the old model. The checkpoint holds both the model
    # weights and the optimizer state. Because TensorFlow delays the creation
    # of variables in the model and optimizer, the optimizer state is only
    # restored once the model has been trained at least once, e.g.:
    # model.train_on_batch(x[0:1], y[0:1])
    model.load_weights('./result/model/ckpt')
    # model.load_weights('./result/model/bestmodel.h5')

    result = model.predict(x)  # shape = (455024, 919)
    np.savez('./result/test_result.npz', result=result, label=y)

    # Average predictions over the two halves of the test set (227512 each).
    result = np.mean((result[0:227512], result[227512:]), axis=0)
    result_shape = np.shape(result)
    y = y[0:227512]

    fpr_list, tpr_list, auroc_list = [], [], []
    precision_list, recall_list, aupr_list = [], [], []
    for i in tqdm(range(result_shape[1]), ascii=True):
        fpr_temp, tpr_temp, auroc_temp = calculate_auroc(result[:, i], y[:, i])
        precision_temp, recall_temp, aupr_temp = calculate_aupr(result[:, i], y[:, i])

        fpr_list.append(fpr_temp)
        tpr_list.append(tpr_temp)
        precision_list.append(precision_temp)
        recall_list.append(recall_temp)
        auroc_list.append(auroc_temp)
        aupr_list.append(aupr_temp)

    plot_roc_curve(fpr_list, tpr_list, './result/')
    plot_pr_curve(precision_list, recall_list, './result/')

    header = np.array([['auroc', 'aupr']])
    content = np.stack((auroc_list, aupr_list), axis=1)
    content = np.concatenate((header, content), axis=0)
    write2csv(content, './result/result.csv')
    write2txt(content, './result/result.txt')

    avg_auroc = np.nanmean(auroc_list)
    avg_aupr = np.nanmean(aupr_list)
    print('AVG-AUROC:{:.3f}, AVG-AUPR:{:.3f}.\n'.format(avg_auroc, avg_aupr))
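# calculate_auroc and calculate_aupr are assumed helpers; a minimal sketch
# using sklearn.metrics, with signatures inferred from the call sites above.
# Returning NaN for single-class columns matches the np.nanmean usage:
import numpy as np
from sklearn.metrics import roc_curve, precision_recall_curve, auc

def calculate_auroc(predictions, labels):
    """Return (fpr, tpr, auroc) for one output column; NaN if one class only."""
    if len(np.unique(labels)) < 2:
        return None, None, float('nan')
    fpr, tpr, _ = roc_curve(labels, predictions)
    return fpr, tpr, auc(fpr, tpr)

def calculate_aupr(predictions, labels):
    """Return (precision, recall, aupr) for one output column."""
    if len(np.unique(labels)) < 2:
        return None, None, float('nan')
    precision, recall, _ = precision_recall_curve(labels, predictions)
    return precision, recall, auc(recall, precision)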
def test():
    dataset_test = get_test_data(64)

    model = DanQ()
    loss_object = keras.losses.BinaryCrossentropy()
    optimizer = keras.optimizers.Adam()
    trainer = Trainer(model=model,
                      loss_object=loss_object,
                      optimizer=optimizer,
                      experiment_dir='./result/DanQ')

    result, label = trainer.test(dataset_test,
                                 test_steps=int(np.ceil(455024 / 64)),
                                 dis_show_bar=True)

    # Average predictions over the two halves of the test set (227512 each).
    result = np.mean((result[0:227512], result[227512:]), axis=0)
    result_shape = np.shape(result)
    label = label[0:227512]

    fpr_list, tpr_list, auroc_list = [], [], []
    precision_list, recall_list, aupr_list = [], [], []
    for i in tqdm(range(result_shape[1]), ascii=True):
        fpr_temp, tpr_temp, auroc_temp = calculate_auroc(result[:, i], label[:, i])
        precision_temp, recall_temp, aupr_temp = calculate_aupr(result[:, i], label[:, i])

        fpr_list.append(fpr_temp)
        tpr_list.append(tpr_temp)
        precision_list.append(precision_temp)
        recall_list.append(recall_temp)
        auroc_list.append(auroc_temp)
        aupr_list.append(aupr_temp)

    plot_roc_curve(fpr_list, tpr_list, './result/DanQ/')
    plot_pr_curve(precision_list, recall_list, './result/DanQ/')

    header = np.array([['auroc', 'aupr']])
    content = np.stack((auroc_list, aupr_list), axis=1)
    content = np.concatenate((header, content), axis=0)
    write2csv(content, './result/DanQ/result.csv')
    write2txt(content, './result/DanQ/result.txt')

    avg_auroc = np.nanmean(auroc_list)
    avg_aupr = np.nanmean(aupr_list)
    print('AVG-AUROC:{:.3f}, AVG-AUPR:{:.3f}.\n'.format(avg_auroc, avg_aupr))
def main():
    emotionals, rationals = emotional_rational()
    preprocessor = Preprocessor()
    emotionals = preprocessor.parse_sentences(emotionals)
    rationals = preprocessor.parse_sentences(rationals)

    # Split each class in half for train/test.
    train_pos = emotionals[:len(emotionals) // 2]
    train_neg = rationals[:len(rationals) // 2]
    test_pos = emotionals[len(emotionals) // 2:]
    test_neg = rationals[len(rationals) // 2:]

    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(train_pos + train_neg)
    y_train = np.array([1] * len(train_pos) + [0] * len(train_neg))
    X_test = vectorizer.transform(test_pos + test_neg)
    y_test = np.array([1] * len(test_pos) + [0] * len(test_neg))
    print('Vocabulary size : {}'.format(len(vectorizer.vocabulary_)))

    nbsvm = NBSVM()
    nbsvm.fit(X_train, y_train)
    print('Test accuracy : {}'.format(nbsvm.score(X_test, y_test)))

    y_pred = nbsvm.predict(X_test)
    print('F1 score : {}'.format(f1_score(y_test, y_pred, average='macro')))

    fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=1)
    roc_auc = auc(fpr, tpr)
    print('AUC of emotionals : {}'.format(roc_auc))
    plot_roc_curve(fpr, tpr, roc_auc, 'nbsvm_emotional_roc.png')

    fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=0)
    roc_auc = auc(fpr, tpr)
    print('AUC of rationals : {}'.format(roc_auc))
    plot_roc_curve(fpr, tpr, roc_auc, 'nbsvm_rational_roc.png')
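# Note: roc_curve above is fed hard 0/1 predictions, which yields a
# three-point ROC. A smoother, more informative curve needs continuous
# scores; if this NBSVM exposes an sklearn-style decision_function (an
# assumption, not confirmed by the snippet), that would look like:
# y_score = nbsvm.decision_function(X_test)
# fpr, tpr, thresholds = roc_curve(y_test, y_score, pos_label=1)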
"""Testing A Simple Prediction""" #print("Feature vector: %s" % X_test[:1]) print("Label: %s" % str(y_test[0])) print("Predicted: %s" % str(net0.predict(X_test[:1]))) """Metrics""" # layer_info = PrintLayerInfo() # net0.verbose = 3 # net0.initialize() #print layer_info(net0) print "[Classification Report]: " print classification_report(y_test, predicted) print "[Train dataset] Score: ", net0.score(X_train, y_train) print "[Test dataset] Score: ", net0.score(X_test, y_test) plot_matrix(net0, X_test, y_test, filename) valid_accuracies = np.array([i["valid_accuracy"] for i in net0.train_history_]) plot_accuracy(valid_accuracies, filename) train_loss = [row['train_loss'] for row in net0.train_history_] valid_loss = [row['valid_loss'] for row in net0.train_history_] plot_loss(valid_loss, train_loss, filename) y_score = net0.predict_proba(X_test) #[:, 1] y_test_bin = np.array(label_binarize(y_test, classes=np.unique(y))) n_classes = y_test_bin.shape[1] plot_roc_curve(n_classes, y_test_bin, y_score, filename=filename)
    gt_labels = np.array(gt_labels)
    return logits, gt_labels


if __name__ == '__main__':
    model = NaiveBayes()
    tokenizer = stemmedTokenizer
    model.create_dict(json_reader("col774_yelp_data/train.json"), tokenizer)
    model.train(json_reader("col774_yelp_data/train.json"), tokenizer)

    # outputs = model.predict(json_reader("col774_yelp_data/test.json"), tokenizer)
    # f = open("outputs_stemmed_test.pickle", "wb")
    # pickle.dump(outputs, f)
    # f.close()

    logits, gt_labels = _load_object("outputs_stemmed_test.pickle")
    conf_matrix = create_confusion_matrix(logits, gt_labels)
    print(calc_accuracy(logits, gt_labels) * 100)
    print(conf_matrix)
    plot_confusion_matrix(conf_matrix, model.classes)

    probs = logits_to_prob_vector(logits)
    plot_roc_curve(probs, gt_labels)  # plot from probabilities, not raw logits
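# logits_to_prob_vector is an assumed helper; a minimal numerically stable
# softmax sketch, consistent with Naive Bayes log-posterior scores:
import numpy as np

def logits_to_prob_vector(logits):
    """Convert per-class log scores (n_samples, n_classes) to probabilities."""
    logits = np.asarray(logits, dtype=float)
    shifted = logits - logits.max(axis=1, keepdims=True)  # avoid overflow in exp
    exp = np.exp(shifted)
    return exp / exp.sum(axis=1, keepdims=True)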
def main():
    args = parse_args()

    # set random seed
    utils.seed_torch(args.seed)

    # Setup CUDA, GPU
    if not torch.cuda.is_available():
        print("cuda is not available")
        exit(0)
    else:
        args.device = torch.device("cuda")
        args.n_gpus = torch.cuda.device_count()
        print(f"available cuda: {args.n_gpus}")

    # Setup model
    model = MelanomaNet(arch=args.arch)
    if args.n_gpus > 1:
        model = torch.nn.DataParallel(module=model)
    model.to(args.device)
    model_path = f'{configure.MODEL_PATH}/{args.arch}_fold_{args.fold}.pth'

    # Setup data
    total_batch_size = args.per_gpu_batch_size * args.n_gpus
    train_loader, valid_loader = datasets.get_dataloader(
        image_dir=configure.TRAIN_IMAGE_PATH,
        fold=args.fold,
        batch_size=total_batch_size,
        num_workers=args.num_workers)

    # define loss function (criterion) and optimizer
    criterion = torch.nn.BCEWithLogitsLoss()
    # criterion = MarginFocalBCEWithLogitsLoss()
    optimizer = torch.optim.AdamW(model.parameters(),
                                  lr=args.learning_rate,
                                  weight_decay=args.weight_decay)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

    # Train the model
    current_time = datetime.now().strftime('%b%d_%H_%M_%S')
    log_dir = f'{configure.TRAINING_LOG_PATH}/{args.arch}_fold_{args.fold}_{current_time}'
    tb_writer = None
    if args.log:
        tb_writer = SummaryWriter(log_dir=log_dir)

    print(f'training started: {current_time}')
    best_score = 0.0
    for epoch in range(args.epochs):
        train_loss = train(dataloader=train_loader,
                           model=model,
                           criterion=criterion,
                           optimizer=optimizer,
                           args=args)
        valid_loss, y_true, y_score = valid(dataloader=valid_loader,
                                            model=model,
                                            criterion=criterion,
                                            args=args)
        valid_score = roc_auc_score(y_true=y_true, y_score=y_score)
        learning_rate = scheduler.get_lr()[0]

        if args.log:
            tb_writer.add_scalar("learning_rate", learning_rate, epoch)
            tb_writer.add_scalar("Loss/train", train_loss, epoch)
            tb_writer.add_scalar("Loss/valid", valid_loss, epoch)
            tb_writer.add_scalar("Score/valid", valid_score, epoch)

            # Log the ROC curve as an image summary.
            figure = utils.plot_roc_curve(y_true=y_true, y_score=y_score)
            figure = utils.plot_to_image(figure)
            tb_writer.add_image("ROC curve", figure, epoch)

        if valid_score > best_score:
            best_score = valid_score
            state = {
                'state_dict': model.module.state_dict(),
                'train_loss': train_loss,
                'valid_loss': valid_loss,
                'valid_score': valid_score
            }
            torch.save(state, model_path)

        current_time = datetime.now().strftime('%b%d_%H_%M_%S')
        print(f"epoch:{epoch:02d}, "
              f"train:{train_loss:0.3f}, valid:{valid_loss:0.3f}, "
              f"score:{valid_score:0.3f}, best:{best_score:0.3f}, date:{current_time}")

        scheduler.step()

    current_time = datetime.now().strftime('%b%d_%H_%M_%S')
    print(f'training finished: {current_time}')

    if args.log:
        tb_writer.close()
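# utils.plot_to_image is an assumed helper that renders a matplotlib figure
# into an array SummaryWriter.add_image accepts (CHW by default); a sketch:
import io
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

def plot_to_image(figure):
    """Render a matplotlib figure to a (C, H, W) uint8 array and close it."""
    buf = io.BytesIO()
    figure.savefig(buf, format='png')
    plt.close(figure)
    buf.seek(0)
    image = np.array(Image.open(buf).convert('RGB'))  # H x W x C
    return image.transpose(2, 0, 1)  # HWC -> CHW for add_image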
net0.fit(X_train, y_train)
predicted = net0.predict(X_test)

"""Testing A Simple Prediction"""
# print("Feature vector: %s" % X_test[:1])
print("Label: %s" % str(y_test[0]))
print("Predicted: %s" % str(net0.predict(X_test[:1])))

"""Metrics"""
# layer_info = PrintLayerInfo()
# net0.verbose = 3
# net0.initialize()
# print(layer_info(net0))
print("[Classification Report]: ")
print(classification_report(y_test, predicted))
print("[Train dataset] Score: ", net0.score(X_train, y_train))
print("[Test dataset] Score: ", net0.score(X_test, y_test))
plot_matrix(net0, X_test, y_test, filename)

valid_accuracies = np.array([i["valid_accuracy"] for i in net0.train_history_])
plot_accuracy(valid_accuracies, filename)

train_loss = [row['train_loss'] for row in net0.train_history_]
valid_loss = [row['valid_loss'] for row in net0.train_history_]
plot_loss(valid_loss, train_loss, filename)

y_score = net0.predict_proba(X_test)  # [:, 1]
y_test_bin = np.array(label_binarize(y_test, classes=np.unique(y)))
n_classes = y_test_bin.shape[1]
plot_roc_curve(n_classes, y_test_bin, y_score, filename=filename)
t0 = time()
grid_params = {'C': [0.1, 0.5, 1, 2, 3, 4, 5, 10, 15, 20, 30, 100]}
gs = GridSearchCV(SVC(kernel='rbf', probability=True),
                  grid_params,
                  verbose=1,
                  cv=5,
                  n_jobs=-1)
gs_results = gs.fit(X_train, y_train)
print("SVM Training done in %0.3fs\n" % (time() - t0))
print("Best estimator after cross validation:")
print("C-support - %s\n" % gs.best_estimator_.C)  # C may be fractional

# Testing
t0 = time()
y_pred = gs.predict(X_test)
print("SVM Testing done in %0.3fs\n" % (time() - t0))

# ROC Curve plot
probs = gs.predict_proba(X_test)
probs = probs[:, 1]  # probability of the positive class
auc = metrics.roc_auc_score(y_test, probs)
print('AUC: %.2f\n' % auc)
fpr, tpr, thresholds = metrics.roc_curve(y_test, probs)
utils.plot_roc_curve('SVM ROC', fpr, tpr)

# Confusion Matrix
print('SVM Confusion Matrix')
print('-------------------------')
print(confusion_matrix(y_test, y_pred))
    cs, scores = [], []  # track tried C values and their mean CV scores
    for c in np.random.uniform(1, 50, 50):
        model = SVC(C=c)  # , probability=True
        cv_score = cross_val_score(model, x_train, y_train, cv=4, scoring='f1_micro')
        cs.append(c)
        scores.append(np.mean(cv_score))
        print('C: {}. CV score. Mean: {}. Sd: {}'.format(
            c, np.mean(cv_score), np.std(cv_score)))
    print(list(zip(cs, scores)))

elif do == 'test':
    model.fit(x_train, y_train)
    test_score = model.score(x_test, y_test)
    print('Test score: {}'.format(test_score))

    class_dict_inv = {v: k for k, v in class_dict.items()}
    y_pred = model.predict(x_test)
    f1s = f1_score(y_test, y_pred, average=None)
    print('F1 scores:')
    for k, i in class_dict.items():
        print('{}: {}'.format(k, f1s[i]))

    plot_confusion_matrix(
        y_pred, y_test,
        [class_dict_inv[i] for i in range(len(class_dict))],
        normalize=True)

    y_pred = model.predict_proba(x_test)
    plot_roc_curve(y_pred, y_test, class_dict)
}
gs = GridSearchCV(KNeighborsClassifier(),
                  grid_params,
                  verbose=1,
                  cv=5,
                  n_jobs=-1)
gs_results = gs.fit(X_train, y_train)
print("KNN Training done in %0.3fs\n" % (time() - t0))
print("Best estimator after cross validation:")
print("Metric - %s\nK - %d\n" % (gs.best_estimator_.metric,
                                 gs.best_estimator_.n_neighbors))

# Testing
t0 = time()
y_pred = gs.predict(X_test)
print("KNN Testing done in %0.3fs\n" % (time() - t0))

# ROC Curve plot
probs = gs.predict_proba(X_test)
probs = probs[:, 1]
auc = metrics.roc_auc_score(y_test, probs)
print('AUC: %.2f\n' % auc)
fpr, tpr, thresholds = metrics.roc_curve(y_test, probs)
utils.plot_roc_curve('KNN ROC', fpr, tpr)

# Confusion Matrix
print('KNN Confusion Matrix')
print('-------------------------')
print(confusion_matrix(y_test, y_pred))
t0 = time()
grid_params = {'n_estimators': [10, 25, 50, 75, 100, 125, 150, 175, 200, 300]}
gs = GridSearchCV(AdaBoostClassifier(),
                  grid_params,
                  verbose=1,
                  cv=5,
                  n_jobs=-1)
gs_results = gs.fit(X_train, y_train)
print("Adaboost Training done in %0.3fs\n" % (time() - t0))
print("Best estimator after cross validation:")
print("Decision Stumps - %d\n" % gs.best_estimator_.n_estimators)

# Testing
t0 = time()
y_pred = gs.predict(X_test)
print("Adaboost Testing done in %0.3fs\n" % (time() - t0))

# ROC Curve plot
probs = gs.predict_proba(X_test)
probs = probs[:, 1]
auc = metrics.roc_auc_score(y_test, probs)
print('AUC: %.2f\n' % auc)
fpr, tpr, thresholds = metrics.roc_curve(y_test, probs)
utils.plot_roc_curve('Adaboost ROC', fpr, tpr)

# Confusion Matrix
print('Adaboost Confusion Matrix')
print('-------------------------')
print(confusion_matrix(y_test, y_pred))
flist = [os.path.join(root_dir, _dir) for _dir in os.listdir(root_dir)
         if 'preds_labels' in _dir]

aucs, aps = [], []
preds, trues = [], []
for fname in flist:
    with open(fname, 'rb') as f:
        data = pickle.load(f)  # (pred, label) tuple
    aucs.append(get_auroc(data[1], data[0]))
    aps.append(get_ap(data[1], data[0]))
    preds.append(data[0])
    trues.append(data[1])

    # draw each run as a faint gray curve
    plot_args = {'lw': 1, 'alpha': 0.5, 'color': 'gray', 'ls': '-'}
    plot_roc_curve(0, data[1], data[0], **plot_args)
    plot_pr_curve(1, data[1], data[0], **plot_args)

plot_args = {'lw': 1, 'alpha': 0.9, 'color': 'black', 'ls': '-'}
preds = np.concatenate(preds, axis=0)
trues = np.concatenate(trues, axis=0)

# 95% confidence intervals via the normal approximation of the mean.
auc_cint = np.std(aucs) / np.sqrt(len(aucs)) * 1.96
ap_cint = np.std(aps) / np.sqrt(len(aps)) * 1.96
aucstr = '{} AUC: {:.4f} ({} {:.4f})'.format(model, np.mean(aucs), u"\u00B1", auc_cint)
apstr = '{} AP: {:.4f} ({} {:.4f})'.format(model, np.mean(aps), u"\u00B1", ap_cint)

plot_roc_curve(0, trues, preds, legend=aucstr, **plot_args)
plt.savefig(os.path.join(root_dir, 'auc'))
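# get_auroc and get_ap are assumed wrappers; sklearn provides both metrics
# directly (the (labels, predictions) argument order is inferred from the
# calls above):
from sklearn.metrics import roc_auc_score, average_precision_score

def get_auroc(labels, preds):
    """Area under the ROC curve."""
    return roc_auc_score(labels, preds)

def get_ap(labels, preds):
    """Average precision (area under the PR curve)."""
    return average_precision_score(labels, preds)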
def model(fout, X_train, X_valid, X_test, y_train, y_valid, ids, cat_vars,
          cat_sz, emb_szs, params, verbose):
    "Build and fit model for given train/validation/test/out files"
    nround = params.get('nround', 10)
    early_stopping_rounds = params.get('early_stopping_rounds', 50)

    # features dropped before training
    drop_this = [
        "PersonalField41", "PersonalField37", "PropertyField10", "PersonalField46",
        "GeographicField23A", "PersonalField32", "GeographicField21A", "GeographicField64_2",
        "GeographicField56A", "PersonalField51", "PersonalField52", "PersonalField30",
        "PersonalField71", "PersonalField68", "GeographicField22A", "PersonalField7_1",
        "PersonalField47", "Field12_1", "PropertyField2A", "PersonalField62",
        "PropertyField11A", "GeographicField64_0", "GeographicField63_0", "GeographicField5A",
        "PersonalField29", "SalesField9", "PersonalField72", "PersonalField23",
        "GeographicField60A", "PersonalField44", "GeographicField12A", "PersonalField78",
        "PersonalField48", "PersonalField58", "GeographicField13A", "PropertyField4_1",
        "PropertyField4_0", "PersonalField33", "GeographicField62A", "PropertyField36_1",
        "PersonalField74", "PropertyField38_0", "PersonalField36", "PersonalField50",
        "GeographicField61A", "PersonalField54", "PersonalField53", "PropertyField30_1",
        "PropertyField22", "PersonalField38", "PersonalField55", "GeographicField63_1",
        "GeographicField18A", "PropertyField38_1", "GeographicField64_1", "PropertyField30_0",
        "Field12_0", "SalesField13", "PersonalField59", "PersonalField56",
        "PropertyField28_2", "SalesField15", "PersonalField19_23", "PersonalField76",
        "PropertyField31_2", "SalesField14", "Field6_3", "PropertyField36_0",
        "PersonalField19_21", "PersonalField57", "GeographicField15A", "PersonalField63",
        "PropertyField23", "PersonalField7_0", "Field6_4", "Field6_2",
        "PropertyField14_1", "PersonalField75", "PropertyField13", "PropertyField11B",
        "Field6_1", "PersonalField19_22", "PersonalField19_24", "PersonalField31",
        "PersonalField19_8", "PersonalField19_20", "PropertyField28_0", "PersonalField77",
        "PersonalField61", "PersonalField25", "PersonalField17_4", "PersonalField19_17",
        "PropertyField32_1", "GeographicField7A", "PersonalField19_5", "year_1",
        "PropertyField3_1", "PersonalField19_12", "PropertyField14_2", "PersonalField19_10",
        "GeographicField12B", "GeographicField11A", "PersonalField18_21", "PersonalField79",
        "PropertyField17", "PropertyField28_1", "PersonalField19_18", "PersonalField19_3",
        "PersonalField80", "PropertyField7_5", "PersonalField19_4", "PropertyField15",
        "PropertyField7_2", "PersonalField19_7", "PersonalField18_12", "PersonalField19_0",
        "PersonalField18_20", "PersonalField19_25", "Field6_0",
    ]
    for col in drop_this:
        X_train = X_train.drop(col, axis=1)
        X_valid = X_valid.drop(col, axis=1)
        X_test = X_test.drop(col, axis=1)

    TOPF = [
        'PersonalField10A', 'SalesField1A', 'PersonalField9', 'SalesField1B',
        'PersonalField10B'
    ]
    X_train["avg"] = X_train.mean(axis=1)
    X_valid["avg"] = X_valid.mean(axis=1)
    X_test["avg"] = X_test.mean(axis=1)
    X_train["sumTop"] = X_train[TOPF].sum(axis=1)
    X_valid["sumTop"] = X_valid[TOPF].sum(axis=1)
    X_test["sumTop"] = X_test[TOPF].sum(axis=1)

    # fraction of missing values per row
    ncols = X_test.columns.size
    X_train['value_count'] = X_train.apply(lambda x: (ncols - x.count()) / ncols, axis=1)
    X_valid['value_count'] = X_valid.apply(lambda x: (ncols - x.count()) / ncols, axis=1)
    X_test['value_count'] = X_test.apply(lambda x: (ncols - x.count()) / ncols, axis=1)

    X_train["comb1"] = X_train["sumTop"] * X_train["value_count"]
    X_valid["comb1"] = X_valid["sumTop"] * X_valid["value_count"]
    X_test["comb1"] = X_test["sumTop"] * X_test["value_count"]
X_test["value_count"] X_train["comb2"] = X_train["sumTop"] * X_train["avg"] X_valid["comb2"] = X_valid["sumTop"] * X_valid["avg"] X_test["comb2"] = X_test["sumTop"] * X_test["avg"] X_train["comb5"] = X_train["sumTop"] - X_train["value_count"] X_valid["comb5"] = X_valid["sumTop"] - X_valid["value_count"] X_test["comb5"] = X_test["sumTop"] - X_test["value_count"] X_train["comb6"] = X_train["sumTop"] - X_train["avg"] X_valid["comb6"] = X_valid["sumTop"] - X_valid["avg"] X_test["comb6"] = X_test["sumTop"] - X_test["avg"] newTOP = [ 'SalesField8', 'SalesField6', 'PersonalField10A', 'SalesField2B', 'PersonalField10B', 'PropertyField29', 'SalesField5', 'sumTop', 'SalesField1B', 'PersonalField9' ] X_train["avgTop2"] = X_train[newTOP].mean(axis=1) X_valid["avgTop2"] = X_valid[newTOP].mean(axis=1) X_test["avgTop2"] = X_test[newTOP].mean(axis=1) X_train["sumTop2"] = X_train[newTOP].sum(axis=1) X_valid["sumTop2"] = X_valid[newTOP].sum(axis=1) X_test["sumTop2"] = X_test[newTOP].sum(axis=1) print("Train shape", np.shape(X_train)) print("Valid shape", np.shape(X_valid)) print("Test shape", np.shape(X_test)) # preparre DMatrix object for training/fitting dtrain = xgb.DMatrix(X_train, label=y_train) deval = xgb.DMatrix(X_valid, label=y_valid) dtest = xgb.DMatrix(X_test) # model parameters args = { 'max_depth': 6, 'eta': 0.012, 'subsample': 0.86, 'colsample_bytree': 0.38, 'eval_metric': 'auc', 'silent': 0, 'n_jobs': 4, 'objective': 'binary:logistic' } if verbose: print("model parameters") pprint.pprint(args) # use evaluation list while traning evallist = [(deval, 'eval'), (dtrain, 'train')] # train our model with early stopping that we'll see that we don't overfit #bst = xgb.train(args, dtrain, nround, evallist, early_stopping_rounds=early_stopping_rounds) bst = xgb.train(args, dtrain, nround, evallist, early_stopping_rounds=early_stopping_rounds) # try eli5 explanation of our model # see permutation importance: # https://www.kaggle.com/dansbecker/permutation-importance?utm_medium=email&utm_source=mailchimp&utm_campaign=ml4insights try: import eli5 html_obj = eli5.show_weights(bst, top=10) import html2text print(html2text.html2text(html_obj.data)) except: pass # validate results pred = bst.predict(deval) myscores = bst.get_score() for i in sorted(myscores, key=myscores.get): print("\"" + i + "\"," + str(myscores[i])) print("AUC", metrics.roc_auc_score(y_valid, pred)) # create AUC/ROC plot plot_roc_curve(y_valid, pred) # make prediction if fout: #pred = sclf2.predict(X_test) #pred = sclf.predict(X_test) #pred = eclf1.predict(X_test) pred = bst.predict(dtest) data = {'QuoteNumber': ids, 'QuoteConversion_Flag': pred} sub = pd.DataFrame(data, columns=['QuoteNumber', 'QuoteConversion_Flag']) print("Write prediction to %s" % fout) sub.to_csv(fout, index=False)
def main():
    emotionals, rationals = emotional_rational()
    preprocessor = Preprocessor()
    emotionals = preprocessor.parse_sentences(emotionals)
    rationals = preprocessor.parse_sentences(rationals)

    # Truncate rationals so both classes have the same size.
    emotionals = emotionals[:len(emotionals)]
    rationals = rationals[:len(emotionals)]
    sentences = emotionals + rationals
    Y = np.array([[0, 1]] * len(emotionals) + [[1, 0]] * len(rationals))

    max_features = 200
    tokenizer = Tokenizer(num_words=max_features, split=' ')
    tokenizer.fit_on_texts(sentences)
    X = tokenizer.texts_to_sequences(sentences)
    X = pad_sequences(X, maxlen=MAX_LEN)

    epochs = 15

    # --- Add Features ---
    dict_loader = EmotionalDict('dataset/nouns', 'dataset/verbs')
    emotional_dict = dict_loader.load()
    features_loader = AdditionalFeatures(emotionals + rationals, emotional_dict)
    add_features = features_loader.emotional_features()
    ######################
    x_aux_train = add_features[:848]
    x_aux_test = add_features[848:]

    model = build_model(x_aux_train.shape)

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                        test_size=0.33,
                                                        random_state=42)
    print(X_train.shape, Y_train.shape)
    print(X_test.shape, Y_test.shape)

    batch_size = 32
    model.fit({'main_input': X_train, 'add_input': x_aux_train},
              Y_train,
              epochs=epochs,
              batch_size=batch_size,
              verbose=2)

    score, acc = model.evaluate({'main_input': X_test, 'add_input': x_aux_test},
                                Y_test,
                                verbose=2,
                                batch_size=batch_size)
    print('score: {}'.format(score))
    print('acc: {}'.format(acc))

    Y_pred = model.predict({'main_input': X_test, 'add_input': x_aux_test},
                           batch_size=1,
                           verbose=2)
    print(classification_report(Y_test[:, 1],
                                np.round(Y_pred[:, 1]),
                                target_names=['rationals', 'emotionals']))

    fpr, tpr, _ = roc_curve(Y_test[:, 1], Y_pred[:, 1])
    roc_auc = auc(fpr, tpr)
    plot_roc_curve(fpr, tpr, roc_auc, 'roc.png')

    cnf_matrix = confusion_matrix(Y_test[:, 1], np.round(Y_pred[:, 1]))
    plot_confusion_matrix(cnf_matrix, ['rationals', 'emotionals'], 'cnf.png')

    attention_vector = np.mean(
        get_activations(model, X_test, True, 'attention_vec')[0],
        axis=2).squeeze()
    attention_vector = np.mean(attention_vector, axis=0)

    import matplotlib.pyplot as plt
    import pandas as pd
    pd.DataFrame(attention_vector,
                 columns=['attention (%)']).plot(kind='bar', title='Attention')
    plt.savefig('attention_vec.png')

    attention_vector_indices = np.argsort(attention_vector)[::-1]
    word_index = tokenizer.word_index
    word_index_inv = {v: k for k, v in word_index.items()}
    with open('attention_word.txt', 'w') as f:
        for i, attention_index in enumerate(attention_vector_indices, start=1):
            try:
                print('No.{} : {}'.format(i, word_index_inv[attention_index]), file=f)
            except KeyError:
                continue
def run_main(args):
    # Define parameters
    epochs = args.epochs
    dim_au_out = args.bottleneck  # 8, 16, 32, 64, 128, 256, 512
    dim_dnn_in = dim_au_out
    dim_dnn_out = 1
    select_drug = args.drug
    na = args.missing_value
    data_path = args.data_path
    label_path = args.label_path
    test_size = args.test_size
    valid_size = args.valid_size
    g_disperson = args.var_genes_disp
    model_path = args.source_model_path
    encoder_path = args.encoder_path
    log_path = args.logging_file
    batch_size = args.batch_size
    encoder_hdims = args.encoder_h_dims.split(",")
    preditor_hdims = args.predictor_h_dims.split(",")
    reduce_model = args.dimreduce
    prediction = args.predition
    sampling = args.sampling
    PCA_dim = args.PCA_dim

    encoder_hdims = list(map(int, encoder_hdims))
    preditor_hdims = list(map(int, preditor_hdims))
    load_model = bool(args.load_source_model)

    preditor_path = model_path + reduce_model + args.predictor + prediction + select_drug + '.pkl'

    # Read data
    data_r = pd.read_csv(data_path, index_col=0)
    label_r = pd.read_csv(label_path, index_col=0)
    label_r = label_r.fillna(na)

    now = time.strftime("%Y-%m-%d-%H-%M-%S")
    ut.save_arguments(args, now)

    # Initialize logging and stdout
    out_path = log_path + now + ".err"
    log_path = log_path + now + ".log"
    out = open(out_path, "w")
    sys.stderr = out
    logging.basicConfig(
        level=logging.INFO,  # log level printed to the console
        filename=log_path,
        filemode='a',  # 'w' rewrites the log file on each run; 'a' (the default) appends
        format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s'  # log format
    )
    logging.getLogger('matplotlib.font_manager').disabled = True
    logging.info(args)

    # Filter out na values
    selected_idx = label_r.loc[:, select_drug] != na

    if g_disperson is not None:
        hvg, adata = ut.highly_variable_genes(data_r, min_disp=g_disperson)
        # Rename columns if duplication exists
        data_r.columns = adata.var_names
        # Extract hvgs
        data = data_r.loc[selected_idx, hvg]
    else:
        data = data_r.loc[selected_idx, :]

    # Do PCA if PCA_dim != 0
    if PCA_dim != 0:
        data = PCA(n_components=PCA_dim).fit_transform(data)

    # Extract labels
    label = label_r.loc[selected_idx, select_drug]

    # Scaling data
    mmscaler = preprocessing.MinMaxScaler()
    lbscaler = preprocessing.MinMaxScaler()

    data = mmscaler.fit_transform(data)
    label = label.values.reshape(-1, 1)

    if prediction == "regression":
        label = lbscaler.fit_transform(label)
        dim_model_out = 1
    else:
        le = LabelEncoder()
        label = le.fit_transform(label)
        dim_model_out = 2

    logging.info(np.std(data))
    logging.info(np.mean(data))

    # Split training/valid/test sets
    X_train_all, X_test, Y_train_all, Y_test = train_test_split(
        data, label, test_size=test_size, random_state=42)
    X_train, X_valid, Y_train, Y_valid = train_test_split(X_train_all,
                                                          Y_train_all,
                                                          test_size=valid_size,
                                                          random_state=42)

    # sampling method
    if sampling is None:
        X_train, Y_train = sam.nosampling(X_train, Y_train)
        logging.info("nosampling")
    elif sampling == "upsampling":
        X_train, Y_train = sam.upsampling(X_train, Y_train)
        logging.info("upsampling")
    elif sampling == "downsampling":
        X_train, Y_train = sam.downsampling(X_train, Y_train)
        logging.info("downsampling")
    elif sampling == "SMOTE":
        X_train, Y_train = sam.SMOTEsampling(X_train, Y_train)
        logging.info("SMOTE")
    else:
        logging.info("not a legal sampling method")

    logging.info(data.shape)
    logging.info(label.shape)
    # logging.info(X_train.shape, Y_train.shape)
    # logging.info(X_test.shape, Y_test.shape)
    logging.info(X_train.max())
    logging.info(X_train.min())

    # Select the training device
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Assuming that we are on a CUDA machine, this should print a CUDA device:
    logging.info(device)
    torch.cuda.set_device(device)

    # Construct datasets and data loaders
    X_trainTensor = torch.FloatTensor(X_train).to(device)
    X_validTensor = torch.FloatTensor(X_valid).to(device)
    X_testTensor = torch.FloatTensor(X_test).to(device)
    X_allTensor = torch.FloatTensor(data).to(device)

    if prediction == "regression":
        Y_trainTensor = torch.FloatTensor(Y_train).to(device)
        Y_trainallTensor = torch.FloatTensor(Y_train_all).to(device)
        Y_validTensor = torch.FloatTensor(Y_valid).to(device)
    else:
        Y_trainTensor = torch.LongTensor(Y_train).to(device)
        Y_trainallTensor = torch.LongTensor(Y_train_all).to(device)
        Y_validTensor = torch.LongTensor(Y_valid).to(device)

    train_dataset = TensorDataset(X_trainTensor, X_trainTensor)
    valid_dataset = TensorDataset(X_validTensor, X_validTensor)
    test_dataset = TensorDataset(X_testTensor, X_testTensor)
    all_dataset = TensorDataset(X_allTensor, X_allTensor)

    X_trainDataLoader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    X_validDataLoader = DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=True)
    X_allDataLoader = DataLoader(dataset=all_dataset, batch_size=batch_size, shuffle=True)

    # construct TensorDatasets for the predictor
    trainreducedDataset = TensorDataset(X_trainTensor, Y_trainTensor)
    validreducedDataset = TensorDataset(X_validTensor, Y_validTensor)

    trainDataLoader_p = DataLoader(dataset=trainreducedDataset, batch_size=batch_size, shuffle=True)
    validDataLoader_p = DataLoader(dataset=validreducedDataset, batch_size=batch_size, shuffle=True)

    dataloaders_train = {'train': trainDataLoader_p, 'val': validDataLoader_p}

    # Pretrain the encoder if requested
    if bool(args.pretrain) != False:
        dataloaders_pretrain = {
            'train': X_trainDataLoader,
            'val': X_validDataLoader
        }
        if reduce_model == "VAE":
            encoder = VAEBase(input_dim=data.shape[1],
                              latent_dim=dim_au_out,
                              h_dims=encoder_hdims)
        else:
            encoder = AEBase(input_dim=data.shape[1],
                             latent_dim=dim_au_out,
                             h_dims=encoder_hdims)

        if torch.cuda.is_available():
            encoder.cuda()

        logging.info(encoder)
        encoder.to(device)

        optimizer_e = optim.Adam(encoder.parameters(), lr=1e-2)
        loss_function_e = nn.MSELoss()
        exp_lr_scheduler_e = lr_scheduler.ReduceLROnPlateau(optimizer_e)

        if reduce_model == "AE":
            encoder, loss_report_en = t.train_AE_model(
                net=encoder,
                data_loaders=dataloaders_pretrain,
                optimizer=optimizer_e,
                loss_function=loss_function_e,
                n_epochs=epochs,
                scheduler=exp_lr_scheduler_e,
                save_path=encoder_path)
        elif reduce_model == "VAE":
            encoder, loss_report_en = t.train_VAE_model(
                net=encoder,
                data_loaders=dataloaders_pretrain,
                optimizer=optimizer_e,
                n_epochs=epochs,
                scheduler=exp_lr_scheduler_e,
                save_path=encoder_path)

        logging.info("Pretrained finished")

    # Train the predictor model
    if args.predictor == "DNN":
        if reduce_model == "AE":
            model = PretrainedPredictor(input_dim=X_train.shape[1],
                                        latent_dim=dim_au_out,
                                        h_dims=encoder_hdims,
                                        hidden_dims_predictor=preditor_hdims,
                                        output_dim=dim_model_out,
                                        pretrained_weights=encoder_path,
                                        freezed=bool(args.freeze_pretrain))
        elif reduce_model == "VAE":
            model = PretrainedVAEPredictor(
                input_dim=X_train.shape[1],
                latent_dim=dim_au_out,
                h_dims=encoder_hdims,
                hidden_dims_predictor=preditor_hdims,
                output_dim=dim_model_out,
                pretrained_weights=encoder_path,
                freezed=bool(args.freeze_pretrain),
                z_reparam=bool(args.VAErepram))

    elif args.predictor == "GCN":
        if reduce_model == "VAE":
            gcn_encoder = VAEBase(input_dim=data.shape[1],
                                  latent_dim=dim_au_out,
                                  h_dims=encoder_hdims)
        else:
            gcn_encoder = AEBase(input_dim=data.shape[1],
                                 latent_dim=dim_au_out,
                                 h_dims=encoder_hdims)
        gcn_encoder.load_state_dict(torch.load(args.GCNreduce_path))
        gcn_encoder.to(device)

        train_embeddings = gcn_encoder.encode(X_trainTensor)
        zOut_tr = train_embeddings.cpu().detach().numpy()
        valid_embeddings = gcn_encoder.encode(X_validTensor)
        zOut_va = valid_embeddings.cpu().detach().numpy()
        test_embeddings = gcn_encoder.encode(X_testTensor)
        zOut_te = test_embeddings.cpu().detach().numpy()

        # Build KNN graphs on the embeddings
        adj_tr, edgeList_tr = g.generateAdj(
            zOut_tr,
            graphType='KNNgraphStatsSingleThread',
            para='euclidean' + ':' + str('10'),
            adjTag=True)
        adj_va, edgeList_va = g.generateAdj(
            zOut_va,
            graphType='KNNgraphStatsSingleThread',
            para='euclidean' + ':' + str('10'),
            adjTag=True)
        adj_te, edgeList_te = g.generateAdj(
            zOut_te,
            graphType='KNNgraphStatsSingleThread',
            para='euclidean' + ':' + str('10'),
            adjTag=True)

        Adj_trainTensor = preprocess_graph(adj_tr)
        Adj_validTensor = preprocess_graph(adj_va)
        Adj_testTensor = preprocess_graph(adj_te)

        Z_trainTensor = torch.FloatTensor(zOut_tr).to(device)
        Z_validTensor = torch.FloatTensor(zOut_va).to(device)
        Z_testTensor = torch.FloatTensor(zOut_te).to(device)

        if args.binarizied == 0:
            zDiscret_tr = zOut_tr > np.mean(zOut_tr, axis=0)
            zDiscret_tr = 1.0 * zDiscret_tr
            zDiscret_va = zOut_va > np.mean(zOut_va, axis=0)
            zDiscret_va = 1.0 * zDiscret_va
            zDiscret_te = zOut_te > np.mean(zOut_te, axis=0)
            zDiscret_te = 1.0 * zDiscret_te

            Z_trainTensor = torch.FloatTensor(zDiscret_tr).to(device)
            Z_validTensor = torch.FloatTensor(zDiscret_va).to(device)
            Z_testTensor = torch.FloatTensor(zDiscret_te).to(device)

        ZTensors_train = {'train': Z_trainTensor, 'val': Z_validTensor}
        XTensors_train = {'train': X_trainTensor, 'val': X_validTensor}
        YTensors_train = {'train': Y_trainTensor, 'val': Y_validTensor}
        AdjTensors_train = {'train': Adj_trainTensor, 'val': Adj_validTensor}

        if args.GCNfeature == "x":
            dim_GCNin = X_allTensor.shape[1]
            GCN_trainTensors = XTensors_train
            GCN_testTensor = X_testTensor
        else:
            dim_GCNin = Z_testTensor.shape[1]
            GCN_trainTensors = ZTensors_train
            GCN_testTensor = Z_testTensor

        model = GCNPredictor(input_feat_dim=dim_GCNin,
                             hidden_dim1=encoder_hdims[0],
                             hidden_dim2=dim_au_out,
                             dropout=0.5,
                             hidden_dims_predictor=preditor_hdims,
                             output_dim=dim_model_out,
                             pretrained_weights=encoder_path,
                             freezed=bool(args.freeze_pretrain))

        # model2 = GAEBase(input_dim=X_train_all.shape[1], latent_dim=128, h_dims=[512])
        # model2.to(device)
        # test = model2((X_trainTensor, Adj_trainTensor))

    logging.info(model)
    if torch.cuda.is_available():
        model.cuda()
    model.to(device)

    # Define optimizer
    optimizer = optim.Adam(model.parameters(), lr=1e-2)

    if prediction == "regression":
        loss_function = nn.MSELoss()
    else:
        loss_function = nn.CrossEntropyLoss()

    exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer)

    if args.predictor == "GCN":
        model, report = t.train_GCNpreditor_model(model=model,
                                                  z=GCN_trainTensors,
                                                  y=YTensors_train,
                                                  adj=AdjTensors_train,
                                                  optimizer=optimizer,
                                                  loss_function=loss_function,
                                                  n_epochs=epochs,
                                                  scheduler=exp_lr_scheduler,
                                                  save_path=preditor_path)
    else:
        model, report = t.train_predictor_model(model,
                                                dataloaders_train,
                                                optimizer,
                                                loss_function,
                                                epochs,
                                                exp_lr_scheduler,
                                                load=load_model,
                                                save_path=preditor_path)

    if args.predictor != 'GCN':
        dl_result = model(X_testTensor).detach().cpu().numpy()
    else:
        dl_result = model(GCN_testTensor, Adj_testTensor).detach().cpu().numpy()

    # torch.save(model.feature_extractor.state_dict(), preditor_path + "encoder.pkl")

    logging.info('Performances: R/Pearson/Mse/')
    if prediction == "regression":
        logging.info(r2_score(dl_result, Y_test))
        logging.info(pearsonr(dl_result.flatten(), Y_test.flatten()))
        logging.info(mean_squared_error(dl_result, Y_test))
    else:
        lb_results = np.argmax(dl_result, axis=1)
        # pb_results = np.max(dl_result, axis=1)
        pb_results = dl_result[:, 1]  # probability of the positive class

        report_dict = classification_report(Y_test, lb_results, output_dict=True)
        report_df = pd.DataFrame(report_dict).T
        ap_score = average_precision_score(Y_test, pb_results)
        auroc_score = roc_auc_score(Y_test, pb_results)

        report_df['auroc_score'] = auroc_score
        report_df['ap_score'] = ap_score

        report_df.to_csv("saved/logs/" + reduce_model + args.predictor +
                         prediction + select_drug + now + '_report.csv')

        logging.info(classification_report(Y_test, lb_results))
        logging.info(average_precision_score(Y_test, pb_results))
        logging.info(roc_auc_score(Y_test, pb_results))

        # No-skill baseline for the ROC comparison plot
        model = DummyClassifier(strategy='stratified')
        model.fit(X_train, Y_train)
        yhat = model.predict_proba(X_test)
        naive_probs = yhat[:, 1]

        ut.plot_roc_curve(Y_test,
                          naive_probs,
                          pb_results,
                          title=str(roc_auc_score(Y_test, pb_results)),
                          path="saved/figures/" + reduce_model + args.predictor +
                               prediction + select_drug + now + '_roc.pdf')
        ut.plot_pr_curve(Y_test,
                         pb_results,
                         title=average_precision_score(Y_test, pb_results),
                         path="saved/figures/" + reduce_model + args.predictor +
                              prediction + select_drug + now + '_prc.pdf')
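# ut.plot_roc_curve above takes the ground truth, the no-skill baseline
# probabilities, and the model probabilities. A minimal sketch of such a
# helper; name and signature are inferred from the call site, not taken
# from the project's own utilities:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

def plot_roc_curve(y_true, naive_probs, model_probs, title="", path="roc.pdf"):
    """Overlay the ROC curves of a no-skill baseline and the trained model."""
    fpr_naive, tpr_naive, _ = roc_curve(y_true, naive_probs)
    fpr_model, tpr_model, _ = roc_curve(y_true, model_probs)
    plt.figure()
    plt.plot(fpr_naive, tpr_naive, linestyle='--', label='no skill')
    plt.plot(fpr_model, tpr_model, label='model')
    plt.title(title)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc='lower right')
    plt.savefig(path)
    plt.close()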
    SCORES['Y_TRUE'][tooth_type] = np.concatenate(
        [SCORES['Y_TRUE'][tooth_type], final_true.flatten()])
    SCORES['Y_PRED'][tooth_type] = np.concatenate(
        [SCORES['Y_PRED'][tooth_type], final_pred.flatten()])
    print('========')

'''
###### PLOT ROC/AUC CURVE #############
'''
for group_type in SCORES['GROUP_TYPE']:
    print("============= GROUP {} =====================".format(group_type))
    plot_roc_curve(SCORES['Y_TRUE'][group_type],
                   SCORES['Y_PRED'][group_type],
                   group_type)
    print('DICE : {}'.format(
        np_dice_coef(SCORES['Y_TRUE'][group_type],
                     SCORES['Y_PRED'][group_type])))

    matrix_data = confusion_matrix(SCORES['Y_TRUE'][group_type],
                                   SCORES['Y_PRED'][group_type])
    plot_confusion_matrix(cm=matrix_data,
                          normalize=True,
                          target_names=['Background', 'Carry'],
                          title='Confusion Matrix for {}'.format(group_type),
                          cmap=plt.cm.Blues)
    try:
        tn, fp, fn, tp = confusion_matrix(SCORES['Y_TRUE'][group_type],
                                          SCORES['Y_PRED'][group_type]).ravel()
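# np_dice_coef is an assumed helper; a standard NumPy Dice coefficient over
# flattened binary masks, with smoothing to avoid division by zero:
import numpy as np

def np_dice_coef(y_true, y_pred, smooth=1.0):
    """Dice = 2 * |intersection| / (|y_true| + |y_pred|)."""
    y_true = np.asarray(y_true).flatten()
    y_pred = np.asarray(y_pred).flatten()
    intersection = np.sum(y_true * y_pred)
    return (2.0 * intersection + smooth) / (np.sum(y_true) + np.sum(y_pred) + smooth)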