def find_weights(X_orig, y_orig, test_subject):
    '''Computes a weight for each data point proportional to the probability
    of it belonging to the test distribution.'''
    clf = RFC(n_estimators=10)
    # clf = LR(solver='lbfgs')
    X = X_orig.reshape(X_orig.shape[0], X_orig.shape[1] * X_orig.shape[2])
    y = y_orig.reshape(y_orig.shape[0])
    predictions = np.zeros(y.shape)
    kf = SKF(n_splits=10, shuffle=True, random_state=1234)
    # kf = KFold(n_splits=10, shuffle=True)
    for train_idx, test_idx in kf.split(X, y):
        # print('Training discriminator model for fold {}'.format(fold))
        X_train, X_test = X[train_idx], X[test_idx]
        y_train = y[train_idx]
        X_train, y_train = RandomUnderSampler().fit_resample(X_train, y_train)
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        clf.fit(X_train, y_train)
        probs = clf.predict_proba(X_test)[:, 1]
        predictions[test_idx] = probs
    print(f'{test_subject}: ROC-AUC for train and test distributions: {AUC(y, predictions)}')
    weights = predictions  # (1/predictions_test) - 1
    weights /= np.mean(weights)  # re-normalize so the computed log-loss stays comparable
    np.save('../data/cs/CS_weights_{}.npy'.format(test_subject), weights)
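# A minimal, self-contained sketch (an assumption, not part of the original
# pipeline) of how density-ratio weights like these are typically consumed:
# most scikit-learn estimators accept per-sample weights through `fit`'s
# `sample_weight` argument, so training emphasizes points that resemble the
# test distribution. The data and model below are illustrative only.
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X_demo = rng.randn(100, 5)
y_demo = rng.randint(0, 2, size=100)
w_demo = rng.rand(100)
w_demo /= w_demo.mean()  # same normalization as in find_weights
LogisticRegression(max_iter=1000).fit(X_demo, y_demo, sample_weight=w_demo)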
def startified_splits_single_label(self):
    self.dataset_split = {
        'train_X': [],
        'train_y': [],
        'valid_X': [],
        'valid_y': [],
    }
    # Note: StratifiedKFold accepts neither n_splits=1 nor test_size; these
    # arguments match StratifiedShuffleSplit, which is what a single
    # stratified 80/20 split requires.
    skf = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    data = self.train_df['file_path']
    labels = self.train_df['skill']
    for train_index, test_index in skf.split(data, labels):
        self.dataset_split['train_X'].append(
            [data[d] for d in train_index if d in data][:])
        self.dataset_split['valid_X'].append(
            [data[d] for d in test_index if d in data][:])
        self.dataset_split['train_y'].append(
            [[self.train_df['skill'][d]] for d in train_index if d in labels][:])
        self.dataset_split['valid_y'].append(
            [self.train_df['skill'][d] for d in test_index if d in labels][:])
def main():
    rfc = RFC(n_estimators=100, n_jobs=-1)
    fs = SelectFromModel(rfc)
    pca = PCA()
    svm = SVC()
    # Pipeline needs a concrete list of (name, estimator) steps; a bare zip
    # object would be exhausted after the first pass over it.
    estimators = list(zip(["feature_selection", "pca", "svm"], [fs, pca, svm]))
    pl = Pipeline(estimators)
    parameters = {
        "feature_selection__threshold": ["mean", "median"],
        "pca__n_components": [0.8, 0.5],
        "svm__gamma": [0.001, 0.01, 0.05],
        "svm__C": [1, 10],
    }
    gclf = GridSearchCV(pl, parameters, n_jobs=-1, verbose=2)
    digits = load_digits()
    X = digits.data
    y = digits.target
    first_fold = True
    trues = []
    preds = []
    for train_index, test_index in SKF().split(X, y):
        if first_fold:
            # Tune hyperparameters on the first fold only, then reuse the
            # best estimator for all remaining folds.
            gclf.fit(X[train_index], y[train_index])
            clf = gclf.best_estimator_
            first_fold = False
        clf.fit(X[train_index], y[train_index])
        trues.append(y[test_index])
        preds.append(clf.predict(X[test_index]))
    true_labels = np.hstack(trues)
    pred_labels = np.hstack(preds)
    print("p:{0:.6f} r:{1:.6f} f1:{2:.6f}".format(
        *prf(true_labels, pred_labels, average="macro")))
def make_setup(n_bins, max_depth, random_state, n_folds, datasets_folder,
               experiments_folder, methods, datasets):
    try:
        os.mkdir(datasets_folder)
    except OSError:
        # Folder already exists: wipe it and start fresh.
        rmtree(datasets_folder)
        os.mkdir(datasets_folder)
    for dataset in datasets:
        dataset_folder = datasets_folder + "/" + dataset
        try:
            os.mkdir(dataset_folder)
        except OSError:
            pass
        X, y, s = globals()["get_" + dataset](show=False)
        joblib.dump(X, dataset_folder + "/X.pkl")
        joblib.dump(y, dataset_folder + "/y.pkl")
        joblib.dump(s, dataset_folder + "/s.pkl")
        # Stratify splits w.r.t. [y, s]: build one string per row that
        # concatenates the label and every sensitive attribute.
        strata = []
        for i in range(len(X)):
            row = str(y[i])
            if len(s.shape) == 1:  # only a single binary sensitive attribute
                row += str(s[i])
            else:
                for j in range(s.shape[1]):
                    row += str(s[i, j])
            strata.append(row)
        fold = 0
        splitter = SKF(n_splits=n_folds, shuffle=True, random_state=random_state)
        for train_idx, test_idx in splitter.split(X, strata):
            joblib.dump(test_idx, dataset_folder + "/" + str(fold) + "_test_idx.pkl")
            joblib.dump(train_idx, dataset_folder + "/" + str(fold) + "_train_idx.pkl")
            fold += 1
    try:
        os.mkdir(experiments_folder)
    except OSError:
        rmtree(experiments_folder)
        os.mkdir(experiments_folder)
    for method in methods:
        method_folder = experiments_folder + "/" + method
        try:
            os.mkdir(method_folder)
        except OSError:
            rmtree(method_folder)
            os.mkdir(method_folder)
        for dataset in datasets:
            exp_dataset_folder = method_folder + "/" + dataset
            try:
                os.mkdir(exp_dataset_folder)
            except OSError:
                rmtree(exp_dataset_folder)
                os.mkdir(exp_dataset_folder)
def __init__(self, X, y, classifier, init_style, fratio_weight):
    Problem.__init__(self, minimized=True)
    self.X = X
    self.y = y
    self.no_instances, self.no_features = self.X.shape
    self.threshold = 0.6
    self.dim = self.no_features
    self.clf = classifier
    self.init_style = init_style
    self.f_weight = fratio_weight

    # Stratified k-fold is only applicable when every class has at least
    # k instances; otherwise fall back to plain KFold.
    k = 10
    labels, counts = np.unique(self.y, return_counts=True)
    label_min = np.min(counts)
    if label_min < k:
        self.skf = KFold(n_splits=k, shuffle=True, random_state=1617)
        self.skf_valid = KFold(n_splits=k, shuffle=True, random_state=1990)
    else:
        self.skf = SKF(n_splits=k, shuffle=True, random_state=1617)
        self.skf_valid = SKF(n_splits=k, shuffle=True, random_state=1990)

    self.scores = reliefF(self.X, self.y, k=1)
    self.scores = self.scores / np.sum(self.scores)

    # Alternative feature scoring via symmetric uncertainty on
    # entropy-discretized features (kept for reference):
    # from Orange.data import Domain, Table
    # from Orange.preprocess.discretize import EntropyMDL
    # from Orange.preprocess import Discretize
    # from skfeature.utility.mutual_information import su_calculation
    # domain = Domain.from_numpy(X=X, Y=y)
    # table = Table.from_numpy(domain=domain, X=X, Y=y)
    # disc = Discretize()
    # disc.method = EntropyMDL(force=True)
    # table_dis = disc(table)
    # X_dis = table_dis.X
    # test_scores = []
    # for i in range(self.no_features):
    #     test_scores.append(su_calculation(X_dis[:, i], y))
    # test_scores = np.array(test_scores)
    # test_scores = test_scores / np.sum(test_scores)
    # self.scores = test_scores

    self.surrogate_clf = SVC(random_state=1617)
def SVM_gridsearch(parameters, data_train, labels_train, number_splits, num_threads):
    svm_clf = svm.SVC(gamma="scale", probability=True)
    # n_jobs could also be set to multiprocessing.cpu_count()
    clf = GSCV(svm_clf, parameters,
               cv=SKF(n_splits=number_splits),
               verbose=2,
               n_jobs=num_threads)
    clf.fit(data_train, labels_train)
    return clf
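# A hedged usage sketch for SVM_gridsearch: the parameter grid and the iris
# data below are illustrative assumptions, not part of the original code.
from sklearn import svm
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV as GSCV
from sklearn.model_selection import StratifiedKFold as SKF

iris = load_iris()
grid = {"C": [0.1, 1, 10], "kernel": ["rbf", "linear"]}
best = SVM_gridsearch(grid, iris.data, iris.target,
                      number_splits=5, num_threads=-1)
print(best.best_params_, best.best_score_)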
def param_selector(**kwargs):
    # Produce out-of-fold predictions for a classifier built with the given
    # keyword arguments; X, y and clf_class come from the enclosing scope.
    skf = SKF(n_splits=5, shuffle=True)
    skf.get_n_splits(X, y)
    y_pred = y.copy()
    # Iterate through folds
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # Initialize a classifier with keyword arguments
        clf = clf_class(**kwargs)
        clf.fit(X_train, y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred
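# Hedged usage sketch (the dataset and classifier below are assumptions):
# evaluate the out-of-fold predictions that param_selector returns against
# the true labels.
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold as SKF

data = load_breast_cancer()
X, y = data.data, data.target
clf_class = RandomForestClassifier

oof = param_selector(n_estimators=50, random_state=0)
print("out-of-fold accuracy:", accuracy_score(y, oof))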
def SVM_KCross(self):
    """SVM: report accuracy and predictions with 10-fold cross-validation.

    References:
    https://qiita.com/kazuki_hayakawa/items/18b7017da9a6f73eba77
    https://qiita.com/nittaryo1977/items/44553b9f555fe7932cca
    https://hayataka2049.hatenablog.jp/entry/2018/03/12/213524
    https://qiita.com/yhyhyhjp/items/c81f7cea72a44a7bfd3a
    """
    # Cross-validation
    skf = SKF(n_splits=10, random_state=0, shuffle=True)
    trues = []
    preds = []
    test_files = []
    for train_index, test_index in skf.split(self.features, self.targets):
        # Standardization (fit on the training fold only)
        sc = StandardScaler()
        sc.fit(self.features[train_index])
        X_train_std = sc.transform(self.features[train_index])
        X_test_std = sc.transform(self.features[test_index])
        # Model definition (RBF kernel by default); class weights balance
        # the positive class against the class-0 count.
        num_m = self.targets[train_index].size
        num_c = np.sum(self.targets[train_index] == 0)
        svm = SVC(random_state=None,
                  probability=True,
                  class_weight={0: 1, 1: num_c / num_m})
        # Training
        svm.fit(X_train_std, self.targets[train_index])
        trues.append(self.targets[test_index])
        # Inference
        preds.append(svm.predict(X_test_std))
        test_files.append(np.hstack(self.files[test_index]))
    # Report accuracy
    print(classification_report(np.hstack(trues),
                                np.hstack(preds),
                                target_names=["bad", "good"]))
    # Write out the prediction results
    self.__makeResultDir(np.hstack(test_files), np.hstack(trues), np.hstack(preds))
def __init__(self, n_splits=5, n_repeats=None, shuffle=False, random_state=None):
    self.n_splits = n_splits
    self.shuffle = shuffle
    self.random_state = random_state
    self.n_repeats = n_repeats
    # Use RepeatedStratifiedKFold when n_repeats is given, otherwise a
    # single StratifiedKFold.
    if self.n_repeats is not None:
        self.cvcls = RSKF(n_splits=self.n_splits,
                          n_repeats=self.n_repeats,
                          random_state=self.random_state)
    else:
        self.cvcls = SKF(n_splits=self.n_splits,
                         shuffle=self.shuffle,
                         random_state=self.random_state)
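# A minimal, self-contained sketch of how this wrapper behaves. The class
# name `CVWrapper` is an assumption; the original snippet does not show the
# enclosing class definition.
from sklearn.model_selection import RepeatedStratifiedKFold as RSKF
from sklearn.model_selection import StratifiedKFold as SKF


class CVWrapper:  # hypothetical name for the enclosing class
    def __init__(self, n_splits=5, n_repeats=None, shuffle=False, random_state=None):
        if n_repeats is not None:
            self.cvcls = RSKF(n_splits=n_splits, n_repeats=n_repeats,
                              random_state=random_state)
        else:
            self.cvcls = SKF(n_splits=n_splits, shuffle=shuffle,
                             random_state=random_state)


# 5 folds vs. 5 folds repeated 3 times (15 train/test splits in total):
print(CVWrapper(n_splits=5).cvcls.get_n_splits())               # 5
print(CVWrapper(n_splits=5, n_repeats=3).cvcls.get_n_splits())  # 15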
def split(self, n_splits=5, info=False):
    skf = SKF(n_splits=n_splits, shuffle=True).split(self.images, self.labels)
    folds = []
    for train_idx, test_idx in skf:
        # Shallow-copy the dataset and overwrite its image/label lists with
        # the fold's subsets.
        train_dataset = copy(self.dataset)
        train_dataset.__dict__["images"] = list(
            map(lambda idx: self.images[idx], train_idx))
        train_dataset.__dict__["labels"] = list(
            map(lambda idx: self.labels[idx], train_idx))
        test_dataset = copy(self.dataset)
        test_dataset.__dict__["images"] = list(
            map(lambda idx: self.images[idx], test_idx))
        test_dataset.__dict__["labels"] = list(
            map(lambda idx: self.labels[idx], test_idx))
        folds.append((train_dataset, test_dataset))
    if info:
        folds_info(folds)
    return folds
def __init__(self, split_type='holdout', partitions=2, partition=0,
             test_size=0.3, seed=0, fields=None):
    if fields is None:
        fields = ['X', 'Y']
    config = self._to_config(locals())
    # Using 'self.algorithm' here to avoid 'algorithm' inside config.
    if split_type == "cv":
        self.algorithm = SKF(shuffle=True, n_splits=partitions, random_state=seed)
        del config['test_size']
    elif split_type == "loo":
        self.algorithm = LOO()
        del config['partitions']
        del config['partition']
        del config['test_size']
        del config['seed']
    elif split_type == 'holdout':
        self.algorithm = HO(n_splits=partitions, test_size=test_size, random_state=seed)
    else:
        raise Exception('Wrong split_type: ' + str(split_type))
    super().__init__(config)
    self.partitions = partitions
    self.partition = partition
    self.test_size = test_size
    self.seed = seed
    self.fields = fields
def startified_splits(self):
    self.dataset_split = {
        'train_X': [],
        'train_y': [],
        'valid_X': [],
        'valid_y': [],
    }
    skf = SKF(n_splits=self.config.KFolds, shuffle=True, random_state=42)
    data = self.df['file_path']
    labels = self.df['isbeauty']
    for train_index, test_index in skf.split(data, labels):
        self.dataset_split['train_X'].append(
            [data[d] for d in train_index if d in data][:])
        self.dataset_split['valid_X'].append(
            [data[d] for d in test_index if d in data][:])
        self.dataset_split['train_y'].append(
            [[self.df['isbeauty'][d], self.df['skill'][d]]
             for d in train_index if d in labels][:])
        self.dataset_split['valid_y'].append(
            [[self.df['isbeauty'][d], self.df['skill'][d]]
             for d in test_index if d in labels][:])
# classif.fit(X_tr, Result[u[:n_train]])
# Pred = classif.predict(X_te)
# print(np.sum(Pred == Result[u[n_train:]]) / (n_total - n_train))
# L.append(np.sum(Pred == Result[u[n_train:]]) / (n_total - n_train))
# plt.figure()
# plt.plot(L, 'k+')
# plt.show()

clf = PCA(n_components=25)
classif = RFC(n_estimators=500, n_jobs=-1, class_weight='balanced')
# classif = ADA()
S = []
Confusion = []
skf = SKF(n_splits=10, shuffle=True)
for train, test in skf.split(mf, Result):
    X_train = mf[train]
    X_test = mf[test]
    label_train = Result[train]
    label_test = Result[test]
    # Fit PCA on the training fold only, then project the test fold.
    X_tr = clf.fit_transform(X_train)
    X_te = clf.transform(X_test)
    classif.fit(X_tr, label_train)
    Pred = classif.predict(X_te)
    C = CM(Pred, label_test)
    s = score(Pred, label_test)
    S.append(s)
    Confusion.append(C)
    print(s, C)
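# Hedged follow-up sketch (not in the original): aggregate the per-fold
# results collected above, assuming `score` returns a scalar per fold and
# every fold contains all classes so the confusion matrices share a shape.
import numpy as np

print("mean score over folds:", np.mean(S))
print("summed confusion matrix:\n", np.sum(Confusion, axis=0))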
def save_predictions(t, filename, rs):
    # create RandomForest classifier with parameters given in _conf.py
    clf = RandomForestClassifier(random_state=rs,
                                 verbose=verbosity,
                                 class_weight='balanced',
                                 n_estimators=conf.n_estimators,
                                 n_jobs=conf.max_n_jobs,
                                 max_features=conf.tree_max_features,
                                 max_depth=conf.tree_max_depth)

    # use ground truth to create folds for outer cross validation in a
    # stratified way, i.e. such that each label occurs equally often
    participant_scores = np.genfromtxt('Data/Binned_Personality.csv',
                                       skip_header=1,
                                       delimiter=',').astype(int)[:, t + 1]
    # Note: random_state=True is equivalent to random_state=1 here (bool is
    # an int subclass); an explicit integer seed would be clearer.
    outer_cv = SKF(conf.n_outer_folds, shuffle=True, random_state=True)
    len_outer_cv = outer_cv.get_n_splits(participant_scores)

    # initialise arrays to save information
    feat_imp = np.zeros((len_outer_cv, conf.max_n_feat))  # feature importance
    preds = np.zeros((conf.n_participants), dtype=int)  # predictions on participant level
    x = np.zeros(31)  # placeholder for X instead of actual training data

    for outer_i, (outer_train_participants, outer_test_participants) in enumerate(
            outer_cv.split(x, participant_scores)):
        print(str(outer_i + 1) + '/' + str(conf.n_outer_folds))

        # find best window size in inner cv, and discard unimportant features
        inner_performance = np.zeros((conf.n_inner_folds, 1))
        inner_feat_importances = np.zeros((conf.max_n_feat, 1))

        # load all the extracted features
        x_all, y_all, ids_all = load_data(t)
        if shuffle_labels:
            np.random.seed(316588 + 111 * t + rs)
            perm = np.random.permutation(len(y_all))
            y_all = y_all[perm]
            ids_all = ids_all[perm]

        # cut out the outer train samples
        outer_train_samples = np.array(
            [p in outer_train_participants for p in ids_all])
        outer_train_x = x_all[outer_train_samples, :]
        outer_train_y = y_all[outer_train_samples]
        outer_train_y_ids = ids_all[outer_train_samples]

        # build inner cross validation such that all samples of one person
        # are either in training or testing
        inner_cv = LKF(n_splits=conf.n_inner_folds)
        for inner_i, (inner_train_indices, inner_test_indices) in enumerate(
                inner_cv.split(outer_train_y_ids)):
            # create inner train and test samples.
            # Note: both are taken from outer train samples!
            inner_x_train = outer_train_x[inner_train_indices, :]
            inner_y_train = outer_train_y[inner_train_indices]
            inner_x_test = outer_train_x[inner_test_indices, :]
            inner_y_test = outer_train_y[inner_test_indices]

            # fit Random Forest
            clf.fit(inner_x_train, np.ravel(inner_y_train))

            # save predictions and feature importance
            inner_pred = clf.predict(inner_x_test)
            inner_pred = inner_pred.reshape(-1, 1)
            inner_feat_importances[:, 0] += clf.feature_importances_

            # compute and save performance in terms of accuracy
            innerpreds = []
            innertruth = []
            inner_test_ids = outer_train_y_ids[inner_test_indices]
            for testp in np.unique(inner_test_ids):
                (values, counts) = np.unique(inner_pred[inner_test_ids == testp],
                                             return_counts=True)
                ind = np.argmax(counts)
                innerpreds.append(values[ind])
                innertruth.append(inner_y_test[inner_test_ids == testp][0])
            inner_performance[inner_i, 0] = accuracy_score(np.array(innertruth),
                                                           np.array(innerpreds))
            print('ACC: ', '%.2f' % (inner_performance[inner_i, 0] * 100))

        # evaluate classifier on outer cv using the most informative features
        chosen_i = np.argmax(np.mean(inner_performance, axis=0))
        chosen_features = (inner_feat_importances[:, chosen_i] /
                           float(conf.n_inner_folds)) > 0.005

        # reload all data
        x, y, ids = load_data(t, chosen_features=chosen_features)
        if shuffle_labels:
            np.random.seed(316588 + 111 * t + rs + 435786)
            perm = np.random.permutation(len(y))
            y = y[perm]
            ids = ids[perm]

        outer_train_samples = np.array([p in outer_train_participants for p in ids])
        outer_test_samples = np.array([p in outer_test_participants for p in ids])
        if outer_train_samples.size > 0 and outer_test_samples.size > 0:
            x_train = x[outer_train_samples, :]
            y_train = y[outer_train_samples]
            x_test = x[outer_test_samples, :]
            y_test = y[outer_test_samples]

            # fit Random Forest
            clf.fit(x_train, np.ravel(y_train))
            pred = clf.predict(x_test)
            pred = pred.reshape(-1, 1)
            for testp in outer_test_participants:
                if testp in ids[outer_test_samples]:
                    # majority voting over all samples that belong to participant testp
                    (values, counts) = np.unique(
                        pred[ids[outer_test_samples] == testp],
                        return_counts=True)
                    ind = np.argmax(counts)
                    preds[testp] = values[ind]
                else:
                    # participant does not occur in outer test set
                    preds[testp] = -1

            # save the resulting feature importance
            feat_imp[outer_i, chosen_features] = clf.feature_importances_
        else:
            for testp in outer_test_participants:
                preds[testp] = -1
            feat_imp[outer_i, chosen_features] = -1

    # compute resulting F1 score and save to file
    nonzero_preds = preds[preds > -1]
    nonzero_truth = participant_scores[preds > -1]
    f1 = f1_score(nonzero_truth, nonzero_preds, average='macro')
    accuracy = accuracy_score(nonzero_truth, nonzero_preds)
    # Save the array itself, not its name as a string.
    np.savez(filename,
             f1=f1,
             accuracy=accuracy,
             feature_importances=feat_imp,
             inner_feat_importances=inner_feat_importances)
from sklearn.metrics import classification_report as CR

print("Classification Report:\n", CR(Y_test, pred, zero_division=0))

# ### Cross Validation

# In[12]:

from sklearn.model_selection import StratifiedKFold as SKF
from sklearn.model_selection import cross_val_score as CVS

model = SVC(kernel='rbf', C=13, gamma=0.325)
folds = 5
start = T()
cross_val = SKF(n_splits=folds, shuffle=True, random_state=4)
scores = CVS(model, X, Y, scoring='accuracy', cv=cross_val)
end = T()
accuracy = scores.mean() * 100
print(f"SVC has mean accuracy of {accuracy:.3f}%\n" +
      f"Cross Validation took {(end-start)*1000:.3f}ms")

# ### Calculate F1-Score of the model

# In[13]:

from sklearn.metrics import f1_score as F1

f1score = F1(Y_test, pred, average='weighted')
print(f"SVC has F1-Score = {f1score * 100:.3f}%")
embed_text.append(vec_embed)

embed_text = np.asarray(embed_text)
print(embed_text.shape)  # this is 3D; flatten to 2D for the classifiers
flat_embeds = np.reshape(embed_text, (embed_text.shape[0], -1))
print(flat_embeds.shape)  # shape in 2D

NB = MultinomialNB()
pc = Perceptron()
svm = LinearSVC()
lr = LogisticRegression()
random_forest = rf()
KNN = knn(n_neighbors=3)
CNN = cnn()

from sklearn.model_selection import StratifiedKFold as SKF

skf = SKF(n_splits=5)
X = flat_embeds
y = label
for clf in [lr]:
    # for clf in [pc, svm, lr, KNN, CNN, random_forest]:
    acc = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf.fit(X_train, y_train)
        acc.append(clf.score(X_test, y_test))
    acc = np.asarray(acc)
    print(clf, acc.mean())
print("[!] using adv_training") adversarial_training(model, adv_layer_names, 0.5) evaluator = Evaluate(filename=cfg["filename"] + "_fold{}".format(fold_id), data=dev_data) model.fit_generator(train_D.__iter__(), steps_per_epoch=len(train_D), epochs=RUN_EPOCH, callbacks=[evaluator], shuffle=True ) del model, train_data, dev_data gc.collect() print("[!] finish fold_id =", fold_id) print("-" * 81) skf = SKF(FOLD_NUM, shuffle=False, random_state=SEED) print(all_data.shape) _t0 = time() for fold_id, (trn_ind, val_ind) in enumerate(skf.split(range(len(all_data)), all_data["label"])): if fold_id not in FOLD_ID: continue t0 = time() dev_data = all_data.iloc[val_ind].reset_index(drop=True) train_data = all_data.iloc[trn_ind].reset_index(drop=True) cfg["num_example"] = len(train_data) print("-" * 81) print("[!] start fold_id =", fold_id, train_data.shape, dev_data.shape) print(cfg) K.clear_session() gc.collect()
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


clf_class = XGBC  # Replace the classifier here
to_add = 'XGBoost Classifier'  # Replace the name here
# kwargs

# Construct a k-folds object
skf = SKF(n_splits=5, shuffle=True)
skf.get_n_splits(X, y)
y_pred = y.copy()
flag = False

# Iterate through folds
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train = y[train_index]
    # Initialize a classifier with keyword arguments
    clf = clf_class()
    clf.fit(X_train, y_train)
    y_pred[test_index] = clf.predict(X_test)
    # if (not flag):
    #     flag = True
    #     feature_importance = clf.feature_importances_
    #     # make importances relative to max importance
    clf = lgb.train(params=params, train_set=dtrain, valid_sets=dval)
    imp_df = pd.DataFrame()
    imp_df['feature'] = featurename
    imp_df['importance_gain'] = clf.feature_importance(importance_type='gain')
    imp_df['importance_split'] = clf.feature_importance(importance_type='split')
    return imp_df, clf


def impdf_all_fu(imp_df_all, threshold):
    imp_df_all_normal = imp_df_all[['feature', 'importance_split']].groupby(
        ['feature'], as_index=False).mean()
    imp_df_all_normal = imp_df_all_normal.sort_values(
        'importance_split', ascending=False).reset_index(drop=True)
    imp_df_all_normal['normalized_importance'] = (
        imp_df_all_normal['importance_split'] /
        imp_df_all_normal['importance_split'].sum())
    imp_df_all_normal['cumulative_importance'] = np.cumsum(
        imp_df_all_normal['normalized_importance'])
    imp_df_all_normal = imp_df_all_normal.sort_values('cumulative_importance')
    record_low_importance = imp_df_all_normal[
        imp_df_all_normal['cumulative_importance'] > threshold]
    return imp_df_all_normal, record_low_importance


skf = SKF(n_splits=500, random_state=3, shuffle=True)
lightpara = {
    'objective': 'binary',
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'num_leaves': 50,
    'tree_learner': 'data',
    'num_threads': 8,
    'bagging_fraction': 0.8,
    'feature_fraction': 0.8,
    'metric': 'auc',
}
clfreportall = pd.DataFrame()
otherreportall = pd.DataFrame()
inpathlb = ['365lgb', '730lgb', '1095lgb', '1460lgb', '1825lgb']
aucresult1 = []
accresult1 = []
mccresult1 = []
for k in range(len(inpath)):
    print("start reading")
    table = pd.read_csv(inpath[k])
    table = table[['Uid', 'variable', 'status']]
    table = table.drop_duplicates()
    patientclass = pd.read_csv(classpath[k])
    patientclass = patientclass[['Uid', 'class', 't2dmclass', 'controlclass', 'classweight']]
    totaluid = patientclass['Uid'].values
set_rf_samples(60000)
# reset_rf_samples() to revert back to default behavior

# ### Building a classifier

# In[23]:

m = RandomForestClassifier(n_jobs=-1, max_depth=5)
predictions = np.zeros(y.shape)

# We use a stratified k-fold (20 splits here) so that the class percentages
# are preserved in every fold and each row is scored exactly once. For each
# row the classifier estimates the probability of it belonging to the train set.

# In[24]:

skf = SKF(n_splits=20, shuffle=True, random_state=100)
for fold, (train_idx, test_idx) in enumerate(skf.split(x, y)):
    X_train, X_test = x[train_idx], x[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    m.fit(X_train, y_train)
    probs = m.predict_proba(X_test)[:, 1]
    predictions[test_idx] = probs

# ### Results

# We report the ROC-AUC of this classifier as an estimate of how much
# covariate shift the data has. The AUC is very close to 0.5, which means the
# classifier cannot tell whether a row comes from train or test: most
# observations come from a feature space that is not particular to either set.

# In[25]:
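# The snippet is cut off before the metric itself is computed; a minimal
# sketch of that missing step, assuming roc_auc_score from sklearn.metrics:
from sklearn.metrics import roc_auc_score

auc = roc_auc_score(y, predictions)
print(f'ROC-AUC between train and test distributions: {auc:.4f}')
# Values near 0.5 indicate little or no covariate shift.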
def train_model(train_data, valid_data, feat_cols, fold=1):
    # scores_ = []
    hold_score = {}
    train_score = {}
    train_data, valid_data = train_data.reset_index(), valid_data.reset_index()
    train = train_data
    test = valid_data
    X_train_1, y_train_1 = train.drop('isFraud', axis=1), train['isFraud']
    X_test, y_test = test.drop('isFraud', axis=1), test['isFraud']

    # important row
    X = X_train_1
    y = y_train_1
    kf = SKF(n_splits=5, shuffle=True)
    kf.get_n_splits(X, y)
    kfold_train_1 = []
    kfold_test_1 = []
    for train_index, test_index in kf.split(X, y):
        # print("TRAIN:", train_index, "TEST:", test_index)
        kfold_train_1.append([train_index])
        kfold_test_1.append([test_index])

    # hidden_units=[16, 16, 16, 16, 16, 16]
    dnn_model = tf.estimator.DNNClassifier(
        hidden_units=[16, 16, 16, 16],
        feature_columns=feat_cols,
        model_dir='/home/edgar/Desktop/Fraud/saved_models_{}/'.format(fold),
        n_classes=2,
        optimizer=lambda: tf.train.AdamOptimizer(
            learning_rate=tf.train.exponential_decay(
                learning_rate=0.001,
                global_step=tf.train.get_global_step(),
                decay_steps=5000,
                decay_rate=0.86)))

    for i in range(0, 5):
        train = train_data.iloc[list(kfold_train_1[:][i][0])]
        test = valid_data.iloc[list(kfold_test_1[:][i][0])]
        X_train_2, y_train_2 = train.drop('isFraud', axis=1), train['isFraud']
        X_test_2, y_test_2 = test.drop('isFraud', axis=1), test['isFraud']
        input_func = tf.estimator.inputs.pandas_input_fn(
            x=X_train_2, y=y_train_2, batch_size=800, num_epochs=1500, shuffle=True)
        dnn_model.train(input_fn=input_func, steps=15000)
        hold_score['Kfold_{}_sub_{}'.format(fold, i)] = eval_input_func(
            x_=X_train_2, y_=y_train_2, model=dnn_model)
        train_score['Kfold_{}_sub_{}'.format(fold, i)] = eval_input_func(
            x_=X_test, y_=y_test, model=dnn_model)

    train_score_, hold_score_ = [], []
    for key in hold_score.keys():
        hold_score_.append(hold_score[key])
        train_score_.append(train_score[key])
    dftocsv = pd.DataFrame({'train_score': train_score_, 'hold_score': hold_score_})
    dftocsv.to_csv('scores_{}.csv'.format(fold))

    b = eval_input_func(x_=X_test, y_=y_test, model=dnn_model)
    print('\n')
    print('*********************************************************')
    print('*********************************************************')
    print('\n')
    print('Fold {}, Accuracy: {}'.format(fold, b))
    print('\n')
    print('*********************************************************')
    print('*********************************************************')
    print('\n')
    'max_bin': 2**8 - 1,
    'metric': 'auc',
    'colsample_bytree': 0.33,  # 0.4
    'bagging_fraction': 0.9,
    'bagging_freq': 10,
    'scale_pos_weight': 1.02,
    'bagging_seed': 619,  # 619
    'feature_fraction_seed': 619  # 619
}
nrounds = 2000
kfolds = 5
oof_train = pd.DataFrame({'UCIC_ID': tr_ids, 'Responders': 0})
best = []
score = []
skf = SKF(n_splits=kfolds, shuffle=True, random_state=123)
# enumerate the folds; the original never incremented its counter, so it
# always printed "Fold 1"
for i, (train_index, test_index) in enumerate(skf.split(df_train, Y)):
    print('Fold {0}'.format(i + 1))
    X_train, X_val = df_train[train_index], df_train[test_index]
    y_train, y_val = Y[train_index], Y[test_index]
    ltrain = lgb.Dataset(X_train, y_train)
    lval = lgb.Dataset(X_val, y_val, reference=ltrain)
    gbdt = lgb.train(lgb_params, ltrain, nrounds,
                     valid_sets=lval,
                     verbose_eval=100,
                     early_stopping_rounds=30)
    bst = gbdt.best_iteration
    pred = gbdt.predict(X_val, num_iteration=bst)
    oof_train.loc[test_index, "Responders"] = pred
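# Hedged follow-up (not in the original snippet): once every fold has filled
# its slice of oof_train, the out-of-fold AUC gives a single validation
# number for the whole training set.
from sklearn.metrics import roc_auc_score

oof_auc = roc_auc_score(Y, oof_train["Responders"].values)
print("out-of-fold AUC: {:.5f}".format(oof_auc))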
"pca-gnn f1", "lda-gnn precision", "lda-gnn recall", "lda-gnn f1" ]) for n_components in [5, 10, 15, 20, 25, 30, 40]: pca = PCA(n_components=n_components) # 主成分分析 lda = LDA(n_components=n_components) # 線形判別分析 # zipは複数のリストの要素をまとめて取得 steps1 = list(zip(["pca", "gnb"], [pca, gnb])) steps2 = list(zip(["lda", "gnb"], [lda, gnb])) p1 = Pipeline(steps1) p2 = Pipeline(steps2) score_lst = [] for decomp_name, clf in zip(["pca", "lda"], [p1, p2]): trues = [] preds = [] for train_index, test_index in SKF(shuffle=True, random_state=0).split( digits.data, digits.target): clf.fit(digits.data[train_index], digits.target[train_index]) trues.append(digits.target[test_index]) preds.append(clf.predict(digits.data[test_index])) scores = prf(np.hstack(trues), np.hstack(preds), average="macro") score_lst.extend(scores[:-1]) df = df.append(pd.Series([n_components, *score_lst], index=df.columns), ignore_index=True) print(df) df.plot(x="n_components", y=["pca-gnn f1", "lda-gnn f1"]) plt.savefig("判別成分分析_参考.png")