# Shared imports. The CV helpers below use the legacy sklearn.cross_validation
# API (sklearn < 0.18) that this code was written against; pykappa is the local
# module providing quadratic_weighted_kappa, and it is assumed here that the
# bare quadratic_weighted_kappa used by the model wrappers comes from the same
# module. The wrappers further down expect x_train / y_train / x_test / y_test
# and feat_list to be defined at module level.
import os

import h5py
import numpy as np
from sklearn.cross_validation import StratifiedKFold, KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.svm import SVR

import pykappa
from pykappa import quadratic_weighted_kappa


def make_mf_regression(X, y, clf, qid, X_test, n_round=3):
    '''
    Fit metafeature by @clf and get prediction for test.
    Assumes that @clf is a regressor.
    '''
    print(clf)
    mf_tr = np.zeros(X.shape[0])
    mf_te = np.zeros(X_test.shape[0])
    for i in range(n_round):
        skf = StratifiedKFold(qid, n_folds=2, shuffle=True,
                              random_state=42 + i * 1000)
        for ind_tr, ind_te in skf:
            X_tr = X[ind_tr]
            X_te = X[ind_te]
            y_tr = y[ind_tr]
            y_te = y[ind_te]
            clf.fit(X_tr, y_tr)
            mf_tr[ind_te] += clf.predict(X_te)
            # each of the 2 folds contributes half of the test prediction
            mf_te += clf.predict(X_test) * 0.5
            y_pred = np.round(clf.predict(X_te))
            kappa = pykappa.quadratic_weighted_kappa(y_te, y_pred)
            acc = np.mean(y_te == y_pred)
            print('pred[{}] kappa:{}, acc:{}'.format(i, kappa, acc))
    return (mf_tr / n_round, mf_te / n_round)
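# Usage sketch for make_mf_regression (my own example on synthetic data, not
# from the source): Ridge as the base model, 0-4 style integer labels, and a
# hypothetical qid grouping for the stratified split.
def _demo_mf_regression():
    from sklearn.linear_model import Ridge
    rng = np.random.RandomState(0)
    X_demo = rng.rand(200, 10)
    y_demo = rng.randint(1, 5, 200).astype(float)   # 1..4 relevance labels
    qid_demo = rng.randint(0, 20, 200)              # query ids for stratification
    X_test_demo = rng.rand(50, 10)
    mf_tr, mf_te = make_mf_regression(X_demo, y_demo, Ridge(alpha=1.0),
                                      qid_demo, X_test_demo)
    # stack the metafeature next to the raw features for a second-level model
    return (np.hstack((X_demo, mf_tr[:, None])),
            np.hstack((X_test_demo, mf_te[:, None])))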
def LogR(C=1):
    from sklearn.linear_model import LogisticRegression
    lr = LogisticRegression(C=C,
                            penalty="l2",
                            dual=False,
                            tol=1e-5,
                            fit_intercept=True,
                            intercept_scaling=1.0,
                            class_weight='balanced',
                            n_jobs=-1,
                            max_iter=10000,
                            solver="lbfgs")
    lr.fit(x_train, y_train)
    y_pred = list(lr.predict(x_test))
    qwk = quadratic_weighted_kappa(y_test, y_pred)
    print("kappa", qwk)
    print("LogR validation Acc = ", calcAcc(y_pred, y_test))
    # accuracy on the training set
    y_pred = list(lr.predict(x_train))
    print("LogR train Acc = ", calcAcc(y_pred, y_train))
    return y_pred
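# calcAcc is referenced by the model wrappers but not defined in this file.
# A minimal sketch, assuming it is plain classification accuracy over two
# equal-length label sequences (an assumption, not the source implementation).
def calcAcc(y_pred, y_true):
    y_pred = np.asarray(y_pred)
    y_true = np.asarray(y_true)
    return np.mean(y_pred == y_true)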
def RF():
    from sklearn.ensemble import RandomForestClassifier
    # train a Random Forest classifier (15000 trees, unlimited depth)
    forest = RandomForestClassifier(n_estimators=15000,
                                    max_features="auto",
                                    max_depth=None,
                                    n_jobs=-1)
    forest = forest.fit(x_train, y_train)
    # predict on the held-out set
    y_pred = list(forest.predict(x_test))
    qwk = quadratic_weighted_kappa(y_test, y_pred)
    print("kappa", qwk)
    print("RF Acc = ", calcAcc(y_pred, y_test))
def make_mf_classification2(X, y, clf, qid, X_test, n_round=3):
    '''
    Fit metafeature by @clf and get prediction for test.
    Assumes that @clf is a classifier and exactly 2 classes are present.
    '''
    print(clf)
    mf_tr = np.zeros((X.shape[0], 2))
    mf_te = np.zeros((X_test.shape[0], 2))
    for i in range(n_round):
        skf = StratifiedKFold(qid, n_folds=2, shuffle=True,
                              random_state=42 + i * 1000)
        for ind_tr, ind_te in skf:
            X_tr = X[ind_tr]
            X_te = X[ind_te]
            y_tr = y[ind_tr]
            y_te = y[ind_te]
            clf.fit(X_tr, y_tr)
            try:
                mf_tr[ind_te] += clf.predict_proba(X_te)
                mf_te += clf.predict_proba(X_test) * 0.5
            except AttributeError:
                # models without predict_proba (e.g. SVC without
                # probability=True) fall back to the decision function
                mf_tr[ind_te, 0] += clf.decision_function(X_te)
                mf_te[:, 0] += clf.decision_function(X_test) * 0.5
            y_pred = np.round(clf.predict(X_te))
            kappa = pykappa.quadratic_weighted_kappa(y_te, y_pred)
            acc = np.mean(y_te == y_pred)
            print('prob[{}] kappa:{}, acc:{}'.format(i, kappa, acc))
        print()
    return (mf_tr / n_round, mf_te / n_round)
def make_mf_classification4(X, y, clf, qid, X_test, n_round=3):
    # 4-class variant: columns 0-3 hold the class probabilities (or the
    # decision function), column 4 holds the raw class prediction
    print(clf)
    mf_tr = np.zeros((X.shape[0], 5))
    mf_te = np.zeros((X_test.shape[0], 5))
    for i in range(n_round):
        skf = StratifiedKFold(qid, n_folds=2, shuffle=True,
                              random_state=42 + i * 1000)
        for ind_tr, ind_te in skf:
            X_tr = X[ind_tr]
            X_te = X[ind_te]
            y_tr = y[ind_tr]
            y_te = y[ind_te]
            clf.fit(X_tr, y_tr)
            mf_tr[ind_te, 4] += clf.predict(X_te)
            mf_te[:, 4] += clf.predict(X_test) * 0.5
            try:
                mf_tr[ind_te, :4] += clf.predict_proba(X_te)
                mf_te[:, :4] += clf.predict_proba(X_test) * 0.5
            except AttributeError:
                mf_tr[ind_te, :4] += clf.decision_function(X_te)
                mf_te[:, :4] += clf.decision_function(X_test) * 0.5
            y_pred = np.round(clf.predict(X_te))
            kappa = pykappa.quadratic_weighted_kappa(y_te, y_pred)
            acc = np.mean(y_te == y_pred)
            print('prob[{}] kappa:{}, acc:{}'.format(i, kappa, acc))
        print()
    return (mf_tr / n_round, mf_te / n_round)
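# Sketch (my own composition, not from the source): combine the regression and
# classification metafeatures into a second-level feature matrix. Assumes y
# holds exactly four classes, as make_mf_classification4 expects; the base
# models here are illustrative stand-ins.
def build_stacked_features(X, y, qid, X_test):
    from sklearn.linear_model import Ridge, LogisticRegression
    mf_reg_tr, mf_reg_te = make_mf_regression(X, y, Ridge(), qid, X_test)
    mf_clf_tr, mf_clf_te = make_mf_classification4(
        X, y, LogisticRegression(), qid, X_test)
    X_stack = np.hstack((X, mf_reg_tr[:, None], mf_clf_tr))
    X_test_stack = np.hstack((X_test, mf_reg_te[:, None], mf_clf_te))
    return X_stack, X_test_stack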
def make_mf_sliced_classification(subset_tr, subset_te, clf, n_round=3,
                                  target_col='median_relevance',
                                  text_col='title_stem'):
    '''
    Perform per-query slicing, build a BoW over @text_col, fit @clf
    and get predictions for test.
    Assumes that @clf is a classifier.
    '''
    print('\n [make_mf_slice]')
    print(clf)
    mf_tr = np.zeros(len(subset_tr))
    mf_te = np.zeros(len(subset_te))
    # query-slice
    for cur_query in subset_tr.query_stem.value_counts().index:
        mask_tr = subset_tr.query_stem == cur_query
        mask_te = subset_te.query_stem == cur_query
        # build BoW over the train and test titles of this query
        vect = CountVectorizer(min_df=1, ngram_range=(1, 2))
        txts = (list(subset_tr[mask_tr][text_col].values) +
                list(subset_te[mask_te][text_col].values))
        vect.fit(txts)
        X_loc_base = vect.transform(list(subset_tr[mask_tr][text_col].values)).todense()
        X_loc_hold = vect.transform(list(subset_te[mask_te][text_col].values)).todense()
        y_loc_train = subset_tr[mask_tr][target_col].values
        # keep only the terms that occur in both the train and test slice
        feat_counts = (np.array(np.sum(X_loc_base, axis=0))[0] *
                       np.array(np.sum(X_loc_hold, axis=0))[0])
        feat_mask = np.where(feat_counts > 0)[0]
        # build the final feature matrices (feat_list is a module-level list
        # of precomputed numeric feature columns)
        X_loc_base = np.hstack((X_loc_base[:, feat_mask],
                                subset_tr[mask_tr][feat_list]))
        X_loc_hold = np.hstack((X_loc_hold[:, feat_mask],
                                subset_te[mask_te][feat_list]))
        # metafeature accumulators for this slice
        tmp_tr = np.zeros(sum(mask_tr))
        tmp_te = np.zeros(sum(mask_te))
        for i in range(n_round):
            kf = KFold(len(y_loc_train), n_folds=2, shuffle=True,
                       random_state=42 + i * 1000)
            for ind_tr, ind_te in kf:
                X_tr = X_loc_base[ind_tr]
                X_te = X_loc_base[ind_te]
                y_tr = y_loc_train[ind_tr]
                y_te = y_loc_train[ind_te]
                clf.fit(X_tr, y_tr)
                tmp_tr[ind_te] += clf.predict(X_te)
                tmp_te += clf.predict(X_loc_hold) * 0.5
        mf_tr[mask_tr.values] = tmp_tr / n_round
        mf_te[mask_te.values] = tmp_te / n_round
        # running kappa/accuracy over the slices filled in so far
        y_valid = subset_tr[target_col].values
        kappa = pykappa.quadratic_weighted_kappa(y_valid, np.round(mf_tr))
        acc = np.mean(y_valid == np.round(mf_tr))
        print('[{}] kappa:{}, acc:{}'.format(i, kappa, acc))
    return (mf_tr, mf_te)
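# Usage sketch for the slicer (my own toy frames, not from the source): it
# expects pandas DataFrames carrying 'query_stem' and the text column, plus the
# numeric columns named in the module-level feat_list; every name built here
# is hypothetical.
def _demo_sliced_mf():
    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    rng = np.random.RandomState(0)

    def make_frame(n):
        return pd.DataFrame({
            'query_stem': ['red shoe'] * n,
            'title_stem': ['red running shoe sale'] * n,
            'len_feat': rng.rand(n),
            'median_relevance': rng.randint(1, 5, n),
        })

    global feat_list
    feat_list = ['len_feat']
    return make_mf_sliced_classification(make_frame(40), make_frame(10),
                                         LogisticRegression())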
def svr():
    clf = SVR(C=4.0, gamma=0.2, cache_size=2048, kernel='rbf')
    clf.fit(x_train, y_train, sample_weight=getWeights())
    y_pred = list(clf.predict(x_test))
    y_p = rounding_cdf(y_pred)
    qwk = quadratic_weighted_kappa(y_test, y_p)
    print("SVR Kappa:", qwk)
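# rounding_cdf is used by the regression models here but not defined in this
# file. A minimal sketch, assuming it does CDF matching: assign ordinal labels
# by prediction rank so the predicted label distribution follows a reference
# distribution (taken from the module-level y_train, an assumption).
def rounding_cdf(y_pred, y_ref=None):
    y_pred = np.asarray(y_pred, dtype=float)
    if y_ref is None:
        y_ref = y_train
    labels, counts = np.unique(np.asarray(y_ref), return_counts=True)
    cdf = np.cumsum(counts) / float(counts.sum())
    order = np.argsort(y_pred)          # lowest predictions get lowest labels
    out = np.empty(len(y_pred), dtype=labels.dtype)
    cuts = np.floor(cdf * len(y_pred)).astype(int)
    start = 0
    for label, stop in zip(labels, cuts):
        out[order[start:stop]] = label
        start = stop
    out[order[start:]] = labels[-1]     # guard against rounding slack
    return out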
def ridge(alpha=1.0):
    from sklearn.linear_model import Ridge
    clf = Ridge(alpha=alpha, normalize=False)
    clf.fit(x_train, y_train, sample_weight=getWeights())
    y_pred = list(clf.predict(x_test))
    y_p = rounding_cdf(y_pred)
    qwk = quadratic_weighted_kappa(y_test, y_p)
    print("RidgeRegression Kappa:", qwk)
def LinearR():
    from sklearn.linear_model import LinearRegression
    model = LinearRegression(n_jobs=-1)
    model.fit(x_train, y_train, sample_weight=getWeights())
    y_pred = list(model.predict(x_test))
    y_p = rounding_cdf(y_pred)
    qwk = quadratic_weighted_kappa(y_test, y_p)
    print("LinearRegression Kappa:", qwk)
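# getWeights is referenced by the regressors above but not defined in this
# file. A minimal sketch, assuming inverse-class-frequency sample weights over
# the module-level y_train (an assumption, not the source implementation).
def getWeights(y=None):
    if y is None:
        y = y_train
    y = np.asarray(y)
    labels, counts = np.unique(y, return_counts=True)
    freq = dict(zip(labels, counts / float(len(y))))
    return np.array([1.0 / freq[v] for v in y])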
def lasso(alpha=1.0):
    from sklearn.linear_model import Lasso
    clf = Lasso(alpha=alpha, normalize=False)
    clf.fit(x_train, y_train)
    y_pred = list(clf.predict(x_test))
    y_p = rounding_cdf(y_pred)
    qwk = quadratic_weighted_kappa(y_test, y_p)
    print("kappa", qwk)
    print("LASSO rounding cdf Acc = ", calcAcc(y_p, y_test))
def validate(n_epochs, n_models, n_steps=5, activations=False):
    # model_factory, preprocess_labels, pick_activations, fit2distribution,
    # net_type, features_NN_prefix and the constants module are provided
    # elsewhere in the project
    with h5py.File(constants.train_features_scaled_strat_file, "r") as fi:
        labels_train = fi.get("y_train")[:60000]
        X_train = fi.get("X_train")[:60000]
        y_train, _ = preprocess_labels(labels_train,
                                       categorical=(net_type == 'softmax'))
        labels_test = fi.get("y_test")[()]
        X_test = fi.get("X_test")[()]
        y_test, _ = preprocess_labels(labels_test,
                                      categorical=(net_type == 'softmax'))
    # scale the 0-4 labels into [0.5, 0.9] for the regression net
    y_train = y_train / 5.0 / 2 + 0.5
    y_test = y_test / 5.0 / 2 + 0.5
    if net_type == 'softmax':
        n_classes = y_train.shape[1]
    elif net_type == 'regression':
        n_classes = 1
    print(n_classes, 'classes')
    n_dims = X_train.shape[1]
    print(n_dims, 'dims')
    cum_blend = 0
    models = range(1, n_models + 1)
    for i in models:
        print("\n-------------- Model %d --------------\n" % i)
        model = model_factory(n_classes, n_dims, net_type)
        for n in range(0, n_epochs, n_steps):
            model.fit(X_train, y_train, nb_epoch=n_steps, batch_size=128,
                      show_accuracy=False, verbose=2)
            # validate the individual net
            if net_type == 'softmax':
                y_pred = model.predict_classes(X_test, verbose=0)
            elif net_type == 'regression':
                y_pred = model.predict(X_test, verbose=0)
                y_pred = np.floor((y_pred - 0.5) * 2 * 5.0).flatten()
                y_pred[y_pred < 0] = 0
                y_pred[y_pred > 4] = 4
            print('Epoch: %d. Accuracy: %0.2f%%. Kappa: %0.2f' % (
                n + n_steps,
                100 * accuracy_score(labels_test, y_pred),
                quadratic_weighted_kappa(labels_test, y_pred)))
        # validate the ensemble
        if net_type == 'softmax':
            cum_blend += model.predict_proba(X_test, verbose=0)
            y_pred = np.argmax(cum_blend, axis=1)
        elif net_type == 'regression':
            cum_blend += model.predict(X_test, verbose=0)
            y_pred = np.floor((cum_blend / i - 0.5) * 2 * 5.0).flatten()
            y_pred[y_pred < 0] = 0
            y_pred[y_pred > 4] = 4
        print('\nBlend %d. Accuracy: %0.2f%%. Kappa: %0.2f' % (
            i,
            100 * accuracy_score(labels_test, y_pred),
            quadratic_weighted_kappa(labels_test, y_pred)))
        print('Confusion matrix:\n', confusion_matrix(labels_test, y_pred))
        fitted = fit2distribution(labels_test, cum_blend)
        print('\nFitted. Accuracy: %0.2f%%. Kappa: %0.2f' % (
            100 * accuracy_score(labels_test, fitted),
            quadratic_weighted_kappa(labels_test, fitted)))
        print('Confusion matrix:\n', confusion_matrix(labels_test, fitted))
        if activations:
            # dump hidden-layer activations as features for later stacking
            F_train = pick_activations(model, X_train, net_type)
            F_test = pick_activations(model, X_test, net_type)
            fout = os.path.join(constants.features_NN_dir,
                                features_NN_prefix + format(i, '02d') + '.hd5')
            with h5py.File(fout, "w") as fo:
                fo.create_dataset("X_train", data=F_train)
                fo.create_dataset("y_train", data=labels_train)
                fo.create_dataset("X_test", data=F_test)
                fo.create_dataset("y_test", data=labels_test)
            with h5py.File(fout, "r") as fi:
                X = fi.get("X_train")
                y = fi.get("y_train")
                XX = fi.get("X_test")
                yy = fi.get("y_test")
                print(X.shape, y.shape, XX.shape, yy.shape)
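# fit2distribution is used above but not defined in this file. A minimal
# sketch, assuming it rank-orders the blended scores and reassigns labels so
# the predicted histogram matches the reference labels (the same CDF-matching
# idea as rounding_cdf; all details here are assumptions).
def fit2distribution(labels_ref, blend):
    blend = np.asarray(blend)
    if blend.ndim == 2 and blend.shape[1] > 1:
        # softmax blend: use the expected rating as a ranking score
        score = blend.dot(np.arange(blend.shape[1]))
    else:
        score = blend.flatten()
    labels, counts = np.unique(np.asarray(labels_ref), return_counts=True)
    cdf = np.cumsum(counts) / float(counts.sum())
    order = np.argsort(score)
    out = np.empty(len(score), dtype=labels.dtype)
    start = 0
    for label, stop in zip(labels, np.floor(cdf * len(score)).astype(int)):
        out[order[start:stop]] = label
        start = stop
    out[order[start:]] = labels[-1]
    return out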
def func(data):
    # quadratic weighted kappa between the two columns of a stacked array
    d1 = data[:, 0]
    d2 = data[:, 1]
    return quadratic_weighted_kappa(d1, d2)
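# Reference sketch of the quadratic weighted kappa metric used throughout this
# file (normally imported from pykappa); written from the standard definition,
# not taken from the source.
def qwk_reference(rater_a, rater_b, min_rating=None, max_rating=None):
    rater_a = np.asarray(rater_a, dtype=int)
    rater_b = np.asarray(rater_b, dtype=int)
    if min_rating is None:
        min_rating = min(rater_a.min(), rater_b.min())
    if max_rating is None:
        max_rating = max(rater_a.max(), rater_b.max())
    n = max_rating - min_rating + 1
    # observed agreement matrix
    O = np.zeros((n, n))
    for a, b in zip(rater_a - min_rating, rater_b - min_rating):
        O[a, b] += 1
    # expected matrix from the marginal histograms
    E = np.outer(O.sum(axis=1), O.sum(axis=0)) / O.sum()
    # quadratic disagreement weights
    idx = np.arange(n)
    W = (idx[:, None] - idx[None, :]) ** 2 / float((n - 1) ** 2)
    return 1.0 - (W * O).sum() / (W * E).sum()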
# 'title_ext' variant of the per-query slicer: the same routine as
# make_mf_sliced_classification, with the BoW built over the expanded titles.
def make_mf_sliced_classification_ext(subset_tr, subset_te, clf, n_round=3,
                                      target_col='median_relevance'):
    return make_mf_sliced_classification(subset_tr, subset_te, clf,
                                         n_round=n_round,
                                         target_col=target_col,
                                         text_col='title_ext')