import joblib
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC

# N_CLASSES, predict_proba, from_project_root, init_clfs, generate_vectors,
# validate_clf, calc_metrics and id2sub are project-local helpers imported
# elsewhere in this module.


def gen_data_for_stacking(clf, X, y, X_test, n_splits=5, random_state=None):
    """ generate single model result data for stacking

    Args:
        clf: single model
        X: original X
        y: original y
        X_test: original X_test
        n_splits: n_splits for skf
        random_state: random_state for skf

    Returns:
        y_pred_proba, y, y_test_pred_proba

    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=bool(random_state), random_state=random_state)
    y_pred = np.zeros((X.shape[0],))  # for printing score of each fold
    y_pred_proba = np.zeros((X.shape[0], N_CLASSES))
    y_test_pred_proba = np.zeros((X_test.shape[0], N_CLASSES))
    for ind, (train_index, cv_index) in enumerate(skf.split(X, y)):  # cv split
        X_train, X_cv = X[train_index], X[cv_index]
        y_train, y_cv = y[train_index], y[cv_index]
        clf.fit(X_train, y_train)
        y_pred[cv_index] = clf.predict(X_cv)
        y_pred_proba[cv_index] = predict_proba(clf, X_cv)
        print("%d/%d cv macro f1 :" % (ind + 1, n_splits),
              f1_score(y_cv, y_pred[cv_index], average='macro'))
        y_test_pred_proba += predict_proba(clf, X_test)
    print("macro f1:", f1_score(y, y_pred, average='macro'))  # overall out-of-fold macro f1
    y_test_pred_proba /= n_splits  # average test probabilities over folds
    return y_pred_proba, y, y_test_pred_proba
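# A minimal usage sketch (not part of the original pipeline): collect the
# out-of-fold probabilities of several base models via gen_data_for_stacking and
# fit a meta-classifier on them. `base_clfs` and the LogisticRegression meta
# model are illustrative assumptions.
def stack_with_meta_learner(base_clfs, X, y, X_test, n_splits=5, random_state=233):
    train_metas, test_metas = [], []
    for base_clf in base_clfs:
        X_meta, _, X_test_meta = gen_data_for_stacking(base_clf, X, y, X_test,
                                                       n_splits=n_splits, random_state=random_state)
        train_metas.append(X_meta)
        test_metas.append(X_test_meta)
    meta_clf = LogisticRegression(solver='liblinear')
    meta_clf.fit(np.hstack(train_metas), y)  # train the meta model on stacked features
    return meta_clf.predict(np.hstack(test_metas))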
def train_and_gen_result(clf, X, y, X_test, use_proba=False, save_url=None, n_splits=1, random_state=None):
    """ train and generate result with specific clf

    Args:
        clf: classifier
        X: vectorized data
        y: target
        X_test: test data
        use_proba: output label probabilities instead of the label itself
        save_url: url to save the result file, None to skip saving
        n_splits: n_splits for k-fold, 1 to train on the full data without k-fold
        random_state: random_state for k-fold

    """
    if n_splits > 1:
        # dummy all-zero labels make StratifiedKFold behave like a plain KFold here
        skf = StratifiedKFold(n_splits=n_splits, shuffle=bool(random_state), random_state=random_state)
        y_pred_proba = np.zeros((X_test.shape[0], N_CLASSES))
        for train_index, _ in skf.split(X, np.zeros((len(y),))):
            X_train = X[train_index]
            y_train = y[train_index]
            clf.fit(X_train, y_train)
            y_pred_proba += predict_proba(clf, X_test, X_train, y_train)
        y_pred_proba /= n_splits
        y_pred = y_pred_proba.argmax(axis=1) + 1
        # TODO: generate multi-label output correctly and evaluate it correctly
    else:
        clf.fit(X, y)
        y_pred_proba = predict_proba(clf, X_test, X, y)
        y_pred = clf.predict(X_test)

    if use_proba:
        result_df = pd.DataFrame(y_pred_proba,
                                 columns=['class_prob_' + str(i + 1) for i in range(N_CLASSES)])
    else:
        result_df = pd.DataFrame(y_pred, columns=['class'])

    if save_url:
        result_df.to_csv(save_url, index_label='id')
    return result_df
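# Example call (illustrative; the output path is an assumption, the pickle path
# mirrors the one used by gen_multi_data_for_stacking below):
# X, y, X_test = joblib.load(from_project_root('data/vector/stacked_one_XyX_test_32_subjects.pk'))
# train_and_gen_result(LinearSVC(), X, y, X_test, use_proba=True,
#                      save_url=from_project_root('data/result/lsvc_proba.csv'),
#                      n_splits=5, random_state=233)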
def gen_multi_data_for_stacking(n_splits=5, random_state=233):
    """ generate stacked probabilities for both single-subject and multi-subject samples """
    clf = OneVsRestClassifier(LogisticRegression(solver='liblinear'))
    X_one, _, X_test = joblib.load(from_project_root('data/vector/stacked_one_XyX_test_32_subjects.pk'))
    _, _, X_multi = joblib.load(from_project_root('data/vector/stacked_one_XyX_multi_32_subjects.pk'))
    train_df = pd.read_csv(from_project_root("data/train_2_ex.csv"))
    skf = StratifiedKFold(n_splits=n_splits, shuffle=bool(random_state), random_state=random_state)
    y = MultiLabelBinarizer().fit_transform(train_df['subjects'].apply(str.split))

    one_ind = train_df['n_subjects'] == 1
    multi_ind = train_df['n_subjects'] > 1
    y_for_split = train_df['subjects'][one_ind].values.astype(int)  # single-subject labels for stratifying
    y_one = y[one_ind]

    y_proba = np.zeros((len(train_df), 10))
    y_pred_one = np.zeros((X_one.shape[0], 10))  # for printing score of each fold
    y_proba_one = np.zeros((X_one.shape[0], 10))
    y_test_proba = np.zeros((X_test.shape[0], 10))
    y_proba_multi = np.zeros((X_multi.shape[0], 10))

    for ind, (train_index, cv_index) in enumerate(skf.split(X_one, y_for_split)):  # cv split
        X_train, X_cv = X_one[train_index], X_one[cv_index]
        y_train, y_cv = y_one[train_index], y_one[cv_index]
        clf.fit(X_train, y_train)
        y_pred_one[cv_index] = clf.predict(X_cv)
        y_proba_one[cv_index] = predict_proba(clf, X_cv)
        print("%d/%d cv micro f1 :" % (ind + 1, n_splits),
              f1_score(y_cv, y_pred_one[cv_index], average='micro'))
        y_test_proba += predict_proba(clf, X_test)
        y_proba_multi += predict_proba(clf, X_multi)

    print("micro f1:", f1_score(y_one, y_pred_one, average='micro'))  # overall out-of-fold micro f1
    y_test_proba /= n_splits  # average over folds
    y_proba_multi /= n_splits  # average over folds
    y_proba[one_ind] = y_proba_one
    y_proba[multi_ind] = y_proba_multi
    print(y_proba.shape, y.shape, y_test_proba.shape)
    return y_proba, y, y_test_proba
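# predict_proba is a project helper defined elsewhere; below is a minimal sketch
# of the behavior the calls above assume: use the classifier's own predict_proba
# when available, otherwise (e.g. LinearSVC) apply a softmax over the
# decision_function margins. The optional X_train/y_train arguments, as passed by
# train_and_gen_result, are accepted but ignored here; this is an assumption, not
# the original implementation.
def predict_proba_sketch(clf, X, X_train=None, y_train=None):
    if hasattr(clf, 'predict_proba'):
        return clf.predict_proba(X)
    scores = clf.decision_function(X)
    if scores.ndim == 1:  # binary case: expand margins to two columns
        scores = np.vstack((-scores, scores)).T
    exp_scores = np.exp(scores - scores.max(axis=1, keepdims=True))  # stabilized softmax
    return exp_scores / exp_scores.sum(axis=1, keepdims=True)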
def validate(pkl_url=None, cv=5, evaluating=False):
    """ do validating

    Args:
        pkl_url: load data from pickle file, set to None to generate data instantly
        cv: number of cross-validation folds
        evaluating: whether to do evaluating on test_gold

    """
    clfs = init_clfs()
    val_url = from_project_root("data/preliminary/test_gold_ex.csv")
    if pkl_url is not None:
        # load from pickle
        print("loading data from", pkl_url)
        X, y, X_val = joblib.load(pkl_url)
    else:
        # generate from original csv
        train_url = from_project_root("data/preliminary/train_ex.csv")
        X, y, X_val = generate_vectors(train_url, val_url, column='article', max_n=3, min_df=3, max_df=0.8,
                                       max_features=20000, trans_type='dc', sublinear_tf=True, balanced=True,
                                       multilabel_out=False, label_col='subjects', only_single=True, shuffle=True)
    print("data shapes:\n", X.shape, y.shape, X_val.shape)
    for name, clf in clfs.items():
        if len(y.shape) > 1:  # wrap the classifier for multilabel targets
            clf = OneVsRestClassifier(clf)
        print("cross validation on %s is running" % name)
        validate_clf(clf, X, y, cv=cv, scoring='f1_micro')
        if evaluating:
            print("metrics of %s classifier:" % name)
            clf.fit(X, y)
            y_true = pd.read_csv(val_url, usecols=list(map(str, range(10)))).values < 2
            y_pred = clf.predict(X_val)
            y_probas = predict_proba(clf, X_val)
            calc_metrics(y_true, y_pred, y_probas)
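# init_clfs is defined elsewhere in the project; a plausible sketch, assuming it
# returns a name-to-classifier dict (the exact members are an assumption):
def init_clfs_sketch():
    from sklearn.naive_bayes import MultinomialNB
    return {
        'lsvc': LinearSVC(),
        'lr': LogisticRegression(solver='liblinear'),
        'mnb': MultinomialNB(),
    }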
def gen_10bi_result(train_url, test_url, validating=False, evaluating=False):
    """ generate results with 10 binary classifiers, one per subject

    Args:
        train_url: url of csv train data
        test_url: url of csv test data
        validating: whether to do validating
        evaluating: whether to do evaluating on test_gold

    Returns:
        stacked predictions and probabilities of belonging to each subject

    """
    tdf = pd.read_csv(test_url)['content_id']
    n_samples = len(tdf)
    y_probas = np.empty(shape=(n_samples, 0))
    y_pred = np.empty(shape=(n_samples, 0), dtype=int)
    for col in range(10):
        # X, y, X_test = generate_vectors(train_url, test_url, column='article', max_n=3, min_df=3, max_df=0.8,
        #                                 max_features=30000, trans_type='dc', sublinear_tf=True, balanced=True,
        #                                 multilabel_out=False, label_col='subjects', only_single=False, shuffle=True,
        #                                 apply_fun=lambda label: str(col) in label)
        X, y, X_test = joblib.load(
            from_project_root("data/vector/stacked_all_XyX_val_32_%d.pk" % col))
        clf = LinearSVC()
        print("running on subject %s" % id2sub(col))
        if validating:
            validate_clf(clf, X, y, scoring='f1')
        clf.fit(X, y)
        proba = predict_proba(clf, X_test)[:, 1:2]  # keep only the positive-class column
        y_probas = np.hstack((y_probas, proba))
        y_pred = np.hstack((y_pred, clf.predict(X_test).reshape(-1, 1)))
    if evaluating:
        y_true = pd.read_csv(test_url, usecols=list(map(str, range(10)))).values < 2
        calc_metrics(y_true, y_pred, y_probas)
    return y_pred, y_probas
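# Hypothetical post-processing sketch: turn the stacked binary probabilities into
# space-separated subject-id strings, keeping every subject above `threshold` and
# falling back to the argmax so each sample gets at least one label. The function
# name and threshold value are assumptions.
def probas_to_subjects(y_probas, threshold=0.5):
    subjects = []
    for row in y_probas:
        chosen = np.where(row > threshold)[0]
        if len(chosen) == 0:  # no subject passed the threshold, keep the most likely one
            chosen = [int(row.argmax())]
        subjects.append(' '.join(str(int(c)) for c in chosen))
    return subjects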