def run_tfidf(obj_field, target_field, generator, n_gram):
    """Build an n-gram TF-IDF feature between two text columns and pickle it.

    Reads the corpora from the module-level ``df``; ``generator`` is an
    extractor class whose instance exposes ``transform()`` and ``__name__()``.
    """
    source_texts = df[obj_field].values
    target_texts = df[target_field].values
    extractor = generator(source_texts, target_texts, ngram=n_gram)
    feature = extractor.transform()
    print(feature.shape)
    # File name encodes the n-gram size, extractor name and both fields.
    save_path = "features/feature_%d_gram_%s_%s_%s.pkl" % (
        n_gram, extractor.__name__(), obj_field, target_field)
    to_pkl(feature, save_path)
def run_char_dist_sim(obj_field, target_field, generator):
    """Build a character-distribution similarity feature and pickle it.

    Reads the corpora from the module-level ``df``; ``generator`` is an
    extractor class whose instance exposes ``transform()`` and ``__name__()``.
    """
    source_texts = df[obj_field].values
    target_texts = df[target_field].values
    extractor = generator(source_texts, target_texts)
    feature = extractor.transform()
    print(feature.shape)
    save_path = "features/feature_%s_%s_%s.pkl" % (
        extractor.__name__(), obj_field, target_field)
    to_pkl(feature, save_path)
def run_lsa_ngram(df, field):
    """Dump LSA word n-gram features (n = 1, 2, 3) for one text column of *df*."""
    corpus = df[field].values
    for n in (1, 2, 3):
        extractor = LSA_Word_Ngram(corpus, None, n,
                                   config.SVD_DIM, config.SVD_N_ITER)
        feature = extractor.transform()
        to_pkl(feature,
               "features/feature_lsa_word_%d_gram_%s.pkl" % (n, field))
def run_tfidf_char_ngram_cosinesim(obj_field, target_field):
    """Dump char n-gram TF-IDF cosine-similarity features (n = 1, 2, 3).

    Reads both corpora from the module-level ``df``.
    """
    source_texts = df[obj_field].values
    target_texts = df[target_field].values
    for n in (1, 2, 3):
        feature = TFIDF_Char_Ngram_CosineSim(source_texts, target_texts, n).transform()
        print(feature.shape)
        save_path = "features/feature_tfidf_cosinesim_char_%d_gram_%s_%s.pkl" % (
            n, obj_field, target_field)
        to_pkl(feature, save_path)
def feature_combine(feature_dir, save_path="features/train/X_10.pkl"):
    """Concatenate every pickled feature in *feature_dir* into one matrix.

    Loads each ``feature*`` pickle, promotes 1-D vectors to column vectors,
    horizontally stacks everything and pickles the result to *save_path*.

    Args:
        feature_dir: directory containing ``feature*.pkl`` files.
        save_path: destination pickle for the combined matrix
            (default kept for backward compatibility).

    Raises:
        ValueError: if no feature files are found (instead of an opaque
            ``np.concatenate`` error).
    """
    features = []
    # sorted() makes the column order deterministic across runs;
    # os.listdir order is filesystem-dependent.
    for file_name in sorted(os.listdir(feature_dir)):
        if not file_name.startswith("feature"):
            continue
        feature = load_pkl(os.path.join(feature_dir, file_name))
        if feature.ndim == 1:
            # Promote a 1-D vector to a single-column 2-D array.
            feature = feature[:, np.newaxis]
        features.append(feature)
    if not features:
        raise ValueError("no feature files found in %s" % feature_dir)
    print("features", len(features))
    X = np.concatenate(features, axis=1)
    print("X shape is:", X.shape)
    to_pkl(X, save_path)
def run_lsa_ngram_cooc(obj_field, target_field, generator):
    """Dump LSA co-occurrence features for every (obs, target) n-gram pair.

    Iterates the cartesian product of n-gram sizes {1, 2} x {1, 2}; reads the
    corpora from the module-level ``df``.
    """
    source_texts = df[obj_field].values
    target_texts = df[target_field].values
    for obs_n in (1, 2):
        for tgt_n in (1, 2):
            extractor = generator(source_texts, target_texts,
                                  obs_ngram=obs_n, target_ngram=tgt_n)
            feature = extractor.transform()
            print(feature.shape)
            save_path = "features/feature_%s_%s_%s.pkl" % (
                extractor.__name__(), obj_field, target_field)
            to_pkl(feature, save_path)
def train(train_data, y, num_each_group):
    """Fit an XGBRanker on row-grouped data and pickle the model.

    Rows are assumed to come in consecutive groups of *num_each_group*; a
    group-id column is prepended so the ranker sees the grouping.

    Args:
        train_data: 2-D array of features, shape (n_rows, n_features).
        y: relevance labels, one per row.
        num_each_group: number of consecutive rows per ranking group.

    Returns:
        The fitted XGBRanker (also saved to ``config.model_save_path``).

    Raises:
        ValueError: if the row count is not a multiple of *num_each_group*.
    """
    print("Start Training...")
    case_num = train_data.shape[0]
    # Bug fix: the original asserted case_num % groups_num == 0, which can
    # pass even when rows do not divide evenly into groups (e.g. 12 rows,
    # groups of 5). Validate against the group size itself, and raise
    # instead of assert (asserts are stripped under -O).
    if case_num % num_each_group != 0:
        raise ValueError(
            "train_data rows (%d) must be a multiple of num_each_group (%d)"
            % (case_num, num_each_group))
    groups_num = case_num // num_each_group
    # One group id per row: [0,0,...,1,1,...,...].
    group_ids = np.arange(0, groups_num).repeat(num_each_group)
    X = np.concatenate([group_ids[:, None], train_data], axis=1)
    ranker = XGBRanker(n_estimators=150, learning_rate=0.1,
                       subsample=1.0, max_depth=6)
    # 'map@5-' is XGBoost's map@5 with the '-' no-positive-sample convention.
    ranker.fit(X, y, eval_metric=['ndcg', 'map@5-'])
    to_pkl(ranker, config.model_save_path)
    return ranker
def predict(test_data, ranker, num_each_group, predict_save):
    """Score grouped test rows, pick the best candidate per group, write output.

    Prepends a group-id column (mirroring ``train``), scores every row,
    selects the argmax candidate within each group, pickles the chosen
    indices and writes the corresponding prediction string per group to
    *predict_save*.

    Args:
        test_data: 2-D array of features, shape (n_rows, n_features).
        ranker: fitted XGBRanker.
        num_each_group: number of consecutive rows (candidates) per group.
        predict_save: path of the text file to write one prediction per line.

    Raises:
        ValueError: if the row count is not a multiple of *num_each_group*.
    """
    print("Start predicting...")
    case_num = test_data.shape[0]
    # Validate divisibility against the group size (raise, not assert).
    if case_num % num_each_group != 0:
        raise ValueError(
            "test_data rows (%d) must be a multiple of num_each_group (%d)"
            % (case_num, num_each_group))
    groups_num = case_num // num_each_group
    group_ids = np.arange(0, groups_num).repeat(num_each_group)
    X = np.concatenate([group_ids[:, None], test_data], axis=1)
    # Bug fix: the original reshaped to a hard-coded width of 3, silently
    # breaking for any other group size; use num_each_group.
    y_pred = ranker.predict(X).reshape(-1, num_each_group)
    # Index of the best-scoring candidate within each group.
    res = y_pred.argmax(axis=1).tolist()
    to_pkl(res, config.predict_index_save)
    # 'with' closes both files even if a line fails to parse
    # (the original leaked the codecs handle on error).
    with codecs.open(predict_save, 'w') as fw, \
            open(config.test_sample_file, 'r') as fr:
        for ix, line in zip(res, fr):
            data = json.loads(line.strip("\n"))
            preds = data["preds"]
            fw.write(preds[ix] + "\n")
def dumps_y(df):
    """Pickle the training labels taken from the ``score`` column of *df*."""
    labels = df["score"].values
    to_pkl(labels, "features/train/y_10.pkl")
def dump_df_feature(df, fields):
    """Pickle each listed column of *df* as its own raw feature file."""
    for field in fields:
        column = df[field].values
        to_pkl(column, "features/feature_%s.pkl" % (field))