list(df_test['search_term_stem'].values) + list(df_train['description_stem'].values) + list(df_test['description_stem'].values) ) ####################### ## Generate Features ## ####################### print("==================================================") print("Generate distance features...") extract_cosine_feat(df_train) extract_cosine_feat(df_test) feat_names = ['id', 'cosine_st_orig', 'cosine_st_t_stem', 'cosine_st_d_stem', 'set_st_t_stem'] if not os.path.exists(config.path_cosine_features): os.makedirs(config.path_cosine_features) with open(config.file_cosine_feat_train, "wb") as f: pickle.dump(df_train[feat_names], f, -1) with open(config.file_cosine_feat_test, "wb") as f: pickle.dump(df_test[feat_names], f, -1) # save feat names print("Feature names are stored in %s" % config.file_cosine_feat_name) # dump feat name feature_utils.dump_feat_name(feat_names, config.file_cosine_feat_name) print("All Done.")
# Replace missing values with empty strings in both frames before the
# word-count helper runs.
df_train = df_train.fillna('')
df_test = df_test.fillna('')

# The return value is not used; presumably the helper adds "count_of_*" /
# "ratio_of_*" columns to the frame in place -- TODO confirm.
generate_intersect_word_count(df_train)
generate_intersect_word_count(df_test)

# Keep 'id' plus every train column whose name contains one of the two
# markers. The same name list is applied to the test frame.
feat_names = ["id"] + [
    name
    for name in df_train.columns
    if "count_of_" in name or "ratio_of_" in name
]

X_train = df_train[feat_names]
print(X_train.shape)
X_test = df_test[feat_names]
print(X_test.shape)

# Make sure the output directory exists before writing into it.
brand_dir = config.path_brand_counting_features
if not os.path.exists(brand_dir):
    os.makedirs(brand_dir)

# A negative protocol (-1) and pickle.HIGHEST_PROTOCOL select the same
# protocol; the named constant just states the intent.
with open(config.file_brand_count_feat_train, "wb") as train_out:
    pickle.dump(X_train, train_out, pickle.HIGHEST_PROTOCOL)
with open(config.file_brand_count_feat_test, "wb") as test_out:
    pickle.dump(X_test, test_out, pickle.HIGHEST_PROTOCOL)

# Persist the feature-name list next to the feature files.
print("Feature names are stored in %s" % config.file_brand_feat_name)
feature_utils.dump_feat_name(feat_names, config.file_brand_feat_name)

print("All Done.")
print("==================================================")
print("Generate distance features...")

# The return value is not used; presumably the helper adds "jaccard_coef*" /
# "dice_dist*" columns to the frame in place -- TODO confirm.
extract_basic_distance_feat(df_train)
extract_basic_distance_feat(df_test)

# Keep 'id' plus every train column whose name contains one of the two
# markers. The same name list is applied to the test frame.
feat_names = ["id"] + [
    name
    for name in df_train.columns
    if "jaccard_coef" in name or "dice_dist" in name
]

X_train = df_train[feat_names]
print(X_train.shape)
X_test = df_test[feat_names]
print(X_test.shape)

# Make sure the output directory exists before writing into it.
distance_dir = config.path_distance_features
if not os.path.exists(distance_dir):
    os.makedirs(distance_dir)

# A negative protocol (-1) and pickle.HIGHEST_PROTOCOL select the same
# protocol; the named constant just states the intent.
with open(config.file_distance_feat_train, "wb") as train_out:
    pickle.dump(X_train, train_out, pickle.HIGHEST_PROTOCOL)
with open(config.file_distance_feat_test, "wb") as test_out:
    pickle.dump(X_test, test_out, pickle.HIGHEST_PROTOCOL)

# Persist the feature-name list next to the feature files.
print("Feature names are stored in %s" % config.file_distance_feat_name)
feature_utils.dump_feat_name(feat_names, config.file_distance_feat_name)

print("All Done.")
# TF-IDF -> SVD feature tail.
#
# What the statements below do, in order:
#   * `svd` is fitted on X_tfidf_train (fit_transform) and the same fitted
#     object is then applied to X_tfidf_test (transform), so the test
#     projection reuses the train-fitted components.
#   * When i == 0 the projected block becomes nd_train / nd_test; otherwise
#     it is appended column-wise with np.hstack.
#   * nd_train / nd_test are pickled (protocol -1) to "train.tfidf.feat.pkl"
#     and "test.tfidf.feat.pkl" under config.path_tfidf_features.
#   * One feature name per entry of column_names, formatted as
#     "<name>_tfidf_individual_svd<svd_n_components>", is written with
#     feature_utils.dump_feat_name to config.file_tfidf_feat_name.
#
# NOTE(review): this fragment starts inside a loop whose header is not
#   visible here; `i` is presumably the index of the current entry of
#   column_names -- TODO confirm, along with where the loop body ends.
# NOTE(review): the last shape message is labelled "df_train shape" but
#   formats nd_test.shape -- the label looks like a copy/paste slip.
# NOTE(review): the output directory is not created in this fragment;
#   presumably config.path_tfidf_features already exists -- verify.
# NOTE(review): svd_feat_names holds one name per column group, while each
#   group contributes X_svd_train.shape[1] columns to the stacked matrix --
#   confirm that consumers expect per-group rather than per-column names.
X_svd_train = svd.fit_transform(X_tfidf_train) print(type(X_svd_train)) print("X_svd_train shape: {0}".format(X_svd_train.shape)) if i == 0: nd_train = X_svd_train else: nd_train = np.hstack([nd_train, X_svd_train]) print("nd_train shape: {0}".format(nd_train.shape)) X_svd_test = svd.transform(X_tfidf_test) print("X_svd_test shape: {0}".format(X_svd_test.shape)) if i == 0: nd_test = X_svd_test else: nd_test = np.hstack([nd_test, X_svd_test]) print("df_train shape: {0}".format(nd_test.shape)) print("Done.") with open("%s/train.tfidf.feat.pkl" % (config.path_tfidf_features), "wb") as f: pickle.dump(nd_train, f, -1) with open("%s/test.tfidf.feat.pkl" % (config.path_tfidf_features), "wb") as f: pickle.dump(nd_test, f, -1) # save feat names print("Feature names are stored in %s" % config.file_tfidf_feat_name) svd_feat_names = ["%s_tfidf_individual_svd%d" % (name, svd_n_components) for name in column_names] feature_utils.dump_feat_name(svd_feat_names, config.file_tfidf_feat_name) print("All Done.")