def go(self):
    y_train = self.dfAll["relevance"].values[:TRAIN_SIZE]
    for obs_field in self.obs_fields:
        if obs_field not in self.dfAll.columns:
            self.logger.info("Skip %s" % obs_field)
            continue
        obs_corpus = self.dfAll[obs_field].values
        for target_field in self.target_fields:
            if target_field not in self.dfAll.columns:
                self.logger.info("Skip %s" % target_field)
                continue
            target_corpus = self.dfAll[target_field].values
            ext = self.generator(obs_corpus, target_corpus, *self.param_list)
            x = ext.transform()
            if isinstance(ext.__name__(), list):
                for i, feat_name in enumerate(ext.__name__()):
                    dim = 1
                    fname = "%s_%s_x_%s_%dD" % (feat_name, obs_field, target_field, dim)
                    pkl_utils._save(os.path.join(self.feat_dir, fname + config.FEAT_FILE_SUFFIX), x[:, i])
                    corr = np_utils._corr(x[:TRAIN_SIZE, i], y_train)
                    self.logger.info("%s (%dD): corr = %.6f" % (fname, dim, corr))
            else:
                dim = np_utils._dim(x)
                fname = "%s_%s_x_%s_%dD" % (ext.__name__(), obs_field, target_field, dim)
                pkl_utils._save(os.path.join(self.feat_dir, fname + config.FEAT_FILE_SUFFIX), x)
                if dim == 1:
                    corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
                    self.logger.info("%s (%dD): corr = %.6f" % (fname, dim, corr))
                elif self.force_corr:
                    for j in range(dim):
                        corr = np_utils._corr(x[:TRAIN_SIZE, j], y_train)
                        self.logger.info("%s (%d/%dD): corr = %.6f" % (fname, j + 1, dim, corr))
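# The wrapper above leans on two small np_utils helpers whose implementations are
# not shown in this excerpt. The following is a minimal sketch of what the call
# sites appear to assume (_dim returns the column count of a feature array, _corr
# a correlation that is safe for constant inputs); the names with a _sketch suffix
# and the exact behavior are assumptions inferred from usage, not the repo's code.
import numpy as np

def _dim_sketch(x):
    # 1 for a 1-D feature vector, otherwise the number of columns
    return 1 if x.ndim == 1 else x.shape[1]

def _corr_sketch(x, y_train):
    # Pearson correlation with a guard against zero-variance inputs
    if np.std(x) == 0 or np.std(y_train) == 0:
        return 0.0
    corr = np.corrcoef(x, y_train)[0, 1]
    return 0.0 if np.isnan(corr) else corr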
def go(self):
    y_train = self.dfAll["relevance"].values[:TRAIN_SIZE]
    for obs_field in self.obs_fields:
        if obs_field not in self.dfAll.columns:
            self.logger.info("Skip %s" % obs_field)
            continue
        obs_corpus = self.dfAll[obs_field].values
        ext = self.generator(obs_corpus, None, *self.param_list)
        x = ext.transform()
        if isinstance(ext.__name__(), list):
            for i, feat_name in enumerate(ext.__name__()):
                dim = 1
                fname = "%s_%s_%dD" % (feat_name, obs_field, dim)
                pkl_utils._save(os.path.join(self.feat_dir, fname + config.FEAT_FILE_SUFFIX), x[:, i])
                corr = np_utils._corr(x[:TRAIN_SIZE, i], y_train)
                self.logger.info("%s (%dD): corr = %.6f" % (fname, dim, corr))
        else:
            dim = np_utils._dim(x)
            fname = "%s_%s_%dD" % (ext.__name__(), obs_field, dim)
            pkl_utils._save(os.path.join(self.feat_dir, fname + config.FEAT_FILE_SUFFIX), x)
            if dim == 1:
                corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
                self.logger.info("%s (%dD): corr = %.6f" % (fname, dim, corr))
            elif self.force_corr:
                for j in range(dim):
                    corr = np_utils._corr(x[:TRAIN_SIZE, j], y_train)
                    self.logger.info("%s (%d/%dD): corr = %.6f" % (fname, j + 1, dim, corr))
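# Hedged sketch of the generator protocol the two go() methods above assume: the
# wrapper instantiates self.generator(obs_corpus, target_corpus, *param_list),
# calls transform() for a feature column (or matrix), and asks __name__() for the
# feature name(s). QueryLengthSketch is a made-up illustration, not a feature
# class from the repo.
import numpy as np

class QueryLengthSketch(object):
    def __init__(self, obs_corpus, target_corpus, *args):
        self.obs_corpus = obs_corpus

    def __name__(self):
        return "QueryLengthSketch"

    def transform(self):
        # one value per observation: number of whitespace-separated tokens
        return np.array([len(str(q).split()) for q in self.obs_corpus], dtype=float)

# It would plug into the standalone wrapper the same way IsInGoogleDict does in
# main() further below, e.g. (usage sketch only):
#   sf = StandaloneFeatureWrapper(QueryLengthSketch, dfAll, ["search_term"], [], config.FEAT_DIR, logger)
#   sf.go()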
def save_feature(self, feat_name, obs_field, target_field, dim, x, y):
    fname = "%s_%s_x_%s_%dD" % (feat_name, obs_field, target_field, dim)
    table_utils._write(os.path.join(self.feat_dir, fname + config.FEAT_FILE_SUFFIX), x)
    if dim == 1:
        corr = np_utils._corr(x, y)
        self.logger.info("%s (%dD): corr=%.6f" % (fname, dim, corr))
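# save_feature() fixes the on-disk naming convention for pairwise features. A
# self-contained sketch of that convention (the ".pkl" default mirrors the
# "_1D.pkl" files loaded elsewhere in this module; the real suffix comes from
# config.FEAT_FILE_SUFFIX):
def feature_file_name_sketch(feat_name, obs_field, target_field, dim, suffix=".pkl"):
    # e.g. "MatchQueryCount_search_term_x_product_title_1D.pkl"
    return "%s_%s_x_%s_%dD%s" % (feat_name, obs_field, target_field, dim, suffix)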
def main():
    logname = "generate_feature_query_quality_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    obs_corpus = []
    query_suffix = []
    # raw
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("raw")
    # after processing
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("lemmatized")
    # after extracting product_name in search_term
    obs_corpus.append(dfAll["search_term_product_name"].values)
    query_suffix.append("product_name")
    if "search_term_auto_corrected" in dfAll.columns:
        # after auto correction
        obs_corpus.append(dfAll["search_term_auto_corrected"].values)
        query_suffix.append("corrected")
    # after stemming
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("stemmed")
    y_train = dfAll["relevance"].values[:TRAIN_SIZE]
    for i in range(len(query_suffix) - 1):
        for j in range(i + 1, len(query_suffix)):
            ext = QueryQuality(obs_corpus[i], obs_corpus[j])
            x = ext.transform()
            dim = 1
            fname = "%s_%s_x_%s_%dD" % (ext._get_feat_name(), query_suffix[i], query_suffix[j], dim)
            pkl_utils._save(os.path.join(config.FEAT_DIR, fname + config.FEAT_FILE_SUFFIX), x)
            corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
            logger.info("%s (%dD): corr = %.6f" % (fname, dim, corr))
    # raw
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    obs_fields = ["search_term"]
    param_list = []
    sf = StandaloneFeatureWrapper(IsInGoogleDict, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
    sf.go()
def main():
    logname = "generate_feature_query_quality_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    obs_corpus = []
    query_suffix = []
    # raw
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("raw")
    # after processing
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("lemmatized")
    # after extracting product_name in search_term
    obs_corpus.append(dfAll["search_term_product_name"].values)
    query_suffix.append("product_name")
    if "search_term_auto_corrected" in dfAll.columns:
        # after auto correction
        obs_corpus.append(dfAll["search_term_auto_corrected"].values)
        query_suffix.append("corrected")
    # after stemming
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("stemmed")
    y_train = dfAll["relevance"].values[:TRAIN_SIZE]
    for i in range(len(query_suffix) - 1):
        for j in range(i + 1, len(query_suffix)):
            ext = QueryQuality(obs_corpus[i], obs_corpus[j])
            x = ext.transform()
            dim = np_utils._dim(x)
            fname = "%s_%s_x_%s_%dD" % (ext.__name__(), query_suffix[i], query_suffix[j], dim)
            pkl_utils._save(os.path.join(config.FEAT_DIR, fname + config.FEAT_FILE_SUFFIX), x)
            corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
            logger.info("%s (%dD): corr = %.6f" % (fname, dim, corr))
    # raw
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    obs_fields = ["search_term"]
    param_list = []
    sf = StandaloneFeatureWrapper(IsInGoogleDict, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
    sf.go()
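# The nested i/j loops in the two main() variants above enumerate every unordered
# pair of query variants (raw, lemmatized, product_name, optionally corrected,
# stemmed). A minimal sketch of the same pairing with itertools, assuming the same
# obs_corpus / query_suffix lists; the helper name is illustrative:
from itertools import combinations

def variant_pairs_sketch(obs_corpus, query_suffix):
    for (i, a), (j, b) in combinations(enumerate(query_suffix), 2):
        # each pair feeds one extractor: QueryQuality(obs_corpus[i], obs_corpus[j])
        yield i, j, a, b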
def main():
    logname = "generate_feature_group_distance_stat_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    y_train = dfAll["relevance"].values[:TRAIN_SIZE]

    group_id_names = ["DocId_search_term", "DocId_product_title", "DocIdEcho_product_uid"]

    match_list = [
        "MatchQueryCount",
        "MatchQueryRatio",
        "LongestMatchRatio",
    ]
    tfidf_list = [
        "StatCoocTF_Unigram_Mean",
        "StatCoocTF_Unigram_Max",
        "StatCoocTF_Unigram_Min",
        # "StatCoocNormTF_Unigram_Mean",
        # "StatCoocNormTF_Unigram_Max",
        # "StatCoocNormTF_Unigram_Min",
        "StatCoocTFIDF_Unigram_Mean",
        "StatCoocTFIDF_Unigram_Max",
        "StatCoocTFIDF_Unigram_Min",
        "StatCoocBM25_Unigram_Mean",
        "StatCoocBM25_Unigram_Max",
        "StatCoocBM25_Unigram_Min",
        # "StatCoocTF_Bigram_Mean",
        # "StatCoocTF_Bigram_Max",
        # "StatCoocTF_Bigram_Min",
        # "StatCoocNormTF_Bigram_Mean",
        # "StatCoocNormTF_Bigram_Max",
        # "StatCoocNormTF_Bigram_Min",
        # "StatCoocTFIDF_Bigram_Mean",
        # "StatCoocTFIDF_Bigram_Max",
        # "StatCoocTFIDF_Bigram_Min",
        # "StatCoocBM25_Bigram_Mean",
        # "StatCoocBM25_Bigram_Max",
        # "StatCoocBM25_Bigram_Min",
        # "StatCoocTF_Trigram_Mean",
        # "StatCoocTF_Trigram_Max",
        # "StatCoocTF_Trigram_Min",
        # "StatCoocNormTF_Trigram_Mean",
        # "StatCoocNormTF_Trigram_Max",
        # "StatCoocNormTF_Trigram_Min",
        # "StatCoocTFIDF_Trigram_Mean",
        # "StatCoocTFIDF_Trigram_Max",
        # "StatCoocTFIDF_Trigram_Min",
        # "StatCoocBM25_Trigram_Mean",
        # "StatCoocBM25_Trigram_Max",
        # "StatCoocBM25_Trigram_Min",
    ]
    intersect_ngram_count_list = [
        "IntersectCount_Unigram",
        "IntersectRatio_Unigram",
        # "IntersectCount_Bigram",
        # "IntersectRatio_Bigram",
        # "IntersectCount_Trigram",
        # "IntersectRatio_Trigram",
    ]
    first_last_ngram_list = [
        "FirstIntersectCount_Unigram",
        "FirstIntersectRatio_Unigram",
        "LastIntersectCount_Unigram",
        "LastIntersectRatio_Unigram",
        # "FirstIntersectCount_Bigram",
        # "FirstIntersectRatio_Bigram",
        # "LastIntersectCount_Bigram",
        # "LastIntersectRatio_Bigram",
        # "FirstIntersectCount_Trigram",
        # "FirstIntersectRatio_Trigram",
        # "LastIntersectCount_Trigram",
        # "LastIntersectRatio_Trigram",
    ]
    cooccurrence_ngram_count_list = [
        "CooccurrenceCount_Unigram",
        "CooccurrenceRatio_Unigram",
        # "CooccurrenceCount_Bigram",
        # "CooccurrenceRatio_Bigram",
        # "CooccurrenceCount_Trigram",
        # "CooccurrenceRatio_Trigram",
    ]
    ngram_jaccard_list = [
        "JaccardCoef_Unigram",
        # "JaccardCoef_Bigram",
        # "JaccardCoef_Trigram",
        "DiceDistance_Unigram",
        # "DiceDistance_Bigram",
        # "DiceDistance_Trigram",
    ]
    char_dist_sim_list = [
        "CharDistribution_CosineSim",
        "CharDistribution_KL",
    ]
    tfidf_word_ngram_cosinesim_list = [
        "TFIDF_Word_Unigram_CosineSim",
        # "TFIDF_Word_Bigram_CosineSim",
        # "TFIDF_Word_Trigram_CosineSim",
    ]
    tfidf_char_ngram_cosinesim_list = [
        # "TFIDF_Char_Bigram_CosineSim",
        # "TFIDF_Char_Trigram_CosineSim",
        "TFIDF_Char_Fourgram_CosineSim",
        # "TFIDF_Char_Fivegram_CosineSim",
    ]
    lsa_word_ngram_cosinesim_list = [
        "LSA100_Word_Unigram_CosineSim",
        # "LSA100_Word_Bigram_CosineSim",
        # "LSA100_Word_Trigram_CosineSim",
    ]
    lsa_char_ngram_cosinesim_list = [
        # "LSA100_Char_Bigram_CosineSim",
        # "LSA100_Char_Trigram_CosineSim",
        "LSA100_Char_Fourgram_CosineSim",
        # "LSA100_Char_Fivegram_CosineSim",
    ]
    doc2vec_list = [
        "Doc2Vec_Homedepot_D100_CosineSim",
    ]
    word2vec_list = [
        "Word2Vec_N_Similarity",
        "Word2Vec_Homedepot_D100_CosineSim_Mean_Mean",
        "Word2Vec_Homedepot_D100_CosineSim_Max_Mean",
        "Word2Vec_Homedepot_D100_CosineSim_Min_Mean",
    ]
    distance_generator_list = \
        match_list + \
        tfidf_list + \
        intersect_ngram_count_list + \
        first_last_ngram_list + \
        cooccurrence_ngram_count_list + \
        ngram_jaccard_list + \
        tfidf_word_ngram_cosinesim_list + \
        tfidf_char_ngram_cosinesim_list + \
        lsa_word_ngram_cosinesim_list + \
        lsa_char_ngram_cosinesim_list + \
        char_dist_sim_list + \
        word2vec_list + \
        doc2vec_list

    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append(["search_term"])
    target_fields_list.append(["product_title", "product_title_product_name"])
    aggregation_mode = ["mean", "max", "min"]
    for group_id_name in group_id_names:
        group_id_list = pkl_utils._load(os.path.join(config.FEAT_DIR, group_id_name + "_1D.pkl"))
        for distance_generator in distance_generator_list:
            for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
                for obs_field in obs_fields:
                    for target_field in target_fields:
                        dist_name = "%s_%s_x_%s" % (distance_generator, obs_field, target_field)
                        try:
                            dist_list = pkl_utils._load(os.path.join(config.FEAT_DIR, dist_name + "_1D.pkl"))
                            ext = GroupDistanceStat(dist_list, group_id_list, dist_name, group_id_name, aggregation_mode)
                            x = ext.transform()
                            if isinstance(ext.__name__(), list):
                                for i, feat_name in enumerate(ext.__name__()):
                                    dim = 1
                                    fname = "%s_%dD" % (feat_name, dim)
                                    pkl_utils._save(os.path.join(config.FEAT_DIR, fname + config.FEAT_FILE_SUFFIX), x[:, i])
                                    corr = np_utils._corr(x[:TRAIN_SIZE, i], y_train)
                                    logger.info("%s (%dD): corr = %.6f" % (fname, dim, corr))
                        except:
                            logger.info("Skip %s" % dist_name)
                            pass
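# Hedged sketch of the aggregation GroupDistanceStat appears to perform in the
# main() above: for each group id (e.g. all rows sharing the same search_term
# DocId), aggregate a precomputed 1-D distance feature with mean/max/min and
# broadcast each statistic back to the rows of its group. This is an
# illustration with pandas groupby, not the repo's implementation.
import numpy as np
import pandas as pd

def group_distance_stat_sketch(dist_list, group_id_list, aggregation_mode=("mean", "max", "min")):
    s = pd.Series(np.asarray(dist_list, dtype=float))
    g = s.groupby(np.asarray(group_id_list))
    # one output column per aggregation mode, row-aligned with the input
    return np.column_stack([g.transform(mode).values for mode in aggregation_mode])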
def combine(self):
    dfAll = pkl_utils._load(config.INFO_DATA)
    dfAll_raw = dfAll.copy()
    y_train = dfAll["relevance"].values[:TRAIN_SIZE]

    ## for basic features
    feat_cnt = 0
    self.logger.info("Run for basic...")
    for file_name in sorted(os.listdir(config.FEAT_DIR)):
        if self.feature_suffix in file_name:
            fname = file_name.split(".")[0]
            if fname not in self.feature_dict:
                continue
            x = self.load_feature(config.FEAT_DIR, fname)
            x = np.nan_to_num(x)
            if np.isnan(x).any():
                self.logger.info("%s nan" % fname)
                continue
            # apply feature transform
            mandatory = self.feature_dict[fname][0]
            transformer = self.feature_dict[fname][1]
            x = transformer.fit_transform(x)
            dim = np_utils._dim(x)
            if dim == 1:
                corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
                if not mandatory and abs(corr) < self.corr_threshold:
                    self.logger.info("Drop: {} ({}D) (abs corr = {}, < threshold = {})".format(
                        fname, dim, abs(corr), self.corr_threshold))
                    continue
                dfAll[fname] = x
                self.feature_names.append(fname)
            else:
                columns = ["%s_%d" % (fname, i) for i in range(dim)]
                df = pd.DataFrame(x, columns=columns)
                dfAll = pd.concat([dfAll, df], axis=1)
                self.feature_names.extend(columns)
            feat_cnt += 1
            self.feature_names_basic.append(fname)
            if dim == 1:
                self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D) (corr = {})".format(
                    feat_cnt, len(self.feature_dict.keys()), fname, dim, corr))
            else:
                self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format(
                    feat_cnt, len(self.feature_dict.keys()), fname, dim))
    dfAll.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)

    ## basic
    dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()
    self.y_train = dfTrain["relevance"].values.astype(float)
    dfTrain.drop(["id", "relevance"], axis=1, inplace=True)
    self.X_train = dfTrain.values.astype(float)

    dfTest = dfAll.iloc[TRAIN_SIZE:].copy()
    self.id_test = dfTest["id"].values.astype(int)
    dfTest.drop(["id", "relevance"], axis=1, inplace=True)
    self.X_test = dfTest.values.astype(float)

    ## all
    first = True
    feat_cv_cnt = 0
    dfAll_cv_all = dfAll_raw.copy()
    feature_dir = "%s/All" % (config.FEAT_DIR)
    for file_name in sorted(os.listdir(feature_dir)):
        if self.feature_suffix in file_name:
            fname = file_name.split(".")[0]
            if fname not in self.feature_dict:
                continue
            if first:
                self.logger.info("Run for all...")
                first = False
            x = self.load_feature(feature_dir, fname)
            x = np.nan_to_num(x)
            if np.isnan(x).any():
                self.logger.info("%s nan" % fname)
                continue
            # apply feature transform
            mandatory = self.feature_dict[fname][0]
            transformer = self.feature_dict[fname][1]
            x = transformer.fit_transform(x)
            dim = np_utils._dim(x)
            if dim == 1:
                corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
                if not mandatory and abs(corr) < self.corr_threshold:
                    self.logger.info("Drop: {} ({}D) (abs corr = {}, < threshold = {})".format(
                        fname, dim, abs(corr), self.corr_threshold))
                    continue
                dfAll_cv_all[fname] = x
                self.feature_names.append(fname)
            else:
                columns = ["%s_%d" % (fname, i) for i in range(dim)]
                df = pd.DataFrame(x, columns=columns)
                dfAll_cv_all = pd.concat([dfAll_cv_all, df], axis=1)
                self.feature_names.extend(columns)
            feat_cv_cnt += 1
            self.feature_names_cv.append(fname)
            if dim == 1:
                self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D) (corr = {})".format(
                    feat_cnt + feat_cv_cnt, len(self.feature_dict.keys()), fname, dim, corr))
            else:
                self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format(
                    feat_cnt + feat_cv_cnt, len(self.feature_dict.keys()), fname, dim))
    if feat_cv_cnt > 0:
        dfAll_cv_all.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
        X_tmp = dfAll_cv_all.drop(["id", "relevance"], axis=1).values.astype(float)
        self.X_train_cv_all = X_tmp[:TRAIN_SIZE]
        self.X_test = np.hstack((self.X_test, X_tmp[TRAIN_SIZE:]))
    else:
        self.X_train_cv_all = None
    feat_cnt += feat_cv_cnt

    ## for cv features
    first = True
    for run in range(1, self.n_iter + 1):
        feat_cv_cnt = 0
        dfAll_cv = dfAll_raw.copy()
        feature_dir = "%s/Run%d" % (config.FEAT_DIR, run)
        for file_name in sorted(os.listdir(feature_dir)):
            if self.feature_suffix in file_name:
                fname = file_name.split(".")[0]
                if (fname not in self.feature_dict) or (fname not in self.feature_names_cv):
                    continue
                if first:
                    self.logger.info("Run for cv...")
                    first = False
                if feat_cv_cnt == 0:
                    self.logger.info("Run %d" % run)
                x = self.load_feature(feature_dir, fname)
                x = np.nan_to_num(x)
                if np.isnan(x).any():
                    self.logger.info("%s nan" % fname)
                    continue
                # apply feature transform
                mandatory = self.feature_dict[fname][0]
                transformer = self.feature_dict[fname][1]
                x = transformer.fit_transform(x)
                dim = np_utils._dim(x)
                if dim == 1:
                    dfAll_cv[fname] = x
                else:
                    columns = ["%s_%d" % (fname, i) for i in range(dim)]
                    df = pd.DataFrame(x, columns=columns)
                    dfAll_cv = pd.concat([dfAll_cv, df], axis=1)
                feat_cv_cnt += 1
                self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format(
                    feat_cnt + feat_cv_cnt, len(self.feature_dict.keys()), fname, dim))
        if feat_cv_cnt > 0:
            dfAll_cv.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
            dfTrain_cv = dfAll_cv.iloc[:TRAIN_SIZE].copy()
            X_tmp = dfTrain_cv.drop(["id", "relevance"], axis=1).values.astype(float)
            if run == 1:
                self.X_train_cv = np.zeros((X_tmp.shape[0], X_tmp.shape[1], self.n_iter), dtype=float)
            self.X_train_cv[:, :, run - 1] = X_tmp
    if feat_cv_cnt == 0:
        self.X_train_cv = None
        self.basic_only = 1

    # report final results
    if self.basic_only:
        self.logger.info("Overall Shape: %d x %d" % (len(self.y_train), self.X_train.shape[1]))
    else:
        self.logger.info("Overall Shape: %d x %d" % (
            len(self.y_train), self.X_train.shape[1] + self.X_train_cv_all.shape[1]))
    self.logger.info("Done combining.")
    return self
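# Hedged sketch of the pruning rule combine() applies to 1-D features: a
# non-mandatory feature is kept only if the absolute correlation between its
# training slice and the relevance labels reaches corr_threshold. The helper name
# is illustrative and uses plain numpy in place of np_utils._corr.
import numpy as np

def keep_feature_sketch(x_train, y_train, mandatory, corr_threshold):
    corr = 0.0
    if np.std(x_train) > 0 and np.std(y_train) > 0:
        corr = np.corrcoef(x_train, y_train)[0, 1]
    return mandatory or (not np.isnan(corr) and abs(corr) >= corr_threshold)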
def combine(self):
    dfAll = table_utils._read(config.INFO_DATA)
    dfAll_raw = dfAll.copy()
    y = dfAll['relevance'].values

    feat_cnt = 0
    self.logger.info('Run for basic...')
    for file_name in sorted(os.listdir(config.FEAT_DIR)):
        if not config.FEAT_FILE_SUFFIX in file_name:
            continue
        fname = os.path.splitext(file_name)[0]
        if fname not in self.feature_dict:
            continue
        x = self.load_feature(config.FEAT_DIR, fname)
        x = np.nan_to_num(x)  # Still necessary?
        if np.isnan(x).any():
            self.logger.info("%s nan" % (fname))
            continue
        # Apply feature transformers (?)
        mandatory = self.feature_dict[fname][0]
        transformer = self.feature_dict[fname][1]
        x = transformer.fit_transform(x)
        dim = np_utils._dim(x)
        if dim == 1:
            corr = np_utils._corr(x, y)
            if not mandatory and (np.isnan(corr) or abs(corr) < self.corr_threshold):
                self.logger.info("Drop: {} ({}D) (abs_corr = {}, < threshold {})".format(
                    fname, dim, abs(corr), self.corr_threshold))
                continue
            dfAll[fname] = x
            self.feature_names.append(fname)
        else:
            columns = ["%s_%d" % (fname, i) for i in range(dim)]
            df = pd.DataFrame(x, columns=columns)
            dfAll = pd.concat([dfAll, df], axis=1)
            self.feature_names.extend(columns)
        feat_cnt += 1
        self.feature_names_basic.append(fname)
        if dim == 1:
            self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D) (corr = {})".format(
                feat_cnt, len(self.feature_dict.keys()), fname, dim, corr))
        else:
            self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format(
                feat_cnt, len(self.feature_dict.keys()), fname, dim))

    dfAll.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
    self.y = dfAll["relevance"].values.astype(float)
    self.weights = dfAll['weight'].values
    self.query_ids = dfAll['norm_query_id'].values
    dfAll.drop(["relevance", "weight", "norm_query_id"], axis=1, inplace=True)
    self.X = dfAll.values.astype(float)
    self.logger.info("Overall Shape: %d x %d" % (len(self.y), self.X.shape[1]))
    self.logger.info("Done combining")
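# This combine() variant exposes self.X, self.y, self.weights, and
# self.query_ids rather than train/test splits. Hedged sketch of one way a
# downstream ranking model could consume query_ids as per-query group sizes;
# whether the real pipeline does this is an assumption, and the helper only
# shows the grouping arithmetic (rows assumed sorted by query id).
import numpy as np

def query_group_sizes_sketch(query_ids):
    # counts of consecutive rows per query, in order of first appearance
    _, first_idx, counts = np.unique(query_ids, return_index=True, return_counts=True)
    return counts[np.argsort(first_idx)]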