Example No. 1
    def go(self):
        # pairwise features: one extractor per (obs_field, target_field) pair
        y_train = self.dfAll["relevance"].values[:TRAIN_SIZE]
        for obs_field in self.obs_fields:
            if obs_field not in self.dfAll.columns:
                self.logger.info("Skip %s"%obs_field)
                continue
            obs_corpus = self.dfAll[obs_field].values
            for target_field in self.target_fields:
                if target_field not in self.dfAll.columns:
                    self.logger.info("Skip %s"%target_field)
                    continue
                target_corpus = self.dfAll[target_field].values
                ext = self.generator(obs_corpus, target_corpus, *self.param_list)
                x = ext.transform()
                if isinstance(ext.__name__(), list):
                    # the extractor yields several 1D features at once
                    for i,feat_name in enumerate(ext.__name__()):
                        dim = 1
                        fname = "%s_%s_x_%s_%dD"%(feat_name, obs_field, target_field, dim)
                        pkl_utils._save(os.path.join(self.feat_dir, fname+config.FEAT_FILE_SUFFIX), x[:,i])
                        corr = np_utils._corr(x[:TRAIN_SIZE,i], y_train)
                        self.logger.info("%s (%dD): corr = %.6f"%(fname, dim, corr))
                else:
                    dim = np_utils._dim(x)
                    fname = "%s_%s_x_%s_%dD"%(ext.__name__(), obs_field, target_field, dim)
                    pkl_utils._save(os.path.join(self.feat_dir, fname+config.FEAT_FILE_SUFFIX), x)
                    if dim == 1:
                        corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
                        self.logger.info("%s (%dD): corr = %.6f"%(fname, dim, corr))
                    elif self.force_corr:
                        # per-column correlation is optional for multi-dim features
                        for j in range(dim):
                            corr = np_utils._corr(x[:TRAIN_SIZE,j], y_train)
                            self.logger.info("%s (%d/%dD): corr = %.6f"%(fname, j+1, dim, corr))
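The wrapper above leans on two helpers that are not part of this listing. A minimal sketch of what they might look like (assuming np_utils is a thin NumPy wrapper; the actual implementations may differ):

import numpy as np

def _dim(x):
    # 1 for a flat feature vector, otherwise the number of columns
    return 1 if x.ndim == 1 else x.shape[1]

def _corr(x, y_train):
    # Pearson correlation between a 1D feature and the target;
    # constant inputs make corrcoef undefined, so guard for that
    if np.std(x) == 0 or np.std(y_train) == 0:
        return 0.0
    return float(np.corrcoef(x, y_train)[0, 1])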
Example No. 2
    def go(self):
        # standalone features: only an observation corpus, no target corpus
        y_train = self.dfAll["relevance"].values[:TRAIN_SIZE]
        for obs_field in self.obs_fields:
            if obs_field not in self.dfAll.columns:
                self.logger.info("Skip %s" % obs_field)
                continue
            obs_corpus = self.dfAll[obs_field].values
            ext = self.generator(obs_corpus, None, *self.param_list)
            x = ext.transform()
            if isinstance(ext.__name__(), list):
                for i, feat_name in enumerate(ext.__name__()):
                    dim = 1
                    fname = "%s_%s_%dD" % (feat_name, obs_field, dim)
                    pkl_utils._save(
                        os.path.join(self.feat_dir,
                                     fname + config.FEAT_FILE_SUFFIX), x[:, i])
                    corr = np_utils._corr(x[:TRAIN_SIZE, i], y_train)
                    self.logger.info("%s (%dD): corr = %.6f" %
                                     (fname, dim, corr))
            else:
                dim = np_utils._dim(x)
                fname = "%s_%s_%dD" % (ext.__name__(), obs_field, dim)
                pkl_utils._save(
                    os.path.join(self.feat_dir,
                                 fname + config.FEAT_FILE_SUFFIX), x)
                if dim == 1:
                    corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
                    self.logger.info("%s (%dD): corr = %.6f" %
                                     (fname, dim, corr))
                elif self.force_corr:
                    for j in range(dim):
                        corr = np_utils._corr(x[:TRAIN_SIZE, j], y_train)
                        self.logger.info("%s (%d/%dD): corr = %.6f" %
                                         (fname, j + 1, dim, corr))
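Both go() variants assume a small extractor protocol: the generator is a class whose instances expose transform(), returning a NumPy array, and a __name__() method returning either a single feature name or a list of names (one per column). A hypothetical extractor that fits this protocol (DocLen is illustrative, not taken from the listing):

import numpy as np

class DocLen:
    """Hypothetical standalone feature: token count of each observation."""
    def __init__(self, obs_corpus, target_corpus, *args):
        self.obs_corpus = obs_corpus  # target_corpus is unused (standalone)
    def __name__(self):
        return "DocLen"
    def transform(self):
        return np.asarray([len(str(obs).split()) for obs in self.obs_corpus],
                          dtype=float)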
Example No. 3
    def save_feature(self, feat_name, obs_field, target_field, dim, x, y):
        fname = "%s_%s_x_%s_%dD" % (feat_name, obs_field, target_field, dim)
        table_utils._write(
            os.path.join(self.feat_dir, fname + config.FEAT_FILE_SUFFIX), x)
        if dim == 1:
            corr = np_utils._corr(x, y)
            self.logger.info("%s (%dD): corr=%.6f" % (fname, dim, corr))
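For concreteness, a call like the following (arguments are hypothetical) would write a file named StatCoocTFIDF_Unigram_Mean_search_term_x_product_title_1D plus the configured suffix, and log its correlation with the labels:

# x is a 1D NumPy array aligned with the dataframe rows, y the relevance labels
self.save_feature("StatCoocTFIDF_Unigram_Mean", "search_term",
                  "product_title", 1, x, y)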
Example No. 4
def main():
    logname = "generate_feature_query_quality_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    obs_corpus = []
    query_suffix = []
    # raw
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("raw")
    # after lemmatization
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("lemmatized")
    # after extracting product_name in search_term
    obs_corpus.append(dfAll["search_term_product_name"].values)
    query_suffix.append("product_name")
    if "search_term_auto_corrected" in dfAll.columns:
        # after auto correction
        obs_corpus.append(dfAll["search_term_auto_corrected"].values)
        query_suffix.append("corrected")  
    # after stemming
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("stemmed")

    y_train = dfAll["relevance"].values[:TRAIN_SIZE]
    for i in range(len(query_suffix)-1):
        for j in range(i+1, len(query_suffix)):
            ext = QueryQuality(obs_corpus[i], obs_corpus[j])
            x = ext.transform()
            dim = 1
            fname = "%s_%s_x_%s_%dD"%(ext._get_feat_name(), query_suffix[i], query_suffix[j], dim)
            pkl_utils._save(os.path.join(config.FEAT_DIR, fname+config.FEAT_FILE_SUFFIX), x)
            corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
            logger.info("%s (%dD): corr = %.6f"%(fname, dim, corr))

    # raw
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    obs_fields = ["search_term"]
    param_list = []
    sf = StandaloneFeatureWrapper(IsInGoogleDict, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
    sf.go()
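The nested loop over i and j enumerates every unordered pair of query variants, so with all five suffixes present (raw, lemmatized, product_name, corrected, stemmed) it produces ten QueryQuality features. An equivalent formulation, shown only for clarity:

from itertools import combinations

for i, j in combinations(range(len(query_suffix)), 2):
    ext = QueryQuality(obs_corpus[i], obs_corpus[j])
    x = ext.transform()
    # ... save and log exactly as in the loop above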
Example No. 5
def main():
    logname = "generate_feature_query_quality_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    obs_corpus = []
    query_suffix = []
    # raw
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("raw")
    # after lemmatization
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("lemmatized")
    # after extracting product_name in search_term
    obs_corpus.append(dfAll["search_term_product_name"].values)
    query_suffix.append("product_name")
    if "search_term_auto_corrected" in dfAll.columns:
        # after auto correction
        obs_corpus.append(dfAll["search_term_auto_corrected"].values)
        query_suffix.append("corrected")  
    # after stemming
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("stemmed")

    y_train = dfAll["relevance"].values[:TRAIN_SIZE]
    for i in range(len(query_suffix)-1):
        for j in range(i+1, len(query_suffix)):
            ext = QueryQuality(obs_corpus[i], obs_corpus[j])
            x = ext.transform()
            dim = np_utils._dim(x)
            fname = "%s_%s_x_%s_%dD"%(ext.__name__(), query_suffix[i], query_suffix[j], dim)
            pkl_utils._save(os.path.join(config.FEAT_DIR, fname+config.FEAT_FILE_SUFFIX), x)
            corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
            logger.info("%s (%dD): corr = %.6f"%(fname, dim, corr))

    # raw
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    obs_fields = ["search_term"]
    param_list = []
    sf = StandaloneFeatureWrapper(IsInGoogleDict, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
    sf.go()
Example No. 6
def main():
    logname = "generate_feature_group_distance_stat_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    y_train = dfAll["relevance"].values[:TRAIN_SIZE]

    group_id_names = ["DocId_search_term", "DocId_product_title", "DocIdEcho_product_uid"]

    match_list = [
    "MatchQueryCount",
    "MatchQueryRatio",
    "LongestMatchRatio",
    ]

    tfidf_list = [
    "StatCoocTF_Unigram_Mean", 
    "StatCoocTF_Unigram_Max",
    "StatCoocTF_Unigram_Min",
    # "StatCoocNormTF_Unigram_Mean", 
    # "StatCoocNormTF_Unigram_Max",
    # "StatCoocNormTF_Unigram_Min", 
    "StatCoocTFIDF_Unigram_Mean",
    "StatCoocTFIDF_Unigram_Max",
    "StatCoocTFIDF_Unigram_Min",
    "StatCoocBM25_Unigram_Mean",
    "StatCoocBM25_Unigram_Max",
    "StatCoocBM25_Unigram_Min",
    # "StatCoocTF_Bigram_Mean", 
    # "StatCoocTF_Bigram_Max",
    # "StatCoocTF_Bigram_Min",
    # "StatCoocNormTF_Bigram_Mean", 
    # "StatCoocNormTF_Bigram_Max",
    # "StatCoocNormTF_Bigram_Min",
    # "StatCoocTFIDF_Bigram_Mean",
    # "StatCoocTFIDF_Bigram_Max",
    # "StatCoocTFIDF_Bigram_Min",
    # "StatCoocBM25_Bigram_Mean",
    # "StatCoocBM25_Bigram_Max",
    # "StatCoocBM25_Bigram_Min",
    # "StatCoocTF_Trigram_Mean", 
    # "StatCoocTF_Trigram_Max",
    # "StatCoocTF_Trigram_Min",
    # "StatCoocNormTF_Trigram_Mean", 
    # "StatCoocNormTF_Trigram_Max",
    # "StatCoocNormTF_Trigram_Min", 
    # "StatCoocTFIDF_Trigram_Mean",
    # "StatCoocTFIDF_Trigram_Max",
    # "StatCoocTFIDF_Trigram_Min",
    # "StatCoocBM25_Trigram_Mean",
    # "StatCoocBM25_Trigram_Max",
    # "StatCoocBM25_Trigram_Min",
    ]
    intersect_ngram_count_list = [    
    "IntersectCount_Unigram", 
    "IntersectRatio_Unigram", 
    # "IntersectCount_Bigram", 
    # "IntersectRatio_Bigram", 
    # "IntersectCount_Trigram", 
    # "IntersectRatio_Trigram", 
    ]
    first_last_ngram_list = [
    "FirstIntersectCount_Unigram", 
    "FirstIntersectRatio_Unigram", 
    "LastIntersectCount_Unigram", 
    "LastIntersectRatio_Unigram",
    # "FirstIntersectCount_Bigram", 
    # "FirstIntersectRatio_Bigram", 
    # "LastIntersectCount_Bigram", 
    # "LastIntersectRatio_Bigram",
    # "FirstIntersectCount_Trigram", 
    # "FirstIntersectRatio_Trigram", 
    # "LastIntersectCount_Trigram", 
    # "LastIntersectRatio_Trigram",
    ]

    cooccurrence_ngram_count_list = [
    "CooccurrenceCount_Unigram", 
    "CooccurrenceRatio_Unigram", 
    # "CooccurrenceCount_Bigram", 
    # "CooccurrenceRatio_Bigram",
    # "CooccurrenceCount_Trigram", 
    # "CooccurrenceRatio_Trigram",
    ]

    ngram_jaccard_list = [
    "JaccardCoef_Unigram", 
    # "JaccardCoef_Bigram", 
    # "JaccardCoef_Trigram", 
    "DiceDistance_Unigram", 
    # "DiceDistance_Bigram", 
    # "DiceDistance_Trigram", 
    ]

    char_dist_sim_list = [
    "CharDistribution_CosineSim",
    "CharDistribution_KL",
    ]

    tfidf_word_ngram_cosinesim_list = [
    "TFIDF_Word_Unigram_CosineSim",
    # "TFIDF_Word_Bigram_CosineSim",
    # "TFIDF_Word_Trigram_CosineSim",
    ]
    tfidf_char_ngram_cosinesim_list = [
    # "TFIDF_Char_Bigram_CosineSim",
    # "TFIDF_Char_Trigram_CosineSim",
    "TFIDF_Char_Fourgram_CosineSim",
    # "TFIDF_Char_Fivegram_CosineSim",
    ]

    lsa_word_ngram_cosinesim_list = [
    "LSA100_Word_Unigram_CosineSim",
    # "LSA100_Word_Bigram_CosineSim",
    # "LSA100_Word_Trigram_CosineSim",
    ]
    lsa_char_ngram_cosinesim_list = [
    # "LSA100_Char_Bigram_CosineSim",
    # "LSA100_Char_Trigram_CosineSim",
    "LSA100_Char_Fourgram_CosineSim",
    # "LSA100_Char_Fivegram_CosineSim",
    ]

    doc2vec_list = [
    "Doc2Vec_Homedepot_D100_CosineSim",
    ]

    word2vec_list = [
    "Word2Vec_N_Similarity",
    "Word2Vec_Homedepot_D100_CosineSim_Mean_Mean",
    "Word2Vec_Homedepot_D100_CosineSim_Max_Mean",
    "Word2Vec_Homedepot_D100_CosineSim_Min_Mean",
    ]

    distance_generator_list = \
    match_list + \
    tfidf_list + \
    intersect_ngram_count_list + \
    first_last_ngram_list + \
    cooccurrence_ngram_count_list + \
    ngram_jaccard_list + \
    tfidf_word_ngram_cosinesim_list + \
    tfidf_char_ngram_cosinesim_list + \
    lsa_word_ngram_cosinesim_list + \
    lsa_char_ngram_cosinesim_list + \
    char_dist_sim_list + \
    word2vec_list + \
    doc2vec_list

    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["search_term"] )
    target_fields_list.append( ["product_title", "product_title_product_name"] )
    aggregation_mode = ["mean", "max", "min"]
    for group_id_name in group_id_names:
        group_id_list = pkl_utils._load(os.path.join(config.FEAT_DIR, group_id_name+"_1D.pkl"))
        for distance_generator in distance_generator_list:
            for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
                for obs_field in obs_fields:
                    for target_field in target_fields:
                        dist_name = "%s_%s_x_%s"%(distance_generator, obs_field, target_field)
                        try:
                            dist_list = pkl_utils._load(os.path.join(config.FEAT_DIR, dist_name+"_1D.pkl"))
                            ext = GroupDistanceStat(dist_list, group_id_list, dist_name, group_id_name, aggregation_mode)
                            x = ext.transform()
                            if isinstance(ext.__name__(), list):
                                for i,feat_name in enumerate(ext.__name__()):
                                    dim = 1
                                    fname = "%s_%dD"%(feat_name, dim)
                                    pkl_utils._save(os.path.join(config.FEAT_DIR, fname+config.FEAT_FILE_SUFFIX), x[:,i])
                                    corr = np_utils._corr(x[:TRAIN_SIZE,i], y_train)
                                    logger.info("%s (%dD): corr = %.6f"%(fname, dim, corr))
                        except Exception:
                            logger.info("Skip %s"%dist_name)
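GroupDistanceStat itself is not shown here. Conceptually it aggregates a precomputed 1D distance feature within each group id and broadcasts the statistic back to every row; a rough pandas-based sketch (name and behavior assumed, not taken from the listing):

import numpy as np
import pandas as pd

def group_distance_stat(dist_list, group_id_list, modes=("mean", "max", "min")):
    # one output column per aggregation mode, aligned with the input rows
    s = pd.Series(dist_list)
    grouped = s.groupby(pd.Series(group_id_list))
    return np.column_stack([grouped.transform(m).values for m in modes])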
Example No. 7
def main():
    logname = "generate_feature_group_distance_stat_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    y_train = dfAll["relevance"].values[:TRAIN_SIZE]

    group_id_names = [
        "DocId_search_term", "DocId_product_title", "DocIdEcho_product_uid"
    ]

    match_list = [
        "MatchQueryCount",
        "MatchQueryRatio",
        "LongestMatchRatio",
    ]

    tfidf_list = [
        "StatCoocTF_Unigram_Mean",
        "StatCoocTF_Unigram_Max",
        "StatCoocTF_Unigram_Min",
        # "StatCoocNormTF_Unigram_Mean",
        # "StatCoocNormTF_Unigram_Max",
        # "StatCoocNormTF_Unigram_Min",
        "StatCoocTFIDF_Unigram_Mean",
        "StatCoocTFIDF_Unigram_Max",
        "StatCoocTFIDF_Unigram_Min",
        "StatCoocBM25_Unigram_Mean",
        "StatCoocBM25_Unigram_Max",
        "StatCoocBM25_Unigram_Min",
        # "StatCoocTF_Bigram_Mean",
        # "StatCoocTF_Bigram_Max",
        # "StatCoocTF_Bigram_Min",
        # "StatCoocNormTF_Bigram_Mean",
        # "StatCoocNormTF_Bigram_Max",
        # "StatCoocNormTF_Bigram_Min",
        # "StatCoocTFIDF_Bigram_Mean",
        # "StatCoocTFIDF_Bigram_Max",
        # "StatCoocTFIDF_Bigram_Min",
        # "StatCoocBM25_Bigram_Mean",
        # "StatCoocBM25_Bigram_Max",
        # "StatCoocBM25_Bigram_Min",
        # "StatCoocTF_Trigram_Mean",
        # "StatCoocTF_Trigram_Max",
        # "StatCoocTF_Trigram_Min",
        # "StatCoocNormTF_Trigram_Mean",
        # "StatCoocNormTF_Trigram_Max",
        # "StatCoocNormTF_Trigram_Min",
        # "StatCoocTFIDF_Trigram_Mean",
        # "StatCoocTFIDF_Trigram_Max",
        # "StatCoocTFIDF_Trigram_Min",
        # "StatCoocBM25_Trigram_Mean",
        # "StatCoocBM25_Trigram_Max",
        # "StatCoocBM25_Trigram_Min",
    ]
    intersect_ngram_count_list = [
        "IntersectCount_Unigram",
        "IntersectRatio_Unigram",
        # "IntersectCount_Bigram",
        # "IntersectRatio_Bigram",
        # "IntersectCount_Trigram",
        # "IntersectRatio_Trigram",
    ]
    first_last_ngram_list = [
        "FirstIntersectCount_Unigram",
        "FirstIntersectRatio_Unigram",
        "LastIntersectCount_Unigram",
        "LastIntersectRatio_Unigram",
        # "FirstIntersectCount_Bigram",
        # "FirstIntersectRatio_Bigram",
        # "LastIntersectCount_Bigram",
        # "LastIntersectRatio_Bigram",
        # "FirstIntersectCount_Trigram",
        # "FirstIntersectRatio_Trigram",
        # "LastIntersectCount_Trigram",
        # "LastIntersectRatio_Trigram",
    ]

    cooccurrence_ngram_count_list = [
        "CooccurrenceCount_Unigram",
        "CooccurrenceRatio_Unigram",
        # "CooccurrenceCount_Bigram",
        # "CooccurrenceRatio_Bigram",
        # "CooccurrenceCount_Trigram",
        # "CooccurrenceRatio_Trigram",
    ]

    ngram_jaccard_list = [
        "JaccardCoef_Unigram",
        # "JaccardCoef_Bigram",
        # "JaccardCoef_Trigram",
        "DiceDistance_Unigram",
        # "DiceDistance_Bigram",
        # "DiceDistance_Trigram",
    ]

    char_dist_sim_list = [
        "CharDistribution_CosineSim",
        "CharDistribution_KL",
    ]

    tfidf_word_ngram_cosinesim_list = [
        "TFIDF_Word_Unigram_CosineSim",
        # "TFIDF_Word_Bigram_CosineSim",
        # "TFIDF_Word_Trigram_CosineSim",
    ]
    tfidf_char_ngram_cosinesim_list = [
        # "TFIDF_Char_Bigram_CosineSim",
        # "TFIDF_Char_Trigram_CosineSim",
        "TFIDF_Char_Fourgram_CosineSim",
        # "TFIDF_Char_Fivegram_CosineSim",
    ]

    lsa_word_ngram_cosinesim_list = [
        "LSA100_Word_Unigram_CosineSim",
        # "LSA100_Word_Bigram_CosineSim",
        # "LSA100_Word_Trigram_CosineSim",
    ]
    lsa_char_ngram_cosinesim_list = [
        # "LSA100_Char_Bigram_CosineSim",
        # "LSA100_Char_Trigram_CosineSim",
        "LSA100_Char_Fourgram_CosineSim",
        # "LSA100_Char_Fivegram_CosineSim",
    ]

    doc2vec_list = [
        "Doc2Vec_Homedepot_D100_CosineSim",
    ]

    word2vec_list = [
        "Word2Vec_N_Similarity",
        "Word2Vec_Homedepot_D100_CosineSim_Mean_Mean",
        "Word2Vec_Homedepot_D100_CosineSim_Max_Mean",
        "Word2Vec_Homedepot_D100_CosineSim_Min_Mean",
    ]

    distance_generator_list = \
    match_list + \
    tfidf_list + \
    intersect_ngram_count_list + \
    first_last_ngram_list + \
    cooccurrence_ngram_count_list + \
    ngram_jaccard_list + \
    tfidf_word_ngram_cosinesim_list + \
    tfidf_char_ngram_cosinesim_list + \
    lsa_word_ngram_cosinesim_list + \
    lsa_char_ngram_cosinesim_list + \
    char_dist_sim_list + \
    word2vec_list + \
    doc2vec_list

    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append(["search_term"])
    target_fields_list.append(["product_title", "product_title_product_name"])
    aggregation_mode = ["mean", "max", "min"]
    for group_id_name in group_id_names:
        group_id_list = pkl_utils._load(
            os.path.join(config.FEAT_DIR, group_id_name + "_1D.pkl"))
        for distance_generator in distance_generator_list:
            for obs_fields, target_fields in zip(obs_fields_list,
                                                 target_fields_list):
                for obs_field in obs_fields:
                    for target_field in target_fields:
                        dist_name = "%s_%s_x_%s" % (distance_generator,
                                                    obs_field, target_field)
                        try:
                            dist_list = pkl_utils._load(
                                os.path.join(config.FEAT_DIR,
                                             dist_name + "_1D.pkl"))
                            ext = GroupDistanceStat(dist_list, group_id_list,
                                                    dist_name, group_id_name,
                                                    aggregation_mode)
                            x = ext.transform()
                            if isinstance(ext.__name__(), list):
                                for i, feat_name in enumerate(ext.__name__()):
                                    dim = 1
                                    fname = "%s_%dD" % (feat_name, dim)
                                    pkl_utils._save(
                                        os.path.join(
                                            config.FEAT_DIR,
                                            fname + config.FEAT_FILE_SUFFIX),
                                        x[:, i])
                                    corr = np_utils._corr(
                                        x[:TRAIN_SIZE, i], y_train)
                                    logger.info("%s (%dD): corr = %.6f" %
                                                (fname, dim, corr))
                        except Exception:
                            logger.info("Skip %s" % dist_name)
Example No. 8
    def combine(self):

        dfAll = pkl_utils._load(config.INFO_DATA)
        dfAll_raw = dfAll.copy()
        y_train = dfAll["relevance"].values[:TRAIN_SIZE]

        ## for basic features
        feat_cnt = 0
        self.logger.info("Run for basic...")
        for file_name in sorted(os.listdir(config.FEAT_DIR)):
            if self.feature_suffix in file_name:
                fname = file_name.split(".")[0]
                if fname not in self.feature_dict:
                    continue
                x = self.load_feature(config.FEAT_DIR, fname)
                x = np.nan_to_num(x)
                if np.isnan(x).any():
                    self.logger.info("%s nan"%fname)
                    continue
                # apply feature transform
                mandatory = self.feature_dict[fname][0]
                transformer = self.feature_dict[fname][1]
                x = transformer.fit_transform(x)
                dim = np_utils._dim(x)
                if dim == 1:
                    corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
                    if not mandatory and abs(corr) < self.corr_threshold:
                        self.logger.info("Drop: {} ({}D) (abs corr = {}, < threshold = {})".format(
                            fname, dim, abs(corr), self.corr_threshold))
                        continue
                    dfAll[fname] = x
                    self.feature_names.append(fname)
                else:
                    # use i, not x, to avoid shadowing the feature array
                    columns = ["%s_%d"%(fname, i) for i in range(dim)]
                    df = pd.DataFrame(x, columns=columns)
                    dfAll = pd.concat([dfAll, df], axis=1)
                    self.feature_names.extend(columns)
                feat_cnt += 1
                self.feature_names_basic.append(fname)
                if dim == 1:
                    self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D) (corr = {})".format(
                        feat_cnt, len(self.feature_dict.keys()), fname, dim, corr))
                else:
                    self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format(
                        feat_cnt, len(self.feature_dict.keys()), fname, dim))
        dfAll.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
        ## basic
        dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()
        self.y_train = dfTrain["relevance"].values.astype(float)
        dfTrain.drop(["id","relevance"], axis=1, inplace=True)
        self.X_train = dfTrain.values.astype(float)

        dfTest = dfAll.iloc[TRAIN_SIZE:].copy()
        self.id_test = dfTest["id"].values.astype(int)
        dfTest.drop(["id","relevance"], axis=1, inplace=True)
        self.X_test = dfTest.values.astype(float)

        ## all
        first = True
        feat_cv_cnt = 0
        dfAll_cv_all = dfAll_raw.copy()
        feature_dir = "%s/All" % (config.FEAT_DIR)
        for file_name in sorted(os.listdir(feature_dir)):
            if self.feature_suffix in file_name:
                fname = file_name.split(".")[0]
                if fname not in self.feature_dict:
                    continue
                if first:
                    self.logger.info("Run for all...")
                    first = False
                x = self.load_feature(feature_dir, fname)
                x = np.nan_to_num(x)
                if np.isnan(x).any():
                    self.logger.info("%s nan"%fname)
                    continue
                # apply feature transform
                mandatory = self.feature_dict[fname][0]
                transformer = self.feature_dict[fname][1]
                x = transformer.fit_transform(x)
                dim = np_utils._dim(x)
                if dim == 1:
                    corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
                    if not mandatory and abs(corr) < self.corr_threshold:
                        self.logger.info("Drop: {} ({}D) (abs corr = {}, < threshold = {})".format(
                            fname, dim, abs(corr), self.corr_threshold))
                        continue
                    dfAll_cv_all[fname] = x
                    self.feature_names.append(fname)
                else:
                    columns = ["%s_%d"%(fname, i) for i in range(dim)]
                    df = pd.DataFrame(x, columns=columns)
                    dfAll_cv_all = pd.concat([dfAll_cv_all, df], axis=1)
                    self.feature_names.extend(columns)
                feat_cv_cnt += 1
                self.feature_names_cv.append(fname)
                if dim == 1:
                    self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D) (corr = {})".format(
                        feat_cnt+feat_cv_cnt, len(self.feature_dict.keys()), fname, dim, corr))
                else:
                    self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format(
                        feat_cnt+feat_cv_cnt, len(self.feature_dict.keys()), fname, dim))
        if feat_cv_cnt > 0:
            dfAll_cv_all.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
            X_tmp = dfAll_cv_all.drop(["id","relevance"], axis=1).values.astype(float)
            self.X_train_cv_all = X_tmp[:TRAIN_SIZE]
            self.X_test = np.hstack((self.X_test, X_tmp[TRAIN_SIZE:]))
        else:
            self.X_train_cv_all = None
        feat_cnt += feat_cv_cnt

        ## for cv features
        first = True
        for run in range(1,self.n_iter+1):
            feat_cv_cnt = 0
            dfAll_cv = dfAll_raw.copy()
            feature_dir = "%s/Run%d" % (config.FEAT_DIR, run)
            for file_name in sorted(os.listdir(feature_dir)):
                if self.feature_suffix in file_name:
                    fname = file_name.split(".")[0]
                    if (fname not in self.feature_dict) or (fname not in self.feature_names_cv):
                        continue
                    if first:
                        self.logger.info("Run for cv...")
                        first = False
                    if feat_cv_cnt == 0:
                        self.logger.info("Run %d"%run)
                    x = self.load_feature(feature_dir, fname)
                    x = np.nan_to_num(x)
                    if np.isnan(x).any():
                        self.logger.info("%s nan"%fname)
                        continue
                    # apply feature transform
                    mandatory = self.feature_dict[fname][0]
                    transformer = self.feature_dict[fname][1]
                    x = transformer.fit_transform(x)
                    dim = np_utils._dim(x)
                    if dim == 1:
                        dfAll_cv[fname] = x
                    else:
                        columns = ["%s_%d"%(fname, i) for i in range(dim)]
                        df = pd.DataFrame(x, columns=columns)
                        dfAll_cv = pd.concat([dfAll_cv, df], axis=1)
                    feat_cv_cnt += 1
                    self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format(
                        feat_cnt+feat_cv_cnt, len(self.feature_dict.keys()), fname, dim))
            if feat_cv_cnt > 0:
                dfAll_cv.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
                dfTrain_cv = dfAll_cv.iloc[:TRAIN_SIZE].copy()
                X_tmp = dfTrain_cv.drop(["id","relevance"], axis=1).values.astype(float)
                if run == 1:
                    self.X_train_cv = np.zeros((X_tmp.shape[0], X_tmp.shape[1], self.n_iter), dtype=float)
                self.X_train_cv[:,:,run-1] = X_tmp
        if feat_cv_cnt == 0:
            self.X_train_cv = None
            self.basic_only = 1

        # report final results
        if self.basic_only:
            self.logger.info("Overall Shape: %d x %d"%(len(self.y_train), self.X_train.shape[1]))
        else:
            self.logger.info("Overall Shape: %d x %d"%(
                len(self.y_train), self.X_train.shape[1]+self.X_train_cv_all.shape[1])) 
        self.logger.info("Done combinning.")

        return self
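combine() assumes each feature_dict value is a (mandatory, transformer) pair, where transformer exposes fit_transform(). A minimal stand-in that satisfies that interface (illustrative only; the real project may ship richer transformers):

class SimpleTransform:
    """Applies an elementwise function to a feature array; identity by default."""
    def __init__(self, func=None):
        self.func = func
    def fit_transform(self, x):
        return self.func(x) if self.func is not None else x

# e.g. a feature_dict entry: file name -> (mandatory flag, transformer)
# feature_dict["MatchQueryRatio_search_term_x_product_title_1D"] = (True, SimpleTransform())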
Example No. 9
    def combine(self):

        dfAll = pkl_utils._load(config.INFO_DATA)
        dfAll_raw = dfAll.copy()
        y_train = dfAll["relevance"].values[:TRAIN_SIZE]

        ## for basic features
        feat_cnt = 0
        self.logger.info("Run for basic...")
        for file_name in sorted(os.listdir(config.FEAT_DIR)):
            if self.feature_suffix in file_name:
                fname = file_name.split(".")[0]
                if fname not in self.feature_dict:
                    continue
                x = self.load_feature(config.FEAT_DIR, fname)
                x = np.nan_to_num(x)
                if np.isnan(x).any():
                    self.logger.info("%s nan" % fname)
                    continue
                # apply feature transform
                mandatory = self.feature_dict[fname][0]
                transformer = self.feature_dict[fname][1]
                x = transformer.fit_transform(x)
                dim = np_utils._dim(x)
                if dim == 1:
                    corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
                    if not mandatory and abs(corr) < self.corr_threshold:
                        self.logger.info(
                            "Drop: {} ({}D) (abs corr = {}, < threshold = {})".
                            format(fname, dim, abs(corr), self.corr_threshold))
                        continue
                    dfAll[fname] = x
                    self.feature_names.append(fname)
                else:
                    # use i, not x, to avoid shadowing the feature array
                    columns = ["%s_%d" % (fname, i) for i in range(dim)]
                    df = pd.DataFrame(x, columns=columns)
                    dfAll = pd.concat([dfAll, df], axis=1)
                    self.feature_names.extend(columns)
                feat_cnt += 1
                self.feature_names_basic.append(fname)
                if dim == 1:
                    self.logger.info(
                        "Combine {:>3}/{:>3} feat: {} ({}D) (corr = {})".
                        format(feat_cnt, len(self.feature_dict.keys()), fname,
                               dim, corr))
                else:
                    self.logger.info(
                        "Combine {:>3}/{:>3} feat: {} ({}D)".format(
                            feat_cnt, len(self.feature_dict.keys()), fname,
                            dim))
        dfAll.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
        ## basic
        dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()
        self.y_train = dfTrain["relevance"].values.astype(float)
        dfTrain.drop(["id", "relevance"], axis=1, inplace=True)
        self.X_train = dfTrain.values.astype(float)

        dfTest = dfAll.iloc[TRAIN_SIZE:].copy()
        self.id_test = dfTest["id"].values.astype(int)
        dfTest.drop(["id", "relevance"], axis=1, inplace=True)
        self.X_test = dfTest.values.astype(float)

        ## all
        first = True
        feat_cv_cnt = 0
        dfAll_cv_all = dfAll_raw.copy()
        feature_dir = "%s/All" % (config.FEAT_DIR)
        for file_name in sorted(os.listdir(feature_dir)):
            if self.feature_suffix in file_name:
                fname = file_name.split(".")[0]
                if fname not in self.feature_dict:
                    continue
                if first:
                    self.logger.info("Run for all...")
                    first = False
                x = self.load_feature(feature_dir, fname)
                x = np.nan_to_num(x)
                if np.isnan(x).any():
                    self.logger.info("%s nan" % fname)
                    continue
                # apply feature transform
                mandatory = self.feature_dict[fname][0]
                transformer = self.feature_dict[fname][1]
                x = transformer.fit_transform(x)
                dim = np_utils._dim(x)
                if dim == 1:
                    corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
                    if not mandatory and abs(corr) < self.corr_threshold:
                        self.logger.info(
                            "Drop: {} ({}D) (abs corr = {}, < threshold = {})".
                            format(fname, dim, abs(corr), self.corr_threshold))
                        continue
                    dfAll_cv_all[fname] = x
                    self.feature_names.append(fname)
                else:
                    columns = ["%s_%d" % (fname, i) for i in range(dim)]
                    df = pd.DataFrame(x, columns=columns)
                    dfAll_cv_all = pd.concat([dfAll_cv_all, df], axis=1)
                    self.feature_names.extend(columns)
                feat_cv_cnt += 1
                self.feature_names_cv.append(fname)
                if dim == 1:
                    self.logger.info(
                        "Combine {:>3}/{:>3} feat: {} ({}D) (corr = {})".
                        format(feat_cnt + feat_cv_cnt,
                               len(self.feature_dict.keys()), fname, dim,
                               corr))
                else:
                    self.logger.info(
                        "Combine {:>3}/{:>3} feat: {} ({}D)".format(
                            feat_cnt + feat_cv_cnt,
                            len(self.feature_dict.keys()), fname, dim))
        if feat_cv_cnt > 0:
            dfAll_cv_all.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
            X_tmp = dfAll_cv_all.drop(["id", "relevance"],
                                      axis=1).values.astype(float)
            self.X_train_cv_all = X_tmp[:TRAIN_SIZE]
            self.X_test = np.hstack((self.X_test, X_tmp[TRAIN_SIZE:]))
        else:
            self.X_train_cv_all = None
        feat_cnt += feat_cv_cnt

        ## for cv features
        first = True
        for run in range(1, self.n_iter + 1):
            feat_cv_cnt = 0
            dfAll_cv = dfAll_raw.copy()
            feature_dir = "%s/Run%d" % (config.FEAT_DIR, run)
            for file_name in sorted(os.listdir(feature_dir)):
                if self.feature_suffix in file_name:
                    fname = file_name.split(".")[0]
                    if (fname not in self.feature_dict) or (
                            fname not in self.feature_names_cv):
                        continue
                    if first:
                        self.logger.info("Run for cv...")
                        first = False
                    if feat_cv_cnt == 0:
                        self.logger.info("Run %d" % run)
                    x = self.load_feature(feature_dir, fname)
                    x = np.nan_to_num(x)
                    if np.isnan(x).any():
                        self.logger.info("%s nan" % fname)
                        continue
                    # apply feature transform
                    mandatory = self.feature_dict[fname][0]
                    transformer = self.feature_dict[fname][1]
                    x = transformer.fit_transform(x)
                    dim = np_utils._dim(x)
                    if dim == 1:
                        dfAll_cv[fname] = x
                    else:
                        columns = ["%s_%d" % (fname, i) for i in range(dim)]
                        df = pd.DataFrame(x, columns=columns)
                        dfAll_cv = pd.concat([dfAll_cv, df], axis=1)
                    feat_cv_cnt += 1
                    self.logger.info(
                        "Combine {:>3}/{:>3} feat: {} ({}D)".format(
                            feat_cnt + feat_cv_cnt,
                            len(self.feature_dict.keys()), fname, dim))
            if feat_cv_cnt > 0:
                dfAll_cv.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
                dfTrain_cv = dfAll_cv.iloc[:TRAIN_SIZE].copy()
                X_tmp = dfTrain_cv.drop(["id", "relevance"],
                                        axis=1).values.astype(float)
                if run == 1:
                    self.X_train_cv = np.zeros(
                        (X_tmp.shape[0], X_tmp.shape[1], self.n_iter),
                        dtype=float)
                self.X_train_cv[:, :, run - 1] = X_tmp
        if feat_cv_cnt == 0:
            self.X_train_cv = None
            self.basic_only = 1

        # report final results
        if self.basic_only:
            self.logger.info("Overall Shape: %d x %d" %
                             (len(self.y_train), self.X_train.shape[1]))
        else:
            self.logger.info("Overall Shape: %d x %d" %
                             (len(self.y_train), self.X_train.shape[1] +
                              self.X_train_cv_all.shape[1]))
        self.logger.info("Done combinning.")

        return self
Example No. 10
    def combine(self):
        dfAll = table_utils._read(config.INFO_DATA)
        dfAll_raw = dfAll.copy()
        y = dfAll['relevance'].values

        feat_cnt = 0
        self.logger.info('Run for basic...')
        for file_name in sorted(os.listdir(config.FEAT_DIR)):
            if config.FEAT_FILE_SUFFIX not in file_name:
                continue
            fname = os.path.splitext(file_name)[0]
            if fname not in self.feature_dict:
                continue
            x = self.load_feature(config.FEAT_DIR, fname)
            x = np.nan_to_num(x)
            # nan_to_num above already replaced NaNs; kept as a defensive check
            if np.isnan(x).any():
                self.logger.info("%s nan" % (fname))
                continue
            # apply the configured feature transformer
            mandatory = self.feature_dict[fname][0]
            transformer = self.feature_dict[fname][1]
            x = transformer.fit_transform(x)
            dim = np_utils._dim(x)
            if dim == 1:
                corr = np_utils._corr(x, y)
                if not mandatory and (np.isnan(corr)
                                      or abs(corr) < self.corr_threshold):
                    self.logger.info(
                        "Drop: {} ({}D) (abs_corr = {}, < threshold {})".
                        format(fname, dim, abs(corr), self.corr_threshold))
                    continue
                dfAll[fname] = x
                self.feature_names.append(fname)
            else:
                # use i, not x, to avoid shadowing the feature array
                columns = ["%s_%d" % (fname, i) for i in range(dim)]
                df = pd.DataFrame(x, columns=columns)
                dfAll = pd.concat([dfAll, df], axis=1)
                self.feature_names.extend(columns)
            feat_cnt += 1
            self.feature_names_basic.append(fname)
            if dim == 1:
                self.logger.info(
                    "Combine {:>3}/{:>3} feat: {} ({}D) (corr = {})".format(
                        feat_cnt, len(self.feature_dict.keys()), fname, dim,
                        corr))
            else:
                self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format(
                    feat_cnt, len(self.feature_dict.keys()), fname, dim))

        dfAll.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
        self.y = dfAll["relevance"].values.astype(float)
        self.weights = dfAll['weight'].values
        self.query_ids = dfAll['norm_query_id'].values
        dfAll.drop(["relevance", "weight", "norm_query_id"],
                   axis=1,
                   inplace=True)
        self.X = dfAll.values.astype(float)

        self.logger.info("Overall Shape: %d x %d" %
                         (len(self.y), self.X.shape[1]))
        self.logger.info("Done combining")
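The load_feature helper used throughout these examples is not shown. Given how features are saved above (one serialized array per file, name plus config.FEAT_FILE_SUFFIX), a plausible sketch is (assumed, not from the listing):

import os

def load_feature(self, feature_dir, feature_name):
    # mirror of the save path: one serialized array per feature file
    fname = os.path.join(feature_dir, feature_name + config.FEAT_FILE_SUFFIX)
    return table_utils._read(fname)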