def main():
    """Generate match-based pairwise features for question1/question2, in both directions."""
    logname = "generate_feature_match_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    feature_classes = [
        MatchQueryCount,
        MatchQueryRatio,
        LongestMatchSize,
        LongestMatchRatio,
    ]
    # (observation fields, target fields): question1 matched in question2, then the reverse
    direction_pairs = [
        (["question1"], ["question2"]),
        (["question2"], ["question1"]),
    ]
    for obs_fields, target_fields in direction_pairs:
        for feature_class in feature_classes:
            wrapper = PairwiseFeatureWrapper(
                feature_class, dfAll, obs_fields, target_fields,
                [], config.FEAT_DIR, logger)
            wrapper.go()
def main():
    """Generate intersect-position n-gram features between query and document fields."""
    logname = "generate_feature_intersect_position_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    feature_classes = [IntersectPosition_Ngram, IntersectNormPosition_Ngram]
    query_fields = ["search_term", "search_term_product_name",
                    "search_term_alt", "search_term_auto_corrected"]
    # original document list sliced [1:2] -> only product_title_product_name
    doc_fields = ["product_title_product_name"]
    ngrams = [1, 2, 3]  # mixed grams 12/123 disabled by the original [:3] slice
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    # query-in-document, then document-in-query
    for obs_fields, target_fields in ((query_fields, doc_fields),
                                      (doc_fields, query_fields)):
        for feature_class in feature_classes:
            for ngram in ngrams:
                wrapper = PairwiseFeatureWrapper(
                    feature_class, dfAll, obs_fields, target_fields,
                    [ngram, aggregation_mode], config.FEAT_DIR, logger)
                wrapper.go()
def run_ngram_jaccard():
    """Generate Jaccard/Dice n-gram similarity features for query vs document fields."""
    logname = "generate_feature_ngram_jaccard_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    # original query list sliced [:2]
    obs_fields = ["search_term", "search_term_product_name"]
    target_fields = [
        "product_title", "product_title_product_name", "product_description",
        "product_attribute", "product_brand", "product_color",
    ]
    for feature_class in (JaccardCoef_Ngram, DiceDistance_Ngram):
        for ngram in (1, 2, 3):  # 12/123 disabled by the original [:3] slice
            wrapper = PairwiseFeatureWrapper(
                feature_class, dfAll, obs_fields, target_fields,
                [ngram], config.FEAT_DIR, logger)
            wrapper.go()
def run_count():
    """Generate first/last n-gram intersect count and ratio features, both directions."""
    logname = "generate_feature_first_last_ngram_count_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    feature_classes = [
        FirstIntersectCount_Ngram,
        LastIntersectCount_Ngram,
        FirstIntersectRatio_Ngram,
        LastIntersectRatio_Ngram,
    ]
    # query-in-document, then document-in-query
    for obs_fields, target_fields in ((["question1"], ["question2"]),
                                      (["question2"], ["question1"])):
        for feature_class in feature_classes:
            for ngram in (1, 2, 3):  # 12/123 disabled by the original [:3] slice
                wrapper = PairwiseFeatureWrapper(
                    feature_class, dfAll, obs_fields, target_fields,
                    [ngram], config.FEAT_DIR, logger)
                wrapper.go()
def main():
    """Generate doc2vec pairwise similarity features (HomeDepot model).

    Bug fix: the loader chain was ``if ".bin" ... if ".txt" ... else``, so a
    ``.bin`` model was first loaded correctly and then immediately re-loaded
    (and clobbered) by the ``else`` branch; the second test is now ``elif``
    so exactly one loader runs per model path.
    """
    logname = "generate_feature_doc2vec_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    #### NOTE: use data BEFORE STEMMING
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)
    doc2vec_model_dirs = []
    model_prefixes = []
    ## doc2vec model trained with Homedepot dataset: brand/color/obs/title/description
    doc2vec_model_dirs.append(
        config.DOC2VEC_MODEL_DIR + "/Homedepot-doc2vec-D%d-min_count%d.model" % (
            config.EMBEDDING_DIM, config.EMBEDDING_MIN_COUNT))
    model_prefixes.append("Homedepot")
    for doc2vec_model_dir, model_prefix in zip(doc2vec_model_dirs, model_prefixes):
        ## load model; format is inferred from the file extension
        try:
            if ".bin" in doc2vec_model_dir:
                doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(
                    doc2vec_model_dir, binary=True)
            elif ".txt" in doc2vec_model_dir:
                doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(
                    doc2vec_model_dir, binary=False)
            else:
                doc2vec_model = gensim.models.Doc2Vec.load(doc2vec_model_dir)
            doc2vec_model_sent_label = pkl_utils._load(doc2vec_model_dir + ".sent_label")
        except Exception:
            # best-effort: skip models that are missing or fail to load,
            # but leave a trace instead of swallowing silently
            logger.info("Skip doc2vec model: %s" % doc2vec_model_dir)
            continue
        ## pairwise features
        generators = [
            Doc2Vec_CosineSim,
            Doc2Vec_RMSE,
            # Doc2Vec_Vdiff,
        ]
        obs_fields_list = [["search_term", "search_term_alt"][:1]]
        target_fields_list = [[
            "product_title", "product_description", "product_attribute",
            "product_brand", "product_color",
        ]]
        for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
            for generator in generators:
                param_list = [doc2vec_model, doc2vec_model_sent_label, model_prefix]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                            target_fields, param_list,
                                            config.FEAT_DIR, logger)
                pf.go()
def main():
    """Generate intersect-position n-gram features for question1/question2, both ways."""
    logname = "generate_feature_intersect_position_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    feature_classes = [IntersectPosition_Ngram, IntersectNormPosition_Ngram]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    # query-in-document, then document-in-query
    for obs_fields, target_fields in ((["question1"], ["question2"]),
                                      (["question2"], ["question1"])):
        for feature_class in feature_classes:
            for ngram in (1, 2, 3):  # 12/123 disabled by the original [:3] slice
                wrapper = PairwiseFeatureWrapper(
                    feature_class, dfAll, obs_fields, target_fields,
                    [ngram, aggregation_mode], config.FEAT_DIR, logger)
                wrapper.go()
def run_position():
    """Generate first/last intersect position n-gram features, both directions."""
    logname = "generate_feature_first_last_ngram_position_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    feature_classes = [
        FirstIntersectPosition_Ngram,
        LastIntersectPosition_Ngram,
        FirstIntersectNormPosition_Ngram,
        LastIntersectNormPosition_Ngram,
    ]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    # NOTE: unlike sibling scripts, the full ngram grid (incl. 12/123) is used here
    for obs_fields, target_fields in ((["question1"], ["question2"]),
                                      (["question2"], ["question1"])):
        for feature_class in feature_classes:
            for ngram in (1, 2, 3, 12, 123):
                wrapper = PairwiseFeatureWrapper(
                    feature_class, dfAll, obs_fields, target_fields,
                    [ngram, aggregation_mode], config.FEAT_DIR, logger)
                wrapper.go()
def run_tsne_lsa_ngram():
    """Generate TSNE-LSA n-gram features: standalone per question field, then pairwise."""
    logname = "generate_feature_tsne_lsa_ngram_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    ## standalone features on each question field
    # the full grids [[1,2,3],[2,3,4,5]] were overridden to one ngram per generator
    standalone = ((TSNE_LSA_Word_Ngram, [3]), (TSNE_LSA_Char_Ngram, [4]))
    obs_fields = ["question1", "question2"]
    for feature_class, ngrams in standalone:
        for ngram in ngrams:
            sf = StandaloneFeatureWrapper(
                feature_class, dfAll, obs_fields,
                [ngram, config.SVD_DIM, config.SVD_N_ITER],
                config.FEAT_DIR, logger, force_corr=True)
            sf.go()
    ## pairwise features between question1 and question2
    for ngram in (1, 2):
        for feature_class in (TSNE_LSA_Word_Ngram_Pair,):
            pf = PairwiseFeatureWrapper(
                feature_class, dfAll, ["question1"], ["question2"],
                [ngram, config.SVD_DIM, config.SVD_N_ITER],
                config.FEAT_DIR, logger, force_corr=True)
            pf.go()
def run_lsa_ngram_pair():
    """Symmetric in obs and target."""
    logname = "generate_feature_lsa_ngram_pair_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    ## question1 in question2
    for ngram in (1, 2, 3):
        for feature_class in (LSA_Word_Ngram_Pair,):
            pf = PairwiseFeatureWrapper(
                feature_class, dfAll, ["question1"], ["question2"],
                [ngram, config.SVD_DIM, config.SVD_N_ITER],
                config.FEAT_DIR, logger, force_corr=True)
            pf.go()
            # reclaim memory between the (large) LSA runs
            del pf
            gc.collect()
def run_tsne_lsa_ngram():
    """Generate TSNE-LSA n-gram features: standalone per field, then query/document pairs."""
    logname = "generate_feature_tsne_lsa_ngram_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    # product_attribute_list is not used here; drop it up front
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)
    ## standalone features
    # the full grids [[1,2,3],[2,3,4,5]] were overridden: char side reduced to [4]
    standalone = ((TSNE_LSA_Word_Ngram, [1, 2, 3]), (TSNE_LSA_Char_Ngram, [4]))
    obs_fields = ["search_term", "search_term_alt", "search_term_auto_corrected",
                  "product_title", "product_description"]
    for feature_class, ngrams in standalone:
        for ngram in ngrams:
            sf = StandaloneFeatureWrapper(
                feature_class, dfAll, obs_fields,
                [ngram, config.SVD_DIM, config.SVD_N_ITER],
                config.FEAT_DIR, logger, force_corr=True)
            sf.go()
    ## pairwise features
    pair_obs = ["search_term", "search_term_alt", "search_term_auto_corrected"]
    pair_targets = ["product_title", "product_description"]
    for ngram in (1, 2):
        for feature_class in (TSNE_LSA_Word_Ngram_Pair,):
            pf = PairwiseFeatureWrapper(
                feature_class, dfAll, pair_obs, pair_targets,
                [ngram, config.SVD_DIM, config.SVD_N_ITER],
                config.FEAT_DIR, logger, force_corr=True)
            pf.go()
def run_lsa_ngram_cooc():
    """Generate LSA word n-gram co-occurrence features (search_term vs product_title)."""
    logname = "generate_feature_lsa_ngram_cooc_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    # product_attribute_list is not used here; drop it up front
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)
    obs_fields = ["search_term"]       # original candidate list sliced [:1]
    target_fields = ["product_title"]  # original candidate list sliced [:1]
    for obs_ngram in (1, 2):
        for target_ngram in (1, 2):
            for feature_class in (LSA_Word_Ngram_Cooc,):
                params = [obs_ngram, target_ngram, config.SVD_DIM, config.SVD_N_ITER]
                pf = PairwiseFeatureWrapper(
                    feature_class, dfAll, obs_fields, target_fields,
                    params, config.FEAT_DIR, logger)
                pf.go()
def run_tfidf_ngram_cosinesim():
    """Symmetric in obs and target."""
    logname = "generate_feature_tfidf_ngram_cosinesim_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    # word n-grams use [2, 3]; char n-grams use [4]
    class_ngrams = ((TFIDF_Word_Ngram_CosineSim, [2, 3]),
                    (TFIDF_Char_Ngram_CosineSim, [4]))
    for feature_class, ngrams in class_ngrams:
        for ngram in ngrams:
            pf = PairwiseFeatureWrapper(
                feature_class, dfAll, ["question1"], ["question2"],
                [ngram], config.FEAT_DIR, logger, force_corr=True)
            pf.go()
            # reclaim memory between the (large) TF-IDF runs
            del pf
            gc.collect()
def run_count():
    """Generate first/last intersect count and ratio n-gram features, both directions."""
    logname = "generate_feature_first_last_ngram_count_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    feature_classes = [
        FirstIntersectCount_Ngram,
        LastIntersectCount_Ngram,
        FirstIntersectRatio_Ngram,
        LastIntersectRatio_Ngram,
    ]
    query_fields = ["search_term", "search_term_product_name",
                    "search_term_alt", "search_term_auto_corrected"]
    doc_fields = ["product_title", "product_title_product_name",
                  "product_description", "product_attribute",
                  "product_brand", "product_color"]
    # query-in-document, then document-in-query
    for obs_fields, target_fields in ((query_fields, doc_fields),
                                      (doc_fields, query_fields)):
        for feature_class in feature_classes:
            for ngram in (1, 2, 3):  # 12/123 disabled by the original [:3] slice
                pf = PairwiseFeatureWrapper(
                    feature_class, dfAll, obs_fields, target_fields,
                    [ngram], config.FEAT_DIR, logger)
                pf.go()
def main():
    """Generate WordNet path-similarity features for question1 vs question2."""
    logname = "generate_feature_wordnet_similarity_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    #### NOTE: use data BEFORE STEMMING
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)
    # WordNet_Lch_Similarity and WordNet_Wup_Similarity are not used in final
    # submission: the candidate list was sliced [:1], keeping only path similarity
    feature_classes = [WordNet_Path_Similarity]
    # double aggregation: first pass over token pairs, second pass across results
    aggregation_mode_prev = ["mean", "max", "min", "median"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for feature_class in feature_classes:
        pf = PairwiseFeatureWrapper(
            feature_class, dfAll, ["question1"], ["question2"],
            [aggregation_mode_prev, aggregation_mode], config.FEAT_DIR, logger)
        pf.go()
def run_compression_distance():
    """Generate compression-distance features for query vs document fields."""
    logname = "generate_feature_compression_distance_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    obs_fields = ["search_term", "search_term_product_name",
                  "search_term_alt", "search_term_auto_corrected"]
    target_fields = ["product_title", "product_title_product_name",
                     "product_description", "product_attribute",
                     "product_brand", "product_color"]
    wrapper = PairwiseFeatureWrapper(
        CompressionDistance, dfAll, obs_fields, target_fields,
        [], config.FEAT_DIR, logger)
    wrapper.go()
def run_compression_distance():
    """Generate compression-distance features: plain and n-gram double-aggregated.

    Bug fix: ``ngrams``, ``aggregation_mode_prev`` and ``aggregation_mode``
    were referenced in the n-gram loop but never defined, so the function
    raised NameError at runtime. They are now defined with the same values
    used by the sibling distance scripts (e.g. run_edit_distance).
    """
    logname = "generate_feature_compression_distance_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    obs_fields_list = [[
        "search_term", "search_term_product_name",
        "search_term_alt", "search_term_auto_corrected",
    ][:2]]
    target_fields_list = [[
        "product_title", "product_title_product_name", "product_description",
        "product_attribute", "product_brand", "product_color",
    ]]
    ngrams = [1, 2, 3, 12, 123][:3]
    # double aggregation: first over n-gram pairs, then across the results
    aggregation_mode_prev = ["mean", "max", "min", "median"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        # plain compression distance
        param_list = []
        pf = PairwiseFeatureWrapper(CompressionDistance, dfAll, obs_fields,
                                    target_fields, param_list,
                                    config.FEAT_DIR, logger)
        pf.go()
        # n-gram compression distance with double aggregation
        for ngram in ngrams:
            param_list = [ngram, aggregation_mode_prev, aggregation_mode]
            pf = PairwiseFeatureWrapper(CompressionDistance_Ngram, dfAll,
                                        obs_fields, target_fields, param_list,
                                        config.FEAT_DIR, logger)
            pf.go()
def run_ngram_jaccard():
    """Generate Jaccard/Dice n-gram features for single- and multi-valued fields."""
    logname = "generate_feature_ngram_jaccard.log"
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = table_utils._read(config.ALL_DATA)
    feature_classes = [JaccardCoef_Ngram, DiceDistance_Ngram]
    ngrams = [1, 2, 3]  # 12/123 disabled by the original [:3] slice
    dedup = True
    obs_fields = ["query", "norm_query"]
    # single valued fields
    single_targets = ["hit_title", "opening_text"]
    for feature_class in feature_classes:
        for ngram in ngrams:
            pf = PairwiseFeatureWrapper(
                feature_class, dfAll, obs_fields, single_targets,
                [ngram], config.FEAT_DIR, logger, dedup)
            pf.go()
    # multi-valued fields: wrap the estimator and aggregate across values
    multi_targets = ["category", "template", "heading", "outgoing_link",
                     "external_link", "redirect.title", "auxiliary_text"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for feature_class in feature_classes:
        multi_gen = MultiTargetEstimatorWrapper(feature_class)
        for ngram in ngrams:
            pf = PairwiseFeatureWrapper(
                multi_gen, dfAll, obs_fields, multi_targets,
                [ngram, aggregation_mode], config.FEAT_DIR, logger, dedup)
            pf.go()
def run_char_dist_sim():
    """Generate character-distribution similarity features for question1 vs question2."""
    logname = "generate_feature_char_dist_sim_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    for feature_class in (CharDistribution_Ratio,
                          CharDistribution_CosineSim,
                          CharDistribution_KL):
        wrapper = PairwiseFeatureWrapper(
            feature_class, dfAll, ["question1"], ["question2"],
            [], config.FEAT_DIR, logger)
        wrapper.go()
def main():
    """Generate doc2vec pairwise features for question1 vs question2 (Quora model).

    Bug fix: the loader chain was ``if ".bin" ... if ".txt" ... else``, so a
    ``.bin`` model was loaded and then immediately re-loaded (and clobbered)
    by the ``else`` branch; the second test is now ``elif`` so exactly one
    loader runs per model path.
    """
    logname = "generate_feature_doc2vec_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    #### NOTE: use data BEFORE STEMMING
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)
    doc2vec_model_dirs = []
    model_prefixes = []
    ## doc2vec model trained with Quora dataset: question1/question2
    doc2vec_model_dirs.append(
        config.DOC2VEC_MODEL_DIR + "/Quora-doc2vec-D%d-min_count%d.model" % (
            config.EMBEDDING_DIM, config.EMBEDDING_MIN_COUNT))
    model_prefixes.append("Quora")
    for doc2vec_model_dir, model_prefix in zip(doc2vec_model_dirs, model_prefixes):
        ## load model; format is inferred from the file extension
        try:
            if ".bin" in doc2vec_model_dir:
                doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(
                    doc2vec_model_dir, binary=True)
            elif ".txt" in doc2vec_model_dir:
                doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(
                    doc2vec_model_dir, binary=False)
            else:
                doc2vec_model = gensim.models.Doc2Vec.load(doc2vec_model_dir)
            doc2vec_model_sent_label = pkl_utils._load(doc2vec_model_dir + ".sent_label")
        except Exception:
            # best-effort: skip models that are missing or fail to load,
            # but leave a trace instead of swallowing silently
            logger.info("Skip doc2vec model: %s" % doc2vec_model_dir)
            continue
        ## pairwise
        generators = [
            Doc2Vec_CosineSim,
            Doc2Vec_RMSE,
            Doc2Vec_Vdiff,
        ]
        obs_fields_list = [["question1"]]
        target_fields_list = [["question2"]]
        for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
            for generator in generators:
                param_list = [doc2vec_model, doc2vec_model_sent_label, model_prefix]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                            target_fields, param_list,
                                            config.FEAT_DIR, logger)
                pf.go()
def run_ngram_jaccard():
    """Symmetric in obs and target."""
    logname = "generate_feature_ngram_jaccard_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    for feature_class in (JaccardCoef_Ngram, DiceDistance_Ngram):
        for ngram in (1, 2, 3):
            wrapper = PairwiseFeatureWrapper(
                feature_class, dfAll, ["question1"], ["question2"],
                [ngram], config.FEAT_DIR, logger)
            wrapper.go()
def run_char_dist_sim():
    """Generate character-distribution similarity features (query vs document)."""
    logname = "generate_feature_char_dist_sim_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    # product_attribute_list is not used here; drop it up front
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)
    obs_fields = ["search_term", "search_term_alt", "search_term_auto_corrected"]
    target_fields = ["product_title", "product_description", "product_attribute"]
    for feature_class in (CharDistribution_Ratio,
                          CharDistribution_CosineSim,
                          CharDistribution_KL):
        wrapper = PairwiseFeatureWrapper(
            feature_class, dfAll, obs_fields, target_fields,
            [], config.FEAT_DIR, logger)
        wrapper.go()
def run_lsa_ngram_pair():
    """Generate LSA word n-gram pair features for question1 vs question2."""
    logname = "generate_feature_lsa_ngram_pair_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    for ngram in (1, 2):
        for feature_class in (LSA_Word_Ngram_Pair,):
            wrapper = PairwiseFeatureWrapper(
                feature_class, dfAll, ["question1"], ["question2"],
                [ngram, config.SVD_DIM, config.SVD_N_ITER],
                config.FEAT_DIR, logger)
            wrapper.go()
def main():
    """Generate doc2vec pairwise similarity features (HomeDepot model, full field set).

    Bug fix: the loader chain was ``if ".bin" ... if ".txt" ... else``, so a
    ``.bin`` model was first loaded correctly and then immediately re-loaded
    (and clobbered) by the ``else`` branch; the second test is now ``elif``
    so exactly one loader runs per model path.
    """
    logname = "generate_feature_doc2vec_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    #### NOTE: use data BEFORE STEMMING
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)
    doc2vec_model_dirs = []
    model_prefixes = []
    ## doc2vec model trained with Homedepot dataset: brand/color/obs/title/description
    doc2vec_model_dirs.append(
        config.DOC2VEC_MODEL_DIR + "/Homedepot-doc2vec-D%d-min_count%d.model" % (
            config.EMBEDDING_DIM, config.EMBEDDING_MIN_COUNT))
    model_prefixes.append("Homedepot")
    for doc2vec_model_dir, model_prefix in zip(doc2vec_model_dirs, model_prefixes):
        ## load model; format is inferred from the file extension
        try:
            if ".bin" in doc2vec_model_dir:
                doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(
                    doc2vec_model_dir, binary=True)
            elif ".txt" in doc2vec_model_dir:
                doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(
                    doc2vec_model_dir, binary=False)
            else:
                doc2vec_model = gensim.models.Doc2Vec.load(doc2vec_model_dir)
            doc2vec_model_sent_label = pkl_utils._load(doc2vec_model_dir + ".sent_label")
        except Exception:
            # best-effort: skip models that are missing or fail to load,
            # but leave a trace instead of swallowing silently
            logger.info("Skip doc2vec model: %s" % doc2vec_model_dir)
            continue
        ## pairwise
        generators = [
            Doc2Vec_CosineSim,
            Doc2Vec_RMSE,
            Doc2Vec_Vdiff,
        ]
        obs_fields_list = [["search_term", "search_term_alt"]]
        target_fields_list = [[
            "product_title", "product_description", "product_attribute",
            "product_brand", "product_color",
        ]]
        for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
            for generator in generators:
                param_list = [doc2vec_model, doc2vec_model_sent_label, model_prefix]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                            target_fields, param_list,
                                            config.FEAT_DIR, logger)
                pf.go()
def run_ngram_jaccard():
    """Generate Jaccard/Dice n-gram similarity features (query vs document)."""
    logname = "generate_feature_ngram_jaccard_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    # original query list sliced [:2]
    obs_fields = ["search_term", "search_term_product_name"]
    target_fields = ["product_title", "product_title_product_name",
                     "product_description", "product_attribute",
                     "product_brand", "product_color"]
    for feature_class in (JaccardCoef_Ngram, DiceDistance_Ngram):
        for ngram in (1, 2, 3):  # 12/123 disabled by the original [:3] slice
            wrapper = PairwiseFeatureWrapper(
                feature_class, dfAll, obs_fields, target_fields,
                [ngram], config.FEAT_DIR, logger)
            wrapper.go()
def run_tfidf_ngram_cosinesim():
    """Generate TF-IDF n-gram cosine-similarity features for question1 vs question2."""
    logname = "generate_feature_tfidf_ngram_cosinesim_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    # the full grids [[1,2,3],[2,3,4,5]] were overridden to one ngram per generator
    for feature_class, ngrams in ((TFIDF_Word_Ngram_CosineSim, [3]),
                                  (TFIDF_Char_Ngram_CosineSim, [4])):
        for ngram in ngrams:
            wrapper = PairwiseFeatureWrapper(
                feature_class, dfAll, ["question1"], ["question2"],
                [ngram], config.FEAT_DIR, logger)
            wrapper.go()
def main(which):
    """Generate stat co-occurrence TF/TF-IDF/BM25 n-gram features.

    ``which`` selects the generator: one of "tf", "norm_tf", "tfidf",
    "norm_tfidf", "bm25". Any other value generates nothing.
    """
    logname = "generate_feature_stat_cooc_tfidf_%s_%s.log" % (which, time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    # dispatch table replaces the original if/elif chain; unknown `which`
    # yields an empty generator list, same as before
    generator_by_kind = {
        "tf": StatCoocTF_Ngram,
        "norm_tf": StatCoocNormTF_Ngram,
        "tfidf": StatCoocTFIDF_Ngram,
        "norm_tfidf": StatCoocNormTFIDF_Ngram,
        "bm25": StatCoocBM25_Ngram,
    }
    generators = [generator_by_kind[which]] if which in generator_by_kind else []
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    ## search_term vs document fields, both directions
    query_fields = ["search_term"]  # original candidate list sliced [:1]
    doc_fields = ["product_title", "product_title_product_name",
                  "product_description", "product_attribute",
                  "product_brand", "product_color"]
    for obs_fields, target_fields in ((query_fields, doc_fields),
                                      (doc_fields, query_fields)):
        for generator in generators:
            for ngram in (1, 2, 3):  # 12/123 disabled by the original [:3] slice
                pf = PairwiseFeatureWrapper(
                    generator, dfAll, obs_fields, target_fields,
                    [ngram, aggregation_mode], config.FEAT_DIR, logger)
                pf.go()
    ## product-name fields: query in document only
    for generator in generators:
        for ngram in (1, 2):
            if ngram == 2:
                # since product_name is of length 2, it makes no difference
                # for various aggregation as there is only one item
                param_list = [ngram, "mean"]
            else:
                param_list = [ngram, aggregation_mode]
            pf = PairwiseFeatureWrapper(
                generator, dfAll, ["search_term_product_name"],
                ["product_title_product_name"], param_list,
                config.FEAT_DIR, logger)
            pf.go()
def run_tsne_lsa_ngram():
    """Generate TSNE-LSA n-gram features: standalone per field, then pairwise."""
    logname = "generate_feature_tsne_lsa_ngram_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    # product_attribute_list is not used here; drop it up front
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)
    ## standalone features
    # the full grids [[1,2,3],[2,3,4,5]] were overridden to one ngram per generator
    standalone = ((TSNE_LSA_Word_Ngram, [3]), (TSNE_LSA_Char_Ngram, [4]))
    # obs_fields = ["search_term", "search_term_alt", "search_term_auto_corrected", "product_title", "product_description"]
    obs_fields = ["search_term", "product_title", "product_description"]
    for feature_class, ngrams in standalone:
        for ngram in ngrams:
            sf = StandaloneFeatureWrapper(
                feature_class, dfAll, obs_fields,
                [ngram, config.SVD_DIM, config.SVD_N_ITER],
                config.FEAT_DIR, logger, force_corr=True)
            sf.go()
    ## pairwise features
    pair_obs = ["search_term"]  # original candidate list sliced [:1]
    pair_targets = ["product_title", "product_description"]
    for ngram in (1, 2):
        for feature_class in (TSNE_LSA_Word_Ngram_Pair,):
            pf = PairwiseFeatureWrapper(
                feature_class, dfAll, pair_obs, pair_targets,
                [ngram, config.SVD_DIM, config.SVD_N_ITER],
                config.FEAT_DIR, logger, force_corr=True)
            pf.go()
def main():
    """Generate group-relevance n-gram Jaccard features, per CV fold and on all data."""
    logname = "generate_feature_group_distance_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()
    ## run python3 splitter.py first
    split = pkl_utils._load("%s/splits_level1.pkl" % config.SPLIT_DIR)
    n_iter = len(split)
    # kept for reference; only the coarse relevance grid below is actually used
    relevances_complete = [1, 1.25, 1.33, 1.5, 1.67, 1.75, 2, 2.25, 2.33, 2.5, 2.67, 2.75, 3]
    relevances = [1, 1.33, 1.67, 2, 2.33, 2.67, 3]
    ngrams = [1]
    obs_fields = ["search_term"]
    target_fields = ["product_title", "product_description"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]

    def _generate(df_train_part, sub_feature_dir):
        # one feature set per (target_field, relevance, ngram) combination
        for target_field in target_fields:
            for relevance in relevances:
                for ngram in ngrams:
                    param_list = [dfAll["id"], df_train_part, target_field,
                                  relevance, ngram, aggregation_mode]
                    pf = PairwiseFeatureWrapper(
                        GroupRelevance_Ngram_Jaccard, dfAll, obs_fields,
                        [target_field], param_list, sub_feature_dir, logger)
                    pf.go()

    ## for cv: each fold uses only its training rows
    for i in range(n_iter):
        trainInd, validInd = split[i][0], split[i][1]
        _generate(dfTrain.iloc[trainInd].copy(),
                  "%s/Run%d" % (config.FEAT_DIR, i + 1))
    ## for all
    _generate(dfTrain, "%s/All" % (config.FEAT_DIR))
def run_position():
    """Generate first/last intersect position n-gram features, both directions."""
    logname = "generate_feature_first_last_ngram_position_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    feature_classes = [
        FirstIntersectPosition_Ngram,
        LastIntersectPosition_Ngram,
        FirstIntersectNormPosition_Ngram,
        LastIntersectNormPosition_Ngram,
    ]
    # original query list sliced [:2]
    query_fields = ["search_term", "search_term_product_name"]
    doc_fields = ["product_title", "product_title_product_name",
                  "product_description", "product_attribute",
                  "product_brand", "product_color"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    # query-in-document, then document-in-query
    for obs_fields, target_fields in ((query_fields, doc_fields),
                                      (doc_fields, query_fields)):
        for feature_class in feature_classes:
            for ngram in (1, 2, 3):  # 12/123 disabled by the original [:3] slice
                pf = PairwiseFeatureWrapper(
                    feature_class, dfAll, obs_fields, target_fields,
                    [ngram, aggregation_mode], config.FEAT_DIR, logger)
                pf.go()
def run_tfidf_ngram_cosinesim():
    """Generate TFIDF n-gram cosine similarity features.

    Each generator is paired with its own n-gram list: word n-grams for
    TFIDF_Word_Ngram_CosineSim and char n-grams for TFIDF_Char_Ngram_CosineSim.
    """
    logname = "generate_feature_tfidf_ngram_cosinesim_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)

    generators = [TFIDF_Word_Ngram_CosineSim, TFIDF_Char_Ngram_CosineSim]
    # NOTE: the full grid would be [[1, 2, 3], [2, 3, 4, 5]]; a dead first
    # assignment carrying that grid was removed — only this value was effective.
    ngrams_list = [[1, 2, 3], [4]]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(["search_term", "search_term_alt", "search_term_auto_corrected"])
    target_fields_list.append(["product_title", "product_description", "product_attribute"])
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator, ngrams in zip(generators, ngrams_list):
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields,
                                            param_list, config.FEAT_DIR, logger)
                pf.go()
def run_edit_distance():
    """Generate edit distance features: whole-string and double-aggregated n-gram variants."""
    logname = "generate_feature_edit_distance_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(["search_term", "search_term_product_name",
                            "search_term_alt", "search_term_auto_corrected"][1:2])
    target_fields_list.append(["product_title", "product_title_product_name", "product_description",
                               "product_attribute", "product_brand", "product_color"])
    ngrams = [1, 2, 3, 12, 123][:3]
    aggregation_mode_prev = ["mean", "max", "min", "median"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        ## whole-string edit distance
        param_list = []
        pf = PairwiseFeatureWrapper(EditDistance, dfAll, obs_fields, target_fields,
                                    param_list, config.FEAT_DIR, logger)
        # BUG FIX: the EditDistance wrapper was constructed but .go() was never
        # called, so that feature was silently skipped.
        pf.go()
        ## n-gram edit distance with double aggregation
        for ngram in ngrams:
            param_list = [ngram, aggregation_mode_prev, aggregation_mode]
            pf = PairwiseFeatureWrapper(EditDistance_Ngram, dfAll, obs_fields, target_fields,
                                        param_list, config.FEAT_DIR, logger)
            pf.go()
def run_lsa_ngram_cooc():
    """Generate LSA word n-gram co-occurrence features over an (obs, target) n-gram grid."""
    logname = "generate_feature_lsa_ngram_cooc_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)

    generators = [LSA_Word_Ngram_Cooc]
    obs_ngrams = [1, 2]
    target_ngrams = [1, 2]
    field_pairs = [
        (["search_term", "search_term_alt", "search_term_auto_corrected"],
         ["product_title", "product_description"]),
    ]
    for obs_fields, target_fields in field_pairs:
        # Full grid over (obs n-gram, target n-gram) combinations.
        for obs_ngram in obs_ngrams:
            for target_ngram in target_ngrams:
                for generator in generators:
                    param_list = [obs_ngram, target_ngram, config.SVD_DIM, config.SVD_N_ITER]
                    pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields,
                                                param_list, config.FEAT_DIR, logger)
                    pf.go()
def main():
    """Generate intersect / cooccurrence count and ratio n-gram features (both directions)."""
    logname = "generate_feature_intersect_count_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [
        IntersectCount_Ngram,
        IntersectRatio_Ngram,
        CooccurrenceCount_Ngram,
        CooccurrenceRatio_Ngram,
    ]
    query_fields = ["search_term", "search_term_product_name",
                    "search_term_alt", "search_term_auto_corrected"][:2]
    doc_fields = ["product_title", "product_title_product_name", "product_description",
                  "product_attribute", "product_brand", "product_color"]
    ngrams = [1, 2, 3, 12, 123][:3]
    # Both directions: query-in-document and document-in-query.
    for obs_fields, target_fields in [(query_fields, doc_fields), (doc_fields, query_fields)]:
        for generator in generators:
            for ngram in ngrams:
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields,
                                            [ngram], config.FEAT_DIR, logger)
                pf.go()
def main():
    """Generate basic standalone and pairwise features for the question-pair data."""
    logname = "generate_feature_basic_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    def _standalone(generator, obs_fields, param_list):
        # Run one standalone generator over the given fields.
        sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list,
                                      config.FEAT_DIR, logger)
        sf.go()

    def _pairwise(generator, obs_fields, target_fields, param_list):
        # Run one pairwise generator over the given field pair.
        pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields,
                                    param_list, config.FEAT_DIR, logger)
        pf.go()

    ## basic (DocIdOneHot not used)
    for generator in [DocId, DocLen, DocFreq, DocEntropy, DigitCount, DigitRatio]:
        _standalone(generator, ["question1", "question2"], [])
    ## id
    for generator in [DocIdEcho]:
        _standalone(generator, ["id"], [])
    ## qid
    for generator in [MaxValue, DiffValue]:
        _pairwise(generator, ["qid1"], ["qid2"], [])
    ## DocLenRatio
    for generator in [DocLenRatio]:
        _pairwise(generator, ["question1"], ["question2"], [])
    ## unique count
    for generator in [UniqueCount_Ngram, UniqueRatio_Ngram]:
        for ngram in [1, 2, 3, 4, 5, 12, 123]:
            _standalone(generator, ["question1", "question2"], [ngram])
def run_char_dist_sim():
    """Generate character distribution ratio / cosine / KL similarity features."""
    logname = "generate_feature_char_dist_sim_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)

    generators = [
        CharDistribution_Ratio,
        CharDistribution_CosineSim,
        CharDistribution_KL,
    ]
    obs_fields = ["search_term", "search_term_alt", "search_term_auto_corrected"][:1]
    target_fields = ["product_title", "product_description", "product_attribute"]
    for generator in generators:
        pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields,
                                    [], config.FEAT_DIR, logger)
        pf.go()
def main():
    """Generate ES TFIDF unigram cosine-similarity features over term-vector fields."""
    logname = "generate_feature_tfidf.log"
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    obs_fields = ['query', 'norm_query']
    target_fields = [field + '_termvec' for field in config.ES_TERM_FIELDS]
    dfAll = table_utils._read(config.ALL_DATA)
    # Opened for their side effects / availability; not referenced below.
    docs = table_utils._open_shelve_read(config.ES_PAGE_TERM_VEC_SHELVE)
    queries = table_utils._open_shelve_read(config.ES_QUERY_TERM_VEC_SHELVE)
    dedup = True
    for generator in [ES_TFIDF_Unigram_TopN_CosineSim]:
        for target_field in target_fields:
            obs_fields_tv = [field + '_' + target_field for field in obs_fields]
            # TODO: why iterate obs_fields instead of passing all at once?
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields_tv, [target_field],
                                        [], config.FEAT_DIR, logger, dedup)
            pf.go()
def main():
    """Generate WordNet path/Lch/Wup similarity features with double aggregation."""
    logname = "generate_feature_wordnet_similarity_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    #### NOTE: use data BEFORE STEMMING
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)

    generators = [
        WordNet_Path_Similarity,
        WordNet_Lch_Similarity,
        WordNet_Wup_Similarity,
    ]
    obs_fields = ["search_term", "search_term_alt", "search_term_auto_corrected"]
    target_fields = ["product_title", "product_description", "product_attribute"]
    # double aggregation
    aggregation_mode_prev = ["mean", "max", "min", "median"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for generator in generators:
        param_list = [aggregation_mode_prev, aggregation_mode]
        pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields,
                                    param_list, config.FEAT_DIR, logger)
        pf.go()
def main():
    """Generate match features: query vs. document fields, then attribute-list matches."""
    logname = "generate_feature_match_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    ## query vs. document fields
    match_generators = [
        MatchQueryCount,
        MatchQueryRatio,
        LongestMatchSize,
        LongestMatchRatio,
    ]
    obs_fields = ["search_term", "search_term_product_name",
                  "search_term_alt", "search_term_auto_corrected"][:2]
    target_fields = ["product_title", "product_title_product_name", "product_description",
                     "product_attribute", "product_brand", "product_color"]
    for generator in match_generators:
        pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields,
                                    [], config.FEAT_DIR, logger)
        pf.go()

    # product_attribute_list
    attr_generators = [
        MatchAttrCount,
        MatchAttrRatio,
        IsIndoorOutdoorMatch,
    ]
    obs_fields = ["search_term", "search_term_alt", "search_term_auto_corrected"][:1]
    target_fields = ["product_attribute_list"]
    for generator in attr_generators:
        pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields,
                                    [], config.FEAT_DIR, logger)
        pf.go()
def main(which):
    """Generate statistical cooccurrence n-gram features for the question pair.

    which: one of "tf", "norm_tf", "tfidf", "norm_tfidf", "bm25" — selects
    the generator; anything else produces no features.
    """
    logname = "generate_feature_stat_cooc_tfidf_%s_%s.log" % (which, time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    # Dispatch table instead of the if/elif chain.
    generator_map = {
        "tf": StatCoocTF_Ngram,
        "norm_tf": StatCoocNormTF_Ngram,
        "tfidf": StatCoocTFIDF_Ngram,
        "norm_tfidf": StatCoocNormTFIDF_Ngram,
        "bm25": StatCoocBM25_Ngram,
    }
    generators = [generator_map[which]] if which in generator_map else []

    ## both directions between the two questions
    field_pairs = [(["question1"], ["question2"]), (["question2"], ["question1"])]
    ngrams = [1, 2, 3, 12, 123][:3]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in field_pairs:
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields,
                                            param_list, config.FEAT_DIR, logger)
                pf.go()
def run_tfidf_ngram_cosinesim():
    """Generate TFIDF n-gram cosine similarity features.

    Each generator is paired with its own n-gram list: word n-grams for
    TFIDF_Word_Ngram_CosineSim and char n-grams for TFIDF_Char_Ngram_CosineSim.
    """
    logname = "generate_feature_tfidf_ngram_cosinesim_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)

    generators = [TFIDF_Word_Ngram_CosineSim, TFIDF_Char_Ngram_CosineSim]
    # NOTE: the full grid would be [[1, 2, 3], [2, 3, 4, 5]]; a dead first
    # assignment carrying that grid was removed — only this value was effective.
    ngrams_list = [[3], [4]]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(["search_term", "search_term_alt", "search_term_auto_corrected"][:1])
    target_fields_list.append(["product_title", "product_description", "product_attribute"])
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator, ngrams in zip(generators, ngrams_list):
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields,
                                            param_list, config.FEAT_DIR, logger)
                pf.go()
def run_compression_distance():
    """Generate compression distance features: whole-string and double-aggregated n-gram variants."""
    logname = "generate_feature_compression_distance_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(["question1"])
    target_fields_list.append(["question2"])
    # BUG FIX: ngrams / aggregation_mode_prev / aggregation_mode were referenced
    # in the loop below but never defined, raising NameError at runtime.
    # Use the same grid as the other distance-feature scripts (e.g. edit distance).
    ngrams = [1, 2, 3, 12, 123][:3]
    aggregation_mode_prev = ["mean", "max", "min", "median"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        ## whole-string compression distance
        param_list = []
        pf = PairwiseFeatureWrapper(CompressionDistance, dfAll, obs_fields, target_fields,
                                    param_list, config.FEAT_DIR, logger)
        pf.go()
        ## n-gram compression distance with double aggregation
        for ngram in ngrams:
            param_list = [ngram, aggregation_mode_prev, aggregation_mode]
            pf = PairwiseFeatureWrapper(CompressionDistance_Ngram, dfAll, obs_fields, target_fields,
                                        param_list, config.FEAT_DIR, logger)
            pf.go()
def main(which):
    """Generate statistical cooccurrence n-gram features.

    which: comma-separated subset of
    "tf", "norm_tf", "tfidf", "norm_tfidf", "bm25".
    """
    logname = "generate_feature_stat_cooc_tfidf_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    # Dispatch table instead of the if/elif chain; unknown names are ignored.
    generator_map = {
        "tf": StatCoocTF_Ngram,
        "norm_tf": StatCoocNormTF_Ngram,
        "tfidf": StatCoocTFIDF_Ngram,
        "norm_tfidf": StatCoocNormTFIDF_Ngram,
        "bm25": StatCoocBM25_Ngram,
    }
    generators = [generator_map[w] for w in which.split(",") if w in generator_map]

    query_fields = ["search_term", "search_term_alt", "search_term_auto_corrected"]
    doc_fields = ["product_title", "product_title_product_name", "product_description",
                  "product_attribute", "product_brand", "product_color"]
    ngrams = [1, 2, 3, 12, 123][:3]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    ## both directions: query in document, document in query
    for obs_fields, target_fields in [(query_fields, doc_fields), (doc_fields, query_fields)]:
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields,
                                            param_list, config.FEAT_DIR, logger)
                pf.go()

    ## product-name fields: query in document
    for obs_fields, target_fields in [(["search_term_product_name"], ["product_title_product_name"])]:
        for generator in generators:
            for ngram in [1, 2]:
                if ngram == 2:
                    # since product_name is of length 2, it makes no difference
                    # for various aggregation as there is only one item
                    param_list = [ngram, "mean"]
                else:
                    param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields,
                                            param_list, config.FEAT_DIR, logger)
                pf.go()
def main():
    """Generate group-relevance n-gram Jaccard features per CV fold, then on the full training set."""
    logname = "generate_feature_group_distance_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()

    ## run python3 splitter.py first
    split = pkl_utils._load("%s/splits_level1.pkl" % config.SPLIT_DIR)
    n_iter = len(split)

    # full relevance grid kept for reference; only the reduced grid is used
    relevances_complete = [1, 1.25, 1.33, 1.5, 1.67, 1.75, 2, 2.25, 2.33, 2.5, 2.67, 2.75, 3]
    relevances = [1, 1.33, 1.67, 2, 2.33, 2.67, 3]
    ngrams = [1]
    obs_fields = ["search_term"]
    target_fields = ["product_title", "product_description"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]

    # (training subset, output dir) per CV fold, plus the full training set last.
    runs = [(dfTrain.iloc[split[i][0]].copy(), "%s/Run%d" % (config.FEAT_DIR, i + 1))
            for i in range(n_iter)]
    runs.append((dfTrain, "%s/All" % config.FEAT_DIR))

    for dfGroup, sub_feature_dir in runs:
        for target_field in target_fields:
            for relevance in relevances:
                for ngram in ngrams:
                    param_list = [dfAll["id"], dfGroup, target_field, relevance, ngram, aggregation_mode]
                    pf = PairwiseFeatureWrapper(GroupRelevance_Ngram_Jaccard, dfAll, obs_fields,
                                                [target_field], param_list, sub_feature_dir, logger)
                    pf.go()
def main():
    """Generate match features between query and document fields, plus attribute-list matches."""
    logname = "generate_feature_match_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    def _run_all(generators, obs_fields, target_fields):
        # Run every generator on the given field pair with no extra params.
        for generator in generators:
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields,
                                        [], config.FEAT_DIR, logger)
            pf.go()

    ## query vs. document fields
    _run_all(
        [MatchQueryCount, MatchQueryRatio, LongestMatchSize, LongestMatchRatio],
        ["search_term", "search_term_product_name",
         "search_term_alt", "search_term_auto_corrected"],
        ["product_title", "product_title_product_name", "product_description",
         "product_attribute", "product_brand", "product_color"],
    )
    # product_attribute_list
    _run_all(
        [MatchAttrCount, MatchAttrRatio, IsIndoorOutdoorMatch],
        ["search_term", "search_term_alt", "search_term_auto_corrected"],
        ["product_attribute_list"],
    )
def main(which):
    """Generate word2vec-based pairwise features from a chosen embedding model.

    which: "homedepot" (self-trained), "wikipedia" (GloVe converted), or
    "google" (GoogleNews vectors) — selects the model file to load.
    """
    logname = "generate_feature_word2vec_%s_%s.log" % (which, time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    #### NOTE: use data BEFORE STEMMING
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)

    word2vec_model_dirs = []
    model_prefixes = []
    if which == "homedepot":
        ## word2vec model trained with Homedepot dataset: brand/color/query/title/description
        word2vec_model_dirs.append(config.WORD2VEC_MODEL_DIR + "/Homedepot-word2vec-D%d-min_count%d.model" % (config.EMBEDDING_DIM, config.EMBEDDING_MIN_COUNT))
        model_prefixes.append("Homedepot")
    elif which == "wikipedia":
        ## word2vec model pretrained with Wikipedia+Gigaword 5
        word2vec_model_dirs.append(config.GLOVE_WORD2VEC_MODEL_DIR + "/glove.6B.300d.txt")
        model_prefixes.append("Wikipedia")
    elif which == "google":
        ## word2vec model pretrained with Google News
        word2vec_model_dirs.append(config.WORD2VEC_MODEL_DIR + "/GoogleNews-vectors-negative300.bin")
        model_prefixes.append("GoogleNews")

    for word2vec_model_dir, model_prefix in zip(word2vec_model_dirs, model_prefixes):
        ## load model; file extension decides the loader
        try:
            if ".bin" in word2vec_model_dir:
                word2vec_model = gensim.models.Word2Vec.load_word2vec_format(word2vec_model_dir, binary=True)
            elif ".txt" in word2vec_model_dir:
                word2vec_model = gensim.models.Word2Vec.load_word2vec_format(word2vec_model_dir, binary=False)
            else:
                word2vec_model = gensim.models.Word2Vec.load(word2vec_model_dir)
        except Exception as e:
            # BUG FIX: was a bare `except:` that silently swallowed everything
            # (including KeyboardInterrupt/SystemExit) with no trace; narrow the
            # clause and log why the model was skipped.
            logger.info("Skipping word2vec model %s: %s" % (word2vec_model_dir, e))
            continue

        # ## standalone (not used in model building)
        # obs_fields = ["search_term", "product_title", "product_description"]
        # generator = Word2Vec_Centroid_Vector
        # param_list = [word2vec_model, model_prefix]
        # sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        # sf.go()

        ## pairwise
        generators = [
            Word2Vec_Importance,
            Word2Vec_N_Similarity,
            Word2Vec_N_Similarity_Imp,
            Word2Vec_Centroid_RMSE,
            Word2Vec_Centroid_RMSE_IMP,
            # # not used in final submission
            # Word2Vec_Centroid_Vdiff,
        ]
        obs_fields_list = []
        target_fields_list = []
        obs_fields_list.append(["search_term", "search_term_alt", "search_term_auto_corrected"][:1])
        target_fields_list.append(["product_title", "product_description", "product_attribute", "product_brand", "product_color"])
        for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
            for generator in generators:
                param_list = [word2vec_model, model_prefix]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()

        ## cosine sim
        generators = [
            Word2Vec_CosineSim,
        ]
        # double aggregation
        aggregation_mode_prev = ["mean", "max", "min", "median"]
        aggregation_mode = ["mean", "std", "max", "min", "median"]
        for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
            for generator in generators:
                param_list = [word2vec_model, model_prefix, aggregation_mode, aggregation_mode_prev]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()