def main(options):
    if options.eval:
        time_str = datetime.datetime.now().isoformat()
        logname = "Eval_[Model@%s]_%s.log" % (options.model_name, time_str)
        logger = logging_utils._get_logger(config.LOG_DIR, logname)
    else:
        time_str = datetime.datetime.now().isoformat()
        logname = "Final_[Model@%s]_%s.log" % (options.model_name, time_str)
        logger = logging_utils._get_logger(config.LOG_DIR, logname)
    params_dict = param_space_dict[options.model_name]
    task = Task(options.model_name, options.runs, params_dict, logger)
    if options.eval:
        task.refit(options.prefix)
    else:
        task.evaluate(options.prefix)
def run_tfidf_ngram_cosinesim():
    """Symmetric in obs and target"""
    logname = "generate_feature_tfidf_ngram_cosinesim_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    generators = [TFIDF_Word_Ngram_CosineSim, TFIDF_Char_Ngram_CosineSim]
    ngrams_list = [[2, 3], [4]]
    obs_fields_list = [['question1']]
    target_fields_list = [['question2']]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator, ngrams in zip(generators, ngrams_list):
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger, force_corr=True)
                pf.go()
                del pf
                gc.collect()
def run_lsa_ngram_cooc():
    logname = "generate_feature_lsa_ngram_cooc_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)
    generators = [LSA_Word_Ngram_Cooc]
    obs_ngrams = [1, 2]
    target_ngrams = [1, 2]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(["search_term", "search_term_alt", "search_term_auto_corrected"][:1])
    target_fields_list.append(["product_title", "product_description"][:1])
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for obs_ngram in obs_ngrams:
            for target_ngram in target_ngrams:
                for generator in generators:
                    param_list = [obs_ngram, target_ngram, config.SVD_DIM, config.SVD_N_ITER]
                    pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                    pf.go()
def main(options):
    time_str = datetime.datetime.now().isoformat()
    time_str = re.sub(':', '-', time_str)
    logname = "[Model@%s]_[Data@%s]_%s.log" % (options.model_name, options.data_name, time_str)
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    optimizer = TaskOptimizer(options.model_name, options.data_name, options.cv_runs, options.max_evals, logger)
    optimizer.run()
def experiments(args):
    runs = 5
    time_str = datetime.now().date().isoformat()
    logname = "[Data@%s]_[Encoder@%s]" % (args.data_name, args.sent_encoder)
    if args.bidirectional:
        logname += "_[Bi]"
    logname += "_%s.log" % time_str
    logger = _get_logger(config.LOG_PATH, logname)
    dis_accs = []
    ins_accs = []
    for i in range(runs):
        dis_acc, ins_acc = run_bigram_coherence(args)
        dis_accs.append(dis_acc[0])
        ins_accs.append(ins_acc[0])
        for _ in range(10):
            gc.collect()
    logger.info("=" * 50)
    for i in range(runs):
        logger.info("Run %d" % (i + 1))
        logger.info("Dis Acc: %.6f" % dis_accs[i])
        logger.info("Ins Acc: %.6f" % ins_accs[i])
    logger.info("=" * 50)
    logger.info("Average Dis Acc: %.6f (%.6f)" % (np.mean(dis_accs), np.std(dis_accs)))
    logger.info("Average Ins Acc: %.6f (%.6f)" % (np.mean(ins_accs), np.std(ins_accs)))
def main(): FNAME = "model_predict_lgbm" logname = "%s_%s.log" % (FNAME, now) logger = logging_utils._get_logger(config.LOG_DIR, logname) # Load raw data # test_raw = dl.load_test_data() # gc.collect() # Load generated features test_features = load_combined_features(logger) #test_features = pd.concat([test_features, test_raw[config.NUMBER_FEATURES]], axis=1) logger.info('Final test data shape: %s' % str(test_features.shape)) lightgbm_model = load_model(logger) t0 = time() pred = lightgbm_model.predict(test_features) submission = pd.read_csv(config.SAMPLE_SUBMISSION_DATA, nrows=config.RAW_DATA_ROWS) submission['deal_probability'] = pred submission['deal_probability'].clip(0.0, 1.0, inplace=True) submission_file = os.path.join(config.DATA_SUBMISSION_DIR, "submission_lightgbm.csv") submission.to_csv(submission_file, index=False) # Compress (zip) submission file. submission_zip_file = os.path.join(config.DATA_SUBMISSION_DIR, "submission_lightgbm.csv.zip") submission_zip = zipfile.ZipFile(submission_zip_file, 'w') submission_zip.write(submission_file, arcname="submission_lightgbm.csv", compress_type=zipfile.ZIP_DEFLATED) submission_zip.close() logger.info('LightGBM submission file generation took: %s minutes' % round((time() - t0) / 60, 1))
def main(): logname = "generate_feature_match_%s.log" % time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) generators = [ MatchQueryCount, MatchQueryRatio, LongestMatchSize, LongestMatchRatio, ] obs_fields_list = [] target_fields_list = [] ## question1 in question2 obs_fields_list.append(['question1']) target_fields_list.append(['question2']) ## question2 in question1 obs_fields_list.append(['question2']) target_fields_list.append(['question1']) for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: param_list = [] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go()
def main(): logname = "generate_feature_wordnet_similarity_%s.log" % time_utils._timestamp( ) logger = logging_utils._get_logger(config.LOG_DIR, logname) #### NOTE: use data BEFORE STEMMING dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED) # WordNet_Lch_Similarity and WordNet_Wup_Similarity are not used in final submission generators = [ WordNet_Path_Similarity, WordNet_Lch_Similarity, WordNet_Wup_Similarity, ][:1] obs_fields_list = [] target_fields_list = [] # only search_term and product_title are used in final submission obs_fields_list.append(["question1"]) target_fields_list.append(["question2"]) # double aggregation aggregation_mode_prev = ["mean", "max", "min", "median"] aggregation_mode = ["mean", "std", "max", "min", "median"] for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: param_list = [aggregation_mode_prev, aggregation_mode] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go()
def main(): FNAME = "feature_date" logname = "%s_%s.log" % (FNAME, now) logger = logging_utils._get_logger(config.LOG_DIR, logname) train, test = dl.load_data() logger.info("Generating activation date features ...") t0 = time() # Generating activation date features # Train data train['month'] = train['activation_date'].dt.month train['weekday'] = train['activation_date'].dt.weekday train['month_day'] = train['activation_date'].dt.day train['year_day'] = train['activation_date'].dt.dayofyear # Test data test['month'] = test['activation_date'].dt.month test['weekday'] = test['activation_date'].dt.weekday test['month_day'] = test['activation_date'].dt.day test['year_day'] = test['activation_date'].dt.dayofyear gc.collect() logger.info(FNAME + ' took: %s minutes' % round((time() - t0) / 60, 1)) # save data train_fname = os.path.join(config.DATA_FEATURES_DIR, "train_" + FNAME + config.FEAT_FILE_SUFFIX) test_fname = os.path.join(config.DATA_FEATURES_DIR, "test_" + FNAME + config.FEAT_FILE_SUFFIX) logger.info("Save to %s" % train_fname) pkl_utils._save(train_fname, train[config.GENERATED_DATE_FEATURES]) logger.info("Save to %s" % test_fname) pkl_utils._save(test_fname, test[config.GENERATED_DATE_FEATURES]) gc.collect()
def main(options):
    logname = "[Feat@%s]_[Learner@%s]_hyperopt_%s.log" % (
        options.feature_name, options.learner_name, time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    optimizer = TaskOptimizer(options.task_mode, options.learner_name, options.feature_name,
                              logger, options.max_evals, verbose=True,
                              refit_once=options.refit_once)
    optimizer.run()
def main(): logname = "generate_feature_ident.log" logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = table_utils._read(config.ALL_DATA) # Copies of data from ES docs. Note that multi-valued fields are first # converted into their length obs_fields = ["incoming_links", "popularity_score", "text_bytes", "category", "template", "heading", "outgoing_link", "external_link", "redirect.title", "auxiliary_text"] transforms = [None, np.log, np.log10, np.sqrt] dedup = True for transform in transforms: param_list = [transform] sf = StandaloneFeatureWrapper(Ident, dfAll, obs_fields, param_list, config.FEAT_DIR, logger, dedup) sf.go() # Sub-fields from termvec data obs_fields = [x + '_termvec' for x in config.ES_TERM_FIELDS] obs_fields += ['query_' + x + '_termvec' for x in config.ES_TERM_FIELDS] obs_fields += ['norm_query_' + x + '_termvec' for x in config.ES_TERM_FIELDS] es_fields = ['score', 'term_freq', 'ttf', 'doc_freq'] aggregation_mode = ["mean", "std", "max", "min", "median"] for es_field in es_fields: for transform in transforms: param_list = [es_field, transform, aggregation_mode] sf = StandaloneFeatureWrapper(SubFieldIdent, dfAll, obs_fields, param_list, config.FEAT_DIR, logger, dedup) sf.go()
def run_lsa_ngram_pair():
    """Symmetric in obs and target"""
    logname = "generate_feature_lsa_ngram_pair_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    generators = [LSA_Word_Ngram_Pair]
    ngrams = [1, 2, 3]
    obs_fields_list = []
    target_fields_list = []
    ## question1 in question2
    obs_fields_list.append(['question1'])
    target_fields_list.append(['question2'])
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for ngram in ngrams:
            for generator in generators:
                param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger, force_corr=True)
                pf.go()
                del pf
                gc.collect()
def main(): logname = "generate_feature_intersect_position_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) generators = [ IntersectPosition_Ngram, IntersectNormPosition_Ngram, ] obs_fields_list = [] target_fields_list = [] ## query in document obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] ) target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"][1:2] ) ## document in query obs_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"][1:2] ) target_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] ) ngrams = [1,2,3,12,123][:3] aggregation_mode = ["mean", "std", "max", "min", "median"] for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: for ngram in ngrams: param_list = [ngram, aggregation_mode] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go()
def run_count():
    logname = "generate_feature_first_last_ngram_count_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    generators = [
        FirstIntersectCount_Ngram,
        LastIntersectCount_Ngram,
        FirstIntersectRatio_Ngram,
        LastIntersectRatio_Ngram,
    ]
    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append(["question1"])
    target_fields_list.append(["question2"])
    ## document in query
    obs_fields_list.append(["question2"])
    target_fields_list.append(["question1"])
    ngrams = [1, 2, 3, 12, 123][:3]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
def run_count():
    logname = "generate_feature_first_last_ngram_count_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    generators = [
        FirstIntersectCount_Ngram,
        LastIntersectCount_Ngram,
        FirstIntersectRatio_Ngram,
        LastIntersectRatio_Ngram,
    ]
    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append(["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"])
    target_fields_list.append(["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"])
    ## document in query
    obs_fields_list.append(["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"])
    target_fields_list.append(["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"])
    ngrams = [1, 2, 3, 12, 123][:3]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
def main(options):
    time_str = datetime.datetime.now().isoformat()
    if len(options.save_name) == 0:
        logname = "Eval_[Model@%s]_[Data@%s]_%s.log" % (
            options.model_name, options.data_name, time_str)
    else:
        logname = "Eval_[Model@%s]_[Data@%s]_%s.log" % (
            options.save_name, options.data_name, time_str)
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    # else:
    #     time_str = datetime.datetime.now().isoformat()
    #     logname = "Final_[Model@%s]_[Data@%s]_%s.log" % (options.model_name,
    #                                                      options.data_name, time_str)
    #     logger = logging_utils._get_logger(config.LOG_DIR, logname)
    # params_dict = param_space_dict[options.model_name]
    params_dict['alpha'] = options.alpha
    task = Task(model_name=options.model_name, data_name=options.data_name,
                cv_runs=options.runs, params_dict=params_dict, logger=logger,
                portion=options.portion, save_name=options.save_name)
    print('-' * 50 + 'refit' + '-' * 50)
    task.refit()
def run_position():
    logname = "generate_feature_first_last_ngram_position_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    generators = [
        FirstIntersectPosition_Ngram,
        LastIntersectPosition_Ngram,
        FirstIntersectNormPosition_Ngram,
        LastIntersectNormPosition_Ngram,
    ]
    obs_fields_list = [["question1"], ["question2"]]
    target_fields_list = [["question2"], ["question1"]]
    ngrams = [1, 2, 3, 12, 123]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
def run_compression_distance():
    logname = "generate_feature_compression_distance_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][:2])
    target_fields_list.append(["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"])
    # assumed ngram settings and double aggregation (mirrors run_edit_distance);
    # adjust if the project defines these elsewhere
    ngrams = [1, 2, 3, 12, 123][:3]
    aggregation_mode_prev = ["mean", "max", "min", "median"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        param_list = []
        pf = PairwiseFeatureWrapper(CompressionDistance, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
        pf.go()
        for ngram in ngrams:
            param_list = [ngram, aggregation_mode_prev, aggregation_mode]
            pf = PairwiseFeatureWrapper(CompressionDistance_Ngram, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
            pf.go()
def run_ngram_jaccard():
    logname = "generate_feature_ngram_jaccard_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    generators = [JaccardCoef_Ngram, DiceDistance_Ngram]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][:2])
    target_fields_list.append(["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"])
    ngrams = [1, 2, 3, 12, 123][:3]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
def main(): logname = "generate_feature_doc2vec_%s.log" % time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) #### NOTE: use data BEFORE STEMMING dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED) doc2vec_model_dirs = [] model_prefixes = [] ## doc2vec model trained with Homedepot dataset: brand/color/obs/title/description doc2vec_model_dirs.append( config.DOC2VEC_MODEL_DIR + "/Homedepot-doc2vec-D%d-min_count%d.model" % (config.EMBEDDING_DIM, config.EMBEDDING_MIN_COUNT)) model_prefixes.append("Homedepot") for doc2vec_model_dir, model_prefix in zip(doc2vec_model_dirs, model_prefixes): ## load model try: if ".bin" in doc2vec_model_dir: doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format( doc2vec_model_dir, binary=True) if ".txt" in doc2vec_model_dir: doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format( doc2vec_model_dir, binary=False) else: doc2vec_model = gensim.models.Doc2Vec.load(doc2vec_model_dir) doc2vec_model_sent_label = pkl_utils._load(doc2vec_model_dir + ".sent_label") except: continue # ## standalone (not used in model building) # obs_fields = ["search_term", "search_term_alt", "product_title", "product_description", "product_attribute"] # generator = Doc2Vec_Vector # param_list = [doc2vec_model, doc2vec_model_sent_label, model_prefix] # sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) # sf.go() ## pairwise generators = [ Doc2Vec_CosineSim, Doc2Vec_RMSE, # Doc2Vec_Vdiff, ] obs_fields_list = [] target_fields_list = [] obs_fields_list.append(["search_term", "search_term_alt"][:1]) target_fields_list.append([ "product_title", "product_description", "product_attribute", "product_brand", "product_color" ]) for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: param_list = [ doc2vec_model, doc2vec_model_sent_label, model_prefix ] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go()
def main(): FNAME = "feature_aggregates" logname = "%s_%s.log" % (FNAME, now) logger = logging_utils._get_logger(config.LOG_DIR, logname) train, test = dl.load_data() train['weekday'] = train['activation_date'].dt.weekday train['month_day'] = train['activation_date'].dt.day test['weekday'] = test['activation_date'].dt.weekday test['month_day'] = test['activation_date'].dt.day logger.info("Train shape: %s & Test shape: %s" % (train.shape, test.shape)) logger.info("Generating aggregate features ...") t0 = time() # Generating aggregate features agg_deal_probability_features(train, test, config.AGGREGATE_COLUMNS) agg_price_features(train, test, config.AGGREGATE_COLUMNS) logger.info(FNAME + ' took: %s minutes' % round((time() - t0) / 60, 1)) logger.info("Train shape: %s & Test shape: %s" % (train.shape, test.shape)) gc.collect() # save data train_fname = os.path.join(config.DATA_FEATURES_DIR, "train_" + FNAME + config.FEAT_FILE_SUFFIX) test_fname = os.path.join(config.DATA_FEATURES_DIR, "test_" + FNAME + config.FEAT_FILE_SUFFIX) logger.info("Save to %s" % train_fname) pkl_utils._save(train_fname, train[config.AGGREGATE_DEAL_FEATURES + config.AGGREGATE_PRICE_FEATURES]) logger.info("Save to %s" % test_fname) pkl_utils._save(test_fname, test[config.AGGREGATE_DEAL_FEATURES + config.AGGREGATE_PRICE_FEATURES]) gc.collect()
def main(): logname = "generate_feature_intersect_position_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) generators = [ IntersectPosition_Ngram, IntersectNormPosition_Ngram, ] obs_fields_list = [] target_fields_list = [] ## query in document obs_fields_list.append( ["question1"] ) target_fields_list.append( ["question2"] ) ## document in query obs_fields_list.append( ["question2"] ) target_fields_list.append( ["question1"] ) ngrams = [1,2,3,12,123][:3] aggregation_mode = ["mean", "std", "max", "min", "median"] for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: for ngram in ngrams: param_list = [ngram, aggregation_mode] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go()
def main(): logname = "generate_feature_group_relevance_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) dfTrain = dfAll.iloc[:TRAIN_SIZE].copy() ## run python3 splitter.py first split = pkl_utils._load("%s/splits_level1.pkl"%config.SPLIT_DIR) n_iter = len(split) ## for cv for i in range(n_iter): trainInd, validInd = split[i][0], split[i][1] dfTrain2 = dfTrain.iloc[trainInd].copy() sub_feature_dir = "%s/Run%d" % (config.FEAT_DIR, i+1) obs_fields = ["search_term", "product_title"][1:] aggregation_mode = ["mean", "std", "max", "min", "median", "size"] param_list = [dfAll["id"], dfTrain2, aggregation_mode] sf = StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields, param_list, sub_feature_dir, logger) sf.go() ## for all sub_feature_dir = "%s/All" % (config.FEAT_DIR) obs_fields = ["search_term", "product_title"][1:] aggregation_mode = ["mean", "std", "max", "min", "median", "size"] param_list = [dfAll["id"], dfTrain, aggregation_mode] sf = StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields, param_list, sub_feature_dir, logger) sf.go()
def run_tsne_lsa_ngram():
    logname = "generate_feature_tsne_lsa_ngram_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)
    generators = [TSNE_LSA_Word_Ngram, TSNE_LSA_Char_Ngram]
    ngrams_list = [[1, 2, 3], [2, 3, 4, 5]]
    ngrams_list = [[1, 2, 3], [4]]
    obs_fields = ["search_term", "search_term_alt", "search_term_auto_corrected", "product_title", "product_description"]
    for generator, ngrams in zip(generators, ngrams_list):
        for ngram in ngrams:
            param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger, force_corr=True)
            sf.go()
    generators = [TSNE_LSA_Word_Ngram_Pair]
    ngrams = [1, 2]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(["search_term", "search_term_alt", "search_term_auto_corrected"])
    target_fields_list.append(["product_title", "product_description"])
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for ngram in ngrams:
            for generator in generators:
                param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger, force_corr=True)
                pf.go()
def __init__(self, feature_list, feature_name, feature_suffix=".csv",
             feature_level=2, meta_feature_dict={}, corr_threshold=0):
    self.feature_name = feature_name
    self.feature_list = feature_list
    self.feature_suffix = feature_suffix
    self.feature_level = feature_level
    # for meta features
    self.meta_feature_dict = meta_feature_dict
    self.corr_threshold = corr_threshold
    self.feature_names_basic = []
    self.feature_names_cv = []
    self.feature_names = []
    self.has_basic = 1 if self.meta_feature_dict else 0
    logname = "feature_combiner_%s_%s.log" % (feature_name, time_utils._timestamp())
    self.logger = logging_utils._get_logger(config.LOG_DIR, logname)
    if self.feature_level == 2:
        self.splitter = splitter_level2
    elif self.feature_level == 3:
        self.splitter = splitter_level3
    self.n_iter = n_iter
    self.splitter_prev = [0] * self.n_iter
def main(): logname = "generate_feature_group_relevance_%s.log" % time_utils._timestamp( ) logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) dfTrain = dfAll.iloc[:TRAIN_SIZE].copy() ## run python3 splitter.py first split = pkl_utils._load("%s/splits_level1.pkl" % config.SPLIT_DIR) n_iter = len(split) ## for cv for i in range(n_iter): trainInd, validInd = split[i][0], split[i][1] dfTrain2 = dfTrain.iloc[trainInd].copy() sub_feature_dir = "%s/Run%d" % (config.FEAT_DIR, i + 1) obs_fields = ["search_term", "product_title"][1:] aggregation_mode = ["mean", "std", "max", "min", "median", "size"] param_list = [dfAll["id"], dfTrain2, aggregation_mode] sf = StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields, param_list, sub_feature_dir, logger) sf.go() ## for all sub_feature_dir = "%s/All" % (config.FEAT_DIR) obs_fields = ["search_term", "product_title"][1:] aggregation_mode = ["mean", "std", "max", "min", "median", "size"] param_list = [dfAll["id"], dfTrain, aggregation_mode] sf = StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields, param_list, sub_feature_dir, logger) sf.go()
def run_ngram_jaccard():
    logname = "generate_feature_ngram_jaccard.log"
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = table_utils._read(config.ALL_DATA)
    generators = [JaccardCoef_Ngram, DiceDistance_Ngram]
    # single valued fields
    obs_fields_list = [["query", "norm_query"]]
    target_fields_list = [["hit_title", "opening_text"]]
    ngrams = [1, 2, 3, 12, 123][:3]
    dedup = True
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger, dedup)
                pf.go()
    # multi-valued fields
    target_fields_list = [["category", "template", "heading", "outgoing_link", "external_link", "redirect.title", "auxiliary_text"]]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            multi_gen = MultiTargetEstimatorWrapper(generator)
            for ngram in ngrams:
                param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(multi_gen, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger, dedup)
                pf.go()
def __init__(self, model_folder, model_list, subm_prefix,
             weight_opt_max_evals=10, w_min=-1., w_max=1.,
             inst_subsample=0.5, inst_subsample_replacement=False,
             inst_splitter=None, model_subsample=1.0,
             model_subsample_replacement=True, bagging_size=10,
             init_top_k=5, epsilon=0.00001, multiprocessing=False,
             multiprocessing_num_cores=1, enable_extreme=True, random_seed=0):
    self.model_folder = model_folder
    self.model_list = model_list
    self.subm_prefix = subm_prefix
    self.weight_opt_max_evals = weight_opt_max_evals
    self.w_min = w_min
    self.w_max = w_max
    assert inst_subsample > 0 and inst_subsample <= 1.
    self.inst_subsample = inst_subsample
    self.inst_subsample_replacement = inst_subsample_replacement
    self.inst_splitter = inst_splitter
    assert model_subsample > 0
    assert (type(model_subsample) == int) or (model_subsample <= 1.)
    self.model_subsample = model_subsample
    self.model_subsample_replacement = model_subsample_replacement
    self.bagging_size = bagging_size
    self.init_top_k = init_top_k
    self.epsilon = epsilon
    self.multiprocessing = multiprocessing
    self.multiprocessing_num_cores = multiprocessing_num_cores
    self.enable_extreme = enable_extreme
    self.random_seed = random_seed
    logname = "ensemble_selection_%s.log" % time_utils._timestamp()
    self.logger = logging_utils._get_logger(config.LOG_DIR, logname)
    self.n_models = len(self.model_list)
def run_tsne_lsa_ngram():
    logname = "generate_feature_tsne_lsa_ngram_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    generators = [TSNE_LSA_Word_Ngram, TSNE_LSA_Char_Ngram]
    ngrams_list = [[1, 2, 3], [2, 3, 4, 5]]
    ngrams_list = [[3], [4]]
    obs_fields = ["question1", "question2"]
    for generator, ngrams in zip(generators, ngrams_list):
        for ngram in ngrams:
            param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger, force_corr=True)
            sf.go()
    generators = [TSNE_LSA_Word_Ngram_Pair]
    ngrams = [1, 2]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(["question1"])
    target_fields_list.append(["question2"])
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for ngram in ngrams:
            for generator in generators:
                param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger, force_corr=True)
                pf.go()
def initi_task(self, model_name, data_name, epoch_num):
    time_str = datetime.datetime.now().isoformat()
    logname = "Final_[Model@%s]_[Data@%s]_%s.log" % (model_name, data_name, time_str)
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    params_dict = param_space_dict[model_name]
    task = Task(model_name, data_name, epoch_num, params_dict, logger)  # default: cv_run=5
    return task
def main(): logname = "generate_feature_basic.log" logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = table_utils._read(config.ALL_DATA) # basic generators = [DocLen, DocFreq, DocEntropy, DigitCount, DigitRatio] obs_fields = ["query", "norm_query", "hit_title", 'opening_text'] for generator in generators: param_list = [] dedup = False if generator == DocFreq else True sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger, dedup) sf.go() # basic against multi-value fields obs_fields = [ 'category', 'template', 'heading', 'outgoing_link', 'external_link', 'redirect.title', 'auxiliary_text' ] aggregations = ['mean', 'std', 'max', 'min', 'median'] param_list = [aggregations] for generator in generators: multi_gen = MultiObjEstimatorWrapper(generator) dedup = False if generator == DocFreq else True sf = StandaloneFeatureWrapper(multi_gen, dfAll, obs_fields, param_list, config.FEAT_DIR, logger, dedup) sf.go() # unique count generators = [UniqueCount_Ngram, UniqueRatio_Ngram] obs_fields = ["query", "norm_query", "hit_title", 'opening_text'] ngrams = [1, 2, 3] for generator in generators: for ngram in ngrams: param_list = [ngram] dedup = True sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger, dedup) sf.go() # unique count against multi-value fields generators = [UniqueCount_Ngram, UniqueRatio_Ngram] obs_fields = [ 'category', 'template', 'heading', 'outgoing_link', 'external_link', 'redirect.title', 'auxiliary_text' ] aggregations = ['mean', 'std', 'max', 'min', 'median'] ngrams = [1, 2, 3] for generator in generators: for ngram in ngrams: multi_gen = MultiObjEstimatorWrapper(generator) param_list = [ngram, aggregations] dedup = True sf = StandaloneFeatureWrapper(multi_gen, dfAll, obs_fields, param_list, config.FEAT_DIR, logger, dedup) sf.go()
def main(options):
    logname = "[Feat@%s]_[Learner@%s]_hyperopt_%s.log" % (
        options.feature_name, options.learner_name, time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    optimizer = TaskOptimizer(options.task_mode, options.learner_name, options.feature_name,
                              logger, options.max_evals, verbose=True,
                              refit_once=options.refit_once, plot_importance=options.plot_importance)
    optimizer.run()
def main(): FNAME = "feature_general" logname = "%s_%s.log" % (FNAME, now) logger = logging_utils._get_logger(config.LOG_DIR, logname) logger.info("Generating time period feature ...") train, test = dl.load_data() periods_train, periods_test = dl.load_periods_data() t0 = time() # Generating general features train_general = pd.DataFrame() test_general = pd.DataFrame() logger.info("Generating general features ...") train = train.merge(periods_train[['item_id', 'date_to', 'date_from']], how='left', on=['item_id']) test = test.merge(periods_test[['item_id', 'date_to', 'date_from']], how='left', on=['item_id']) logger.info('Train shape: %s & Test shape: %s' % (train.shape, test.shape)) # https: // stackoverflow.com / questions / 37840812 / pandas - subtracting - two - date - columns - and -the - result - being - an - integer train_general['total_period'] = train['date_to'].sub(train['date_from'], axis=0) train_general['total_period'] = train_general['total_period'] / np.timedelta64(1, 'D') train_general['total_period'].fillna(0, inplace=True) test_general['total_period'] = test['date_to'].sub(test['date_from'], axis=0) test_general['total_period'] = test_general['total_period'] / np.timedelta64(1, 'D') test_general['total_period'].fillna(0, inplace=True) generate_count_features('title', train_general, train) generate_count_features('title', test_general, test) generate_count_features('description', train_general, train) generate_count_features('description', test_general, test) train_general['log_price'] = np.log(train["price"] + 0.001) train_general['log_price'].fillna(-999, inplace=True) test_general['log_price'] = np.log(test["price"] + 0.001) test_general['log_price'].fillna(-999, inplace=True) train['has_image'] = train['image'].isnull().astype(int) test['has_image'] = test['image'].isnull().astype(int) logger.info(FNAME + ' took: %s minutes' % round((time() - t0) / 60, 1)) del train del test gc.collect() logger.info('Train general shape: %s & Test general shape: %s' % (train_general.shape, test_general.shape)) # save data train_fname = os.path.join(config.DATA_FEATURES_DIR, "train_" + FNAME + config.FEAT_FILE_SUFFIX) test_fname = os.path.join(config.DATA_FEATURES_DIR, "test_" + FNAME + config.FEAT_FILE_SUFFIX) logger.info("Save to %s" % train_fname) pkl_utils._save(train_fname, train_general) logger.info("Save to %s" % test_fname) pkl_utils._save(test_fname, test_general) gc.collect()
def __init__(self, feature_dict, feature_name, corr_threshold=0):
    self.feature_dict = feature_dict
    self.feature_name = feature_name
    self.corr_threshold = corr_threshold
    self.feature_names_basic = []
    self.feature_names = []
    logname = "feature_combiner_%s.log" % (feature_name)
    self.logger = logging_utils._get_logger(config.LOG_DIR, logname)
def main(): logname = "generate_feature_intersect_count_%s.log" % time_utils._timestamp( ) logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) # Ngram generators = [ IntersectCount_Ngram, IntersectRatio_Ngram, ] obs_fields_list = [['question1'], ['question2']] target_fields_list = [['question2'], ['question1']] ngrams = [1, 2, 3, 4, 5, 12, 123] # only 1,2,3,4,5,12,123 available, see ngram_utils.py for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: for ngram in ngrams: param_list = [ngram] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go() # Ngram symmetric generators = [ CooccurrenceCount_Ngram, CooccurrenceRatio_Ngram, #CooccurrenceCount_Nterm, # not used in Quora project, takes long to run #CooccurrenceRatio_Nterm, ] obs_fields_list = [['question1']] target_fields_list = [['question2']] ngrams = [1, 2, 3, 4, 5, 12, 123] # only 1,2,3,4,5,12,123 available, see ngram_utils.py nterms = [ 2, 3, 4 ] # only 1,2,3,4 available,(uniterms is the same as unigrams) see ngram_utils.py for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: if generator.__name__[-5:] == 'Ngram': for ngram in ngrams: param_list = [ngram] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go() elif generator.__name__[-5:] == 'Nterm': for nterm in nterms: param_list = [nterm] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go() else: print("Wrong Generator") pass
def main(): FNAME = "model_train_lgbm" logname = "%s_%s.log" % (FNAME, now) logger = logging_utils._get_logger(config.LOG_DIR, logname) # Load raw data train_raw = dl.load_train_data() # Load generated features train_features = load_combined_features(logger) train_column_names = list(train_features.columns.values) logger.info("Training set column names: " + str(train_column_names)) # train_features = pd.concat([train_features, train_raw[config.NUMBER_FEATURES]], axis=1) logger.info('Final training data shape: %s' % str(train_features.shape)) x_train, x_valid, y_train, y_valid = train_test_split( train_features, train_raw[config.TARGET_FEATURE], test_size=0.20, random_state=42) del train_raw del train_features gc.collect() lgtrain = lgb.Dataset(x_train, label=y_train, feature_name=train_column_names, categorical_feature=config.ENCODED_CATEGORY_FEATURES) lgvalid = lgb.Dataset(x_valid, label=y_valid, feature_name=train_column_names, categorical_feature=config.ENCODED_CATEGORY_FEATURES) t0 = time() lightgbm_model = lgb.train( config.LGBM_PARAMS, lgtrain, config.LGBM_NUM_ROUNDS, valid_sets=lgvalid, verbose_eval=50, early_stopping_rounds=config.LGBM_EARLY_STOPPING_ROUNDS) logger.info('Training LightGBM model took: %s minutes' % round( (time() - t0) / 60, 1)) # Save model t0 = time() MODEL_FILE_NAME = "lightgbm_model" model_file = os.path.join(config.DATA_MODELS_DIR, MODEL_FILE_NAME + config.FEAT_FILE_SUFFIX) logger.info("Save to %s" % model_file) lightgbm_model.save_model(model_file, num_iteration=lightgbm_model.best_iteration) logger.info('Saving %s lightgbm model took: %s minutes' % (MODEL_FILE_NAME, round((time() - t0) / 60, 1))) generate_figure_importance(lightgbm_model, logger)
def main(which):
    logname = "generate_feature_stat_cooc_tfidf_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    generators = []
    for w in which.split(","):
        if w == "tf":
            generators.append(StatCoocTF_Ngram)
        elif w == "norm_tf":
            generators.append(StatCoocNormTF_Ngram)
        elif w == "tfidf":
            generators.append(StatCoocTFIDF_Ngram)
        elif w == "norm_tfidf":
            generators.append(StatCoocNormTFIDF_Ngram)
        elif w == "bm25":
            generators.append(StatCoocBM25_Ngram)
    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append(["search_term", "search_term_alt", "search_term_auto_corrected"])
    target_fields_list.append(["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"])
    ## document in query
    obs_fields_list.append(["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"])
    target_fields_list.append(["search_term", "search_term_alt", "search_term_auto_corrected"])
    ngrams = [1, 2, 3, 12, 123][:3]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append(["search_term_product_name"])
    target_fields_list.append(["product_title_product_name"])
    ngrams = [1, 2]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                if ngram == 2:
                    # since product_name is of length 2, it makes no difference
                    # for various aggregation as there is only one item
                    param_list = [ngram, "mean"]
                else:
                    param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
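# Quick sanity check of the remark above: when only a single value is
# aggregated, the mean, max, min and median all coincide, so one aggregation
# mode suffices (toy value, not project data).
import numpy as np

values = [0.7]  # e.g. one bigram score for a length-2 product_name
print(np.mean(values), np.max(values), np.min(values), np.median(values))  # 0.7 0.7 0.7 0.7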
def __init__(self, feature_dict, feature_name, feature_suffix=".pkl", corr_threshold=0):
    self.feature_name = feature_name
    self.feature_dict = feature_dict
    self.feature_suffix = feature_suffix
    self.corr_threshold = corr_threshold
    self.feature_names_basic = []
    self.feature_names_cv = []
    self.basic_only = 0
    logname = "feature_combiner_%s_%s.log" % (feature_name, time_utils._timestamp())
    self.logger = logging_utils._get_logger(config.LOG_DIR, logname)
    self.splitter = splitter_level1
    self.n_iter = n_iter
def run_compression_distance():
    logname = "generate_feature_compression_distance_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"])
    target_fields_list.append(["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"])
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        param_list = []
        pf = PairwiseFeatureWrapper(CompressionDistance, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
        pf.go()
def main(): logname = "generate_feature_basic_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) ## basic generators = [DocId, DocLen, DocFreq, DocEntropy, DigitCount, DigitRatio] obs_fields = ["search_term", "product_title", "product_description", "product_attribute", "product_brand", "product_color"] for generator in generators: param_list = [] sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) sf.go() ## for product_uid generators = [DocIdEcho, DocFreq, ProductUidDummy1, ProductUidDummy2, ProductUidDummy3] obs_fields = ["product_uid"] for generator in generators: param_list = [] sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) sf.go() ## unique count generators = [UniqueCount_Ngram, UniqueRatio_Ngram] obs_fields = ["search_term", "product_title", "product_description", "product_attribute", "product_brand", "product_color"] ngrams = [1,2,3] for generator in generators: for ngram in ngrams: param_list = [ngram] sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) sf.go() ## for product_attribute_list generators = [ AttrCount, AttrBulletCount, AttrBulletRatio, AttrNonBulletCount, AttrNonBulletRatio, AttrHasProductHeight, AttrHasProductWidth, AttrHasProductLength, AttrHasProductDepth, AttrHasIndoorOutdoor, ] obs_fields = ["product_attribute_list"] for generator in generators: param_list = [] sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) sf.go()
def run_char_dist_sim():
    logname = "generate_feature_char_dist_sim_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)
    generators = [CharDistribution_Ratio, CharDistribution_CosineSim, CharDistribution_KL]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(["search_term", "search_term_alt", "search_term_auto_corrected"])
    target_fields_list.append(["product_title", "product_description", "product_attribute"])
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            param_list = []
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
            pf.go()
def run_ngram_jaccard():
    logname = "generate_feature_ngram_jaccard_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    generators = [JaccardCoef_Ngram, DiceDistance_Ngram]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][:2])
    target_fields_list.append(["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"])
    ngrams = [1, 2, 3, 12, 123][:3]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
def main(): logname = "generate_feature_doc2vec_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) #### NOTE: use data BEFORE STEMMING dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED) doc2vec_model_dirs = [] model_prefixes = [] ## doc2vec model trained with Homedepot dataset: brand/color/obs/title/description doc2vec_model_dirs.append( config.DOC2VEC_MODEL_DIR + "/Homedepot-doc2vec-D%d-min_count%d.model"%(config.EMBEDDING_DIM, config.EMBEDDING_MIN_COUNT) ) model_prefixes.append( "Homedepot" ) for doc2vec_model_dir, model_prefix in zip(doc2vec_model_dirs, model_prefixes): ## load model try: if ".bin" in doc2vec_model_dir: doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(doc2vec_model_dir, binary=True) if ".txt" in doc2vec_model_dir: doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(doc2vec_model_dir, binary=False) else: doc2vec_model = gensim.models.Doc2Vec.load(doc2vec_model_dir) doc2vec_model_sent_label = pkl_utils._load(doc2vec_model_dir+".sent_label") except: continue # ## standalone (not used in model building) # obs_fields = ["search_term", "search_term_alt", "product_title", "product_description", "product_attribute"] # generator = Doc2Vec_Vector # param_list = [doc2vec_model, doc2vec_model_sent_label, model_prefix] # sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) # sf.go() ## pairwise generators = [ Doc2Vec_CosineSim, Doc2Vec_RMSE, Doc2Vec_Vdiff, ] obs_fields_list = [] target_fields_list = [] obs_fields_list.append( ["search_term", "search_term_alt"] ) target_fields_list.append( ["product_title", "product_description", "product_attribute", "product_brand", "product_color"] ) for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: param_list = [doc2vec_model, doc2vec_model_sent_label, model_prefix] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go()
def run_edit_distance():
    logname = "generate_feature_edit_distance_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][1:2])
    target_fields_list.append(["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"])
    ngrams = [1, 2, 3, 12, 123][:3]
    aggregation_mode_prev = ["mean", "max", "min", "median"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        param_list = []
        pf = PairwiseFeatureWrapper(EditDistance, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
        pf.go()
        for ngram in ngrams:
            param_list = [ngram, aggregation_mode_prev, aggregation_mode]
            pf = PairwiseFeatureWrapper(EditDistance_Ngram, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
            pf.go()
def main(): logname = "generate_feature_query_quality_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) obs_corpus = [] query_suffix = [] # raw dfAll = pkl_utils._load(config.ALL_DATA_RAW) obs_corpus.append(dfAll["search_term"].values) query_suffix.append("raw") # after processing dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED) obs_corpus.append(dfAll["search_term"].values) query_suffix.append("lemmatized") # after extracting product_name in search_term obs_corpus.append(dfAll["search_term_product_name"].values) query_suffix.append("product_name") if "search_term_auto_corrected" in dfAll.columns: # after auto correction obs_corpus.append(dfAll["search_term_auto_corrected"].values) query_suffix.append("corrected") # after stemming dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) obs_corpus.append(dfAll["search_term"].values) query_suffix.append("stemmed") y_train = dfAll["relevance"].values[:TRAIN_SIZE] for i in range(len(query_suffix)-1): for j in range(i+1, len(query_suffix)): ext = QueryQuality(obs_corpus[i], obs_corpus[j]) x = ext.transform() dim = 1 fname = "%s_%s_x_%s_%dD"%(ext._get_feat_name(), query_suffix[i], query_suffix[j], dim) pkl_utils._save(os.path.join(config.FEAT_DIR, fname+config.FEAT_FILE_SUFFIX), x) corr = np_utils._corr(x[:TRAIN_SIZE], y_train) logger.info("%s (%dD): corr = %.6f"%(fname, dim, corr)) # raw dfAll = pkl_utils._load(config.ALL_DATA_RAW) obs_fields = ["search_term"] param_list = [] sf = StandaloneFeatureWrapper(IsInGoogleDict, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) sf.go()
def run_tfidf_ngram_cosinesim():
    logname = "generate_feature_tfidf_ngram_cosinesim_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)
    generators = [TFIDF_Word_Ngram_CosineSim, TFIDF_Char_Ngram_CosineSim]
    ngrams_list = [[1, 2, 3], [2, 3, 4, 5]]
    ngrams_list = [[1, 2, 3], [4]]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(["search_term", "search_term_alt", "search_term_auto_corrected"])
    target_fields_list.append(["product_title", "product_description", "product_attribute"])
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator, ngrams in zip(generators, ngrams_list):
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
def __init__(self, feature_list, feature_name, feature_suffix=".csv",
             feature_level=2, meta_feature_dict={}, corr_threshold=0):
    self.feature_name = feature_name
    self.feature_list = feature_list
    self.feature_suffix = feature_suffix
    self.feature_level = feature_level
    # for meta features
    self.meta_feature_dict = meta_feature_dict
    self.corr_threshold = corr_threshold
    self.feature_names_basic = []
    self.feature_names_cv = []
    self.has_basic = 1 if self.meta_feature_dict else 0
    logname = "feature_combiner_%s_%s.log" % (feature_name, time_utils._timestamp())
    self.logger = logging_utils._get_logger(config.LOG_DIR, logname)
    if self.feature_level == 2:
        self.splitter = splitter_level2
    elif self.feature_level == 3:
        self.splitter = splitter_level3
    self.n_iter = n_iter
    self.splitter_prev = [0] * self.n_iter
def run_lsa_ngram_cooc():
    logname = "generate_feature_lsa_ngram_cooc_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)
    generators = [LSA_Word_Ngram_Cooc]
    obs_ngrams = [1, 2]
    target_ngrams = [1, 2]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(["search_term", "search_term_alt", "search_term_auto_corrected"])
    target_fields_list.append(["product_title", "product_description"])
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for obs_ngram in obs_ngrams:
            for target_ngram in target_ngrams:
                for generator in generators:
                    param_list = [obs_ngram, target_ngram, config.SVD_DIM, config.SVD_N_ITER]
                    pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                    pf.go()
def main(): logname = "generate_feature_group_distance_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) dfTrain = dfAll.iloc[:TRAIN_SIZE].copy() ## run python3 splitter.py first split = pkl_utils._load("%s/splits_level1.pkl"%config.SPLIT_DIR) n_iter = len(split) relevances_complete = [1, 1.25, 1.33, 1.5, 1.67, 1.75, 2, 2.25, 2.33, 2.5, 2.67, 2.75, 3] relevances = [1, 1.33, 1.67, 2, 2.33, 2.67, 3] ngrams = [1] obs_fields = ["search_term"] target_fields = ["product_title", "product_description"] aggregation_mode = ["mean", "std", "max", "min", "median"] ## for cv for i in range(n_iter): trainInd, validInd = split[i][0], split[i][1] dfTrain2 = dfTrain.iloc[trainInd].copy() sub_feature_dir = "%s/Run%d" % (config.FEAT_DIR, i+1) for target_field in target_fields: for relevance in relevances: for ngram in ngrams: param_list = [dfAll["id"], dfTrain2, target_field, relevance, ngram, aggregation_mode] pf = PairwiseFeatureWrapper(GroupRelevance_Ngram_Jaccard, dfAll, obs_fields, [target_field], param_list, sub_feature_dir, logger) pf.go() ## for all sub_feature_dir = "%s/All" % (config.FEAT_DIR) for target_field in target_fields: for relevance in relevances: for ngram in ngrams: param_list = [dfAll["id"], dfTrain, target_field, relevance, ngram, aggregation_mode] pf = PairwiseFeatureWrapper(GroupRelevance_Ngram_Jaccard, dfAll, obs_fields, [target_field], param_list, sub_feature_dir, logger) pf.go()
def main(): logname = "generate_feature_wordnet_similarity_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) #### NOTE: use data BEFORE STEMMING dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED) generators = [ WordNet_Path_Similarity, WordNet_Lch_Similarity, WordNet_Wup_Similarity, ] obs_fields_list = [] target_fields_list = [] obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] ) target_fields_list.append( ["product_title", "product_description", "product_attribute"] ) # double aggregation aggregation_mode_prev = ["mean", "max", "min", "median"] aggregation_mode = ["mean", "std", "max", "min", "median"] for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: param_list = [aggregation_mode_prev, aggregation_mode] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go()
def main(): logname = "generate_feature_match_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) generators = [ MatchQueryCount, MatchQueryRatio, LongestMatchSize, LongestMatchRatio ] obs_fields_list = [] target_fields_list = [] obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] ) target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] ) for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: param_list = [] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go() # product_attribute_list generators = [ MatchAttrCount, MatchAttrRatio, IsIndoorOutdoorMatch, ] obs_fields_list = [] target_fields_list = [] obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] ) target_fields_list.append( ["product_attribute_list"] ) for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: param_list = [] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go()
def main():
    ###########
    ## Setup ##
    ###########
    logname = "data_processor_%s.log" % now
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    # put product_attribute_list, product_attribute and product_description first
    # as they are quite time consuming to process
    columns_to_proc = [
        # # product_attribute_list is very time consuming to process
        # # so we just process product_attribute which is of the form
        # # attr_name1 | attr_value1 | attr_name2 | attr_value2 | ...
        # # and split it into a list afterwards
        # "product_attribute_list",
        "product_attribute_concat",
        "product_description",
        "product_brand",
        "product_color",
        "product_title",
        "search_term",
    ]
    if config.PLATFORM == "Linux":
        config.DATA_PROCESSOR_N_JOBS = len(columns_to_proc)

    # clean using a list of processors
    processors = [
        LowerCaseConverter(),
        # See LowerUpperCaseSplitter and UnitConverter for why we put UnitConverter here
        UnitConverter(),
        LowerUpperCaseSplitter(),
        WordReplacer(replace_fname=config.WORD_REPLACER_DATA),
        LetterLetterSplitter(),
        DigitLetterSplitter(),
        DigitCommaDigitMerger(),
        NumberDigitMapper(),
        UnitConverter(),
        QuartetCleaner(),
        HtmlCleaner(parser="html.parser"),
        Lemmatizer(),
    ]
    stemmers = [
        Stemmer(stemmer_type="snowball"),
        Stemmer(stemmer_type="porter")
    ][0:1]

    ## simple test
    text = "1/2 inch rubber lep tips Bullet07"
    print("Original:")
    print(text)
    list_processor = ListProcessor(processors)
    print("After:")
    print(list_processor.process([text]))

    #############
    ## Process ##
    #############
    ## load raw data
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    columns_to_proc = [col for col in columns_to_proc if col in dfAll.columns]

    ## extract product name from search_term and product_title
    ext = ProductNameExtractor()
    dfAll["search_term_product_name"] = dfAll["search_term"].apply(ext.transform)
    dfAll["product_title_product_name"] = dfAll["product_title"].apply(ext.transform)
    if config.TASK == "sample":
        print(dfAll[["search_term", "search_term_product_name", "product_title_product_name"]])

    ## clean using GoogleQuerySpellingChecker
    # MUST BE IN FRONT OF ALL THE PROCESSING
    logger.info("Run GoogleQuerySpellingChecker at search_term")
    checker = GoogleQuerySpellingChecker()
    dfAll["search_term"] = dfAll["search_term"].apply(checker.correct)

    ## clean using a list of processors
    df_processor = DataFrameParallelProcessor(processors, config.DATA_PROCESSOR_N_JOBS)
    df_processor.process(dfAll, columns_to_proc)
    # split product_attribute_concat into product_attribute and product_attribute_list
    dfAll["product_attribute"] = dfAll["product_attribute_concat"].apply(_split_attr_to_text)
    dfAll["product_attribute_list"] = dfAll["product_attribute_concat"].apply(_split_attr_to_list)
    if config.TASK == "sample":
        print(dfAll[["product_attribute", "product_attribute_list"]])

    # query expansion
    list_processor = ListProcessor(processors)
    base_stopwords = set(list_processor.process(list(config.STOP_WORDS)))
    qe = QueryExpansion(dfAll, ngram=3, stopwords_threshold=0.9, base_stopwords=base_stopwords)
    dfAll["search_term_alt"] = qe.build()
    if config.TASK == "sample":
        print(dfAll[["search_term", "search_term_alt"]])

    # save data
    logger.info("Save to %s" % config.ALL_DATA_LEMMATIZED)
    columns_to_save = [col for col in dfAll.columns if col != "product_attribute_concat"]
    pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll[columns_to_save])

    ## auto correcting query
    if config.AUTO_CORRECTING_QUERY:
        logger.info("Run AutoSpellingChecker at search_term")
        checker = AutoSpellingChecker(dfAll, exclude_stopwords=False, min_len=4)
        dfAll["search_term_auto_corrected"] = list(dfAll["search_term"].apply(checker.correct))
        columns_to_proc += ["search_term_auto_corrected"]
        if config.TASK == "sample":
            print(dfAll[["search_term", "search_term_auto_corrected"]])
        # save query_correction_map and spelling checker
        fname = "%s/auto_spelling_checker_query_correction_map_%s.log" % (config.LOG_DIR, now)
        checker.save_query_correction_map(fname)
        # save data
        logger.info("Save to %s" % config.ALL_DATA_LEMMATIZED)
        columns_to_save = [col for col in dfAll.columns if col != "product_attribute_concat"]
        pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll[columns_to_save])

    ## clean using stemmers
    df_processor = DataFrameParallelProcessor(stemmers, config.DATA_PROCESSOR_N_JOBS)
    df_processor.process(dfAll, columns_to_proc)
    # split product_attribute_concat into product_attribute and product_attribute_list
    dfAll["product_attribute"] = dfAll["product_attribute_concat"].apply(_split_attr_to_text)
    dfAll["product_attribute_list"] = dfAll["product_attribute_concat"].apply(_split_attr_to_list)

    # query expansion
    list_processor = ListProcessor(stemmers)
    base_stopwords = set(list_processor.process(list(config.STOP_WORDS)))
    qe = QueryExpansion(dfAll, ngram=3, stopwords_threshold=0.9, base_stopwords=base_stopwords)
    dfAll["search_term_alt"] = qe.build()
    if config.TASK == "sample":
        print(dfAll[["search_term", "search_term_alt"]])

    # save data
    logger.info("Save to %s" % config.ALL_DATA_LEMMATIZED_STEMMED)
    columns_to_save = [col for col in dfAll.columns if col != "product_attribute_concat"]
    pkl_utils._save(config.ALL_DATA_LEMMATIZED_STEMMED, dfAll[columns_to_save])
def main(): logname = "generate_feature_group_distance_stat_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) y_train = dfAll["relevance"].values[:TRAIN_SIZE] group_id_names = ["DocId_search_term", "DocId_product_title", "DocIdEcho_product_uid"] match_list = [ "MatchQueryCount", "MatchQueryRatio", "LongestMatchRatio", ] tfidf_list = [ "StatCoocTF_Unigram_Mean", "StatCoocTF_Unigram_Max", "StatCoocTF_Unigram_Min", # "StatCoocNormTF_Unigram_Mean", # "StatCoocNormTF_Unigram_Max", # "StatCoocNormTF_Unigram_Min", "StatCoocTFIDF_Unigram_Mean", "StatCoocTFIDF_Unigram_Max", "StatCoocTFIDF_Unigram_Min", "StatCoocBM25_Unigram_Mean", "StatCoocBM25_Unigram_Max", "StatCoocBM25_Unigram_Min", # "StatCoocTF_Bigram_Mean", # "StatCoocTF_Bigram_Max", # "StatCoocTF_Bigram_Min", # "StatCoocNormTF_Bigram_Mean", # "StatCoocNormTF_Bigram_Max", # "StatCoocNormTF_Bigram_Min", # "StatCoocTFIDF_Bigram_Mean", # "StatCoocTFIDF_Bigram_Max", # "StatCoocTFIDF_Bigram_Min", # "StatCoocBM25_Bigram_Mean", # "StatCoocBM25_Bigram_Max", # "StatCoocBM25_Bigram_Min", # "StatCoocTF_Trigram_Mean", # "StatCoocTF_Trigram_Max", # "StatCoocTF_Trigram_Min", # "StatCoocNormTF_Trigram_Mean", # "StatCoocNormTF_Trigram_Max", # "StatCoocNormTF_Trigram_Min", # "StatCoocTFIDF_Trigram_Mean", # "StatCoocTFIDF_Trigram_Max", # "StatCoocTFIDF_Trigram_Min", # "StatCoocBM25_Trigram_Mean", # "StatCoocBM25_Trigram_Max", # "StatCoocBM25_Trigram_Min", ] intersect_ngram_count_list = [ "IntersectCount_Unigram", "IntersectRatio_Unigram", # "IntersectCount_Bigram", # "IntersectRatio_Bigram", # "IntersectCount_Trigram", # "IntersectRatio_Trigram", ] first_last_ngram_list = [ "FirstIntersectCount_Unigram", "FirstIntersectRatio_Unigram", "LastIntersectCount_Unigram", "LastIntersectRatio_Unigram", # "FirstIntersectCount_Bigram", # "FirstIntersectRatio_Bigram", # "LastIntersectCount_Bigram", # "LastIntersectRatio_Bigram", # "FirstIntersectCount_Trigram", # "FirstIntersectRatio_Trigram", # "LastIntersectCount_Trigram", # "LastIntersectRatio_Trigram", ] cooccurrence_ngram_count_list = [ "CooccurrenceCount_Unigram", "CooccurrenceRatio_Unigram", # "CooccurrenceCount_Bigram", # "CooccurrenceRatio_Bigram", # "CooccurrenceCount_Trigram", # "CooccurrenceRatio_Trigram", ] ngram_jaccard_list = [ "JaccardCoef_Unigram", # "JaccardCoef_Bigram", # "JaccardCoef_Trigram", "DiceDistance_Unigram", # "DiceDistance_Bigram", # "DiceDistance_Trigram", ] char_dist_sim_list = [ "CharDistribution_CosineSim", "CharDistribution_KL", ] tfidf_word_ngram_cosinesim_list = [ "TFIDF_Word_Unigram_CosineSim", # "TFIDF_Word_Bigram_CosineSim", # "TFIDF_Word_Trigram_CosineSim", ] tfidf_char_ngram_cosinesim_list = [ # "TFIDF_Char_Bigram_CosineSim", # "TFIDF_Char_Trigram_CosineSim", "TFIDF_Char_Fourgram_CosineSim", # "TFIDF_Char_Fivegram_CosineSim", ] lsa_word_ngram_cosinesim_list = [ "LSA100_Word_Unigram_CosineSim", # "LSA100_Word_Bigram_CosineSim", # "LSA100_Word_Trigram_CosineSim", ] lsa_char_ngram_cosinesim_list = [ # "LSA100_Char_Bigram_CosineSim", # "LSA100_Char_Trigram_CosineSim", "LSA100_Char_Fourgram_CosineSim", # "LSA100_Char_Fivegram_CosineSim", ] doc2vec_list = [ "Doc2Vec_Homedepot_D100_CosineSim", ] word2vec_list = [ "Word2Vec_N_Similarity", "Word2Vec_Homedepot_D100_CosineSim_Mean_Mean", "Word2Vec_Homedepot_D100_CosineSim_Max_Mean", "Word2Vec_Homedepot_D100_CosineSim_Min_Mean", ] distance_generator_list = \ match_list + \ tfidf_list + \ intersect_ngram_count_list + \ first_last_ngram_list + 
\ cooccurrence_ngram_count_list + \ ngram_jaccard_list + \ tfidf_word_ngram_cosinesim_list + \ tfidf_char_ngram_cosinesim_list + \ lsa_word_ngram_cosinesim_list + \ lsa_char_ngram_cosinesim_list + \ char_dist_sim_list + \ word2vec_list + \ doc2vec_list obs_fields_list = [] target_fields_list = [] ## query in document obs_fields_list.append( ["search_term"] ) target_fields_list.append( ["product_title", "product_title_product_name"] ) aggregation_mode = ["mean", "max", "min"] for group_id_name in group_id_names: group_id_list = pkl_utils._load(os.path.join(config.FEAT_DIR, group_id_name+"_1D.pkl")) for distance_generator in distance_generator_list: for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for obs_field in obs_fields: for target_field in target_fields: dist_name = "%s_%s_x_%s"%(distance_generator, obs_field, target_field) try: dist_list = pkl_utils._load(os.path.join(config.FEAT_DIR, dist_name+"_1D.pkl")) ext = GroupDistanceStat(dist_list, group_id_list, dist_name, group_id_name, aggregation_mode) x = ext.transform() if isinstance(ext.__name__(), list): for i,feat_name in enumerate(ext.__name__()): dim = 1 fname = "%s_%dD"%(feat_name, dim) pkl_utils._save(os.path.join(config.FEAT_DIR, fname+config.FEAT_FILE_SUFFIX), x[:,i]) corr = np_utils._corr(x[:TRAIN_SIZE,i], y_train) logger.info("%s (%dD): corr = %.6f"%(fname, dim, corr)) except: logger.info("Skip %s"%dist_name) pass
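# --------------------------------------------------------------------------
# Illustration only: GroupDistanceStat is defined elsewhere in the repo.
# Conceptually it takes a 1D distance feature plus a group id per row and
# broadcasts group-wise statistics (one column per aggregation mode) back to
# the row level. A rough pandas-based sketch of that idea, not the repo's
# implementation:
import numpy as np
import pandas as pd

def group_distance_stat_sketch(dist_list, group_id_list, aggregation_mode=("mean", "max", "min")):
    df = pd.DataFrame({"dist": dist_list, "gid": group_id_list})
    # transform() broadcasts each group-level statistic back to every row of that group
    cols = [df.groupby("gid")["dist"].transform(mode).values for mode in aggregation_mode]
    return np.vstack(cols).T  # shape: (n_rows, len(aggregation_mode))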
def main(which):
    logname = "generate_feature_word2vec_%s_%s.log" % (which, time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    #### NOTE: use data BEFORE STEMMING
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)

    word2vec_model_dirs = []
    model_prefixes = []
    if which == "homedepot":
        ## word2vec model trained with Homedepot dataset: brand/color/query/title/description
        word2vec_model_dirs.append(config.WORD2VEC_MODEL_DIR + "/Homedepot-word2vec-D%d-min_count%d.model" % (
            config.EMBEDDING_DIM, config.EMBEDDING_MIN_COUNT))
        model_prefixes.append("Homedepot")
    elif which == "wikipedia":
        ## word2vec model pretrained with Wikipedia+Gigaword 5
        word2vec_model_dirs.append(config.GLOVE_WORD2VEC_MODEL_DIR + "/glove.6B.300d.txt")
        model_prefixes.append("Wikipedia")
    elif which == "google":
        ## word2vec model pretrained with Google News
        word2vec_model_dirs.append(config.WORD2VEC_MODEL_DIR + "/GoogleNews-vectors-negative300.bin")
        model_prefixes.append("GoogleNews")

    for word2vec_model_dir, model_prefix in zip(word2vec_model_dirs, model_prefixes):
        ## load model
        try:
            if ".bin" in word2vec_model_dir:
                word2vec_model = gensim.models.Word2Vec.load_word2vec_format(word2vec_model_dir, binary=True)
            elif ".txt" in word2vec_model_dir:
                word2vec_model = gensim.models.Word2Vec.load_word2vec_format(word2vec_model_dir, binary=False)
            else:
                word2vec_model = gensim.models.Word2Vec.load(word2vec_model_dir)
        except:
            continue

        # ## standalone (not used in model building)
        # obs_fields = ["search_term", "product_title", "product_description"]
        # generator = Word2Vec_Centroid_Vector
        # param_list = [word2vec_model, model_prefix]
        # sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        # sf.go()

        ## pairwise
        generators = [
            Word2Vec_Importance,
            Word2Vec_N_Similarity,
            Word2Vec_N_Similarity_Imp,
            Word2Vec_Centroid_RMSE,
            Word2Vec_Centroid_RMSE_IMP,
            # # not used in final submission
            # Word2Vec_Centroid_Vdiff,
        ]
        obs_fields_list = []
        target_fields_list = []
        obs_fields_list.append(["search_term", "search_term_alt", "search_term_auto_corrected"][:1])
        target_fields_list.append(["product_title", "product_description", "product_attribute",
                                   "product_brand", "product_color"])
        for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
            for generator in generators:
                param_list = [word2vec_model, model_prefix]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields,
                                            param_list, config.FEAT_DIR, logger)
                pf.go()

        ## cosine sim
        generators = [
            Word2Vec_CosineSim,
        ]
        # double aggregation
        aggregation_mode_prev = ["mean", "max", "min", "median"]
        aggregation_mode = ["mean", "std", "max", "min", "median"]
        for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
            for generator in generators:
                param_list = [word2vec_model, model_prefix, aggregation_mode, aggregation_mode_prev]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields,
                                            param_list, config.FEAT_DIR, logger)
                pf.go()
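# --------------------------------------------------------------------------
# Illustration only: the loaders above target an older gensim API
# (Word2Vec.load_word2vec_format); in recent gensim releases the equivalent
# loader lives on KeyedVectors. A small sketch of the modern calls and of an
# n_similarity-style feature; the helper names and paths here are illustrative,
# not part of the repo.
import gensim

def load_pretrained_vectors_sketch(path):
    # binary=True for .bin dumps (e.g. GoogleNews), False for text formats
    return gensim.models.KeyedVectors.load_word2vec_format(path, binary=path.endswith(".bin"))

def word2vec_n_similarity_sketch(kv, obs_text, target_text):
    """Hypothetical: cosine similarity between the two bags of in-vocabulary tokens."""
    obs_tokens = [w for w in obs_text.split() if w in kv]
    target_tokens = [w for w in target_text.split() if w in kv]
    if not obs_tokens or not target_tokens:
        return 0.0
    return float(kv.n_similarity(obs_tokens, target_tokens))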