def process(lang, pivot):
    print("[%s]: process for language %s" % (time_utils._timestamp(), lang))
    linkDict = pkl_utils._load(config.ILL_DICT["%s2%s" % (lang, pivot)])
    templateDict = pkl_utils._load(config.TEMPLATE2ARTICLE[lang])
    articleDict = pkl_utils._load(config.ARTICLE2TEMPLATE[pivot])
    mapping = pd.read_csv(config.EXISTING_MAPPING_OUTPUT[pivot], index_col="template")
    template1 = []
    template2 = []
    article1 = []
    article2 = []
    ontology = []
    for template in templateDict:
        articles = templateDict[template]
        for article in articles:
            if article in linkDict:
                tmp = linkDict[article]
                template1.append(template)
                article1.append(article)
                article2.append(tmp)
                if tmp in articleDict:
                    templateList = articleDict[tmp]
                else:
                    templateList = []
                c = ""
                t = ""
                # keep the last template that has an existing ontology mapping
                for Template in templateList:
                    if Template in mapping.index:
                        c = mapping.at[Template, "ontology"]
                        t = Template
                template2.append(t)
                ontology.append(c)
    data = {"template1": template1, "article1": article1, "template2": template2,
            "article2": article2, "ontology": ontology}
    df = pd.DataFrame(data)
    df.to_csv(config.ENTITY_MATRIX["%s2%s" % (lang, pivot)], index=False)
    print("[%s]: processing complete" % time_utils._timestamp())
def main(): print "[%s]: generate ontology hierarchy tree" % (time_utils._timestamp()) G = g.Graph() G.parse(config.ONTOLOGY, format="n3") q = ''' PREFIX rr: <http://www.w3.org/2000/01/rdf-schema#> SELECT ?child ?parent WHERE { ?child rr:subClassOf ?parent . }''' results = G.query(q) ontologyDict = {} for row in results: child = str(row[0]) parent = str(row[1]) if parent in ontologyDict: ontologyDict[parent].append(child) else: ontologyDict[parent] = [ child, ] pkl_utils._save(config.ONTOLOGY_TREE, ontologyDict) print "[%s]: generation complete" % time_utils._timestamp()
def getILL(lang, target):
    print("[%s]: generate ILL dict from language %s to language %s" % (
        time_utils._timestamp(), lang, target))
    infile = open(config.ILL[lang])
    prefix1 = config.LANG_PREFIX[lang]
    prefix2 = config.LANG_PREFIX[target]
    len1 = len(prefix1)
    len2 = len(prefix2)
    linkDict = {}
    for line in infile.readlines():
        if line[0] != "<":
            continue
        row = line.split()
        lang1 = row[0][1:-1]
        lang2 = row[2][1:-1]
        if prefix1 not in lang1:
            continue
        if prefix2 not in lang2:
            continue
        lang1 = lang1[len1:]
        lang2 = lang2[len2:]
        linkDict[lang1] = lang2
    print("%d links in total" % len(linkDict))
    pkl_utils._save(config.ILL_DICT["%s2%s" % (lang, target)], linkDict)
    print("[%s]: generation complete" % time_utils._timestamp())
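# Illustrative helper mirroring the line parsing inside getILL() and
# Article2Template(): an N-Triples row "<subject> <predicate> <object> ." is
# split on whitespace and the angle brackets are stripped with [1:-1]. The
# URIs in the doctest are hypothetical examples, not taken from the code.
def _parse_ntriples_line(line):
    """Return (subject, object) URIs from one N-Triples line.

    >>> _parse_ntriples_line(
    ...     "<http://a/Berlin> <http://b/sameAs> <http://c/Berlin> .")
    ('http://a/Berlin', 'http://c/Berlin')
    """
    row = line.split()
    return row[0][1:-1], row[2][1:-1]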
def Article2Template(lang="en"):
    print("[%s]: generate article2template dict for language %s" % (
        time_utils._timestamp(), lang))
    infile = open(config.ARTICLE_TEMPLATES[lang])
    prefix = config.LANG_PREFIX[lang]
    len_prefix = len(prefix)
    articleDict = {}
    for line in infile.readlines():
        if line[0] != "<":
            continue
        row = line.split()
        article = row[0][1:-1]
        template = row[2][1:-1]
        article = article[len_prefix:]
        template = template[len_prefix:]
        if "/" in template:
            continue
        if article in articleDict:
            articleDict[article].append(template)
        else:
            articleDict[article] = [template, ]
    print("%d articles in total" % len(articleDict))
    pkl_utils._save(config.ARTICLE2TEMPLATE[lang], articleDict)
    print("[%s]: generation complete" % time_utils._timestamp())
def run_tsne_lsa_ngram():
    logname = "generate_feature_tsne_lsa_ngram_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    generators = [TSNE_LSA_Word_Ngram, TSNE_LSA_Char_Ngram]
    # full grid: ngrams_list = [[1,2,3], [2,3,4,5]]; only a subset is run here
    ngrams_list = [[3], [4]]
    obs_fields = ["question1", "question2"]
    for generator, ngrams in zip(generators, ngrams_list):
        for ngram in ngrams:
            param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list,
                                          config.FEAT_DIR, logger, force_corr=True)
            sf.go()
    generators = [TSNE_LSA_Word_Ngram_Pair]
    ngrams = [1, 2]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(["question1"])
    target_fields_list.append(["question2"])
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for ngram in ngrams:
            for generator in generators:
                param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields,
                                            param_list, config.FEAT_DIR, logger,
                                            force_corr=True)
                pf.go()
def main(): logname = "generate_feature_intersect_position_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) generators = [ IntersectPosition_Ngram, IntersectNormPosition_Ngram, ] obs_fields_list = [] target_fields_list = [] ## query in document obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] ) target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"][1:2] ) ## document in query obs_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"][1:2] ) target_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] ) ngrams = [1,2,3,12,123][:3] aggregation_mode = ["mean", "std", "max", "min", "median"] for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: for ngram in ngrams: param_list = [ngram, aggregation_mode] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go()
def run_position():
    logname = "generate_feature_first_last_ngram_position_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    generators = [
        FirstIntersectPosition_Ngram,
        LastIntersectPosition_Ngram,
        FirstIntersectNormPosition_Ngram,
        LastIntersectNormPosition_Ngram,
    ]
    obs_fields_list = [["question1"], ["question2"]]
    target_fields_list = [["question2"], ["question1"]]
    ngrams = [1, 2, 3, 12, 123]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields,
                                            param_list, config.FEAT_DIR, logger)
                pf.go()
def run_lsa_ngram_pair():
    """Symmetric in obs and target"""
    logname = "generate_feature_lsa_ngram_pair_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    generators = [LSA_Word_Ngram_Pair]
    ngrams = [1, 2, 3]
    obs_fields_list = []
    target_fields_list = []
    ## question1 in question2
    obs_fields_list.append(["question1"])
    target_fields_list.append(["question2"])
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for ngram in ngrams:
            for generator in generators:
                param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields,
                                            param_list, config.FEAT_DIR, logger,
                                            force_corr=True)
                pf.go()
                del pf
                gc.collect()
def run_ngram_jaccard():
    logname = "generate_feature_ngram_jaccard_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    generators = [JaccardCoef_Ngram, DiceDistance_Ngram]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(["search_term", "search_term_product_name",
                            "search_term_alt", "search_term_auto_corrected"][:2])
    target_fields_list.append(["product_title", "product_title_product_name",
                               "product_description", "product_attribute",
                               "product_brand", "product_color"])
    ngrams = [1, 2, 3, 12, 123][:3]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields,
                                            param_list, config.FEAT_DIR, logger)
                pf.go()
def run_compression_distance():
    logname = "generate_feature_compression_distance_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(["search_term", "search_term_product_name",
                            "search_term_alt", "search_term_auto_corrected"][:2])
    target_fields_list.append(["product_title", "product_title_product_name",
                               "product_description", "product_attribute",
                               "product_brand", "product_color"])
    # ngrams and the aggregation modes were undefined in the original snippet;
    # the values below follow the same double-aggregation pattern used by
    # run_edit_distance() elsewhere in this section.
    ngrams = [1, 2, 3, 12, 123][:3]
    aggregation_mode_prev = ["mean", "max", "min", "median"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        param_list = []
        pf = PairwiseFeatureWrapper(CompressionDistance, dfAll, obs_fields, target_fields,
                                    param_list, config.FEAT_DIR, logger)
        pf.go()
        for ngram in ngrams:
            param_list = [ngram, aggregation_mode_prev, aggregation_mode]
            pf = PairwiseFeatureWrapper(CompressionDistance_Ngram, dfAll, obs_fields,
                                        target_fields, param_list, config.FEAT_DIR,
                                        logger)
            pf.go()
def main(): logname = "generate_feature_doc2vec_%s.log" % time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) #### NOTE: use data BEFORE STEMMING dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED) doc2vec_model_dirs = [] model_prefixes = [] ## doc2vec model trained with Homedepot dataset: brand/color/obs/title/description doc2vec_model_dirs.append( config.DOC2VEC_MODEL_DIR + "/Homedepot-doc2vec-D%d-min_count%d.model" % (config.EMBEDDING_DIM, config.EMBEDDING_MIN_COUNT)) model_prefixes.append("Homedepot") for doc2vec_model_dir, model_prefix in zip(doc2vec_model_dirs, model_prefixes): ## load model try: if ".bin" in doc2vec_model_dir: doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format( doc2vec_model_dir, binary=True) if ".txt" in doc2vec_model_dir: doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format( doc2vec_model_dir, binary=False) else: doc2vec_model = gensim.models.Doc2Vec.load(doc2vec_model_dir) doc2vec_model_sent_label = pkl_utils._load(doc2vec_model_dir + ".sent_label") except: continue # ## standalone (not used in model building) # obs_fields = ["search_term", "search_term_alt", "product_title", "product_description", "product_attribute"] # generator = Doc2Vec_Vector # param_list = [doc2vec_model, doc2vec_model_sent_label, model_prefix] # sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) # sf.go() ## pairwise generators = [ Doc2Vec_CosineSim, Doc2Vec_RMSE, # Doc2Vec_Vdiff, ] obs_fields_list = [] target_fields_list = [] obs_fields_list.append(["search_term", "search_term_alt"][:1]) target_fields_list.append([ "product_title", "product_description", "product_attribute", "product_brand", "product_color" ]) for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: param_list = [ doc2vec_model, doc2vec_model_sent_label, model_prefix ] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go()
def __init__(self, model_folder, model_list, subm_prefix,
             weight_opt_max_evals=10, w_min=-1., w_max=1.,
             inst_subsample=0.5, inst_subsample_replacement=False,
             inst_splitter=None,
             model_subsample=1.0, model_subsample_replacement=True,
             bagging_size=10, init_top_k=5, epsilon=0.00001,
             multiprocessing=False, multiprocessing_num_cores=1,
             enable_extreme=True, random_seed=0):
    self.model_folder = model_folder
    self.model_list = model_list
    self.subm_prefix = subm_prefix
    self.weight_opt_max_evals = weight_opt_max_evals
    self.w_min = w_min
    self.w_max = w_max
    assert 0 < inst_subsample <= 1.
    self.inst_subsample = inst_subsample
    self.inst_subsample_replacement = inst_subsample_replacement
    self.inst_splitter = inst_splitter
    assert model_subsample > 0
    assert isinstance(model_subsample, int) or (model_subsample <= 1.)
    self.model_subsample = model_subsample
    self.model_subsample_replacement = model_subsample_replacement
    self.bagging_size = bagging_size
    self.init_top_k = init_top_k
    self.epsilon = epsilon
    self.multiprocessing = multiprocessing
    self.multiprocessing_num_cores = multiprocessing_num_cores
    self.enable_extreme = enable_extreme
    self.random_seed = random_seed
    logname = "ensemble_selection_%s.log" % time_utils._timestamp()
    self.logger = logging_utils._get_logger(config.LOG_DIR, logname)
    self.n_models = len(self.model_list)
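# A minimal sketch of constructing the ensemble selector whose __init__ is
# shown above. The class name "ExtremeEnsembleSelection" and the model-folder
# and model-list values are hypothetical placeholders; only the parameter
# names come from the signature above.
# ens = ExtremeEnsembleSelection(
#     model_folder=config.OUTPUT_DIR,      # assumed config attribute
#     model_list=["model_a", "model_b", "model_c"],
#     subm_prefix="ens_subm",
#     bagging_size=10,
#     init_top_k=5,
#     random_seed=2017,
# )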
def main(options): logname = "[Feat@%s]_[Learner@%s]_hyperopt_%s.log"%( options.feature_name, options.learner_name, time_utils._timestamp()) logger = logging_utils._get_logger(config.LOG_DIR, logname) optimizer = TaskOptimizer(options.task_mode, options.learner_name, options.feature_name, logger, options.max_evals, verbose=True, refit_once=options.refit_once) optimizer.run()
def __init__(self, feature_list, feature_name, feature_suffix=".csv",
             feature_level=2, meta_feature_dict={}, corr_threshold=0):
    self.feature_name = feature_name
    self.feature_list = feature_list
    self.feature_suffix = feature_suffix
    self.feature_level = feature_level
    # for meta features
    self.meta_feature_dict = meta_feature_dict
    self.corr_threshold = corr_threshold
    self.feature_names_basic = []
    self.feature_names_cv = []
    self.feature_names = []
    self.has_basic = 1 if self.meta_feature_dict else 0
    logname = "feature_combiner_%s_%s.log" % (feature_name, time_utils._timestamp())
    self.logger = logging_utils._get_logger(config.LOG_DIR, logname)
    if self.feature_level == 2:
        self.splitter = splitter_level2
    elif self.feature_level == 3:
        self.splitter = splitter_level3
    self.n_iter = n_iter  # module-level global
    self.splitter_prev = [0] * self.n_iter
def main(): logname = "generate_feature_match_%s.log" % time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) generators = [ MatchQueryCount, MatchQueryRatio, LongestMatchSize, LongestMatchRatio, ] obs_fields_list = [] target_fields_list = [] ## question1 in question2 obs_fields_list.append(['question1']) target_fields_list.append(['question2']) ## question2 in question1 obs_fields_list.append(['question2']) target_fields_list.append(['question1']) for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: param_list = [] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go()
def run_count():
    logname = "generate_feature_first_last_ngram_count_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    generators = [
        FirstIntersectCount_Ngram,
        LastIntersectCount_Ngram,
        FirstIntersectRatio_Ngram,
        LastIntersectRatio_Ngram,
    ]
    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append(["search_term", "search_term_product_name",
                            "search_term_alt", "search_term_auto_corrected"])
    target_fields_list.append(["product_title", "product_title_product_name",
                               "product_description", "product_attribute",
                               "product_brand", "product_color"])
    ## document in query
    obs_fields_list.append(["product_title", "product_title_product_name",
                            "product_description", "product_attribute",
                            "product_brand", "product_color"])
    target_fields_list.append(["search_term", "search_term_product_name",
                               "search_term_alt", "search_term_auto_corrected"])
    ngrams = [1, 2, 3, 12, 123][:3]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields,
                                            param_list, config.FEAT_DIR, logger)
                pf.go()
def run_tfidf_ngram_cosinesim():
    """Symmetric in obs and target"""
    logname = "generate_feature_tfidf_ngram_cosinesim_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    generators = [TFIDF_Word_Ngram_CosineSim, TFIDF_Char_Ngram_CosineSim]
    ngrams_list = [[2, 3], [4]]
    obs_fields_list = [["question1"]]
    target_fields_list = [["question2"]]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator, ngrams in zip(generators, ngrams_list):
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields,
                                            param_list, config.FEAT_DIR, logger,
                                            force_corr=True)
                pf.go()
                del pf
                gc.collect()
def main():
    # source domain
    print("load svhn")
    svhn_images_train, _ = DataLoader.load_svhn(SVHN_DIR, "train_32x32.mat")
    svhn_images_test, svhn_labels_test = DataLoader.load_svhn(SVHN_DIR, "test_32x32.mat")
    svhn_images_extra, svhn_labels_extra = DataLoader.load_svhn(SVHN_DIR, "extra_32x32.mat")
    auxiliary_data = {
        "X_train": svhn_images_extra,
        "y_train": svhn_labels_extra,
        "X_test": svhn_images_test,
        "y_test": svhn_labels_test,
    }
    # target domain
    print("load mnist")
    if not os.path.isfile(os.path.join(MNIST_DIR, "train.pkl")):
        DataLoader.prepare_mnist(MNIST_DIR, "train")
    mnist_images_train, _ = DataLoader.load_mnist(MNIST_DIR, "train")
    # dtn model
    print("init dtn")
    os_utils._makedirs(params["summary_dir"], force=True)
    os_utils._makedirs(params["log_dir"])
    logger = log_utils._get_logger(params["log_dir"], "tf-%s.log" % time_utils._timestamp())
    model = DomainTransferNet(params, logger)
    print("fit dtn")
    model.fit(auxiliary_data, Xs_train=svhn_images_train, Xt_train=mnist_images_train)
    print("evaluate dtn")
    model.evaluate(Xs=svhn_images_train, sample_batch=100, batch_size=100,
                   sample_dir=SAMPLE_DIR)
def run_count():
    logname = "generate_feature_first_last_ngram_count_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    generators = [
        FirstIntersectCount_Ngram,
        LastIntersectCount_Ngram,
        FirstIntersectRatio_Ngram,
        LastIntersectRatio_Ngram,
    ]
    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append(["question1"])
    target_fields_list.append(["question2"])
    ## document in query
    obs_fields_list.append(["question2"])
    target_fields_list.append(["question1"])
    ngrams = [1, 2, 3, 12, 123][:3]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields,
                                            param_list, config.FEAT_DIR, logger)
                pf.go()
def main(): logname = "generate_feature_wordnet_similarity_%s.log" % time_utils._timestamp( ) logger = logging_utils._get_logger(config.LOG_DIR, logname) #### NOTE: use data BEFORE STEMMING dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED) # WordNet_Lch_Similarity and WordNet_Wup_Similarity are not used in final submission generators = [ WordNet_Path_Similarity, WordNet_Lch_Similarity, WordNet_Wup_Similarity, ][:1] obs_fields_list = [] target_fields_list = [] # only search_term and product_title are used in final submission obs_fields_list.append(["question1"]) target_fields_list.append(["question2"]) # double aggregation aggregation_mode_prev = ["mean", "max", "min", "median"] aggregation_mode = ["mean", "std", "max", "min", "median"] for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: param_list = [aggregation_mode_prev, aggregation_mode] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go()
def run_tsne_lsa_ngram():
    logname = "generate_feature_tsne_lsa_ngram_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)
    generators = [TSNE_LSA_Word_Ngram, TSNE_LSA_Char_Ngram]
    # full grid: ngrams_list = [[1,2,3], [2,3,4,5]]; only a subset is run here
    ngrams_list = [[1, 2, 3], [4]]
    obs_fields = ["search_term", "search_term_alt", "search_term_auto_corrected",
                  "product_title", "product_description"]
    for generator, ngrams in zip(generators, ngrams_list):
        for ngram in ngrams:
            param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list,
                                          config.FEAT_DIR, logger, force_corr=True)
            sf.go()
    generators = [TSNE_LSA_Word_Ngram_Pair]
    ngrams = [1, 2]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(["search_term", "search_term_alt",
                            "search_term_auto_corrected"])
    target_fields_list.append(["product_title", "product_description"])
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for ngram in ngrams:
            for generator in generators:
                param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields,
                                            param_list, config.FEAT_DIR, logger,
                                            force_corr=True)
                pf.go()
def main(): logname = "generate_feature_intersect_position_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) generators = [ IntersectPosition_Ngram, IntersectNormPosition_Ngram, ] obs_fields_list = [] target_fields_list = [] ## query in document obs_fields_list.append( ["question1"] ) target_fields_list.append( ["question2"] ) ## document in query obs_fields_list.append( ["question2"] ) target_fields_list.append( ["question1"] ) ngrams = [1,2,3,12,123][:3] aggregation_mode = ["mean", "std", "max", "min", "median"] for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: for ngram in ngrams: param_list = [ngram, aggregation_mode] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go()
def run_lsa_ngram_cooc():
    logname = "generate_feature_lsa_ngram_cooc_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)
    generators = [LSA_Word_Ngram_Cooc]
    obs_ngrams = [1, 2]
    target_ngrams = [1, 2]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(["search_term", "search_term_alt",
                            "search_term_auto_corrected"][:1])
    target_fields_list.append(["product_title", "product_description"][:1])
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for obs_ngram in obs_ngrams:
            for target_ngram in target_ngrams:
                for generator in generators:
                    param_list = [obs_ngram, target_ngram, config.SVD_DIM,
                                  config.SVD_N_ITER]
                    pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                                target_fields, param_list,
                                                config.FEAT_DIR, logger)
                    pf.go()
def main(): logname = "generate_feature_group_relevance_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) dfTrain = dfAll.iloc[:TRAIN_SIZE].copy() ## run python3 splitter.py first split = pkl_utils._load("%s/splits_level1.pkl"%config.SPLIT_DIR) n_iter = len(split) ## for cv for i in range(n_iter): trainInd, validInd = split[i][0], split[i][1] dfTrain2 = dfTrain.iloc[trainInd].copy() sub_feature_dir = "%s/Run%d" % (config.FEAT_DIR, i+1) obs_fields = ["search_term", "product_title"][1:] aggregation_mode = ["mean", "std", "max", "min", "median", "size"] param_list = [dfAll["id"], dfTrain2, aggregation_mode] sf = StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields, param_list, sub_feature_dir, logger) sf.go() ## for all sub_feature_dir = "%s/All" % (config.FEAT_DIR) obs_fields = ["search_term", "product_title"][1:] aggregation_mode = ["mean", "std", "max", "min", "median", "size"] param_list = [dfAll["id"], dfTrain, aggregation_mode] sf = StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields, param_list, sub_feature_dir, logger) sf.go()
def main(): logname = "generate_feature_group_relevance_%s.log" % time_utils._timestamp( ) logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) dfTrain = dfAll.iloc[:TRAIN_SIZE].copy() ## run python3 splitter.py first split = pkl_utils._load("%s/splits_level1.pkl" % config.SPLIT_DIR) n_iter = len(split) ## for cv for i in range(n_iter): trainInd, validInd = split[i][0], split[i][1] dfTrain2 = dfTrain.iloc[trainInd].copy() sub_feature_dir = "%s/Run%d" % (config.FEAT_DIR, i + 1) obs_fields = ["search_term", "product_title"][1:] aggregation_mode = ["mean", "std", "max", "min", "median", "size"] param_list = [dfAll["id"], dfTrain2, aggregation_mode] sf = StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields, param_list, sub_feature_dir, logger) sf.go() ## for all sub_feature_dir = "%s/All" % (config.FEAT_DIR) obs_fields = ["search_term", "product_title"][1:] aggregation_mode = ["mean", "std", "max", "min", "median", "size"] param_list = [dfAll["id"], dfTrain, aggregation_mode] sf = StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields, param_list, sub_feature_dir, logger) sf.go()
def parse_args(parser): parser.add_option("-d", "--dim", default=1, type=int, dest="lsa_columns", help="lsa_columns") parser.add_option("-o", "--outfile", default="feature_conf_%s.py"%time_utils._timestamp(), type="string", dest="outfile", help="outfile") (options, args) = parser.parse_args() return options, args
def main(options):
    logname = "[Feat@%s]_[Learner@%s]_hyperopt_%s.log" % (
        options.feature_name, options.learner_name, time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    optimizer = TaskOptimizer(options.task_mode, options.learner_name,
                              options.feature_name, logger, options.max_evals,
                              verbose=True, refit_once=options.refit_once,
                              plot_importance=options.plot_importance)
    optimizer.run()
def main(): logname = "generate_feature_intersect_count_%s.log" % time_utils._timestamp( ) logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) # Ngram generators = [ IntersectCount_Ngram, IntersectRatio_Ngram, ] obs_fields_list = [['question1'], ['question2']] target_fields_list = [['question2'], ['question1']] ngrams = [1, 2, 3, 4, 5, 12, 123] # only 1,2,3,4,5,12,123 available, see ngram_utils.py for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: for ngram in ngrams: param_list = [ngram] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go() # Ngram symmetric generators = [ CooccurrenceCount_Ngram, CooccurrenceRatio_Ngram, #CooccurrenceCount_Nterm, # not used in Quora project, takes long to run #CooccurrenceRatio_Nterm, ] obs_fields_list = [['question1']] target_fields_list = [['question2']] ngrams = [1, 2, 3, 4, 5, 12, 123] # only 1,2,3,4,5,12,123 available, see ngram_utils.py nterms = [ 2, 3, 4 ] # only 1,2,3,4 available,(uniterms is the same as unigrams) see ngram_utils.py for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: if generator.__name__[-5:] == 'Ngram': for ngram in ngrams: param_list = [ngram] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go() elif generator.__name__[-5:] == 'Nterm': for nterm in nterms: param_list = [nterm] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go() else: print("Wrong Generator") pass
def parse_args(parser): parser.add_option("-l", "--level", default=2, type="int", dest="level", help="level") parser.add_option("-t", "--top", default=10, type="int", dest="topN", help="top-N") parser.add_option("-o", "--outfile", default="stacking_feature_conf_%s.py"%time_utils._timestamp(), type="string", dest="outfile", help="outfile") (options, args) = parser.parse_args() return options, args
def main(which): logname = "generate_feature_stat_cooc_tfidf_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) generators = [] for w in which.split(","): if w == "tf": generators.append( StatCoocTF_Ngram ) elif w == "norm_tf": generators.append( StatCoocNormTF_Ngram ) elif w == "tfidf": generators.append( StatCoocTFIDF_Ngram ) elif w == "norm_tfidf": generators.append( StatCoocNormTFIDF_Ngram ) elif w == "bm25": generators.append( StatCoocBM25_Ngram ) obs_fields_list = [] target_fields_list = [] ## query in document obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] ) target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] ) ## document in query obs_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] ) target_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] ) ngrams = [1,2,3,12,123][:3] aggregation_mode = ["mean", "std", "max", "min", "median"] for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: for ngram in ngrams: param_list = [ngram, aggregation_mode] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go() obs_fields_list = [] target_fields_list = [] ## query in document obs_fields_list.append( ["search_term_product_name"] ) target_fields_list.append( ["product_title_product_name"] ) ngrams = [1,2] aggregation_mode = ["mean", "std", "max", "min", "median"] for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: for ngram in ngrams: if ngram == 2: # since product_name is of length 2, it makes no difference # for various aggregation as there is only one item param_list = [ngram, "mean"] else: param_list = [ngram, aggregation_mode] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go()
def main(which): logname = "generate_feature_stat_cooc_tfidf_%s_%s.log"%(which, time_utils._timestamp()) logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) generators = [] if which == "tf": generators.append( StatCoocTF_Ngram ) elif which == "norm_tf": generators.append( StatCoocNormTF_Ngram ) elif which == "tfidf": generators.append( StatCoocTFIDF_Ngram ) elif which == "norm_tfidf": generators.append( StatCoocNormTFIDF_Ngram ) elif which == "bm25": generators.append( StatCoocBM25_Ngram ) obs_fields_list = [] target_fields_list = [] ## query in document obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"][:1] ) target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] ) ## document in query obs_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] ) target_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"][:1] ) ngrams = [1,2,3,12,123][:3] aggregation_mode = ["mean", "std", "max", "min", "median"] for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: for ngram in ngrams: param_list = [ngram, aggregation_mode] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go() obs_fields_list = [] target_fields_list = [] ## query in document obs_fields_list.append( ["search_term_product_name"] ) target_fields_list.append( ["product_title_product_name"] ) ngrams = [1,2] aggregation_mode = ["mean", "std", "max", "min", "median"] for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: for ngram in ngrams: if ngram == 2: # since product_name is of length 2, it makes no difference # for various aggregation as there is only one item param_list = [ngram, "mean"] else: param_list = [ngram, aggregation_mode] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go()
def __init__(self, feature_dict, feature_name, feature_suffix=".pkl", corr_threshold=0):
    self.feature_name = feature_name
    self.feature_dict = feature_dict
    self.feature_suffix = feature_suffix
    self.corr_threshold = corr_threshold
    self.feature_names_basic = []
    self.feature_names_cv = []
    self.basic_only = 0
    logname = "feature_combiner_%s_%s.log" % (feature_name, time_utils._timestamp())
    self.logger = logging_utils._get_logger(config.LOG_DIR, logname)
    self.splitter = splitter_level1
    self.n_iter = n_iter  # module-level global
def main(): logname = "generate_feature_group_distance_%s.log" % time_utils._timestamp( ) logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) dfTrain = dfAll.iloc[:TRAIN_SIZE].copy() ## run python3 splitter.py first split = pkl_utils._load("%s/splits_level1.pkl" % config.SPLIT_DIR) n_iter = len(split) relevances_complete = [ 1, 1.25, 1.33, 1.5, 1.67, 1.75, 2, 2.25, 2.33, 2.5, 2.67, 2.75, 3 ] relevances = [1, 1.33, 1.67, 2, 2.33, 2.67, 3] ngrams = [1] obs_fields = ["search_term"] target_fields = ["product_title", "product_description"] aggregation_mode = ["mean", "std", "max", "min", "median"] ## for cv for i in range(n_iter): trainInd, validInd = split[i][0], split[i][1] dfTrain2 = dfTrain.iloc[trainInd].copy() sub_feature_dir = "%s/Run%d" % (config.FEAT_DIR, i + 1) for target_field in target_fields: for relevance in relevances: for ngram in ngrams: param_list = [ dfAll["id"], dfTrain2, target_field, relevance, ngram, aggregation_mode ] pf = PairwiseFeatureWrapper(GroupRelevance_Ngram_Jaccard, dfAll, obs_fields, [target_field], param_list, sub_feature_dir, logger) pf.go() ## for all sub_feature_dir = "%s/All" % (config.FEAT_DIR) for target_field in target_fields: for relevance in relevances: for ngram in ngrams: param_list = [ dfAll["id"], dfTrain, target_field, relevance, ngram, aggregation_mode ] pf = PairwiseFeatureWrapper(GroupRelevance_Ngram_Jaccard, dfAll, obs_fields, [target_field], param_list, sub_feature_dir, logger) pf.go()
def main(options): os_utils._makedirs("../logs") os_utils._makedirs("../output") os_utils._makedirs(params["offline_model_dir"]) os_utils._makedirs(params["pb_model_dir"]) logger = log_utils._get_logger("../logs", "tf-%s.log" % time_utils._timestamp()) params["granularity"] = options.granularity # save path model_name = "augmentation_%s_%s_%s"%(str(options.augmentation), options.granularity, options.model) path = config.SUB_DIR + "/" + model_name os_utils._makedirs(path) # load data X_dev, X_valid, Q, X_itest= get_train_valid_test_data(options.augmentation) # validation model = get_model(options.model)(params, logger, init_embedding_matrix=init_embedding_matrix) if os.path.exists(params["offline_model_dir"] + "/checkpoint"): print('restoring model.......') model.restore_session() train_model = True if train_model: print('training model...') model.fit(X_dev, Q, validation_data=X_valid, shuffle=True) print('ready to save model....') model.save_session() print('model save done!') y_pred_itest = model.predict_proba(X_itest, Q).flatten() #print('build saving.....') if not os.path.exists(params["pb_model_dir"]+'/1'): build_model.build_save(model,str(1),params["pb_model_dir"]) #acu assert(len(y_pred_itest)==len(X_itest["label"])) print(len(y_pred_itest)) print(len(X_itest["label"])) count = 0 for i in range(len(y_pred_itest)): score = y_pred_itest[i] if score > 0.5: prob = 1 else: prob = 0 if prob == X_itest["label"][i]: count += 1 print(count/len(y_pred_itest)) # save for stacking df = pd.DataFrame({"y_pred": y_pred_itest, "y_true": X_itest["label"]}) df.to_csv(path + "/valid.csv", index=False, header=True) input('wait') print('save done!')
def getExistingMapping(lang="en"):
    print("[%s]: parse existing mapping for language %s" % (time_utils._timestamp(), lang))
    G = g.Graph()
    G.parse(config.EXISTING_MAPPING[lang], format="n3")
    q = '''
        PREFIX rr: <http://www.w3.org/ns/r2rml#>
        SELECT ?template ?class
        WHERE {
            ?template rr:subjectMap ?mapping .
            ?mapping rr:class ?class .
        }
    '''
    results = G.query(q)
    mapping = [row[0] for row in results]
    ontology = [row[1] for row in results]
    df = pd.DataFrame({"mapping": mapping, "ontology": ontology})
    # x[47:] strips the fixed-length URI prefix in front of the template name
    df["template"] = df["mapping"].apply(lambda x: config.TEMPLATE_NAME[lang] + x[47:])
    df.to_csv(config.EXISTING_MAPPING_OUTPUT[lang], index=False)
    print("[%s]: parsing complete" % time_utils._timestamp())
def run_compression_distance():
    logname = "generate_feature_compression_distance_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(["search_term", "search_term_product_name",
                            "search_term_alt", "search_term_auto_corrected"])
    target_fields_list.append(["product_title", "product_title_product_name",
                               "product_description", "product_attribute",
                               "product_brand", "product_color"])
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        param_list = []
        pf = PairwiseFeatureWrapper(CompressionDistance, dfAll, obs_fields, target_fields,
                                    param_list, config.FEAT_DIR, logger)
        pf.go()
def main(): logname = "generate_feature_basic_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) ## basic generators = [DocId, DocLen, DocFreq, DocEntropy, DigitCount, DigitRatio] obs_fields = ["search_term", "product_title", "product_description", "product_attribute", "product_brand", "product_color"] for generator in generators: param_list = [] sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) sf.go() ## for product_uid generators = [DocIdEcho, DocFreq, ProductUidDummy1, ProductUidDummy2, ProductUidDummy3] obs_fields = ["product_uid"] for generator in generators: param_list = [] sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) sf.go() ## unique count generators = [UniqueCount_Ngram, UniqueRatio_Ngram] obs_fields = ["search_term", "product_title", "product_description", "product_attribute", "product_brand", "product_color"] ngrams = [1,2,3] for generator in generators: for ngram in ngrams: param_list = [ngram] sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) sf.go() ## for product_attribute_list generators = [ AttrCount, AttrBulletCount, AttrBulletRatio, AttrNonBulletCount, AttrNonBulletRatio, AttrHasProductHeight, AttrHasProductWidth, AttrHasProductLength, AttrHasProductDepth, AttrHasIndoorOutdoor, ] obs_fields = ["product_attribute_list"] for generator in generators: param_list = [] sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) sf.go()
def main(): logname = "generate_feature_basic_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) ## basic generators = [DocId, DocLen, DocFreq, DocEntropy, DigitCount, DigitRatio] #DocIdOneHot not used obs_fields = ["question1", "question2"] for generator in generators: param_list = [] sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) sf.go() ## id generators = [DocIdEcho] obs_fields = ["id"] for generator in generators: param_list = [] sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) sf.go() ## qid generators = [MaxValue, DiffValue] obs_fields_list = [['qid1']] target_fields_list = [['qid2']] for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: param_list = [] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go() ## DocLenRatio generators = [DocLenRatio] obs_fields_list = [['question1']] target_fields_list = [['question2']] for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: param_list = [] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go() ## unique count generators = [UniqueCount_Ngram, UniqueRatio_Ngram] obs_fields = ["question1", "question2"] ngrams = [1, 2, 3, 4, 5, 12, 123] for generator in generators: for ngram in ngrams: param_list = [ngram] sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) sf.go()
def main(conf, learner_name, exp_name):
    task_mode = exp_name
    feature_name = conf.name
    max_evals = 10
    refit_once = True
    logname = "%s_[Feat@%s]_[Learner@%s]_hyperopt_%s.log" % (
        exp_name, feature_name, learner_name, time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    optimizer = TaskOptimizer(task_mode, learner_name, conf, logger, max_evals,
                              verbose=True, refit_once=refit_once,
                              plot_importance=False)
    optimizer.run()
def run_lsa_ngram():
    logname = "generate_feature_lsa_ngram_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    generators = [LSA_Word_Ngram, LSA_Char_Ngram]
    # full grid: ngrams_list = [[1,2,3], [2,3,4,5]]; only a subset is run here
    ngrams_list = [[3], [4]]
    obs_fields = ["question1", "question2"]
    for generator, ngrams in zip(generators, ngrams_list):
        for ngram in ngrams:
            param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list,
                                          config.FEAT_DIR, logger)
            sf.go()
def parse_args(parser): parser.add_option("-l", "--level", default=1, type="int", dest="feature_level", help="feature level, e.g., 1, 2, 3") parser.add_option("-c", "--config", default="feature_conf", type="string", dest="feature_conf", help="feature config name") parser.add_option("-n", "--name", default="basic%s"%time_utils._timestamp(), type="string", dest="feature_name", help="feature name") parser.add_option("-s", "--suffix", default=".pkl", type="string", dest="feature_suffix", help="feature suffix") parser.add_option("-m", "--meta_config", default="feature_conf_meta", type="string", dest="feature_conf_meta", help="meta feature config name") parser.add_option("-t", "--threshold", default=0.0, type="float", dest="corr_threshold", help="correlation threshold for dropping features") (options, args) = parser.parse_args() return options, args
def main(): print "[%s]: generate ontology hierarchy tree" % (time_utils._timestamp()) G = g.Graph() G.parse(config.ONTOLOGY, format="n3") q = ''' PREFIX rr: <http://www.w3.org/2000/01/rdf-schema#> SELECT ?child ?parent WHERE { ?child rr:subClassOf ?parent . }''' results = G.query(q) ontologyDict = {} for row in results: child = str(row[0]) parent = str(row[1]) if parent in ontologyDict: ontologyDict[parent].append(child) else: ontologyDict[parent] = [child,] pkl_utils._save(config.ONTOLOGY_TREE, ontologyDict) print "[%s]: generation complete" % time_utils._timestamp()
def select_feature(conf, learner_name, exp_name):
    task_mode = exp_name
    feature_name = conf.name
    max_evals = 20
    refit_once = True
    logname = "%s_[Feat@%s]_[Learner@%s]_hyperopt_%s.log" % (
        exp_name, feature_name, learner_name, time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    optimizer = TaskOptimizer(task_mode, learner_name, conf, logger, max_evals,
                              verbose=True, refit_once=refit_once,
                              plot_importance=False)
    given_predictors = ["0_ip", "0_app", "0_device", "0_os", "0_channel",
                        "0_day", "0_hour", "0_next_click", "0_next_click_shift"]
    optimizer.select_features(given_predictors)
def run_char_dist_sim():
    logname = "generate_feature_char_dist_sim_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)
    generators = [CharDistribution_Ratio, CharDistribution_CosineSim,
                  CharDistribution_KL]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(["search_term", "search_term_alt",
                            "search_term_auto_corrected"])
    target_fields_list.append(["product_title", "product_description",
                               "product_attribute"])
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            param_list = []
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields,
                                        param_list, config.FEAT_DIR, logger)
            pf.go()
def main(): logname = "generate_feature_doc2vec_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) #### NOTE: use data BEFORE STEMMING dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED) doc2vec_model_dirs = [] model_prefixes = [] ## doc2vec model trained with Homedepot dataset: brand/color/obs/title/description doc2vec_model_dirs.append( config.DOC2VEC_MODEL_DIR + "/Homedepot-doc2vec-D%d-min_count%d.model"%(config.EMBEDDING_DIM, config.EMBEDDING_MIN_COUNT) ) model_prefixes.append( "Homedepot" ) for doc2vec_model_dir, model_prefix in zip(doc2vec_model_dirs, model_prefixes): ## load model try: if ".bin" in doc2vec_model_dir: doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(doc2vec_model_dir, binary=True) if ".txt" in doc2vec_model_dir: doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(doc2vec_model_dir, binary=False) else: doc2vec_model = gensim.models.Doc2Vec.load(doc2vec_model_dir) doc2vec_model_sent_label = pkl_utils._load(doc2vec_model_dir+".sent_label") except: continue # ## standalone (not used in model building) # obs_fields = ["search_term", "search_term_alt", "product_title", "product_description", "product_attribute"] # generator = Doc2Vec_Vector # param_list = [doc2vec_model, doc2vec_model_sent_label, model_prefix] # sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) # sf.go() ## pairwise generators = [ Doc2Vec_CosineSim, Doc2Vec_RMSE, Doc2Vec_Vdiff, ] obs_fields_list = [] target_fields_list = [] obs_fields_list.append( ["search_term", "search_term_alt"] ) target_fields_list.append( ["product_title", "product_description", "product_attribute", "product_brand", "product_color"] ) for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: param_list = [doc2vec_model, doc2vec_model_sent_label, model_prefix] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go()
def run_edit_distance():
    logname = "generate_feature_edit_distance_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(["search_term", "search_term_product_name",
                            "search_term_alt", "search_term_auto_corrected"][1:2])
    target_fields_list.append(["product_title", "product_title_product_name",
                               "product_description", "product_attribute",
                               "product_brand", "product_color"])
    ngrams = [1, 2, 3, 12, 123][:3]
    aggregation_mode_prev = ["mean", "max", "min", "median"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        param_list = []
        pf = PairwiseFeatureWrapper(EditDistance, dfAll, obs_fields, target_fields,
                                    param_list, config.FEAT_DIR, logger)
        pf.go()
        for ngram in ngrams:
            param_list = [ngram, aggregation_mode_prev, aggregation_mode]
            pf = PairwiseFeatureWrapper(EditDistance_Ngram, dfAll, obs_fields,
                                        target_fields, param_list, config.FEAT_DIR,
                                        logger)
            pf.go()
def main(): logname = "generate_feature_query_quality_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) obs_corpus = [] query_suffix = [] # raw dfAll = pkl_utils._load(config.ALL_DATA_RAW) obs_corpus.append(dfAll["search_term"].values) query_suffix.append("raw") # after processing dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED) obs_corpus.append(dfAll["search_term"].values) query_suffix.append("lemmatized") # after extracting product_name in search_term obs_corpus.append(dfAll["search_term_product_name"].values) query_suffix.append("product_name") if "search_term_auto_corrected" in dfAll.columns: # after auto correction obs_corpus.append(dfAll["search_term_auto_corrected"].values) query_suffix.append("corrected") # after stemming dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) obs_corpus.append(dfAll["search_term"].values) query_suffix.append("stemmed") y_train = dfAll["relevance"].values[:TRAIN_SIZE] for i in range(len(query_suffix)-1): for j in range(i+1, len(query_suffix)): ext = QueryQuality(obs_corpus[i], obs_corpus[j]) x = ext.transform() dim = 1 fname = "%s_%s_x_%s_%dD"%(ext._get_feat_name(), query_suffix[i], query_suffix[j], dim) pkl_utils._save(os.path.join(config.FEAT_DIR, fname+config.FEAT_FILE_SUFFIX), x) corr = np_utils._corr(x[:TRAIN_SIZE], y_train) logger.info("%s (%dD): corr = %.6f"%(fname, dim, corr)) # raw dfAll = pkl_utils._load(config.ALL_DATA_RAW) obs_fields = ["search_term"] param_list = [] sf = StandaloneFeatureWrapper(IsInGoogleDict, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) sf.go()
def run_tfidf_ngram_cosinesim():
    logname = "generate_feature_tfidf_ngram_cosinesim_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)
    generators = [TFIDF_Word_Ngram_CosineSim, TFIDF_Char_Ngram_CosineSim]
    # full grid: ngrams_list = [[1,2,3], [2,3,4,5]]; only a subset is run here
    ngrams_list = [[1, 2, 3], [4]]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(["search_term", "search_term_alt",
                            "search_term_auto_corrected"])
    target_fields_list.append(["product_title", "product_description",
                               "product_attribute"])
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator, ngrams in zip(generators, ngrams_list):
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields,
                                            param_list, config.FEAT_DIR, logger)
                pf.go()
def run_lsa_ngram_cooc():
    logname = "generate_feature_lsa_ngram_cooc_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)
    generators = [LSA_Word_Ngram_Cooc]
    obs_ngrams = [1, 2]
    target_ngrams = [1, 2]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(["search_term", "search_term_alt",
                            "search_term_auto_corrected"])
    target_fields_list.append(["product_title", "product_description"])
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for obs_ngram in obs_ngrams:
            for target_ngram in target_ngrams:
                for generator in generators:
                    param_list = [obs_ngram, target_ngram, config.SVD_DIM,
                                  config.SVD_N_ITER]
                    pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                                target_fields, param_list,
                                                config.FEAT_DIR, logger)
                    pf.go()
def main(): logname = "generate_feature_group_distance_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) dfTrain = dfAll.iloc[:TRAIN_SIZE].copy() ## run python3 splitter.py first split = pkl_utils._load("%s/splits_level1.pkl"%config.SPLIT_DIR) n_iter = len(split) relevances_complete = [1, 1.25, 1.33, 1.5, 1.67, 1.75, 2, 2.25, 2.33, 2.5, 2.67, 2.75, 3] relevances = [1, 1.33, 1.67, 2, 2.33, 2.67, 3] ngrams = [1] obs_fields = ["search_term"] target_fields = ["product_title", "product_description"] aggregation_mode = ["mean", "std", "max", "min", "median"] ## for cv for i in range(n_iter): trainInd, validInd = split[i][0], split[i][1] dfTrain2 = dfTrain.iloc[trainInd].copy() sub_feature_dir = "%s/Run%d" % (config.FEAT_DIR, i+1) for target_field in target_fields: for relevance in relevances: for ngram in ngrams: param_list = [dfAll["id"], dfTrain2, target_field, relevance, ngram, aggregation_mode] pf = PairwiseFeatureWrapper(GroupRelevance_Ngram_Jaccard, dfAll, obs_fields, [target_field], param_list, sub_feature_dir, logger) pf.go() ## for all sub_feature_dir = "%s/All" % (config.FEAT_DIR) for target_field in target_fields: for relevance in relevances: for ngram in ngrams: param_list = [dfAll["id"], dfTrain, target_field, relevance, ngram, aggregation_mode] pf = PairwiseFeatureWrapper(GroupRelevance_Ngram_Jaccard, dfAll, obs_fields, [target_field], param_list, sub_feature_dir, logger) pf.go()
def main(): logname = "generate_feature_wordnet_similarity_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) #### NOTE: use data BEFORE STEMMING dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED) generators = [ WordNet_Path_Similarity, WordNet_Lch_Similarity, WordNet_Wup_Similarity, ] obs_fields_list = [] target_fields_list = [] obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] ) target_fields_list.append( ["product_title", "product_description", "product_attribute"] ) # double aggregation aggregation_mode_prev = ["mean", "max", "min", "median"] aggregation_mode = ["mean", "std", "max", "min", "median"] for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: param_list = [aggregation_mode_prev, aggregation_mode] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go()
def main(): logname = "generate_feature_match_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) generators = [ MatchQueryCount, MatchQueryRatio, LongestMatchSize, LongestMatchRatio ] obs_fields_list = [] target_fields_list = [] obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] ) target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] ) for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: param_list = [] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go() # product_attribute_list generators = [ MatchAttrCount, MatchAttrRatio, IsIndoorOutdoorMatch, ] obs_fields_list = [] target_fields_list = [] obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] ) target_fields_list.append( ["product_attribute_list"] ) for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: param_list = [] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go()
#------------------- Process Attributes -------------------
def _split_attr_to_text(text):
    attrs = text.split(config.ATTR_SEPARATOR)
    return " ".join(attrs)

def _split_attr_to_list(text):
    attrs = text.split(config.ATTR_SEPARATOR)
    if len(attrs) == 1:
        # missing
        return [[attrs[0], attrs[0]]]
    else:
        return [[n, v] for n, v in zip(attrs[::2], attrs[1::2])]

#-------------------------- Main --------------------------
now = time_utils._timestamp()

def main():
    ###########
    ## Setup ##
    ###########
    logname = "data_processor_%s.log" % now
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    # put product_attribute_list, product_attribute and product_description first
    # as they are quite time consuming to process
    columns_to_proc = [
        # # product_attribute_list is very time consuming to process
        # # so we just process product_attribute which is of the form
        # # attr_name1 | attr_value1 | attr_name2 | attr_value2 | ...