def main(): logname = "generate_feature_group_relevance_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) dfTrain = dfAll.iloc[:TRAIN_SIZE].copy() ## run python3 splitter.py first split = pkl_utils._load("%s/splits_level1.pkl"%config.SPLIT_DIR) n_iter = len(split) ## for cv for i in range(n_iter): trainInd, validInd = split[i][0], split[i][1] dfTrain2 = dfTrain.iloc[trainInd].copy() sub_feature_dir = "%s/Run%d" % (config.FEAT_DIR, i+1) obs_fields = ["search_term", "product_title"][1:] aggregation_mode = ["mean", "std", "max", "min", "median", "size"] param_list = [dfAll["id"], dfTrain2, aggregation_mode] sf = StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields, param_list, sub_feature_dir, logger) sf.go() ## for all sub_feature_dir = "%s/All" % (config.FEAT_DIR) obs_fields = ["search_term", "product_title"][1:] aggregation_mode = ["mean", "std", "max", "min", "median", "size"] param_list = [dfAll["id"], dfTrain, aggregation_mode] sf = StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields, param_list, sub_feature_dir, logger) sf.go()
def process(lang, pivot):
    print("[%s]: process for language %s" % (time_utils._timestamp(), lang))
    linkDict = pkl_utils._load(config.ILL_DICT["%s2%s" % (lang, pivot)])
    templateDict = pkl_utils._load(config.TEMPLATE2ARTICLE[lang])
    articleDict = pkl_utils._load(config.ARTICLE2TEMPLATE[pivot])
    mapping = pd.read_csv(config.EXISTING_MAPPING_OUTPUT[pivot], index_col="template")

    template1 = []; template2 = []
    article1 = []; article2 = []
    ontology = []
    for template in templateDict:
        articles = templateDict[template]
        for article in articles:
            if article in linkDict:
                tmp = linkDict[article]
                template1.append(template)
                article1.append(article)
                article2.append(tmp)
                if tmp in articleDict:
                    templateList = articleDict[tmp]
                else:
                    templateList = []
                c = ""
                t = ""
                for Template in templateList:
                    if Template in mapping.index:
                        c = mapping.at[Template, "ontology"]
                        t = Template
                template2.append(t)
                ontology.append(c)

    data = {"template1": template1, "article1": article1, "template2": template2,
            "article2": article2, "ontology": ontology}
    df = pd.DataFrame(data)
    df.to_csv(config.ENTITY_MATRIX["%s2%s" % (lang, pivot)], index=False)
    print("[%s]: processing complete" % time_utils._timestamp())
def main(options):
    lang = options.lang
    p = options.parse
    t = options.train
    ncomp = options.ncomp
    me = options.me
    fin = options.fin
    fout = options.fout
    if p:
        parse(lang)
    if t:
        cmd = "python run_hole.py --fin %s --fout %s --test-all 50 --nb 100 --me %d --margin 0.2 --lr 0.1 --ncomp %d" % (lang, config.HOLE_OUTPUT[lang], me, ncomp)
        os.system(cmd)

    hole = pkl_utils._load(config.HOLE_OUTPUT[lang])
    data_dict = pkl_utils._load(config.DATA_DICT[lang])
    model = hole["model"]
    entityDict = {y: x for x, y in enumerate(data_dict["entities"])}
    predicateDict = {y: x for x, y in enumerate(data_dict["relations"])}

    df = pd.read_csv(fin, names=["s", "p", "o"])
    df["s"] = df["s"].map(entityDict)
    df["p"] = df["p"].map(predicateDict)
    df["o"] = df["o"].map(entityDict)
    scores = model._scores(list(df["s"]), list(df["p"]), list(df["o"]))
    pd.DataFrame(scores).to_csv(fout, index=False, header=False)
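# --- Hypothetical entry point (assumption, not part of the original code) ---
# main(options) above only needs an object exposing .lang/.parse/.train/.ncomp/.me/.fin/.fout;
# an argparse wrapper such as the following sketch would satisfy that contract.
import argparse

def _parse_args():
    parser = argparse.ArgumentParser(description="Score (s, p, o) triples with a trained HolE model")
    parser.add_argument("--lang", default="en", help="language code")
    parser.add_argument("--parse", action="store_true", help="parse the raw data first")
    parser.add_argument("--train", action="store_true", help="(re)train the HolE model")
    parser.add_argument("--ncomp", type=int, default=100, help="embedding dimensionality")
    parser.add_argument("--me", type=int, default=500, help="maximum training epochs")
    parser.add_argument("--fin", required=True, help="input CSV of (s, p, o) triples")
    parser.add_argument("--fout", required=True, help="output CSV of triple scores")
    return parser.parse_args()

if __name__ == "__main__":
    main(_parse_args())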
def main():
    dfTrain = pd.read_csv(config.TRAIN_DATA, encoding="ISO-8859-1")
    dfTest = pd.read_csv(config.TEST_DATA, encoding="ISO-8859-1")

    ## splits for level1
    splitter = HomedepotSplitter(dfTrain=dfTrain,
                                 dfTest=dfTest,
                                 n_iter=config.N_RUNS,
                                 random_state=config.RANDOM_SEED,
                                 verbose=True,
                                 plot=True,
                                 # tune these params to get a close distribution
                                 split_param=[0.5, 0.25, 0.5])
    splitter.split()
    splitter.save("%s/splits_level1.pkl"%config.SPLIT_DIR)
    splits_level1 = splitter.splits

    ## splits for level2
    splits_level1 = pkl_utils._load("%s/splits_level1.pkl"%config.SPLIT_DIR)
    splits_level2 = [0]*config.N_RUNS
    for run, (trainInd, validInd) in enumerate(splits_level1):
        dfValid = dfTrain.iloc[validInd].copy()
        splitter2 = HomedepotSplitter(dfTrain=dfValid,
                                      dfTest=dfTest,
                                      n_iter=1,
                                      random_state=run,
                                      verbose=True,
                                      # tune these params to get a close distribution
                                      split_param=[0.5, 0.15, 0.6])
        splitter2.split()
        splits_level2[run] = splitter2.splits[0]
    pkl_utils._save("%s/splits_level2.pkl"%config.SPLIT_DIR, splits_level2)

    ## splits for level3
    splits_level2 = pkl_utils._load("%s/splits_level2.pkl"%config.SPLIT_DIR)
    splits_level3 = [0]*config.N_RUNS
    for run, (trainInd, validInd) in enumerate(splits_level2):
        dfValid = dfTrain.iloc[validInd].copy()
        splitter3 = HomedepotSplitter(dfTrain=dfValid,
                                      dfTest=dfTest,
                                      n_iter=1,
                                      random_state=run,
                                      verbose=True,
                                      # tune these params to get a close distribution
                                      split_param=[0.5, 0.15, 0.7])
        splitter3.split()
        splits_level3[run] = splitter3.splits[0]
    pkl_utils._save("%s/splits_level3.pkl"%config.SPLIT_DIR, splits_level3)
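# --- Usage sketch (assumption) ---
# The saved split files are pickled lists of (train_index, valid_index) pairs,
# which is how the feature scripts below consume them.
splits = pkl_utils._load("%s/splits_level1.pkl" % config.SPLIT_DIR)
for run, (trainInd, validInd) in enumerate(splits, start=1):
    print("Run %d: %d training rows, %d validation rows" % (run, len(trainInd), len(validInd)))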
def run_tsne_lsa_ngram():
    logname = "generate_feature_tsne_lsa_ngram_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)

    generators = [TSNE_LSA_Word_Ngram, TSNE_LSA_Char_Ngram]
    # full char n-gram range; superseded by the narrower list on the next line
    ngrams_list = [[1,2,3], [2,3,4,5]]
    ngrams_list = [[1,2,3], [4]]
    obs_fields = ["search_term", "search_term_alt", "search_term_auto_corrected", "product_title", "product_description"]
    for generator, ngrams in zip(generators, ngrams_list):
        for ngram in ngrams:
            param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger, force_corr=True)
            sf.go()

    generators = [TSNE_LSA_Word_Ngram_Pair]
    ngrams = [1, 2]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(["search_term", "search_term_alt", "search_term_auto_corrected"])
    target_fields_list.append(["product_title", "product_description"])
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for ngram in ngrams:
            for generator in generators:
                param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger, force_corr=True)
                pf.go()
def main(): logname = "generate_feature_intersect_position_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) generators = [ IntersectPosition_Ngram, IntersectNormPosition_Ngram, ] obs_fields_list = [] target_fields_list = [] ## query in document obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] ) target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"][1:2] ) ## document in query obs_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"][1:2] ) target_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] ) ngrams = [1,2,3,12,123][:3] aggregation_mode = ["mean", "std", "max", "min", "median"] for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: for ngram in ngrams: param_list = [ngram, aggregation_mode] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go()
def run_count():
    logname = "generate_feature_first_last_ngram_count_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [
        FirstIntersectCount_Ngram,
        LastIntersectCount_Ngram,
        FirstIntersectRatio_Ngram,
        LastIntersectRatio_Ngram,
    ]
    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append(["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"])
    target_fields_list.append(["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"])
    ## document in query
    obs_fields_list.append(["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"])
    target_fields_list.append(["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"])
    ngrams = [1,2,3,12,123][:3]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
def main(): logname = "generate_feature_doc2vec_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) #### NOTE: use data BEFORE STEMMING dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED) doc2vec_model_dirs = [] model_prefixes = [] ## doc2vec model trained with Homedepot dataset: brand/color/obs/title/description doc2vec_model_dirs.append( config.DOC2VEC_MODEL_DIR + "/Homedepot-doc2vec-D%d-min_count%d.model"%(config.EMBEDDING_DIM, config.EMBEDDING_MIN_COUNT) ) model_prefixes.append( "Homedepot" ) for doc2vec_model_dir, model_prefix in zip(doc2vec_model_dirs, model_prefixes): ## load model try: if ".bin" in doc2vec_model_dir: doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(doc2vec_model_dir, binary=True) if ".txt" in doc2vec_model_dir: doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(doc2vec_model_dir, binary=False) else: doc2vec_model = gensim.models.Doc2Vec.load(doc2vec_model_dir) doc2vec_model_sent_label = pkl_utils._load(doc2vec_model_dir+".sent_label") except: continue # ## standalone (not used in model building) # obs_fields = ["search_term", "search_term_alt", "product_title", "product_description", "product_attribute"] # generator = Doc2Vec_Vector # param_list = [doc2vec_model, doc2vec_model_sent_label, model_prefix] # sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) # sf.go() ## pairwise generators = [ Doc2Vec_CosineSim, Doc2Vec_RMSE, Doc2Vec_Vdiff, ] obs_fields_list = [] target_fields_list = [] obs_fields_list.append( ["search_term", "search_term_alt"] ) target_fields_list.append( ["product_title", "product_description", "product_attribute", "product_brand", "product_color"] ) for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: param_list = [doc2vec_model, doc2vec_model_sent_label, model_prefix] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go()
def main(): logname = "generate_feature_query_quality_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) obs_corpus = [] query_suffix = [] # raw dfAll = pkl_utils._load(config.ALL_DATA_RAW) obs_corpus.append(dfAll["search_term"].values) query_suffix.append("raw") # after processing dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED) obs_corpus.append(dfAll["search_term"].values) query_suffix.append("lemmatized") # after extracting product_name in search_term obs_corpus.append(dfAll["search_term_product_name"].values) query_suffix.append("product_name") if "search_term_auto_corrected" in dfAll.columns: # after auto correction obs_corpus.append(dfAll["search_term_auto_corrected"].values) query_suffix.append("corrected") # after stemming dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) obs_corpus.append(dfAll["search_term"].values) query_suffix.append("stemmed") y_train = dfAll["relevance"].values[:TRAIN_SIZE] for i in range(len(query_suffix)-1): for j in range(i+1, len(query_suffix)): ext = QueryQuality(obs_corpus[i], obs_corpus[j]) x = ext.transform() dim = 1 fname = "%s_%s_x_%s_%dD"%(ext._get_feat_name(), query_suffix[i], query_suffix[j], dim) pkl_utils._save(os.path.join(config.FEAT_DIR, fname+config.FEAT_FILE_SUFFIX), x) corr = np_utils._corr(x[:TRAIN_SIZE], y_train) logger.info("%s (%dD): corr = %.6f"%(fname, dim, corr)) # raw dfAll = pkl_utils._load(config.ALL_DATA_RAW) obs_fields = ["search_term"] param_list = [] sf = StandaloneFeatureWrapper(IsInGoogleDict, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) sf.go()
def main(which): logname = "generate_feature_stat_cooc_tfidf_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) generators = [] for w in which.split(","): if w == "tf": generators.append( StatCoocTF_Ngram ) elif w == "norm_tf": generators.append( StatCoocNormTF_Ngram ) elif w == "tfidf": generators.append( StatCoocTFIDF_Ngram ) elif w == "norm_tfidf": generators.append( StatCoocNormTFIDF_Ngram ) elif w == "bm25": generators.append( StatCoocBM25_Ngram ) obs_fields_list = [] target_fields_list = [] ## query in document obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] ) target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] ) ## document in query obs_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] ) target_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] ) ngrams = [1,2,3,12,123][:3] aggregation_mode = ["mean", "std", "max", "min", "median"] for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: for ngram in ngrams: param_list = [ngram, aggregation_mode] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go() obs_fields_list = [] target_fields_list = [] ## query in document obs_fields_list.append( ["search_term_product_name"] ) target_fields_list.append( ["product_title_product_name"] ) ngrams = [1,2] aggregation_mode = ["mean", "std", "max", "min", "median"] for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: for ngram in ngrams: if ngram == 2: # since product_name is of length 2, it makes no difference # for various aggregation as there is only one item param_list = [ngram, "mean"] else: param_list = [ngram, aggregation_mode] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go()
def factorize(lang="en"): X = pkl_utils._load(config.TENSOR[lang]) entityDict = pkl_utils._load(config.ENTITY[lang]) typeDict = pkl_utils._load(config.TYPE[lang]) entry = pkl_utils._load(config.TYPE_MATRIX[lang]) t2e = {typeDict[t]:entityDict[t] for t in typeDict} _log.info("Data has been loaded") N, M = X[0].shape[0], len(X) _log.info('Datasize: %d x %d x %d' % (N, N, M)) FOLDS = 5 IDX = list(range(N)) shuffle(IDX) fsz = int(N/FOLDS) offset = 0 tid = t2e[typeDict["http://dbpedia.org/ontology/Person"]] GROUND_TRUTH = X[-1][:, tid] AUC = np.zeros(FOLDS) for f in range(FOLDS): idx = set(IDX[offset:offset+fsz]) offset += fsz _log.info('Fold %d' % f) T = [x.copy() for x in X[:-1]] rows = [] cols = [] data = [] for x,y in zip(entry[0], entry[1]): if (x in idx) and (y == tid): continue rows.append(x) cols.append(y) data.append(1) T.append(spsp.csr_matrix((data, (rows, cols)), (N, N))) _log.info('Construction complete') P = predict_rescal_als(T, tid) precision, recall, _ = precision_recall_curve(GROUND_TRUTH, P) AUC[f] = auc(precision, recall) _log.info('AUC: %f' % AUC[f]) _log.info('AUC-PR Test Mean / Std: %f / %f' % (AUC.mean(), AUC.std()))
def run_compression_distance():
    logname = "generate_feature_compression_distance_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"])
    target_fields_list.append(["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"])
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        param_list = []
        pf = PairwiseFeatureWrapper(CompressionDistance, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
        pf.go()
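# --- Illustrative sketch (assumption): normalized compression distance ---
# The CompressionDistance generator itself is not shown here; a common way to
# compute such a feature is the zlib-based normalized compression distance,
# where a smaller value means the two strings compress well together.
import zlib

def ncd(obs, target):
    """Normalized compression distance between two strings."""
    c_obs = len(zlib.compress(obs.encode("utf-8")))
    c_target = len(zlib.compress(target.encode("utf-8")))
    c_both = len(zlib.compress((obs + " " + target).encode("utf-8")))
    return (c_both - min(c_obs, c_target)) / float(max(c_obs, c_target))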
def main(): logname = "generate_feature_basic_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) ## basic generators = [DocId, DocLen, DocFreq, DocEntropy, DigitCount, DigitRatio] obs_fields = ["search_term", "product_title", "product_description", "product_attribute", "product_brand", "product_color"] for generator in generators: param_list = [] sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) sf.go() ## for product_uid generators = [DocIdEcho, DocFreq, ProductUidDummy1, ProductUidDummy2, ProductUidDummy3] obs_fields = ["product_uid"] for generator in generators: param_list = [] sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) sf.go() ## unique count generators = [UniqueCount_Ngram, UniqueRatio_Ngram] obs_fields = ["search_term", "product_title", "product_description", "product_attribute", "product_brand", "product_color"] ngrams = [1,2,3] for generator in generators: for ngram in ngrams: param_list = [ngram] sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) sf.go() ## for product_attribute_list generators = [ AttrCount, AttrBulletCount, AttrBulletRatio, AttrNonBulletCount, AttrNonBulletRatio, AttrHasProductHeight, AttrHasProductWidth, AttrHasProductLength, AttrHasProductDepth, AttrHasIndoorOutdoor, ] obs_fields = ["product_attribute_list"] for generator in generators: param_list = [] sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) sf.go()
def main(): logname = "generate_feature_group_distance_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) dfTrain = dfAll.iloc[:TRAIN_SIZE].copy() ## run python3 splitter.py first split = pkl_utils._load("%s/splits_level1.pkl"%config.SPLIT_DIR) n_iter = len(split) relevances_complete = [1, 1.25, 1.33, 1.5, 1.67, 1.75, 2, 2.25, 2.33, 2.5, 2.67, 2.75, 3] relevances = [1, 1.33, 1.67, 2, 2.33, 2.67, 3] ngrams = [1] obs_fields = ["search_term"] target_fields = ["product_title", "product_description"] aggregation_mode = ["mean", "std", "max", "min", "median"] ## for cv for i in range(n_iter): trainInd, validInd = split[i][0], split[i][1] dfTrain2 = dfTrain.iloc[trainInd].copy() sub_feature_dir = "%s/Run%d" % (config.FEAT_DIR, i+1) for target_field in target_fields: for relevance in relevances: for ngram in ngrams: param_list = [dfAll["id"], dfTrain2, target_field, relevance, ngram, aggregation_mode] pf = PairwiseFeatureWrapper(GroupRelevance_Ngram_Jaccard, dfAll, obs_fields, [target_field], param_list, sub_feature_dir, logger) pf.go() ## for all sub_feature_dir = "%s/All" % (config.FEAT_DIR) for target_field in target_fields: for relevance in relevances: for ngram in ngrams: param_list = [dfAll["id"], dfTrain, target_field, relevance, ngram, aggregation_mode] pf = PairwiseFeatureWrapper(GroupRelevance_Ngram_Jaccard, dfAll, obs_fields, [target_field], param_list, sub_feature_dir, logger) pf.go()
def main():
    fnames = [
        "LSA100_Word_Unigram_Pair_search_term_x_product_title_100D",
        "LSA100_Word_Bigram_Pair_search_term_x_product_title_100D",
        "LSA100_Word_Obs_Unigram_Target_Unigram_Cooc_search_term_x_product_title_100D",
        "LSA100_Word_Obs_Unigram_Target_Bigram_Cooc_search_term_x_product_title_100D",
    ]
    fnames = [os.path.join(config.FEAT_DIR, fname+".pkl") for fname in fnames]
    for fname in fnames:
        f = pkl_utils._load(fname)
        columns = ["LSA%d"%(i+1) for i in range(f.shape[1])]
        pd.DataFrame(f, columns=columns).to_csv(fname[:-4]+".csv", index=False)
def run_char_dist_sim(): logname = "generate_feature_char_dist_sim_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) dfAll.drop(["product_attribute_list"], inplace=True, axis=1) generators = [CharDistribution_Ratio, CharDistribution_CosineSim, CharDistribution_KL] obs_fields_list = [] target_fields_list = [] obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] ) target_fields_list.append( ["product_title", "product_description", "product_attribute"] ) for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: param_list = [] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go()
def run_ngram_jaccard():
    logname = "generate_feature_ngram_jaccard_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [JaccardCoef_Ngram, DiceDistance_Ngram]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][:2])
    target_fields_list.append(["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"])
    ngrams = [1,2,3,12,123][:3]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
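# --- Illustrative sketch (assumption): Jaccard and Dice over n-gram sets ---
# JaccardCoef_Ngram / DiceDistance_Ngram compare the n-gram sets of the two
# fields; the core set arithmetic looks roughly like this (the repo's
# DiceDistance may be defined as 1 minus the coefficient below).
def jaccard_coef(ngrams_a, ngrams_b):
    a, b = set(ngrams_a), set(ngrams_b)
    union = len(a | b)
    return len(a & b) / float(union) if union else 0.0

def dice_coef(ngrams_a, ngrams_b):
    a, b = set(ngrams_a), set(ngrams_b)
    denom = len(a) + len(b)
    return 2.0 * len(a & b) / float(denom) if denom else 0.0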
def run_tfidf_ngram_cosinesim():
    logname = "generate_feature_tfidf_ngram_cosinesim_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)

    generators = [TFIDF_Word_Ngram_CosineSim, TFIDF_Char_Ngram_CosineSim]
    # full char n-gram range; superseded by the narrower list on the next line
    ngrams_list = [[1,2,3], [2,3,4,5]]
    ngrams_list = [[1,2,3], [4]]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(["search_term", "search_term_alt", "search_term_auto_corrected"])
    target_fields_list.append(["product_title", "product_description", "product_attribute"])
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator, ngrams in zip(generators, ngrams_list):
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
def run_edit_distance():
    logname = "generate_feature_edit_distance_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][1:2])
    target_fields_list.append(["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"])
    ngrams = [1,2,3,12,123][:3]
    aggregation_mode_prev = ["mean", "max", "min", "median"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        param_list = []
        pf = PairwiseFeatureWrapper(EditDistance, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
        pf.go()
        for ngram in ngrams:
            param_list = [ngram, aggregation_mode_prev, aggregation_mode]
            pf = PairwiseFeatureWrapper(EditDistance_Ngram, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
            pf.go()
def run_lsa_ngram_cooc():
    logname = "generate_feature_lsa_ngram_cooc_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)

    generators = [LSA_Word_Ngram_Cooc]
    obs_ngrams = [1, 2]
    target_ngrams = [1, 2]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(["search_term", "search_term_alt", "search_term_auto_corrected"])
    target_fields_list.append(["product_title", "product_description"])
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for obs_ngram in obs_ngrams:
            for target_ngram in target_ngrams:
                for generator in generators:
                    param_list = [obs_ngram, target_ngram, config.SVD_DIM, config.SVD_N_ITER]
                    pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                    pf.go()
def main(): logname = "generate_feature_wordnet_similarity_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) #### NOTE: use data BEFORE STEMMING dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED) generators = [ WordNet_Path_Similarity, WordNet_Lch_Similarity, WordNet_Wup_Similarity, ] obs_fields_list = [] target_fields_list = [] obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] ) target_fields_list.append( ["product_title", "product_description", "product_attribute"] ) # double aggregation aggregation_mode_prev = ["mean", "max", "min", "median"] aggregation_mode = ["mean", "std", "max", "min", "median"] for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: param_list = [aggregation_mode_prev, aggregation_mode] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go()
def main(which): logname = "generate_feature_stat_cooc_tfidf_%s_%s.log" % ( which, time_utils._timestamp()) logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) generators = [] if which == "tf": generators.append(StatCoocTF_Ngram) elif which == "norm_tf": generators.append(StatCoocNormTF_Ngram) elif which == "tfidf": generators.append(StatCoocTFIDF_Ngram) elif which == "norm_tfidf": generators.append(StatCoocNormTFIDF_Ngram) elif which == "bm25": generators.append(StatCoocBM25_Ngram) obs_fields_list = [] target_fields_list = [] ## query in document obs_fields_list.append(["question1"]) target_fields_list.append(["question2"]) ## document in query obs_fields_list.append(["question2"]) target_fields_list.append(["question1"]) ngrams = [1, 2, 3, 12, 123][:3] aggregation_mode = ["mean", "std", "max", "min", "median"] for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: for ngram in ngrams: param_list = [ngram, aggregation_mode] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go()
def run_char_dist_sim(): logname = "generate_feature_char_dist_sim_%s.log" % time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) generators = [ CharDistribution_Ratio, CharDistribution_CosineSim, CharDistribution_KL ] obs_fields_list = [['question1'], ['question2']] target_fields_list = [['question2'], ['question1']] for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: param_list = [] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger, force_corr=True) pf.go() del pf gc.collect()
def main(): logname = "generate_feature_basic_%s.log" % time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) ## basic generators = [DocId, DocLen, DocFreq, DocEntropy, DigitCount, DigitRatio] obs_fields = ["question1", "question2"] for generator in generators: param_list = [] sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) sf.go() ## unique count generators = [UniqueCount_Ngram, UniqueRatio_Ngram] obs_fields = ["question1", "question2"] ngrams = [1, 2, 3] for generator in generators: for ngram in ngrams: param_list = [ngram] sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) sf.go()
def main(): logname = "generate_feature_match_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) generators = [ MatchQueryCount, MatchQueryRatio, LongestMatchSize, LongestMatchRatio ] obs_fields_list = [] target_fields_list = [] obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] ) target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] ) for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: param_list = [] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go() # product_attribute_list generators = [ MatchAttrCount, MatchAttrRatio, IsIndoorOutdoorMatch, ] obs_fields_list = [] target_fields_list = [] obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] ) target_fields_list.append( ["product_attribute_list"] ) for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: param_list = [] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go()
def __init__(self, model_name, data_name, cv_runs, params_dict, logger, portion=100, save_name=''):
    print("Loading data...")
    if portion <= 100:
        # all the data: portion% clean + all noisy
        self.portion = '-' + str(portion) if portion != 100 else ''
    else:
        # only clean data: portion% clean
        portion /= 100
        self.portion = '-' + str(int(portion)) + '-clean'
    print('run task on: ', self.portion, ' dataset: ', data_name)

    if data_name == "ontonotes":
        words_train, mentions_train, positions_train, labels_train = data_utils.load(config.ONTONOTES_TRAIN_CLEAN + self.portion)
        words, mentions, positions, labels = data_utils.load(config.ONTONOTES_TEST_CLEAN)
        type2id, typeDict = pkl_utils._load(config.ONTONOTES_TYPE)
        num_types = len(type2id)
        type_info = config.ONTONOTES_TYPE
    elif data_name == "bbn":
        words_train, mentions_train, positions_train, labels_train = data_utils.load(config.BBN_TRAIN_CLEAN + self.portion)
        words, mentions, positions, labels = data_utils.load(config.BBN_TEST_CLEAN)
        type2id, typeDict = pkl_utils._load(config.BBN_TYPE)
        num_types = len(type2id)
        type_info = config.BBN_TYPE
    else:
        assert False, 'you have to specify the name of the dataset with -d (i.e. bbn/...)'

    self.model_name = model_name
    self.savename = save_name
    self.data_name = data_name
    self.cv_runs = cv_runs
    self.params_dict = params_dict
    self.hparams = AttrDict(params_dict)
    # self.hparams.alpha = alpha
    self.logger = logger
    self.id2type = {type2id[x]: x for x in type2id.keys()}

    def type2vec(types):
        # only terminal types are labeled
        tmp = np.zeros(num_types)
        for t in str(types).split():
            if t in type2id.keys():
                tmp[type2id[t]] = 1.0
        return tmp

    labels_train = np.array([type2vec(t) for t in labels_train])  # binary type indicator vectors
    labels = np.array([type2vec(t) for t in labels])

    tempname = self.data_name + config.testemb
    tempname = os.path.join(config.PKL_DIR, tempname)
    if os.path.exists(tempname):
        self.embedding = pickle.load(open(tempname, 'rb'))
        print('embedding load over')
    else:
        self.embedding = embedding_utils.Embedding.fromCorpus(config.EMBEDDING_DATA, list(words_train) + list(words), config.MAX_DOCUMENT_LENGTH, config.MENTION_SIZE)
        pickle.dump(self.embedding, open(tempname, 'wb'))
        print('embedding dump over')
    self.embedding.max_document_length = config.MAX_DOCUMENT_LENGTH

    print("Preprocessing data...")
    if True:
        textlen_train = np.array([self.embedding.len_transform1(x) for x in words_train])  # truncated text lengths
        words_train = np.array([self.embedding.text_transform1(x) for x in words_train])  # truncated word-id sequences, zero-padded with <PAD>
        mentionlen_train = np.array([self.embedding.len_transform2(x) for x in mentions_train])  # mention lengths
        mentions_train = np.array([self.embedding.text_transform2(x) for x in mentions_train])  # mention word-id sequences
        positions_train = np.array([self.embedding.position_transform(x) for x in positions_train])  # (start, end) positions
        print('get train data')
        textlen = np.array([self.embedding.len_transform1(x) for x in words])
        words = np.array([self.embedding.text_transform1(x) for x in words])  # padded and truncated
        mentionlen = np.array([self.embedding.len_transform2(x) for x in mentions])
        mentions = np.array([self.embedding.text_transform2(x) for x in mentions])
        positions = np.array([self.embedding.position_transform(x) for x in positions])
        print('get test data')
        # pickle.dump([textlen_train, words_train, mentionlen_train, mentions_train, positions_train,
        #              textlen, words, mentionlen, mentions, positions],
        #             open(self.data_name + config.prep + self.portion, 'wb'))
        # print('dump preprocessed data to pkl over...')
    # else:
    #     textlen_train, words_train, mentionlen_train, mentions_train, \
    #     positions_train, textlen, words, mentionlen, mentions, positions = pickle.load(
    #         open(self.data_name + config.prep + self.portion, 'rb'))
    #     print('load preprocessed data from pkl over...')
    # if True:

    ss = ShuffleSplit(n_splits=1, test_size=0.1, random_state=config.RANDOM_SEED)
    for test_index, valid_index in ss.split(np.zeros(len(labels)), labels):  # split by index
        textlen_test, textlen_valid = textlen[test_index], textlen[valid_index]
        words_test, words_valid = words[test_index], words[valid_index]
        mentionlen_test, mentionlen_valid = mentionlen[test_index], mentionlen[valid_index]
        mentions_test, mentions_valid = mentions[test_index], mentions[valid_index]
        positions_test, positions_valid = positions[test_index], positions[valid_index]
        labels_test, labels_valid = labels[test_index], labels[valid_index]

    self.train_set = list(zip(words_train, textlen_train, mentions_train, mentionlen_train, positions_train, labels_train))
    self.valid_set = list(zip(words_valid, textlen_valid, mentions_valid, mentionlen_valid, positions_valid, labels_valid))
    self.test_set = list(zip(words_test, textlen_test, mentions_test, mentionlen_test, positions_test, labels_test))
    self.full_test_set = list(zip(words, textlen, mentions, mentionlen, positions, labels))
    self.labels_test = labels_test
    self.labels = labels
    self.labels_valid = labels_valid
    self.num_types = num_types
    self.type_info = type_info
    self.logger.info("train set size:%d, test set size: %d" % (len(self.train_set), len(self.full_test_set)))

    self.model = self._get_model()
    self.saver = tf.train.Saver(tf.global_variables())
    checkpoint_dir = os.path.abspath(config.CHECKPOINT_DIR)
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    self.checkpoint_prefix = os.path.join(checkpoint_dir, self.__str__())
"sample": 0.001, "window": config.EMBEDDING_WINDOW, "workers": config.EMBEDDING_WORKERS, } model_dir = config.DOC2VEC_MODEL_DIR model_name = "Quora-doc2vec-D%d-min_count%d.model" % ( model_param["size"], model_param["min_count"]) doc2vec = DataFrameDoc2Vec(df, columns, model_param) doc2vec.train() doc2vec.save(model_dir, model_name) #---------------------- Main ---------------------- if __name__ == "__main__": df = pkl_utils._load(config.ALL_DATA_LEMMATIZED) columns = ["question1", "question2"] columns = [col for col in columns if col in df.columns] if len(sys.argv) >= 2: for w in sys.argv[1].split(","): if w == "word2vec": train_word2vec_model(df, columns) elif w == "doc2vec": train_doc2vec_model(df, columns) else: print("Skip: %s" % w) continue else: train_doc2vec_model(df, columns) train_word2vec_model(df, columns)
def preprocess(data_name, if_clean=False, full_path=False): if data_name == "wiki": raw_all_file = config.WIKI_ALL raw_train_file = config.WIKI_TRAIN raw_valid_file = config.WIKI_VALID raw_test_file = config.WIKI_TEST clean_train_file = config.WIKI_TRAIN_CLEAN clean_test_file = config.WIKI_TEST_CLEAN type_file = config.WIKI_TYPE elif data_name == "wikim": raw_all_file = config.WIKIM_ALL raw_train_file = config.WIKIM_TRAIN raw_valid_file = config.WIKIM_VALID raw_test_file = config.WIKIM_TEST clean_train_file = config.WIKIM_TRAIN_CLEAN clean_test_file = config.WIKIM_TEST_CLEAN type_file = config.WIKIM_TYPE elif data_name == "ontonotes": raw_all_file = config.ONTONOTES_ALL raw_train_file = config.ONTONOTES_TRAIN raw_valid_file = config.ONTONOTES_VALID raw_test_file = config.ONTONOTES_TEST clean_train_file = config.ONTONOTES_TRAIN_CLEAN clean_test_file = config.ONTONOTES_TEST_CLEAN type_file = config.ONTONOTES_TYPE else: raise AttributeError("Invalid data name!") if not os.path.exists(type_file): create_type_dict(raw_all_file, type_file, full_path) type2id, typeDict = pkl_utils._load(type_file) df_train = pd.read_csv(raw_train_file, sep="\t", names=["p1", "p2", "text", "type", "f"]) df_valid = pd.read_csv(raw_valid_file, sep="\t", names=["p1", "p2", "text", "type", "f"]) df = pd.concat((df_train, df_valid), ignore_index=True) size = df.shape[0] outfile = open(clean_train_file, "w") for i in range(size): p1 = df["p1"][i] p2 = df["p2"][i] text = df["text"][i] types = df["type"][i].split() if (not path_count(types) == 1) and if_clean: #-> RAW continue text = clear_text(text) tokens = text.split() if p1 >= len(tokens): continue mention = " ".join(tokens[p1:p2]) if p1 == 0: mention = "<PAD> " + mention else: mention = tokens[p1 - 1] + " " + mention if p2 >= len(tokens): mention = mention + " <PAD>" else: mention = mention + " " + tokens[p2] offset = max(0, p1 - config.WINDOW_SIZE) text = " ".join(tokens[offset:min(len(tokens), p2 + config.WINDOW_SIZE - 1)]) p1 -= offset p2 -= offset out_type = [] for a in types: flag = True for b in types: if len(a) >= len(b): continue if (a == b[:len(a)]) and (b[len(a)] == "/"): flag = False if flag: out_type.append(a) if len(out_type) > 0: if full_path: outfile.write("%d\t%d\t%s\t%s\t%s\n" % (p1, p2, text, mention, " ".join(types))) else: outfile.write("%d\t%d\t%s\t%s\t%s\n" % (p1, p2, text, mention, " ".join(out_type))) outfile.close() #VALIDATION separate df = pd.read_csv(raw_valid_file, sep="\t", names=["p1", "p2", "text", "type", "f"]) outfile = open(clean_train_file.replace("train", "dev"), "w") size = df.shape[0] for i in range(size): p1 = df["p1"][i] p2 = df["p2"][i] text = df["text"][i] types = df["type"][i].split() text = clear_text(text) tokens = text.split() if p1 >= len(tokens): continue mention = " ".join(tokens[p1:p2]) if p1 == 0: mention = "<PAD> " + mention else: mention = tokens[p1 - 1] + " " + mention if p2 >= len(tokens): mention = mention + " <PAD>" else: mention = mention + " " + tokens[p2] offset = max(0, p1 - config.WINDOW_SIZE) text = " ".join(tokens[offset:min(len(tokens), p2 + config.WINDOW_SIZE - 1)]) p1 -= offset p2 -= offset out_type = [] for a in types: flag = True for b in types: if len(a) >= len(b): continue if (a == b[:len(a)]) and (b[len(a)] == "/"): flag = False if flag: out_type.append(a) if full_path: outfile.write("%d\t%d\t%s\t%s\t%s\n" % (p1, p2, text, mention, " ".join(types))) else: outfile.write("%d\t%d\t%s\t%s\t%s\n" % (p1, p2, text, mention, " ".join(out_type))) outfile.close() df = pd.read_csv(raw_test_file, sep="\t", 
names=["p1", "p2", "text", "type", "f"]) size = df.shape[0] outfile = open(clean_test_file, "w") for i in range(size): p1 = df["p1"][i] p2 = df["p2"][i] text = df["text"][i] types = df["type"][i].split() text = clear_text(text) tokens = text.split() if p1 >= len(tokens): continue mention = " ".join(tokens[p1:p2]) if p1 == 0: mention = "<PAD> " + mention else: mention = tokens[p1 - 1] + " " + mention if p2 >= len(tokens): mention = mention + " <PAD>" else: mention = mention + " " + tokens[p2] offset = max(0, p1 - config.WINDOW_SIZE) text = " ".join(tokens[offset:min(len(tokens), p2 + config.WINDOW_SIZE - 1)]) p1 -= offset p2 -= offset out_type = [] for a in types: flag = True for b in types: if len(a) >= len(b): continue if (a == b[:len(a)]) and (b[len(a)] == "/"): flag = False if flag: out_type.append(a) if full_path: outfile.write("%d\t%d\t%s\t%s\t%s\n" % (p1, p2, text, mention, " ".join(types))) else: outfile.write("%d\t%d\t%s\t%s\t%s\n" % (p1, p2, text, mention, " ".join(out_type))) outfile.close()
from sklearn.cross_validation import check_random_state from sklearn.cross_validation import BaseShuffleSplit, ShuffleSplit, StratifiedShuffleSplit # http://stackoverflow.com/questions/2801882/generating-a-png-with-matplotlib-when-display-is-undefined # The solution for me was to add the following code in a place # that gets read before any other pylab/matplotlib/pyplot import: import matplotlib # Force matplotlib to not use any Xwindows backend. matplotlib.use('Agg') import matplotlib.pyplot as plt import config from utils import dist_utils, np_utils from utils import logging_utils, os_utils, pkl_utils, time_utils from get_stacking_feature_conf import get_model_list splitter_level1 = pkl_utils._load("%s/splits_level1.pkl" % config.SPLIT_DIR) splitter_level2 = pkl_utils._load("%s/splits_level2.pkl" % config.SPLIT_DIR) splitter_level3 = pkl_utils._load("%s/splits_level3.pkl" % config.SPLIT_DIR) assert len(splitter_level1) == len(splitter_level2) assert len(splitter_level1) == len(splitter_level3) n_iter = len(splitter_level1) class StratifiedShuffleSplitReplacement(BaseShuffleSplit): def __init__(self, y, n_iter=10, test_size=0.1, train_size=None, random_state=None):
def main(): ########### ## Setup ## ########### logname = "data_processor_%s.log"%now logger = logging_utils._get_logger(config.LOG_DIR, logname) # put product_attribute_list, product_attribute and product_description first as they are # quite time consuming to process columns_to_proc = [ # # product_attribute_list is very time consuming to process # # so we just process product_attribute which is of the form # # attr_name1 | attr_value1 | attr_name2 | attr_value2 | ... # # and split it into a list afterwards # "product_attribute_list", "product_attribute_concat", "product_description", "product_brand", "product_color", "product_title", "search_term", ] if config.PLATFORM == "Linux": config.DATA_PROCESSOR_N_JOBS = len(columns_to_proc) # clean using a list of processors processors = [ LowerCaseConverter(), # See LowerUpperCaseSplitter and UnitConverter for why we put UnitConverter here UnitConverter(), LowerUpperCaseSplitter(), WordReplacer(replace_fname=config.WORD_REPLACER_DATA), LetterLetterSplitter(), DigitLetterSplitter(), DigitCommaDigitMerger(), NumberDigitMapper(), UnitConverter(), QuartetCleaner(), HtmlCleaner(parser="html.parser"), Lemmatizer(), ] stemmers = [ Stemmer(stemmer_type="snowball"), Stemmer(stemmer_type="porter") ][0:1] ## simple test text = "1/2 inch rubber lep tips Bullet07" print("Original:") print(text) list_processor = ListProcessor(processors) print("After:") print(list_processor.process([text])) ############# ## Process ## ############# ## load raw data dfAll = pkl_utils._load(config.ALL_DATA_RAW) columns_to_proc = [col for col in columns_to_proc if col in dfAll.columns] ## extract product name from search_term and product_title ext = ProductNameExtractor() dfAll["search_term_product_name"] = dfAll["search_term"].apply(ext.transform) dfAll["product_title_product_name"] = dfAll["product_title"].apply(ext.transform) if config.TASK == "sample": print(dfAll[["search_term", "search_term_product_name", "product_title_product_name"]]) ## clean using GoogleQuerySpellingChecker # MUST BE IN FRONT OF ALL THE PROCESSING logger.info("Run GoogleQuerySpellingChecker at search_term") checker = GoogleQuerySpellingChecker() dfAll["search_term"] = dfAll["search_term"].apply(checker.correct) ## clean uisng a list of processors df_processor = DataFrameParallelProcessor(processors, config.DATA_PROCESSOR_N_JOBS) df_processor.process(dfAll, columns_to_proc) # split product_attribute_concat into product_attribute and product_attribute_list dfAll["product_attribute"] = dfAll["product_attribute_concat"].apply(_split_attr_to_text) dfAll["product_attribute_list"] = dfAll["product_attribute_concat"].apply(_split_attr_to_list) if config.TASK == "sample": print(dfAll[["product_attribute", "product_attribute_list"]]) # query expansion list_processor = ListProcessor(processors) base_stopwords = set(list_processor.process(list(config.STOP_WORDS))) qe = QueryExpansion(dfAll, ngram=3, stopwords_threshold=0.9, base_stopwords=base_stopwords) dfAll["search_term_alt"] = qe.build() if config.TASK == "sample": print(dfAll[["search_term", "search_term_alt"]]) # save data logger.info("Save to %s"%config.ALL_DATA_LEMMATIZED) columns_to_save = [col for col in dfAll.columns if col != "product_attribute_concat"] pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll[columns_to_save]) ## auto correcting query if config.AUTO_CORRECTING_QUERY: logger.info("Run AutoSpellingChecker at search_term") checker = AutoSpellingChecker(dfAll, exclude_stopwords=False, min_len=4) dfAll['search_term_auto_corrected'] = 
list(dfAll["search_term"].apply(checker.correct)) columns_to_proc += ['search_term_auto_corrected'] if config.TASK == "sample": print(dfAll[["search_term", "search_term_auto_corrected"]]) # save query_correction_map and spelling checker fname = "%s/auto_spelling_checker_query_correction_map_%s.log"%(config.LOG_DIR, now) checker.save_query_correction_map(fname) # save data logger.info("Save to %s"%config.ALL_DATA_LEMMATIZED) columns_to_save = [col for col in dfAll.columns if col != "product_attribute_concat"] pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll[columns_to_save]) ## clean using stemmers df_processor = DataFrameParallelProcessor(stemmers, config.DATA_PROCESSOR_N_JOBS) df_processor.process(dfAll, columns_to_proc) # split product_attribute_concat into product_attribute and product_attribute_list dfAll["product_attribute"] = dfAll["product_attribute_concat"].apply(_split_attr_to_text) dfAll["product_attribute_list"] = dfAll["product_attribute_concat"].apply(_split_attr_to_list) # query expansion list_processor = ListProcessor(stemmers) base_stopwords = set(list_processor.process(list(config.STOP_WORDS))) qe = QueryExpansion(dfAll, ngram=3, stopwords_threshold=0.9, base_stopwords=base_stopwords) dfAll["search_term_alt"] = qe.build() if config.TASK == "sample": print(dfAll[["search_term", "search_term_alt"]]) # save data logger.info("Save to %s"%config.ALL_DATA_LEMMATIZED_STEMMED) columns_to_save = [col for col in dfAll.columns if col != "product_attribute_concat"] pkl_utils._save(config.ALL_DATA_LEMMATIZED_STEMMED, dfAll[columns_to_save])
def main(): ########### ## Setup ## ########### logname = "data_processor_%s.log"%now logger = logging_utils._get_logger(config.LOG_DIR, logname) # put product_attribute_list, product_attribute and product_description first as they are # quite time consuming to process columns_to_proc = [ # # product_attribute_list is very time consuming to process # # so we just process product_attribute which is of the form # # attr_name1 | attr_value1 | attr_name2 | attr_value2 | ... # # and split it into a list afterwards # "product_attribute_list", "product_attribute_concat", "product_description", "product_brand", "product_color", "product_title", "search_term", ] if config.PLATFORM == "Linux": config.DATA_PROCESSOR_N_JOBS = len(columns_to_proc) # clean using a list of processors processors = [ LowerCaseConverter(), # See LowerUpperCaseSplitter and UnitConverter for why we put UnitConverter here UnitConverter(), LowerUpperCaseSplitter(), WordReplacer(replace_fname=config.WORD_REPLACER_DATA), LetterLetterSplitter(), DigitLetterSplitter(), DigitCommaDigitMerger(), NumberDigitMapper(), UnitConverter(), QuartetCleaner(), HtmlCleaner(parser="html.parser"), Lemmatizer(), ] stemmers = [ Stemmer(stemmer_type="snowball"), Stemmer(stemmer_type="porter") ][0:1] ## simple test text = "1/2 inch rubber lep tips Bullet07" print("Original:") print(text) list_processor = ListProcessor(processors) print("After:") print(list_processor.process([text])) ############# ## Process ## ############# ## load raw data dfAll = pkl_utils._load(config.ALL_DATA_RAW) columns_to_proc = [col for col in columns_to_proc if col in dfAll.columns] ## extract product name from search_term and product_title ext = ProductNameExtractor() dfAll["search_term_product_name"] = dfAll["search_term"].apply(ext.transform) dfAll["product_title_product_name"] = dfAll["product_title"].apply(ext.transform) if config.TASK == "sample": print(dfAll[["search_term", "search_term_product_name", "product_title_product_name"]]) ## clean using GoogleQuerySpellingChecker # MUST BE IN FRONT OF ALL THE PROCESSING if config.GOOGLE_CORRECTING_QUERY: logger.info("Run GoogleQuerySpellingChecker at search_term") checker = GoogleQuerySpellingChecker() dfAll["search_term"] = dfAll["search_term"].apply(checker.correct) ## clean uisng a list of processors df_processor = DataFrameParallelProcessor(processors, config.DATA_PROCESSOR_N_JOBS) df_processor.process(dfAll, columns_to_proc) # split product_attribute_concat into product_attribute and product_attribute_list dfAll["product_attribute"] = dfAll["product_attribute_concat"].apply(_split_attr_to_text) dfAll["product_attribute_list"] = dfAll["product_attribute_concat"].apply(_split_attr_to_list) if config.TASK == "sample": print(dfAll[["product_attribute", "product_attribute_list"]]) # query expansion if config.QUERY_EXPANSION: list_processor = ListProcessor(processors) base_stopwords = set(list_processor.process(list(config.STOP_WORDS))) qe = QueryExpansion(dfAll, ngram=3, stopwords_threshold=0.9, base_stopwords=base_stopwords) dfAll["search_term_alt"] = qe.build() if config.TASK == "sample": print(dfAll[["search_term", "search_term_alt"]]) # save data logger.info("Save to %s"%config.ALL_DATA_LEMMATIZED) columns_to_save = [col for col in dfAll.columns if col != "product_attribute_concat"] pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll[columns_to_save]) ## auto correcting query if config.AUTO_CORRECTING_QUERY: logger.info("Run AutoSpellingChecker at search_term") checker = AutoSpellingChecker(dfAll, exclude_stopwords=False, 
min_len=4) dfAll["search_term_auto_corrected"] = list(dfAll["search_term"].apply(checker.correct)) columns_to_proc += ["search_term_auto_corrected"] if config.TASK == "sample": print(dfAll[["search_term", "search_term_auto_corrected"]]) # save query_correction_map and spelling checker fname = "%s/auto_spelling_checker_query_correction_map_%s.log"%(config.LOG_DIR, now) checker.save_query_correction_map(fname) # save data logger.info("Save to %s"%config.ALL_DATA_LEMMATIZED) columns_to_save = [col for col in dfAll.columns if col != "product_attribute_concat"] pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll[columns_to_save]) ## clean using stemmers df_processor = DataFrameParallelProcessor(stemmers, config.DATA_PROCESSOR_N_JOBS) df_processor.process(dfAll, columns_to_proc) # split product_attribute_concat into product_attribute and product_attribute_list dfAll["product_attribute"] = dfAll["product_attribute_concat"].apply(_split_attr_to_text) dfAll["product_attribute_list"] = dfAll["product_attribute_concat"].apply(_split_attr_to_list) # query expansion if config.QUERY_EXPANSION: list_processor = ListProcessor(stemmers) base_stopwords = set(list_processor.process(list(config.STOP_WORDS))) qe = QueryExpansion(dfAll, ngram=3, stopwords_threshold=0.9, base_stopwords=base_stopwords) dfAll["search_term_alt"] = qe.build() if config.TASK == "sample": print(dfAll[["search_term", "search_term_alt"]]) # save data logger.info("Save to %s"%config.ALL_DATA_LEMMATIZED_STEMMED) columns_to_save = [col for col in dfAll.columns if col != "product_attribute_concat"] pkl_utils._save(config.ALL_DATA_LEMMATIZED_STEMMED, dfAll[columns_to_save])
def combine(self): # combine meta features if self.meta_feature_dict: cb = Combiner(feature_dict=self.meta_feature_dict, feature_name=self.feature_name, feature_suffix=".pkl", corr_threshold=self.corr_threshold) cb.combine() self.X_train_basic = cb.X_train self.X_test_basic = cb.X_test self.feature_names_basic = cb.feature_names_basic self.feature_names.extend(cb.feature_names) else: self.X_train_basic = None self.X_test_basic = None # combine other features dfAll = pkl_utils._load(config.INFO_DATA) dfTrain = dfAll.iloc[:TRAIN_SIZE].copy() dfTest = dfAll.iloc[TRAIN_SIZE:].copy() self.id_test = dfTest["id"].values.astype(int) ## all first = True feat_cnt = 0 feature_dir = "%s/All" % (config.OUTPUT_DIR) for file_name in sorted(os.listdir(feature_dir)): if self.feature_suffix in file_name: ## NOTE the name is valid.pred.MODEL.csv and test.pred.MODEL.csv fname = file_name.split(".")[2] if fname not in self.feature_list: continue if first: self.logger.info("Run for all...") first = False # load prediction x = self.load_feature(feature_dir, "test.pred." + fname) x = np.nan_to_num(x) dim = np_utils._dim(x) dfTest[fname] = x feat_cnt += 1 self.feature_names_cv.append(fname) self.feature_names.append(fname) self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format( feat_cnt, len(self.feature_list), fname, dim)) # load probability if any try: x = self.load_feature(feature_dir, "test.proba." + fname, columns=None, columns_pattern="proba") x = np.nan_to_num(x) dim = np_utils._dim(x) for i in range(dim): dfTest["%s_proba%d" % (fname, i)] = x[:, i] self.logger.info( "Combine {:>3}/{:>3} proba feat: {} ({}D)".format( feat_cnt, len(self.feature_list), fname, dim)) self.feature_names.extend( ["%s_proba%d" % (fname, i) for i in range(dim)]) except: pass dfTest.fillna(config.MISSING_VALUE_NUMERIC, inplace=True) self.X_test = dfTest.drop(["id", "relevance"], axis=1).values.astype(float) if self.meta_feature_dict: self.X_test = np.hstack([self.X_test_basic, self.X_test]) ## for cv features first = True for run in range(1, self.n_iter + 1): feat_cnt = 0 idx1 = splitter_level1[run - 1][1] idx2 = splitter_level2[run - 1][1] if self.feature_level == 2: idx = idx1 elif self.feature_level == 3: idx = [idx1[i] for i in idx2] self.splitter_prev[run - 1] = idx dfTrain_cv = dfTrain.iloc[idx].copy() feature_dir = "%s/Run%d" % (config.OUTPUT_DIR, run) for file_name in sorted(os.listdir(feature_dir)): if self.feature_suffix in file_name: ## NOTE the name is valid.pred.MODEL.csv and test.pred.MODEL.csv fname = file_name.split(".")[2] if (fname not in self.feature_list) or ( fname not in self.feature_names_cv): continue if first: self.logger.info("Run for cv...") first = False if feat_cnt == 0: self.logger.info("Run %d" % run) # load prediction x = self.load_feature(feature_dir, "valid.pred." + fname) x = np.nan_to_num(x) dim = np_utils._dim(x) # also including level 1 models' preditions if x.shape[0] > len(idx): x = x[idx2] dfTrain_cv[fname] = x feat_cnt += 1 self.logger.info( "Combine {:>3}/{:>3} feat: {} ({}D)".format( feat_cnt, len(self.feature_list), fname, dim)) # load probability if any try: x = self.load_feature(feature_dir, "valid.proba." 
+ fname, columns=None, columns_pattern="proba") x = np.nan_to_num(x) dim = np_utils._dim(x) # also including level 1 models' preditions if x.shape[0] > len(idx): x = x[idx2] for i in range(dim): dfTrain_cv["%s_proba%d" % (fname, i)] = x[:, i] self.logger.info( "Combine {:>3}/{:>3} proba feat: {} ({}D)".format( feat_cnt, len(self.feature_list), fname, dim)) except: pass dfTrain_cv.fillna(config.MISSING_VALUE_NUMERIC, inplace=True) if run == 1: self.y_train_cv = [0] * self.n_iter self.X_train_cv = [0] * self.n_iter self.y_train_cv[run - 1] = dfTrain_cv["relevance"].values.astype(float) self.X_train_cv[run - 1] = dfTrain_cv.drop( ["id", "relevance"], axis=1).values.astype(float) if self.has_basic: self.logger.info("Overall Shape: %d x %d" % (len(self.y_train_cv[self.n_iter - 1]), self.X_train_basic.shape[1] + self.X_train_cv[self.n_iter - 1].shape[1])) else: self.logger.info("Overall Shape: %d x %d" % (len(self.y_train_cv[self.n_iter - 1]), self.X_train_cv[self.n_iter - 1].shape[1])) self.logger.info("Done combinning.") return self
def _load_data_dict(self):
    fname = os.path.join(config.FEAT_DIR+"/Combine", self.feature_name+config.FEAT_FILE_SUFFIX)
    data_dict = pkl_utils._load(fname)
    return data_dict
def main(): logname = "generate_feature_group_distance_stat_%s.log" % time_utils._timestamp( ) logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) y_train = dfAll["relevance"].values[:TRAIN_SIZE] group_id_names = [ "DocId_search_term", "DocId_product_title", "DocIdEcho_product_uid" ] match_list = [ "MatchQueryCount", "MatchQueryRatio", "LongestMatchRatio", ] tfidf_list = [ "StatCoocTF_Unigram_Mean", "StatCoocTF_Unigram_Max", "StatCoocTF_Unigram_Min", # "StatCoocNormTF_Unigram_Mean", # "StatCoocNormTF_Unigram_Max", # "StatCoocNormTF_Unigram_Min", "StatCoocTFIDF_Unigram_Mean", "StatCoocTFIDF_Unigram_Max", "StatCoocTFIDF_Unigram_Min", "StatCoocBM25_Unigram_Mean", "StatCoocBM25_Unigram_Max", "StatCoocBM25_Unigram_Min", # "StatCoocTF_Bigram_Mean", # "StatCoocTF_Bigram_Max", # "StatCoocTF_Bigram_Min", # "StatCoocNormTF_Bigram_Mean", # "StatCoocNormTF_Bigram_Max", # "StatCoocNormTF_Bigram_Min", # "StatCoocTFIDF_Bigram_Mean", # "StatCoocTFIDF_Bigram_Max", # "StatCoocTFIDF_Bigram_Min", # "StatCoocBM25_Bigram_Mean", # "StatCoocBM25_Bigram_Max", # "StatCoocBM25_Bigram_Min", # "StatCoocTF_Trigram_Mean", # "StatCoocTF_Trigram_Max", # "StatCoocTF_Trigram_Min", # "StatCoocNormTF_Trigram_Mean", # "StatCoocNormTF_Trigram_Max", # "StatCoocNormTF_Trigram_Min", # "StatCoocTFIDF_Trigram_Mean", # "StatCoocTFIDF_Trigram_Max", # "StatCoocTFIDF_Trigram_Min", # "StatCoocBM25_Trigram_Mean", # "StatCoocBM25_Trigram_Max", # "StatCoocBM25_Trigram_Min", ] intersect_ngram_count_list = [ "IntersectCount_Unigram", "IntersectRatio_Unigram", # "IntersectCount_Bigram", # "IntersectRatio_Bigram", # "IntersectCount_Trigram", # "IntersectRatio_Trigram", ] first_last_ngram_list = [ "FirstIntersectCount_Unigram", "FirstIntersectRatio_Unigram", "LastIntersectCount_Unigram", "LastIntersectRatio_Unigram", # "FirstIntersectCount_Bigram", # "FirstIntersectRatio_Bigram", # "LastIntersectCount_Bigram", # "LastIntersectRatio_Bigram", # "FirstIntersectCount_Trigram", # "FirstIntersectRatio_Trigram", # "LastIntersectCount_Trigram", # "LastIntersectRatio_Trigram", ] cooccurrence_ngram_count_list = [ "CooccurrenceCount_Unigram", "CooccurrenceRatio_Unigram", # "CooccurrenceCount_Bigram", # "CooccurrenceRatio_Bigram", # "CooccurrenceCount_Trigram", # "CooccurrenceRatio_Trigram", ] ngram_jaccard_list = [ "JaccardCoef_Unigram", # "JaccardCoef_Bigram", # "JaccardCoef_Trigram", "DiceDistance_Unigram", # "DiceDistance_Bigram", # "DiceDistance_Trigram", ] char_dist_sim_list = [ "CharDistribution_CosineSim", "CharDistribution_KL", ] tfidf_word_ngram_cosinesim_list = [ "TFIDF_Word_Unigram_CosineSim", # "TFIDF_Word_Bigram_CosineSim", # "TFIDF_Word_Trigram_CosineSim", ] tfidf_char_ngram_cosinesim_list = [ # "TFIDF_Char_Bigram_CosineSim", # "TFIDF_Char_Trigram_CosineSim", "TFIDF_Char_Fourgram_CosineSim", # "TFIDF_Char_Fivegram_CosineSim", ] lsa_word_ngram_cosinesim_list = [ "LSA100_Word_Unigram_CosineSim", # "LSA100_Word_Bigram_CosineSim", # "LSA100_Word_Trigram_CosineSim", ] lsa_char_ngram_cosinesim_list = [ # "LSA100_Char_Bigram_CosineSim", # "LSA100_Char_Trigram_CosineSim", "LSA100_Char_Fourgram_CosineSim", # "LSA100_Char_Fivegram_CosineSim", ] doc2vec_list = [ "Doc2Vec_Homedepot_D100_CosineSim", ] word2vec_list = [ "Word2Vec_N_Similarity", "Word2Vec_Homedepot_D100_CosineSim_Mean_Mean", "Word2Vec_Homedepot_D100_CosineSim_Max_Mean", "Word2Vec_Homedepot_D100_CosineSim_Min_Mean", ] distance_generator_list = \ match_list + \ tfidf_list + \ intersect_ngram_count_list + \ 
first_last_ngram_list + \ cooccurrence_ngram_count_list + \ ngram_jaccard_list + \ tfidf_word_ngram_cosinesim_list + \ tfidf_char_ngram_cosinesim_list + \ lsa_word_ngram_cosinesim_list + \ lsa_char_ngram_cosinesim_list + \ char_dist_sim_list + \ word2vec_list + \ doc2vec_list obs_fields_list = [] target_fields_list = [] ## query in document obs_fields_list.append(["search_term"]) target_fields_list.append(["product_title", "product_title_product_name"]) aggregation_mode = ["mean", "max", "min"] for group_id_name in group_id_names: group_id_list = pkl_utils._load( os.path.join(config.FEAT_DIR, group_id_name + "_1D.pkl")) for distance_generator in distance_generator_list: for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for obs_field in obs_fields: for target_field in target_fields: dist_name = "%s_%s_x_%s" % (distance_generator, obs_field, target_field) try: dist_list = pkl_utils._load( os.path.join(config.FEAT_DIR, dist_name + "_1D.pkl")) ext = GroupDistanceStat(dist_list, group_id_list, dist_name, group_id_name, aggregation_mode) x = ext.transform() if isinstance(ext.__name__(), list): for i, feat_name in enumerate(ext.__name__()): dim = 1 fname = "%s_%dD" % (feat_name, dim) pkl_utils._save( os.path.join( config.FEAT_DIR, fname + config.FEAT_FILE_SUFFIX), x[:, i]) corr = np_utils._corr( x[:TRAIN_SIZE, i], y_train) logger.info("%s (%dD): corr = %.6f" % (fname, dim, corr)) except: logger.info("Skip %s" % dist_name) pass
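# Hedged sketch of what a group-distance statistic such as GroupDistanceStat is assumed to
# compute: for every row, aggregate a 1-D distance feature over all rows that share the same
# group id (e.g. DocId_search_term) and emit one column per aggregation mode. This is a
# pandas illustration of the idea, not the project's actual implementation.
import numpy as np
import pandas as pd

def group_distance_stat(dist_list, group_id_list, aggregation_mode=("mean", "max", "min")):
    df = pd.DataFrame({"dist": dist_list, "gid": group_id_list})
    # per-group statistics of the distance feature
    stats = df.groupby("gid")["dist"].agg(list(aggregation_mode)).reset_index()
    # map each row to the statistics of its group
    merged = df.merge(stats, on="gid", how="left")
    return merged[list(aggregation_mode)].values  # shape: (n_rows, n_aggregations)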
from sklearn.cross_validation import BaseShuffleSplit, ShuffleSplit, StratifiedShuffleSplit # http://stackoverflow.com/questions/2801882/generating-a-png-with-matplotlib-when-display-is-undefined # The solution for me was to add the following code in a place # that gets read before any other pylab/matplotlib/pyplot import: import matplotlib # Force matplotlib to not use any Xwindows backend. matplotlib.use('Agg') import matplotlib.pyplot as plt import config from utils import dist_utils, np_utils from utils import logging_utils, os_utils, pkl_utils, time_utils from get_stacking_feature_conf import get_model_list splitter_level1 = pkl_utils._load("%s/splits_level1.pkl"%config.SPLIT_DIR) splitter_level2 = pkl_utils._load("%s/splits_level2.pkl"%config.SPLIT_DIR) splitter_level3 = pkl_utils._load("%s/splits_level3.pkl"%config.SPLIT_DIR) assert len(splitter_level1) == len(splitter_level2) assert len(splitter_level1) == len(splitter_level3) n_iter = len(splitter_level1) class StratifiedShuffleSplitReplacement(BaseShuffleSplit): def __init__(self, y, n_iter=10, test_size=0.1, train_size=None, random_state=None): super(StratifiedShuffleSplitReplacement, self).__init__( len(y), n_iter, test_size, train_size, random_state)
def main(which): logname = "generate_feature_word2vec_%s_%s.log"%(which, time_utils._timestamp()) logger = logging_utils._get_logger(config.LOG_DIR, logname) #### NOTE: use data BEFORE STEMMinG dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED) word2vec_model_dirs = [] model_prefixes = [] if which == "homedepot": ## word2vec model trained with Homedepot dataset: brand/color/query/title/description word2vec_model_dirs.append( config.WORD2VEC_MODEL_DIR + "/Homedepot-word2vec-D%d-min_count%d.model"%(config.EMBEDDING_DIM, config.EMBEDDING_MIN_COUNT) ) model_prefixes.append( "Homedepot" ) elif which == "wikipedia": ## word2vec model pretrained with Wikipedia+Gigaword 5 word2vec_model_dirs.append( config.GLOVE_WORD2VEC_MODEL_DIR + "/glove.6B.300d.txt" ) model_prefixes.append( "Wikipedia" ) elif which == "google": ## word2vec model pretrained with Google News word2vec_model_dirs.append( config.WORD2VEC_MODEL_DIR + "/GoogleNews-vectors-negative300.bin" ) model_prefixes.append( "GoogleNews" ) for word2vec_model_dir, model_prefix in zip(word2vec_model_dirs, model_prefixes): ## load model try: if ".bin" in word2vec_model_dir: word2vec_model = gensim.models.Word2Vec.load_word2vec_format(word2vec_model_dir, binary=True) elif ".txt" in word2vec_model_dir: word2vec_model = gensim.models.Word2Vec.load_word2vec_format(word2vec_model_dir, binary=False) else: word2vec_model = gensim.models.Word2Vec.load(word2vec_model_dir) except: continue # ## standalone (not used in model building) # obs_fields = ["search_term", "product_title", "product_description"] # generator = Word2Vec_Centroid_Vector # param_list = [word2vec_model, model_prefix] # sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) # sf.go() ## pairwise generators = [ Word2Vec_Importance, Word2Vec_N_Similarity, Word2Vec_N_Similarity_Imp, Word2Vec_Centroid_RMSE, Word2Vec_Centroid_RMSE_IMP, # # not used in final submission # Word2Vec_Centroid_Vdiff, ] obs_fields_list = [] target_fields_list = [] obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"][:1] ) target_fields_list.append( ["product_title", "product_description", "product_attribute", "product_brand", "product_color"] ) for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: param_list = [word2vec_model, model_prefix] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go() ## cosine sim generators = [ Word2Vec_CosineSim, ] # double aggregation aggregation_mode_prev = ["mean", "max", "min", "median"] aggregation_mode = ["mean", "std", "max", "min", "median"] for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: param_list = [word2vec_model, model_prefix, aggregation_mode, aggregation_mode_prev] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go()
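# Hedged sketch of the kind of pairwise feature a generator such as Word2Vec_N_Similarity is
# assumed to produce: gensim's n_similarity between the in-vocabulary tokens of the two text
# fields. The OOV filtering, the fallback value, and the .wv handling (which differs across
# gensim versions) are illustrative assumptions, not the project's actual code.
def n_similarity_feature(word2vec_model, obs_text, target_text, missing_value=0.0):
    vocab = word2vec_model.wv if hasattr(word2vec_model, "wv") else word2vec_model
    tokens1 = [t for t in obs_text.split(" ") if t in vocab]
    tokens2 = [t for t in target_text.split(" ") if t in vocab]
    if not tokens1 or not tokens2:
        return missing_value
    return float(vocab.n_similarity(tokens1, tokens2))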
def combine(self): # combine meta features if self.meta_feature_dict: cb = Combiner(feature_dict=self.meta_feature_dict, feature_name=self.feature_name, feature_suffix=".pkl", corr_threshold=self.corr_threshold) cb.combine() self.X_train_basic = cb.X_train self.X_test_basic = cb.X_test self.feature_names_basic = cb.feature_names_basic self.feature_names.extend(cb.feature_names) else: self.X_train_basic = None self.X_test_basic = None # combine other features dfAll = pkl_utils._load(config.INFO_DATA) dfTrain = dfAll.iloc[:TRAIN_SIZE].copy() dfTest = dfAll.iloc[TRAIN_SIZE:].copy() self.id_test = dfTest["id"].values.astype(int) ## all first = True feat_cnt = 0 feature_dir = "%s/All" % (config.OUTPUT_DIR) for file_name in sorted(os.listdir(feature_dir)): if self.feature_suffix in file_name: ## NOTE the name is valid.pred.MODEL.csv and test.pred.MODEL.csv fname = file_name.split(".")[2] if fname not in self.feature_list: continue if first: self.logger.info("Run for all...") first = False # load prediction x = self.load_feature(feature_dir, "test.pred."+fname) x = np.nan_to_num(x) dim = np_utils._dim(x) dfTest[fname] = x feat_cnt += 1 self.feature_names_cv.append(fname) self.feature_names.append(fname) self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format( feat_cnt, len(self.feature_list), fname, dim)) # load probability if any try: x = self.load_feature(feature_dir, "test.proba."+fname, columns=None, columns_pattern="proba") x = np.nan_to_num(x) dim = np_utils._dim(x) for i in range(dim): dfTest["%s_proba%d"%(fname, i)] = x[:,i] self.logger.info("Combine {:>3}/{:>3} proba feat: {} ({}D)".format( feat_cnt, len(self.feature_list), fname, dim)) self.feature_names.extend(["%s_proba%d"%(fname, i) for i in range(dim)]) except: pass dfTest.fillna(config.MISSING_VALUE_NUMERIC, inplace=True) self.X_test = dfTest.drop(["id","relevance"], axis=1).values.astype(float) if self.meta_feature_dict: self.X_test = np.hstack([self.X_test_basic, self.X_test]) ## for cv features first = True for run in range(1,self.n_iter+1): feat_cnt = 0 idx1 = splitter_level1[run-1][1] idx2 = splitter_level2[run-1][1] if self.feature_level == 2: idx = idx1 elif self.feature_level == 3: idx = [ idx1[i] for i in idx2 ] self.splitter_prev[run-1] = idx dfTrain_cv = dfTrain.iloc[idx].copy() feature_dir = "%s/Run%d" % (config.OUTPUT_DIR, run) for file_name in sorted(os.listdir(feature_dir)): if self.feature_suffix in file_name: ## NOTE the name is valid.pred.MODEL.csv and test.pred.MODEL.csv fname = file_name.split(".")[2] if (fname not in self.feature_list) or (fname not in self.feature_names_cv): continue if first: self.logger.info("Run for cv...") first = False if feat_cnt == 0: self.logger.info("Run %d"%run) # load prediction x = self.load_feature(feature_dir, "valid.pred."+fname) x = np.nan_to_num(x) dim = np_utils._dim(x) # also including level 1 models' predictions if x.shape[0] > len(idx): x = x[idx2] dfTrain_cv[fname] = x feat_cnt += 1 self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format( feat_cnt, len(self.feature_list), fname, dim)) # load probability if any try: x = self.load_feature(feature_dir, "valid.proba."+fname, columns=None, columns_pattern="proba") x = np.nan_to_num(x) dim = np_utils._dim(x) # also including level 1 models' predictions if x.shape[0] > len(idx): x = x[idx2] for i in range(dim): dfTrain_cv["%s_proba%d"%(fname, i)] = x[:,i] self.logger.info("Combine {:>3}/{:>3} proba feat: {} ({}D)".format( feat_cnt, len(self.feature_list), fname, dim)) except: pass
dfTrain_cv.fillna(config.MISSING_VALUE_NUMERIC, inplace=True) if run == 1: self.y_train_cv = [0]*self.n_iter self.X_train_cv = [0]*self.n_iter self.y_train_cv[run-1] = dfTrain_cv["relevance"].values.astype(float) self.X_train_cv[run-1] = dfTrain_cv.drop(["id","relevance"], axis=1).values.astype(float) if self.has_basic: self.logger.info("Overall Shape: %d x %d"%( len(self.y_train_cv[self.n_iter-1]), self.X_train_basic.shape[1] + self.X_train_cv[self.n_iter-1].shape[1])) else: self.logger.info("Overall Shape: %d x %d"%( len(self.y_train_cv[self.n_iter-1]), self.X_train_cv[self.n_iter-1].shape[1])) self.logger.info("Done combining.") return self
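# Hedged sketch of how the per-run matrices built by combine() might feed a level-2 learner.
# Ridge is an arbitrary stand-in, and the assumption that the basic (meta) feature rows align
# with the run's validation rows via splitter_prev is illustrative, not guaranteed.
import numpy as np
from sklearn.linear_model import Ridge

def fit_level2_per_run(combiner):
    models = []
    for run in range(combiner.n_iter):
        X, y = combiner.X_train_cv[run], combiner.y_train_cv[run]
        if combiner.X_train_basic is not None:
            idx = combiner.splitter_prev[run]
            X = np.hstack([combiner.X_train_basic[idx], X])  # assumption: rows align via splitter_prev
        models.append(Ridge(alpha=1.0).fit(X, y))
    return models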
def load_feature(self, feature_dir, feature_name): fname = os.path.join(feature_dir, feature_name+self.feature_suffix) return pkl_utils._load(fname)
def combine(self): dfAll = pkl_utils._load(config.INFO_DATA) dfAll_raw = dfAll.copy() y_train = dfAll["relevance"].values[:TRAIN_SIZE] ## for basic features feat_cnt = 0 self.logger.info("Run for basic...") for file_name in sorted(os.listdir(config.FEAT_DIR)): if self.feature_suffix in file_name: fname = file_name.split(".")[0] if fname not in self.feature_dict: continue x = self.load_feature(config.FEAT_DIR, fname) x = np.nan_to_num(x) if np.isnan(x).any(): self.logger.info("%s nan"%fname) continue # apply feature transform mandatory = self.feature_dict[fname][0] transformer = self.feature_dict[fname][1] x = transformer.fit_transform(x) dim = np_utils._dim(x) if dim == 1: corr = np_utils._corr(x[:TRAIN_SIZE], y_train) if not mandatory and abs(corr) < self.corr_threshold: self.logger.info("Drop: {} ({}D) (abs corr = {}, < threshold = {})".format( fname, dim, abs(corr), self.corr_threshold)) continue dfAll[fname] = x self.feature_names.append(fname) else: columns = ["%s_%d"%(fname, x) for x in range(dim)] df = pd.DataFrame(x, columns=columns) dfAll = pd.concat([dfAll, df], axis=1) self.feature_names.extend(columns) feat_cnt += 1 self.feature_names_basic.append(fname) if dim == 1: self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D) (corr = {})".format( feat_cnt, len(self.feature_dict.keys()), fname, dim, corr)) else: self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format( feat_cnt, len(self.feature_dict.keys()), fname, dim)) dfAll.fillna(config.MISSING_VALUE_NUMERIC, inplace=True) ## basic dfTrain = dfAll.iloc[:TRAIN_SIZE].copy() self.y_train = dfTrain["relevance"].values.astype(float) dfTrain.drop(["id","relevance"], axis=1, inplace=True) self.X_train = dfTrain.values.astype(float) dfTest = dfAll.iloc[TRAIN_SIZE:].copy() self.id_test = dfTest["id"].values.astype(int) dfTest.drop(["id","relevance"], axis=1, inplace=True) self.X_test = dfTest.values.astype(float) ## all first = True feat_cv_cnt = 0 dfAll_cv_all = dfAll_raw.copy() feature_dir = "%s/All" % (config.FEAT_DIR) for file_name in sorted(os.listdir(feature_dir)): if self.feature_suffix in file_name: fname = file_name.split(".")[0] if fname not in self.feature_dict: continue if first: self.logger.info("Run for all...") first = False x = self.load_feature(feature_dir, fname) x = np.nan_to_num(x) if np.isnan(x).any(): self.logger.info("%s nan"%fname) continue # apply feature transform mandatory = self.feature_dict[fname][0] transformer = self.feature_dict[fname][1] x = transformer.fit_transform(x) dim = np_utils._dim(x) if dim == 1: corr = np_utils._corr(x[:TRAIN_SIZE], y_train) if not mandatory and abs(corr) < self.corr_threshold: self.logger.info("Drop: {} ({}D) (abs corr = {}, < threshold = {})".format( fname, dim, abs(corr), self.corr_threshold)) continue dfAll_cv_all[fname] = x self.feature_names.append(fname) else: columns = ["%s_%d"%(fname, x) for x in range(dim)] df = pd.DataFrame(x, columns=columns) dfAll_cv_all = pd.concat([dfAll_cv_all, df], axis=1) self.feature_names.extend(columns) feat_cv_cnt += 1 self.feature_names_cv.append(fname) if dim == 1: self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D) (corr = {})".format( feat_cnt+feat_cv_cnt, len(self.feature_dict.keys()), fname, dim, corr)) else: self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format( feat_cnt+feat_cv_cnt, len(self.feature_dict.keys()), fname, dim)) if feat_cv_cnt > 0: dfAll_cv_all.fillna(config.MISSING_VALUE_NUMERIC, inplace=True) X_tmp = dfAll_cv_all.drop(["id","relevance"], axis=1).values.astype(float) self.X_train_cv_all = X_tmp[:TRAIN_SIZE] 
self.X_test = np.hstack((self.X_test, X_tmp[TRAIN_SIZE:])) else: self.X_train_cv_all = None feat_cnt += feat_cv_cnt ## for cv features first = True for run in range(1,self.n_iter+1): feat_cv_cnt = 0 dfAll_cv = dfAll_raw.copy() feature_dir = "%s/Run%d" % (config.FEAT_DIR, run) for file_name in sorted(os.listdir(feature_dir)): if self.feature_suffix in file_name: fname = file_name.split(".")[0] if (fname not in self.feature_dict) or (fname not in self.feature_names_cv): continue if first: self.logger.info("Run for cv...") first = False if feat_cv_cnt == 0: self.logger.info("Run %d"%run) x = self.load_feature(feature_dir, fname) x = np.nan_to_num(x) if np.isnan(x).any(): self.logger.info("%s nan"%fname) continue # apply feature transform mandatory = self.feature_dict[fname][0] transformer = self.feature_dict[fname][1] x = transformer.fit_transform(x) dim = np_utils._dim(x) if dim == 1: dfAll_cv[fname] = x else: columns = ["%s_%d"%(fname, x) for x in range(dim)] df = pd.DataFrame(x, columns=columns) dfAll_cv = pd.concat([dfAll_cv, df], axis=1) feat_cv_cnt += 1 self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format( feat_cnt+feat_cv_cnt, len(self.feature_dict.keys()), fname, dim)) if feat_cv_cnt > 0: dfAll_cv.fillna(config.MISSING_VALUE_NUMERIC, inplace=True) dfTrain_cv = dfAll_cv.iloc[:TRAIN_SIZE].copy() X_tmp = dfTrain_cv.drop(["id","relevance"], axis=1).values.astype(float) if run == 1: self.X_train_cv = np.zeros((X_tmp.shape[0], X_tmp.shape[1], self.n_iter), dtype=float) self.X_train_cv[:,:,run-1] = X_tmp if feat_cv_cnt == 0: self.X_train_cv = None self.basic_only = 1 # report final results if self.basic_only: self.logger.info("Overall Shape: %d x %d"%(len(self.y_train), self.X_train.shape[1])) else: self.logger.info("Overall Shape: %d x %d"%( len(self.y_train), self.X_train.shape[1]+self.X_train_cv_all.shape[1])) self.logger.info("Done combining.") return self
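# Hedged sketch of the correlation filter applied above: a 1-D feature is kept only when
# |corr(x[:TRAIN_SIZE], y_train)| reaches corr_threshold, or when the feature is marked
# mandatory in the feature conf. np.corrcoef stands in for np_utils._corr; the NaN guard
# for constant columns is an illustrative assumption.
import numpy as np

def keep_feature(x, y_train, corr_threshold, mandatory=False):
    corr = np.corrcoef(x[:len(y_train)], y_train)[0, 1]
    if np.isnan(corr):  # e.g. a constant feature column
        corr = 0.0
    return (mandatory or abs(corr) >= corr_threshold), corr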
def main(): logname = "generate_feature_basic_%s.log" % time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) ## basic generators = [DocId, DocLen, DocFreq, DocEntropy, DigitCount, DigitRatio] obs_fields = [ "search_term", "product_title", "product_description", "product_attribute", "product_brand", "product_color" ] for generator in generators: param_list = [] sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) sf.go() ## for product_uid generators = [ DocIdEcho, DocFreq, ProductUidDummy1, ProductUidDummy2, ProductUidDummy3 ] obs_fields = ["product_uid"] for generator in generators: param_list = [] sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) sf.go() ## unique count generators = [UniqueCount_Ngram, UniqueRatio_Ngram] obs_fields = [ "search_term", "product_title", "product_description", "product_attribute", "product_brand", "product_color" ] ngrams = [1, 2, 3] for generator in generators: for ngram in ngrams: param_list = [ngram] sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) sf.go() ## for product_attribute_list generators = [ AttrCount, AttrBulletCount, AttrBulletRatio, AttrNonBulletCount, AttrNonBulletRatio, AttrHasProductHeight, AttrHasProductWidth, AttrHasProductLength, AttrHasProductDepth, AttrHasIndoorOutdoor, ] obs_fields = ["product_attribute_list"] for generator in generators: param_list = [] sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) sf.go()
def main(): logname = "generate_feature_group_distance_stat_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) y_train = dfAll["relevance"].values[:TRAIN_SIZE] group_id_names = ["DocId_search_term", "DocId_product_title", "DocIdEcho_product_uid"] match_list = [ "MatchQueryCount", "MatchQueryRatio", "LongestMatchRatio", ] tfidf_list = [ "StatCoocTF_Unigram_Mean", "StatCoocTF_Unigram_Max", "StatCoocTF_Unigram_Min", # "StatCoocNormTF_Unigram_Mean", # "StatCoocNormTF_Unigram_Max", # "StatCoocNormTF_Unigram_Min", "StatCoocTFIDF_Unigram_Mean", "StatCoocTFIDF_Unigram_Max", "StatCoocTFIDF_Unigram_Min", "StatCoocBM25_Unigram_Mean", "StatCoocBM25_Unigram_Max", "StatCoocBM25_Unigram_Min", # "StatCoocTF_Bigram_Mean", # "StatCoocTF_Bigram_Max", # "StatCoocTF_Bigram_Min", # "StatCoocNormTF_Bigram_Mean", # "StatCoocNormTF_Bigram_Max", # "StatCoocNormTF_Bigram_Min", # "StatCoocTFIDF_Bigram_Mean", # "StatCoocTFIDF_Bigram_Max", # "StatCoocTFIDF_Bigram_Min", # "StatCoocBM25_Bigram_Mean", # "StatCoocBM25_Bigram_Max", # "StatCoocBM25_Bigram_Min", # "StatCoocTF_Trigram_Mean", # "StatCoocTF_Trigram_Max", # "StatCoocTF_Trigram_Min", # "StatCoocNormTF_Trigram_Mean", # "StatCoocNormTF_Trigram_Max", # "StatCoocNormTF_Trigram_Min", # "StatCoocTFIDF_Trigram_Mean", # "StatCoocTFIDF_Trigram_Max", # "StatCoocTFIDF_Trigram_Min", # "StatCoocBM25_Trigram_Mean", # "StatCoocBM25_Trigram_Max", # "StatCoocBM25_Trigram_Min", ] intersect_ngram_count_list = [ "IntersectCount_Unigram", "IntersectRatio_Unigram", # "IntersectCount_Bigram", # "IntersectRatio_Bigram", # "IntersectCount_Trigram", # "IntersectRatio_Trigram", ] first_last_ngram_list = [ "FirstIntersectCount_Unigram", "FirstIntersectRatio_Unigram", "LastIntersectCount_Unigram", "LastIntersectRatio_Unigram", # "FirstIntersectCount_Bigram", # "FirstIntersectRatio_Bigram", # "LastIntersectCount_Bigram", # "LastIntersectRatio_Bigram", # "FirstIntersectCount_Trigram", # "FirstIntersectRatio_Trigram", # "LastIntersectCount_Trigram", # "LastIntersectRatio_Trigram", ] cooccurrence_ngram_count_list = [ "CooccurrenceCount_Unigram", "CooccurrenceRatio_Unigram", # "CooccurrenceCount_Bigram", # "CooccurrenceRatio_Bigram", # "CooccurrenceCount_Trigram", # "CooccurrenceRatio_Trigram", ] ngram_jaccard_list = [ "JaccardCoef_Unigram", # "JaccardCoef_Bigram", # "JaccardCoef_Trigram", "DiceDistance_Unigram", # "DiceDistance_Bigram", # "DiceDistance_Trigram", ] char_dist_sim_list = [ "CharDistribution_CosineSim", "CharDistribution_KL", ] tfidf_word_ngram_cosinesim_list = [ "TFIDF_Word_Unigram_CosineSim", # "TFIDF_Word_Bigram_CosineSim", # "TFIDF_Word_Trigram_CosineSim", ] tfidf_char_ngram_cosinesim_list = [ # "TFIDF_Char_Bigram_CosineSim", # "TFIDF_Char_Trigram_CosineSim", "TFIDF_Char_Fourgram_CosineSim", # "TFIDF_Char_Fivegram_CosineSim", ] lsa_word_ngram_cosinesim_list = [ "LSA100_Word_Unigram_CosineSim", # "LSA100_Word_Bigram_CosineSim", # "LSA100_Word_Trigram_CosineSim", ] lsa_char_ngram_cosinesim_list = [ # "LSA100_Char_Bigram_CosineSim", # "LSA100_Char_Trigram_CosineSim", "LSA100_Char_Fourgram_CosineSim", # "LSA100_Char_Fivegram_CosineSim", ] doc2vec_list = [ "Doc2Vec_Homedepot_D100_CosineSim", ] word2vec_list = [ "Word2Vec_N_Similarity", "Word2Vec_Homedepot_D100_CosineSim_Mean_Mean", "Word2Vec_Homedepot_D100_CosineSim_Max_Mean", "Word2Vec_Homedepot_D100_CosineSim_Min_Mean", ] distance_generator_list = \ match_list + \ tfidf_list + \ intersect_ngram_count_list + \ first_last_ngram_list + 
\ cooccurrence_ngram_count_list + \ ngram_jaccard_list + \ tfidf_word_ngram_cosinesim_list + \ tfidf_char_ngram_cosinesim_list + \ lsa_word_ngram_cosinesim_list + \ lsa_char_ngram_cosinesim_list + \ char_dist_sim_list + \ word2vec_list + \ doc2vec_list obs_fields_list = [] target_fields_list = [] ## query in document obs_fields_list.append( ["search_term"] ) target_fields_list.append( ["product_title", "product_title_product_name"] ) aggregation_mode = ["mean", "max", "min"] for group_id_name in group_id_names: group_id_list = pkl_utils._load(os.path.join(config.FEAT_DIR, group_id_name+"_1D.pkl")) for distance_generator in distance_generator_list: for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for obs_field in obs_fields: for target_field in target_fields: dist_name = "%s_%s_x_%s"%(distance_generator, obs_field, target_field) try: dist_list = pkl_utils._load(os.path.join(config.FEAT_DIR, dist_name+"_1D.pkl")) ext = GroupDistanceStat(dist_list, group_id_list, dist_name, group_id_name, aggregation_mode) x = ext.transform() if isinstance(ext.__name__(), list): for i,feat_name in enumerate(ext.__name__()): dim = 1 fname = "%s_%dD"%(feat_name, dim) pkl_utils._save(os.path.join(config.FEAT_DIR, fname+config.FEAT_FILE_SUFFIX), x[:,i]) corr = np_utils._corr(x[:TRAIN_SIZE,i], y_train) logger.info("%s (%dD): corr = %.6f"%(fname, dim, corr)) except: logger.info("Skip %s"%dist_name) pass
def main(): ########### ## Setup ## ########### logname = "data_processor_%s.log" % now logger = logging_utils._get_logger(config.LOG_DIR, logname) # put product_attribute_list, product_attribute and product_description first as they are # quite time consuming to process columns_to_proc = [ "question1", "question2", ] if config.PLATFORM == "Linux": config.DATA_PROCESSOR_N_JOBS = len(columns_to_proc) # clean using a list of processors processors = [ UnicodeConverter(), LowerCaseConverter(), # See LowerUpperCaseSplitter and UnitConverter for why we put UnitConverter here UnitConverter(), LowerUpperCaseSplitter(), # WordReplacer(replace_fname=config.WORD_REPLACER_DATA), LetterLetterSplitter(), DigitLetterSplitter(), DigitCommaDigitMerger(), NumberDigitMapper(), UnitConverter(), QuartetCleaner(), HtmlCleaner(parser="html.parser"), Lemmatizer(), ] stemmers = [ Stemmer(stemmer_type="snowball"), Stemmer(stemmer_type="porter") ][0:1] ## simple test text = "1/2 inch rubber lep tips Bullet07" print("Original:") print(text) list_processor = ListProcessor(processors) print("After:") print(list_processor.process([text])) ############# ## Process ## ############# ## load raw data dfAll = pkl_utils._load(config.ALL_DATA_RAW) columns_to_proc = [col for col in columns_to_proc if col in dfAll.columns] ## clean using a list of processors df_processor = DataFrameParallelProcessor(processors, config.DATA_PROCESSOR_N_JOBS) df_processor.process(dfAll, columns_to_proc) # save data logger.info("Save to %s" % config.ALL_DATA_LEMMATIZED) pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll) ## clean using stemmers df_processor = DataFrameParallelProcessor(stemmers, config.DATA_PROCESSOR_N_JOBS) df_processor.process(dfAll, columns_to_proc) # save data logger.info("Save to %s" % config.ALL_DATA_LEMMATIZED_STEMMED) pkl_utils._save(config.ALL_DATA_LEMMATIZED_STEMMED, dfAll)
def __init__(self, model_name, runs, params_dict, logger): print("Loading data...") words, positions, heads, tails, labels = pkl_utils._load( config.GROUPED_TRAIN_DATA) words_test, positions_test, heads_test, tails_test, labels_test = pkl_utils._load( config.GROUPED_TEST_DATA) # noqa self.embedding = embedding_utils.Embedding( config.EMBEDDING_DATA, list([s for bags in words for s in bags]) + list([s for bags in words_test for s in bags]), config.MAX_DOCUMENT_LENGTH) print("Preprocessing data...") textlen = np.array([[self.embedding.len_transform(x) for x in y] for y in words]) words = np.array([[self.embedding.text_transform(x) for x in y] for y in words]) positions = np.array( [[self.embedding.position_transform(x) for x in y] for y in positions]) textlen_test = np.array([[self.embedding.len_transform(x) for x in y] for y in words_test]) words_test = np.array([[self.embedding.text_transform(x) for x in y] for y in words_test]) positions_test = np.array( [[self.embedding.position_transform(x) for x in y] for y in positions_test]) # noqa ss = ShuffleSplit(n_splits=1, test_size=0.1, random_state=config.RANDOM_SEED) for train_index, valid_index in ss.split(np.zeros(len(labels)), labels): words_train, words_valid = words[train_index], words[valid_index] textlen_train, textlen_valid = textlen[train_index], textlen[ valid_index] positions_train, positions_valid = positions[ train_index], positions[valid_index] heads_train, heads_valid = heads[train_index], heads[valid_index] tails_train, tails_valid = tails[train_index], tails[valid_index] labels_train, labels_valid = labels[train_index], labels[ valid_index] if "hrere" in model_name: self.full_set = list( zip(words, textlen, positions, heads, tails, labels)) self.train_set = list( zip(words_train, textlen_train, positions_train, heads_train, tails_train, labels_train)) # noqa self.valid_set = list( zip(words_valid, textlen_valid, positions_valid, heads_valid, tails_valid, labels_valid)) # noqa self.test_set = list( zip(words_test, textlen_test, positions_test, heads_test, tails_test, labels_test)) # noqa if "complex" in model_name: self.entity_embedding1 = np.load(config.ENTITY_EMBEDDING1) self.entity_embedding2 = np.load(config.ENTITY_EMBEDDING2) self.relation_embedding1 = np.load(config.RELATION_EMBEDDING1) self.relation_embedding2 = np.load(config.RELATION_EMBEDDING2) else: self.entity_embedding = np.load(config.ENTITY_EMBEDDING) self.relation_embedding = np.load(config.RELATION_EMBEDDING) else: self.full_set = list(zip(words, textlen, positions, labels)) self.train_set = list( zip(words_train, textlen_train, positions_train, labels_train)) # noqa self.valid_set = list( zip(words_valid, textlen_valid, positions_valid, labels_valid)) # noqa self.test_set = list( zip(words_test, textlen_test, positions_test, labels_test)) # noqa self.model_name = model_name self.runs = runs self.params_dict = params_dict self.hparams = AttrDict(params_dict) self.logger = logger self.model = self._get_model() self.saver = tf.train.Saver(tf.global_variables()) checkpoint_dir = os.path.abspath(config.CHECKPOINT_DIR) if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) self.checkpoint_prefix = os.path.join(checkpoint_dir, self.__str__())
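# Hedged sketch of the data_utils.batch_iter helper used by the evaluation code further
# below: it is assumed to yield mini-batches of the zipped example tuples for a given number
# of epochs, optionally shuffling. The signature is inferred from the call sites
# (batch_iter(test_set, 512, 1, shuffle=False)); this re-implementation is illustrative only.
import random

def batch_iter_sketch(data, batch_size, num_epochs, shuffle=True):
    data = list(data)
    n = len(data)
    for _ in range(num_epochs):
        order = list(range(n))
        if shuffle:
            random.shuffle(order)
        for start in range(0, n, batch_size):
            yield [data[i] for i in order[start:start + batch_size]]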
def main(): ### 1. Record Time now = time_utils._timestamp() ########### ## Setup ## ########### logname = f'data_processor_{now}.log' logger = logging_utils._get_logger(config.LOG_DIR, logname) # Put product_attribute_list, product_attribute and product_description first as they are # quite time consuming to process. # Choose the columns by checking data_preparer.ipynb; at the end, the notebook shows the cleaned data frame. columns_to_proc = [ # # product_attribute_list is very time consuming to process # # so we just process product_attribute which is of the form # # attr_name1 | attr_value1 | attr_name2 | attr_value2 | ... # # and split it into a list afterwards # 'product_attribute_list', 'product_attribute_concat', 'product_description', 'product_brand', 'product_color', 'product_title', 'search_term', ] if config.PLATFORM == 'Linux': config.DATA_PROCESSOR_N_JOBS = len(columns_to_proc) # clean using a list of processors processors = [ LowerCaseConverter(), # See LowerUpperCaseSplitter and UnitConverter for why we put UnitConverter here # It actually makes little difference, unless we can keep a number followed by the preposition "in" from being replaced with the unit in. (inch) UnitConverter(), LowerUpperCaseSplitter(), WordReplacer(replace_fname=config.WORD_REPLACER_DATA), LetterLetterSplitter(), DigitLetterSplitter(), DigitCommaDigitMerger(), NumberDigitMapper(), UnitConverter(), QuartetCleaner(), HtmlCleaner(parser='html.parser'), Lemmatizer(), ] stemmers = [ Stemmer(stemmer_type='snowball'), Stemmer(stemmer_type='porter') ][0:1] # means only use Stemmer(stemmer_type='snowball') ## simple test text = '1/2 inch rubber lep tips Bullet07' print('Original:') print(text) list_processor = ListProcessor(processors) print('After:') print(list_processor.process([text])) ############# ## Process ## ############# ## load raw data dfAll = pkl_utils._load(config.ALL_DATA_RAW) columns_to_proc = [col for col in columns_to_proc if col in dfAll.columns] if config.TASK == 'sample': dfAll = dfAll.iloc[0:config.SAMPLE_SIZE] print(f'data length: {len(dfAll)}') ## extract product name from search_term and product_title ext = ProductNameExtractor() dfAll['search_term_product_name'] = dfAll['search_term'].apply( ext.transform) dfAll['product_title_product_name'] = dfAll['product_title'].apply( ext.transform) if config.TASK == 'sample': print(dfAll[[ 'search_term', 'search_term_product_name', 'product_title_product_name' ]]) ## clean using GoogleQuerySpellingChecker (not used in the Chenglong team's final submission) # MUST BE IN FRONT OF ALL THE PROCESSING if config.GOOGLE_CORRECTING_QUERY: logger.info('Run GoogleQuerySpellingChecker at search_term') checker = GoogleQuerySpellingChecker() dfAll['search_term'] = dfAll['search_term'].apply(checker.correct) ## clean using a list of processors df_processor = DataFrameParallelProcessor(processors, config.DATA_PROCESSOR_N_JOBS) df_processor.process(dfAll, columns_to_proc) # split product_attribute_concat into product_attribute and product_attribute_list dfAll['product_attribute'] = dfAll['product_attribute_concat'].apply( _split_attr_to_text) dfAll['product_attribute_list'] = dfAll['product_attribute_concat'].apply( _split_attr_to_list) if config.TASK == 'sample': print(dfAll[['product_attribute', 'product_attribute_list']]) # query expansion (the Chenglong team decided to remove this feature, which might be a major cause of overfitting.) if config.QUERY_EXPANSION: list_processor = ListProcessor(processors) # stop words must go through the same processing. E.g. NumberDigitMapper replaces 'one' with '1', # so a stop word 'one' must also become '1'. 
base_stopwords = set(list_processor.process(list( config.STOP_WORDS))) # a set of stop words qe = QueryExpansion(dfAll, ngram=3, stopwords_threshold=0.9, base_stopwords=base_stopwords) dfAll['search_term_alt'] = qe.build() if config.TASK == 'sample': print(dfAll[['search_term', 'search_term_alt']]) # save data logger.info(f'Save to {config.ALL_DATA_LEMMATIZED}') columns_to_save = [ col for col in dfAll.columns if col != 'product_attribute_concat' ] pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll[columns_to_save]) ## auto-correcting query (not used in the Chenglong team's final submission) if config.AUTO_CORRECTING_QUERY: logger.info('Run AutoSpellingChecker at search_term') checker = AutoSpellingChecker(dfAll, exclude_stopwords=False, min_len=4) dfAll['search_term_auto_corrected'] = list(dfAll['search_term'].apply( checker.correct)) columns_to_proc += ['search_term_auto_corrected'] if config.TASK == 'sample': print(dfAll[['search_term', 'search_term_auto_corrected']]) # save query_correction_map and spelling checker fname = '%s/auto_spelling_checker_query_correction_map_%s.log' % ( config.LOG_DIR, now) checker.save_query_correction_map(fname) # save data logger.info('Save to %s' % config.ALL_DATA_LEMMATIZED) columns_to_save = [ col for col in dfAll.columns if col != 'product_attribute_concat' ] pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll[columns_to_save]) ## clean using stemmers df_processor = DataFrameParallelProcessor(stemmers, config.DATA_PROCESSOR_N_JOBS) df_processor.process(dfAll, columns_to_proc) # split product_attribute_concat into product_attribute and product_attribute_list dfAll['product_attribute'] = dfAll['product_attribute_concat'].apply( _split_attr_to_text) dfAll['product_attribute_list'] = dfAll['product_attribute_concat'].apply( _split_attr_to_list) # query expansion if config.QUERY_EXPANSION: list_processor = ListProcessor(stemmers) base_stopwords = set(list_processor.process(list(config.STOP_WORDS))) qe = QueryExpansion(dfAll, ngram=3, stopwords_threshold=0.9, base_stopwords=base_stopwords) dfAll['search_term_alt'] = qe.build() if config.TASK == 'sample': print(dfAll[['search_term', 'search_term_alt']]) # save data logger.info('Save to %s' % config.ALL_DATA_LEMMATIZED_STEMMED) columns_to_save = [ col for col in dfAll.columns if col != 'product_attribute_concat' ] pkl_utils._save(config.ALL_DATA_LEMMATIZED_STEMMED, dfAll[columns_to_save])
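# Hedged sketch of the attribute-splitting step: the comment earlier in this file describes
# product_attribute as "attr_name1 | attr_value1 | attr_name2 | attr_value2 | ...", so a
# minimal _split_attr_to_text / _split_attr_to_list pair might look like the following.
# The separator handling is an assumption; the project's real helpers may differ.
def _split_attr_to_list_sketch(attr_concat, sep=" | "):
    items = [s.strip() for s in attr_concat.split(sep) if s.strip()]
    # pair up (name, value) tokens
    return list(zip(items[0::2], items[1::2]))

def _split_attr_to_text_sketch(attr_concat, sep=" | "):
    return " ".join(name + " " + value
                    for name, value in _split_attr_to_list_sketch(attr_concat, sep))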
from sklearn.cross_validation import train_test_split import config from utils import pkl_utils combine_flag = False suffix = 'v4' threshold = 0.05 if combine_flag: cmd = "python get_feature_conf_magic.py -l 5 -m 44 -o feature_conf_magic_%s.py"%suffix os.system(cmd) cmd = "python feature_combiner.py -l 1 -c feature_conf_magic_%s -n basic_magic_%s -t %.6f"%(suffix, suffix, threshold) os.system(cmd) feature_name = "basic_magic_{}".format(suffix) fname = os.path.join(config.FEAT_DIR+"/Combine", feature_name+config.FEAT_FILE_SUFFIX) data_dict = pkl_utils._load(fname) X_train = pd.DataFrame(data_dict["X_train_basic"], columns = data_dict["feature_names"]) X_test = pd.DataFrame(data_dict["X_test"], columns = data_dict["feature_names"]) y_train = data_dict["y_train"] X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=4242) #UPDownSampling pos_train = X_train[y_train == 1] neg_train = X_train[y_train == 0] X_train = pd.concat((neg_train, pos_train.iloc[:int(0.8*len(pos_train))], neg_train)) y_train = np.array([0] * neg_train.shape[0] + [1] * pos_train.iloc[:int(0.8*len(pos_train))].shape[0] + [0] * neg_train.shape[0]) print(np.mean(y_train)) del pos_train, neg_train pos_valid = X_valid[y_valid == 1]
def get_types(model_name, input_file, dev_file, output_file, options): checkpoint_file = os.path.join(config.CHECKPOINT_DIR, model_name) type2id, typeDict = pkl_utils._load(config.WIKI_TYPE) id2type = {type2id[x]: x for x in type2id.keys()} #different way? -> data is different! # words, mentions, positions, labels = data_utils.load(input_file) # n = len(words) embedding = embedding_utils.Embedding.restore(checkpoint_file) test_set, test_labels, test_tokenized = create_labelset_input( *data_utils.load(input_file), embedding) dev_set, dev_labels, dev_tokenized = create_labelset_input( *data_utils.load(dev_file), embedding) store = StructuredLogitsStore( model_name, idx2label=id2type, hierarchical=True if "hier" in model_name else False, nested=False) graph = tf.Graph() with graph.as_default(): sess = tf.Session() saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file)) saver.restore(sess, checkpoint_file) # DEFINE operations input_words = graph.get_operation_by_name("input_words").outputs[0] input_textlen = graph.get_operation_by_name("input_textlen").outputs[0] input_mentions = graph.get_operation_by_name( "input_mentions").outputs[0] input_mentionlen = graph.get_operation_by_name( "input_mentionlen").outputs[0] input_positions = graph.get_operation_by_name( "input_positions").outputs[0] phase = graph.get_operation_by_name("phase").outputs[0] dense_dropout = graph.get_operation_by_name("dense_dropout").outputs[0] rnn_dropout = graph.get_operation_by_name("rnn_dropout").outputs[0] pred_op = graph.get_operation_by_name("output/predictions").outputs[0] #proba_op = graph.get_operation_by_name("output/proba").outputs[0] #proba logit_op = graph.get_operation_by_name("output/scores").outputs[ 0] #proba tune_op = graph.get_operation_by_name("tune").outputs[0] # K x K # results_op = graph.get_operation_by_name("results").outputs[0] # require labels # DO THE SAME FOR DEV set! 
test_batches = data_utils.batch_iter(test_set, 512, 1, shuffle=False) all_predictions = [] all_logits = [] for batch in test_batches: words_batch, textlen_batch, mentions_batch, mentionlen_batch, positions_batch, labels_batch = zip( *batch) feed = { input_words: words_batch, input_textlen: textlen_batch, input_mentions: mentions_batch, input_mentionlen: mentionlen_batch, input_positions: positions_batch, phase: False, dense_dropout: 1.0, rnn_dropout: 1.0 } batch_predictions = sess.run(pred_op, feed_dict=feed) all_predictions = np.concatenate( [all_predictions, batch_predictions]) #probas = sess.run(logit_op, feed_dict=feed) logit_predictions = sess.run(logit_op, feed_dict=feed) if all_logits == []: all_logits = logit_predictions else: all_logits = np.concatenate([all_logits, logit_predictions]) store.create_labelset( StructuredLogits(f_x=all_logits, y_true=test_labels, tokenized=test_tokenized, y_hat=None, probas=None, c=None, document_masks=None, idx2label=id2type), "test") store.score_set("test") dev_batches = data_utils.batch_iter(dev_set, 512, 1, shuffle=False) all_predictions = [] all_logits = [] for batch in dev_batches: words_batch, textlen_batch, mentions_batch, mentionlen_batch, positions_batch, labels_batch = zip( *batch) feed = { input_words: words_batch, input_textlen: textlen_batch, input_mentions: mentions_batch, input_mentionlen: mentionlen_batch, input_positions: positions_batch, phase: False, dense_dropout: 1.0, rnn_dropout: 1.0 } batch_predictions = sess.run(pred_op, feed_dict=feed) all_predictions = np.concatenate( [all_predictions, batch_predictions]) #probas = sess.run(logit_op, feed_dict=feed) logit_predictions = sess.run(logit_op, feed_dict=feed) if all_logits == []: all_logits = logit_predictions else: all_logits = np.concatenate([all_logits, logit_predictions]) store.create_labelset( StructuredLogits(f_x=all_logits, y_true=dev_labels, tokenized=dev_tokenized, y_hat=None, probas=None, c=None, document_masks=None, idx2label=id2type), "dev") store.score_set("dev") #np.transpose(prior_utils.create_prior(type_info, hparams.alpha) # all_logits.append(logit_predictions) # save as pickle with open(os.path.join(os.path.dirname(checkpoint_file), "logits.pickle"), "wb") as f: pickle.dump(store, f) """
def get_types(model_name, input_file, output_file): checkpoint_file = os.path.join(config.CHECKPOINT_DIR, model_name) type2id, typeDict = pkl_utils._load(config.WIKI_TYPE) id2type = {type2id[x]:x for x in type2id.keys()} df = pd.read_csv(input_file, sep="\t", names=["r", "e1", "x1", "y1", "e2", "x2", "y2", "s"]) n = df.shape[0] words1 = np.array(df.s) mentions1 = np.array(df.e1) positions1 = np.array([[x, y] for x, y in zip(df.x1, df.y1+1)]) words2 = np.array(df.s) mentions2 = np.array(df.e2) positions2 = np.array([[x, y] for x, y in zip(df.x2, df.y2+1)]) words = np.concatenate([words1, words2]) mentions = np.concatenate([mentions1, mentions2]) positions = np.concatenate([positions1, positions2]) embedding = embedding_utils.Embedding.restore(checkpoint_file) textlen = np.array([embedding.len_transform1(x) for x in words]) words = np.array([embedding.text_transform1(x) for x in words]) mentionlen = np.array([embedding.len_transform2(x) for x in mentions]) mentions = np.array([embedding.text_transform2(x) for x in mentions]) positions = np.array([embedding.position_transform(x) for x in positions]) labels = np.zeros(2*n) test_set = list(zip(words, textlen, mentions, mentionlen, positions, labels)) graph = tf.Graph() with graph.as_default(): sess = tf.Session() saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file)) saver.restore(sess, checkpoint_file) input_words = graph.get_operation_by_name("input_words").outputs[0] input_textlen = graph.get_operation_by_name("input_textlen").outputs[0] input_mentions = graph.get_operation_by_name("input_mentions").outputs[0] input_mentionlen = graph.get_operation_by_name("input_mentionlen").outputs[0] input_positions = graph.get_operation_by_name("input_positions").outputs[0] phase = graph.get_operation_by_name("phase").outputs[0] dense_dropout = graph.get_operation_by_name("dense_dropout").outputs[0] rnn_dropout = graph.get_operation_by_name("rnn_dropout").outputs[0] pred_op = graph.get_operation_by_name("output/predictions").outputs[0] batches = data_utils.batch_iter(test_set, 512, 1, shuffle=False) all_predictions = [] for batch in batches: words_batch, textlen_batch, mentions_batch, mentionlen_batch, positions_batch, labels_batch = zip(*batch) feed = { input_words: words_batch, input_textlen: textlen_batch, input_mentions: mentions_batch, input_mentionlen: mentionlen_batch, input_positions: positions_batch, phase: False, dense_dropout: 1.0, rnn_dropout: 1.0 } batch_predictions = sess.run(pred_op, feed_dict=feed) all_predictions = np.concatenate([all_predictions, batch_predictions]) df["t1"] = all_predictions[:n] df["t2"] = all_predictions[n:] df["t1"] = df["t1"].map(id2type) df["t2"] = df["t2"].map(id2type) df.to_csv(output_file, sep="\t", header=False, index=False)
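# Hedged usage sketch for get_types above: the input TSV is read with the columns
# r, e1, x1, y1, e2, x2, y2, s (relation, the two entity mentions with their start/end
# offsets, and the sentence), and the same rows are written back with the predicted types
# t1 and t2 appended. The model name and file paths below are placeholders.
if __name__ == "__main__":
    get_types(model_name="my_checkpoint_name",
              input_file="data/triples.tsv",
              output_file="data/triples_typed.tsv")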
def _load_data_dict(self): fname = os.path.join(config.FEAT_DIR+"/Combine", self.feature_name + config.FEAT_FILE_SUFFIX) data_dict = pkl_utils._load(fname) return data_dict
def preprocess_new(data_name, if_clean=False, full_path=False): if data_name == "wiki": raw_all_file = config.WIKI_ALL raw_train_file = config.WIKI_TRAIN raw_test_file = config.WIKI_TEST clean_train_file = config.WIKI_TRAIN_CLEAN clean_test_file = config.WIKI_TEST_CLEAN type_file = config.WIKI_TYPE raw_valid_file = config.WIKI_VALID elif data_name == "ontonotes": raw_all_file = config.ONTONOTES_ALL raw_train_file = config.ONTONOTES_TRAIN raw_test_file = config.ONTONOTES_TEST clean_train_file = config.ONTONOTES_TRAIN_CLEAN clean_test_file = config.ONTONOTES_TEST_CLEAN type_file = config.ONTONOTES_TYPE raw_valid_file = config.ONTONOTES_VALID elif data_name == "bbn": raw_all_file = config.BBN_ALL raw_train_file = config.BBN_TRAIN raw_test_file = config.BBN_TEST raw_valid_file = config.BBN_VALID clean_train_file = config.BBN_TRAIN_CLEAN clean_test_file = config.BBN_TEST_CLEAN type_file = config.BBN_TYPE else: raise AttributeError("Invalid data name!") if not os.path.exists(type_file): create_type_dict_new(raw_all_file, type_file, full_path) type2id, typeDict = pkl_utils._load(type_file) data_train = json.load(open(raw_train_file)) data_valid = json.load(open(raw_valid_file)) data_test = json.load(open(raw_test_file)) data = data_train + data_valid size = len(data) outfile = open(clean_train_file, "w") for i in range(size): for j in range(len(data[i]["mentions"])): p1 = data[i]["mentions"][j]["start"] p2 = data[i]["mentions"][j]["end"] types = data[i]["mentions"][j]["labels"] if (not path_count(types) == 1) and if_clean: continue tokens = [clear_text(txt) for txt in data[i]["tokens"]] if p1 >= len(tokens): continue mention = " ".join(tokens[p1:p2]) if p1 == 0: mention = "<PAD> " + mention else: mention = tokens[p1 - 1] + " " + mention if p2 >= len(tokens): mention = mention + " <PAD>" else: mention = mention + " " + tokens[p2] offset = max(0, p1 - config.WINDOW_SIZE) text = " ".join(tokens[offset:min(len(tokens), p2 + config.WINDOW_SIZE - 1)]) p1 -= offset p2 -= offset out_type = [] for a in types: flag = True for b in types: if len(a) >= len(b): continue if (a == b[:len(a)]) and (b[len(a)] == "/"): flag = False if flag: out_type.append(a) if len(out_type) > 0: if full_path: try: outfile.write("%d\t%d\t%s\t%s\t%s\n" % (p1, p2, text, mention, " ".join(types))) except: continue else: try: outfile.write("%d\t%d\t%s\t%s\t%s\n" % (p1, p2, text, mention, " ".join(out_type))) except: continue outfile.close() outfile = open(clean_test_file, "w") size = len(data_test) for i in range(size): for j in range(len(data_test[i]["mentions"])): p1 = data_test[i]["mentions"][j]["start"] p2 = data_test[i]["mentions"][j]["end"] types = data_test[i]["mentions"][j]["labels"] tokens = [clear_text(txt) for txt in data_test[i]["tokens"]] if p1 >= len(tokens): continue mention = " ".join(tokens[p1:p2]) if p1 == 0: mention = "<PAD> " + mention else: mention = tokens[p1 - 1] + " " + mention if p2 >= len(tokens): mention = mention + " <PAD>" else: mention = mention + " " + tokens[p2] offset = max(0, p1 - config.WINDOW_SIZE) text = " ".join(tokens[offset:min(len(tokens), p2 + config.WINDOW_SIZE - 1)]) p1 -= offset p2 -= offset out_type = [] for a in types: flag = True for b in types: if len(a) >= len(b): continue if (a == b[:len(a)]) and (b[len(a)] == "/"): flag = False if flag: out_type.append(a) if full_path: try: outfile.write("%d\t%d\t%s\t%s\t%s\n" % (p1, p2, text, mention, " ".join(types))) except: continue else: try: outfile.write("%d\t%d\t%s\t%s\t%s\n" % (p1, p2, text, mention, " ".join(out_type))) except: continue 
outfile.close()
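# Hedged illustration of the label filtering done in preprocess_new: a type is dropped when a
# strictly longer label extends it along the hierarchy ("/person" vs "/person/artist"), so
# only the most specific type paths survive. Standalone restatement for clarity only.
def keep_most_specific(types):
    out_type = []
    for a in types:
        if not any(len(b) > len(a) and b[:len(a)] == a and b[len(a)] == "/" for b in types):
            out_type.append(a)
    return out_type

# keep_most_specific(["/person", "/person/artist", "/location"])
# -> ["/person/artist", "/location"]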
import config import pandas as pd from utils import time_utils, pkl_utils from optparse import OptionParser from collections import defaultdict import os.path import parse G = pkl_utils._load(config.ONTOLOGY_TREE) Root = "http://www.w3.org/2002/07/owl#Thing" def parse_args(parser): parser.add_option("-l", "--lang", default="zh", type="string", dest="lang", help="target language") parser.add_option("-p", "--pivot", default="en", type="string", dest="pivots", help="pivot languages") parser.add_option( "-L", default=0.5, type="float", dest="L", help="parameter to tune the tradeoff between precision and recall")
def combine(self): dfAll = pkl_utils._load(config.INFO_DATA) dfAll_raw = dfAll.copy() y_train = dfAll["relevance"].values[:TRAIN_SIZE] ## for basic features feat_cnt = 0 self.logger.info("Run for basic...") for file_name in sorted(os.listdir(config.FEAT_DIR)): if self.feature_suffix in file_name: fname = file_name.split(".")[0] if fname not in self.feature_dict: continue x = self.load_feature(config.FEAT_DIR, fname) x = np.nan_to_num(x) if np.isnan(x).any(): self.logger.info("%s nan" % fname) continue # apply feature transform mandatory = self.feature_dict[fname][0] transformer = self.feature_dict[fname][1] x = transformer.fit_transform(x) dim = np_utils._dim(x) if dim == 1: corr = np_utils._corr(x[:TRAIN_SIZE], y_train) if not mandatory and abs(corr) < self.corr_threshold: self.logger.info( "Drop: {} ({}D) (abs corr = {}, < threshold = {})". format(fname, dim, abs(corr), self.corr_threshold)) continue dfAll[fname] = x self.feature_names.append(fname) else: columns = ["%s_%d" % (fname, x) for x in range(dim)] df = pd.DataFrame(x, columns=columns) dfAll = pd.concat([dfAll, df], axis=1) self.feature_names.extend(columns) feat_cnt += 1 self.feature_names_basic.append(fname) if dim == 1: self.logger.info( "Combine {:>3}/{:>3} feat: {} ({}D) (corr = {})". format(feat_cnt, len(self.feature_dict.keys()), fname, dim, corr)) else: self.logger.info( "Combine {:>3}/{:>3} feat: {} ({}D)".format( feat_cnt, len(self.feature_dict.keys()), fname, dim)) dfAll.fillna(config.MISSING_VALUE_NUMERIC, inplace=True) ## basic dfTrain = dfAll.iloc[:TRAIN_SIZE].copy() self.y_train = dfTrain["relevance"].values.astype(float) dfTrain.drop(["id", "relevance"], axis=1, inplace=True) self.X_train = dfTrain.values.astype(float) dfTest = dfAll.iloc[TRAIN_SIZE:].copy() self.id_test = dfTest["id"].values.astype(int) dfTest.drop(["id", "relevance"], axis=1, inplace=True) self.X_test = dfTest.values.astype(float) ## all first = True feat_cv_cnt = 0 dfAll_cv_all = dfAll_raw.copy() feature_dir = "%s/All" % (config.FEAT_DIR) for file_name in sorted(os.listdir(feature_dir)): if self.feature_suffix in file_name: fname = file_name.split(".")[0] if fname not in self.feature_dict: continue if first: self.logger.info("Run for all...") first = False x = self.load_feature(feature_dir, fname) x = np.nan_to_num(x) if np.isnan(x).any(): self.logger.info("%s nan" % fname) continue # apply feature transform mandatory = self.feature_dict[fname][0] transformer = self.feature_dict[fname][1] x = transformer.fit_transform(x) dim = np_utils._dim(x) if dim == 1: corr = np_utils._corr(x[:TRAIN_SIZE], y_train) if not mandatory and abs(corr) < self.corr_threshold: self.logger.info( "Drop: {} ({}D) (abs corr = {}, < threshold = {})". format(fname, dim, abs(corr), self.corr_threshold)) continue dfAll_cv_all[fname] = x self.feature_names.append(fname) else: columns = ["%s_%d" % (fname, x) for x in range(dim)] df = pd.DataFrame(x, columns=columns) dfAll_cv_all = pd.concat([dfAll_cv_all, df], axis=1) self.feature_names.extend(columns) feat_cv_cnt += 1 self.feature_names_cv.append(fname) if dim == 1: self.logger.info( "Combine {:>3}/{:>3} feat: {} ({}D) (corr = {})". 
format(feat_cnt + feat_cv_cnt, len(self.feature_dict.keys()), fname, dim, corr)) else: self.logger.info( "Combine {:>3}/{:>3} feat: {} ({}D)".format( feat_cnt + feat_cv_cnt, len(self.feature_dict.keys()), fname, dim)) if feat_cv_cnt > 0: dfAll_cv_all.fillna(config.MISSING_VALUE_NUMERIC, inplace=True) X_tmp = dfAll_cv_all.drop(["id", "relevance"], axis=1).values.astype(float) self.X_train_cv_all = X_tmp[:TRAIN_SIZE] self.X_test = np.hstack((self.X_test, X_tmp[TRAIN_SIZE:])) else: self.X_train_cv_all = None feat_cnt += feat_cv_cnt ## for cv features first = True for run in range(1, self.n_iter + 1): feat_cv_cnt = 0 dfAll_cv = dfAll_raw.copy() feature_dir = "%s/Run%d" % (config.FEAT_DIR, run) for file_name in sorted(os.listdir(feature_dir)): if self.feature_suffix in file_name: fname = file_name.split(".")[0] if (fname not in self.feature_dict) or ( fname not in self.feature_names_cv): continue if first: self.logger.info("Run for cv...") first = False if feat_cv_cnt == 0: self.logger.info("Run %d" % run) x = self.load_feature(feature_dir, fname) x = np.nan_to_num(x) if np.isnan(x).any(): self.logger.info("%s nan" % fname) continue # apply feature transform mandatory = self.feature_dict[fname][0] transformer = self.feature_dict[fname][1] x = transformer.fit_transform(x) dim = np_utils._dim(x) if dim == 1: dfAll_cv[fname] = x else: columns = ["%s_%d" % (fname, x) for x in range(dim)] df = pd.DataFrame(x, columns=columns) dfAll_cv = pd.concat([dfAll_cv, df], axis=1) feat_cv_cnt += 1 self.logger.info( "Combine {:>3}/{:>3} feat: {} ({}D)".format( feat_cnt + feat_cv_cnt, len(self.feature_dict.keys()), fname, dim)) if feat_cv_cnt > 0: dfAll_cv.fillna(config.MISSING_VALUE_NUMERIC, inplace=True) dfTrain_cv = dfAll_cv.iloc[:TRAIN_SIZE].copy() X_tmp = dfTrain_cv.drop(["id", "relevance"], axis=1).values.astype(float) if run == 1: self.X_train_cv = np.zeros( (X_tmp.shape[0], X_tmp.shape[1], self.n_iter), dtype=float) self.X_train_cv[:, :, run - 1] = X_tmp if feat_cv_cnt == 0: self.X_train_cv = None self.basic_only = 1 # report final results if self.basic_only: self.logger.info("Overall Shape: %d x %d" % (len(self.y_train), self.X_train.shape[1])) else: self.logger.info("Overall Shape: %d x %d" % (len(self.y_train), self.X_train.shape[1] + self.X_train_cv_all.shape[1])) self.logger.info("Done combining.") return self
def main(which): logname = "generate_feature_word2vec_%s_%s.log" % (which, time_utils._timestamp()) logger = logging_utils._get_logger(config.LOG_DIR, logname) #### NOTE: use data BEFORE STEMMinG dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED) word2vec_model_dirs = [] model_prefixes = [] if which == "homedepot": ## word2vec model trained with Homedepot dataset: brand/color/query/title/description word2vec_model_dirs.append( config.WORD2VEC_MODEL_DIR + "/Homedepot-word2vec-D%d-min_count%d.model" % (config.EMBEDDING_DIM, config.EMBEDDING_MIN_COUNT)) model_prefixes.append("Homedepot") elif which == "wikipedia": ## word2vec model pretrained with Wikipedia+Gigaword 5 word2vec_model_dirs.append(config.GLOVE_WORD2VEC_MODEL_DIR + "/glove.6B.300d.txt") model_prefixes.append("Wikipedia") elif which == "google": ## word2vec model pretrained with Google News word2vec_model_dirs.append(config.WORD2VEC_MODEL_DIR + "/GoogleNews-vectors-negative300.bin") model_prefixes.append("GoogleNews") elif which == "common_crawl": ## word2vec model pretrained with Common Crawl word2vec_model_dirs.append(config.GLOVE_WORD2VEC_MODEL_DIR + "/glove.840B.300d.txt") model_prefixes.append("CommonCrawl") for word2vec_model_dir, model_prefix in zip(word2vec_model_dirs, model_prefixes): ## load model try: if ".bin" in word2vec_model_dir: word2vec_model = gensim.models.Word2Vec.load_word2vec_format( word2vec_model_dir, binary=True) elif ".txt" in word2vec_model_dir: word2vec_model = gensim.models.Word2Vec.load_word2vec_format( word2vec_model_dir, binary=False) else: word2vec_model = gensim.models.Word2Vec.load( word2vec_model_dir) except: continue # ## standalone (not used in model building) # obs_fields = ["search_term", "product_title", "product_description"] # generator = Word2Vec_Centroid_Vector # param_list = [word2vec_model, model_prefix] # sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) # sf.go() ## pairwise generators = [ Word2Vec_Importance, Word2Vec_N_Similarity, Word2Vec_N_Similarity_Imp, Word2Vec_Centroid_RMSE, Word2Vec_Centroid_RMSE_IMP, # # not used in final submission # Word2Vec_Centroid_Vdiff, ] obs_fields_list = [] target_fields_list = [] obs_fields_list.append(["question1"]) target_fields_list.append(["question2"]) for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: param_list = [word2vec_model, model_prefix] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go() ## cosine sim generators = [ Word2Vec_CosineSim, ] # double aggregation aggregation_mode_prev = ["mean", "max", "min", "median"] aggregation_mode = ["mean", "std", "max", "min", "median"] for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: param_list = [ word2vec_model, model_prefix, aggregation_mode, aggregation_mode_prev ] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go()
def load_feature(self, feature_dir, feature_name): fname = os.path.join(feature_dir, feature_name + self.feature_suffix) return pkl_utils._load(fname)
def __init__(self, model_name, data_name, cv_runs, params_dict, logger):
    print("Loading data...")
    if data_name == "wiki":
        words_train, mentions_train, positions_train, labels_train = data_utils.load(config.WIKI_TRAIN_CLEAN)
        words, mentions, positions, labels = data_utils.load(config.WIKI_TEST_CLEAN)
        type2id, typeDict = pkl_utils._load(config.WIKI_TYPE)
        num_types = len(type2id)
        type_info = config.WIKI_TYPE
    elif data_name == "ontonotes":
        words_train, mentions_train, positions_train, labels_train = data_utils.load(config.ONTONOTES_TRAIN_CLEAN)
        words, mentions, positions, labels = data_utils.load(config.ONTONOTES_TEST_CLEAN)
        type2id, typeDict = pkl_utils._load(config.ONTONOTES_TYPE)
        num_types = len(type2id)
        type_info = config.ONTONOTES_TYPE  # e.g. "./data/corpus/OntoNotes/type.pkl"
    elif data_name == "wikim":
        words_train, mentions_train, positions_train, labels_train = data_utils.load(config.WIKIM_TRAIN_CLEAN)
        words, mentions, positions, labels = data_utils.load(config.WIKIM_TEST_CLEAN)
        type2id, typeDict = pkl_utils._load(config.WIKIM_TYPE)
        num_types = len(type2id)
        type_info = config.WIKIM_TYPE
    self.id2type = {type2id[x]: x for x in type2id.keys()}

    def type2vec(types):
        # encode a space-separated list of types as a multi-hot vector of length num_types
        tmp = np.zeros(num_types)
        for t in types.split():
            tmp[type2id[t]] = 1.0
        return tmp

    labels_train = np.array([type2vec(t) for t in labels_train])  # [train_size, num_types]
    labels = np.array([type2vec(t) for t in labels])              # [test_size, num_types]

    self.embedding = embedding_utils.Embedding.fromCorpus(
        config.EMBEDDING_DATA, list(words_train) + list(words),
        config.MAX_DOCUMENT_LENGTH, config.MENTION_SIZE)
    # MAX_DOCUMENT_LENGTH = 30, MENTION_SIZE = 15, WINDOW_SIZE = 10

    print("Preprocessing data...")
    textlen_train = np.array([self.embedding.len_transform1(x) for x in words_train])        # [total], sentence length capped at 30
    words_train = np.array([self.embedding.text_transform1(x) for x in words_train])         # [total, 30], word indices
    mentionlen_train = np.array([self.embedding.len_transform2(x) for x in mentions_train])  # [total], mention length capped at 15
    mentions_train = np.array([self.embedding.text_transform2(x) for x in mentions_train])   # [total, 15]
    positions_train = np.array([self.embedding.position_transform(x) for x in positions_train])  # [total, 30]
    textlen = np.array([self.embedding.len_transform1(x) for x in words])
    words = np.array([self.embedding.text_transform1(x) for x in words])
    mentionlen = np.array([self.embedding.len_transform2(x) for x in mentions])
    mentions = np.array([self.embedding.text_transform2(x) for x in mentions])
    positions = np.array([self.embedding.position_transform(x) for x in positions])

    ## carve a validation set out of the test data
    ss = ShuffleSplit(n_splits=1, test_size=0.1, random_state=config.RANDOM_SEED)
    for test_index, valid_index in ss.split(np.zeros(len(labels)), labels):
        textlen_test, textlen_valid = textlen[test_index], textlen[valid_index]
        words_test, words_valid = words[test_index], words[valid_index]
        mentionlen_test, mentionlen_valid = mentionlen[test_index], mentionlen[valid_index]
        mentions_test, mentions_valid = mentions[test_index], mentions[valid_index]
        positions_test, positions_valid = positions[test_index], positions[valid_index]
        labels_test, labels_valid = labels[test_index], labels[valid_index]

    # each example is a tuple (sentence, textlen, mention, mentionlen, positions, type)
    # with shapes [30], [], [15], [], [30], [num_types]
    self.train_set = list(zip(words_train, textlen_train, mentions_train, mentionlen_train, positions_train, labels_train))
    self.valid_set = list(zip(words_valid, textlen_valid, mentions_valid, mentionlen_valid, positions_valid, labels_valid))
    self.test_set = list(zip(words_test, textlen_test, mentions_test, mentionlen_test, positions_test, labels_test))
    self.full_test_set = list(zip(words, textlen, mentions, mentionlen, positions, labels))
    self.labels_test = labels_test
    self.labels = labels

    self.model_name = model_name
    self.data_name = data_name
    self.cv_runs = cv_runs
    self.params_dict = params_dict
    self.hparams = AttrDict(params_dict)
    self.logger = logger
    self.num_types = num_types
    self.type_info = type_info

    self.model = self._get_model()
    self.saver = tf.train.Saver(tf.global_variables())
    checkpoint_dir = os.path.abspath(config.CHECKPOINT_DIR)
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    self.checkpoint_prefix = os.path.join(checkpoint_dir, self.__str__())
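For clarity, a tiny self-contained illustration of the multi-hot label encoding performed by type2vec above; the type names here are made up for the example:

import numpy as np

type2id = {"/person": 0, "/person/artist": 1, "/location": 2}
num_types = len(type2id)

def type2vec(types):
    # a mention may carry several types, so the label vector can contain several 1s
    vec = np.zeros(num_types)
    for t in types.split():
        vec[type2id[t]] = 1.0
    return vec

print(type2vec("/person /person/artist"))  # [1. 1. 0.]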
def main(which):
    logname = "generate_feature_word2vec_%s_%s.log" % (which, time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    #### NOTE: use data BEFORE STEMMING
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)

    word2vec_model_dirs = []
    model_prefixes = []
    if which == "wikipedia":
        ## word2vec model pretrained with Wikipedia+Gigaword 5
        word2vec_model_dirs.append(config.GLOVE_WORD2VEC_MODEL_DIR + "/glove.6B.300d.txt")
        model_prefixes.append("Wikipedia")
    elif which == "google":
        ## word2vec model pretrained with Google News
        word2vec_model_dirs.append(config.WORD2VEC_MODEL_DIR + "/GoogleNews-vectors-negative300.bin")
        model_prefixes.append("GoogleNews")
    elif which == "quora":
        ## word2vec model trained with Quora dataset: question1/question2
        word2vec_model_dirs.append(config.WORD2VEC_MODEL_DIR + "/Quora-word2vec-D%d-min_count%d.model" % (
            config.EMBEDDING_DIM, config.EMBEDDING_MIN_COUNT))
        model_prefixes.append("Quora")
    print("word2vec mode: {}".format(which))

    for word2vec_model_dir, model_prefix in zip(word2vec_model_dirs, model_prefixes):
        ## load model (skip this model if the file is missing or unreadable)
        try:
            if ".bin" in word2vec_model_dir:
                word2vec_model = gensim.models.Word2Vec.load_word2vec_format(word2vec_model_dir, binary=True)
            elif ".txt" in word2vec_model_dir:
                word2vec_model = gensim.models.Word2Vec.load_word2vec_format(word2vec_model_dir, binary=False)
            else:
                word2vec_model = gensim.models.Word2Vec.load(word2vec_model_dir)
        except:
            continue

        ## pairwise
        generators = [
            Word2Vec_Importance,
            Word2Vec_N_Similarity,
            Word2Vec_N_Similarity_Imp,
            Word2Vec_Centroid_RMSE,
            Word2Vec_Centroid_RMSE_IMP,
        ]
        obs_fields_list = [["question1"], ["question2"]]
        target_fields_list = [["question2"], ["question1"]]
        for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
            for generator in generators:
                param_list = [word2vec_model, model_prefix]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()

        ## cosine sim with double aggregation
        generators = [
            Word2Vec_CosineSim,
        ]
        aggregation_mode_prev = ["mean", "max", "min", "median"]
        aggregation_mode = ["mean", "std", "max", "min", "median"]
        for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
            for generator in generators:
                param_list = [word2vec_model, model_prefix, aggregation_mode, aggregation_mode_prev]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
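A minimal, hypothetical command-line entry point for the feature generator above; the original script's actual argument parsing is not shown here, so the flag names are illustrative only:

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Generate word2vec-based pairwise features")
    parser.add_argument("--which", default="google",
                        choices=["wikipedia", "google", "quora"],
                        help="which pretrained or self-trained embedding to use")
    args = parser.parse_args()
    main(args.which)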