Example #1
File: eval.py Project: U-Alberta/HRERE
def main(options):
    time_str = datetime.datetime.now().isoformat()
    log_prefix = "Eval" if options.eval else "Final"
    logname = "%s_[Model@%s]_%s.log" % (log_prefix, options.model_name, time_str)
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    params_dict = param_space_dict[options.model_name]
    task = Task(options.model_name, options.runs, params_dict, logger)
    if options.eval:
        task.refit(options.prefix)
    else:
        task.evaluate(options.prefix)
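Every example on this page obtains its logger through logging_utils._get_logger(config.LOG_DIR, logname). The helper itself is not shown here; a minimal sketch of what it plausibly does (hypothetical, file-plus-console logging) is:

import logging
import os

# Hypothetical sketch of logging_utils._get_logger; each project ships its own
# version, which may differ in format and handlers.
def _get_logger(logdir, logname, loglevel=logging.INFO):
    os.makedirs(logdir, exist_ok=True)
    logger = logging.getLogger(logname)
    logger.setLevel(loglevel)
    fmt = logging.Formatter("[%(asctime)s] %(levelname)s: %(message)s")
    fh = logging.FileHandler(os.path.join(logdir, logname))  # per-run log file
    fh.setFormatter(fmt)
    logger.addHandler(fh)
    sh = logging.StreamHandler()  # echo to console
    sh.setFormatter(fmt)
    logger.addHandler(sh)
    return logger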
Example #2
def run_tfidf_ngram_cosinesim():
    """Symmetric in obs and target"""
    logname = "generate_feature_tfidf_ngram_cosinesim_%s.log" % time_utils._timestamp(
    )
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [TFIDF_Word_Ngram_CosineSim, TFIDF_Char_Ngram_CosineSim]
    ngrams_list = [[2, 3], [4]]
    obs_fields_list = [['question1']]
    target_fields_list = [['question2']]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator, ngrams in zip(generators, ngrams_list):
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator,
                                            dfAll,
                                            obs_fields,
                                            target_fields,
                                            param_list,
                                            config.FEAT_DIR,
                                            logger,
                                            force_corr=True)
                pf.go()
                del pf
                gc.collect()
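Most log names embed time_utils._timestamp(). A plausible minimal implementation (an assumption; the projects' own helper may format differently):

import datetime

def _timestamp():
    # filesystem-safe current-time string, e.g. "2016-05-23-14-05"
    return datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")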
Example #3
def run_lsa_ngram_cooc():
    logname = "generate_feature_lsa_ngram_cooc_%s.log" % time_utils._timestamp(
    )
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)

    generators = [LSA_Word_Ngram_Cooc]
    obs_ngrams = [1, 2]
    target_ngrams = [1, 2]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(
        ["search_term", "search_term_alt", "search_term_auto_corrected"][:1])
    target_fields_list.append(["product_title", "product_description"][:1])
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for obs_ngram in obs_ngrams:
            for target_ngram in target_ngrams:
                for generator in generators:
                    param_list = [
                        obs_ngram, target_ngram, config.SVD_DIM,
                        config.SVD_N_ITER
                    ]
                    pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                                target_fields, param_list,
                                                config.FEAT_DIR, logger)
                    pf.go()
Example #4
def main(options):
    time_str = datetime.datetime.now().isoformat()
    time_str = re.sub(':', '-', time_str)
    logname = "[Model@%s]_[Data@%s]_%s.log" % (options.model_name, options.data_name, time_str)
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    optimizer = TaskOptimizer(options.model_name, options.data_name, options.cv_runs, options.max_evals, logger)
    optimizer.run()
Example #5
def experiments(args):
    runs = 5
    time_str = datetime.now().date().isoformat()
    logname = "[Data@%s]_[Encoder@%s]" % (args.data_name, args.sent_encoder)
    if args.bidirectional:
        logname += "_[Bi]"
    logname += "_%s.log" % time_str
    logger = _get_logger(config.LOG_PATH, logname)
    dis_accs = []
    ins_accs = []
    for i in range(runs):
        dis_acc, ins_acc = run_bigram_coherence(args)
        dis_accs.append(dis_acc[0])
        ins_accs.append(ins_acc[0])
        for _ in range(10):
            gc.collect()

    logger.info("=" * 50)
    for i in range(runs):
        logger.info("Run %d" % (i + 1))
        logger.info("Dis Acc: %.6f" % dis_accs[i])
        logger.info("Ins Acc: %.6f" % ins_accs[i])
    logger.info("=" * 50)
    logger.info("Average Dis Acc: %.6f (%.6f)" %
                (np.mean(dis_accs), np.std(dis_accs)))
    logger.info("Average Ins Acc: %.6f (%.6f)" %
                (np.mean(ins_accs), np.std(ins_accs)))
Example #6
def main():
    FNAME = "model_predict_lgbm"
    logname = "%s_%s.log" % (FNAME, now)
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    # Load raw data
    # test_raw = dl.load_test_data()
    # gc.collect()
    # Load generated features
    test_features = load_combined_features(logger)

    #test_features = pd.concat([test_features, test_raw[config.NUMBER_FEATURES]], axis=1)
    logger.info('Final test data shape: %s' % str(test_features.shape))

    lightgbm_model = load_model(logger)

    t0 = time()
    pred = lightgbm_model.predict(test_features)

    submission = pd.read_csv(config.SAMPLE_SUBMISSION_DATA, nrows=config.RAW_DATA_ROWS)
    submission['deal_probability'] = pred
    submission['deal_probability'].clip(0.0, 1.0, inplace=True)
    submission_file = os.path.join(config.DATA_SUBMISSION_DIR, "submission_lightgbm.csv")
    submission.to_csv(submission_file, index=False)

    # Compress (zip) submission file.
    submission_zip_file = os.path.join(config.DATA_SUBMISSION_DIR, "submission_lightgbm.csv.zip")
    submission_zip = zipfile.ZipFile(submission_zip_file, 'w')
    submission_zip.write(submission_file, arcname="submission_lightgbm.csv", compress_type=zipfile.ZIP_DEFLATED)
    submission_zip.close()
    logger.info('LightGBM submission file generation took: %s minutes' % round((time() - t0) / 60, 1))
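load_model(logger) above is project-specific and not shown on this page. For a LightGBM model written with Booster.save_model (as in the training example further down), a loader sketch under that assumption could be:

import os
import lightgbm as lgb

# Hypothetical loader matching the save path used in the training example;
# `config` is the project-level module assumed throughout these examples.
def load_model(logger):
    model_file = os.path.join(config.DATA_MODELS_DIR,
                              "lightgbm_model" + config.FEAT_FILE_SUFFIX)
    logger.info("Loading LightGBM model from %s" % model_file)
    return lgb.Booster(model_file=model_file)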
Example #7
def main():
    logname = "generate_feature_match_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [
        MatchQueryCount,
        MatchQueryRatio,
        LongestMatchSize,
        LongestMatchRatio,
    ]
    obs_fields_list = []
    target_fields_list = []
    ## question1 in question2
    obs_fields_list.append(['question1'])
    target_fields_list.append(['question2'])
    ## question2 in question1
    obs_fields_list.append(['question2'])
    target_fields_list.append(['question1'])
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            param_list = []
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                        target_fields, param_list,
                                        config.FEAT_DIR, logger)
            pf.go()
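The PairwiseFeatureWrapper(...).go() pattern recurs in nearly every example: one feature generator is applied to every (obs_field, target_field) pair and the result is saved under a feature directory. The class itself is not shown on this page; a stripped-down sketch of its assumed behavior (not the projects' actual code):

class PairwiseFeatureWrapper:
    # Sketch only: run `generator` over each (obs, target) field pair.
    def __init__(self, generator, dfAll, obs_fields, target_fields, param_list,
                 feat_dir, logger, force_corr=False):
        self.generator = generator
        self.dfAll = dfAll
        self.obs_fields = obs_fields
        self.target_fields = target_fields
        self.param_list = param_list
        self.feat_dir = feat_dir
        self.logger = logger
        self.force_corr = force_corr

    def go(self):
        for obs_field in self.obs_fields:
            for target_field in self.target_fields:
                ext = self.generator(self.dfAll[obs_field].values,
                                     self.dfAll[target_field].values,
                                     *self.param_list)
                x = ext.transform()
                self.logger.info("Generated %s for %s x %s" % (
                    self.generator.__name__, obs_field, target_field))
                # ...persist x under self.feat_dir...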
Example #8
def main():
    logname = "generate_feature_wordnet_similarity_%s.log" % time_utils._timestamp(
    )
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    #### NOTE: use data BEFORE STEMMING
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)

    # WordNet_Lch_Similarity and WordNet_Wup_Similarity are not used in final submission
    generators = [
        WordNet_Path_Similarity,
        WordNet_Lch_Similarity,
        WordNet_Wup_Similarity,
    ][:1]
    obs_fields_list = []
    target_fields_list = []
    # only question1 and question2 are used here
    obs_fields_list.append(["question1"])
    target_fields_list.append(["question2"])
    # double aggregation
    aggregation_mode_prev = ["mean", "max", "min", "median"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            param_list = [aggregation_mode_prev, aggregation_mode]
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                        target_fields, param_list,
                                        config.FEAT_DIR, logger)
            pf.go()
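The aggregation_mode_prev / aggregation_mode pair in Example #8 encodes a two-stage ("double") aggregation: word-level similarity scores are first reduced per pair with aggregation_mode_prev, and those reduced values are reduced again with aggregation_mode to produce one feature per mode combination. A toy illustration (not the project's code):

import numpy as np

# similarities of each question1 token against every question2 token
sims_per_token = [np.array([0.1, 0.9]), np.array([0.4, 0.5, 0.6])]
stage1 = [s.max() for s in sims_per_token]  # aggregation_mode_prev = "max"
feature = np.mean(stage1)                   # aggregation_mode = "mean"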
Example #9
def main():
    FNAME = "feature_date"
    logname = "%s_%s.log" % (FNAME, now)
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    train, test = dl.load_data()

    logger.info("Generating activation date features ...")
    t0 = time()
    # Generating activation date features
    # Train data
    train['month'] = train['activation_date'].dt.month
    train['weekday'] = train['activation_date'].dt.weekday
    train['month_day'] = train['activation_date'].dt.day
    train['year_day'] = train['activation_date'].dt.dayofyear
    # Test data
    test['month'] = test['activation_date'].dt.month
    test['weekday'] = test['activation_date'].dt.weekday
    test['month_day'] = test['activation_date'].dt.day
    test['year_day'] = test['activation_date'].dt.dayofyear
    gc.collect()
    logger.info(FNAME + ' took: %s minutes' % round((time() - t0) / 60, 1))

    # save data
    train_fname = os.path.join(config.DATA_FEATURES_DIR,
                               "train_" + FNAME + config.FEAT_FILE_SUFFIX)
    test_fname = os.path.join(config.DATA_FEATURES_DIR,
                              "test_" + FNAME + config.FEAT_FILE_SUFFIX)
    logger.info("Save to %s" % train_fname)
    pkl_utils._save(train_fname, train[config.GENERATED_DATE_FEATURES])
    logger.info("Save to %s" % test_fname)
    pkl_utils._save(test_fname, test[config.GENERATED_DATE_FEATURES])
    gc.collect()
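pkl_utils._save and pkl_utils._load appear throughout; they are presumably thin pickle wrappers along these lines (hypothetical sketch):

import pickle

def _save(fname, data, protocol=pickle.HIGHEST_PROTOCOL):
    with open(fname, "wb") as f:
        pickle.dump(data, f, protocol)

def _load(fname):
    with open(fname, "rb") as f:
        return pickle.load(f)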
Example #10
def main(options):
    logname = "[Feat@%s]_[Learner@%s]_hyperopt_%s.log"%(
        options.feature_name, options.learner_name, time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    optimizer = TaskOptimizer(options.task_mode, options.learner_name, 
        options.feature_name, logger, options.max_evals, verbose=True, refit_once=options.refit_once)
    optimizer.run()
Example #11
def main():
    logname = "generate_feature_ident.log"
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = table_utils._read(config.ALL_DATA)

    # Copies of data from ES docs. Note that multi-valued fields are first
    # converted into their length
    obs_fields = ["incoming_links", "popularity_score", "text_bytes",
            "category", "template", "heading", "outgoing_link", "external_link",
            "redirect.title", "auxiliary_text"]
    transforms = [None, np.log, np.log10, np.sqrt]
    dedup = True
    for transform in transforms:
        param_list = [transform]
        sf = StandaloneFeatureWrapper(Ident, dfAll, obs_fields, param_list, config.FEAT_DIR, logger, dedup)
        sf.go()

    # Sub-fields from termvec data
    obs_fields = [x + '_termvec' for x in config.ES_TERM_FIELDS]
    obs_fields += ['query_' + x + '_termvec' for x in config.ES_TERM_FIELDS]
    obs_fields += ['norm_query_' + x + '_termvec' for x in config.ES_TERM_FIELDS]
    es_fields = ['score', 'term_freq', 'ttf', 'doc_freq']
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for es_field in es_fields:
        for transform in transforms:
            param_list = [es_field, transform, aggregation_mode]
            sf = StandaloneFeatureWrapper(SubFieldIdent, dfAll, obs_fields, param_list, config.FEAT_DIR, logger, dedup)
            sf.go()
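StandaloneFeatureWrapper is the single-field counterpart of PairwiseFeatureWrapper: one generator over obs_fields alone, with no target field. A sketch of its presumed loop (an assumption, mirroring the pairwise sketch above):

class StandaloneFeatureWrapper:
    # Sketch only: run `generator` over each observation field on its own.
    def __init__(self, generator, dfAll, obs_fields, param_list, feat_dir,
                 logger, dedup=False, force_corr=False):
        self.generator = generator
        self.dfAll = dfAll
        self.obs_fields = obs_fields
        self.param_list = param_list
        self.feat_dir = feat_dir
        self.logger = logger
        self.dedup = dedup
        self.force_corr = force_corr

    def go(self):
        for obs_field in self.obs_fields:
            ext = self.generator(self.dfAll[obs_field].values, *self.param_list)
            x = ext.transform()
            self.logger.info("Generated %s for %s" % (
                self.generator.__name__, obs_field))
            # ...persist x under self.feat_dir...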
Example #12
def run_lsa_ngram_pair():
    """Symmetric in obs and target"""
    logname = "generate_feature_lsa_ngram_pair_%s.log" % time_utils._timestamp(
    )
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [LSA_Word_Ngram_Pair]
    ngrams = [1, 2, 3]
    obs_fields_list = []
    target_fields_list = []
    ## question1 in question2
    obs_fields_list.append(['question1'])
    target_fields_list.append(['question2'])
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for ngram in ngrams:
            for generator in generators:
                param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
                pf = PairwiseFeatureWrapper(generator,
                                            dfAll,
                                            obs_fields,
                                            target_fields,
                                            param_list,
                                            config.FEAT_DIR,
                                            logger,
                                            force_corr=True)
                pf.go()
                del pf
                gc.collect()
Example #13
def main():
    logname = "generate_feature_intersect_position_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [
        IntersectPosition_Ngram, 
        IntersectNormPosition_Ngram, 
    ]
    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"][1:2] )
    ## document in query
    obs_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"][1:2] )
    target_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] )
    ngrams = [1,2,3,12,123][:3]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
Example #14
def run_count():
    logname = "generate_feature_first_last_ngram_count_%s.log" % time_utils._timestamp(
    )
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [
        FirstIntersectCount_Ngram,
        LastIntersectCount_Ngram,
        FirstIntersectRatio_Ngram,
        LastIntersectRatio_Ngram,
    ]

    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append(["question1"])
    target_fields_list.append(["question2"])
    ## document in query
    obs_fields_list.append(["question2"])
    target_fields_list.append(["question1"])
    ngrams = [1, 2, 3, 12, 123][:3]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                            target_fields, param_list,
                                            config.FEAT_DIR, logger)
                pf.go()
Example #15
def run_count():
    logname = "generate_feature_first_last_ngram_count_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [
        FirstIntersectCount_Ngram, 
        LastIntersectCount_Ngram, 
        FirstIntersectRatio_Ngram, 
        LastIntersectRatio_Ngram, 
    ]

    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    ## document in query
    obs_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    target_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] )
    ngrams = [1,2,3,12,123][:3]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
Example #16
def main(options):
    time_str = datetime.datetime.now().isoformat()
    if len(options.save_name) == 0:
        logname = "Eval_[Model@%s]_[Data@%s]_%s.log" % (
            options.model_name, options.data_name, time_str)
    else:
        logname = "Eval_[Model@%s]_[Data@%s]_%s.log" % (
            options.save_name, options.data_name, time_str)
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    # else:
    #     time_str = datetime.datetime.now().isoformat()
    #     logname = "Final_[Model@%s]_[Data@%s]_%s.log" % (options.model_name,
    #             options.data_name, time_str)
    #     logger = logging_utils._get_logger(config.LOG_DIR, logname)
    #
    params_dict = param_space_dict[options.model_name]
    params_dict['alpha'] = options.alpha
    task = Task(model_name=options.model_name,
                data_name=options.data_name,
                cv_runs=options.runs,
                params_dict=params_dict,
                logger=logger,
                portion=options.portion,
                save_name=options.save_name)

    print('-' * 50 + 'refit' + '-' * 50)
    task.refit()
Example #17
def run_position():
    logname = "generate_feature_first_last_ngram_position_%s.log" % time_utils._timestamp(
    )
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [
        FirstIntersectPosition_Ngram,
        LastIntersectPosition_Ngram,
        FirstIntersectNormPosition_Ngram,
        LastIntersectNormPosition_Ngram,
    ]

    obs_fields_list = [["question1"], ["question2"]]
    target_fields_list = [["question2"], ["question1"]]
    ngrams = [1, 2, 3, 12, 123]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                            target_fields, param_list,
                                            config.FEAT_DIR, logger)
                pf.go()
Example #18
def run_compression_distance():
    logname = "generate_feature_compression_distance_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append([
        "search_term", "search_term_product_name", "search_term_alt",
        "search_term_auto_corrected"
    ][:2])
    target_fields_list.append([
        "product_title", "product_title_product_name", "product_description",
        "product_attribute", "product_brand", "product_color"
    ])
    # The original excerpt omitted these settings; the values below mirror the
    # edit-distance example further down this page.
    ngrams = [1, 2, 3, 12, 123][:3]
    aggregation_mode_prev = ["mean", "max", "min", "median"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        param_list = []
        pf = PairwiseFeatureWrapper(CompressionDistance, dfAll, obs_fields,
                                    target_fields, param_list, config.FEAT_DIR,
                                    logger)
        pf.go()
        for ngram in ngrams:
            param_list = [ngram, aggregation_mode_prev, aggregation_mode]
            pf = PairwiseFeatureWrapper(CompressionDistance_Ngram, dfAll,
                                        obs_fields, target_fields, param_list,
                                        config.FEAT_DIR, logger)
            pf.go()
Example #19
def run_ngram_jaccard():
    logname = "generate_feature_ngram_jaccard_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [JaccardCoef_Ngram, DiceDistance_Ngram]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append([
        "search_term", "search_term_product_name", "search_term_alt",
        "search_term_auto_corrected"
    ][:2])
    target_fields_list.append([
        "product_title", "product_title_product_name", "product_description",
        "product_attribute", "product_brand", "product_color"
    ])
    ngrams = [1, 2, 3, 12, 123][:3]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                            target_fields, param_list,
                                            config.FEAT_DIR, logger)
                pf.go()
Example #20
def main():
    logname = "generate_feature_doc2vec_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    #### NOTE: use data BEFORE STEMMING
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)

    doc2vec_model_dirs = []
    model_prefixes = []
    ## doc2vec model trained with Homedepot dataset: brand/color/obs/title/description
    doc2vec_model_dirs.append(
        config.DOC2VEC_MODEL_DIR + "/Homedepot-doc2vec-D%d-min_count%d.model" %
        (config.EMBEDDING_DIM, config.EMBEDDING_MIN_COUNT))
    model_prefixes.append("Homedepot")
    for doc2vec_model_dir, model_prefix in zip(doc2vec_model_dirs,
                                               model_prefixes):
        ## load model
        try:
            if ".bin" in doc2vec_model_dir:
                doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(
                    doc2vec_model_dir, binary=True)
            if ".txt" in doc2vec_model_dir:
                doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(
                    doc2vec_model_dir, binary=False)
            else:
                doc2vec_model = gensim.models.Doc2Vec.load(doc2vec_model_dir)
                doc2vec_model_sent_label = pkl_utils._load(doc2vec_model_dir +
                                                           ".sent_label")
        except:
            continue

        # ## standalone (not used in model building)
        # obs_fields = ["search_term", "search_term_alt", "product_title", "product_description", "product_attribute"]
        # generator = Doc2Vec_Vector
        # param_list = [doc2vec_model, doc2vec_model_sent_label, model_prefix]
        # sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        # sf.go()

        ## pairwise
        generators = [
            Doc2Vec_CosineSim,
            Doc2Vec_RMSE,
            # Doc2Vec_Vdiff,
        ]
        obs_fields_list = []
        target_fields_list = []
        obs_fields_list.append(["search_term", "search_term_alt"][:1])
        target_fields_list.append([
            "product_title", "product_description", "product_attribute",
            "product_brand", "product_color"
        ])
        for obs_fields, target_fields in zip(obs_fields_list,
                                             target_fields_list):
            for generator in generators:
                param_list = [
                    doc2vec_model, doc2vec_model_sent_label, model_prefix
                ]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                            target_fields, param_list,
                                            config.FEAT_DIR, logger)
                pf.go()
Example #21
def main():
    FNAME = "feature_aggregates"
    logname = "%s_%s.log" % (FNAME, now)
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    train, test = dl.load_data()

    train['weekday'] = train['activation_date'].dt.weekday
    train['month_day'] = train['activation_date'].dt.day
    test['weekday'] = test['activation_date'].dt.weekday
    test['month_day'] = test['activation_date'].dt.day

    logger.info("Train shape: %s & Test shape: %s" % (train.shape, test.shape))
    logger.info("Generating aggregate features ...")
    t0 = time()
    # Generating aggregate features
    agg_deal_probability_features(train, test, config.AGGREGATE_COLUMNS)
    agg_price_features(train, test, config.AGGREGATE_COLUMNS)
    logger.info(FNAME + ' took: %s minutes' % round((time() - t0) / 60, 1))
    logger.info("Train shape: %s & Test shape: %s" % (train.shape, test.shape))
    gc.collect()

    # save data
    train_fname = os.path.join(config.DATA_FEATURES_DIR, "train_" + FNAME + config.FEAT_FILE_SUFFIX)
    test_fname = os.path.join(config.DATA_FEATURES_DIR, "test_" + FNAME + config.FEAT_FILE_SUFFIX)
    logger.info("Save to %s" % train_fname)
    pkl_utils._save(train_fname, train[config.AGGREGATE_DEAL_FEATURES + config.AGGREGATE_PRICE_FEATURES])
    logger.info("Save to %s" % test_fname)
    pkl_utils._save(test_fname, test[config.AGGREGATE_DEAL_FEATURES + config.AGGREGATE_PRICE_FEATURES])
    gc.collect()
Example #22
def main():
    logname = "generate_feature_intersect_position_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [
        IntersectPosition_Ngram, 
        IntersectNormPosition_Ngram, 
    ]
    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["question1"] )
    target_fields_list.append( ["question2"] )
    ## document in query
    obs_fields_list.append( ["question2"] )
    target_fields_list.append( ["question1"] )
    ngrams = [1,2,3,12,123][:3]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
Example #23
def main():
    logname = "generate_feature_group_relevance_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()

    ## run python3 splitter.py first
    split = pkl_utils._load("%s/splits_level1.pkl"%config.SPLIT_DIR)
    n_iter = len(split)

    ## for cv
    for i in range(n_iter):
        trainInd, validInd = split[i][0], split[i][1]
        dfTrain2 = dfTrain.iloc[trainInd].copy()
        sub_feature_dir = "%s/Run%d" % (config.FEAT_DIR, i+1)

        obs_fields = ["search_term", "product_title"][1:]
        aggregation_mode = ["mean", "std", "max", "min", "median", "size"]
        param_list = [dfAll["id"], dfTrain2, aggregation_mode]
        sf = StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields, param_list, sub_feature_dir, logger)
        sf.go()

    ## for all
    sub_feature_dir = "%s/All" % (config.FEAT_DIR)
    obs_fields = ["search_term", "product_title"][1:]
    aggregation_mode = ["mean", "std", "max", "min", "median", "size"]
    param_list = [dfAll["id"], dfTrain, aggregation_mode]
    sf = StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields, param_list, sub_feature_dir, logger)
    sf.go()
Example #24
def run_tsne_lsa_ngram():
    logname = "generate_feature_tsne_lsa_ngram_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)

    generators = [TSNE_LSA_Word_Ngram, TSNE_LSA_Char_Ngram]
    ngrams_list = [[1,2,3], [2,3,4,5]]
    ngrams_list = [[1,2,3], [4]]
    obs_fields = ["search_term", "search_term_alt", "search_term_auto_corrected", "product_title", "product_description"]
    for generator,ngrams in zip(generators, ngrams_list):
        for ngram in ngrams:
            param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger, force_corr=True)
            sf.go()

    generators = [TSNE_LSA_Word_Ngram_Pair]
    ngrams = [1, 2]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_description"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for ngram in ngrams:
            for generator in generators:
                param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger, force_corr=True)
                pf.go()
Example #25
    def __init__(self,
                 feature_list,
                 feature_name,
                 feature_suffix=".csv",
                 feature_level=2,
                 meta_feature_dict={},
                 corr_threshold=0):
        self.feature_name = feature_name
        self.feature_list = feature_list
        self.feature_suffix = feature_suffix
        self.feature_level = feature_level
        # for meta features
        self.meta_feature_dict = meta_feature_dict
        self.corr_threshold = corr_threshold
        self.feature_names_basic = []
        self.feature_names_cv = []
        self.feature_names = []
        self.has_basic = 1 if self.meta_feature_dict else 0
        logname = "feature_combiner_%s_%s.log" % (feature_name,
                                                  time_utils._timestamp())
        self.logger = logging_utils._get_logger(config.LOG_DIR, logname)
        if self.feature_level == 2:
            self.splitter = splitter_level2
        elif self.feature_level == 3:
            self.splitter = splitter_level3
        self.n_iter = n_iter
        self.splitter_prev = [0] * self.n_iter
Example #26
def main():
    logname = "generate_feature_group_relevance_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()

    ## run python3 splitter.py first
    split = pkl_utils._load("%s/splits_level1.pkl" % config.SPLIT_DIR)
    n_iter = len(split)

    ## for cv
    for i in range(n_iter):
        trainInd, validInd = split[i][0], split[i][1]
        dfTrain2 = dfTrain.iloc[trainInd].copy()
        sub_feature_dir = "%s/Run%d" % (config.FEAT_DIR, i + 1)

        obs_fields = ["search_term", "product_title"][1:]
        aggregation_mode = ["mean", "std", "max", "min", "median", "size"]
        param_list = [dfAll["id"], dfTrain2, aggregation_mode]
        sf = StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields,
                                      param_list, sub_feature_dir, logger)
        sf.go()

    ## for all
    sub_feature_dir = "%s/All" % (config.FEAT_DIR)
    obs_fields = ["search_term", "product_title"][1:]
    aggregation_mode = ["mean", "std", "max", "min", "median", "size"]
    param_list = [dfAll["id"], dfTrain, aggregation_mode]
    sf = StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields,
                                  param_list, sub_feature_dir, logger)
    sf.go()
Example #27
def run_ngram_jaccard():
    logname = "generate_feature_ngram_jaccard.log"
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = table_utils._read(config.ALL_DATA)

    generators = [JaccardCoef_Ngram, DiceDistance_Ngram]
    # single valued fields
    obs_fields_list = [["query", "norm_query"]]
    target_fields_list = [["hit_title", "opening_text" ]]
    ngrams = [1,2,3,12,123][:3]
    dedup = True
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger, dedup)
                pf.go()

    # multi-valued fields
    target_fields_list = [["category", "template", "heading",
            "outgoing_link", "external_link", "redirect.title",
            "auxiliary_text"]]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            multi_gen = MultiTargetEstimatorWrapper(generator)
            for ngram in ngrams:
                param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(multi_gen, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger, dedup)
                pf.go()
Example #28
    def __init__(self, model_folder, model_list, subm_prefix, 
                weight_opt_max_evals=10, w_min=-1., w_max=1., 
                inst_subsample=0.5, inst_subsample_replacement=False, 
                inst_splitter=None,
                model_subsample=1.0, model_subsample_replacement=True,
                bagging_size=10, init_top_k=5, epsilon=0.00001, 
                multiprocessing=False, multiprocessing_num_cores=1,
                enable_extreme=True, random_seed=0):

        self.model_folder = model_folder
        self.model_list = model_list
        self.subm_prefix = subm_prefix
        self.weight_opt_max_evals = weight_opt_max_evals
        self.w_min = w_min
        self.w_max = w_max
        assert inst_subsample > 0 and inst_subsample <= 1.
        self.inst_subsample = inst_subsample
        self.inst_subsample_replacement = inst_subsample_replacement
        self.inst_splitter = inst_splitter
        assert model_subsample > 0
        assert (type(model_subsample) == int) or (model_subsample <= 1.)
        self.model_subsample = model_subsample
        self.model_subsample_replacement = model_subsample_replacement
        self.bagging_size = bagging_size
        self.init_top_k = init_top_k
        self.epsilon = epsilon
        self.multiprocessing = multiprocessing
        self.multiprocessing_num_cores = multiprocessing_num_cores
        self.enable_extreme = enable_extreme
        self.random_seed = random_seed
        logname = "ensemble_selection_%s.log"%time_utils._timestamp()
        self.logger = logging_utils._get_logger(config.LOG_DIR, logname)
        self.n_models = len(self.model_list)
Example #29
def run_tsne_lsa_ngram():
    logname = "generate_feature_tsne_lsa_ngram_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [TSNE_LSA_Word_Ngram, TSNE_LSA_Char_Ngram]
    ngrams_list = [[1,2,3], [2,3,4,5]]
    ngrams_list = [[3], [4]]
    obs_fields = ["question1", "question2"]
    for generator,ngrams in zip(generators, ngrams_list):
        for ngram in ngrams:
            param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger, force_corr=True)
            sf.go()

    generators = [TSNE_LSA_Word_Ngram_Pair]
    ngrams = [1, 2]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["question1"] )
    target_fields_list.append( ["question2"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for ngram in ngrams:
            for generator in generators:
                param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger, force_corr=True)
                pf.go()
Example #30
	def initi_task(self, model_name, data_name, epoch_num) :
		time_str = datetime.datetime.now().isoformat()
		logname = "Final_[Model@%s]_[Data@%s]_%s.log" % (model_name, data_name, time_str)
		logger = logging_utils._get_logger(config.LOG_DIR, logname)
		params_dict = param_space_dict[model_name]
		task = Task(model_name, data_name, epoch_num, params_dict, logger) # default:cv_run=5
		return task
Example #31
def main():
    logname = "generate_feature_basic.log"
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = table_utils._read(config.ALL_DATA)

    # basic
    generators = [DocLen, DocFreq, DocEntropy, DigitCount, DigitRatio]
    obs_fields = ["query", "norm_query", "hit_title", 'opening_text']
    for generator in generators:
        param_list = []
        dedup = False if generator == DocFreq else True
        sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list,
                                      config.FEAT_DIR, logger, dedup)
        sf.go()

    # basic against multi-value fields
    obs_fields = [
        'category', 'template', 'heading', 'outgoing_link', 'external_link',
        'redirect.title', 'auxiliary_text'
    ]
    aggregations = ['mean', 'std', 'max', 'min', 'median']
    param_list = [aggregations]
    for generator in generators:
        multi_gen = MultiObjEstimatorWrapper(generator)
        dedup = False if generator == DocFreq else True
        sf = StandaloneFeatureWrapper(multi_gen, dfAll, obs_fields, param_list,
                                      config.FEAT_DIR, logger, dedup)
        sf.go()

    # unique count
    generators = [UniqueCount_Ngram, UniqueRatio_Ngram]
    obs_fields = ["query", "norm_query", "hit_title", 'opening_text']
    ngrams = [1, 2, 3]
    for generator in generators:
        for ngram in ngrams:
            param_list = [ngram]
            dedup = True
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields,
                                          param_list, config.FEAT_DIR, logger,
                                          dedup)
            sf.go()

    # unique count against multi-value fields
    generators = [UniqueCount_Ngram, UniqueRatio_Ngram]
    obs_fields = [
        'category', 'template', 'heading', 'outgoing_link', 'external_link',
        'redirect.title', 'auxiliary_text'
    ]
    aggregations = ['mean', 'std', 'max', 'min', 'median']
    ngrams = [1, 2, 3]
    for generator in generators:
        for ngram in ngrams:
            multi_gen = MultiObjEstimatorWrapper(generator)
            param_list = [ngram, aggregations]
            dedup = True
            sf = StandaloneFeatureWrapper(multi_gen, dfAll, obs_fields,
                                          param_list, config.FEAT_DIR, logger,
                                          dedup)
            sf.go()
Example #32
File: task_o.py Project: gsangeryee/TGS
def main(options):
    logname = "[Feat@%s]_[Learner@%s]_hyperopt_%s.log"%(
        options.feature_name, options.learner_name, time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    optimizer = TaskOptimizer(options.task_mode, options.learner_name,
                              options.feature_name, logger, options.max_evals, verbose=True,
                              refit_once=options.refit_once, plot_importance=options.plot_importance)
    optimizer.run()
Example #33
def main():
    FNAME = "feature_general"
    logname = "%s_%s.log" % (FNAME, now)
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    logger.info("Generating time period feature ...")
    train, test = dl.load_data()
    periods_train, periods_test = dl.load_periods_data()

    t0 = time()
    # Generating general features
    train_general = pd.DataFrame()
    test_general = pd.DataFrame()

    logger.info("Generating general features ...")

    train = train.merge(periods_train[['item_id', 'date_to', 'date_from']], how='left', on=['item_id'])
    test = test.merge(periods_test[['item_id', 'date_to', 'date_from']], how='left', on=['item_id'])
    logger.info('Train shape: %s & Test shape: %s' % (train.shape, test.shape))

    # https://stackoverflow.com/questions/37840812/pandas-subtracting-two-date-columns-and-the-result-being-an-integer
    train_general['total_period'] = train['date_to'].sub(train['date_from'], axis=0)
    train_general['total_period'] = train_general['total_period'] / np.timedelta64(1, 'D')
    train_general['total_period'].fillna(0, inplace=True)
    test_general['total_period'] = test['date_to'].sub(test['date_from'], axis=0)
    test_general['total_period'] = test_general['total_period'] / np.timedelta64(1, 'D')
    test_general['total_period'].fillna(0, inplace=True)

    generate_count_features('title', train_general, train)
    generate_count_features('title', test_general, test)

    generate_count_features('description', train_general, train)
    generate_count_features('description', test_general, test)

    train_general['log_price'] = np.log(train["price"] + 0.001)
    train_general['log_price'].fillna(-999, inplace=True)

    test_general['log_price'] = np.log(test["price"] + 0.001)
    test_general['log_price'].fillna(-999, inplace=True)

    train['has_image'] = train['image'].notnull().astype(int)
    test['has_image'] = test['image'].notnull().astype(int)

    logger.info(FNAME + ' took: %s minutes' % round((time() - t0) / 60, 1))
    del train
    del test
    gc.collect()

    logger.info('Train general shape: %s & Test general shape: %s' % (train_general.shape, test_general.shape))

    # save data
    train_fname = os.path.join(config.DATA_FEATURES_DIR, "train_" + FNAME + config.FEAT_FILE_SUFFIX)
    test_fname = os.path.join(config.DATA_FEATURES_DIR, "test_" + FNAME + config.FEAT_FILE_SUFFIX)
    logger.info("Save to %s" % train_fname)
    pkl_utils._save(train_fname, train_general)
    logger.info("Save to %s" % test_fname)
    pkl_utils._save(test_fname, test_general)
    gc.collect()
Example #34
    def __init__(self, feature_dict, feature_name, corr_threshold=0):
        self.feature_dict = feature_dict
        self.feature_name = feature_name
        self.corr_threshold = corr_threshold

        self.feature_names_basic = []
        self.feature_names = []
        logname = "feature_combiner_%s.log" % (feature_name)
        self.logger = logging_utils._get_logger(config.LOG_DIR, logname)
Example #35
def main():
    logname = "generate_feature_intersect_count_%s.log" % time_utils._timestamp(
    )
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    # Ngram
    generators = [
        IntersectCount_Ngram,
        IntersectRatio_Ngram,
    ]
    obs_fields_list = [['question1'], ['question2']]
    target_fields_list = [['question2'], ['question1']]
    ngrams = [1, 2, 3, 4, 5, 12, 123]  # only 1,2,3,4,5,12,123 available; see ngram_utils.py
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                            target_fields, param_list,
                                            config.FEAT_DIR, logger)
                pf.go()

    # Ngram symmetric
    generators = [
        CooccurrenceCount_Ngram,
        CooccurrenceRatio_Ngram,
        #CooccurrenceCount_Nterm,    # not used in Quora project, takes long to run
        #CooccurrenceRatio_Nterm,
    ]
    obs_fields_list = [['question1']]
    target_fields_list = [['question2']]
    ngrams = [1, 2, 3, 4, 5, 12, 123]  # only 1,2,3,4,5,12,123 available; see ngram_utils.py
    nterms = [2, 3, 4]  # only 1,2,3,4 available (uniterms == unigrams); see ngram_utils.py
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            if generator.__name__[-5:] == 'Ngram':
                for ngram in ngrams:
                    param_list = [ngram]
                    pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                                target_fields, param_list,
                                                config.FEAT_DIR, logger)
                    pf.go()
            elif generator.__name__[-5:] == 'Nterm':
                for nterm in nterms:
                    param_list = [nterm]
                    pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                                target_fields, param_list,
                                                config.FEAT_DIR, logger)
                    pf.go()
            else:
                logger.warning("Unknown generator: %s" % generator.__name__)
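The ngram codes 12 and 123 mentioned in the comments presumably denote combined sets (unigrams+bigrams and unigrams+bigrams+trigrams). A hedged sketch of ngram_utils-style helpers under that assumption (the actual module may differ):

# Hypothetical ngram_utils-style helpers.
def unigrams(words):
    return list(words)

def bigrams(words):
    return [" ".join(words[i:i + 2]) for i in range(len(words) - 1)]

def trigrams(words):
    return [" ".join(words[i:i + 3]) for i in range(len(words) - 2)]

def uni_bigrams(words):      # ngram code 12
    return unigrams(words) + bigrams(words)

def uni_bi_trigrams(words):  # ngram code 123
    return unigrams(words) + bigrams(words) + trigrams(words)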
Example #36
def main():
    FNAME = "model_train_lgbm"
    logname = "%s_%s.log" % (FNAME, now)
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    # Load raw data
    train_raw = dl.load_train_data()
    # Load generated features
    train_features = load_combined_features(logger)

    train_column_names = list(train_features.columns.values)
    logger.info("Training set column names: " + str(train_column_names))

    # train_features = pd.concat([train_features, train_raw[config.NUMBER_FEATURES]], axis=1)
    logger.info('Final training data shape: %s' % str(train_features.shape))

    x_train, x_valid, y_train, y_valid = train_test_split(
        train_features,
        train_raw[config.TARGET_FEATURE],
        test_size=0.20,
        random_state=42)
    del train_raw
    del train_features
    gc.collect()
    lgtrain = lgb.Dataset(x_train,
                          label=y_train,
                          feature_name=train_column_names,
                          categorical_feature=config.ENCODED_CATEGORY_FEATURES)
    lgvalid = lgb.Dataset(x_valid,
                          label=y_valid,
                          feature_name=train_column_names,
                          categorical_feature=config.ENCODED_CATEGORY_FEATURES)

    t0 = time()
    lightgbm_model = lgb.train(
        config.LGBM_PARAMS,
        lgtrain,
        config.LGBM_NUM_ROUNDS,
        valid_sets=lgvalid,
        verbose_eval=50,
        early_stopping_rounds=config.LGBM_EARLY_STOPPING_ROUNDS)
    logger.info('Training LightGBM model took: %s minutes' % round(
        (time() - t0) / 60, 1))

    # Save model
    t0 = time()
    MODEL_FILE_NAME = "lightgbm_model"
    model_file = os.path.join(config.DATA_MODELS_DIR,
                              MODEL_FILE_NAME + config.FEAT_FILE_SUFFIX)
    logger.info("Save to %s" % model_file)
    lightgbm_model.save_model(model_file,
                              num_iteration=lightgbm_model.best_iteration)
    logger.info('Saving %s lightgbm model took: %s minutes' %
                (MODEL_FILE_NAME, round((time() - t0) / 60, 1)))

    generate_figure_importance(lightgbm_model, logger)
Example #37
def main(which):
    logname = "generate_feature_stat_cooc_tfidf_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = []
    for w in which.split(","):
        if w == "tf":
            generators.append( StatCoocTF_Ngram )
        elif w == "norm_tf":
            generators.append( StatCoocNormTF_Ngram )
        elif w == "tfidf":
            generators.append( StatCoocTFIDF_Ngram )
        elif w == "norm_tfidf":
            generators.append( StatCoocNormTFIDF_Ngram )
        elif w == "bm25":
            generators.append( StatCoocBM25_Ngram )


    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    ## document in query
    obs_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    target_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] )
    ngrams = [1,2,3,12,123][:3]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()


    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["search_term_product_name"] )
    target_fields_list.append( ["product_title_product_name"] )
    ngrams = [1,2]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                if ngram == 2:
                    # since product_name is of length 2, it makes no difference 
                    # for various aggregation as there is only one item
                    param_list = [ngram, "mean"]
                else:
                    param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
Example #38
    def __init__(self, feature_dict, feature_name, feature_suffix=".pkl", corr_threshold=0):
        self.feature_name = feature_name
        self.feature_dict = feature_dict
        self.feature_suffix = feature_suffix
        self.corr_threshold = corr_threshold
        self.feature_names_basic = []
        self.feature_names_cv = []
        self.basic_only = 0
        logname = "feature_combiner_%s_%s.log" % (feature_name, time_utils._timestamp())
        self.logger = logging_utils._get_logger(config.LOG_DIR, logname)
        self.splitter = splitter_level1
        self.n_iter = n_iter
Example #39
def run_compression_distance():
    logname = "generate_feature_compression_distance_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        param_list = []
        pf = PairwiseFeatureWrapper(CompressionDistance, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
        pf.go()
Example #40
def main():
    logname = "generate_feature_basic_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    ## basic
    generators = [DocId, DocLen, DocFreq, DocEntropy, DigitCount, DigitRatio]
    obs_fields = ["search_term", "product_title", "product_description", 
                "product_attribute", "product_brand", "product_color"]
    for generator in generators:
        param_list = []
        sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        sf.go()

    ## for product_uid
    generators = [DocIdEcho, DocFreq, ProductUidDummy1, ProductUidDummy2, ProductUidDummy3]
    obs_fields = ["product_uid"]
    for generator in generators:
        param_list = []
        sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        sf.go()

    ## unique count
    generators = [UniqueCount_Ngram, UniqueRatio_Ngram]
    obs_fields = ["search_term", "product_title", "product_description", 
    "product_attribute", "product_brand", "product_color"]
    ngrams = [1,2,3]
    for generator in generators:
        for ngram in ngrams:
            param_list = [ngram]
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
            sf.go()

    ## for product_attribute_list
    generators = [
        AttrCount, 
        AttrBulletCount, 
        AttrBulletRatio, 
        AttrNonBulletCount, 
        AttrNonBulletRatio,
        AttrHasProductHeight,
        AttrHasProductWidth,
        AttrHasProductLength,
        AttrHasProductDepth,
        AttrHasIndoorOutdoor,
    ]
    obs_fields = ["product_attribute_list"]
    for generator in generators:
        param_list = []
        sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        sf.go()
Example #41
def run_char_dist_sim():
    logname = "generate_feature_char_dist_sim_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)
    
    generators = [CharDistribution_Ratio, CharDistribution_CosineSim, CharDistribution_KL]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_description", "product_attribute"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            param_list = []
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
            pf.go()
Example #42
def run_ngram_jaccard():
    logname = "generate_feature_ngram_jaccard_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [JaccardCoef_Ngram, DiceDistance_Ngram]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][:2] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    ngrams = [1,2,3,12,123][:3]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
Example #43
def main():
    logname = "generate_feature_doc2vec_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    #### NOTE: use data BEFORE STEMMING
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)

    doc2vec_model_dirs = []
    model_prefixes = []
    ## doc2vec model trained with Homedepot dataset: brand/color/obs/title/description
    doc2vec_model_dirs.append( config.DOC2VEC_MODEL_DIR + "/Homedepot-doc2vec-D%d-min_count%d.model"%(config.EMBEDDING_DIM, config.EMBEDDING_MIN_COUNT) )
    model_prefixes.append( "Homedepot" )
    for doc2vec_model_dir, model_prefix in zip(doc2vec_model_dirs, model_prefixes):
        ## load model
        try:
            if ".bin" in doc2vec_model_dir:
                doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(doc2vec_model_dir, binary=True)
            if ".txt" in doc2vec_model_dir:
                doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(doc2vec_model_dir, binary=False)
            else:
                doc2vec_model = gensim.models.Doc2Vec.load(doc2vec_model_dir)
                doc2vec_model_sent_label = pkl_utils._load(doc2vec_model_dir+".sent_label")
        except:
            continue

        # ## standalone (not used in model building)
        # obs_fields = ["search_term", "search_term_alt", "product_title", "product_description", "product_attribute"]
        # generator = Doc2Vec_Vector
        # param_list = [doc2vec_model, doc2vec_model_sent_label, model_prefix]
        # sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        # sf.go()

        ## pairwise
        generators = [
            Doc2Vec_CosineSim, 
            Doc2Vec_RMSE, 
            Doc2Vec_Vdiff,
        ]
        obs_fields_list = []
        target_fields_list = []
        obs_fields_list.append( ["search_term", "search_term_alt"] )
        target_fields_list.append( ["product_title", "product_description", "product_attribute", "product_brand", "product_color"] )
        for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
            for generator in generators:
                param_list = [doc2vec_model, doc2vec_model_sent_label, model_prefix]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
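The pairwise generators above compare document vectors. Assuming a trained gensim Doc2Vec model, a cosine-similarity feature boils down to something like the following sketch (the repo looks vectors up via the stored sent_label index, whereas infer_vector re-infers them):

import numpy as np

def doc2vec_cosine_sim(doc2vec_model, obs_tokens, target_tokens):
    u = doc2vec_model.infer_vector(obs_tokens)
    v = doc2vec_model.infer_vector(target_tokens)
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    return float(np.dot(u, v) / denom) if denom > 0 else 0.0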
Example #44
def run_edit_distance():
    logname = "generate_feature_edit_distance_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][1:2] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    ngrams = [1,2,3,12,123][:3]
    aggregation_mode_prev = ["mean", "max", "min", "median"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        param_list = []
        pf = PairwiseFeatureWrapper(EditDistance, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
        pf.go()
        for ngram in ngrams:
            param_list = [ngram, aggregation_mode_prev, aggregation_mode]
            pf = PairwiseFeatureWrapper(EditDistance_Ngram, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
            pf.go()
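EditDistance-style features are typically built on a normalized Levenshtein distance; a dependency-free sketch of that core computation:

def levenshtein(a, b):
    # classic O(len(a)*len(b)) dynamic program
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            curr.append(min(prev[j] + 1,                  # deletion
                            curr[j-1] + 1,                # insertion
                            prev[j-1] + (ca != cb)))      # substitution
        prev = curr
    return prev[-1]

def normalized_edit_distance(a, b):
    m = max(len(a), len(b))
    return levenshtein(a, b) / float(m) if m else 0.0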
Example #45
def main():
    logname = "generate_feature_query_quality_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    obs_corpus = []
    query_suffix = []
    # raw
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("raw")
    # after processing    
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("lemmatized")
    # after extracting product_name in search_term
    obs_corpus.append(dfAll["search_term_product_name"].values)
    query_suffix.append("product_name")
    if "search_term_auto_corrected" in dfAll.columns:
        # after auto correction
        obs_corpus.append(dfAll["search_term_auto_corrected"].values)
        query_suffix.append("corrected")  
    # after stemming
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("stemmed")

    y_train = dfAll["relevance"].values[:TRAIN_SIZE]
    for i in range(len(query_suffix)-1):
        for j in range(i+1, len(query_suffix)):
            ext = QueryQuality(obs_corpus[i], obs_corpus[j])
            x = ext.transform()
            dim = 1
            fname = "%s_%s_x_%s_%dD"%(ext._get_feat_name(), query_suffix[i], query_suffix[j], dim)
            pkl_utils._save(os.path.join(config.FEAT_DIR, fname+config.FEAT_FILE_SUFFIX), x)
            corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
            logger.info("%s (%dD): corr = %.6f"%(fname, dim, corr))

    # raw
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    obs_fields = ["search_term"]
    param_list = []
    sf = StandaloneFeatureWrapper(IsInGoogleDict, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
    sf.go()
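Our reading of QueryQuality: it scores how much the query changed between two processing stages (raw vs. lemmatized, lemmatized vs. corrected, etc.), on the assumption that heavily rewritten queries were noisy to begin with. A minimal stand-in, not the repo's code:

import difflib

def query_quality(query_before, query_after):
    # similarity in [0, 1]; 1.0 means the processing step changed nothing
    return difflib.SequenceMatcher(None, query_before, query_after).ratio()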
Example #46
def run_tfidf_ngram_cosinesim():
    logname = "generate_feature_tfidf_ngram_cosinesim_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)

    generators = [TFIDF_Word_Ngram_CosineSim, TFIDF_Char_Ngram_CosineSim]
    # ngrams_list = [[1,2,3], [2,3,4,5]]
    ngrams_list = [[1,2,3], [4]]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_description", "product_attribute"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator,ngrams in zip(generators, ngrams_list):
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
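A compact scikit-learn stand-in for TFIDF_Char_Ngram_CosineSim (the repo builds its own vectorizers; fitting per pair, as here, is only for illustration, since in practice the vectorizer is fit once on the whole corpus):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def tfidf_char_ngram_cosinesim(obs, target, n=4):
    vec = TfidfVectorizer(analyzer="char", ngram_range=(n, n))
    X = vec.fit_transform([obs, target])
    return float(cosine_similarity(X[0], X[1])[0, 0])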
Example #47
def __init__(self, feature_list, feature_name, feature_suffix=".csv",
             feature_level=2, meta_feature_dict={}, corr_threshold=0):
    self.feature_name = feature_name
    self.feature_list = feature_list
    self.feature_suffix = feature_suffix
    self.feature_level = feature_level
    # for meta features
    self.meta_feature_dict = meta_feature_dict
    self.corr_threshold = corr_threshold
    self.feature_names_basic = []
    self.feature_names_cv = []
    self.has_basic = 1 if self.meta_feature_dict else 0
    logname = "feature_combiner_%s_%s.log"%(feature_name, time_utils._timestamp())
    self.logger = logging_utils._get_logger(config.LOG_DIR, logname)
    # splitter_level2, splitter_level3 and n_iter are globals defined
    # elsewhere in the source file
    if self.feature_level == 2:
        self.splitter = splitter_level2
    elif self.feature_level == 3:
        self.splitter = splitter_level3
    self.n_iter = n_iter
    self.splitter_prev = [0]*self.n_iter
Example #48
def run_lsa_ngram_cooc():
    logname = "generate_feature_lsa_ngram_cooc_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)

    generators = [LSA_Word_Ngram_Cooc]
    obs_ngrams = [1, 2]
    target_ngrams = [1, 2]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_description"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for obs_ngram in obs_ngrams:
            for target_ngram in target_ngrams:
                for generator in generators:
                    param_list = [obs_ngram, target_ngram, config.SVD_DIM, config.SVD_N_ITER]
                    pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                    pf.go()
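The config.SVD_DIM and config.SVD_N_ITER parameters above feed a truncated SVD; the LSA pipeline is roughly the following (toy corpus and dimensions are ours):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

corpus = ["wood deck stain", "deck paint exterior", "interior wall paint"]
X = TfidfVectorizer(ngram_range=(1, 1)).fit_transform(corpus)
svd = TruncatedSVD(n_components=2, n_iter=5, random_state=0)
lsa_vectors = svd.fit_transform(X)  # one dense LSA vector per document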
Example #49
def main():
    logname = "generate_feature_group_distance_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()

    ## run python3 splitter.py first
    split = pkl_utils._load("%s/splits_level1.pkl"%config.SPLIT_DIR)
    n_iter = len(split)

    relevances_complete = [1, 1.25, 1.33, 1.5, 1.67, 1.75, 2, 2.25, 2.33, 2.5, 2.67, 2.75, 3]
    relevances = [1, 1.33, 1.67, 2, 2.33, 2.67, 3]
    ngrams = [1]
    obs_fields = ["search_term"]
    target_fields = ["product_title", "product_description"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]

    ## for cv
    for i in range(n_iter):
        trainInd, validInd = split[i][0], split[i][1]
        dfTrain2 = dfTrain.iloc[trainInd].copy()
        sub_feature_dir = "%s/Run%d" % (config.FEAT_DIR, i+1)

        for target_field in target_fields:
            for relevance in relevances:
                for ngram in ngrams:
                    param_list = [dfAll["id"], dfTrain2, target_field, relevance, ngram, aggregation_mode]
                    pf = PairwiseFeatureWrapper(GroupRelevance_Ngram_Jaccard, dfAll, obs_fields, [target_field], param_list, sub_feature_dir, logger)
                    pf.go()

    ## for all
    sub_feature_dir = "%s/All" % (config.FEAT_DIR)
    for target_field in target_fields:
        for relevance in relevances:
            for ngram in ngrams:
                param_list = [dfAll["id"], dfTrain, target_field, relevance, ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(GroupRelevance_Ngram_Jaccard, dfAll, obs_fields, [target_field], param_list, sub_feature_dir, logger)
                pf.go()
Example #50
def main():
    logname = "generate_feature_wordnet_similarity_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    #### NOTE: use data BEFORE STEMMING
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)

    generators = [
        WordNet_Path_Similarity,
        WordNet_Lch_Similarity,
        WordNet_Wup_Similarity,
    ]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_description", "product_attribute"] )
    # double aggregation
    aggregation_mode_prev = ["mean", "max", "min", "median"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            param_list = [aggregation_mode_prev, aggregation_mode]
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
            pf.go()
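For a single word pair, a WordNet path-similarity feature reduces to the NLTK call below (requires the wordnet corpus via nltk.download('wordnet')); the generators above additionally aggregate over all word pairs between the two fields, which is what the double aggregation modes control:

from nltk.corpus import wordnet as wn

def max_path_similarity(word1, word2):
    sims = [s1.path_similarity(s2)
            for s1 in wn.synsets(word1)
            for s2 in wn.synsets(word2)]
    sims = [s for s in sims if s is not None]
    return max(sims) if sims else 0.0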
Example #51
def main():
    logname = "generate_feature_match_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    
    generators = [
        MatchQueryCount, 
        MatchQueryRatio, 
        LongestMatchSize,
        LongestMatchRatio
    ]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            param_list = []
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
            pf.go()

    # product_attribute_list
    generators = [
        MatchAttrCount, 
        MatchAttrRatio, 
        IsIndoorOutdoorMatch, 
    ]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_attribute_list"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            param_list = []
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
            pf.go()
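Hedged sketches of the two simplest generators in this family, counting whole-query occurrences in the target and the longest-common-substring ratio (function names are ours, not the repo's):

import difflib

def match_query_count(obs, target):
    return target.count(obs) if obs else 0

def longest_match_ratio(obs, target):
    if not obs or not target:
        return 0.0
    m = difflib.SequenceMatcher(None, obs, target).find_longest_match(
        0, len(obs), 0, len(target))
    return m.size / float(min(len(obs), len(target)))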
Example #52
def main():

    ###########
    ## Setup ##
    ###########
    # `now` is a timestamp string defined elsewhere in the source file
    logname = "data_processor_%s.log"%now
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    # put product_attribute_list, product_attribute and product_description first as they are
    # quite time consuming to process
    columns_to_proc = [
        # # product_attribute_list is very time consuming to process
        # # so we just process product_attribute which is of the form 
        # # attr_name1 | attr_value1 | attr_name2 | attr_value2 | ...
        # # and split it into a list afterwards
        # "product_attribute_list",
        "product_attribute_concat",
        "product_description",
        "product_brand", 
        "product_color",
        "product_title",
        "search_term", 
    ]
    if config.PLATFORM == "Linux":
        config.DATA_PROCESSOR_N_JOBS = len(columns_to_proc)

    # clean using a list of processors
    processors = [
        LowerCaseConverter(), 
        # See LowerUpperCaseSplitter and UnitConverter for why we put UnitConverter here
        UnitConverter(),
        LowerUpperCaseSplitter(), 
        WordReplacer(replace_fname=config.WORD_REPLACER_DATA), 
        LetterLetterSplitter(),
        DigitLetterSplitter(), 
        DigitCommaDigitMerger(), 
        NumberDigitMapper(),
        UnitConverter(), 
        QuartetCleaner(), 
        HtmlCleaner(parser="html.parser"), 
        Lemmatizer(),
    ]
    stemmers = [
        Stemmer(stemmer_type="snowball"), 
        Stemmer(stemmer_type="porter")
    ][0:1]

    ## simple test
    text = "1/2 inch rubber lep tips Bullet07"
    print("Original:")
    print(text)
    list_processor = ListProcessor(processors)
    print("After:")
    print(list_processor.process([text]))

    #############
    ## Process ##
    #############
    ## load raw data
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    columns_to_proc = [col for col in columns_to_proc if col in dfAll.columns]


    ## extract product name from search_term and product_title
    ext = ProductNameExtractor()
    dfAll["search_term_product_name"] = dfAll["search_term"].apply(ext.transform)
    dfAll["product_title_product_name"] = dfAll["product_title"].apply(ext.transform)
    if config.TASK == "sample":
        print(dfAll[["search_term", "search_term_product_name", "product_title_product_name"]])


    ## clean using GoogleQuerySpellingChecker
    # MUST BE IN FRONT OF ALL THE PROCESSING
    logger.info("Run GoogleQuerySpellingChecker at search_term")
    checker = GoogleQuerySpellingChecker()
    dfAll["search_term"] = dfAll["search_term"].apply(checker.correct)


    ## clean using a list of processors
    df_processor = DataFrameParallelProcessor(processors, config.DATA_PROCESSOR_N_JOBS)
    df_processor.process(dfAll, columns_to_proc)
    # split product_attribute_concat into product_attribute and product_attribute_list
    dfAll["product_attribute"] = dfAll["product_attribute_concat"].apply(_split_attr_to_text)
    dfAll["product_attribute_list"] = dfAll["product_attribute_concat"].apply(_split_attr_to_list)
    if config.TASK == "sample":
        print(dfAll[["product_attribute", "product_attribute_list"]])
    # query expansion
    list_processor = ListProcessor(processors)
    base_stopwords = set(list_processor.process(list(config.STOP_WORDS)))
    qe = QueryExpansion(dfAll, ngram=3, stopwords_threshold=0.9, base_stopwords=base_stopwords)
    dfAll["search_term_alt"] = qe.build()
    if config.TASK == "sample":
        print(dfAll[["search_term", "search_term_alt"]])
    # save data
    logger.info("Save to %s"%config.ALL_DATA_LEMMATIZED)
    columns_to_save = [col for col in dfAll.columns if col != "product_attribute_concat"]
    pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll[columns_to_save])


    ## auto correcting query
    if config.AUTO_CORRECTING_QUERY:
        logger.info("Run AutoSpellingChecker at search_term")
        checker = AutoSpellingChecker(dfAll, exclude_stopwords=False, min_len=4)
        dfAll['search_term_auto_corrected'] = list(dfAll["search_term"].apply(checker.correct))
        columns_to_proc += ['search_term_auto_corrected']
        if config.TASK == "sample":
            print(dfAll[["search_term", "search_term_auto_corrected"]])
        # save query_correction_map and spelling checker
        fname = "%s/auto_spelling_checker_query_correction_map_%s.log"%(config.LOG_DIR, now)
        checker.save_query_correction_map(fname)
        # save data
        logger.info("Save to %s"%config.ALL_DATA_LEMMATIZED)
        columns_to_save = [col for col in dfAll.columns if col != "product_attribute_concat"]
        pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll[columns_to_save])


    ## clean using stemmers
    df_processor = DataFrameParallelProcessor(stemmers, config.DATA_PROCESSOR_N_JOBS)
    df_processor.process(dfAll, columns_to_proc)
    # split product_attribute_concat into product_attribute and product_attribute_list
    dfAll["product_attribute"] = dfAll["product_attribute_concat"].apply(_split_attr_to_text)
    dfAll["product_attribute_list"] = dfAll["product_attribute_concat"].apply(_split_attr_to_list)
    # query expansion
    list_processor = ListProcessor(stemmers)
    base_stopwords = set(list_processor.process(list(config.STOP_WORDS)))
    qe = QueryExpansion(dfAll, ngram=3, stopwords_threshold=0.9, base_stopwords=base_stopwords)
    dfAll["search_term_alt"] = qe.build()
    if config.TASK == "sample":
        print(dfAll[["search_term", "search_term_alt"]])
    # save data
    logger.info("Save to %s"%config.ALL_DATA_LEMMATIZED_STEMMED)
    columns_to_save = [col for col in dfAll.columns if col != "product_attribute_concat"]
    pkl_utils._save(config.ALL_DATA_LEMMATIZED_STEMMED, dfAll[columns_to_save])
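The processors chained above plausibly share a one-method transform interface; a minimal sketch of two of them plus the list driver (interface and regex assumed, not copied from the repo):

import re

class LowerCaseConverter:
    def transform(self, text):
        return text.lower()

class DigitLetterSplitter:
    def transform(self, text):
        # e.g. "100ft" -> "100 ft"
        return re.sub(r"(\d)([a-zA-Z])", r"\1 \2", text)

class ListProcessor:
    def __init__(self, processors):
        self.processors = processors
    def process(self, texts):
        # run every processor over every text, in order
        for p in self.processors:
            texts = [p.transform(t) for t in texts]
        return texts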
Example #53
def main():
    logname = "generate_feature_group_distance_stat_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    y_train = dfAll["relevance"].values[:TRAIN_SIZE]

    group_id_names = ["DocId_search_term", "DocId_product_title", "DocIdEcho_product_uid"]

    match_list = [
    "MatchQueryCount",
    "MatchQueryRatio",
    "LongestMatchRatio",
    ]

    tfidf_list = [
    "StatCoocTF_Unigram_Mean", 
    "StatCoocTF_Unigram_Max",
    "StatCoocTF_Unigram_Min",
    # "StatCoocNormTF_Unigram_Mean", 
    # "StatCoocNormTF_Unigram_Max",
    # "StatCoocNormTF_Unigram_Min", 
    "StatCoocTFIDF_Unigram_Mean",
    "StatCoocTFIDF_Unigram_Max",
    "StatCoocTFIDF_Unigram_Min",
    "StatCoocBM25_Unigram_Mean",
    "StatCoocBM25_Unigram_Max",
    "StatCoocBM25_Unigram_Min",
    # "StatCoocTF_Bigram_Mean", 
    # "StatCoocTF_Bigram_Max",
    # "StatCoocTF_Bigram_Min",
    # "StatCoocNormTF_Bigram_Mean", 
    # "StatCoocNormTF_Bigram_Max",
    # "StatCoocNormTF_Bigram_Min",
    # "StatCoocTFIDF_Bigram_Mean",
    # "StatCoocTFIDF_Bigram_Max",
    # "StatCoocTFIDF_Bigram_Min",
    # "StatCoocBM25_Bigram_Mean",
    # "StatCoocBM25_Bigram_Max",
    # "StatCoocBM25_Bigram_Min",
    # "StatCoocTF_Trigram_Mean", 
    # "StatCoocTF_Trigram_Max",
    # "StatCoocTF_Trigram_Min",
    # "StatCoocNormTF_Trigram_Mean", 
    # "StatCoocNormTF_Trigram_Max",
    # "StatCoocNormTF_Trigram_Min", 
    # "StatCoocTFIDF_Trigram_Mean",
    # "StatCoocTFIDF_Trigram_Max",
    # "StatCoocTFIDF_Trigram_Min",
    # "StatCoocBM25_Trigram_Mean",
    # "StatCoocBM25_Trigram_Max",
    # "StatCoocBM25_Trigram_Min",
    ]
    intersect_ngram_count_list = [    
    "IntersectCount_Unigram", 
    "IntersectRatio_Unigram", 
    # "IntersectCount_Bigram", 
    # "IntersectRatio_Bigram", 
    # "IntersectCount_Trigram", 
    # "IntersectRatio_Trigram", 
    ]
    first_last_ngram_list = [
    "FirstIntersectCount_Unigram", 
    "FirstIntersectRatio_Unigram", 
    "LastIntersectCount_Unigram", 
    "LastIntersectRatio_Unigram",
    # "FirstIntersectCount_Bigram", 
    # "FirstIntersectRatio_Bigram", 
    # "LastIntersectCount_Bigram", 
    # "LastIntersectRatio_Bigram",
    # "FirstIntersectCount_Trigram", 
    # "FirstIntersectRatio_Trigram", 
    # "LastIntersectCount_Trigram", 
    # "LastIntersectRatio_Trigram",
    ]

    cooccurrence_ngram_count_list = [
    "CooccurrenceCount_Unigram", 
    "CooccurrenceRatio_Unigram", 
    # "CooccurrenceCount_Bigram", 
    # "CooccurrenceRatio_Bigram",
    # "CooccurrenceCount_Trigram", 
    # "CooccurrenceRatio_Trigram",
    ]

    ngram_jaccard_list = [
    "JaccardCoef_Unigram", 
    # "JaccardCoef_Bigram", 
    # "JaccardCoef_Trigram", 
    "DiceDistance_Unigram", 
    # "DiceDistance_Bigram", 
    # "DiceDistance_Trigram", 
    ]

    char_dist_sim_list = [
    "CharDistribution_CosineSim",
    "CharDistribution_KL",
    ]

    tfidf_word_ngram_cosinesim_list = [
    "TFIDF_Word_Unigram_CosineSim",
    # "TFIDF_Word_Bigram_CosineSim",
    # "TFIDF_Word_Trigram_CosineSim",
    ]
    tfidf_char_ngram_cosinesim_list = [
    # "TFIDF_Char_Bigram_CosineSim",
    # "TFIDF_Char_Trigram_CosineSim",
    "TFIDF_Char_Fourgram_CosineSim",
    # "TFIDF_Char_Fivegram_CosineSim",
    ]

    lsa_word_ngram_cosinesim_list = [
    "LSA100_Word_Unigram_CosineSim",
    # "LSA100_Word_Bigram_CosineSim",
    # "LSA100_Word_Trigram_CosineSim",
    ]
    lsa_char_ngram_cosinesim_list = [
    # "LSA100_Char_Bigram_CosineSim",
    # "LSA100_Char_Trigram_CosineSim",
    "LSA100_Char_Fourgram_CosineSim",
    # "LSA100_Char_Fivegram_CosineSim",
    ]

    doc2vec_list = [
    "Doc2Vec_Homedepot_D100_CosineSim",
    ]

    word2vec_list = [
    "Word2Vec_N_Similarity",
    "Word2Vec_Homedepot_D100_CosineSim_Mean_Mean",
    "Word2Vec_Homedepot_D100_CosineSim_Max_Mean",
    "Word2Vec_Homedepot_D100_CosineSim_Min_Mean",
    ]

    distance_generator_list = \
    match_list + \
    tfidf_list + \
    intersect_ngram_count_list + \
    first_last_ngram_list + \
    cooccurrence_ngram_count_list + \
    ngram_jaccard_list + \
    tfidf_word_ngram_cosinesim_list + \
    tfidf_char_ngram_cosinesim_list + \
    lsa_word_ngram_cosinesim_list + \
    lsa_char_ngram_cosinesim_list + \
    char_dist_sim_list + \
    word2vec_list + \
    doc2vec_list

    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["search_term"] )
    target_fields_list.append( ["product_title", "product_title_product_name"] )
    aggregation_mode = ["mean", "max", "min"]
    for group_id_name in group_id_names:
        group_id_list = pkl_utils._load(os.path.join(config.FEAT_DIR, group_id_name+"_1D.pkl"))
        for distance_generator in distance_generator_list:
            for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
                for obs_field in obs_fields:
                    for target_field in target_fields:
                        dist_name = "%s_%s_x_%s"%(distance_generator, obs_field, target_field)
                        try:
                            dist_list = pkl_utils._load(os.path.join(config.FEAT_DIR, dist_name+"_1D.pkl"))
                            ext = GroupDistanceStat(dist_list, group_id_list, dist_name, group_id_name, aggregation_mode)
                            x = ext.transform()
                            if isinstance(ext.__name__(), list):
                                for i,feat_name in enumerate(ext.__name__()):
                                    dim = 1
                                    fname = "%s_%dD"%(feat_name, dim)
                                    pkl_utils._save(os.path.join(config.FEAT_DIR, fname+config.FEAT_FILE_SUFFIX), x[:,i])
                                    corr = np_utils._corr(x[:TRAIN_SIZE,i], y_train)
                                    logger.info("%s (%dD): corr = %.6f"%(fname, dim, corr))
                        except:
                            logger.info("Skip %s"%dist_name)
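The loop above amounts to a grouped aggregation of previously saved 1D distance features; the same computation in pandas, on toy data:

import numpy as np
import pandas as pd

dist_list = pd.Series(np.random.rand(9))
group_id_list = pd.Series([0, 0, 0, 1, 1, 1, 2, 2, 2])
# one row per group, one column per aggregation mode
print(dist_list.groupby(group_id_list).agg(["mean", "max", "min"]))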
Example #54
def main(which):
    logname = "generate_feature_word2vec_%s_%s.log"%(which, time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    #### NOTE: use data BEFORE STEMMING
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)

    word2vec_model_dirs = []
    model_prefixes = []
    if which == "homedepot":
        ## word2vec model trained with Homedepot dataset: brand/color/query/title/description
        word2vec_model_dirs.append( config.WORD2VEC_MODEL_DIR + "/Homedepot-word2vec-D%d-min_count%d.model"%(config.EMBEDDING_DIM, config.EMBEDDING_MIN_COUNT) )
        model_prefixes.append( "Homedepot" )
    elif which == "wikipedia":
        ## word2vec model pretrained with Wikipedia+Gigaword 5
        word2vec_model_dirs.append( config.GLOVE_WORD2VEC_MODEL_DIR + "/glove.6B.300d.txt" )
        model_prefixes.append( "Wikipedia" )
    elif which == "google":
        ## word2vec model pretrained with Google News
        word2vec_model_dirs.append( config.WORD2VEC_MODEL_DIR + "/GoogleNews-vectors-negative300.bin" )
        model_prefixes.append( "GoogleNews" )

    for word2vec_model_dir, model_prefix in zip(word2vec_model_dirs, model_prefixes):
        ## load model
        try:
            # NB: in gensim >= 1.0 these loaders live on gensim.models.KeyedVectors
            if ".bin" in word2vec_model_dir:
                word2vec_model = gensim.models.Word2Vec.load_word2vec_format(word2vec_model_dir, binary=True)
            elif ".txt" in word2vec_model_dir:
                word2vec_model = gensim.models.Word2Vec.load_word2vec_format(word2vec_model_dir, binary=False)
            else:
                word2vec_model = gensim.models.Word2Vec.load(word2vec_model_dir)
        except:
            # skip models that are missing or fail to load
            continue

        # ## standalone (not used in model building)
        # obs_fields = ["search_term", "product_title", "product_description"]
        # generator = Word2Vec_Centroid_Vector
        # param_list = [word2vec_model, model_prefix]
        # sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        # sf.go()

        ## pairwise
        generators = [
            Word2Vec_Importance,
            Word2Vec_N_Similarity, 
            Word2Vec_N_Similarity_Imp, 
            Word2Vec_Centroid_RMSE, 
            Word2Vec_Centroid_RMSE_IMP,
            # # not used in final submission
            # Word2Vec_Centroid_Vdiff, 
        ]
        obs_fields_list = []
        target_fields_list = []
        obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"][:1] )
        target_fields_list.append( ["product_title", "product_description", "product_attribute", "product_brand", "product_color"] )
        for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
            for generator in generators:
                param_list = [word2vec_model, model_prefix]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()

        ## cosine sim
        generators = [
            Word2Vec_CosineSim,
        ]
        # double aggregation
        aggregation_mode_prev = ["mean", "max", "min", "median"]
        aggregation_mode = ["mean", "std", "max", "min", "median"]
        for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
            for generator in generators:
                param_list = [word2vec_model, model_prefix, aggregation_mode, aggregation_mode_prev]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
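For reference, Word2Vec_N_Similarity essentially wraps gensim's n_similarity, which compares the mean vectors of two token sets; out-of-vocabulary tokens must be filtered first. A sketch (in gensim >= 1.0 the lookup lives on model.wv):

def n_similarity_safe(model, obs_tokens, target_tokens):
    obs = [t for t in obs_tokens if t in model]
    target = [t for t in target_tokens if t in model]
    if not obs or not target:
        return 0.0
    return float(model.n_similarity(obs, target))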