list(df_test['search_term_stem'].values) +
        list(df_train['description_stem'].values) +
        list(df_test['description_stem'].values)
    )

    #######################
    ## Generate Features ##
    #######################
    print("==================================================")
    print("Generate distance features...")

    # The helper's return value is ignored; the cosine columns selected
    # below are presumably added to each frame in place -- confirm in
    # extract_cosine_feat.
    for frame in (df_train, df_test):
        extract_cosine_feat(frame)

    # Columns exported for this feature group ('id' plus the cosine columns).
    feat_names = [
        'id',
        'cosine_st_orig',
        'cosine_st_t_stem',
        'cosine_st_d_stem',
        'set_st_t_stem',
    ]

    # Make sure the output directory exists before writing into it.
    cosine_dir = config.path_cosine_features
    if not os.path.exists(cosine_dir):
        os.makedirs(cosine_dir)

    # Pickle the selected columns of the train and test frames, in that
    # order, using the highest available pickle protocol.
    for target_path, frame in (
        (config.file_cosine_feat_train, df_train),
        (config.file_cosine_feat_test, df_test),
    ):
        with open(target_path, "wb") as f:
            pickle.dump(frame[feat_names], f, pickle.HIGHEST_PROTOCOL)

    # Persist the list of exported column names next to the features.
    print("Feature names are stored in %s" % config.file_cosine_feat_name)
    feature_utils.dump_feat_name(feat_names, config.file_cosine_feat_name)

    print("All Done.")
    # Replace missing values with empty strings in both frames before the
    # word-count helper runs.
    df_train, df_test = df_train.fillna(''), df_test.fillna('')
    for frame in (df_train, df_test):
        # Return value is ignored; the count/ratio columns picked up below
        # are presumably added in place -- confirm in the helper.
        generate_intersect_word_count(frame)

    # Export 'id' plus every column whose name contains one of the
    # count/ratio markers, in the order they appear in the train frame.
    feat_names = ["id"] + [
        col
        for col in df_train.columns
        if any(marker in col for marker in ("count_of_", "ratio_of_"))
    ]

    X_train = df_train[feat_names]
    print(X_train.shape)
    X_test = df_test[feat_names]
    print(X_test.shape)

    # Make sure the output directory exists before writing into it.
    brand_dir = config.path_brand_counting_features
    if not os.path.exists(brand_dir):
        os.makedirs(brand_dir)

    # Pickle train then test with the highest available pickle protocol.
    for target_path, features in (
        (config.file_brand_count_feat_train, X_train),
        (config.file_brand_count_feat_test, X_test),
    ):
        with open(target_path, "wb") as f:
            pickle.dump(features, f, pickle.HIGHEST_PROTOCOL)

    # Persist the list of exported column names next to the features.
    print("Feature names are stored in %s" % config.file_brand_feat_name)
    feature_utils.dump_feat_name(feat_names, config.file_brand_feat_name)

    print("All Done.")
    print("==================================================")
    print("Generate distance features...")

    # Return value is ignored; the jaccard/dice columns picked up below are
    # presumably added to each frame in place -- confirm in the helper.
    for frame in (df_train, df_test):
        extract_basic_distance_feat(frame)

    # Export 'id' plus every column whose name contains one of the distance
    # markers, in the order they appear in the train frame.
    feat_names = ["id"] + [
        col
        for col in df_train.columns
        if any(marker in col for marker in ("jaccard_coef", "dice_dist"))
    ]

    X_train = df_train[feat_names]
    print(X_train.shape)
    X_test = df_test[feat_names]
    print(X_test.shape)

    # Make sure the output directory exists before writing into it.
    distance_dir = config.path_distance_features
    if not os.path.exists(distance_dir):
        os.makedirs(distance_dir)

    # Pickle train then test with the highest available pickle protocol.
    for target_path, features in (
        (config.file_distance_feat_train, X_train),
        (config.file_distance_feat_test, X_test),
    ):
        with open(target_path, "wb") as f:
            pickle.dump(features, f, pickle.HIGHEST_PROTOCOL)

    # Persist the list of exported column names next to the features.
    print("Feature names are stored in %s" % config.file_distance_feat_name)
    feature_utils.dump_feat_name(feat_names, config.file_distance_feat_name)

    print("All Done.")
        # Fit the reducer on this pass's training matrix and project it.
        X_svd_train = svd.fit_transform(X_tfidf_train)
        print(type(X_svd_train))
        print("X_svd_train shape: {0}".format(X_svd_train.shape))
        # The pass with i == 0 seeds the accumulator; every other pass
        # appends its reduced columns to the right of what is already there.
        # (i is presumably the enclosing loop's index -- header not in view.)
        if i == 0:
            nd_train = X_svd_train
        else:
            nd_train = np.hstack([nd_train, X_svd_train])
        print("nd_train shape: {0}".format(nd_train.shape))

        # Project the test matrix with the reducer fitted on train just
        # above (transform only, no refit).
        X_svd_test = svd.transform(X_tfidf_test)
        print("X_svd_test shape: {0}".format(X_svd_test.shape))
        if i == 0:
            nd_test = X_svd_test
        else:
            nd_test = np.hstack([nd_test, X_svd_test])
        # Label fixed: this line reports the accumulated *test* matrix; it
        # was previously printed as "df_train shape", which mislabelled the
        # value in the run log.
        print("nd_test shape: {0}".format(nd_test.shape))

    print("Done.")

    # Write the stacked train matrix, then the stacked test matrix, into the
    # TF-IDF feature directory using the highest available pickle protocol.
    train_pkl_path = "%s/train.tfidf.feat.pkl" % (config.path_tfidf_features)
    with open(train_pkl_path, "wb") as f:
        pickle.dump(nd_train, f, pickle.HIGHEST_PROTOCOL)
    test_pkl_path = "%s/test.tfidf.feat.pkl" % (config.path_tfidf_features)
    with open(test_pkl_path, "wb") as f:
        pickle.dump(nd_test, f, pickle.HIGHEST_PROTOCOL)

    # Persist the feature names: one entry per source column, tagged with
    # the SVD component count.
    # NOTE(review): this yields a single name per entry of column_names;
    # confirm that matches what consumers expect for the width of the
    # dumped matrices.
    print("Feature names are stored in %s" % config.file_tfidf_feat_name)
    svd_feat_names = []
    for name in column_names:
        svd_feat_names.append(
            "%s_tfidf_individual_svd%d" % (name, svd_n_components)
        )
    feature_utils.dump_feat_name(svd_feat_names, config.file_tfidf_feat_name)

    print("All Done.")