def generate_word_counting_features(df):
    """Add n-gram count, uniqueness, digit and missing-description columns to ``df``.

    For every field in (search_term, title, description) and every gram in
    (unigram, bigram, trigram) the column ``<field>_<gram>`` is read and three
    columns are written:

    * ``count_of_<field>_<gram>``        -- ``len`` of the cell
    * ``count_of_unique_<field>_<gram>`` -- ``len(set(cell))``
    * ``ratio_of_unique_<field>_<gram>`` -- unique count over count, computed
      with ``feature_utils.try_divide``

    Per field, ``count_of_digit_in_<field>`` (number of unigram items whose
    ``isdigit()`` is true) and ``ratio_of_digit_in_<field>`` (that count over
    the unigram count) are added.  Finally ``description_missing`` flags rows
    whose ``description_unigram`` cell is empty.

    Args:
        df: pandas DataFrame that already holds the ``<field>_<gram>``
            columns.  Cells are presumably token lists -- TODO confirm
            against the code that builds them.

    Returns:
        None.  ``df`` is modified in place.
    """
    print("generate word counting features")
    feat_names = ["search_term", "title", "description"]
    grams = ["unigram", "bigram", "trigram"]

    def count_digit(tokens):
        # Float increments keep the result a float whenever at least one
        # digit token is present, as in the original lambda.
        return sum(1. for w in tokens if w.isdigit())

    def count_unique(tokens):
        return len(set(tokens))

    for feat_name in feat_names:
        for gram in grams:
            source_col = feat_name + "_" + gram
            count_col = "count_of_%s_%s" % (feat_name, gram)
            unique_col = "count_of_unique_%s_%s" % (feat_name, gram)
            ratio_col = "ratio_of_unique_%s_%s" % (feat_name, gram)

            # Single-column features only need that column, so apply on the
            # Series instead of materialising every row of the frame.
            print("Generating count_of_{0}_{1} feature...".format(feat_name, gram))
            df[count_col] = df[source_col].apply(len)
            print("Generating count_of_unique_{0}_{1} feature...".format(feat_name, gram))
            df[unique_col] = df[source_col].apply(count_unique)
            print("Generating ratio_of_unique_{0}_{1} feature...".format(feat_name, gram))
            df[ratio_col] = df.apply(
                lambda row: feature_utils.try_divide(row[unique_col], row[count_col]), axis=1)

        # digit count (always based on the unigram column)
        digit_col = "count_of_digit_in_%s" % feat_name
        unigram_count_col = "count_of_%s_unigram" % feat_name
        print("Generating count_of_digit_in_{0} feature...".format(feat_name))
        df[digit_col] = df[feat_name + "_unigram"].apply(count_digit)
        print("Generating ratio_of_digit_in_{0} feature...".format(feat_name))
        df["ratio_of_digit_in_%s" % feat_name] = df.apply(
            lambda row: feature_utils.try_divide(row[digit_col], row[unigram_count_col]), axis=1)

    # description missing indicator
    # The cell is measured with len() above, so test emptiness with len() as
    # well: comparing a token list to "" is never true, which left this flag
    # stuck at 0.  An empty string still yields 1, as before.
    print("Generating description_missing feature...")
    df["description_missing"] = df["description_unigram"].apply(
        lambda tokens: int(len(tokens) == 0))
def generate_intersect_word_position_features(df):
    """Add position statistics of one field's unigrams inside another field.

    For every ordered pair of distinct fields (target, obs) taken from
    (search_term, title, description), ``get_position_list`` is called per row
    with the target cell and ``obs=`` the observed cell.  The min, mean,
    median, max and std of the returned sequence are stored in
    ``pos_of_<obs>_unigram_in_<target>_<stat>``.  Each of those columns is
    then divided by ``count_of_<obs>_unigram`` through
    ``feature_utils.try_divide`` and stored under the same name prefixed with
    ``normalized_``.

    Args:
        df: pandas DataFrame holding the ``<field>_unigram`` columns and the
            ``count_of_<field>_unigram`` columns, which must already exist
            because they are read here.

    Returns:
        None.  ``df`` is modified in place.
    """
    print("generate intersect word position features")
    fields = ["search_term", "title", "description"]
    gram_kinds = ["unigram"]
    # (column suffix, reducer) in the order the columns are produced.
    reducers = [
        ("min", np.min),
        ("mean", np.mean),
        ("median", np.median),
        ("max", np.max),
        ("std", np.std),
    ]

    for gram in gram_kinds:
        for target_name in fields:
            target_col = target_name + "_" + gram
            for obs_name in fields:
                if target_name == obs_name:
                    continue
                obs_col = obs_name + "_" + gram
                positions = df.apply(
                    lambda row: get_position_list(row[target_col], obs=row[obs_col]),
                    axis=1)

                # stats feat on pos
                for suffix, reducer in reducers:
                    stat_col = "pos_of_%s_%s_in_%s_%s" % (obs_name, gram, target_name, suffix)
                    print("Generating " + stat_col)
                    df[stat_col] = positions.apply(lambda seq: reducer(seq))

                # stats feat on normalized_pos
                denominator_col = "count_of_%s_%s" % (obs_name, gram)
                for suffix, _ in reducers:
                    stat_col = "pos_of_%s_%s_in_%s_%s" % (obs_name, gram, target_name, suffix)
                    normalized_col = "normalized_" + stat_col
                    print("Generating " + normalized_col)
                    df[normalized_col] = df.apply(
                        lambda row: feature_utils.try_divide(row[stat_col], row[denominator_col]),
                        axis=1)
def generate_intersect_word_count(df):
    """Add cross-field n-gram overlap counts and ratios to ``df``.

    For every gram in (unigram, bigram, trigram) and every ordered pair of
    distinct fields (obs, target) from (search_term, title, description):

    * ``count_of_<obs>_<gram>_in_<target>`` -- number of items of the obs cell
      (duplicates included) that also occur in the target cell
    * ``ratio_of_<obs>_<gram>_in_<target>`` -- that count over
      ``count_of_<obs>_<gram>``, via ``feature_utils.try_divide``

    Four further ratios per gram relate how much of the title / description
    appears in the search term to the search-term size and to the reverse
    overlap count.

    NOTE(review): a second function with this exact name is defined later in
    this file and replaces this one when the module is imported; one of the
    two probably needs a different name.

    Args:
        df: pandas DataFrame holding the ``<field>_<gram>`` columns and the
            ``count_of_<field>_<gram>`` columns, which must already exist
            because they are read here.

    Returns:
        None.  ``df`` is modified in place.
    """
    print("generate intersect word counting features")
    feat_names = ["search_term", "title", "description"]
    grams = ["unigram", "bigram", "trigram"]

    def count_shared(obs_items, target_items):
        # Build the lookup set once per row instead of once per obs item.
        lookup = set(target_items)
        return sum(1. for w in obs_items if w in lookup)

    for gram in grams:
        for obs_name in feat_names:
            for target_name in feat_names:
                if target_name == obs_name:
                    continue
                obs_col = obs_name + "_" + gram
                target_col = target_name + "_" + gram
                count_col = "count_of_%s_%s_in_%s" % (obs_name, gram, target_name)
                obs_total_col = "count_of_%s_%s" % (obs_name, gram)

                ## query
                print("Generating count_of_{0}_{1}_in_{2} feature...".format(obs_name, gram, target_name))
                # A plain list is assigned positionally, exactly like the
                # original list(df.apply(...)), and also works on an empty frame.
                df[count_col] = [
                    count_shared(obs_items, target_items)
                    for obs_items, target_items in zip(df[obs_col], df[target_col])
                ]
                print("Generating ratio_of_{0}_{1}_in_{2} feature...".format(obs_name, gram, target_name))
                df["ratio_of_%s_%s_in_%s" % (obs_name, gram, target_name)] = df.apply(
                    lambda row: feature_utils.try_divide(row[count_col], row[obs_total_col]), axis=1)

        ## some other feat
        print("Generating title_{0}_in_search_term_div_search_term_{1} feature...".format(gram, gram))
        df["title_%s_in_search_term_div_search_term_%s" % (gram, gram)] = df.apply(
            lambda x: feature_utils.try_divide(x["count_of_title_%s_in_search_term" % gram],
                                               x["count_of_search_term_%s" % gram]),
            axis=1)
        print("Generating title_{0}_in_search_term_div_search_term_{1}_in_title feature...".format(gram, gram))
        df["title_%s_in_search_term_div_search_term_%s_in_title" % (gram, gram)] = df.apply(
            lambda x: feature_utils.try_divide(x["count_of_title_%s_in_search_term" % gram],
                                               x["count_of_search_term_%s_in_title" % gram]),
            axis=1)
        print("Generating description_{0}_in_search_term_div_search_term_{1} feature...".format(gram, gram))
        df["description_%s_in_search_term_div_search_term_%s" % (gram, gram)] = df.apply(
            lambda x: feature_utils.try_divide(x["count_of_description_%s_in_search_term" % gram],
                                               x["count_of_search_term_%s" % gram]),
            axis=1)
        print("Generating description_{0}_in_search_term_div_search_term_{1}_in_description feature...".format(gram, gram))
        df["description_%s_in_search_term_div_search_term_%s_in_description" % (gram, gram)] = df.apply(
            lambda x: feature_utils.try_divide(x["count_of_description_%s_in_search_term" % gram],
                                               x["count_of_search_term_%s_in_description" % gram]),
            axis=1)
def generate_intersect_word_count(df):
    """Add brand n-gram counts and search-term/brand overlap columns to ``df``.

    For every gram in (unigram, bigram, trigram):

    * ``count_of_brand_<gram>`` -- ``len`` of the ``brand_<gram>`` cell
    * ``count_of_search_term_<gram>_in_brand_<gram>`` -- number of items of
      the ``search_term_<gram>`` cell (duplicates included) that also occur
      in the ``brand_<gram>`` cell
    * ``ratio_of_search_term_<gram>_in_brand_<gram>`` -- that count over
      ``count_of_brand_<gram>``, via ``feature_utils.try_divide``

    NOTE(review): an earlier function in this file has the same name; this
    later definition replaces it when the module is imported.  One of the
    two probably needs a different name.

    Args:
        df: pandas DataFrame holding the ``brand_<gram>`` and
            ``search_term_<gram>`` columns.

    Returns:
        None.  ``df`` is modified in place.
    """
    print("generate intersect word counting features")
    grams = ["unigram", "bigram", "trigram"]

    def count_shared(obs_items, target_items):
        # Build the lookup set once per row instead of once per obs item.
        lookup = set(target_items)
        return sum(1. for w in obs_items if w in lookup)

    for gram in grams:
        brand_col = "brand_" + gram
        brand_count_col = "count_of_brand_%s" % (gram)
        overlap_col = "count_of_search_term_%s_in_brand_%s" % (gram, gram)

        # word count
        print("Generating count_of_brand_{0} feature...".format(gram))
        df[brand_count_col] = df[brand_col].apply(len)

        # search term
        # The overlap is measured against the brand column.  The previous
        # code looked items up in title_<gram>, so the column stored the
        # title overlap under a brand name.
        print("Generating count_of_search_term_{0}_in_brand_{1} feature...".format(gram, gram))
        df[overlap_col] = [
            count_shared(query_items, brand_items)
            for query_items, brand_items in zip(df["search_term_" + gram], df[brand_col])
        ]
        # Message corrected to name the column that is actually written.
        print("Generating ratio_of_search_term_{0}_in_brand_{1} feature...".format(gram, gram))
        # NOTE(review): the denominator is the brand count, not the
        # search-term count -- kept as in the original; confirm intent.
        df["ratio_of_search_term_%s_in_brand_%s" % (gram, gram)] = df.apply(
            lambda row: feature_utils.try_divide(row[overlap_col], row[brand_count_col]), axis=1)