def prepare_bigram(path, out):
    """Write word- and character-level bigram columns for every row of a CSV.

    Each row of ``path`` is read with ``DictReader``.  The ``words_x``,
    ``words_y``, ``chars_x`` and ``chars_y`` fields are lower-cased, passed
    through ``remove_punctuation``, split on single spaces and handed to
    ``getBigram``.  The space-joined bigrams are written to ``out`` as one
    comma-separated line per input row, below a fixed header line.

    Progress is printed every 100000 rows and the elapsed time at the end.

    Args:
        path: Path of the input CSV; it must provide the four columns above.
        out: Path of the CSV file to write (opened in ``'w'`` mode).
    """
    fields = ('words_x', 'words_y', 'chars_x', 'chars_y')
    start = datetime.now()
    # The input is opened first so a missing input does not truncate ``out``,
    # and both handles are closed by the ``with`` statement.
    with open(path) as infile, open(out, 'w') as outfile:
        outfile.write('word1_bigram,word2_bigram,char1_bigram,char2_bigram\n')
        for count, row in enumerate(DictReader(infile, delimiter=',')):
            if count % 100000 == 0:
                print('finished', count)
            # Lower-casing happens once, before punctuation removal, for all
            # four fields alike.
            bigrams = [
                ' '.join(getBigram(
                    remove_punctuation(str(row[field]).lower()).split(' ')))
                for field in fields
            ]
            # NOTE(review): values are written unquoted; this presumably
            # relies on remove_punctuation stripping commas -- confirm.
            outfile.write('%s,%s,%s,%s\n' % tuple(bigrams))
        end = datetime.now()
        print('times:', end - start)
Пример #2
0
def extract_distance_features(df):
    """Add n-gram Jaccard/Dice distance features to ``df`` and dump them to CSV.

    Unigram, bigram and trigram columns are built for the query, title,
    description and attribute values.  Every pair of those four fields is then
    compared with ``compute_dist`` for both ``"jaccard_coef"`` and
    ``"dice_dist"``.  The intermediate n-gram columns are dropped from the
    frame that is written to ``../../data/feat/test_distFeat.csv``.

    Args:
        df: pandas DataFrame with ``query``, ``title``, ``description`` and
            ``values`` columns.  It is modified in place: the n-gram and
            distance columns are added to it.

    Returns:
        The frame produced by ``df.drop``, i.e. without the intermediate
        n-gram columns.
    """
    join_str = "_"
    grams = ["unigram", "bigram", "trigram"]
    feat_names = ["query", "title", "description", "attribute_values"]
    # (feature prefix, source column, extra positional args for preprocess_data)
    sources = [
        ("query", "query", ("stem",)),
        ("title", "title", ()),
        ("description", "description", ()),
        ("attribute_values", "values", ("stem",)),
    ]
    for name, column, extra in sources:
        # Each lambda is consumed by df.apply right away, so closing over the
        # loop variables is safe here.
        df[name + "_unigram"] = list(df.apply(lambda x: preprocess_data(x[column], *extra), axis=1))
        # join_str is passed for every field, attribute values included, so
        # the n-grams compared below are all built with the same separator.
        df[name + "_bigram"] = list(df.apply(lambda x: ngram.getBigram(x[name + "_unigram"], join_str), axis=1))
        df[name + "_trigram"] = list(df.apply(lambda x: ngram.getTrigram(x[name + "_unigram"], join_str), axis=1))

    # calculate distance
    # Single-argument print(...) calls emit the same text on Python 2 and 3.
    print("generate jaccard coef and dice dist for n-gram")
    dists = ["jaccard_coef", "dice_dist"]
    for dist in dists:
        print("Generating  %s" % dist)
        for gram in grams:
            # every unordered pair (i < j) of fields
            for i in range(len(feat_names) - 1):
                for j in range(i + 1, len(feat_names)):
                    target_name = feat_names[i]
                    obs_name = feat_names[j]
                    df["%s_of_%s_between_%s_%s" % (dist, gram, target_name, obs_name)] = \
                            list(df.apply(lambda x: compute_dist(x[target_name + "_" + gram], x[obs_name + "_" + gram], dist), axis=1))

    print("Dropping columns")
    ngram_columns = ["%s_%s" % (name, gram) for name in feat_names for gram in grams]
    df = df.drop(ngram_columns, axis=1)
    print("Creating csv")
    df.to_csv("../../data/feat/test_distFeat.csv", header=True, index=False)
    return df
def extract_feat(df):
    """Add unigram, bigram and co-occurrence term columns to ``df`` in place.

    Unigrams come from ``preprocess_data`` applied to ``query``,
    ``product_title`` and ``product_description``; bigrams from
    ``ngram.getBigram`` with ``"_"`` as the join string.  Co-occurrence
    columns pair the query n-grams (and a ``"qid<id>"`` token built from the
    ``qid`` column) with the title/description n-grams through
    ``cooccurrence_terms`` using ``"X"`` as the join string.

    Trigram features are not generated.  Nothing is returned.
    """
    text_fields = (
        ("query", "query"),
        ("title", "product_title"),
        ("description", "product_description"),
    )
    gram_kinds = ("unigram", "bigram")
    doc_fields = ("title", "description")

    ## unigram
    print("generate unigram")
    for prefix, column in text_fields:
        df[prefix + "_unigram"] = list(
            df.apply(lambda row: preprocess_data(row[column]), axis=1))

    ## bigram
    print("generate bigram")
    sep = "_"
    for prefix, _ in text_fields:
        df[prefix + "_bigram"] = list(
            df.apply(lambda row: ngram.getBigram(row[prefix + "_unigram"], sep), axis=1))

    ## cooccurrence terms
    sep = "X"
    # query unigram first, then query bigram
    for query_gram in gram_kinds:
        for field in doc_fields:
            for field_gram in gram_kinds:
                left = "query_" + query_gram
                right = field + "_" + field_gram
                df[left + "_" + right] = list(
                    df.apply(lambda row: cooccurrence_terms(row[left], row[right], sep), axis=1))
    # query id
    for field in doc_fields:
        for field_gram in gram_kinds:
            right = field + "_" + field_gram
            df["query_id_" + right] = list(
                df.apply(lambda row: cooccurrence_terms(["qid" + str(row["qid"])], row[right], sep), axis=1))
def extract_basic_distance_feat(df):
    """Add n-gram columns and pairwise Jaccard/Dice features to ``df`` in place.

    Unigrams (via ``preprocess_data``), bigrams and trigrams (via ``ngram``,
    joined with ``"_"``) are built for the query, product title and product
    description.  For every unordered pair of those fields and every n-gram
    order, ``compute_dist`` is evaluated for ``"jaccard_coef"`` and
    ``"dice_dist"`` and stored as
    ``<dist>_of_<gram>_between_<target>_<obs>``.  Nothing is returned.
    """
    fields = [
        ("query", "query"),
        ("title", "product_title"),
        ("description", "product_description"),
    ]
    feat_names = [name for name, _ in fields]

    ## unigram
    print("generate unigram")
    for name, column in fields:
        df[name + "_unigram"] = list(
            df.apply(lambda row: preprocess_data(row[column]), axis=1))
    ## bigram
    print( "generate bigram")
    sep = "_"
    for name in feat_names:
        df[name + "_bigram"] = list(
            df.apply(lambda row: ngram.getBigram(row[name + "_unigram"], sep), axis=1))
    ## trigram
    print( "generate trigram")
    for name in feat_names:
        df[name + "_trigram"] = list(
            df.apply(lambda row: ngram.getTrigram(row[name + "_unigram"], sep), axis=1))

    ## jaccard coef/dice dist of n-gram
    print( "generate jaccard coef and dice dist for n-gram")
    for dist in ("jaccard_coef", "dice_dist"):
        for gram in ("unigram", "bigram", "trigram"):
            # each field is paired with every field listed after it
            for idx, target_name in enumerate(feat_names[:-1]):
                for obs_name in feat_names[idx + 1:]:
                    target_col = target_name + "_" + gram
                    obs_col = obs_name + "_" + gram
                    feature = "%s_of_%s_between_%s_%s" % (dist, gram, target_name, obs_name)
                    df[feature] = list(
                        df.apply(lambda row: compute_dist(row[target_col], row[obs_col], dist), axis=1))
def extract_feat(df):
    """Generate unigram, bigram and co-occurrence term columns on ``df``.

    The frame is modified in place and nothing is returned.  Unigrams are
    produced by ``preprocess_data``, bigrams by ``ngram.getBigram`` joined
    with ``"_"``, and co-occurrence terms by ``cooccurrence_terms`` joined
    with ``"X"``.  Trigram features are not generated.
    """
    def rowwise(func):
        # Evaluate ``func`` on every row and collect the results in a list.
        return list(df.apply(func, axis=1))

    def cooccur(left_col, right_col, joiner):
        return rowwise(lambda r: cooccurrence_terms(r[left_col], r[right_col], joiner))

    def cooccur_qid(right_col, joiner):
        return rowwise(lambda r: cooccurrence_terms(["qid" + str(r["qid"])], r[right_col], joiner))

    ## unigram
    print("generate unigram")
    df["query_unigram"] = rowwise(lambda r: preprocess_data(r["query"]))
    df["title_unigram"] = rowwise(lambda r: preprocess_data(r["product_title"]))
    df["description_unigram"] = rowwise(lambda r: preprocess_data(r["product_description"]))

    ## bigram
    print("generate bigram")
    bigram_join = "_"
    df["query_bigram"] = rowwise(lambda r: ngram.getBigram(r["query_unigram"], bigram_join))
    df["title_bigram"] = rowwise(lambda r: ngram.getBigram(r["title_unigram"], bigram_join))
    df["description_bigram"] = rowwise(lambda r: ngram.getBigram(r["description_unigram"], bigram_join))

    ## cooccurrence terms
    term_join = "X"
    targets = ["title_unigram", "title_bigram", "description_unigram", "description_bigram"]
    # query unigram, then query bigram
    for query_col in ["query_unigram", "query_bigram"]:
        for target_col in targets:
            df[query_col + "_" + target_col] = cooccur(query_col, target_col, term_join)
    # query id
    for target_col in targets:
        df["query_id_" + target_col] = cooccur_qid(target_col, term_join)
Пример #6
0
def extract_basic_distance_feat(df):
    """Build n-gram columns and pairwise distance features on ``df`` in place.

    For the query, product title and product description this adds
    ``*_unigram`` (``preprocess_data``), ``*_bigram`` and ``*_trigram``
    (``ngram`` helpers, joined with ``"_"``) columns.  It then adds one
    ``<dist>_of_<gram>_between_<a>_<b>`` column per distance
    (``jaccard_coef``, ``dice_dist``), n-gram order and unordered field pair,
    computed with ``compute_dist``.  Nothing is returned.
    """
    def rowwise(func):
        # Apply ``func`` to each row and materialise the results.
        return list(df.apply(func, axis=1))

    source_of = [
        ("query", "query"),
        ("title", "product_title"),
        ("description", "product_description"),
    ]

    ## unigram
    print("generate unigram")
    for name, column in source_of:
        df[name + "_unigram"] = rowwise(lambda r: preprocess_data(r[column]))

    ## bigram, then trigram
    joiner = "_"
    builders = [
        ("bigram", lambda tokens: ngram.getBigram(tokens, joiner)),
        ("trigram", lambda tokens: ngram.getTrigram(tokens, joiner)),
    ]
    for gram, build in builders:
        print("generate " + gram)
        for name, _ in source_of:
            df[name + "_" + gram] = rowwise(lambda r: build(r[name + "_unigram"]))

    ## jaccard coef/dice dist of n-gram
    print("generate jaccard coef and dice dist for n-gram")
    names = [name for name, _ in source_of]
    pairs = [(names[i], names[j])
             for i in range(len(names) - 1)
             for j in range(i + 1, len(names))]
    for dist in ["jaccard_coef", "dice_dist"]:
        for gram in ["unigram", "bigram", "trigram"]:
            for target_name, obs_name in pairs:
                column = "%s_of_%s_between_%s_%s" % (dist, gram, target_name, obs_name)
                df[column] = rowwise(
                    lambda r: compute_dist(r[target_name + "_" + gram], r[obs_name + "_" + gram], dist))
Пример #7
0
def prepare_unigram(path,out):
    """Write space-joined bigrams of the ``sen1``/``sen2`` columns to a CSV.

    Despite its name, this function produces bigrams.  Each value is
    converted with ``str``, split on whitespace and passed to ``getBigram``;
    the resulting items are joined with single spaces.

    Args:
        path: CSV file readable by ``pd.read_csv`` with ``sen1`` and ``sen2``
            columns.
        out: Destination CSV path; it receives the columns ``sen1_bigram``
            and ``sen2_bigram`` and no index column.
    """
    data_input = pd.read_csv(path)

    def _bigram_text(value):
        # str() keeps non-string cells (e.g. NaN) from breaking .split()
        return ' '.join(getBigram(str(value).split()))

    # Build all rows first: growing a DataFrame one ``.loc`` row at a time
    # reallocates on every insert and is quadratic in the number of rows.
    rows = [
        [_bigram_text(s1), _bigram_text(s2)]
        for s1, s2 in zip(data_input['sen1'], data_input['sen2'])
    ]
    data_ouput = DataFrame(rows, columns=['sen1_bigram','sen2_bigram'])
    data_ouput.to_csv(out,index=False)
def generate_ngrams(df):
    """Add unigram, bigram and trigram columns for title and search term.

    ``df`` is modified in place and nothing is returned.  Unigrams are the
    result of ``preprocess_data`` on ``product_title`` / ``search_term``;
    bigrams and trigrams are built from those unigrams by ``ngram`` with
    ``"_"`` as the join string.
    """
    def per_row(func):
        # Run ``func`` on every row and return the results as a list.
        return list(df.apply(func, axis=1))

    # unigram
    print("generate unigram")
    for target, source in (("title", "product_title"), ("search_term", "search_term")):
        df[target + "_unigram"] = per_row(lambda row: preprocess_data(row[source]))

    # bigram
    print("generate bigram")
    sep = "_"
    for target in ("title", "search_term"):
        df[target + "_bigram"] = per_row(
            lambda row: ngram.getBigram(row[target + "_unigram"], sep))

    # trigram
    print("generate trigram")
    for target in ("search_term", "title"):
        df[target + "_trigram"] = per_row(
            lambda row: ngram.getTrigram(row[target + "_unigram"], sep))
Пример #9
0
def gen_ngram_data(df):
    """Add unigram, bigram and trigram columns for both questions.

    Unigrams are produced by ``preprocess_data`` from ``question1`` and
    ``question2``.  Bigrams and trigrams are both derived from the unigram
    columns with ``"_"`` as the join string.

    Args:
        df: pandas DataFrame with ``question1`` and ``question2`` columns;
            modified in place.

    Returns:
        The same ``df`` object, with the ``q1_*`` / ``q2_*`` n-gram columns
        added.
    """
    ## unigram
    print("generate unigram")
    df["q1_unigram"] = list(df.apply(lambda x: preprocess_data(x["question1"]), axis=1))
    df["q2_unigram"] = list(df.apply(lambda x: preprocess_data(x["question2"]), axis=1))
    ## bigram
    print("generate bigram")
    join_str = "_"
    df["q1_bigram"] = list(df.apply(lambda x: ngram.getBigram(x["q1_unigram"], join_str), axis=1))
    df["q2_bigram"] = list(df.apply(lambda x: ngram.getBigram(x["q2_unigram"], join_str), axis=1))
    ## trigram
    print("generate trigram")
    # Trigrams are built from the unigram tokens; feeding the bigram column in
    # (as was done before) would make "trigrams" out of already-joined bigrams.
    df["q1_trigram"] = list(df.apply(lambda x: ngram.getTrigram(x["q1_unigram"], join_str), axis=1))
    df["q2_trigram"] = list(df.apply(lambda x: ngram.getTrigram(x["q2_unigram"], join_str), axis=1))
    return df
def generate_product_ngrams(df):
    """Add ``description_unigram``/``_bigram``/``_trigram`` columns to ``df``.

    Unigrams come from ``preprocess_data`` on ``product_description``; the
    bigrams and trigrams are built from them by ``ngram`` with ``"_"`` as the
    join string.  The frame is modified in place; nothing is returned.
    """
    sep = "_"

    print("Generate unigram")
    unigrams = df.apply(lambda row: preprocess_data(row["product_description"]), axis=1)
    df["description_unigram"] = list(unigrams)

    print("Generate bigram")
    bigrams = df.apply(lambda row: ngram.getBigram(row["description_unigram"], sep), axis=1)
    df["description_bigram"] = list(bigrams)

    print("Generate trigram")
    trigrams = df.apply(lambda row: ngram.getTrigram(row["description_unigram"], sep), axis=1)
    df["description_trigram"] = list(trigrams)
def generate_brand_ngrams(df):
    """Add ``brand_unigram``, ``brand_bigram`` and ``brand_trigram`` to ``df``.

    The unigram column is ``preprocess_data`` applied to ``brand``; the other
    two are derived from it by ``ngram`` with ``"_"`` as the join string.
    The frame is modified in place; nothing is returned.
    """
    def per_row(func):
        # Evaluate ``func`` for each row and return a plain list.
        return list(df.apply(func, axis=1))

    sep = "_"
    print("Generate brand unigram")
    df["brand_unigram"] = per_row(lambda row: preprocess_data(row["brand"]))
    print("Generate brand bigram")
    df["brand_bigram"] = per_row(lambda row: ngram.getBigram(row["brand_unigram"], sep))
    print("Generate brand trigram")
    df["brand_trigram"] = per_row(lambda row: ngram.getTrigram(row["brand_unigram"], sep))
Пример #12
0
def generateNGram(df):
    """Add unigram, bigram and trigram columns for query, title, description.

    Each value of ``query``, ``product_title`` and ``product_description`` is
    passed directly to ``ngram.getUnigram``, ``ngram.getBigram`` and
    ``ngram.getTrigram`` (the latter two with ``'_'`` as second argument).

    Returns:
        The same ``df`` object with the nine new columns added.
    """
    sources = (
        ('query', 'query'),
        ('title', 'product_title'),
        ('description', 'product_description'),
    )
    extractors = (
        ('unigram', lambda text: ngram.getUnigram(text)),
        ('bigram', lambda text: ngram.getBigram(text, '_')),
        ('trigram', lambda text: ngram.getTrigram(text, '_')),
    )
    # All unigram columns first, then bigrams, then trigrams.
    for gram, extract in extractors:
        for prefix, column in sources:
            df['%s_%s' % (prefix, gram)] = df[column].apply(extract)
    return df
Пример #13
0
def extract_feat(df):
    """Add unigram, bigram and co-occurrence term columns for two questions.

    ``df`` is modified in place and nothing is returned.  Unigrams are built
    by ``preprocess_data`` from ``question1`` / ``question2``, bigrams by
    ``ngram.getBigram`` joined with ``"_"``.  Each question1 n-gram column is
    then combined with each question2 n-gram column through
    ``cooccurrence_terms`` using ``"X"`` as the join string.  Trigram
    features are not generated.
    """
    questions = ("question1", "question2")
    gram_kinds = ("unigram", "bigram")

    def rowwise(func):
        # Evaluate ``func`` on every row and collect the results in a list.
        return list(df.apply(func, axis=1))

    ## unigram
    print("generate unigram")
    for question in questions:
        df[question + "_unigram"] = rowwise(
            lambda row: preprocess_data(row[question]))

    ## bigram
    print("generate bigram")
    sep = "_"
    for question in questions:
        df[question + "_bigram"] = rowwise(
            lambda row: ngram.getBigram(row[question + "_unigram"], sep))

    ## cooccurrence terms
    sep = "X"
    # question1 unigram first, then question1 bigram
    for left_gram in gram_kinds:
        for right_gram in gram_kinds:
            left_col = "question1_" + left_gram
            right_col = "question2_" + right_gram
            df[left_col + "_" + right_col] = rowwise(
                lambda row: cooccurrence_terms(row[left_col], row[right_col], sep))
Пример #14
0
def extract_basic_distance_feat(df):
    """Add n-gram columns and Jaccard/Dice features for a question pair.

    For ``question1`` and ``question2`` this adds ``*_unigram`` (from
    ``preprocess_data``), ``*_bigram`` and ``*_trigram`` (from ``ngram``,
    joined with ``"_"``) columns.  It then adds one
    ``<dist>_of_<gram>_between_question1_question2`` column per distance
    (``jaccard_coef``, ``dice_dist``) and n-gram order, computed with
    ``compute_dist``.

    Args:
        df: pandas DataFrame with ``question1`` and ``question2`` columns;
            modified in place.  Nothing is returned.
    """
    # Single-argument print(...) calls emit the same text on Python 2 and 3.
    print("generate ngrams")
    join_str = "_"
    feat_names = ["question1", "question2"]

    for feat_name in feat_names:
        print("generate ngrams for %s" % feat_name)
        df.loc[:, feat_name + "_unigram"] = list(map(preprocess_data,
                                                     df[feat_name]))
        df.loc[:, feat_name + "_bigram"] = [
            ngram.getBigram(x, join_str) for x in df[feat_name + "_unigram"]
        ]
        df.loc[:, feat_name + "_trigram"] = [
            ngram.getTrigram(x, join_str) for x in df[feat_name + "_unigram"]
        ]

    ## jaccard coef/dice dist of n-gram
    print("generate jaccard coef and dice dist for n-gram")
    dists = ["jaccard_coef", "dice_dist"]
    grams = ["unigram", "bigram", "trigram"]
    for dist in dists:
        for gram in grams:
            for i in range(len(feat_names) - 1):
                for j in range(i + 1, len(feat_names)):
                    target_name = feat_names[i]
                    obs_name = feat_names[j]
                    # list(...) materialises the result: on Python 3 ``map``
                    # returns a lazy iterator, which is not a valid column.
                    df["%s_of_%s_between_%s_%s" % (dist, gram, target_name, obs_name)] = \
                            list(map(partial(compute_dist, dist=dist),
                                     df[target_name + "_" + gram],
                                     df[obs_name + "_" + gram]))
Пример #15
0
def extract_feat(df):
    """Add unigram, bigram and co-occurrence term columns for two questions.

    Unigrams are built by ``preprocess_data`` from ``question1`` and
    ``question2``, bigrams by ``ngram.getBigram`` joined with ``"_"``.  Each
    question1 n-gram column is combined with each question2 n-gram column by
    ``cooccurrence_terms``, called with its two list arguments only.
    Trigram features are not generated.

    Args:
        df: pandas DataFrame with ``question1`` and ``question2`` columns;
            modified in place.  Nothing is returned.
    """
    # Single-argument print(...) calls emit the same text on Python 2 and 3.
    print("generate ngrams")
    join_str = "_"

    for question in ("question1", "question2"):
        print("generate ngrams for %s" % question)
        df.loc[:, question + "_unigram"] = list(map(preprocess_data,
                                                    df[question]))
        df.loc[:, question + "_bigram"] = [
            ngram.getBigram(x, join_str) for x in df[question + "_unigram"]
        ]

    ## cooccurrence terms
    print("generate coocurance terms")
    # question1 unigram first, then question1 bigram
    for q1_gram in ("unigram", "bigram"):
        for q2_gram in ("unigram", "bigram"):
            q1_col = "question1_" + q1_gram
            q2_col = "question2_" + q2_gram
            # list(...) materialises the result: on Python 3 ``map`` returns a
            # lazy iterator, which is not a valid column value.
            df[q1_col + "_" + q2_col] = list(map(cooccurrence_terms,
                                                 df[q1_col],
                                                 df[q2_col]))
Пример #16
0
 def __iter__(self):
     """Yield one n-gram list per row of the tab-separated file.

     Every row of ``self.tsvFile`` is read with ``DictReader``.  The words
     that ``getWords`` extracts from its ``description`` field are replaced
     by their ``self.wordIndex`` entries (as strings).  The result is passed
     to ``getUnigram`` when ``self.ngram`` is 1 or to ``getBigram`` (joined
     with ``"_"``) when it is 2; any other value yields nothing for the row.

     ``self.counter`` is incremented per row and progress is printed every
     100000 rows.
     """
     # The csv module needs a text-mode file on Python 3 (binary mode raises
     # "iterator should return strings, not bytes"), so decoding is done by
     # open() instead of per value.  newline="" is what the csv docs require.
     with open(self.tsvFile, "r", encoding="utf-8", newline="") as tsv:
         for item in DictReader(tsv, delimiter='\t', quotechar='"'):
             self.counter += 1
             # Drop fields that are missing from short rows (value None).
             item = {featureName: featureValue
                     for featureName, featureValue in item.items()
                     if featureValue is not None}
             description = [str(self.wordIndex[w]) for w in getWords(item["description"])]
             if self.ngram == 1:
                 yield getUnigram(description)
             elif self.ngram == 2:
                 yield getBigram(description, "_")
             if self.counter % 100000 == 0:
                 print("     Process %s" % self.counter)
 def __iter__(self):
     """Yield one n-gram list per row of the tab-separated file.

     Every row of ``self.tsvFile`` is read with ``DictReader`` and its values
     are decoded from UTF-8.  The words that ``getWords`` extracts from the
     ``description`` field are replaced by their ``self.wordIndex`` entries
     (as strings).  The result is passed to ``getUnigram`` when
     ``self.ngram`` is 1 or to ``getBigram`` (joined with ``"_"``) when it is
     2; any other value yields nothing for the row.

     ``self.counter`` is incremented per row and progress is printed every
     100000 rows.

     NOTE(review): ``iteritems`` and the byte-wise ``decode`` make this a
     Python 2 code path -- confirm before running it on Python 3.
     """
     # The ``with`` block closes the file once iteration finishes or the
     # generator is closed; the handle used to be left open.
     with open(self.tsvFile, "rb") as tsv:
         for item in DictReader(tsv, delimiter='\t', quotechar='"'):
             self.counter += 1
             # Drop fields that are missing from short rows (value None).
             item = {featureName: featureValue.decode('utf-8')
                     for featureName, featureValue in item.iteritems()
                     if featureValue is not None}
             description = [str(self.wordIndex[w]) for w in getWords(item["description"])]
             if self.ngram == 1:
                 yield getUnigram(description)
             elif self.ngram == 2:
                 yield getBigram(description, "_")
             if self.counter % 100000 == 0:
                 print("     Process %s" % self.counter)
def prepare_bigram(path, out):
    """Write bigram columns for the two stemmed questions of every CSV row.

    Each row of ``path`` is read with ``DictReader``.  The
    ``question1_porter`` and ``question2_porter`` fields are lower-cased,
    passed through ``remove_punctuation``, split on single spaces and handed
    to ``getBigram``.  The space-joined bigrams are written to ``out`` as
    ``question1_bigram,question2_bigram`` lines below a header.

    The input path, progress (every 100000 rows) and the elapsed time are
    printed.

    Args:
        path: Path of the input CSV with the two ``*_porter`` columns.
        out: Path of the CSV file to write (opened in ``'w'`` mode).
    """
    # Single-argument print(...) calls emit the same text on Python 2 and 3.
    print(path)
    start = datetime.now()
    # The input is opened first so a missing input does not truncate ``out``,
    # and both handles are closed by the ``with`` statement.
    with open(path) as infile, open(out, 'w') as outfile:
        outfile.write('question1_bigram,question2_bigram\n')
        for count, row in enumerate(DictReader(infile, delimiter=',')):
            if count % 100000 == 0:
                print('finished %s' % count)
            q1 = remove_punctuation(str(
                row['question1_porter']).lower()).split(' ')
            q2 = remove_punctuation(str(
                row['question2_porter']).lower()).split(' ')
            q1_bigram = ' '.join(getBigram(q1))
            q2_bigram = ' '.join(getBigram(q2))
            # NOTE(review): values are written unquoted; this presumably
            # relies on remove_punctuation stripping commas -- confirm.
            outfile.write('%s,%s\n' % (q1_bigram, q2_bigram))
        end = datetime.now()

    print('times: %s' % (end - start))
Пример #19
0
def str_common_word_ngram(str1, str2, n):
    """Count the word n-grams of ``str1`` that occur as substrings of ``str2``.

    ``str1`` is split on whitespace.  For ``n == 1`` each word is looked up
    in ``str2``; for ``n`` in 2..4 the words are combined by the matching
    ``ngram`` helper with a single space as join string and each n-gram is
    looked up.  Matching uses ``str.find``, so it is a plain substring test
    and every occurrence in ``str1`` is counted (duplicates included).

    Args:
        str1: Text whose n-grams are searched for.
        str2: Text that is searched.
        n: N-gram order, 1 to 4.

    Returns:
        The number of n-grams of ``str1`` found in ``str2``; 0 (after
        printing a message) when ``n`` is not in 1..4.
    """
    # what happens if length of word is less than size of gram? should return 0
    if n == 1:
        # Search for each word, not for the whole of str1 on every iteration.
        return sum(int(str2.find(word) >= 0) for word in str1.split())
    elif n == 2:
        word_ngrams = ngram.getBigram(str1.split(), " ")
    elif n == 3:
        word_ngrams = ngram.getTrigram(str1.split(), " ")
    elif n == 4:
        word_ngrams = ngram.getFourgram(str1.split(), " ")
    else:
        print("Incorrect n value entered:", n)
        return 0
    return sum(int(str2.find(word_ngram) >= 0) for word_ngram in word_ngrams)
Пример #20
0
 def _get_ngram(self, sr):
     """
     Compute ngram of the text of a pd.Series. The unigram operation is combining stemming \
     words and excluding stopwords. The bigram and trigram operations are based on the results \
     of the unigram operation.
     
     Args:
         sr(pd.Series):
         
     Returns:
         sr_unigram(pd.Series), sr_bigram(pd.Series), sr_trigram(pd.Series)
     """
     # Unigram.
     unigram_func = lambda s: list(self._stem_excl_words(s))
     sr_unigram = sr.map(unigram_func)     
     # Bigram.
     bigram_func = lambda s: ngram.getBigram(s, '_')
     sr_bigram = sr_unigram.map(bigram_func) 
     # Trigram.
     trigram_func = lambda s: ngram.getTrigram(s, '_')
     sr_trigram = sr_unigram.map(trigram_func) 
     return sr_unigram, sr_bigram, sr_trigram
def test():
    """Print 1-, 2- and 3-grams for the first records of each data pickle.

    For every combination of column (``query``, ``title``, ``description``)
    and category (``train``, ``test``) the file
    ``./ModelSystem/ProcessedData/<column>_<category>.pickle`` is loaded.
    For at most its first two records, the text is stripped of everything
    except ASCII letters, digits and ``.``, split on whitespace, and the word
    list plus its ``ngram`` bigrams and trigrams (joined with ``"_"``) are
    printed.
    """
    ###############
    ## Load Data ##
    ###############
    dataPath = "./ModelSystem/ProcessedData"
    columnNames = ["query", "title", "description"]
    catagories = ["train", "test"]

    for cata in catagories:
        for col in columnNames:
            path = "%s/%s_%s.pickle" % (dataPath, col, cata)
            # NOTE: pickle.load can execute arbitrary code from the file; only
            # use this on trusted, locally produced data.
            with open(path, "rb") as f:
                texts = pickle.load(f)

            # Start the 1-, 2- and 3-gram generation.  Only the first two
            # records are inspected; fewer if the file holds fewer.
            for i in range(min(2, len(texts))):
                # Remove punctuation.
                text = re.sub("[^0-9a-zA-Z.]", " ", texts[i])
                wordList = text.split()

                unigram = wordList
                bigram = ngram.getBigram(wordList, "_")
                trigram = ngram.getTrigram(wordList, "_")

                print(unigram)
                print(bigram)
                print(trigram)

    print("ngram All Done.")
Пример #22
0
def extract_feat(df):
    """Add counting, overlap and position features for question n-grams.

    New columns are written to ``df`` in place; nothing is returned.  With
    ``<q>`` one of ``question1`` / ``question2`` and ``<gram>`` one of
    ``unigram`` / ``bigram`` / ``trigram``:

    * ``<q>_<gram>``: tokens from ``preprocess_data`` and the n-grams that
      ``ngram`` builds from them (joined with ``"_"``).
    * ``count_of_<q>_<gram>``, ``count_of_unique_<q>_<gram>`` and
      ``ratio_of_unique_<q>_<gram>``.
    * ``count_of_digit_in_<q>`` and ``ratio_of_digit_in_<q>``.
    * ``count_of_<obs>_<gram>_in_<target>`` and ``ratio_of_<obs>_<gram>_in_<target>``
      for the two ordered question pairs.
    * ``question2_<gram>_in_question1_div_question1_<gram>`` and
      ``question2_<gram>_in_question1_div_question1_<gram>_in_question2``.
    * ``pos_of_<obs>_<gram>_in_<target>_<stat>`` and the matching
      ``normalized_pos_of_...`` columns for min, mean, median, max and std of
      the lists returned by ``get_position_list``.

    All ratios are computed element-wise with ``try_divide``.

    Args:
        df: pandas DataFrame with ``question1`` and ``question2`` columns.
    """
    feat_names = ["question1", "question2"]
    grams = ["unigram", "bigram", "trigram"]
    join_str = "_"

    def divide_columns(numerator, denominator):
        # list(...) materialises the result: on Python 3 ``map`` returns a
        # lazy iterator, which is not a valid column value.
        return list(map(try_divide, numerator, denominator))

    # Single-argument print(...) calls emit the same text on Python 2 and 3.
    print("generate ngrams")
    for feat_name in feat_names:
        print("generate ngrams for %s" % feat_name)
        df.loc[:, feat_name + "_unigram"] = list(map(preprocess_data,
                                                     df[feat_name]))
        df.loc[:, feat_name + "_bigram"] = [
            ngram.getBigram(x, join_str) for x in df[feat_name + "_unigram"]
        ]
        df.loc[:, feat_name + "_trigram"] = [
            ngram.getTrigram(x, join_str) for x in df[feat_name + "_unigram"]
        ]

    ################################
    ## word count and digit count ##
    ################################
    print("generate word counting features")

    def count_digit(tokens):
        # float count of tokens consisting only of digits
        return sum([1. for w in tokens if w.isdigit()])

    for feat_name in feat_names:
        for gram in grams:
            gram_col = feat_name + "_" + gram
            count_col = "count_of_%s_%s" % (feat_name, gram)
            unique_col = "count_of_unique_%s_%s" % (feat_name, gram)
            ## word count
            df[count_col] = [len(x) for x in df[gram_col]]
            df[unique_col] = [len(set(x)) for x in df[gram_col]]
            df["ratio_of_unique_%s_%s" % (feat_name, gram)] = divide_columns(
                df[unique_col], df[count_col])

        ## digit count
        digit_col = "count_of_digit_in_%s" % feat_name
        df[digit_col] = list(map(count_digit, df[feat_name + "_unigram"]))
        df["ratio_of_digit_in_%s" % feat_name] = divide_columns(
            df[digit_col], df["count_of_%s_unigram" % feat_name])

    ##############################
    ## intersect word count ##
    ##############################
    print("generate intersect word counting features")

    def word_count_intersect_questions(obs, target):
        # Number of entries of ``obs`` that also occur in ``target``
        # (duplicates in ``obs`` are counted; an empty ``obs`` gives 0).
        return len([w for w in obs if w in target])

    for gram in grams:
        for obs_name in feat_names:
            for target_name in feat_names:
                if target_name == obs_name:
                    continue
                intersect_col = "count_of_%s_%s_in_%s" % (obs_name, gram, target_name)
                df[intersect_col] = list(
                    map(word_count_intersect_questions,
                        df[obs_name + "_" + gram],
                        df[target_name + "_" + gram]))
                df["ratio_of_%s_%s_in_%s" % (obs_name, gram, target_name)] = divide_columns(
                    df[intersect_col], df["count_of_%s_%s" % (obs_name, gram)])

        ## some other feat
        q2_in_q1 = df["count_of_question2_%s_in_question1" % gram]
        df["question2_%s_in_question1_div_question1_%s" % (gram, gram)] = divide_columns(
            q2_in_q1, df["count_of_question1_%s" % gram])
        df["question2_%s_in_question1_div_question1_%s_in_question2" % (gram, gram)] = divide_columns(
            q2_in_q1, df["count_of_question1_%s_in_question2" % gram])

    ######################################
    ## intersect word position feat ##
    ######################################
    print("generate intersect word position features")
    pos_stats = [
        ("min", np.min),
        ("mean", np.mean),
        ("median", np.median),
        ("max", np.max),
        ("std", np.std),
    ]
    for gram in grams:
        for target_name in feat_names:
            for obs_name in feat_names:
                if target_name == obs_name:
                    continue
                pos = list(
                    map(get_position_list, df[obs_name + "_" + gram],
                        df[target_name + "_" + gram]))
                prefix = "pos_of_%s_%s_in_%s" % (obs_name, gram, target_name)
                ## stats feat on pos
                for stat, func in pos_stats:
                    df["%s_%s" % (prefix, stat)] = [func(p) for p in pos]
                ## stats feat on normalized_pos
                obs_count = df["count_of_%s_%s" % (obs_name, gram)]
                for stat, _ in pos_stats:
                    df["normalized_%s_%s" % (prefix, stat)] = divide_columns(
                        df["%s_%s" % (prefix, stat)], obs_count)
def process():
    """Prepare the Fake/Real n-gram table and run every feature generator.

    With the hard-coded ``read = False`` everything is rebuilt from scratch:

    1. Load ``data.csv``, drop rows whose ``Body`` is null and rename the
       columns to ``URLs``, ``Headline``, ``articleBody``, ``Stance``
       (assumes the file has exactly these four columns in this order --
       TODO confirm against the data file).
    2. Split 75/25 with ``random_state=0`` and write the training frame,
       the test inputs and the test labels to ``gdbt_training_input.csv``,
       ``gdbt_testing_input.csv`` and ``gdbt_testing_ouput.csv``.
    3. Encode ``Stance`` as the integer column ``target`` (Fake=0, Real=1)
       on the training rows and append the unlabeled test rows, whose
       ``target`` is therefore NaN.
    4. Add unigram, bigram and trigram columns for ``Headline`` and
       ``articleBody`` and pickle the frame to ``data.pkl``.

    With ``read = True`` the pickled frame is loaded instead.

    Finally every feature generator is given the frame via ``process`` and
    then asked to ``read('train')`` and ``read('test')``.

    Raises:
        KeyError: if ``Stance`` contains a label other than Fake/Real.
    """
    read = False
    if not read:
        dataset = pd.read_csv('data.csv')
        dataset = dataset[pd.notnull(dataset['Body'])]
        dataset.columns = ['URLs', 'Headline', 'articleBody', 'Stance']

        X_data = dataset.iloc[:, 1:3]  # Headline, articleBody
        Y_data = dataset.iloc[:, 3]    # Stance

        # sklearn.cross_validation was removed in scikit-learn 0.20; prefer
        # its replacement and only fall back for very old installations.
        try:
            from sklearn.model_selection import train_test_split
        except ImportError:
            from sklearn.cross_validation import train_test_split

        X_train, X_test, Y_train, Y_test = train_test_split(X_data,
                                                            Y_data,
                                                            test_size=0.25,
                                                            random_state=0)

        train = pd.concat([X_train, Y_train], axis=1)

        train.to_csv('gdbt_training_input.csv', index=False)
        X_test.to_csv('gdbt_testing_input.csv', index=False)
        pd.DataFrame(Y_test).to_csv('gdbt_testing_ouput.csv', index=False)

        targets = ['Fake', 'Real']
        targets_dict = dict(zip(targets, range(len(targets))))
        # A list (not a lazy ``map`` object) so the assignment also works on
        # Python 3; unknown labels still raise KeyError.
        train['target'] = [targets_dict[x] for x in train['Stance']]

        data = train

        # The test rows carry no 'Stance'/'target', so after concatenation
        # they are recognisable by target == NaN.
        test_flag = True

        if test_flag:
            data = pd.concat((train, X_test))  # target = NaN for test set
            print('data.shape:')
            print(data.shape)

            train = data[~data['target'].isnull()]
            print(train)
            print('train.shape:')
            print(train.shape)

            test = data[data['target'].isnull()]
            print(test)
            print('test.shape:')
            print(test.shape)

        print("generate unigram")
        data["Headline_unigram"] = data["Headline"].map(preprocess_data)
        print(data.head())
        data["articleBody_unigram"] = data["articleBody"].map(preprocess_data)

        join_str = "_"

        print("generate bigram")
        data["Headline_bigram"] = data["Headline_unigram"].map(
            lambda x: ngram.getBigram(x, join_str))
        data["articleBody_bigram"] = data["articleBody_unigram"].map(
            lambda x: ngram.getBigram(x, join_str))

        print("generate trigram")
        data["Headline_trigram"] = data["Headline_unigram"].map(
            lambda x: ngram.getTrigram(x, join_str))
        data["articleBody_trigram"] = data["articleBody_unigram"].map(
            lambda x: ngram.getTrigram(x, join_str))

        with open('data.pkl', 'wb') as outfile:
            # -1 selects the highest pickle protocol available.
            pickle.dump(data, outfile, -1)
            print('dataframe saved in data.pkl')

    else:
        # NOTE(review): unpickling executes arbitrary code; only load a
        # data.pkl that this function produced itself.
        with open('data.pkl', 'rb') as infile:
            data = pickle.load(infile)
            print('data loaded')
            print('data.shape:')
            print(data.shape)

    # define feature generators
    generators = [
        CountFeatureGenerator(),
        TfidfFeatureGenerator(),
        SvdFeatureGenerator(),
        Word2VecFeatureGenerator(),
        SentimentFeatureGenerator(),
    ]

    for g in generators:
        g.process(data)

    for g in generators:
        g.read('train')

    for g in generators:
        g.read('test')

    print('done')
def extract_feat(df):
    """Add n-gram, counting, intersection and position features to ``df``.

    All new columns are written into ``df`` in place; nothing is returned.
    The frame must provide the raw text columns ``query``,
    ``product_title`` and ``product_description``.

    Column families created, for every feature name in
    (query, title, description) and gram in (unigram, bigram, trigram):

    * ``<name>_<gram>``: token lists from ``preprocess_data`` /
      ``ngram.getBigram`` / ``ngram.getTrigram``.
    * ``count_of_*``, ``count_of_unique_*``, ``ratio_of_unique_*`` and the
      per-name digit counts/ratios.
    * ``count_of_<obs>_<gram>_in_<target>`` / ``ratio_of_...``: how many
      tokens of one field also occur in another, plus four derived ratios.
    * ``pos_of_...`` / ``normalized_pos_of_...``: min, mean, median, max and
      std of the positions reported by ``get_position_list``.

    Ratios go through ``try_divide`` (defined elsewhere; presumably a
    division that tolerates a zero denominator -- TODO confirm).

    The ``print`` calls and ``list(map(...))`` forms are written so the
    function runs unchanged on both Python 2 and Python 3.
    """
    join_str = "_"
    feat_names = ["query", "title", "description"]
    grams = ["unigram", "bigram", "trigram"]
    # Raw text column feeding each feature name.
    raw_columns = {
        "query": "query",
        "title": "product_title",
        "description": "product_description",
    }

    ## unigram
    print("generate unigram")
    for feat_name in feat_names:
        df[feat_name + "_unigram"] = [
            preprocess_data(text) for text in df[raw_columns[feat_name]]]
    ## bigram
    print("generate bigram")
    for feat_name in feat_names:
        df[feat_name + "_bigram"] = [
            ngram.getBigram(tokens, join_str)
            for tokens in df[feat_name + "_unigram"]]
    ## trigram
    print("generate trigram")
    for feat_name in feat_names:
        df[feat_name + "_trigram"] = [
            ngram.getTrigram(tokens, join_str)
            for tokens in df[feat_name + "_unigram"]]

    ################################
    ## word count and digit count ##
    ################################
    print("generate word counting features")

    def count_digit(tokens):
        # Float count of tokens made up of digits only.
        return sum([1. for w in tokens if w.isdigit()])

    for feat_name in feat_names:
        for gram in grams:
            tokens_col = df[feat_name + "_" + gram]
            ## word count
            df["count_of_%s_%s" % (feat_name, gram)] = [
                len(tokens) for tokens in tokens_col]
            df["count_of_unique_%s_%s" % (feat_name, gram)] = [
                len(set(tokens)) for tokens in tokens_col]
            df["ratio_of_unique_%s_%s" % (feat_name, gram)] = list(map(
                try_divide,
                df["count_of_unique_%s_%s" % (feat_name, gram)],
                df["count_of_%s_%s" % (feat_name, gram)]))

        ## digit count
        df["count_of_digit_in_%s" % feat_name] = [
            count_digit(tokens) for tokens in df[feat_name + "_unigram"]]
        df["ratio_of_digit_in_%s" % feat_name] = list(map(
            try_divide,
            df["count_of_digit_in_%s" % feat_name],
            df["count_of_%s_unigram" % feat_name]))

    ## description missing indicator
    # NOTE(review): the unigram column is also passed to len()/set() above,
    # so if preprocess_data returns a token list this comparison with ""
    # is never true -- confirm the intended check.
    df["description_missing"] = [
        int(tokens == "") for tokens in df["description_unigram"]]

    ##############################
    ## intersect word count ##
    ##############################
    print("generate intersect word counting features")

    def count_shared(obs_tokens, target_tokens):
        # Build the lookup set once per row instead of once per token.
        target_set = set(target_tokens)
        return sum([1. for w in obs_tokens if w in target_set])

    for gram in grams:
        for obs_name in feat_names:
            for target_name in feat_names:
                if target_name == obs_name:
                    continue
                df["count_of_%s_%s_in_%s" % (obs_name, gram, target_name)] = [
                    count_shared(obs_tokens, target_tokens)
                    for obs_tokens, target_tokens in zip(
                        df[obs_name + "_" + gram],
                        df[target_name + "_" + gram])]
                df["ratio_of_%s_%s_in_%s" % (obs_name, gram, target_name)] = list(map(
                    try_divide,
                    df["count_of_%s_%s_in_%s" % (obs_name, gram, target_name)],
                    df["count_of_%s_%s" % (obs_name, gram)]))

        ## some other feat
        df["title_%s_in_query_div_query_%s" % (gram, gram)] = list(map(
            try_divide,
            df["count_of_title_%s_in_query" % gram],
            df["count_of_query_%s" % gram]))
        df["title_%s_in_query_div_query_%s_in_title" % (gram, gram)] = list(map(
            try_divide,
            df["count_of_title_%s_in_query" % gram],
            df["count_of_query_%s_in_title" % gram]))
        df["description_%s_in_query_div_query_%s" % (gram, gram)] = list(map(
            try_divide,
            df["count_of_description_%s_in_query" % gram],
            df["count_of_query_%s" % gram]))
        df["description_%s_in_query_div_query_%s_in_description" % (gram, gram)] = list(map(
            try_divide,
            df["count_of_description_%s_in_query" % gram],
            df["count_of_query_%s_in_description" % gram]))

    ######################################
    ## intersect word position feat ##
    ######################################
    print("generate intersect word position features")
    # Order matters: it fixes the order in which columns are appended.
    pos_stats = [
        ("min", np.min),
        ("mean", np.mean),
        ("median", np.median),
        ("max", np.max),
        ("std", np.std),
    ]
    for gram in grams:
        for target_name in feat_names:
            for obs_name in feat_names:
                if target_name == obs_name:
                    continue
                pos = [
                    get_position_list(target_tokens, obs=obs_tokens)
                    for target_tokens, obs_tokens in zip(
                        df[target_name + "_" + gram],
                        df[obs_name + "_" + gram])]
                ## stats feat on pos
                for stat_name, stat_func in pos_stats:
                    df["pos_of_%s_%s_in_%s_%s" % (obs_name, gram, target_name, stat_name)] = [
                        stat_func(p) for p in pos]
                ## stats feat on normalized_pos
                for stat_name, _ in pos_stats:
                    df["normalized_pos_of_%s_%s_in_%s_%s" % (obs_name, gram, target_name, stat_name)] = list(map(
                        try_divide,
                        df["pos_of_%s_%s_in_%s_%s" % (obs_name, gram, target_name, stat_name)],
                        df["count_of_%s_%s" % (obs_name, gram)]))
# Example #25
# 0
def getVWFile(tsvFile, vwFile, dataIndex, dictionary, tfidf_model, train=True):
    """Generate the overall features in VW format.

    Reads ``tsvFile`` (tab-separated, ``"``-quoted, with a header row) and
    writes one feature line per item to ``vwFile``. Both files are handled
    as UTF-8 text, which is what ``csv.DictReader`` requires on Python 3.

    Args:
        tsvFile: Path of the input file. Columns read: ``itemid``,
            ``is_blocked`` (only when ``train`` is true), ``category``,
            ``subcategory``, ``title``, ``description``, ``attrs``
            (optional), ``price``, ``phones_cnt``, ``emails_cnt`` and
            ``urls_cnt``.
        vwFile: Path of the output file; it is overwritten.
        dataIndex: Mapping holding the lookup tables ``catIndex``,
            ``subCatIndex``, ``attrsKeyIndex``, ``attrsValIndex`` and
            ``wordIndex``.
        dictionary: Indexable container; entries 1 and 2 are handed to
            ``getTfidfFeat`` for the unigram and bigram features.
        tfidf_model: Indexable container used the same way as
            ``dictionary``.
        train: When true the label comes from ``is_blocked``; otherwise the
            label is fixed to 1.

    Raises:
        KeyError: If a required column is missing from a row. Raw values
            absent from a lookup table also raise if the tables are plain
            dicts (not visible here -- TODO confirm).
    """
    start = datetime.now()

    # extract all the index
    catIndex = dataIndex["catIndex"]
    subCatIndex = dataIndex["subCatIndex"]
    attrsKeyIndex = dataIndex["attrsKeyIndex"]
    attrsValIndex = dataIndex["attrsValIndex"]
    wordIndex = dataIndex["wordIndex"]

    # newline="\n" keeps the output byte-identical across platforms;
    # newline="" is what the csv module asks for on its input files.
    with open(vwFile, "w", encoding="utf-8", newline="\n") as vwWriter, \
            open(tsvFile, "r", encoding="utf-8", newline="") as tsvReader:
        itemReader = DictReader(tsvReader, delimiter='\t', quotechar='"')
        for i, item in enumerate(itemReader):
            # DictReader fills columns missing from a short row with None;
            # drop those so the "attrs" presence test below stays meaningful.
            item = {featureName: featureValue
                    for featureName, featureValue in item.items()
                    if featureValue is not None}

            # get header: label is mapped from {0, 1} to {-1, +1}
            itemid = int(item["itemid"])
            label = int(item["is_blocked"]) if train else 1
            header = "%s '%s " % (int(2*label - 1), itemid)

            # category
            categoryFeat = "|C %s " % catIndex[item["category"]]

            # subcategory
            subcategoryFeat = "|SC %s " % subCatIndex[item["subcategory"]]

            # title: word ids plus the first and last id ("0" when empty)
            title = [str(wordIndex[w]) for w in getWords(item["title"])]
            title_start = title[0] if title else "0"
            title_end = title[-1] if title else "0"
            titleFeat = "|T %s |bT %s |cT %s " % (" ".join(title), title_start, title_end)
            titleStatsFeat = "|t %s " % getTextStatsFeat(item["title"])

            # description: same layout as the title, plus tf-idf features
            description = [str(wordIndex[w]) for w in getWords(item["description"])]
            description_start = description[0] if description else "0"
            description_end = description[-1] if description else "0"
            descriptionFeat = "|D %s |fD %s |gD %s " % (" ".join(description), description_start, description_end)
            descriptionStatsFeat = "|d %s " % getTextStatsFeat(item["description"])
            tfidf_feat1 = getTfidfFeat(getUnigram(description), dictionary[1], tfidf_model[1])
            # 2-gram tf-idf seems to harm performance; it is safe to drop it here
            tfidf_feat2 = getTfidfFeat(getBigram(description, "_"), dictionary[2], tfidf_model[2])
            descriptionFeat += "|iD %s |jD %s " % (tfidf_feat1, tfidf_feat2)

            # attrs: one "|A<key> <value>" namespace per attribute, then a
            # "|a" namespace listing the keys alone
            countAttrs = 0
            if "attrs" in item:
                attrPairs = list(getAttrsDict(item["attrs"]).items())
                countAttrs = len(attrPairs)
                attrsFeat = "".join(
                    "|A%s %s " % (attrsKeyIndex[k], attrsValIndex[v])
                    for k, v in attrPairs)
                attrsFeat += "|a "
                attrsFeat += "".join(
                    "%s " % (attrsKeyIndex[k]) for k, _ in attrPairs)
            else:
                attrsFeat = "|NA 1 "
            attrsFeat += "|hAC %s " % countAttrs

            # price
            priceFeat = "|P %s " % item["price"]
            # phones_cnt
            phonesCntFeat = "|p %s " % item["phones_cnt"]
            # emails_cnt
            emailsCntFeat = "|e %s " % item["emails_cnt"]
            # urls_cnt
            urlsCntFeat = "|u %s " % item["urls_cnt"]

            # output: strip the trailing space of the last feature
            vwLine = header \
                   + categoryFeat \
                   + subcategoryFeat \
                   + titleFeat \
                   + titleStatsFeat \
                   + descriptionFeat \
                   + descriptionStatsFeat \
                   + attrsFeat \
                   + priceFeat \
                   + phonesCntFeat \
                   + emailsCntFeat \
                   + urlsCntFeat[:-1] + "\n"
            vwWriter.write(vwLine)

            # report progress
            if (i+1) % 10000 == 0:
                print("\n%s\t%s" % ((i+1), str(datetime.now() - start)))
                print("Sample output:\n%s" % vwLine)
# Example #26
# 0
def process():
    """Build the stance-detection n-gram table and run the feature generators.

    With the hard-coded ``read = False`` everything is rebuilt:

    1. Merge ``train_stances_processed.csv`` with
       ``train_bodies_processed.csv`` on ``Body ID`` and encode ``Stance``
       as the integer column ``target`` (agree=0, disagree=1, discuss=2,
       unrelated=3).
    2. Merge ``test_stances_unlabeled.csv`` with
       ``test_bodies_processed.csv`` the same way and append the result;
       test rows end up with ``target`` = NaN.
    3. Add unigram, bigram and trigram columns for ``Headline`` and
       ``articleBody`` and pickle the frame to ``data.pkl``.

    With ``read = True`` the pickled frame is loaded instead.

    Finally each feature generator is given the frame via ``process`` and
    asked to ``read('train')``.

    The body uses only syntax valid on both Python 2 and Python 3.

    Raises:
        KeyError: if ``Stance`` holds a label outside the four targets.
    """
    # cPickle exists on Python 2 only; fall back to pickle elsewhere.
    try:
        import cPickle as pickle_mod
    except ImportError:
        import pickle as pickle_mod

    read = False
    if not read:

        body_train = pd.read_csv("train_bodies_processed.csv",
                                 encoding='utf-8')
        stances_train = pd.read_csv("train_stances_processed.csv",
                                    encoding='utf-8')
        # training set
        train = pd.merge(stances_train, body_train, how='left', on='Body ID')
        targets = ['agree', 'disagree', 'discuss', 'unrelated']
        targets_dict = dict(zip(targets, range(len(targets))))
        # A real list, so the assignment does not depend on Python 2's
        # list-returning map().
        train['target'] = [targets_dict[x] for x in train['Stance']]
        print('train.shape:')
        print(train.shape)

        data = train
        # read test set, no 'Stance' column in test set -> target = NULL
        # concatenate training and test set
        test_flag = True
        if test_flag:
            body_test = pd.read_csv("test_bodies_processed.csv",
                                    encoding='utf-8')
            headline_test = pd.read_csv("test_stances_unlabeled.csv",
                                        encoding='utf-8')
            test = pd.merge(headline_test, body_test, how="left", on="Body ID")

            data = pd.concat((train, test))  # target = NaN for test set
            print(data)
            print('data.shape:')
            print(data.shape)

            train = data[~data['target'].isnull()]
            print(train)
            print('train.shape:')
            print(train.shape)

            test = data[data['target'].isnull()]
            print(test)
            print('test.shape:')
            print(test.shape)

        join_str = "_"

        print("generate unigram")
        data["Headline_unigram"] = data["Headline"].map(preprocess_data)
        data["articleBody_unigram"] = data["articleBody"].map(preprocess_data)

        print("generate bigram")
        data["Headline_bigram"] = data["Headline_unigram"].map(
            lambda x: ngram.getBigram(x, join_str))
        data["articleBody_bigram"] = data["articleBody_unigram"].map(
            lambda x: ngram.getBigram(x, join_str))

        print("generate trigram")
        data["Headline_trigram"] = data["Headline_unigram"].map(
            lambda x: ngram.getTrigram(x, join_str))
        data["articleBody_trigram"] = data["articleBody_unigram"].map(
            lambda x: ngram.getTrigram(x, join_str))

        with open('data.pkl', 'wb') as outfile:
            # -1 selects the highest pickle protocol available.
            pickle_mod.dump(data, outfile, -1)
            print('dataframe saved in data.pkl')

    else:
        # NOTE(review): unpickling executes arbitrary code; only load a
        # data.pkl that this function produced itself.
        with open('data.pkl', 'rb') as infile:
            data = pickle_mod.load(infile)
            print('data loaded')
            print('data.shape:')
            print(data.shape)

    # define feature generators
    generators = [
        CountFeatureGenerator(),
        TfidfFeatureGenerator(),
        SvdFeatureGenerator(),
        Word2VecFeatureGenerator(),
        SentimentFeatureGenerator(),
    ]

    for g in generators:
        g.process(data)

    for g in generators:
        g.read('train')

    print('done')
def getVWFile(tsvFile, vwFile, dataIndex, dictionary, tfidf_model, train=True):
    """Write one VW-format feature line per row of a tab-separated item file.

    Args:
        tsvFile: Input path; tab-delimited with ``"`` as quote character
            and a header row. Columns read: ``itemid``, ``is_blocked``
            (only when ``train``), ``category``, ``subcategory``,
            ``title``, ``description``, ``attrs`` (optional), ``price``,
            ``phones_cnt``, ``emails_cnt``, ``urls_cnt``.
        vwFile: Output path; overwritten.
        dataIndex: Mapping with the lookup tables ``catIndex``,
            ``subCatIndex``, ``attrsKeyIndex``, ``attrsValIndex`` and
            ``wordIndex``.
        dictionary: Indexable; items 1 and 2 go to ``getTfidfFeat``.
        tfidf_model: Indexable; items 1 and 2 go to ``getTfidfFeat``.
        train: If true the label is read from ``is_blocked``, else it is 1.

    NOTE(review): both files are opened in binary mode and every csv value
    is ``.decode``d, which presumably relies on Python 2 byte strings --
    confirm before running on Python 3.
    """
    started_at = datetime.now()

    # Pull the lookup tables out once, before the per-row loop.
    cat_lookup = dataIndex["catIndex"]
    subcat_lookup = dataIndex["subCatIndex"]
    attr_key_lookup = dataIndex["attrsKeyIndex"]
    attr_val_lookup = dataIndex["attrsValIndex"]
    word_lookup = dataIndex["wordIndex"]

    with open(vwFile, "wb") as vwWriter:
        with open(tsvFile, "rb") as tsvReader:
            rows = DictReader(tsvReader, delimiter='\t', quotechar='"')
            for row_no, raw_row in enumerate(rows):
                # Keep only the columns that have a value, decoded to text.
                item = {}
                for column, value in raw_row.items():
                    if value is not None:
                        item[column] = value.decode('utf-8')

                # Header: label mapped from {0, 1} to {-1, +1}, then the tag.
                itemid = int(item["itemid"])
                if train:
                    label = int(item["is_blocked"])
                else:
                    label = 1
                header = "%s '%s " % (int(2*label - 1), itemid)

                category_part = "|C %s " % cat_lookup[ item["category"] ]
                subcategory_part = "|SC %s " % subcat_lookup[ item["subcategory"] ]

                # Title: all word ids, then the first and last id ("0" if none).
                title_ids = [ str(word_lookup[w]) for w in getWords(item["title"]) ]
                if len(title_ids) > 0:
                    title_first, title_last = title_ids[0], title_ids[-1]
                else:
                    title_first, title_last = "0", "0"
                title_part = "|T %s |bT %s |cT %s " % (" ".join(title_ids), title_first, title_last)
                title_stats_part = "|t %s " % getTextStatsFeat(item["title"])

                # Description: same layout as the title plus two tf-idf namespaces.
                desc_ids = [ str(word_lookup[w]) for w in getWords(item["description"]) ]
                if len(desc_ids) > 0:
                    desc_first, desc_last = desc_ids[0], desc_ids[-1]
                else:
                    desc_first, desc_last = "0", "0"
                desc_part = "|D %s |fD %s |gD %s " % (" ".join(desc_ids), desc_first, desc_last)
                desc_stats_part = "|d %s " % getTextStatsFeat(item["description"])
                unigram_tfidf = getTfidfFeat(getUnigram(desc_ids), dictionary[1], tfidf_model[1])
                # The bigram tf-idf seems to hurt performance; it may be dropped.
                bigram_tfidf = getTfidfFeat(getBigram(desc_ids, "_"), dictionary[2], tfidf_model[2])
                desc_part += "|iD %s |jD %s " % (unigram_tfidf, bigram_tfidf)

                # Attributes: "|A<key> <value>" per pair, then "|a" with the keys.
                attr_chunks = []
                attr_count = 0
                if "attrs" in item:
                    pairs = list(getAttrsDict(item["attrs"]).items())
                    for key, val in pairs:
                        attr_count += 1
                        attr_chunks.append("|A%s %s " % (attr_key_lookup[key], attr_val_lookup[val]))
                    attr_chunks.append("|a ")
                    for key, _ in pairs:
                        attr_chunks.append("%s " % (attr_key_lookup[key]))
                attrs_part = "".join(attr_chunks)
                if len(attrs_part) == 0:
                    attrs_part = "|NA 1 "
                attrs_part += "|hAC %s " % attr_count

                price_part = "|P %s " % item["price"]
                phones_part = "|p %s " % item["phones_cnt"]
                emails_part = "|e %s " % item["emails_cnt"]
                urls_part = "|u %s " % item["urls_cnt"]

                # Assemble the line; the last feature loses its trailing space.
                vwLine = "".join([
                    header,
                    category_part,
                    subcategory_part,
                    title_part,
                    title_stats_part,
                    desc_part,
                    desc_stats_part,
                    attrs_part,
                    price_part,
                    phones_part,
                    emails_part,
                    urls_part[:-1],
                    "\n",
                ])
                vwWriter.write( vwLine )

                # Progress report every 10000 rows.
                done = row_no + 1
                if done % 10000 == 0:
                    print( "\n%s\t%s"%(done,str(datetime.now() - started_at)) )
                    print( "Sample output:\n%s" % vwLine )
def main():
    """Create unigram/bigram/trigram pickles for each text column and split.

    For every split in ("train", "test") and every column in
    ("query", "title", "description") the pickle
    ``./ModelSystem/ProcessedData/<column>_<split>.pickle`` is loaded, each
    record is turned into its whitespace-separated tokens plus the bigrams
    and trigrams built from them, and the three resulting lists are pickled
    under ``./ModelSystem/Features/ngram/``.

    Any n-gram that is exactly the string "nan" is replaced by "".
    """
    ###############
    ## Load Data ##
    ###############
    dataPath = "./ModelSystem/ProcessedData"
    columnNames = ["query", "title", "description"]
    catagories = ["train", "test"]

    for cata in catagories:
        for col in columnNames:
            with open("%s/%s_%s.pickle" % (dataPath, col, cata), "rb") as f:
                records = pickle.load(f)

            # Build the 1-, 2- and 3-gram lists, one entry per record.
            unigram_rows = []
            bigram_rows = []
            trigram_rows = []
            for idx in range(len(records)):
                # Tokenise on whitespace only (no punctuation handling here).
                tokens = str(records[idx]).split()

                # Derive the higher-order grams before "nan" is blanked out.
                bigram = ngram.getBigram(tokens, "_")
                trigram = ngram.getTrigram(tokens, "_")

                # "nan" is presumably the string form of a missing value.
                unigram_rows.append(["" if g == "nan" else g for g in tokens])
                bigram_rows.append(["" if g == "nan" else g for g in bigram])
                trigram_rows.append(["" if g == "nan" else g for g in trigram])

            outputs = (
                ("./ModelSystem/Features/ngram/%s_unigram_%s.pickle", unigram_rows),
                ("./ModelSystem/Features/ngram/%s_bigram_%s.pickle", bigram_rows),
                ("./ModelSystem/Features/ngram/%s_trigram_%s.pickle", trigram_rows),
            )
            for template, rows in outputs:
                with open(template % (col, cata), "wb") as f:
                    pickle.dump(rows, f)

            print("%s_ngram_%s Completed" % (col, cata))

    print("ngram All Done.")
# Example #29
# 0
def extract_feat(df):
    """Compute counting and position features and write them to a CSV file.

    The frame must provide the raw text columns ``query``, ``title``,
    ``description`` and ``values``. Steps:

    1. Build unigram/bigram/trigram token columns for query, title,
       description and attribute_values (the query is preprocessed with
       ``stem=True``).
    2. Add word counts, unique-word counts and ratios, digit counts and
       ratios, and the ``description_missing`` flag.
    3. Drop the four raw text columns.
    4. Add min/mean/median/max/std (raw and normalised) of the positions
       reported by ``get_position_list`` for every ordered pair of
       distinct fields.
    5. Drop the n-gram columns and write the remaining frame to
       ``../../data/feat/test_countingfeat_part3.csv``.

    Columns from step 2 are added to the caller's frame in place; from
    step 3 on the function works on the new frame returned by ``drop``, so
    later columns appear only in the CSV. Nothing is returned.

    Ratios go through ``try_divide`` (defined elsewhere; presumably a
    division that tolerates a zero denominator -- TODO confirm).

    The ``print`` calls and ``list(map(...))`` forms are written so the
    function runs unchanged on both Python 2 and Python 3.
    """
    join_str = "_"
    feat_names = ["query", "title", "description", "attribute_values"]
    grams = ["unigram", "bigram", "trigram"]

    df["query_unigram"] = [preprocess_data(text, stem=True) for text in df["query"]]
    df["title_unigram"] = [preprocess_data(text) for text in df["title"]]
    df["description_unigram"] = [preprocess_data(text) for text in df["description"]]
    df["attribute_values_unigram"] = [preprocess_data(text) for text in df["values"]]
    for feat_name in feat_names:
        df[feat_name + "_bigram"] = [
            ngram.getBigram(tokens, join_str)
            for tokens in df[feat_name + "_unigram"]]
    for feat_name in feat_names:
        df[feat_name + "_trigram"] = [
            ngram.getTrigram(tokens, join_str)
            for tokens in df[feat_name + "_unigram"]]

    ################################
    ## word count and digit count ##
    ################################
    print("generate word counting features")

    def count_digit(tokens):
        # Float count of tokens made up of digits only.
        return sum([1. for w in tokens if w.isdigit()])

    for feat_name in feat_names:
        for gram in grams:
            tokens_col = df[feat_name + "_" + gram]
            ## word count
            df["count_of_%s_%s" % (feat_name, gram)] = [
                len(tokens) for tokens in tokens_col]
            df["count_of_unique_%s_%s" % (feat_name, gram)] = [
                len(set(tokens)) for tokens in tokens_col]
            df["ratio_of_unique_%s_%s" % (feat_name, gram)] = list(map(
                try_divide,
                df["count_of_unique_%s_%s" % (feat_name, gram)],
                df["count_of_%s_%s" % (feat_name, gram)]))

        ## digit count
        df["count_of_digit_in_%s" % feat_name] = [
            count_digit(tokens) for tokens in df[feat_name + "_unigram"]]
        df["ratio_of_digit_in_%s" % feat_name] = list(map(
            try_divide,
            df["count_of_digit_in_%s" % feat_name],
            df["count_of_%s_unigram" % feat_name]))

    ## description missing indicator
    # NOTE(review): the unigram column is also passed to len()/set() above,
    # so if preprocess_data returns a token list this comparison with ""
    # is never true -- confirm the intended check.
    df["description_missing"] = [
        int(tokens == "") for tokens in df["description_unigram"]]

    # The intersect word count features are not generated in this part.

    ######################################
    ## intersect word position feat ##
    ######################################
    # The message below is kept verbatim; what is dropped here are the raw
    # text columns, the n-gram columns are dropped further down.
    print("dropping unigrams bigrams trigrams")
    df = df.drop(['query', 'description', 'title', 'values'], axis=1)

    print("generate intersect word position features")
    # Order matters: it fixes the column order of the CSV.
    pos_stats = [
        ("min", np.min),
        ("mean", np.mean),
        ("median", np.median),
        ("max", np.max),
        ("std", np.std),
    ]
    for gram in grams:
        for target_name in feat_names:
            for obs_name in feat_names:
                if target_name == obs_name:
                    continue
                pos = [
                    get_position_list(target_tokens, obs=obs_tokens)
                    for target_tokens, obs_tokens in zip(
                        df[target_name + "_" + gram],
                        df[obs_name + "_" + gram])]
                ## stats feat on pos
                for stat_name, stat_func in pos_stats:
                    df["pos_of_%s_%s_in_%s_%s" % (obs_name, gram, target_name, stat_name)] = [
                        stat_func(p) for p in pos]
                ## stats feat on normalized_pos
                for stat_name, _ in pos_stats:
                    df["normalized_pos_of_%s_%s_in_%s_%s" % (obs_name, gram, target_name, stat_name)] = list(map(
                        try_divide,
                        df["pos_of_%s_%s_in_%s_%s" % (obs_name, gram, target_name, stat_name)],
                        df["count_of_%s_%s" % (obs_name, gram)]))

    # The token-list columns are intermediate data only; keep them out of
    # the CSV.
    ngram_columns = [
        feat_name + "_" + gram for feat_name in feat_names for gram in grams]
    df = df.drop(ngram_columns, axis=1)
    print("creating csv")
    df.to_csv("../../data/feat/test_countingfeat_part3.csv", header=True, index=False)
# Example #30
# 0
def extract_feat(df_all):
    """Add length, overlap, n-gram count, position and distance features.

    Reads the ``search_term``, ``product_title``, ``product_description``,
    ``product_attributes``, ``brand``, ``color`` and ``appl`` columns
    (presumably all string-valued -- ``.split()`` / string concatenation is
    applied to them) and stores every derived feature as a new column.

    Args:
        df_all: pandas DataFrame containing the columns listed above.

    Returns:
        ``df_all`` itself, with the feature columns added in place.
    """
    feat_names = ["query", "title", "description"]
    grams = ["unigram", "bigram", "trigram"]

    def fields_feature(func, i, j):
        # Apply ``func`` to the i-th and j-th tab-separated field of product_info.
        def on_row(info):
            fields = info.split('\t')
            return func(fields[i], fields[j])
        return df_all['product_info'].map(on_row)

    def count_in(row, obs_col, target_col):
        # Number of tokens of obs_col that also occur in target_col; the
        # lookup set is built once per row rather than once per token.
        target_tokens = set(row[target_col])
        return sum([1. for w in row[obs_col] if w in target_tokens])

    # Whitespace-token lengths of the raw text fields.
    length_sources = [("query", "search_term"), ("title", "product_title"),
                      ("description", "product_description"), ("brand", "brand")]
    for feat, col in length_sources:
        df_all['len_of_%s' % feat] = df_all[col].map(lambda x: len(x.split())).astype(np.int64)

    # All text fields packed into one tab-separated string.
    # NOTE(review): a tab inside any field would shift the field indices
    # used below -- confirm the inputs never contain tabs.
    df_all['product_info'] = (df_all['search_term'] + "\t" + df_all['product_title'] + "\t"
                              + df_all['product_description'] + "\t" + df_all['product_attributes'] + "\t"
                              + df_all['brand'] + "\t" + df_all['color'] + "\t" + df_all['appl'])

    # Query words found in each field (field 0 is the query itself) ...
    info_fields = ["title", "description", "attributes", "brand", "color", "appl"]
    for idx, field in enumerate(info_fields, 1):
        df_all['word_in_%s' % field] = fields_feature(str_common_word, 0, idx)

    # ... and the same counts relative to the query length.
    for field in info_fields:
        df_all['ratio_%s' % field] = df_all['word_in_%s' % field] / df_all['len_of_query']

    # cs() scores: query/title, query/description, title/description.
    for n, (i, j) in enumerate([(0, 1), (0, 2), (1, 2)], 1):
        df_all['cs_%d' % n] = fields_feature(cs, i, j)

    print("generate unigram")
    unigram_sources = [("query", "search_term"), ("title", "product_title"),
                       ("description", "product_description")]
    for feat, col in unigram_sources:
        df_all["%s_unigram" % feat] = list(df_all.apply(lambda x: x[col].lower().split(), axis=1))

    # Bigrams/trigrams are derived from the lowercased unigram tokens so that
    # every gram level uses the same casing when grams are compared below.
    print("generate bigram")
    join_str = "_"
    for feat in feat_names:
        df_all["%s_bigram" % feat] = list(df_all.apply(
            lambda x: ngram.getBigram(x[feat + "_unigram"], join_str), axis=1))

    print("generate trigram")
    for feat in feat_names:
        df_all["%s_trigram" % feat] = list(df_all.apply(
            lambda x: ngram.getTrigram(x[feat + "_unigram"], join_str), axis=1))

    # Co-occurrence terms of query uni/bigrams with title/description uni/bigrams.
    join_str = "X"
    for query_gram in ("unigram", "bigram"):
        for target in ("title", "description"):
            for target_gram in ("unigram", "bigram"):
                df_all["query_%s_%s_%s" % (query_gram, target, target_gram)] = list(df_all.apply(
                    lambda x: cooccurrence_terms(x["query_" + query_gram],
                                                 x[target + "_" + target_gram],
                                                 join_str), axis=1))

    print("generate word counting features")
    for feat_name in feat_names:
        for gram in grams:
            ## word count
            col = feat_name + "_" + gram
            df_all["count_of_%s" % col] = list(df_all.apply(lambda x: len(x[col]), axis=1))
            df_all["count_of_unique_%s" % col] = list(df_all.apply(lambda x: len(set(x[col])), axis=1))
            # list(...) keeps the assignment valid where map() returns an iterator.
            df_all["ratio_of_unique_%s" % col] = list(map(
                try_divide, df_all["count_of_unique_%s" % col], df_all["count_of_%s" % col]))

    print("generate intersect word counting features")
    for gram in grams:
        for obs_name in feat_names:
            for target_name in feat_names:
                if target_name == obs_name:
                    continue
                obs_col = obs_name + "_" + gram
                target_col = target_name + "_" + gram
                count_col = "count_of_%s_%s_in_%s" % (obs_name, gram, target_name)
                df_all[count_col] = list(df_all.apply(
                    lambda x: count_in(x, obs_col, target_col), axis=1))
                df_all["ratio_of_%s_%s_in_%s" % (obs_name, gram, target_name)] = list(map(
                    try_divide, df_all[count_col], df_all["count_of_%s" % obs_col]))

        ## some other feat
        df_all["title_%s_in_query_div_query_%s" % (gram, gram)] = list(map(
            try_divide, df_all["count_of_title_%s_in_query" % gram], df_all["count_of_query_%s" % gram]))
        df_all["title_%s_in_query_div_query_%s_in_title" % (gram, gram)] = list(map(
            try_divide, df_all["count_of_title_%s_in_query" % gram], df_all["count_of_query_%s_in_title" % gram]))
        #df_all["description_%s_in_query_div_query_%s"%(gram,gram)] = map(try_divide, df_all["count_of_description_%s_in_query"%gram], df_all["count_of_query_%s"%gram])
        #df_all["description_%s_in_query_div_query_%s_in_description"%(gram,gram)] = map(try_divide, df_all["count_of_description_%s_in_query"%gram], df_all["count_of_query_%s_in_description"%gram])

    print("generate intersect word position features")
    pos_stats = [("min", np.min), ("mean", np.mean), ("median", np.median),
                 ("max", np.max), ("std", np.std)]
    for gram in grams:
        for target_name in feat_names:
            for obs_name in feat_names:
                if target_name == obs_name:
                    continue
                pos = list(df_all.apply(
                    lambda x: get_position_list(x[target_name + "_" + gram],
                                                obs=x[obs_name + "_" + gram]), axis=1))
                prefix = "pos_of_%s_%s_in_%s" % (obs_name, gram, target_name)
                ## stats feat on pos
                for stat, func in pos_stats:
                    df_all["%s_%s" % (prefix, stat)] = list(map(func, pos))
                ## stats feat on normalized_pos
                for stat, _ in pos_stats:
                    df_all["normalized_%s_%s" % (prefix, stat)] = list(map(
                        try_divide, df_all["%s_%s" % (prefix, stat)],
                        df_all["count_of_%s_%s" % (obs_name, gram)]))

    print("generate jaccard coef and dice dist for n-gram")
    dists = ["jaccard_coef", "dice_dist"]
    dist_grams = ["bigram", "trigram"]
    for dist in dists:
        for gram in dist_grams:
            # every unordered pair of features, in list order
            for i, target_name in enumerate(feat_names):
                for obs_name in feat_names[i + 1:]:
                    df_all["%s_of_%s_between_%s_%s" % (dist, gram, target_name, obs_name)] = list(df_all.apply(
                        lambda x: compute_dist(x[target_name + "_" + gram],
                                               x[obs_name + "_" + gram], dist), axis=1))

    return df_all
Пример #31
0
def extract_feat(df):
    """Add n-gram counting, intersection and position features to ``df``.

    Reads the ``query``, ``product_title`` and ``product_description``
    columns, tokenises them with ``preprocess_data`` and stores the n-gram
    columns plus every derived feature as new columns of ``df``.

    Args:
        df: pandas DataFrame containing the three text columns above.

    Returns:
        None; ``df`` is modified in place.
    """
    feat_names = ["query", "title", "description"]
    grams = ["unigram", "bigram", "trigram"]

    def count_digit(tokens):
        return sum([1. for w in tokens if w.isdigit()])

    def count_in(row, obs_col, target_col):
        # Number of tokens of obs_col that also occur in target_col; the
        # lookup set is built once per row rather than once per token.
        target_tokens = set(row[target_col])
        return sum([1. for w in row[obs_col] if w in target_tokens])

    ## unigram
    print("generate unigram")
    unigram_sources = [("query", "query"), ("title", "product_title"),
                       ("description", "product_description")]
    for feat, col in unigram_sources:
        df["%s_unigram" % feat] = list(df.apply(lambda x: preprocess_data(x[col]), axis=1))

    ## bigram
    print("generate bigram")
    join_str = "_"
    for feat in feat_names:
        df["%s_bigram" % feat] = list(df.apply(
            lambda x: ngram.getBigram(x[feat + "_unigram"], join_str), axis=1))

    ## trigram
    print("generate trigram")
    for feat in feat_names:
        df["%s_trigram" % feat] = list(df.apply(
            lambda x: ngram.getTrigram(x[feat + "_unigram"], join_str), axis=1))

    ################################
    ## word count and digit count ##
    ################################
    print("generate word counting features")
    for feat_name in feat_names:
        for gram in grams:
            ## word count
            col = feat_name + "_" + gram
            df["count_of_%s" % col] = list(df.apply(lambda x: len(x[col]), axis=1))
            df["count_of_unique_%s" % col] = list(df.apply(lambda x: len(set(x[col])), axis=1))
            # list(...) keeps the assignment valid where map() returns an iterator.
            df["ratio_of_unique_%s" % col] = list(map(
                try_divide, df["count_of_unique_%s" % col], df["count_of_%s" % col]))

        ## digit count
        df["count_of_digit_in_%s" % feat_name] = list(df.apply(
            lambda x: count_digit(x[feat_name + "_unigram"]), axis=1))
        df["ratio_of_digit_in_%s" % feat_name] = list(map(
            try_divide, df["count_of_digit_in_%s" % feat_name], df["count_of_%s_unigram" % feat_name]))

    ## description missing indicator
    # preprocess_data's return type is not visible here, but its output is
    # consumed as a token sequence everywhere else (len(), iteration, n-gram
    # builders), for which ``== ""`` can never be true.  Testing the length
    # flags an empty description for both an empty string and an empty list.
    df["description_missing"] = list(df.apply(
        lambda x: int(len(x["description_unigram"]) == 0), axis=1))

    ##############################
    ## intersect word count ##
    ##############################
    print("generate intersect word counting features")
    for gram in grams:
        for obs_name in feat_names:
            for target_name in feat_names:
                if target_name == obs_name:
                    continue
                obs_col = obs_name + "_" + gram
                target_col = target_name + "_" + gram
                count_col = "count_of_%s_%s_in_%s" % (obs_name, gram, target_name)
                df[count_col] = list(df.apply(
                    lambda x: count_in(x, obs_col, target_col), axis=1))
                df["ratio_of_%s_%s_in_%s" % (obs_name, gram, target_name)] = list(map(
                    try_divide, df[count_col], df["count_of_%s" % obs_col]))

        ## some other feat
        for other in ("title", "description"):
            in_query = df["count_of_%s_%s_in_query" % (other, gram)]
            df["%s_%s_in_query_div_query_%s" % (other, gram, gram)] = list(map(
                try_divide, in_query, df["count_of_query_%s" % gram]))
            df["%s_%s_in_query_div_query_%s_in_%s" % (other, gram, gram, other)] = list(map(
                try_divide, in_query, df["count_of_query_%s_in_%s" % (gram, other)]))

    ######################################
    ## intersect word position feat ##
    ######################################
    print("generate intersect word position features")
    pos_stats = [("min", np.min), ("mean", np.mean), ("median", np.median),
                 ("max", np.max), ("std", np.std)]
    for gram in grams:
        for target_name in feat_names:
            for obs_name in feat_names:
                if target_name == obs_name:
                    continue
                pos = list(df.apply(
                    lambda x: get_position_list(x[target_name + "_" + gram],
                                                obs=x[obs_name + "_" + gram]), axis=1))
                prefix = "pos_of_%s_%s_in_%s" % (obs_name, gram, target_name)
                ## stats feat on pos
                for stat, func in pos_stats:
                    df["%s_%s" % (prefix, stat)] = list(map(func, pos))
                ## stats feat on normalized_pos
                for stat, _ in pos_stats:
                    df["normalized_%s_%s" % (prefix, stat)] = list(map(
                        try_divide, df["%s_%s" % (prefix, stat)],
                        df["count_of_%s_%s" % (obs_name, gram)]))
Пример #32
0
def extract_feat(df):
    """Add n-gram counting, difference and intersection features to ``df``.

    Reads the ``question1`` and ``question2`` columns, tokenises them with
    ``preprocess_data`` and stores the n-gram columns plus every derived
    feature as new columns of ``df``.

    Args:
        df: pandas DataFrame containing ``question1`` and ``question2``.

    Returns:
        None; ``df`` is modified in place.
    """
    feat_names = ["question1", "question2"]
    grams = ["unigram", "bigram", "trigram"]

    def count_digit(tokens):
        return sum([1. for w in tokens if w.isdigit()])

    def normalized_abs_diff(a, b):
        # |a - b| / (a + b); defined as 1 when both counts are zero.
        total = a + b
        if total == 0:
            return 1
        return 1.0 * abs(a - b) / total

    def count_in(row, obs_col, target_col):
        # Number of tokens of obs_col that also occur in target_col; the
        # lookup set is built once per row rather than once per token.
        target_tokens = set(row[target_col])
        return sum([1. for w in row[obs_col] if w in target_tokens])

    ## unigram
    print("generate unigram")
    for feat in feat_names:
        df["%s_unigram" % feat] = list(df.apply(lambda x: preprocess_data(x[feat]), axis=1))

    ## bigram
    print("generate bigram")
    join_str = "_"
    for feat in feat_names:
        df["%s_bigram" % feat] = list(df.apply(
            lambda x: ngram.getBigram(x[feat + "_unigram"], join_str), axis=1))

    ## trigram
    print("generate trigram")
    for feat in feat_names:
        df["%s_trigram" % feat] = list(df.apply(
            lambda x: ngram.getTrigram(x[feat + "_unigram"], join_str), axis=1))

    ################################
    ## word count and digit count ##
    ################################
    print("generate word counting features")
    for feat_name in feat_names:
        for gram in grams:
            col = feat_name + "_" + gram
            # number of tokens
            df["count_of_%s" % col] = list(df.apply(lambda x: len(x[col]), axis=1))
            # number of distinct tokens
            df["count_of_unique_%s" % col] = list(df.apply(lambda x: len(set(x[col])), axis=1))
            # share of distinct tokens
            df["ratio_of_unique_%s" % col] = list(map(
                try_divide, df["count_of_unique_%s" % col], df["count_of_%s" % col]))

        # number of all-digit tokens, and their share among the unigrams
        df["count_of_digit_in_%s" % feat_name] = list(df.apply(
            lambda x: count_digit(x[feat_name + "_unigram"]), axis=1))
        df["ratio_of_digit_in_%s" % feat_name] = list(map(
            try_divide, df["count_of_digit_in_%s" % feat_name], df["count_of_%s_unigram" % feat_name]))

        # ## letter count (disabled)
        # df["count_of_letter_in_%s" % feat_name] = list( df.apply(lambda x: len(x[feat_name]), axis=1))

    ####################################
    ## subtract word and letter count ##
    ####################################
    print("generate subtract word counting features")
    for obs_name in feat_names:
        for target_name in feat_names:
            if target_name == obs_name:
                continue
            obs_col = obs_name + "_unigram"
            target_col = target_name + "_unigram"
            ## normalised difference in word count
            df["count_of_%s_%s_subtract_%s" % (obs_name, "unigram", target_name)] = list(df.apply(
                lambda x: normalized_abs_diff(len(x[obs_col]), len(x[target_col])), axis=1))
            ## normalised difference in digit count
            df["count_of_%s_%s_subtract_%s" % (obs_name, "digit", target_name)] = list(df.apply(
                lambda x: normalized_abs_diff(count_digit(x[obs_col]), count_digit(x[target_col])), axis=1))
            # ## normalised difference in letter count (disabled)
            # f["count_of_%s_%s_subtract_%s" % (obs_name, "letter", target_name)] = list(df.apply(
            #     lambda x: 1.0 * abs(len(x[obs_name]) - len(x[target_name])) / (len(x[obs_name]) + len(x[target_name])), axis=1))

    ##############################
    ## intersect word count ######
    ##############################
    print("generate intersect word counting features")
    for gram in grams:
        for obs_name in feat_names:
            for target_name in feat_names:
                if target_name == obs_name:
                    continue
                obs_col = obs_name + "_" + gram
                target_col = target_name + "_" + gram
                count_col = "count_of_%s_%s_in_%s" % (obs_name, gram, target_name)
                # tokens of obs that also appear in target
                df[count_col] = list(df.apply(
                    lambda x: count_in(x, obs_col, target_col), axis=1))
                # ... relative to the number of obs tokens
                df["ratio_of_%s_%s_in_%s" % (obs_name, gram, target_name)] = list(map(
                    try_divide, df[count_col], df["count_of_%s" % obs_col]))
Пример #33
0
def process():
    """Build (or reload) the n-gram dataframe and run the feature generators.

    Loads ``./data/merged_data_tain.csv``, keeps the four columns used
    downstream, drops rows with missing values and takes a fixed-seed
    train/test split.  Only the training part is labelled and carried
    forward.  Unigram, bigram and trigram columns are added for both headline
    fields and the frame is pickled to ``data.pkl``.  Every feature generator
    is then run on the frame and asked to read back its ``'train'`` output.

    Returns:
        None.
    """
    # NOTE(review): 'merged_data_tain.csv' looks like a typo for 'train' --
    # confirm against the file on disk before renaming.
    full_data = pd.read_csv('./data/merged_data_tain.csv', encoding='utf-8')
    used_column = [
        'claimHeadline', 'articleHeadline', 'claimTruthiness', 'articleStance'
    ]
    full_data = full_data[used_column].dropna()
    # The held-out part of this split is not used below.
    train, test = train_test_split(full_data,
                                   test_size=0.33,
                                   random_state=1234)

    # Flip to True to reuse a previously pickled frame instead of rebuilding.
    read = False
    if not read:

        targets = ['observing', 'for', 'against', 'ignoring']
        targets_dict = {stance: idx for idx, stance in enumerate(targets)}
        # A list (not a bare map object) so this also works where map() is
        # lazy; an unknown stance still raises KeyError.
        train['target'] = [targets_dict[x] for x in train['articleStance']]
        print('train.shape:')
        print(train.shape)

        data = train
        # No separate test set is concatenated here, so every row already has
        # a target and the null-target ``test`` slice is expected to be empty.
        test_flag = True
        if test_flag:
            print(data)
            print('data.shape:')
            print(data.shape)

            train = data[~data['target'].isnull()]
            print(train)
            print('train.shape:')
            print(train.shape)

            test = data[data['target'].isnull()]
            print(test)
            print('test.shape:')
            print(test.shape)

        print("generate unigram")
        data["claimHeadline_unigram"] = data["claimHeadline"].map(preprocess_data)
        data["articleHeadline_unigram"] = data["articleHeadline"].map(preprocess_data)

        print("generate bigram")
        join_str = "_"
        data["claimHeadline_bigram"] = data["claimHeadline_unigram"].map(
            lambda x: ngram.getBigram(x, join_str))
        data["articleHeadline_bigram"] = data["articleHeadline_unigram"].map(
            lambda x: ngram.getBigram(x, join_str))

        print("generate trigram")
        data["claimHeadline_trigram"] = data["claimHeadline_unigram"].map(
            lambda x: ngram.getTrigram(x, join_str))
        # Trigrams must be built from the unigram tokens, like the claim
        # headline above -- not from the already-joined bigrams.
        data["articleHeadline_trigram"] = data["articleHeadline_unigram"].map(
            lambda x: ngram.getTrigram(x, join_str))

        with open('data.pkl', 'wb') as outfile:
            cPickle.dump(data, outfile, -1)
            print('dataframe saved in data.pkl')

    else:
        # NOTE: unpickling executes arbitrary code; only load a data.pkl that
        # this script produced itself.
        with open('data.pkl', 'rb') as infile:
            data = cPickle.load(infile)
            print('data loaded')
            print('data.shape:')
            print(data.shape)

    # define feature generators (AlignmentFeatureGenerator is disabled)
    countFG = CountFeatureGenerator()
    tfidfFG = TfidfFeatureGenerator()
    svdFG = SvdFeatureGenerator()
    word2vecFG = Word2VecFeatureGenerator()
    sentiFG = SentimentFeatureGenerator()
    generators = [countFG, tfidfFG, svdFG, word2vecFG, sentiFG]

    for g in generators:
        g.process(data)

    for g in generators:
        g.read('train')

    #for g in generators:
    #    g.read('test')

    print('done')