def prepare_bigram(path, out):
    """Write space-joined bigrams of four text columns of a CSV file.

    Each row of ``path`` is read with ``DictReader``.  The ``words_x``,
    ``words_y``, ``chars_x`` and ``chars_y`` fields are lower-cased, passed
    through ``remove_punctuation``, split on single spaces and handed to
    ``getBigram``; the resulting bigrams are joined with spaces and written
    as one comma-separated line of ``out``.

    Args:
        path: CSV file to read.
        out: CSV file to (over)write; it starts with a fixed header line.
    """
    columns = ('words_x', 'words_y', 'chars_x', 'chars_y')
    start = datetime.now()
    # Both handles are owned by the ``with`` statement so the input file is
    # closed too.  The input is opened first so that a missing input does
    # not truncate an existing output file.
    with open(path) as infile, open(out, 'w') as outfile:
        outfile.write('word1_bigram,word2_bigram,char1_bigram,char2_bigram\n')
        for c, row in enumerate(DictReader(infile, delimiter=',')):
            if c % 100000 == 0:
                print('finished', c)
            bigrams = []
            for column in columns:
                # The text is lower-cased before punctuation removal, so no
                # second lower-casing pass is needed afterwards.
                tokens = remove_punctuation(str(row[column]).lower()).split(' ')
                bigrams.append(' '.join(getBigram(tokens)))
            outfile.write('%s,%s,%s,%s\n' % tuple(bigrams))
    end = datetime.now()
    print('times:', end - start)
def extract_distance_features(df, out_path="../../data/feat/test_distFeat.csv"):
    """Compute pairwise n-gram distance features and save them as CSV.

    Unigrams are produced by ``preprocess_data`` (with the ``"stem"`` option
    for ``query`` and ``values``), bigrams and trigrams by ``ngram``.  For
    every distance in ``jaccard_coef``/``dice_dist``, every n-gram size and
    every unordered pair of fields, a column
    ``<dist>_of_<gram>_between_<a>_<b>`` is added via ``compute_dist``.  The
    temporary n-gram columns are dropped before the frame is written.

    Args:
        df: DataFrame with ``query``, ``title``, ``description`` and
            ``values`` columns; n-gram and feature columns are added to it.
        out_path: Destination CSV path (defaults to the original location).

    Returns:
        A DataFrame holding the input columns plus the distance features.
    """
    join_str = "_"
    grams = ["unigram", "bigram", "trigram"]
    feat_names = ["query", "title", "description", "attribute_values"]

    df["query_unigram"] = [preprocess_data(text, "stem") for text in df["query"]]
    df["title_unigram"] = [preprocess_data(text) for text in df["title"]]
    df["description_unigram"] = [preprocess_data(text) for text in df["description"]]
    df["attribute_values_unigram"] = [preprocess_data(text, "stem") for text in df["values"]]

    # Every field uses the same join string: n-grams joined with different
    # separators could never match each other in the distance computation.
    for name in feat_names:
        unigram_col = name + "_unigram"
        df[name + "_bigram"] = [ngram.getBigram(words, join_str) for words in df[unigram_col]]
        df[name + "_trigram"] = [ngram.getTrigram(words, join_str) for words in df[unigram_col]]

    # Single-argument print calls behave identically on Python 2 and 3.
    print("generate jaccard coef and dice dist for n-gram")
    dists = ["jaccard_coef", "dice_dist"]
    for dist in dists:
        print("Generating  %s" % dist)
        for gram in grams:
            for idx, target_name in enumerate(feat_names[:-1]):
                for obs_name in feat_names[idx + 1:]:
                    target_col = target_name + "_" + gram
                    obs_col = obs_name + "_" + gram
                    df["%s_of_%s_between_%s_%s" % (dist, gram, target_name, obs_name)] = [
                        compute_dist(target, obs, dist)
                        for target, obs in zip(df[target_col], df[obs_col])
                    ]

    print("Dropping columns")
    temp_cols = [name + "_" + size for size in grams for name in feat_names]
    df = df.drop(temp_cols, axis=1)

    print("Creating csv")
    df.to_csv(out_path, header=True, index=False)
    return df
def extract_feat(df):
    """Add unigram, bigram and co-occurrence term columns to ``df`` in place.

    Unigrams come from ``preprocess_data`` applied to ``query``,
    ``product_title`` and ``product_description``; bigrams from
    ``ngram.getBigram`` with ``"_"`` as join string.  Co-occurrence columns
    pair query unigrams, query bigrams and a ``"qid<qid>"`` token with the
    title/description unigrams and bigrams through ``cooccurrence_terms``
    (join string ``"X"``).

    Args:
        df: DataFrame with ``query``, ``product_title``,
            ``product_description`` and ``qid`` columns.  Modified in place;
            nothing is returned.
    """
    # Single-argument print calls behave identically on Python 2 and 3.
    print("generate unigram")
    df["query_unigram"] = [preprocess_data(text) for text in df["query"]]
    df["title_unigram"] = [preprocess_data(text) for text in df["product_title"]]
    df["description_unigram"] = [preprocess_data(text) for text in df["product_description"]]

    print("generate bigram")
    join_str = "_"
    for field in ("query", "title", "description"):
        df[field + "_bigram"] = [
            ngram.getBigram(words, join_str) for words in df[field + "_unigram"]
        ]

    ## cooccurrence terms
    join_str = "X"
    doc_cols = [
        "%s_%s" % (field, size)
        for field in ("title", "description")
        for size in ("unigram", "bigram")
    ]

    # query unigram / query bigram against every title/description column
    for query_col in ("query_unigram", "query_bigram"):
        for doc_col in doc_cols:
            df["%s_%s" % (query_col, doc_col)] = [
                cooccurrence_terms(query_terms, doc_terms, join_str)
                for query_terms, doc_terms in zip(df[query_col], df[doc_col])
            ]

    # query id: a single synthetic token per row, rebuilt for every call
    for doc_col in doc_cols:
        df["query_id_" + doc_col] = [
            cooccurrence_terms(["qid" + str(qid)], doc_terms, join_str)
            for qid, doc_terms in zip(df["qid"], df[doc_col])
        ]
def extract_basic_distance_feat(df):
    """Add n-gram columns and pairwise distance features to ``df`` in place.

    Unigrams, bigrams and trigrams are generated for the query, title and
    description; then, for each distance name, n-gram size and unordered
    field pair, ``compute_dist`` fills a
    ``<dist>_of_<gram>_between_<a>_<b>`` column.  Nothing is returned.
    """
    def _per_row(func):
        # Evaluate ``func`` on every row and collect the results in a list.
        return list(df.apply(func, axis=1))

    fields = ["query", "title", "description"]
    sources = {
        "query": "query",
        "title": "product_title",
        "description": "product_description",
    }
    join_str = "_"

    ## unigram
    print("generate unigram")
    for field in fields:
        source = sources[field]
        df[field + "_unigram"] = _per_row(lambda row: preprocess_data(row[source]))

    ## bigram
    print("generate bigram")
    for field in fields:
        unigram_col = field + "_unigram"
        df[field + "_bigram"] = _per_row(
            lambda row: ngram.getBigram(row[unigram_col], join_str))

    ## trigram
    print("generate trigram")
    for field in fields:
        unigram_col = field + "_unigram"
        df[field + "_trigram"] = _per_row(
            lambda row: ngram.getTrigram(row[unigram_col], join_str))

    ## jaccard coef/dice dist of n-gram
    print("generate jaccard coef and dice dist for n-gram")
    pairs = [
        (fields[i], fields[j])
        for i in range(len(fields) - 1)
        for j in range(i + 1, len(fields))
    ]
    for dist in ("jaccard_coef", "dice_dist"):
        for gram in ("unigram", "bigram", "trigram"):
            for target_name, obs_name in pairs:
                target_col = target_name + "_" + gram
                obs_col = obs_name + "_" + gram
                feature = "%s_of_%s_between_%s_%s" % (dist, gram, target_name, obs_name)
                df[feature] = _per_row(
                    lambda row: compute_dist(row[target_col], row[obs_col], dist))
def extract_basic_distance_feat(df):
    """Add n-gram columns and pairwise distance features to ``df`` in place.

    Unigrams come from ``preprocess_data`` applied to ``query``,
    ``product_title`` and ``product_description``; bigrams and trigrams from
    ``ngram`` with ``"_"`` as join string.  For every distance name, n-gram
    size and unordered field pair a column
    ``<dist>_of_<gram>_between_<a>_<b>`` is filled by ``compute_dist``.

    Args:
        df: DataFrame to extend.  Modified in place; nothing is returned.
    """
    feat_names = ["query", "title", "description"]
    join_str = "_"

    # Single-argument print calls behave identically on Python 2 and 3.
    ## unigram
    print("generate unigram")
    df["query_unigram"] = [preprocess_data(text) for text in df["query"]]
    df["title_unigram"] = [preprocess_data(text) for text in df["product_title"]]
    df["description_unigram"] = [preprocess_data(text) for text in df["product_description"]]

    ## bigram
    print("generate bigram")
    for name in feat_names:
        df[name + "_bigram"] = [
            ngram.getBigram(words, join_str) for words in df[name + "_unigram"]
        ]

    ## trigram
    print("generate trigram")
    for name in feat_names:
        df[name + "_trigram"] = [
            ngram.getTrigram(words, join_str) for words in df[name + "_unigram"]
        ]

    ## jaccard coef/dice dist of n-gram
    print("generate jaccard coef and dice dist for n-gram")
    dists = ["jaccard_coef", "dice_dist"]
    grams = ["unigram", "bigram", "trigram"]
    for dist in dists:
        for gram in grams:
            for idx, target_name in enumerate(feat_names[:-1]):
                for obs_name in feat_names[idx + 1:]:
                    df["%s_of_%s_between_%s_%s" % (dist, gram, target_name, obs_name)] = [
                        compute_dist(target, obs, dist)
                        for target, obs in zip(df[target_name + "_" + gram],
                                               df[obs_name + "_" + gram])
                    ]
def prepare_unigram(path, out):
    """Write the space-joined bigrams of two sentence columns to a CSV file.

    Despite its name, this function emits bigrams: the ``sen1`` and ``sen2``
    values of every row of ``path`` are converted to ``str``, split on
    whitespace and passed to ``getBigram``; the results are joined with
    spaces and written to ``out`` as columns ``sen1_bigram``/``sen2_bigram``.

    Args:
        path: CSV file containing ``sen1`` and ``sen2`` columns.
        out: Destination CSV path (written without the index).
    """
    data_input = pd.read_csv(path)
    # Collect all rows first and build the frame once; growing a DataFrame
    # one ``.loc`` row at a time re-allocates it on every insertion.
    rows = [
        [' '.join(getBigram(str(sen1).split())),
         ' '.join(getBigram(str(sen2).split()))]
        for sen1, sen2 in zip(data_input['sen1'], data_input['sen2'])
    ]
    data_output = DataFrame(rows, columns=['sen1_bigram', 'sen2_bigram'])
    data_output.to_csv(out, index=False)
def generate_ngrams(df):
    """Add unigram, bigram and trigram columns for title and search term.

    Works in place on ``df`` and returns nothing.  Unigrams come from
    ``preprocess_data``; bigrams and trigrams are derived from the unigram
    columns with ``"_"`` as join string.
    """
    def _per_row(func):
        # Evaluate ``func`` on every row and collect the results in a list.
        return list(df.apply(func, axis=1))

    join_str = "_"

    # unigram
    print("generate unigram")
    for name, source in (("title", "product_title"), ("search_term", "search_term")):
        df[name + "_unigram"] = _per_row(lambda row: preprocess_data(row[source]))

    # bigram
    print("generate bigram")
    for name in ("title", "search_term"):
        unigram_col = name + "_unigram"
        df[name + "_bigram"] = _per_row(
            lambda row: ngram.getBigram(row[unigram_col], join_str))

    # trigram (search term first, then title)
    print("generate trigram")
    for name in ("search_term", "title"):
        unigram_col = name + "_unigram"
        df[name + "_trigram"] = _per_row(
            lambda row: ngram.getTrigram(row[unigram_col], join_str))
def gen_ngram_data(df):
    """Add unigram, bigram and trigram columns for both questions.

    Unigrams come from ``preprocess_data`` applied to ``question1`` and
    ``question2``; bigrams and trigrams are both derived from the unigram
    columns with ``"_"`` as join string.

    Args:
        df: DataFrame with ``question1`` and ``question2`` columns.

    Returns:
        The same DataFrame, extended with ``q1_*``/``q2_*`` n-gram columns.
    """
    join_str = "_"
    questions = (("q1", "question1"), ("q2", "question2"))

    ## unigram
    print("generate unigram")
    for prefix, source in questions:
        df[prefix + "_unigram"] = list(
            df.apply(lambda x: preprocess_data(x[source]), axis=1))

    ## bigram
    print("generate bigram")
    for prefix, _ in questions:
        unigram_col = prefix + "_unigram"
        df[prefix + "_bigram"] = list(
            df.apply(lambda x: ngram.getBigram(x[unigram_col], join_str), axis=1))

    ## trigram
    print("generate trigram")
    for prefix, _ in questions:
        # Trigrams are windows over the unigram tokens.  Feeding the bigram
        # list here would join three overlapping bigrams instead of three
        # words.
        unigram_col = prefix + "_unigram"
        df[prefix + "_trigram"] = list(
            df.apply(lambda x: ngram.getTrigram(x[unigram_col], join_str), axis=1))

    return df
def generate_product_ngrams(df):
    """Add description unigram, bigram and trigram columns to ``df`` in place.

    The unigrams come from ``preprocess_data`` on ``product_description``;
    bigrams and trigrams are built from the unigram column with ``"_"`` as
    join string.  Nothing is returned.
    """
    separator = "_"

    def unigram_of(row):
        return preprocess_data(row["product_description"])

    def bigram_of(row):
        return ngram.getBigram(row["description_unigram"], separator)

    def trigram_of(row):
        return ngram.getTrigram(row["description_unigram"], separator)

    steps = (
        ("Generate unigram", "description_unigram", unigram_of),
        ("Generate bigram", "description_bigram", bigram_of),
        ("Generate trigram", "description_trigram", trigram_of),
    )
    for message, column, builder in steps:
        print(message)
        df[column] = list(df.apply(builder, axis=1))
def generate_brand_ngrams(df):
    """Add brand unigram, bigram and trigram columns to ``df`` in place.

    The unigrams come from ``preprocess_data`` on ``brand``; bigrams and
    trigrams are built from the unigram column with ``"_"`` as join string.
    Nothing is returned.
    """
    separator = "_"

    def unigram_of(row):
        return preprocess_data(row["brand"])

    def bigram_of(row):
        return ngram.getBigram(row["brand_unigram"], separator)

    def trigram_of(row):
        return ngram.getTrigram(row["brand_unigram"], separator)

    steps = (
        ("Generate brand unigram", "brand_unigram", unigram_of),
        ("Generate brand bigram", "brand_bigram", bigram_of),
        ("Generate brand trigram", "brand_trigram", trigram_of),
    )
    for message, column, builder in steps:
        print(message)
        df[column] = list(df.apply(builder, axis=1))
def generateNGram(df):
    """Add unigram, bigram and trigram columns for query, title, description.

    Each n-gram helper of ``ngram`` is applied directly to the values of the
    ``query``, ``product_title`` and ``product_description`` columns; bigrams
    and trigrams use ``'_'`` as join string.  Returns the extended ``df``.
    """
    def unigram(text):
        return ngram.getUnigram(text)

    def bigram(text):
        return ngram.getBigram(text, '_')

    def trigram(text):
        return ngram.getTrigram(text, '_')

    sources = (
        ('query', 'query'),
        ('title', 'product_title'),
        ('description', 'product_description'),
    )
    builders = (('unigram', unigram), ('bigram', bigram), ('trigram', trigram))

    # Outer loop over n-gram size keeps the column creation order:
    # all unigrams, then all bigrams, then all trigrams.
    for suffix, build in builders:
        for prefix, column in sources:
            df[prefix + '_' + suffix] = df[column].apply(build)
    return df
def extract_feat(df):
    """Add unigram, bigram and co-occurrence term columns for two questions.

    Works in place on ``df`` and returns nothing.  Unigrams come from
    ``preprocess_data``, bigrams from ``ngram.getBigram`` (join string
    ``"_"``), and the four co-occurrence columns from ``cooccurrence_terms``
    (join string ``"X"``).
    """
    def _per_row(func):
        # Evaluate ``func`` on every row and collect the results in a list.
        return list(df.apply(func, axis=1))

    questions = ("question1", "question2")
    sizes = ("unigram", "bigram")

    ## unigram
    print("generate unigram")
    for question in questions:
        df[question + "_unigram"] = _per_row(
            lambda row: preprocess_data(row[question]))

    ## bigram
    print("generate bigram")
    join_str = "_"
    for question in questions:
        unigram_col = question + "_unigram"
        df[question + "_bigram"] = _per_row(
            lambda row: ngram.getBigram(row[unigram_col], join_str))

    ## cooccurrence terms
    join_str = "X"
    for left_size in sizes:
        left_col = "question1_" + left_size
        for right_size in sizes:
            right_col = "question2_" + right_size
            df[left_col + "_" + right_col] = _per_row(
                lambda row: cooccurrence_terms(row[left_col], row[right_col], join_str))
def extract_basic_distance_feat(df):
    """Add n-gram columns and pairwise distance features for two questions.

    Unigrams come from ``preprocess_data``; bigrams and trigrams from
    ``ngram`` with ``"_"`` as join string.  For every distance name and
    n-gram size, ``compute_dist`` fills a
    ``<dist>_of_<gram>_between_question1_question2`` column.

    Args:
        df: DataFrame with ``question1`` and ``question2`` columns.
            Modified in place; nothing is returned.
    """
    join_str = "_"
    feat_names = ["question1", "question2"]

    # Single-argument print calls behave identically on Python 2 and 3.
    print("generate ngrams")
    for name in feat_names:
        print("generate ngrams for %s" % name)
        df.loc[:, name + "_unigram"] = list(map(preprocess_data, df[name]))
        df.loc[:, name + "_bigram"] = [
            ngram.getBigram(words, join_str) for words in df[name + "_unigram"]
        ]
        df.loc[:, name + "_trigram"] = [
            ngram.getTrigram(words, join_str) for words in df[name + "_unigram"]
        ]

    ## jaccard coef/dice dist of n-gram
    print("generate jaccard coef and dice dist for n-gram")
    dists = ["jaccard_coef", "dice_dist"]
    grams = ["unigram", "bigram", "trigram"]
    for dist in dists:
        for gram in grams:
            for idx, target_name in enumerate(feat_names[:-1]):
                for obs_name in feat_names[idx + 1:]:
                    # Materialise the values: on Python 3 a bare ``map`` is a
                    # lazy iterator, not a list of per-row results.
                    df["%s_of_%s_between_%s_%s" % (dist, gram, target_name, obs_name)] = [
                        compute_dist(target, obs, dist=dist)
                        for target, obs in zip(df[target_name + "_" + gram],
                                               df[obs_name + "_" + gram])
                    ]
def extract_feat(df):
    """Add unigram, bigram and co-occurrence term columns for two questions.

    Unigrams come from ``preprocess_data`` and bigrams from
    ``ngram.getBigram`` (join string ``"_"``).  The four co-occurrence
    columns pair the question1 unigrams/bigrams with the question2
    unigrams/bigrams through ``cooccurrence_terms``, called with its default
    join string.

    Args:
        df: DataFrame with ``question1`` and ``question2`` columns.
            Modified in place; nothing is returned.
    """
    join_str = "_"

    # Single-argument print calls behave identically on Python 2 and 3.
    print("generate ngrams")
    for name in ("question1", "question2"):
        print("generate ngrams for %s" % name)
        df.loc[:, name + "_unigram"] = list(map(preprocess_data, df[name]))
        df.loc[:, name + "_bigram"] = [
            ngram.getBigram(words, join_str) for words in df[name + "_unigram"]
        ]

    ## cooccurrence terms
    print("generate coocurance terms")
    for left_col in ("question1_unigram", "question1_bigram"):
        for right_col in ("question2_unigram", "question2_bigram"):
            # ``list(...)`` keeps this correct on Python 3, where ``map``
            # returns a lazy iterator.
            df[left_col + "_" + right_col] = list(
                map(cooccurrence_terms, df[left_col], df[right_col]))
def __iter__(self):
    """Yield the n-gram token list of each row's description.

    Rows are read from the tab-separated file ``self.tsvFile``.  The words
    of the ``description`` field (as returned by ``getWords``) are mapped to
    their ``self.wordIndex`` entries, converted to strings, and yielded as
    unigrams when ``self.ngram == 1`` or ``"_"``-joined bigrams when
    ``self.ngram == 2``; for any other value nothing is yielded.
    ``self.counter`` is incremented per row and progress is printed every
    100000 rows.
    """
    if str is bytes:
        # Python 2: the csv module consumes byte strings.
        handle = open(self.tsvFile, "rb")
    else:
        # Python 3: the csv module requires a text-mode file.
        handle = open(self.tsvFile, "r", encoding="utf-8", newline="")
    # The ``with`` block closes the file once iteration finishes or the
    # generator is discarded.
    with handle:
        for item in DictReader(handle, delimiter='\t', quotechar='"'):
            self.counter += 1
            # Decode byte values only; text values are kept as they are.
            item = {
                name: value.decode('utf-8') if isinstance(value, bytes) else value
                for name, value in item.items()
                if value is not None
            }
            description = [
                str(self.wordIndex[w]) for w in getWords(item["description"])
            ]
            if self.ngram == 1:
                yield getUnigram(description)
            elif self.ngram == 2:
                yield getBigram(description, "_")
            if self.counter % 100000 == 0:
                print(" Process %s" % self.counter)
def __iter__(self):
    """Yield the n-gram token list of each row's description.

    Rows are read from the tab-separated file ``self.tsvFile``.  The words
    of the ``description`` field (as returned by ``getWords``) are mapped to
    their ``self.wordIndex`` entries, converted to strings, and yielded as
    unigrams when ``self.ngram == 1`` or ``"_"``-joined bigrams when
    ``self.ngram == 2``; for any other value nothing is yielded.
    ``self.counter`` is incremented per row and progress is printed every
    100000 rows.
    """
    if str is bytes:
        # Python 2: the csv module consumes byte strings.
        source = open(self.tsvFile, "rb")
    else:
        # Python 3: the csv module requires a text-mode file.
        source = open(self.tsvFile, "r", encoding="utf-8", newline="")
    # The ``with`` block closes the file once iteration finishes or the
    # generator is discarded.
    with source:
        reader = DictReader(source, delimiter='\t', quotechar='"')
        for item in reader:
            self.counter += 1
            # ``items()`` exists on both Python 2 and 3 dictionaries; only
            # byte values need decoding.
            item = {
                featureName: (featureValue.decode('utf-8')
                              if isinstance(featureValue, bytes) else featureValue)
                for featureName, featureValue in item.items()
                if featureValue is not None
            }
            description = [
                str(self.wordIndex[w]) for w in getWords(item["description"])
            ]
            if self.ngram == 1:
                yield getUnigram(description)
            elif self.ngram == 2:
                yield getBigram(description, "_")
            if self.counter % 100000 == 0:
                print(" Process %s" % self.counter)
def prepare_bigram(path, out):
    """Write space-joined bigrams of the two stemmed question columns.

    Each row of ``path`` is read with ``DictReader``.  The
    ``question1_porter`` and ``question2_porter`` fields are lower-cased,
    passed through ``remove_punctuation``, split on single spaces and handed
    to ``getBigram``; the bigrams are joined with spaces and written as one
    comma-separated line of ``out``.

    Args:
        path: CSV file to read.
        out: CSV file to (over)write; it starts with a fixed header line.
    """
    # Single-argument print calls behave identically on Python 2 and 3.
    print(path)
    start = datetime.now()
    # Both handles are owned by the ``with`` statement so the input file is
    # closed too.  The input is opened first so that a missing input does
    # not truncate an existing output file.
    with open(path) as infile, open(out, 'w') as outfile:
        outfile.write('question1_bigram,question2_bigram\n')
        for c, row in enumerate(DictReader(infile, delimiter=',')):
            if c % 100000 == 0:
                print('finished %s' % c)
            bigrams = []
            for column in ('question1_porter', 'question2_porter'):
                # The text is lower-cased before punctuation removal, so no
                # second lower-casing pass is needed afterwards.
                tokens = remove_punctuation(str(row[column]).lower()).split(' ')
                bigrams.append(' '.join(getBigram(tokens)))
            outfile.write('%s,%s\n' % tuple(bigrams))
    end = datetime.now()
    print('times: %s' % (end - start))
def str_common_word_ngram(str1, str2, n):
    """Count the word n-grams of ``str1`` that occur as substrings of ``str2``.

    Args:
        str1: Text whose whitespace-separated words form the n-grams.
        str2: Text that is searched for each n-gram.
        n: N-gram size, 1 to 4.  Sizes 2-4 are built by the ``ngram`` helpers
            with a single space as join string.

    Returns:
        The number of n-grams of ``str1`` found in ``str2`` (duplicates are
        counted each time).  For any other ``n`` a message is printed and 0
        is returned.
    """
    words = str1.split()
    if n == 1:
        # Each individual word is looked up, not the whole of ``str1``.
        grams = words
    elif n == 2:
        grams = ngram.getBigram(words, " ")
    elif n == 3:
        grams = ngram.getTrigram(words, " ")
    elif n == 4:
        grams = ngram.getFourgram(words, " ")
    else:
        print("Incorrect n value entered:", n)
        return 0
    # NOTE(review): what is counted when ``str1`` has fewer than ``n`` words
    # depends on what the ``ngram`` helpers return for short inputs - confirm.
    return sum(1 for gram in grams if gram in str2)
def _get_ngram(self, sr):
    """Build unigram, bigram and trigram series from a text series.

    Every element of ``sr`` is turned into a unigram list via
    ``self._stem_excl_words``; the bigram and trigram series are then
    derived from that unigram series with ``'_'`` as join string.

    Args:
        sr(pd.Series): series of texts.

    Returns:
        sr_unigram(pd.Series), sr_bigram(pd.Series), sr_trigram(pd.Series)
    """
    def to_unigram(text):
        return list(self._stem_excl_words(text))

    def to_bigram(tokens):
        return ngram.getBigram(tokens, '_')

    def to_trigram(tokens):
        return ngram.getTrigram(tokens, '_')

    sr_unigram = sr.map(to_unigram)
    # Bigrams first, then trigrams; both read the unigram series.
    return sr_unigram, sr_unigram.map(to_bigram), sr_unigram.map(to_trigram)
def test():
    """Print the n-grams of the first records of every pickled text column.

    For each of the ``train``/``test`` categories and each of the ``query``,
    ``title`` and ``description`` columns, the matching pickle under
    ``./ModelSystem/ProcessedData`` is loaded and the unigrams, bigrams and
    trigrams of at most its first two records are printed.
    """
    dataPath = "./ModelSystem/ProcessedData"
    columnNames = ["query", "title", "description"]
    catagories = ["train", "test"]
    for cata in catagories:
        for col in columnNames:
            path = "%s/%s_%s.pickle" % (dataPath, col, cata)
            # NOTE: unpickling can execute arbitrary code; only load files
            # produced by this project.
            with open(path, "rb") as f:
                records = pickle.load(f)
            # Build 1-, 2- and 3-grams for up to two records; shorter inputs
            # are handled instead of raising IndexError.
            for i in range(min(2, len(records))):
                # Replace everything except alphanumerics and '.' by a space.
                text = re.sub("[^0-9a-zA-Z.]", " ", records[i])
                wordList = text.split()
                print(wordList)
                print(ngram.getBigram(wordList, "_"))
                print(ngram.getTrigram(wordList, "_"))
    print("ngram All Done.")
def extract_feat(df):
    """Add counting, intersection and position features for two questions.

    The function works in place on ``df`` and returns nothing.  It adds, in
    this order:

    * unigram/bigram/trigram columns for ``question1`` and ``question2``;
    * per-field n-gram counts, unique counts and their ratio, plus the count
      and ratio of digit tokens among the unigrams;
    * for each n-gram size, the number (and ratio) of one question's n-grams
      that appear in the other question, and two derived ratios;
    * min/mean/median/max/std of the positions returned by
      ``get_position_list`` and the same values divided by the n-gram count.

    All ratios are computed with ``try_divide``.
    """
    join_str = "_"
    feat_names = ["question1", "question2"]
    grams = ["unigram", "bigram", "trigram"]

    # Single-argument print calls behave identically on Python 2 and 3, and
    # every ``map`` result is wrapped in ``list`` because Python 3 ``map``
    # returns a lazy iterator.
    print("generate ngrams")
    for feat_name in feat_names:
        print("generate ngrams for %s" % feat_name)
        df.loc[:, feat_name + "_unigram"] = list(map(preprocess_data, df[feat_name]))
        df.loc[:, feat_name + "_bigram"] = [
            ngram.getBigram(words, join_str) for words in df[feat_name + "_unigram"]
        ]
        df.loc[:, feat_name + "_trigram"] = [
            ngram.getTrigram(words, join_str) for words in df[feat_name + "_unigram"]
        ]

    ################################
    ## word count and digit count ##
    ################################
    print("generate word counting features")

    def count_digit(words):
        return sum([1. for w in words if w.isdigit()])

    for feat_name in feat_names:
        for gram in grams:
            ## word count
            tokens = df[feat_name + "_" + gram]
            count_col = "count_of_%s_%s" % (feat_name, gram)
            unique_col = "count_of_unique_%s_%s" % (feat_name, gram)
            df[count_col] = [len(x) for x in tokens]
            df[unique_col] = [len(set(x)) for x in tokens]
            df["ratio_of_unique_%s_%s" % (feat_name, gram)] = list(
                map(try_divide, df[unique_col], df[count_col]))

        ## digit count
        digit_col = "count_of_digit_in_%s" % feat_name
        df[digit_col] = list(map(count_digit, df[feat_name + "_unigram"]))
        df["ratio_of_digit_in_%s" % feat_name] = list(
            map(try_divide, df[digit_col], df["count_of_%s_unigram" % feat_name]))

    ##############################
    ## intersect word count ##
    ##############################
    print("generate intersect word counting features")

    def word_count_intersect_questions(obs, target):
        # Build the lookup set once per pair instead of scanning the target
        # list for every observed n-gram.
        target_set = set(target)
        return sum(1 for w in obs if w in target_set)

    for gram in grams:
        for obs_name in feat_names:
            for target_name in feat_names:
                if target_name != obs_name:
                    count_col = "count_of_%s_%s_in_%s" % (obs_name, gram, target_name)
                    df[count_col] = list(
                        map(word_count_intersect_questions,
                            df[obs_name + "_" + gram],
                            df[target_name + "_" + gram]))
                    df["ratio_of_%s_%s_in_%s" % (obs_name, gram, target_name)] = list(
                        map(try_divide, df[count_col],
                            df["count_of_%s_%s" % (obs_name, gram)]))

        ## some other feat: needs both intersection counts of this n-gram
        ## size, so it runs after the pair loops above.
        q2_in_q1 = df["count_of_question2_%s_in_question1" % gram]
        df["question2_%s_in_question1_div_question1_%s" % (gram, gram)] = list(
            map(try_divide, q2_in_q1, df["count_of_question1_%s" % gram]))
        df["question2_%s_in_question1_div_question1_%s_in_question2" % (gram, gram)] = list(
            map(try_divide, q2_in_q1, df["count_of_question1_%s_in_question2" % gram]))

    ######################################
    ## intersect word position feat ##
    ######################################
    print("generate intersect word position features")
    stats = [
        ("min", np.min),
        ("mean", np.mean),
        ("median", np.median),
        ("max", np.max),
        ("std", np.std),
    ]
    for gram in grams:
        for target_name in feat_names:
            for obs_name in feat_names:
                if target_name != obs_name:
                    pos = list(
                        map(get_position_list,
                            df[obs_name + "_" + gram],
                            df[target_name + "_" + gram]))
                    prefix = "pos_of_%s_%s_in_%s" % (obs_name, gram, target_name)
                    ## stats feat on pos
                    for stat_name, stat_func in stats:
                        df["%s_%s" % (prefix, stat_name)] = list(map(stat_func, pos))
                    ## stats feat on normalized_pos
                    for stat_name, _ in stats:
                        df["normalized_%s_%s" % (prefix, stat_name)] = list(
                            map(try_divide,
                                df["%s_%s" % (prefix, stat_name)],
                                df["count_of_%s_%s" % (obs_name, gram)]))
def process(read=False):
    """Prepare the n-gram data frame and run all feature generators.

    When ``read`` is false, ``data.csv`` is loaded, rows without a ``Body``
    are dropped, the data is split 75/25 into train and test parts (written
    to the ``gdbt_*`` CSV files), the train part gets a numeric ``target``
    column, both parts are concatenated (the test rows have no target), and
    unigram/bigram/trigram columns are generated for ``Headline`` and
    ``articleBody``.  The result is pickled to ``data.pkl``.  When ``read``
    is true, that pickle is loaded instead.  Finally every feature generator
    processes the data and reads back its train and test features.

    Args:
        read: Load the previously saved ``data.pkl`` instead of rebuilding
            it.  Defaults to ``False``, i.e. rebuild.
    """
    if not read:
        dataset = pd.read_csv('data.csv')
        dataset = dataset[pd.notnull(dataset['Body'])]
        dataset.columns = ['URLs', 'Headline', 'articleBody', 'Stance']
        X_data = dataset.iloc[:, 1:3]
        Y_data = dataset.iloc[:, 3]

        # ``sklearn.cross_validation`` was removed from scikit-learn; prefer
        # its replacement and fall back only on very old installations.
        try:
            from sklearn.model_selection import train_test_split
        except ImportError:
            from sklearn.cross_validation import train_test_split
        X_train, X_test, Y_train, Y_test = train_test_split(
            X_data, Y_data, test_size=0.25, random_state=0)

        train = pd.concat([X_train, Y_train], axis=1)
        train.to_csv('gdbt_training_input.csv', index=False)
        X_test.to_csv('gdbt_testing_input.csv', index=False)
        Y_test = pd.DataFrame(Y_test)
        Y_test.to_csv('gdbt_testing_ouput.csv', index=False)

        targets = ['Fake', 'Real']
        targets_dict = dict(zip(targets, range(len(targets))))
        # A list (not a lazy ``map`` object) so that every row gets its own
        # label on Python 3 as well; unknown stances still raise KeyError.
        train['target'] = [targets_dict[x] for x in train['Stance']]

        data = train
        # The test part has no 'Stance' column, so its target is NaN after
        # the concatenation.
        test_flag = True
        if test_flag:
            data = pd.concat((train, X_test))
            print('data.shape:')
            print(data.shape)

            train = data[~data['target'].isnull()]
            print(train)
            print('train.shape:')
            print(train.shape)

            test = data[data['target'].isnull()]
            print(test)
            print('test.shape:')
            print(test.shape)

        print("generate unigram")
        data["Headline_unigram"] = data["Headline"].map(preprocess_data)
        print(data.head())
        data["articleBody_unigram"] = data["articleBody"].map(preprocess_data)

        join_str = "_"
        print("generate bigram")
        data["Headline_bigram"] = data["Headline_unigram"].map(
            lambda x: ngram.getBigram(x, join_str))
        data["articleBody_bigram"] = data["articleBody_unigram"].map(
            lambda x: ngram.getBigram(x, join_str))

        print("generate trigram")
        data["Headline_trigram"] = data["Headline_unigram"].map(
            lambda x: ngram.getTrigram(x, join_str))
        data["articleBody_trigram"] = data["articleBody_unigram"].map(
            lambda x: ngram.getTrigram(x, join_str))

        with open('data.pkl', 'wb') as outfile:
            pickle.dump(data, outfile, -1)
        print('dataframe saved in data.pkl')
    else:
        # NOTE: unpickling can execute arbitrary code; only load a
        # ``data.pkl`` written by this function.
        with open('data.pkl', 'rb') as infile:
            data = pickle.load(infile)
        print('data loaded')
        print('data.shape:')
        print(data.shape)

    # define feature generators
    countFG = CountFeatureGenerator()
    tfidfFG = TfidfFeatureGenerator()
    svdFG = SvdFeatureGenerator()
    word2vecFG = Word2VecFeatureGenerator()
    sentiFG = SentimentFeatureGenerator()
    generators = [countFG, tfidfFG, svdFG, word2vecFG, sentiFG]

    for g in generators:
        g.process(data)

    for g in generators:
        g.read('train')

    for g in generators:
        g.read('test')

    print('done')
def extract_feat(df):
    """Add n-gram counting features to ``df`` in place.

    Token columns (unigram, bigram, trigram) are built for the query, the
    product title and the product description.  From them the function
    derives, per text and per gram size: token counts, unique-token counts
    and ratios, digit counts (unigrams only), pairwise intersect counts and
    ratios, and min/mean/median/max/std of the values returned by
    ``get_position_list`` together with their count-normalised versions.

    Args:
        df: DataFrame with "query", "product_title" and
            "product_description" columns.  New columns are added in place.

    Returns:
        None.
    """
    join_str = "_"
    feat_names = ["query", "title", "description"]
    grams = ["unigram", "bigram", "trigram"]
    raw_columns = {
        "query": "query",
        "title": "product_title",
        "description": "product_description",
    }

    ## unigram
    print("generate unigram")
    for feat_name in feat_names:
        raw = raw_columns[feat_name]
        df[feat_name + "_unigram"] = list(
            df.apply(lambda x: preprocess_data(x[raw]), axis=1))

    ## bigram
    print("generate bigram")
    for feat_name in feat_names:
        df[feat_name + "_bigram"] = list(
            df.apply(lambda x: ngram.getBigram(x[feat_name + "_unigram"], join_str), axis=1))

    ## trigram
    print("generate trigram")
    for feat_name in feat_names:
        df[feat_name + "_trigram"] = list(
            df.apply(lambda x: ngram.getTrigram(x[feat_name + "_unigram"], join_str), axis=1))

    ################################
    ## word count and digit count ##
    ################################
    print("generate word counting features")

    def count_digit(tokens):
        # Float count of tokens made only of digits.
        return sum([1. for w in tokens if w.isdigit()])

    for feat_name in feat_names:
        for gram in grams:
            col = feat_name + "_" + gram
            ## word count
            df["count_of_%s_%s" % (feat_name, gram)] = list(
                df.apply(lambda x: len(x[col]), axis=1))
            df["count_of_unique_%s_%s" % (feat_name, gram)] = list(
                df.apply(lambda x: len(set(x[col])), axis=1))
            # list(...) keeps the assignment a concrete sequence on
            # Python 3, where map() returns a lazy iterator.
            df["ratio_of_unique_%s_%s" % (feat_name, gram)] = list(map(
                try_divide,
                df["count_of_unique_%s_%s" % (feat_name, gram)],
                df["count_of_%s_%s" % (feat_name, gram)]))

        ## digit count
        df["count_of_digit_in_%s" % feat_name] = list(
            df.apply(lambda x: count_digit(x[feat_name + "_unigram"]), axis=1))
        df["ratio_of_digit_in_%s" % feat_name] = list(map(
            try_divide,
            df["count_of_digit_in_%s" % feat_name],
            df["count_of_%s_unigram" % feat_name]))

    ## description missing indicator
    # NOTE(review): this compares the unigram value to "" -- if
    # preprocess_data returns a token list the flag can never be 1; confirm.
    df["description_missing"] = list(
        df.apply(lambda x: int(x["description_unigram"] == ""), axis=1))

    ##############################
    ## intersect word count ##
    ##############################
    print("generate intersect word counting features")

    def count_shared(row, obs_col, target_col):
        # Build the lookup set once per row instead of once per token.
        target_tokens = set(row[target_col])
        return sum([1. for w in row[obs_col] if w in target_tokens])

    for gram in grams:
        for obs_name in feat_names:
            for target_name in feat_names:
                if target_name == obs_name:
                    continue
                obs_col = obs_name + "_" + gram
                target_col = target_name + "_" + gram
                count_col = "count_of_%s_%s_in_%s" % (obs_name, gram, target_name)
                df[count_col] = list(
                    df.apply(lambda x: count_shared(x, obs_col, target_col), axis=1))
                df["ratio_of_%s_%s_in_%s" % (obs_name, gram, target_name)] = list(map(
                    try_divide, df[count_col], df["count_of_%s_%s" % (obs_name, gram)]))

        ## some other feat
        # These need every obs/target pair of this gram, so they are
        # computed after the pair loops above.
        df["title_%s_in_query_div_query_%s" % (gram, gram)] = list(map(
            try_divide,
            df["count_of_title_%s_in_query" % gram],
            df["count_of_query_%s" % gram]))
        df["title_%s_in_query_div_query_%s_in_title" % (gram, gram)] = list(map(
            try_divide,
            df["count_of_title_%s_in_query" % gram],
            df["count_of_query_%s_in_title" % gram]))
        df["description_%s_in_query_div_query_%s" % (gram, gram)] = list(map(
            try_divide,
            df["count_of_description_%s_in_query" % gram],
            df["count_of_query_%s" % gram]))
        df["description_%s_in_query_div_query_%s_in_description" % (gram, gram)] = list(map(
            try_divide,
            df["count_of_description_%s_in_query" % gram],
            df["count_of_query_%s_in_description" % gram]))

    ######################################
    ## intersect word position feat ##
    ######################################
    print("generate intersect word position features")
    stats = [("min", np.min), ("mean", np.mean), ("median", np.median),
             ("max", np.max), ("std", np.std)]
    for gram in grams:
        for target_name in feat_names:
            for obs_name in feat_names:
                if target_name == obs_name:
                    continue
                pos = list(df.apply(
                    lambda x: get_position_list(x[target_name + "_" + gram],
                                                obs=x[obs_name + "_" + gram]),
                    axis=1))
                prefix = "pos_of_%s_%s_in_%s" % (obs_name, gram, target_name)
                ## stats feat on pos
                for stat_name, stat_fn in stats:
                    df["%s_%s" % (prefix, stat_name)] = list(map(stat_fn, pos))
                ## stats feat on normalized_pos
                for stat_name, _ in stats:
                    df["normalized_%s_%s" % (prefix, stat_name)] = list(map(
                        try_divide,
                        df["%s_%s" % (prefix, stat_name)],
                        df["count_of_%s_%s" % (obs_name, gram)]))
def getVWFile(tsvFile, vwFile, dataIndex, dictionary, tfidf_model, train=True):
    """Generate the overall features in VW format.

    Reads a tab-separated item file and writes one Vowpal Wabbit line per
    item, made of namespaces for category, subcategory, title, description
    (including tf-idf features), attributes, price and contact counts.

    Args:
        tsvFile: Path of the input TSV file (UTF-8, tab separated, '"' quoted).
        vwFile: Path of the VW file to write.
        dataIndex: Dict holding the "catIndex", "subCatIndex",
            "attrsKeyIndex", "attrsValIndex" and "wordIndex" lookup tables.
        dictionary: Indexable by 1 and 2; passed to ``getTfidfFeat`` for the
            unigram and bigram features respectively.
        tfidf_model: Indexable by 1 and 2; passed alongside ``dictionary``.
        train: When true the label is read from "is_blocked"; otherwise a
            constant label of 1 is written.

    Returns:
        None.
    """
    start = datetime.now()

    # extract all the index
    catIndex = dataIndex["catIndex"]
    subCatIndex = dataIndex["subCatIndex"]
    attrsKeyIndex = dataIndex["attrsKeyIndex"]
    attrsValIndex = dataIndex["attrsValIndex"]
    wordIndex = dataIndex["wordIndex"]

    # On Python 3 the csv module needs a text-mode file opened with
    # newline="" and yields str, so both files are opened as UTF-8 text and
    # no per-field decode is needed.  newline="\n" keeps the written line
    # endings identical on every platform.
    with open(vwFile, "w", encoding="utf-8", newline="\n") as vwWriter, \
            open(tsvFile, "r", encoding="utf-8", newline="") as tsvReader:
        itemReader = DictReader(tsvReader, delimiter='\t', quotechar='"')
        for i, item in enumerate(itemReader):
            # DictReader fills missing trailing fields with None; drop them
            # so that '"attrs" in item' means the field is really present.
            item = {featureName: featureValue
                    for featureName, featureValue in item.items()
                    if featureValue is not None}

            # get header: label 0/1 is mapped to -1/+1
            itemid = int(item["itemid"])
            label = int(item["is_blocked"]) if train else 1
            header = "%s '%s " % (int(2*label - 1), itemid)

            # category
            categoryFeat = "|C %s " % catIndex[item["category"]]
            # subcategory
            subcategoryFeat = "|SC %s " % subCatIndex[item["subcategory"]]

            # title
            title = [str(wordIndex[w]) for w in getWords(item["title"])]
            # first-gram / end-gram, "0" when the title has no words
            title_start = title[0] if title else "0"
            title_end = title[-1] if title else "0"
            titleFeat = "|T %s |bT %s |cT %s " % (" ".join(title), title_start, title_end)
            titleStatsFeat = "|t %s " % getTextStatsFeat(item["title"])

            # description
            description = [str(wordIndex[w]) for w in getWords(item["description"])]
            description_start = description[0] if description else "0"
            description_end = description[-1] if description else "0"
            descriptionFeat = "|D %s |fD %s |gD %s " % (
                " ".join(description), description_start, description_end)
            descriptionStatsFeat = "|d %s " % getTextStatsFeat(item["description"])
            tfidf_feat1 = getTfidfFeat(getUnigram(description), dictionary[1], tfidf_model[1])
            # 2gram tfidf seems to harm the performance, it is safe to drop it here
            tfidf_feat2 = getTfidfFeat(getBigram(description, "_"), dictionary[2], tfidf_model[2])
            descriptionFeat += "|iD %s |jD %s " % (tfidf_feat1, tfidf_feat2)

            # attrs
            attrsParts = []
            countAttrs = 0
            if "attrs" in item:
                attrsDict = getAttrsDict(item["attrs"])
                # one namespace per key/value pair ...
                for k, v in attrsDict.items():
                    countAttrs += 1
                    attrsParts.append("|A%s %s " % (attrsKeyIndex[k], attrsValIndex[v]))
                # ... followed by a namespace listing all keys
                attrsParts.append("|a ")
                for k in attrsDict:
                    attrsParts.append("%s " % (attrsKeyIndex[k]))
            if not attrsParts:
                attrsParts.append("|NA 1 ")
            attrsParts.append("|hAC %s " % countAttrs)
            attrsFeat = "".join(attrsParts)

            # price
            priceFeat = "|P %s " % item["price"]
            # phones_cnt
            phonesCntFeat = "|p %s " % item["phones_cnt"]
            # emails_cnt
            emailsCntFeat = "|e %s " % item["emails_cnt"]
            # urls_cnt
            urlsCntFeat = "|u %s " % item["urls_cnt"]

            # output; the trailing space of the last namespace is stripped
            vwLine = "".join([
                header,
                categoryFeat,
                subcategoryFeat,
                titleFeat,
                titleStatsFeat,
                descriptionFeat,
                descriptionStatsFeat,
                attrsFeat,
                priceFeat,
                phonesCntFeat,
                emailsCntFeat,
                urlsCntFeat[:-1],
                "\n",
            ])
            vwWriter.write(vwLine)

            # report progress
            if (i+1) % 10000 == 0:
                print("\n%s\t%s" % ((i+1), str(datetime.now() - start)))
                print("Sample output:\n%s" % vwLine)
def process():
    """Build the n-gram DataFrame and run every feature generator on it.

    With ``read`` false (the hard-coded setting) the processed train and
    test CSVs are merged on "Body ID", stances are mapped to integer
    targets, uni/bi/tri-gram columns are added for "Headline" and
    "articleBody", and the frame is pickled to ``data.pkl``.  With ``read``
    true the pickled frame is loaded instead.  Each generator then
    processes the frame and reads back its 'train' features.
    """
    read = False
    if read:
        with open('data.pkl', 'rb') as infile:
            data = cPickle.load(infile)
            print('data loaded')
            print('data.shape:')
            print(data.shape)
    else:
        bodies = pd.read_csv("train_bodies_processed.csv", encoding='utf-8')
        stances = pd.read_csv("train_stances_processed.csv", encoding='utf-8')

        # training set
        train = pd.merge(stances, bodies, how='left', on='Body ID')
        targets = ['agree', 'disagree', 'discuss', 'unrelated']
        target_ids = {name: idx for idx, name in enumerate(targets)}
        train['target'] = [target_ids[stance] for stance in train['Stance']]
        print('train.shape:')
        print(train.shape)

        data = train
        # read test set, no 'Stance' column in test set -> target = NULL
        # concatenate training and test set
        test_flag = True
        if test_flag:
            test_bodies = pd.read_csv("test_bodies_processed.csv", encoding='utf-8')
            test_headlines = pd.read_csv("test_stances_unlabeled.csv", encoding='utf-8')
            test = pd.merge(test_headlines, test_bodies, how="left", on="Body ID")

            data = pd.concat((train, test))  # target = NaN for test set
            print(data)
            print('data.shape:')
            print(data.shape)

            is_test = data['target'].isnull()
            train = data[~is_test]
            print(train)
            print('train.shape:')
            print(train.shape)

            test = data[is_test]
            print(test)
            print('test.shape:')
            print(test.shape)

        text_columns = ("Headline", "articleBody")

        print("generate unigram")
        for name in text_columns:
            data[name + "_unigram"] = data[name].map(lambda x: preprocess_data(x))

        print("generate bigram")
        join_str = "_"
        for name in text_columns:
            data[name + "_bigram"] = data[name + "_unigram"].map(
                lambda x: ngram.getBigram(x, join_str))

        print("generate trigram")
        for name in text_columns:
            data[name + "_trigram"] = data[name + "_unigram"].map(
                lambda x: ngram.getTrigram(x, join_str))

        with open('data.pkl', 'wb') as outfile:
            cPickle.dump(data, outfile, -1)
            print('dataframe saved in data.pkl')

    # define feature generators
    generators = [
        CountFeatureGenerator(),
        TfidfFeatureGenerator(),
        SvdFeatureGenerator(),
        Word2VecFeatureGenerator(),
        SentimentFeatureGenerator(),
    ]

    for generator in generators:
        generator.process(data)

    # only the 'train' features are read back here
    for generator in generators:
        generator.read('train')

    print('done')
def getVWFile(tsvFile, vwFile, dataIndex, dictionary, tfidf_model, train=True):
    """Generate the overall features in VW format.

    Reads ``tsvFile`` in binary mode, decodes every field from UTF-8 and
    writes one Vowpal Wabbit line per item to ``vwFile``.  The line is made
    of namespaces for category, subcategory, title, description (with
    tf-idf features), attributes, price and the phone/email/url counts.

    ``dataIndex`` supplies the "catIndex", "subCatIndex", "attrsKeyIndex",
    "attrsValIndex" and "wordIndex" lookup tables.  ``dictionary`` and
    ``tfidf_model`` are indexed by 1 (unigram) and 2 (bigram).  When
    ``train`` is false the label is fixed to 1 instead of being read from
    "is_blocked".
    """
    started_at = datetime.now()

    # extract all the index
    catIndex = dataIndex["catIndex"]
    subCatIndex = dataIndex["subCatIndex"]
    attrsKeyIndex = dataIndex["attrsKeyIndex"]
    attrsValIndex = dataIndex["attrsValIndex"]
    wordIndex = dataIndex["wordIndex"]

    def edge_tokens(tokens):
        # First and last token, or "0" for both when there are none.
        if len(tokens) > 0:
            return tokens[0], tokens[-1]
        return "0", "0"

    with open(vwFile, "wb") as vwWriter:
        with open(tsvFile, "rb") as tsvReader:
            rows = DictReader(tsvReader, delimiter='\t', quotechar='"')
            for i, raw_item in enumerate(rows):
                # fields the reader could not fill are None and are dropped
                item = {}
                for featureName, featureValue in raw_item.items():
                    if featureValue is not None:
                        item[featureName] = featureValue.decode('utf-8')

                # header: label 0/1 becomes -1/+1
                itemid = int(item["itemid"])
                label = int(item["is_blocked"]) if train else 1
                pieces = ["%s '%s " % (int(2*label - 1), itemid)]

                # category and subcategory
                pieces.append("|C %s " % catIndex[item["category"]])
                pieces.append("|SC %s " % subCatIndex[item["subcategory"]])

                # title words, first-gram and end-gram
                title = [str(wordIndex[w]) for w in getWords(item["title"])]
                title_start, title_end = edge_tokens(title)
                pieces.append("|T %s |bT %s |cT %s " % (" ".join(title), title_start, title_end))
                pieces.append("|t %s " % getTextStatsFeat(item["title"]))

                # description words, first-gram and end-gram
                description = [str(wordIndex[w]) for w in getWords(item["description"])]
                description_start, description_end = edge_tokens(description)
                descriptionStatsFeat = "|d %s " % getTextStatsFeat(item["description"])
                tfidf_feat1 = getTfidfFeat(getUnigram(description), dictionary[1], tfidf_model[1])
                # 2gram tfidf seems to harm the performance, it is safe to drop it here
                tfidf_feat2 = getTfidfFeat(getBigram(description, "_"), dictionary[2], tfidf_model[2])
                pieces.append("|D %s |fD %s |gD %s " % (
                    " ".join(description), description_start, description_end))
                pieces.append("|iD %s |jD %s " % (tfidf_feat1, tfidf_feat2))
                pieces.append(descriptionStatsFeat)

                # attrs: key/value namespaces, then the list of keys
                attr_pieces = []
                countAttrs = 0
                if "attrs" in item:
                    attrsDict = getAttrsDict(item["attrs"])
                    for k, v in attrsDict.items():
                        countAttrs += 1
                        attr_pieces.append("|A%s %s " % (attrsKeyIndex[k], attrsValIndex[v]))
                    attr_pieces.append("|a ")
                    for k, v in attrsDict.items():
                        attr_pieces.append("%s " % (attrsKeyIndex[k]))
                if not attr_pieces:
                    attr_pieces.append("|NA 1 ")
                attr_pieces.append("|hAC %s " % countAttrs)
                pieces.extend(attr_pieces)

                # price, phones_cnt, emails_cnt
                pieces.append("|P %s " % item["price"])
                pieces.append("|p %s " % item["phones_cnt"])
                pieces.append("|e %s " % item["emails_cnt"])
                # urls_cnt closes the line, so its trailing space is cut
                urlsCntFeat = "|u %s " % item["urls_cnt"]
                pieces.append(urlsCntFeat[:-1])
                pieces.append("\n")

                vwLine = "".join(pieces)
                vwWriter.write(vwLine)

                # report progress
                done = i + 1
                if done % 10000 == 0:
                    print("\n%s\t%s" % (done, str(datetime.now() - started_at)))
                    print("Sample output:\n%s" % vwLine)
def main():
    """Build unigram/bigram/trigram pickles for every column and split.

    For each of the "train" and "test" splits and each of the "query",
    "title" and "description" columns, the pickled sequence under
    ./ModelSystem/ProcessedData is loaded, every entry is split on
    whitespace, and the resulting unigram, bigram and trigram lists are
    pickled under ./ModelSystem/Features/ngram.  Tokens equal to "nan" are
    replaced by the empty string.
    """
    ###############
    ## Load Data ##
    ###############
    dataPath = "./ModelSystem/ProcessedData"
    columnNames = ["query", "title", "description"]
    catagories = ["train", "test"]

    for cata in catagories:
        for col in columnNames:
            source_path = "%s/%s_%s.pickle" % (dataPath, col, cata)
            with open(source_path, "rb") as f:
                records = pickle.load(f)

            # start building 1-, 2- and 3-grams
            output_unigram = []
            output_bigram = []
            output_trigram = []
            for row in range(len(records)):
                # entries are looked up by index, exactly as stored
                text = str(records[row])
                # (punctuation is not removed here, only whitespace split)
                wordList = text.split()
                unigram = wordList
                bigram = ngram.getBigram(wordList, "_")
                trigram = ngram.getTrigram(wordList, "_")

                # blank out "nan" tokens in place, unigram first
                for tokens in (unigram, bigram, trigram):
                    for pos, token in enumerate(tokens):
                        if token == "nan":
                            tokens[pos] = ""

                output_unigram.append(unigram)
                output_bigram.append(bigram)
                output_trigram.append(trigram)

            outputs = (
                ("./ModelSystem/Features/ngram/%s_unigram_%s.pickle", output_unigram),
                ("./ModelSystem/Features/ngram/%s_bigram_%s.pickle", output_bigram),
                ("./ModelSystem/Features/ngram/%s_trigram_%s.pickle", output_trigram),
            )
            for template, payload in outputs:
                with open(template % (col, cata), "wb") as f:
                    pickle.dump(payload, f)

            print("%s_ngram_%s Completed" % (col, cata))

    print("ngram All Done.")
def extract_feat(df):
    """Compute counting and position features and write them to CSV.

    Builds unigram/bigram/trigram columns for the query (stemmed), title,
    description and attribute values, derives count, unique-count, ratio
    and digit features, drops the raw text columns, adds statistics of the
    values returned by ``get_position_list`` for every pair of texts, drops
    the n-gram columns and finally writes the frame to
    ../../data/feat/test_countingfeat_part3.csv.  Nothing is returned.
    """
    join_str = "_"
    feat_names = ["query", "title", "description", "attribute_values"]
    grams = ["unigram", "bigram", "trigram"]

    # unigrams: only the query is preprocessed with stem=True
    df["query_unigram"] = list(df.apply(lambda x: preprocess_data(x["query"], stem=True), axis=1))
    for feat_name, raw in (("title", "title"),
                           ("description", "description"),
                           ("attribute_values", "values")):
        df[feat_name + "_unigram"] = list(df.apply(lambda x: preprocess_data(x[raw]), axis=1))

    # bigrams, then trigrams, both derived from the unigram columns
    for feat_name in feat_names:
        df[feat_name + "_bigram"] = list(
            df.apply(lambda x: ngram.getBigram(x[feat_name + "_unigram"], join_str), axis=1))
    for feat_name in feat_names:
        df[feat_name + "_trigram"] = list(
            df.apply(lambda x: ngram.getTrigram(x[feat_name + "_unigram"], join_str), axis=1))

    ################################
    ## word count and digit count ##
    ################################
    print("generate word counting features")

    def count_digit(tokens):
        return sum([1. for w in tokens if w.isdigit()])

    for feat_name in feat_names:
        for gram in grams:
            col = feat_name + "_" + gram
            total_col = "count_of_%s_%s" % (feat_name, gram)
            unique_col = "count_of_unique_%s_%s" % (feat_name, gram)
            ## word count
            df[total_col] = list(df.apply(lambda x: len(x[col]), axis=1))
            df[unique_col] = list(df.apply(lambda x: len(set(x[col])), axis=1))
            df["ratio_of_unique_%s_%s" % (feat_name, gram)] = list(
                map(try_divide, df[unique_col], df[total_col]))

        ## digit count
        digit_col = "count_of_digit_in_%s" % feat_name
        df[digit_col] = list(df.apply(lambda x: count_digit(x[feat_name + "_unigram"]), axis=1))
        df["ratio_of_digit_in_%s" % feat_name] = list(
            map(try_divide, df[digit_col], df["count_of_%s_unigram" % (feat_name)]))

    ## description missing indicator
    df["description_missing"] = list(
        df.apply(lambda x: int(x["description_unigram"] == ""), axis=1))

    # The intersect word count features are disabled in this variant.

    ######################################
    ## intersect word position feat ##
    ######################################
    print("dropping unigrams bigrams trigrams")
    df = df.drop(['query', 'description', 'title', 'values'], axis=1)

    print("generate intersect word position features")
    reducers = (("min", np.min), ("mean", np.mean), ("median", np.median),
                ("max", np.max), ("std", np.std))
    for gram in grams:
        for target_name in feat_names:
            for obs_name in feat_names:
                if target_name == obs_name:
                    continue
                pos = list(df.apply(
                    lambda x: get_position_list(x[target_name + "_" + gram],
                                                obs=x[obs_name + "_" + gram]),
                    axis=1))
                base = "pos_of_%s_%s_in_%s" % (obs_name, gram, target_name)
                ## stats feat on pos
                for suffix, reducer in reducers:
                    df["%s_%s" % (base, suffix)] = list(map(reducer, pos))
                ## stats feat on normalized_pos
                for suffix, _ in reducers:
                    df["normalized_%s_%s" % (base, suffix)] = list(map(
                        try_divide,
                        df["%s_%s" % (base, suffix)],
                        df["count_of_%s_%s" % (obs_name, gram)]))

    # the token-list columns are not written to the CSV
    ngram_columns = [name + "_" + gram for name in feat_names for gram in grams]
    df = df.drop(ngram_columns, axis=1)

    print("creating csv")
    df.to_csv("../../data/feat/test_countingfeat_part3.csv", header=True, index=False)
def extract_feat(df_all):
    """Add length, overlap, n-gram and distance features to ``df_all``.

    The frame is modified in place and also returned.  Added columns cover:
    word lengths of the text fields, common-word counts and ratios between
    the search term and each product field, ``cs`` scores, uni/bi/tri-gram
    token lists, co-occurrence terms, n-gram counts, intersect counts and
    ratios, statistics of the values returned by ``get_position_list``, and
    jaccard/dice distances on bigrams and trigrams.

    Args:
        df_all: DataFrame with the string columns "search_term",
            "product_title", "product_description", "product_attributes",
            "brand", "color" and "appl".

    Returns:
        The same DataFrame with the feature columns added.
    """
    raw_columns = {
        "query": "search_term",
        "title": "product_title",
        "description": "product_description",
    }

    for name, column in (("query", "search_term"),
                         ("title", "product_title"),
                         ("description", "product_description"),
                         ("brand", "brand")):
        df_all['len_of_' + name] = df_all[column].map(lambda x: len(x.split())).astype(np.int64)

    # All text fields are packed into one tab-separated column so that the
    # pairwise features below can be computed with a single Series.map.
    df_all['product_info'] = (df_all['search_term'] + "\t" + df_all['product_title'] + "\t"
                              + df_all['product_description'] + "\t"
                              + df_all['product_attributes'] + "\t" + df_all['brand'] + "\t"
                              + df_all['color'] + "\t" + df_all['appl'])

    def on_fields(func, left, right):
        # Apply ``func`` to two tab-separated fields, splitting only once.
        def apply(info):
            fields = info.split('\t')
            return func(fields[left], fields[right])
        return apply

    product_fields = ["title", "description", "attributes", "brand", "color", "appl"]
    for position, name in enumerate(product_fields, 1):
        df_all['word_in_' + name] = df_all['product_info'].map(
            on_fields(str_common_word, 0, position))
    for name in product_fields:
        df_all['ratio_' + name] = df_all['word_in_' + name] / df_all['len_of_query']

    df_all['cs_1'] = df_all['product_info'].map(on_fields(cs, 0, 1))
    df_all['cs_2'] = df_all['product_info'].map(on_fields(cs, 0, 2))
    df_all['cs_3'] = df_all['product_info'].map(on_fields(cs, 1, 2))

    feat_names = ["query", "title", "description"]
    grams = ["unigram", "bigram", "trigram"]

    print("generate unigram")
    for name in feat_names:
        column = raw_columns[name]
        df_all[name + "_unigram"] = list(
            df_all.apply(lambda x: x[column].lower().split(), axis=1))

    # NOTE(review): unigrams are lower-cased but bigrams and trigrams are
    # built from the original-case text; kept as is -- confirm intended.
    print("generate bigram")
    join_str = "_"
    for name in feat_names:
        column = raw_columns[name]
        df_all[name + "_bigram"] = list(
            df_all.apply(lambda x: ngram.getBigram(x[column].split(), join_str), axis=1))

    ## trigram
    print("generate trigram")
    for name in feat_names:
        column = raw_columns[name]
        df_all[name + "_trigram"] = list(
            df_all.apply(lambda x: ngram.getTrigram(x[column].split(), join_str), axis=1))

    # co-occurrence terms of query uni/bigrams with title/description uni/bigrams
    join_str = "X"
    for query_gram in ("unigram", "bigram"):
        for target_name in ("title", "description"):
            for target_gram in ("unigram", "bigram"):
                left = "query_" + query_gram
                right = target_name + "_" + target_gram
                df_all[left + "_" + right] = list(df_all.apply(
                    lambda x: cooccurrence_terms(x[left], x[right], join_str), axis=1))

    print("generate word counting features")
    for feat_name in feat_names:
        for gram in grams:
            col = feat_name + "_" + gram
            total_col = "count_of_%s_%s" % (feat_name, gram)
            unique_col = "count_of_unique_%s_%s" % (feat_name, gram)
            ## word count
            df_all[total_col] = list(df_all.apply(lambda x: len(x[col]), axis=1))
            df_all[unique_col] = list(df_all.apply(lambda x: len(set(x[col])), axis=1))
            # list(...) keeps the value concrete where map() is lazy
            df_all["ratio_of_unique_%s_%s" % (feat_name, gram)] = list(
                map(try_divide, df_all[unique_col], df_all[total_col]))

    print("generate intersect word counting features")

    def count_shared(row, obs_col, target_col):
        # Build the lookup set once per row instead of once per token.
        target_tokens = set(row[target_col])
        return sum([1. for w in row[obs_col] if w in target_tokens])

    for gram in grams:
        for obs_name in feat_names:
            for target_name in feat_names:
                if target_name == obs_name:
                    continue
                obs_col = obs_name + "_" + gram
                target_col = target_name + "_" + gram
                count_col = "count_of_%s_%s_in_%s" % (obs_name, gram, target_name)
                df_all[count_col] = list(
                    df_all.apply(lambda x: count_shared(x, obs_col, target_col), axis=1))
                df_all["ratio_of_%s_%s_in_%s" % (obs_name, gram, target_name)] = list(map(
                    try_divide, df_all[count_col], df_all["count_of_%s_%s" % (obs_name, gram)]))

        ## some other feat (needs all pairs of this gram, hence after the loops)
        df_all["title_%s_in_query_div_query_%s" % (gram, gram)] = list(map(
            try_divide,
            df_all["count_of_title_%s_in_query" % gram],
            df_all["count_of_query_%s" % gram]))
        df_all["title_%s_in_query_div_query_%s_in_title" % (gram, gram)] = list(map(
            try_divide,
            df_all["count_of_title_%s_in_query" % gram],
            df_all["count_of_query_%s_in_title" % gram]))

    print("generate intersect word position features")
    stats = [("min", np.min), ("mean", np.mean), ("median", np.median),
             ("max", np.max), ("std", np.std)]
    for gram in grams:
        for target_name in feat_names:
            for obs_name in feat_names:
                if target_name == obs_name:
                    continue
                pos = list(df_all.apply(
                    lambda x: get_position_list(x[target_name + "_" + gram],
                                                obs=x[obs_name + "_" + gram]),
                    axis=1))
                prefix = "pos_of_%s_%s_in_%s" % (obs_name, gram, target_name)
                ## stats feat on pos
                for stat_name, stat_fn in stats:
                    df_all["%s_%s" % (prefix, stat_name)] = list(map(stat_fn, pos))
                ## stats feat on normalized_pos
                for stat_name, _ in stats:
                    df_all["normalized_%s_%s" % (prefix, stat_name)] = list(map(
                        try_divide,
                        df_all["%s_%s" % (prefix, stat_name)],
                        df_all["count_of_%s_%s" % (obs_name, gram)]))

    print("generate jaccard coef and dice dist for n-gram")
    dists = ["jaccard_coef", "dice_dist"]
    grams = ["bigram", "trigram"]
    for dist in dists:
        for gram in grams:
            # every unordered pair (target, obs) with target listed first
            for i, target_name in enumerate(feat_names[:-1]):
                for obs_name in feat_names[i+1:]:
                    df_all["%s_of_%s_between_%s_%s" % (dist, gram, target_name, obs_name)] = \
                        list(df_all.apply(
                            lambda x: compute_dist(x[target_name + "_" + gram],
                                                   x[obs_name + "_" + gram], dist),
                            axis=1))
    return df_all
def extract_feat(df):
    """Add n-gram counting features for a question pair to ``df`` in place.

    Builds unigram/bigram/trigram columns for "question1" and "question2"
    and derives token counts, unique-token counts and ratios, digit counts
    and ratios, relative differences of word and digit counts between the
    two questions, and intersect counts and ratios per gram size.

    Args:
        df: DataFrame with "question1" and "question2" columns.  New
            columns are added in place.

    Returns:
        None.
    """
    feat_names = ["question1", "question2"]
    grams = ["unigram", "bigram", "trigram"]
    join_str = "_"

    ## unigram
    print("generate unigram")
    for feat_name in feat_names:
        df[feat_name + "_unigram"] = list(
            df.apply(lambda x: preprocess_data(x[feat_name]), axis=1))

    ## bigram
    print("generate bigram")
    for feat_name in feat_names:
        df[feat_name + "_bigram"] = list(
            df.apply(lambda x: ngram.getBigram(x[feat_name + "_unigram"], join_str), axis=1))

    ## trigram
    print("generate trigram")
    for feat_name in feat_names:
        df[feat_name + "_trigram"] = list(
            df.apply(lambda x: ngram.getTrigram(x[feat_name + "_unigram"], join_str), axis=1))

    ################################
    ## word count and digit count ##
    ################################
    print("generate word counting features")

    def count_digit(tokens):
        # Float count of tokens made only of digits.
        return sum([1. for w in tokens if w.isdigit()])

    for feat_name in feat_names:
        for gram in grams:
            col = feat_name + "_" + gram
            total_col = "count_of_%s_%s" % (feat_name, gram)
            unique_col = "count_of_unique_%s_%s" % (feat_name, gram)
            ## word count
            df[total_col] = list(df.apply(lambda x: len(x[col]), axis=1))
            # number of distinct tokens
            df[unique_col] = list(df.apply(lambda x: len(set(x[col])), axis=1))
            # share of distinct tokens
            df["ratio_of_unique_%s_%s" % (feat_name, gram)] = list(
                map(try_divide, df[unique_col], df[total_col]))

        ## digit count
        digit_col = "count_of_digit_in_%s" % feat_name
        df[digit_col] = list(df.apply(lambda x: count_digit(x[feat_name + "_unigram"]), axis=1))
        # share of digit tokens
        df["ratio_of_digit_in_%s" % feat_name] = list(
            map(try_divide, df[digit_col], df["count_of_%s_unigram" % (feat_name)]))

    ####################################
    ## subtract word and letter count ##
    ####################################
    print("generate subtract word counting features")

    def relative_gap(left, right):
        # |left - right| / (left + right); defined as 1 when both are 0.
        total = left + right
        if total == 0:
            return 1
        return 1.0 * abs(left - right) / total

    for obs_name in feat_names:
        for target_name in feat_names:
            if target_name == obs_name:
                continue
            obs_col = obs_name + "_unigram"
            target_col = target_name + "_unigram"
            ## word count difference
            df["count_of_%s_%s_subtract_%s" % (obs_name, "unigram", target_name)] = list(df.apply(
                lambda x: relative_gap(len(x[obs_col]), len(x[target_col])), axis=1))
            ## digit count difference
            df["count_of_%s_%s_subtract_%s" % (obs_name, "digit", target_name)] = list(df.apply(
                lambda x: relative_gap(count_digit(x[obs_col]), count_digit(x[target_col])),
                axis=1))

    ##############################
    ## intersect word count ######
    ##############################
    print("generate intersect word counting features")

    def count_shared(row, obs_col, target_col):
        # Build the lookup set once per row instead of once per token.
        target_tokens = set(row[target_col])
        return sum([1. for w in row[obs_col] if w in target_tokens])

    for gram in grams:
        for obs_name in feat_names:
            for target_name in feat_names:
                if target_name == obs_name:
                    continue
                obs_col = obs_name + "_" + gram
                target_col = target_name + "_" + gram
                count_col = "count_of_%s_%s_in_%s" % (obs_name, gram, target_name)
                # number of obs tokens that also occur in target
                df[count_col] = list(
                    df.apply(lambda x: count_shared(x, obs_col, target_col), axis=1))
                # the same count relative to the number of obs tokens
                df["ratio_of_%s_%s_in_%s" % (obs_name, gram, target_name)] = list(map(
                    try_divide, df[count_col], df["count_of_%s_%s" % (obs_name, gram)]))
def process():
    """Build n-gram columns for the headline data and run the feature generators.

    Steps, in order:
      1. Read ``./data/merged_data_tain.csv``, keep the four columns used
         downstream, and drop rows that have a missing value in any of them.
      2. Split the rows 67/33 with a fixed random seed; only the training
         part is processed further.
      3. Encode ``articleStance`` as an integer ``target`` column.
      4. Add unigram, bigram and trigram columns for ``claimHeadline`` and
         ``articleHeadline`` and pickle the resulting frame to ``data.pkl``.
      5. Call ``process(data)`` and then ``read('train')`` on every feature
         generator.

    Takes no arguments and returns ``None``; all results are side effects
    (``data.pkl``, whatever the generators do, and progress output on stdout).

    Raises:
        KeyError: if ``articleStance`` contains a label that is not one of
            ``observing``, ``for``, ``against`` or ``ignoring``.
    """
    full_data = pd.read_csv('./data/merged_data_tain.csv', encoding='utf-8')
    used_column = [
        'claimHeadline',
        'articleHeadline',
        'claimTruthiness',
        'articleStance',
    ]
    full_data = full_data[used_column]
    full_data = full_data.dropna()
    train, test = train_test_split(full_data, test_size=0.33, random_state=1234)

    # Flip to True to reuse a previously pickled frame instead of rebuilding it.
    read = False
    if not read:
        targets = ['observing', 'for', 'against', 'ignoring']
        targets_dict = dict(zip(targets, range(len(targets))))

        # Work on an explicit copy: `train` is derived from `full_data`, and
        # adding columns to it directly triggers pandas' chained-assignment
        # warning.
        train = train.copy()
        # A list comprehension (rather than map()) gives a concrete list on
        # both Python 2 and Python 3; unknown labels still raise KeyError.
        train['target'] = [targets_dict[stance] for stance in train['articleStance']]
        print('train.shape:')
        print(train.shape)

        data = train
        # NOTE: no separate test set is concatenated here, so every row has a
        # target and the `test` frame selected below is expected to be empty.
        test_flag = True
        if test_flag:
            data = train
            print(data)
            print('data.shape:')
            print(data.shape)

            train = data[~data['target'].isnull()]
            print(train)
            print('train.shape:')
            print(train.shape)

            test = data[data['target'].isnull()]
            print(test)
            print('test.shape:')
            print(test.shape)

        join_str = "_"
        text_columns = ['claimHeadline', 'articleHeadline']

        print("generate unigram")
        for col in text_columns:
            data[col + "_unigram"] = data[col].map(
                lambda x: preprocess_data(x))

        print("generate bigram")
        for col in text_columns:
            data[col + "_bigram"] = data[col + "_unigram"].map(
                lambda x: ngram.getBigram(x, join_str))

        print("generate trigram")
        for col in text_columns:
            # Trigrams are built from the unigram tokens for BOTH columns.
            # The article column used to be fed its bigram column, which
            # produced n-grams of already-joined bigram tokens.
            data[col + "_trigram"] = data[col + "_unigram"].map(
                lambda x: ngram.getTrigram(x, join_str))

        with open('data.pkl', 'wb') as outfile:
            cPickle.dump(data, outfile, -1)
            print('dataframe saved in data.pkl')
    else:
        # SECURITY: unpickling executes arbitrary code; only load a data.pkl
        # that this script produced itself.
        with open('data.pkl', 'rb') as infile:
            data = cPickle.load(infile)
        print('data loaded')
        print('data.shape:')
        print(data.shape)

    # Feature generators to run; shrink this list to debug a single one.
    generators = [
        CountFeatureGenerator(),
        TfidfFeatureGenerator(),
        SvdFeatureGenerator(),
        Word2VecFeatureGenerator(),
        SentimentFeatureGenerator(),
    ]

    # All generators process the data first; only then is read('train')
    # called on each, in the same order.
    for g in generators:
        g.process(data)

    for g in generators:
        g.read('train')

    print('done')