def prepare_unigram(path, out):
    """Write the unigram form of every sentence pair found in a CSV file.

    The input is read with pandas. For each row the ``sen1`` and ``sen2``
    values are converted to ``str``, split on whitespace, passed through
    ``getUnigram`` and joined back with single spaces. The results are
    written to ``out`` as a CSV with the columns ``sen1_unigram`` and
    ``sen2_unigram`` and no index column.

    Args:
        path: Location of the input CSV; it must provide the ``sen1`` and
            ``sen2`` columns.
        out: Location of the CSV to write.
    """
    data_input = pd.read_csv(path)

    # Collect the rows in a plain list and build the frame once at the end:
    # enlarging a DataFrame through ``.loc`` for every row re-allocates the
    # underlying data on each insert, which is quadratic in the row count.
    rows = []
    for _, row in data_input.iterrows():
        s1 = ' '.join(getUnigram(str(row['sen1']).split()))
        s2 = ' '.join(getUnigram(str(row['sen2']).split()))
        rows.append([s1, s2])

    data_output = DataFrame(rows, columns=['sen1_unigram', 'sen2_unigram'])
    data_output.to_csv(out, index=False)
def generateNGram(df):
    """Add unigram, bigram and trigram columns for the three text fields.

    For each of ``query``, ``product_title`` and ``product_description`` the
    frame gains ``<prefix>_unigram``, ``<prefix>_bigram`` and
    ``<prefix>_trigram`` columns, where the prefix is ``query``, ``title``
    or ``description``. Bigrams and trigrams are requested from the
    ``ngram`` module with ``'_'`` as the second argument.

    Args:
        df: Frame providing the three source columns; it is modified in
            place.

    Returns:
        The same ``df`` object, with the nine extra columns.
    """
    # (output column prefix, source column)
    text_fields = (
        ('query', 'query'),
        ('title', 'product_title'),
        ('description', 'product_description'),
    )
    # (output column suffix, extractor applied to every cell)
    extractors = (
        ('unigram', lambda text: ngram.getUnigram(text)),
        ('bigram', lambda text: ngram.getBigram(text, '_')),
        ('trigram', lambda text: ngram.getTrigram(text, '_')),
    )

    # The n-gram order is the outer loop so that columns are created in the
    # same sequence as before: all unigrams, then bigrams, then trigrams.
    for suffix, extract in extractors:
        for prefix, source in text_fields:
            df['%s_%s' % (prefix, suffix)] = df[source].apply(extract)
    return df
def __iter__(self):
    """Yield the n-gram tokens of every row's description.

    Streams ``self.tsvFile`` (tab separated, ``"`` as quote character) and,
    for each row, maps the words returned by ``getWords`` for the
    ``description`` field through ``self.wordIndex``. The stringified
    indices are then passed to ``getUnigram`` when ``self.ngram == 1`` or to
    ``getBigram`` (with ``"_"`` as second argument) when ``self.ngram == 2``;
    for any other value nothing is yielded for the row.

    ``self.counter`` is incremented per row and a progress line is printed
    every 100000 rows.

    Raises:
        KeyError: If a word is missing from ``self.wordIndex`` or the file
            has no ``description`` column.
    """
    # On Python 3 the csv module needs rows as ``str``: the file is opened as
    # UTF-8 text (``newline=""`` as the csv documentation requires) instead
    # of in binary mode with a per-field ``decode``. The ``with`` block also
    # guarantees that the handle is closed once iteration ends.
    with open(self.tsvFile, "r", encoding="utf-8", newline="") as tsvReader:
        for item in DictReader(tsvReader, delimiter='\t', quotechar='"'):
            self.counter += 1
            # DictReader reports fields missing from a short row as None.
            item = {featureName: featureValue
                    for featureName, featureValue in item.items()
                    if featureValue is not None}
            description = [str(self.wordIndex[w])
                           for w in getWords(item["description"])]
            if self.ngram == 1:
                yield getUnigram(description)
            elif self.ngram == 2:
                yield getBigram(description, "_")
            if self.counter % 100000 == 0:
                print(" Process %s" % self.counter)
def __iter__(self):
    """Yield the n-gram tokens of every row's description (Python 2).

    Streams ``self.tsvFile`` (tab separated, ``"`` as quote character),
    decodes every field from UTF-8 and, for each row, maps the words
    returned by ``getWords`` for the ``description`` field through
    ``self.wordIndex``. The stringified indices are then passed to
    ``getUnigram`` when ``self.ngram == 1`` or to ``getBigram`` (with ``"_"``
    as second argument) when ``self.ngram == 2``; for any other value
    nothing is yielded for the row.

    ``self.counter`` is incremented per row and a progress line is printed
    every 100000 rows.

    Raises:
        KeyError: If a word is missing from ``self.wordIndex`` or the file
            has no ``description`` column.
    """
    # The ``with`` block closes the handle when the generator finishes or is
    # discarded; the bare ``open()`` used before left it to the garbage
    # collector.
    with open(self.tsvFile, "rb") as tsvReader:
        for item in DictReader(tsvReader, delimiter='\t', quotechar='"'):
            self.counter += 1
            # DictReader reports fields missing from a short row as None;
            # those cannot be decoded, so they are dropped.
            item = {featureName: featureValue.decode('utf-8')
                    for featureName, featureValue in item.iteritems()
                    if featureValue is not None}
            description = [str(self.wordIndex[w])
                           for w in getWords(item["description"])]
            if self.ngram == 1:
                yield getUnigram(description)
            elif self.ngram == 2:
                yield getBigram(description, "_")
            if self.counter % 100000 == 0:
                print(" Process %s" % self.counter)
def prepare_unigram(path, out):
    """Write the unigram form of both questions of every row in a CSV file.

    For each row of ``path`` the ``question1_porter`` and
    ``question2_porter`` values are lower-cased, passed through
    ``remove_punctuation``, split on single spaces, passed through
    ``getUnigram`` and joined back with spaces. The two results are written
    as one ``q1,q2`` line to ``out`` below a
    ``question1_unigram,question2_unigram`` header.

    The input path, a progress line every 100000 rows and the total elapsed
    time are printed to stdout.

    Args:
        path: Location of the comma separated input file.
        out: Location of the file to write.
    """
    # Single-argument print calls produce the same text under Python 2 and 3.
    print(path)
    start = datetime.now()
    with open(out, 'w') as outfile:
        outfile.write('question1_unigram,question2_unigram\n')
        # The input handle is now closed deterministically as well.
        with open(path) as infile:
            for c, row in enumerate(DictReader(infile, delimiter=',')):
                if c % 100000 == 0:
                    print('finished %s' % c)
                q1 = remove_punctuation(
                    str(row['question1_porter']).lower()).split(' ')
                # NOTE(review): the second lower() is kept from the original;
                # it only matters if remove_punctuation can introduce
                # upper-case characters - confirm and drop if not.
                q2 = remove_punctuation(
                    str(row['question2_porter']).lower()).lower().split(' ')
                q1_unigram = ' '.join(getUnigram(q1))
                q2_unigram = ' '.join(getUnigram(q2))
                # Fields are written unquoted, as before.
                outfile.write('%s,%s\n' % (q1_unigram, q2_unigram))
    # The original line read "end - start = datetime.now()", which is not
    # valid Python; only the end time is meant to be captured here.
    end = datetime.now()
    print('times: %s' % (end - start))
def getVWFile(tsvFile, vwFile, dataIndex, dictionary, tfidf_model, train=True):
    """Generate the overall features in VW format.

    Every row of ``tsvFile`` (tab separated, ``"`` as quote character, UTF-8)
    is turned into one line of ``vwFile``: a header followed by
    ``|namespace``-prefixed feature groups for category, subcategory, title,
    description (including tf-idf features), attributes, price and the
    phone/email/url counters. Progress and a sample line are printed every
    10000 rows.

    Args:
        tsvFile: Path of the input file. The columns ``itemid``,
            ``category``, ``subcategory``, ``title``, ``description``,
            ``price``, ``phones_cnt``, ``emails_cnt`` and ``urls_cnt`` are
            read for every row, ``is_blocked`` only when ``train`` is true,
            and ``attrs`` only when present.
        vwFile: Path of the output file; it is overwritten.
        dataIndex: Mapping holding the lookup tables ``catIndex``,
            ``subCatIndex``, ``attrsKeyIndex``, ``attrsValIndex`` and
            ``wordIndex``.
        dictionary: Indexed with ``1`` and ``2``; the entries are handed to
            ``getTfidfFeat`` for the unigram and bigram features.
        tfidf_model: Indexed with ``1`` and ``2``, used like ``dictionary``.
        train: When true the label comes from ``is_blocked``; otherwise the
            label is fixed to 1.

    Raises:
        KeyError: If a required column or an index entry is missing.
        ValueError: If ``itemid`` or ``is_blocked`` is not an integer.
    """
    start = datetime.now()

    # Pull every lookup table out of dataIndex once, before the row loop.
    catIndex = dataIndex["catIndex"]
    subCatIndex = dataIndex["subCatIndex"]
    attrsKeyIndex = dataIndex["attrsKeyIndex"]
    attrsValIndex = dataIndex["attrsValIndex"]
    wordIndex = dataIndex["wordIndex"]

    # On Python 3 the csv module needs ``str`` rows and a text-mode writer
    # needs ``str`` lines, so both files are opened as UTF-8 text instead of
    # in binary mode. ``newline="\n"`` keeps the written line endings exactly
    # as before on every platform; ``newline=""`` is what csv asks for.
    with open(vwFile, "w", encoding="utf-8", newline="\n") as vwWriter, \
            open(tsvFile, "r", encoding="utf-8", newline="") as tsvReader:
        itemReader = DictReader(tsvReader, delimiter='\t', quotechar='"')
        for i, item in enumerate(itemReader):
            # DictReader reports fields missing from a short row as None.
            item = {featureName: featureValue
                    for featureName, featureValue in item.items()
                    if featureValue is not None}

            # Header: the label rescaled as 2*label - 1, then the item id.
            itemid = int(item["itemid"])
            label = int(item["is_blocked"]) if train else 1
            header = "%s '%s " % (int(2*label - 1), itemid)

            # Category and subcategory.
            categoryFeat = "|C %s " % catIndex[item["category"]]
            subcategoryFeat = "|SC %s " % subCatIndex[item["subcategory"]]

            # Title: all word indices plus the first and last one ("0" when
            # the title has no words).
            title = [str(wordIndex[w]) for w in getWords(item["title"])]
            title_start = title[0] if title else "0"
            title_end = title[-1] if title else "0"
            titleFeat = "|T %s |bT %s |cT %s " % (" ".join(title),
                                                   title_start,
                                                   title_end)
            titleStatsFeat = "|t %s " % getTextStatsFeat(item["title"])

            # Description: same layout as the title.
            description = [str(wordIndex[w])
                           for w in getWords(item["description"])]
            description_start = description[0] if description else "0"
            description_end = description[-1] if description else "0"
            descriptionFeat = "|D %s |fD %s |gD %s " % (" ".join(description),
                                                        description_start,
                                                        description_end)
            descriptionStatsFeat = "|d %s " % getTextStatsFeat(item["description"])
            tfidf_feat1 = getTfidfFeat(getUnigram(description),
                                       dictionary[1], tfidf_model[1])
            # The 2-gram tf-idf seems to hurt performance; it is safe to
            # drop it here.
            tfidf_feat2 = getTfidfFeat(getBigram(description, "_"),
                                       dictionary[2], tfidf_model[2])
            descriptionFeat += "|iD %s |jD %s " % (tfidf_feat1, tfidf_feat2)

            # Attributes: one "|A<key> <value>" group per pair, then all the
            # key indices again under "|a". Both parts are collected in a
            # single pass over the attribute dict.
            attrsFeat = ""
            countAttrs = 0
            if "attrs" in item:
                attrsDict = getAttrsDict(item["attrs"])
                pairFeats = []
                keyFeats = []
                for k, v in attrsDict.items():
                    keyIdx = attrsKeyIndex[k]
                    pairFeats.append("|A%s %s " % (keyIdx, attrsValIndex[v]))
                    keyFeats.append("%s " % keyIdx)
                countAttrs = len(pairFeats)
                attrsFeat = "".join(pairFeats) + "|a " + "".join(keyFeats)
            if not attrsFeat:
                # Marker used when the row has no attrs field at all.
                attrsFeat = "|NA 1 "
            attrsFeat += "|hAC %s " % countAttrs

            # Price and the phone / email / url counters.
            priceFeat = "|P %s " % item["price"]
            phonesCntFeat = "|p %s " % item["phones_cnt"]
            emailsCntFeat = "|e %s " % item["emails_cnt"]
            urlsCntFeat = "|u %s " % item["urls_cnt"]

            # Every group ends with a space; the last one is trimmed before
            # the newline is appended.
            vwLine = "".join([
                header,
                categoryFeat,
                subcategoryFeat,
                titleFeat,
                titleStatsFeat,
                descriptionFeat,
                descriptionStatsFeat,
                attrsFeat,
                priceFeat,
                phonesCntFeat,
                emailsCntFeat,
                urlsCntFeat[:-1],
                "\n",
            ])
            vwWriter.write(vwLine)

            # Report progress.
            if (i+1) % 10000 == 0:
                print("\n%s\t%s" % ((i+1), str(datetime.now() - start)))
                print("Sample output:\n%s" % vwLine)
def getVWFile(tsvFile, vwFile, dataIndex, dictionary, tfidf_model, train=True):
    """Generate the overall features in VW format (Python 2).

    Every row of ``tsvFile`` (tab separated, ``"`` as quote character, fields
    decoded from UTF-8) is turned into one line of ``vwFile``: a header
    followed by ``|namespace``-prefixed feature groups for category,
    subcategory, title, description (including tf-idf features), attributes,
    price and the phone/email/url counters. Progress and a sample line are
    printed every 10000 rows.

    Args:
        tsvFile: Path of the input file. The columns ``itemid``,
            ``category``, ``subcategory``, ``title``, ``description``,
            ``price``, ``phones_cnt``, ``emails_cnt`` and ``urls_cnt`` are
            read for every row, ``is_blocked`` only when ``train`` is true,
            and ``attrs`` only when present.
        vwFile: Path of the output file; it is overwritten.
        dataIndex: Mapping holding the lookup tables ``catIndex``,
            ``subCatIndex``, ``attrsKeyIndex``, ``attrsValIndex`` and
            ``wordIndex``.
        dictionary: Indexed with ``1`` and ``2``; the entries are handed to
            ``getTfidfFeat`` for the unigram and bigram features.
        tfidf_model: Indexed with ``1`` and ``2``, used like ``dictionary``.
        train: When true the label comes from ``is_blocked``; otherwise the
            label is fixed to 1.

    Raises:
        KeyError: If a required column or an index entry is missing.
        ValueError: If ``itemid`` or ``is_blocked`` is not an integer.
    """
    start = datetime.now()

    # Pull every lookup table out of dataIndex once, before the row loop.
    catIndex = dataIndex["catIndex"]
    subCatIndex = dataIndex["subCatIndex"]
    attrsKeyIndex = dataIndex["attrsKeyIndex"]
    attrsValIndex = dataIndex["attrsValIndex"]
    wordIndex = dataIndex["wordIndex"]

    # The output is opened first, as before, so it is created or truncated
    # even when the input cannot be opened.
    with open(vwFile, "wb") as vwWriter, open(tsvFile, "rb") as tsvReader:
        itemReader = DictReader(tsvReader, delimiter='\t', quotechar='"')
        for i, item in enumerate(itemReader):
            # DictReader reports fields missing from a short row as None;
            # those cannot be decoded, so they are dropped.
            item = {featureName: featureValue.decode('utf-8')
                    for featureName, featureValue in item.iteritems()
                    if featureValue is not None}

            # Header: the label rescaled as 2*label - 1, then the item id.
            itemid = int(item["itemid"])
            label = int(item["is_blocked"]) if train else 1
            header = "%s '%s " % (int(2*label - 1), itemid)

            # Category and subcategory.
            categoryFeat = "|C %s " % catIndex[item["category"]]
            subcategoryFeat = "|SC %s " % subCatIndex[item["subcategory"]]

            # Title: all word indices plus the first and last one ("0" when
            # the title has no words).
            title = [str(wordIndex[w]) for w in getWords(item["title"])]
            title_start = title[0] if title else "0"
            title_end = title[-1] if title else "0"
            titleFeat = "|T %s |bT %s |cT %s " % (" ".join(title),
                                                   title_start,
                                                   title_end)
            titleStatsFeat = "|t %s " % getTextStatsFeat(item["title"])

            # Description: same layout as the title.
            description = [str(wordIndex[w])
                           for w in getWords(item["description"])]
            description_start = description[0] if description else "0"
            description_end = description[-1] if description else "0"
            descriptionFeat = "|D %s |fD %s |gD %s " % (" ".join(description),
                                                        description_start,
                                                        description_end)
            descriptionStatsFeat = "|d %s " % getTextStatsFeat(item["description"])
            tfidf_feat1 = getTfidfFeat(getUnigram(description),
                                       dictionary[1], tfidf_model[1])
            # The 2-gram tf-idf seems to hurt performance; it is safe to
            # drop it here.
            tfidf_feat2 = getTfidfFeat(getBigram(description, "_"),
                                       dictionary[2], tfidf_model[2])
            descriptionFeat += "|iD %s |jD %s " % (tfidf_feat1, tfidf_feat2)

            # Attributes: one "|A<key> <value>" group per pair, then all the
            # key indices again under "|a". Both parts are collected in a
            # single pass over the attribute dict. ``in`` replaces the
            # deprecated dict.has_key().
            attrsFeat = ""
            countAttrs = 0
            if "attrs" in item:
                attrsDict = getAttrsDict(item["attrs"])
                pairFeats = []
                keyFeats = []
                for k, v in attrsDict.items():
                    keyIdx = attrsKeyIndex[k]
                    pairFeats.append("|A%s %s " % (keyIdx, attrsValIndex[v]))
                    keyFeats.append("%s " % keyIdx)
                countAttrs = len(pairFeats)
                attrsFeat = "".join(pairFeats) + "|a " + "".join(keyFeats)
            if not attrsFeat:
                # Marker used when the row has no attrs field at all.
                attrsFeat = "|NA 1 "
            attrsFeat += "|hAC %s " % countAttrs

            # Price and the phone / email / url counters.
            priceFeat = "|P %s " % item["price"]
            phonesCntFeat = "|p %s " % item["phones_cnt"]
            emailsCntFeat = "|e %s " % item["emails_cnt"]
            urlsCntFeat = "|u %s " % item["urls_cnt"]

            # Every group ends with a space; the last one is trimmed before
            # the newline is appended.
            vwLine = "".join([
                header,
                categoryFeat,
                subcategoryFeat,
                titleFeat,
                titleStatsFeat,
                descriptionFeat,
                descriptionStatsFeat,
                attrsFeat,
                priceFeat,
                phonesCntFeat,
                emailsCntFeat,
                urlsCntFeat[:-1],
                "\n",
            ])
            vwWriter.write(vwLine)

            # Report progress.
            if (i+1) % 10000 == 0:
                print("\n%s\t%s" % ((i+1), str(datetime.now() - start)))
                print("Sample output:\n%s" % vwLine)