예제 #1
0
def prepare_unigram(path,out):
    """Write the unigram form of every sentence pair in a CSV file.

    The input is read with pandas. For each row the ``sen1`` and ``sen2``
    values are converted to ``str``, split on whitespace, passed through
    ``getUnigram`` and re-joined with single spaces. The results are written
    to ``out`` as a CSV with the columns ``sen1_unigram`` and
    ``sen2_unigram`` and no index column.

    Args:
        path: CSV file to read; it must contain ``sen1`` and ``sen2`` columns.
        out: Destination CSV path.
    """
    data_input = pd.read_csv(path)

    # Collect the rows first and build the frame once: enlarging a DataFrame
    # with ``.loc[index] = ...`` re-allocates it on every row.
    rows = []
    # ``iterrows`` is kept on purpose so each value is stringified exactly as
    # it appears in the per-row Series.
    for _, row in data_input.iterrows():
        s1 = str(row['sen1']).split()
        s2 = str(row['sen2']).split()
        rows.append((' '.join(getUnigram(s1)), ' '.join(getUnigram(s2))))

    data_output = DataFrame(rows, columns=['sen1_unigram','sen2_unigram'])
    data_output.to_csv(out,index=False)
예제 #2
0
def generateNGram(df):
    """Add unigram, bigram and trigram columns to ``df`` and return it.

    For each of the source columns ``query``, ``product_title`` and
    ``product_description`` three new columns are created by applying
    ``ngram.getUnigram``, ``ngram.getBigram`` and ``ngram.getTrigram`` to
    every value. The bigram and trigram helpers receive ``'_'`` as their
    second argument (presumably the token joiner; confirm against ``ngram``).

    Args:
        df: DataFrame holding the three source columns; modified in place.

    Returns:
        The same ``df`` object with the nine new columns.
    """
    # Each recipe pairs a builder with its (target column, source column)
    # list, in the order the columns must be created.
    recipes = (
        (lambda text: ngram.getUnigram(text), (
            ('query_unigram', 'query'),
            ('title_unigram', 'product_title'),
            ('description_unigram', 'product_description'),
        )),
        (lambda text: ngram.getBigram(text, '_'), (
            ('query_bigram', 'query'),
            ('title_bigram', 'product_title'),
            ('description_bigram', 'product_description'),
        )),
        (lambda text: ngram.getTrigram(text, '_'), (
            ('query_trigram', 'query'),
            ('title_trigram', 'product_title'),
            ('description_trigram', 'product_description'),
        )),
    )
    for build, columns in recipes:
        for target, source in columns:
            df[target] = df[source].apply(build)
    return df
예제 #3
0
 def __iter__(self):
     """Yield the n-grams of each row's description from ``self.tsvFile``.

     The file is read as a tab-separated table with a header row. For every
     row the ``description`` field is tokenised with ``getWords``, each token
     is replaced by ``str(self.wordIndex[token])``, and the result is yielded
     through ``getUnigram`` when ``self.ngram == 1`` or ``getBigram`` (with
     ``"_"``) when ``self.ngram == 2``. Any other ``self.ngram`` yields
     nothing for the row. ``self.counter`` is incremented per row and a
     progress line is printed every 100000 rows.

     Raises:
         KeyError: if a row has no ``description`` value or a token is
             missing from ``self.wordIndex``.
     """
     # On Python 3 the csv module needs a text-mode file (opened with
     # newline=''); the former "rb" handle made DictReader fail on the first
     # row. The ``with`` block also closes the handle, which was leaked before.
     with open(self.tsvFile, "r", encoding="utf-8", newline="") as tsvReader:
         for item in DictReader(tsvReader, delimiter='\t', quotechar='"'):
             self.counter += 1
             # Values are already ``str``; only drop the fields DictReader
             # filled with None because the row was short.
             item = {featureName: featureValue
                     for featureName, featureValue in item.items()
                     if featureValue is not None}
             description = [str(self.wordIndex[w]) for w in getWords(item["description"])]
             if self.ngram == 1:
                 yield getUnigram(description)
             elif self.ngram == 2:
                 yield getBigram(description, "_")
             if self.counter % 100000 == 0:
                 print("     Process %s" % self.counter)
 def __iter__(self):
     """Yield the n-grams of each row's description from ``self.tsvFile``.

     Python 2 variant: rows are read as byte strings and decoded from UTF-8
     (``dict.iteritems`` and ``str.decode`` are used). For every row the
     ``description`` field is tokenised with ``getWords``, each token is
     replaced by ``str(self.wordIndex[token])``, and the result is yielded
     through ``getUnigram`` when ``self.ngram == 1`` or ``getBigram`` (with
     ``"_"``) when ``self.ngram == 2``. Any other ``self.ngram`` yields
     nothing for the row. ``self.counter`` is incremented per row and a
     progress line is printed every 100000 rows.

     Raises:
         KeyError: if a row has no ``description`` value or a token is
             missing from ``self.wordIndex``.
     """
     # Open the file in a ``with`` block so the handle is closed when the
     # generator finishes or is closed; the inline ``open`` leaked it.
     with open(self.tsvFile, "rb") as tsvReader:
         for item in DictReader(tsvReader, delimiter='\t', quotechar='"'):
             self.counter += 1
             # Decode every present field; fields DictReader filled with None
             # (short rows) are dropped.
             item = {featureName: featureValue.decode('utf-8')
                     for featureName, featureValue in item.iteritems()
                     if featureValue is not None}
             description = [str(self.wordIndex[w]) for w in getWords(item["description"])]
             if self.ngram == 1:
                 yield getUnigram(description)
             elif self.ngram == 2:
                 yield getBigram(description, "_")
             if self.counter % 100000 == 0:
                 print("     Process %s" % self.counter)
def prepare_unigram(path, out):
    """Write the unigram form of every question pair in a CSV file.

    Each input row's ``question1_porter`` and ``question2_porter`` values are
    lower-cased, passed through ``remove_punctuation``, split on single
    spaces, passed through ``getUnigram`` and re-joined with spaces. The
    output starts with the header ``question1_unigram,question2_unigram`` and
    then holds one ``q1,q2`` line per input row. Fields are written without
    CSV quoting.

    Progress is printed every 100000 rows and the elapsed time at the end.
    The print calls use a single pre-formatted argument so the text is the
    same on Python 2 and Python 3.

    Args:
        path: Comma-separated input file with a header row.
        out: Destination file path (overwritten).
    """
    print(path)
    start = datetime.now()
    with open(out, 'w') as outfile:
        outfile.write('question1_unigram,question2_unigram\n')
        # The input handle is now closed deterministically as well.
        with open(path) as infile:
            for c, row in enumerate(DictReader(infile, delimiter=',')):
                if c % 100000 == 0:
                    print('finished %s' % c)
                q1 = remove_punctuation(
                    str(row['question1_porter']).lower()).split(' ')
                # NOTE(review): the second lower() looks redundant; it is kept
                # because remove_punctuation is not visible here.
                q2 = remove_punctuation(
                    str(row['question2_porter']).lower()).lower().split(' ')
                q1_unigram = ' '.join(getUnigram(q1))
                q2_unigram = ' '.join(getUnigram(q2))
                outfile.write('%s,%s\n' % (q1_unigram, q2_unigram))
        # Was ``end - start = datetime.now()``, which is a syntax error.
        end = datetime.now()

    print('times: %s' % (end - start))
예제 #6
0
def getVWFile(tsvFile, vwFile, dataIndex, dictionary, tfidf_model, train=True):
    """Generate the overall features in VW format.

    Reads ``tsvFile`` as a tab-separated table with a header row and writes
    one feature line per row to ``vwFile``. Each line has the form
    ``<label> '<itemid> |C .. |SC .. |T .. |bT .. |cT .. |t .. |D .. |fD ..
    |gD .. |iD .. |jD .. |d .. <attrs> |hAC .. |P .. |p .. |e .. |u ..``.

    Both files are opened in text mode as UTF-8. On Python 3
    ``csv.DictReader`` needs ``str`` rows and the generated lines are ``str``,
    so the earlier binary modes and the per-field ``decode`` could not work.

    Args:
        tsvFile: Input path. Rows must provide ``itemid``, ``category``,
            ``subcategory``, ``title``, ``description``, ``price``,
            ``phones_cnt``, ``emails_cnt`` and ``urls_cnt``, plus
            ``is_blocked`` when ``train`` is true. ``attrs`` is optional.
        vwFile: Output path (overwritten).
        dataIndex: Mapping holding the lookup tables ``catIndex``,
            ``subCatIndex``, ``attrsKeyIndex``, ``attrsValIndex`` and
            ``wordIndex``.
        dictionary: Indexable by n-gram order; entries 1 and 2 are handed to
            ``getTfidfFeat``.
        tfidf_model: Indexable by n-gram order; entries 1 and 2 are handed to
            ``getTfidfFeat``.
        train: When true the label comes from ``is_blocked``; otherwise every
            row gets label 1.

    Raises:
        KeyError: if a required field is absent from a row or a value is
            missing from one of the lookup tables.
    """
    start = datetime.now()

    # extract all the index
    catIndex = dataIndex["catIndex"]
    subCatIndex = dataIndex["subCatIndex"]
    attrsKeyIndex = dataIndex["attrsKeyIndex"]
    attrsValIndex = dataIndex["attrsValIndex"]
    wordIndex = dataIndex["wordIndex"]

    # newline="\n" keeps the output line endings exactly as written;
    # newline="" on the input is what the csv module asks for.
    with open(vwFile, "w", encoding="utf-8", newline="\n") as vwWriter:
        with open(tsvFile, "r", encoding="utf-8", newline="") as tsvReader:
            itemReader = DictReader(tsvReader, delimiter='\t', quotechar='"')
            for i, item in enumerate(itemReader):
                # Drop fields DictReader filled with None (short rows) so the
                # "attrs" membership test below reflects real data.
                item = {featureName: featureValue
                        for featureName, featureValue in item.items()
                        if featureValue is not None}

                # get header: a 0/1 label is mapped to -1/+1
                itemid = int(item["itemid"])
                label = int(item["is_blocked"]) if train else 1
                header = "%s '%s " % (int(2*label - 1), itemid)

                # category
                categoryFeat = "|C %s " % catIndex[item["category"]]

                # subcategory
                subcategoryFeat = "|SC %s " % subCatIndex[item["subcategory"]]

                # title: word ids, plus first and last id ("0" when empty)
                title = [str(wordIndex[w]) for w in getWords(item["title"])]
                title_start = title[0] if title else "0"
                title_end = title[-1] if title else "0"
                titleFeat = "|T %s |bT %s |cT %s " % (" ".join(title), title_start, title_end)
                titleStatsFeat = "|t %s " % getTextStatsFeat(item["title"])

                # description: same layout as the title, then tf-idf features
                description = [str(wordIndex[w]) for w in getWords(item["description"])]
                description_start = description[0] if description else "0"
                description_end = description[-1] if description else "0"
                descriptionFeat = "|D %s |fD %s |gD %s " % (" ".join(description), description_start, description_end)
                descriptionStatsFeat = "|d %s " % getTextStatsFeat(item["description"])
                tfidf_feat1 = getTfidfFeat(getUnigram(description), dictionary[1], tfidf_model[1])
                # 2-gram tf-idf seems to harm the performance; it is safe to drop it here
                tfidf_feat2 = getTfidfFeat(getBigram(description, "_"), dictionary[2], tfidf_model[2])
                descriptionFeat += "|iD %s |jD %s " % (tfidf_feat1, tfidf_feat2)

                # attrs: one "|A<key> <value>" namespace per attribute, then
                # "|a" listing the key ids; "|NA 1" when the field is absent
                attrsFeat = ""
                countAttrs = 0
                if "attrs" in item:
                    attrsDict = getAttrsDict(item["attrs"])
                    for k, v in attrsDict.items():
                        countAttrs += 1
                        attrsFeat += "|A%s %s " % (attrsKeyIndex[k], attrsValIndex[v])
                    attrsFeat += "|a "
                    for k in attrsDict:
                        attrsFeat += "%s " % (attrsKeyIndex[k])
                if len(attrsFeat) == 0:
                    attrsFeat = "|NA 1 "
                attrsFeat += "|hAC %s " % countAttrs

                # price
                priceFeat = "|P %s " % item["price"]
                # phones_cnt
                phonesCntFeat = "|p %s " % item["phones_cnt"]
                # emails_cnt
                emailsCntFeat = "|e %s " % item["emails_cnt"]
                # urls_cnt
                urlsCntFeat = "|u %s " % item["urls_cnt"]

                # output: the trailing space of the last feature is replaced
                # by the line terminator
                vwLine = header \
                       + categoryFeat \
                       + subcategoryFeat \
                       + titleFeat \
                       + titleStatsFeat \
                       + descriptionFeat \
                       + descriptionStatsFeat \
                       + attrsFeat \
                       + priceFeat \
                       + phonesCntFeat \
                       + emailsCntFeat \
                       + urlsCntFeat[:-1] + "\n"
                vwWriter.write(vwLine)

                # report progress
                if (i+1) % 10000 == 0:
                    print("\n%s\t%s" % ((i+1), str(datetime.now() - start)))
                    print("Sample output:\n%s" % vwLine)
def getVWFile(tsvFile, vwFile, dataIndex, dictionary, tfidf_model, train=True):
    """Generate the overall features in VW format.

    Python 2 variant: the TSV is read as byte strings and every field is
    decoded from UTF-8. One feature line per input row is written to
    ``vwFile``; a progress report with a sample line is printed every 10000
    rows.

    Args:
        tsvFile: Tab-separated input path with a header row.
        vwFile: Output path (overwritten).
        dataIndex: Mapping holding the lookup tables ``catIndex``,
            ``subCatIndex``, ``attrsKeyIndex``, ``attrsValIndex`` and
            ``wordIndex``.
        dictionary: Indexable by n-gram order; entries 1 and 2 are handed to
            ``getTfidfFeat``.
        tfidf_model: Indexable by n-gram order; entries 1 and 2 are handed to
            ``getTfidfFeat``.
        train: When true the label comes from ``is_blocked``; otherwise every
            row gets label 1.
    """
    started_at = datetime.now()

    # lookup tables
    cat_ids = dataIndex["catIndex"]
    subcat_ids = dataIndex["subCatIndex"]
    attr_key_ids = dataIndex["attrsKeyIndex"]
    attr_val_ids = dataIndex["attrsValIndex"]
    word_ids = dataIndex["wordIndex"]

    # The output file is opened (and truncated) before the input file.
    with open(vwFile, "wb") as sink, open(tsvFile, "rb") as source:
        rows = DictReader(source, delimiter='\t', quotechar='"')
        for row_no, raw in enumerate(rows, 1):
            # decode present fields, drop the ones DictReader left as None
            item = dict((name, value.decode('utf-8'))
                        for name, value in raw.iteritems()
                        if value is not None)

            # header: a 0/1 label becomes -1/+1, followed by the item tag
            item_id = int(item["itemid"])
            if train:
                label = int(item["is_blocked"])
            else:
                label = 1
            pieces = ["%s '%s " % (int(2*label - 1), item_id)]

            # category and subcategory
            pieces.append("|C %s " % cat_ids[ item["category"] ])
            pieces.append("|SC %s " % subcat_ids[ item["subcategory"] ])

            # title: word ids plus first/last id ("0" when there are none)
            title_ids = [ str(word_ids[w]) for w in getWords(item["title"]) ]
            if len(title_ids) > 0:
                title_first, title_last = title_ids[0], title_ids[-1]
            else:
                title_first, title_last = "0", "0"
            pieces.append("|T %s |bT %s |cT %s " % (" ".join(title_ids), title_first, title_last))
            pieces.append("|t %s " % getTextStatsFeat(item["title"]))

            # description: same layout, extended with the tf-idf features
            desc_ids = [ str(word_ids[w]) for w in getWords(item["description"]) ]
            if len(desc_ids) > 0:
                desc_first, desc_last = desc_ids[0], desc_ids[-1]
            else:
                desc_first, desc_last = "0", "0"
            desc_feat = "|D %s |fD %s |gD %s " % (" ".join(desc_ids), desc_first, desc_last)
            desc_stats_feat = "|d %s " % getTextStatsFeat(item["description"])
            unigram_tfidf = getTfidfFeat(getUnigram(desc_ids), dictionary[1], tfidf_model[1])
            # the 2-gram tf-idf seems to harm performance; it is safe to drop
            bigram_tfidf = getTfidfFeat(getBigram(desc_ids, "_"), dictionary[2], tfidf_model[2])
            pieces.append(desc_feat + "|iD %s |jD %s " % (unigram_tfidf, bigram_tfidf))
            pieces.append(desc_stats_feat)

            # attrs: per-attribute namespaces, then the list of key ids
            attr_count = 0
            attrs_feat = ""
            if "attrs" in item:
                pairs = list(getAttrsDict(item["attrs"]).items())
                attr_count = len(pairs)
                attrs_feat = "".join(
                    "|A%s %s " % (attr_key_ids[key], attr_val_ids[val])
                    for key, val in pairs)
                attrs_feat += "|a "
                attrs_feat += "".join(
                    "%s " % (attr_key_ids[key]) for key, _ in pairs)
            if not attrs_feat:
                attrs_feat = "|NA 1 "
            pieces.append(attrs_feat + "|hAC %s " % attr_count)

            # price and contact counters
            pieces.append("|P %s " % item["price"])
            pieces.append("|p %s " % item["phones_cnt"])
            pieces.append("|e %s " % item["emails_cnt"])
            pieces.append("|u %s " % item["urls_cnt"])

            # the final feature's trailing space gives way to the newline
            vw_line = "".join(pieces)[:-1] + "\n"
            sink.write( vw_line )

            # report progress
            if row_no % 10000 == 0:
                print( "\n%s\t%s"%(row_no,str(datetime.now() - started_at)) )
                print( "Sample output:\n%s" % vw_line )