示例#1
0
def removeDuplicationId(in_file, max_candidates=11, min_sim=0.7):
    """Write pairs of near-duplicate news ids to ``rmdup_idw2v_file``.

    Every line of ``in_file`` is split on whitespace; field 1 is used as the
    news id and fields 4.. as the news text.  The text is segmented with
    ``jieba.cut``, lower-cased, passed through ``delpunc`` and compared with
    every entry of the corpus stored in ``news_file`` (first token of a corpus
    line = id, remaining tokens = words) via
    ``w2vword2sim.getSimofWordList``.  For the best-scoring corpus entries
    whose similarity reaches ``min_sim`` a line ``"<news id> <corpus id>"`` is
    written to the output file.

    Args:
        in_file: path of the whitespace-separated input file.
        max_candidates: how many of the best-scoring corpus entries are
            inspected per input line (the entry with the same id as the
            input line is skipped, so it does not produce a pair).
        min_sim: lowest similarity that still counts as a duplicate.

    Input lines with fewer than two fields and blank corpus lines are
    skipped; they carry no id and previously raised ``IndexError``.

    NOTE(review): the ``.encode('utf-8')`` / ``split`` handling below assumes
    Python 2 byte strings, as the rest of this file appears to -- confirm
    before running under Python 3.
    """
    # Tokenize the corpus exactly once; the handle is closed as soon as the
    # file has been read instead of being left to the garbage collector.
    corpus = []
    with open(news_file, 'r') as fnews:
        for raw in fnews:
            tokens = raw.split()
            if tokens:
                corpus.append((tokens[0], tokens[1:]))

    # Output is opened before the input to keep the original open order;
    # ``with`` guarantees both handles are closed even if scoring raises.
    with open(rmdup_idw2v_file, 'w') as fout_w2v, open(in_file, 'r') as fin:
        for line in fin:
            fields = line.split()
            if len(fields) < 2:
                continue
            news_id = fields[1]
            newStr = ' '.join(fields[4:])
            # Progress trace on stdout (single-argument form prints the same
            # text whether ``print`` is a statement or a function).
            print('%s %s' % (news_id, newStr))
            wordStr = delpunc(' '.join(jieba.cut(newStr)).lower()).encode('utf-8')
            wl1 = wordStr.split()

            sims = [(corpus_id, w2vword2sim.getSimofWordList(wl1, wl2))
                    for corpus_id, wl2 in corpus]
            # Stable descending sort: ties keep corpus order.
            sims.sort(key=lambda pair: pair[1], reverse=True)

            for corpus_id, sim in sims[:max_candidates]:
                if sim < min_sim:
                    # Sorted descending, so nothing further can qualify.
                    break
                if corpus_id == news_id:  # skip the news itself
                    continue
                fout_w2v.write('%s %s\n' % (news_id, corpus_id))
示例#2
0
def getRelatedNewsBat(in_file, out_file, max_candidates=11):
    """Write, for each news item in ``in_file``, its most similar corpus news.

    Every line of ``in_file`` is split on whitespace; field 1 is used as the
    news id and fields 4.. as the news text.  The text is segmented with
    ``jieba.cut``, lower-cased, passed through ``delpunc`` and scored against
    every entry of the corpus stored in ``news_file`` (first token of a corpus
    line = id, remaining tokens = words) via
    ``w2vword2sim.getSimofWordList``.

    For each input line the output receives a header line
    ``"<news id> <segmented text>"`` followed by one indented line
    ``"    <similarity>,<raw corpus line>"`` per related corpus entry.

    Args:
        in_file: path of the whitespace-separated input file.
        out_file: path of the report file to create/overwrite.
        max_candidates: how many of the best-scoring corpus entries are
            listed per input line (the entry with the same id as the input
            line is skipped).

    Input lines with fewer than two fields and blank corpus lines are
    skipped; they carry no id and previously raised ``IndexError``.

    NOTE(review): the ``.encode('utf-8')`` / ``split`` handling below assumes
    Python 2 byte strings, as the rest of this file appears to -- confirm
    before running under Python 3.
    """
    # Tokenize the corpus exactly once and keep the raw line for reporting;
    # the handle is closed as soon as the file has been read.
    corpus = []
    with open(news_file, 'r') as fnews:
        for raw in fnews:
            tokens = raw.split()
            if tokens:
                corpus.append((tokens[0], tokens[1:], raw))

    # Output is opened before the input to keep the original open order;
    # ``with`` guarantees both handles are closed even if scoring raises.
    with open(out_file, 'w') as fout, open(in_file, 'r') as fin:
        for line in fin:
            fields = line.split()
            if len(fields) < 2:
                continue
            news_id = fields[1]
            newStr = ' '.join(fields[4:])
            wordStr = delpunc(' '.join(jieba.cut(newStr)).lower()).encode('utf-8')
            wl1 = wordStr.split()

            sims = [(pos, w2vword2sim.getSimofWordList(wl1, entry[1]))
                    for pos, entry in enumerate(corpus)]
            # Stable descending sort: ties keep corpus order.
            sims.sort(key=lambda pair: pair[1], reverse=True)

            fout.write('%s %s\n' % (news_id, wordStr))
            for pos, sim in sims[:max_candidates]:
                corpus_id, _words, raw = corpus[pos]
                if corpus_id == news_id:  # skip the news itself
                    continue
                # The raw corpus line normally carries its own newline; strip
                # and re-add it so a final line without one cannot glue two
                # report records together.
                fout.write('    %.4f,%s\n' % (sim, raw.rstrip('\n')))