def removeDuplicationId(in_file, top_n=11, min_sim=0.7):
    """Write "<news id> <similar corpus id>" pairs for every item in ``in_file``.

    Each line of ``in_file`` is split on whitespace: field 1 is used as the
    news id and fields 4 onward as the text.  The text is segmented with
    ``jieba.cut``, lower-cased, passed through ``delpunc`` (presumably strips
    punctuation -- confirm against its definition) and compared against every
    entry of the module-level ``news_file`` corpus with
    ``w2vword2sim.getSimofWordList``.

    Corpus lines are expected to look like ``<id> <word> <word> ...``.

    Results are written to the module-level ``rmdup_idw2v_file`` path, one
    ``"<news id> <corpus id>"`` pair per line.  The id and text of each
    processed item are also echoed to stdout.

    Args:
        in_file: Path of the whitespace-separated input file.
        top_n: How many of the highest-scoring corpus entries are examined
            per item (the item itself counts if it is among them).
        min_sim: Entries scoring below this similarity are not reported.

    NOTE: this module targets Python 2; the segmented text is encoded to a
    UTF-8 byte string before being tokenised.
    """
    # Tokenise the corpus once instead of re-splitting every corpus line for
    # every input line.  Blank lines have no id and are dropped here; they
    # previously raised IndexError as soon as one ranked inside the top_n.
    # NOTE(review): the token lists are now shared between calls -- assumes
    # getSimofWordList does not mutate its arguments (the query list was
    # already shared across calls before this change).
    corpus = []
    with open(news_file, 'r') as fcorpus:
        for raw in fcorpus:
            tokens = raw.split()
            if tokens:
                corpus.append((tokens[0], tokens[1:]))

    # Context managers guarantee both files are closed even if scoring raises.
    with open(rmdup_idw2v_file, 'w') as fout_w2v:
        with open(in_file, 'r') as fin:
            for line in fin:
                fields = line.split()
                if len(fields) < 2:
                    # Blank/truncated line: there is no id to report against.
                    continue
                news_id = fields[1]
                text = ' '.join(fields[4:])
                print('%s %s' % (news_id, text))

                word_str = delpunc(' '.join(jieba.cut(text)).lower()).encode('utf-8')
                query_words = word_str.split()

                scored = [
                    (doc_id, w2vword2sim.getSimofWordList(query_words, doc_words))
                    for doc_id, doc_words in corpus
                ]
                # Stable sort: ties keep their corpus order.
                scored.sort(key=lambda pair: pair[1], reverse=True)

                for doc_id, sim in scored[:top_n]:
                    if sim < min_sim:
                        # Sorted descending, so nothing further can qualify.
                        break
                    if doc_id == news_id:
                        # Skip the news item itself.
                        continue
                    fout_w2v.write('%s %s\n' % (news_id, doc_id))
def getRelatedNewsBat(in_file, out_file, top_n=11):
    """Write the most similar corpus entries for every item in ``in_file``.

    Each line of ``in_file`` is split on whitespace: field 1 is used as the
    news id and fields 4 onward as the text.  The text is segmented with
    ``jieba.cut``, lower-cased, passed through ``delpunc`` (presumably strips
    punctuation -- confirm against its definition) and compared against every
    entry of the module-level ``news_file`` corpus with
    ``w2vword2sim.getSimofWordList``.

    Corpus lines are expected to look like ``<id> <word> <word> ...``.

    Output format, per input item::

        <news id> <segmented text>
         <similarity>,<corpus line>
         <similarity>,<corpus line>
         ...

    Args:
        in_file: Path of the whitespace-separated input file.
        out_file: Path of the report to create (overwritten if it exists).
        top_n: How many of the highest-scoring corpus entries are examined
            per item; the item itself is skipped if it is among them.

    NOTE: this module targets Python 2; the segmented text is encoded to a
    UTF-8 byte string before being tokenised.
    """
    # Tokenise the corpus once instead of re-splitting every corpus line for
    # every input line.  Blank lines have no id and are dropped here; they
    # previously raised IndexError as soon as one ranked inside the top_n.
    # NOTE(review): the token lists are now shared between calls -- assumes
    # getSimofWordList does not mutate its arguments (the query list was
    # already shared across calls before this change).
    corpus = []
    with open(news_file, 'r') as fcorpus:
        for raw in fcorpus:
            tokens = raw.split()
            if tokens:
                corpus.append((tokens[0], tokens[1:], raw))

    # Context managers guarantee both files are closed even if scoring raises.
    with open(out_file, 'w') as fout:
        with open(in_file, 'r') as fin:
            for line in fin:
                fields = line.split()
                if len(fields) < 2:
                    # Blank/truncated line: there is no id to report against.
                    continue
                news_id = fields[1]
                text = ' '.join(fields[4:])

                word_str = delpunc(' '.join(jieba.cut(text)).lower()).encode('utf-8')
                query_words = word_str.split()

                scored = [
                    (doc_id,
                     w2vword2sim.getSimofWordList(query_words, doc_words),
                     raw)
                    for doc_id, doc_words, raw in corpus
                ]
                # Stable sort: ties keep their corpus order.
                scored.sort(key=lambda entry: entry[1], reverse=True)

                fout.write('%s %s\n' % (news_id, word_str))
                for doc_id, sim, raw in scored[:top_n]:
                    if doc_id == news_id:
                        # Skip the news item itself.
                        continue
                    # The corpus line normally supplies its own newline; the
                    # last line of the corpus file may not, which used to glue
                    # the next record header onto this line.
                    if not raw.endswith('\n'):
                        raw += '\n'
                    fout.write(' %.4f,%s' % (sim, raw))