def getMergeNews():
    """Dump the merge table to ``news_file`` as one segmented line per row.

    The value returned by ``tablemerge.getAllCount`` is printed first, then
    the rows are fetched with ``tablemerge.getTitleSummary``.  When that call
    returns ``-1`` the function returns without opening the output file.

    Every output line has the form ``"<n> <title> <summary>"``: ``n`` is a
    1-based running counter, and both text fields are tokenised with
    ``jieba.cut``, joined with single spaces, lower-cased, passed through
    ``delpunc`` and encoded as UTF-8.  The title is stripped beforehand; the
    summary has all whitespace runs removed beforehand.
    """
    print(tablemerge.getAllCount(dbconfig.mergetable))
    records = tablemerge.getTitleSummary(dbconfig.mergetable)
    if records == -1:
        return

    def _segment(text):
        # jieba tokens -> space-joined, lower-cased, de-punctuated UTF-8 bytes.
        tokens = jieba.cut(text)
        return delpunc(' '.join(tokens).lower()).encode('utf-8')

    with open(news_file, 'w') as out:
        for line_no, record in enumerate(records, 1):
            # Columns are read positionally: id, title, summary, ctime.
            # The id and ctime are fetched but do not appear in the output.
            record_id, headline, digest, created = (
                record[0],
                record[1].strip(),
                re.sub('\s+', '', record[2]),
                record[3],
            )
            headline = _segment(headline)
            digest = _segment(digest)
            out.write('%s %s %s' % (line_no, headline, digest) + '\n')
def getFirstRmDuplicationResult(split=True):
    """Write per-row title and summary lines of the merge table to two files.

    Rows come from ``tablemerge.getTitleSummary(dbconfig.mergetable)``.  When
    that call returns ``-1`` nothing is written and neither file is opened.

    For every row a line ``"<n> <id> <time> <text>"`` is produced twice: once
    with the title (written to ``merge1_title_file``) and once with the
    summary (written to ``merge1_summary_file``).  ``n`` is a 1-based running
    counter and ``<time>`` is ``row[3]`` formatted as ``%Y-%m-%d %H:%M:%S``
    in local time.  Both lines are also printed to stdout.

    Args:
        split: When true (the default), the title and summary are
            lower-cased, passed through ``delpunc``, tokenised with
            ``jieba.cut``, joined with single spaces and encoded as UTF-8.
            When false, the stripped text is written unchanged.
    """
    rows = tablemerge.getTitleSummary(dbconfig.mergetable)
    if rows == -1:
        return

    # Both files are managed by one ``with`` statement so the summary file is
    # closed even when processing a row raises.  The summary file is still
    # opened first and closed last.
    with open(merge1_summary_file, 'w') as f_summary, \
            open(merge1_title_file, 'w') as fout:
        for count, row in enumerate(rows, 1):
            # Columns are read positionally: id, title, summary, ctime.
            mtid, title, summary, ctime = (
                row[0], row[1].strip(), row[2].strip(), row[3])
            if split:
                title = ' '.join(
                    jieba.cut(delpunc(title.lower()))).encode('utf-8')
                summary = ' '.join(
                    jieba.cut(delpunc(summary.lower()))).encode('utf-8')
            # NOTE(review): with split=False the text is not encoded here; if
            # the rows hold unicode with non-ASCII characters the writes below
            # may fail under Python 2 -- confirm against the data source.

            # ctime is handed to time.localtime(), so it is presumably a Unix
            # timestamp in seconds -- TODO confirm.
            timeStr = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ctime))
            msg_t = '%s %s %s %s' % (count, mtid, timeStr, title)
            msg_s = '%s %s %s %s' % (count, mtid, timeStr, summary)
            print(msg_t)
            print(msg_s)
            fout.write(msg_t + '\n')
            f_summary.write(msg_s + '\n')