def getSecondRmDuplicationResult(split=True):
    """Dump the title and summary of every row of ``dbconfig.mergetable2``.

    Rows are fetched with ``tablemerge2.getTitleSummary``. For each row one
    line is written to ``merge2_title_file`` and one to
    ``merge2_summary_file``, both in the form
    ``<count> <mtid> <time> <text>``, where ``<count>`` is a 1-based running
    index and ``<time>`` is the row's ctime rendered as local time with the
    format ``%Y-%m-%d %H:%M:%S``. Every line is also printed to stdout.

    Args:
        split: When true (the default), title and summary are lower-cased,
            passed through ``delpunc``, segmented with ``jieba.cut``, joined
            with single spaces and encoded as UTF-8. When false they are
            written as fetched, with only surrounding whitespace stripped.

    Returns:
        None. When ``getTitleSummary`` returns ``-1`` neither output file is
        opened and nothing is written.
    """
    rows = tablemerge2.getTitleSummary(dbconfig.mergetable2)
    if rows == -1:
        # NOTE(review): -1 looks like the failure sentinel of
        # getTitleSummary - confirm against tablemerge2.
        return

    # Both files are managed by the same ``with`` so that they are closed
    # even if processing a row raises; previously the summary file was
    # closed by hand and leaked on any exception inside the loop.
    with open(merge2_summary_file, 'w') as f_summary, \
            open(merge2_title_file, 'w') as fout:
        for count, row in enumerate(rows, 1):
            # Row layout as consumed here: (mtid, title, summary, ctime).
            mtid = row[0]
            title = row[1].strip()
            summary = row[2].strip()
            ctime = row[3]

            if split:
                # Normalise first, then segment, then join tokens by spaces.
                title = ' '.join(
                    jieba.cut(delpunc(title.lower()))).encode('utf-8')
                summary = ' '.join(
                    jieba.cut(delpunc(summary.lower()))).encode('utf-8')
            # NOTE(review): with split=False the text is not encoded here;
            # presumably the caller guarantees it is writable as-is - verify.

            # ctime is handed to time.localtime, i.e. it is treated as
            # seconds since the epoch.
            timeStr = time.strftime('%Y-%m-%d %H:%M:%S',
                                    time.localtime(ctime))
            msg_t = '%s %s %s %s' % (count, mtid, timeStr, title)
            msg_s = '%s %s %s %s' % (count, mtid, timeStr, summary)

            print(msg_t)
            print(msg_s)
            fout.write(msg_t + '\n')
            f_summary.write(msg_s + '\n')
def getMerge2Title():
    """Dump segmented titles and summaries of ``dbconfig.mergetable2``.

    Rows are fetched with ``tablemerge2.getTitleSummary``. For each row one
    line is written to ``merge2_title_file`` and one to
    ``merge2_summary_file``, both in the form
    ``<count> <mtid> <ctime> <text>``, where ``<count>`` is a 1-based running
    index and ``<ctime>`` is written exactly as fetched (it is not formatted
    as a date here). Every line is also printed to stdout.

    The title is stripped of surrounding whitespace; the summary has all
    whitespace removed. Both are then segmented with ``jieba.cut``, joined
    with single spaces, lower-cased, passed through ``delpunc`` and encoded
    as UTF-8. If the encoded summary ends up shorter than the encoded title,
    the title is used as the summary instead.

    Returns:
        None. When ``getTitleSummary`` returns ``-1`` neither output file is
        opened and nothing is written.
    """
    rows = tablemerge2.getTitleSummary(dbconfig.mergetable2)
    if rows == -1:
        # NOTE(review): -1 looks like the failure sentinel of
        # getTitleSummary - confirm against tablemerge2.
        return

    # Both files are managed by the same ``with`` so that they are closed
    # even if processing a row raises; previously the summary file was
    # closed by hand and leaked on any exception inside the loop.
    with open(merge2_summary_file, 'w') as f_summary, \
            open(merge2_title_file, 'w') as fout:
        for count, row in enumerate(rows, 1):
            # Row layout as consumed here: (mtid, title, summary, ctime).
            mtid = row[0]
            title = row[1].strip()
            summary = re.sub(r'\s+', '', row[2])
            ctime = row[3]

            # Segment first, then lower-case and strip punctuation.
            title = delpunc(' '.join(jieba.cut(title)).lower()).encode('utf-8')
            summary = delpunc(
                ' '.join(jieba.cut(summary)).lower()).encode('utf-8')

            # Fall back to the title when the summary is shorter than it
            # (lengths are compared on the UTF-8 encoded values).
            if len(summary) < len(title):
                summary = title

            msg_t = '%s %s %s %s' % (count, mtid, ctime, title)
            msg_s = '%s %s %s %s' % (count, mtid, ctime, summary)

            print(msg_t)
            print(msg_s)
            fout.write(msg_t + '\n')
            f_summary.write(msg_s + '\n')