Exemplo n.º 1
0
def getMergeNews():
    """Dump the title and summary of every merged-table row to ``news_file``.

    First prints the value of ``tablemerge.getAllCount(dbconfig.mergetable)``,
    then fetches rows with ``tablemerge.getTitleSummary``.  If that call
    returns ``-1`` nothing is written and the file is not opened
    (presumably ``-1`` is the helper's failure sentinel -- confirm against
    ``tablemerge``).

    Each row is expected to carry the title at index 1 and the summary at
    index 2.  One line is written per row, in the form
    ``<n> <title> <summary>``, where ``n`` is the 1-based position of the row
    in the result set (not the database id).
    """
    print(tablemerge.getAllCount(dbconfig.mergetable))
    rows = tablemerge.getTitleSummary(dbconfig.mergetable)
    if rows == -1:
        return

    # Compiled once, outside the loop; raw string so ``\s`` is a regex class
    # rather than an (invalid) string escape.
    whitespace = re.compile(r'\s+')

    with open(news_file, 'w') as fout:
        for position, row in enumerate(rows, 1):
            title = row[1].strip()
            # All whitespace is removed from the summary before segmentation.
            summary = whitespace.sub('', row[2])
            # Segment with jieba, join tokens with single spaces, lower-case,
            # pass through delpunc (presumably strips punctuation -- confirm),
            # then encode to UTF-8.
            # NOTE(review): encoding before %-formatting relies on Python 2
            # byte-string semantics.
            title = delpunc(' '.join(jieba.cut(title)).lower()).encode('utf-8')
            summary = delpunc(' '.join(jieba.cut(summary)).lower()).encode('utf-8')
            fout.write('%s %s %s\n' % (position, title, summary))
Exemplo n.º 2
0
def getDuplicationRate():
    """Print the fraction of rows removed by each of the two merge stages.

    The baseline is the sum of ``table.getAllCount`` over every table name in
    ``dbconfig.tableName``.  The first rate compares that total with the row
    count of ``dbconfig.mergetable``; the second compares
    ``dbconfig.mergetable`` with ``dbconfig.mergetable2``.  Each line is
    printed as ``rate (removed/total)``.

    When a stage's total is zero the rate is reported as 0.0 instead of
    raising ``ZeroDivisionError``.
    """
    count = sum(table.getAllCount(tablename)
                for tablename in dbconfig.tableName.itervalues())
    mcount = tablemerge.getAllCount(dbconfig.mergetable)
    m2count = tablemerge2.getAllCount(dbconfig.mergetable2)

    first_removed = count - mcount
    second_removed = mcount - m2count

    # Guard the denominators: empty tables would otherwise crash the report.
    first_rate = float(first_removed) / count if count else 0.0
    second_rate = float(second_removed) / mcount if mcount else 0.0

    print('First Duplication Rate: %.4f (%d/%d)' % (first_rate, first_removed, count))
    print('Second Duplication Rate: %.4f (%d/%d)' % (second_rate, second_removed, mcount))