예제 #1
0
import sys
sys.path.append('..')
sys.path.append('../common')
from common.logger import log
import getdoc
from database import table,tablemerge,dbconfig
from aggregate.rmduplicate import Depository
import time
import logging
from common import toolpit
from config import merge1_rmd_file

# Build the module-level depository from the records returned by
# getdoc.get_records_dayago for dbconfig.mergetable, then report how long
# the build took (both on stdout and through logging).
oldtime = time.time()
# depos = Depository(0.8, merge1_rmd_file)  # for debug
depos = Depository(0.8)
docs = getdoc.get_records_dayago(dbconfig.mergetable, 20)
for doc, summary in docs.iteritems():
    # Only register the doc in the depository; the result of add_doc is
    # deliberately ignored here.
    depos.add_doc(doc, summary)
msg = 'Depository summary builds,time cost: %.2f (s)' % (time.time() - oldtime)
print(msg)
logging.info(msg)

def __addDoctoTable(doc):
    # Fetch the stored row(s) for `doc` by its source and uid, and start
    # building an extended record from the first one.
    # NOTE(review): the remainder of this function is not visible in this
    # excerpt, so what happens to `exrecord` afterwards cannot be stated here.
    rows=table.getRecordsById(doc.source, doc.uid)
    # A result of -1 is treated as a lookup error: report it and bail out
    # (the function then returns None).
    if rows==-1:
        print '%s getRecordsById error'%(doc.source)
        return
    # Only continue when the first element of `rows` is non-empty.
    if len(rows[0])>0:
        # Copy into a list so two extra fields can be appended.
        exrecord=list(rows[0])
        # The extra fields start out as an empty mtype and a click value of 0.
        mtype,click='',0
        exrecord+=[mtype,click]  
예제 #2
0
        # remove from the inverse index
        if doc in self.forindex:
            _summary=self.forindex[doc]
            for word in _summary:
                if word in self.invindex and doc in self.invindex[word]:
                    self.invindex[word].remove(doc) 
            # remove from the forward index
            self.forindex.pop(doc)
    def remove_doc_before(self, ctime):
        """Remove every indexed doc whose ``ctime`` is earlier than ``ctime``.

        Candidates are gathered first and removed afterwards, so
        ``self.forindex`` is not modified while it is being iterated.
        A summary with the number of removed docs and the number of docs
        still indexed is printed and logged.
        """
        stale = [entry for entry in self.forindex if entry.ctime < ctime]
        for entry in stale:
            self.__remove_doc(entry)
        msg = 'remove:%s,left:%s' % (len(stale), len(self.forindex))
        print(msg)
        logging.info(msg)
    
if __name__ == '__main__':
    # Ad-hoc driver: feed the records of every configured table into a fresh
    # depository, then print the elapsed time.
    # NOTE(review): `oldtime` is expected to be set earlier in the module;
    # its definition is not visible in this excerpt.
    depos = Depository(0.8)
    for tablename in dbconfig.tableName.itervalues():
        docs = getdoc.get_records_dayago(tablename, 30)
        for doc, summary in docs.iteritems():
            # add_doc's result is unpacked as (isnew, exist_doc) but nothing
            # is done with it; the debug print below was left disabled.
            isnew, exist_doc = depos.add_doc(doc, summary)
            # print exist_doc.uid,exist_doc.source,'-->',doc.uid,doc.source
    print('time costs:%.2f (s)' % (time.time() - oldtime))