Пример #1
0
    def insertPurecontent(self):
        """Store the current page under a newly allocated serial key.

        Nothing is written when ``self.pdb`` already has ``self.url_md5`` as
        a key, or when ``self.content`` is at most one character long.

        Otherwise the method, in this order:

        * allocates a serial: the key of the last record reported by
          ``self.serialcursor`` decoded with ``contentline.asciitoint``,
          plus one, re-encoded with ``contentline.inttoascii``;
        * records serial -> url_md5 in ``self.serialdb`` and
          url_md5 -> serial in ``self.puresedb``;
        * stores the zlib-compressed (level 9) content in ``self.pdb``
          under the serial;
        * stores the title in ``self.tdb`` under the serial when no entry
          exists there yet and the title is longer than one character;
        * stores the number of content lines in ``self.purecontentcount``;
        * writes the title as line 0 and every entry of
          ``self.purecotentinline`` as lines 1..n into ``self.pureinline``,
          keyed by serial + ``contentline.lintoascii(line_number)``.
        """
        # NOTE(review): this guard looks up url_md5 in pdb, but below pdb is
        # written with the serial as key, while puresedb is the mapping keyed
        # by url_md5. Confirm whether the duplicate check should consult
        # puresedb instead.
        if self.pdb.has_key(self.url_md5) or len(self.content) <= 1:
            return

        # (Re)write the seed record under the key of four NUL bytes --
        # presumably so that last() has a record to return on an empty
        # database; verify against how serialcursor is opened.
        self.serialdb[chr(0) * 4] = "0"
        lastkey = self.serialcursor.last()[0]
        newserial = contentline.inttoascii(contentline.asciitoint(lastkey) + 1)

        # Two-way mapping between the serial and the URL digest.
        self.serialdb[newserial] = self.url_md5
        self.puresedb["%s" % self.url_md5] = newserial

        packed = zlib.compress(self.content, 9)
        self.pdb["%s" % newserial] = "%s" % (packed)

        titlestored = self.tdb.has_key(newserial)
        if not titlestored and len(self.title) > 1:
            self.tdb["%s" % newserial] = "%s" % (self.title)

        # Line count first, then the lines themselves.
        self.purecontentcount["%s" % newserial] = str(len(self.purecotentinline))

        # Line 0 always carries the title; content lines start at 1.
        self.pureinline["%s" % newserial + contentline.lintoascii(0)] = self.title
        for offset, line in enumerate(self.purecotentinline):
            linekey = newserial + contentline.lintoascii(offset + 1)
            self.pureinline["%s" % linekey] = line
Пример #2
0
def _spawn_splitters(purei, pureserial):
    """Start one word-splitting worker per stored line key and return them.

    Line keys are ``pureserial + contentprocess.lintoascii(i)`` for ``i`` in
    ``0..count`` inclusive, where ``count`` is the stored content count for
    the serial; no worker is started when that count is missing or falsy.
    """
    # Look the count up once instead of once for the test and once for use.
    storedcount = purei.querycontentcount(pureserial)
    if storedcount:
        purecount = int(storedcount) + 1
    else:
        purecount = 0

    workers = []
    for seri in xrange(purecount):
        querykey = pureserial + contentprocess.lintoascii(seri)
        # Throttle: wait until fewer than config.splitercpu workers are active.
        while count_active(workers) >= config.splitercpu:
            time.sleep(0.5)
        worker = bngram.wordspliting(purei.querycontentinline(querykey), querykey)
        workers.append(worker)
        worker.start()
    return workers


def _store_out_of_dic_words(workers):
    """Join every worker and store its ``companword`` list via DataInsert.

    Returns True when at least one non-empty ``companword`` list was stored,
    which the caller uses to decide whether to reload the word dictionary.
    """
    dba = DataInsert()
    dba.outdicdbinit()  # open the database of words that are not in the dic
    stored = False
    try:
        dba.companwordcount = 0
        for worker in workers:
            # NOTE(review): join() is given a timeout, so a worker may
            # presumably still be running when its results are read below --
            # confirm that wordspliting tolerates this.
            worker.join(config.splitertimeout)
            dba.wordlist = worker.companword
            if dba.wordlist:
                dba.anuworddb()
                stored = True
    finally:
        # Close the out-of-dic database even when a worker or a write fails.
        dba.outdicdbclose()
    return stored


def _index_worker_words(wordi, workers):
    """Hand each worker's word lists to ``wordi.tempwurl`` for its line key.

    ``wordi.getdicdb`` is set to 1 before passing ``dicword`` and to 2 before
    passing ``companword``; empty lists are skipped.
    """
    for worker in workers:
        if worker.dicword:
            wordi.getdicdb = 1
            wordi.dicword = worker.dicword
            wordi.tempwurl(worker.querykey)
        if worker.companword:
            wordi.getdicdb = 2
            wordi.dicword = worker.companword
            wordi.tempwurl(worker.querykey)


def _sync_word_pages(wordi):
    """Sync the word pages, call ``reloadxmlrpcd()`` and report on stderr."""
    stderr.write("dbsync")
    wordi.sync_wpage()
    if reloadxmlrpcd():
        stderr.write("+")


def linesplitinster(md5urllist):
    """Split the stored lines of every URL in *md5urllist* into words and index them.

    For each key of *md5urllist* the function resolves the page serial,
    runs one ``bngram.wordspliting`` worker per stored line, stores the
    workers' out-of-dictionary words, reloads the dictionary if any were
    stored, and passes the resulting word lists to ``TextInsert``.  Word
    pages are synced after more than 8192 URLs have been processed since the
    last sync, and once more at the end.  Progress is written to stderr
    ("." per URL, "dbsync" per sync, "+" when ``reloadxmlrpcd()`` is truthy).

    Args:
        md5urllist: mapping whose keys are the URL digests to process; only
            ``keys()`` is used.

    The content store and the word dictionary database are closed even when
    processing raises; the exception itself still propagates to the caller.
    """
    purei = Purecontent("r")
    try:
        wordi = TextInsert()
        try:
            wsynccount = 0
            for md5url in md5urllist.keys():
                pureserial = purei.queryserial(md5url)
                workers = _spawn_splitters(purei, pureserial)
                if _store_out_of_dic_words(workers):
                    # New out-of-dic words were written: reload the dictionary.
                    wordi.anureload()
                _index_worker_words(wordi, workers)

                wsynccount += 1
                if wsynccount > 8192:
                    _sync_word_pages(wordi)
                    wsynccount = 0
                stderr.write(".")

            # Final sync for whatever was indexed since the last periodic one.
            _sync_word_pages(wordi)
        finally:
            wordi.closedicdb()
    finally:
        purei.close()
Пример #3
0
 def x3tracp(self, x3t):
     """Pack the pair *x3t* into a string: encoded start plus a length character.

     The result is ``contentline.lintoascii(x3t[0])`` followed by the single
     character ``chr(x3t[1] - x3t[0])``.  ``chr`` raises ``ValueError`` when
     that difference is negative or above the range it accepts (0-255 on
     Python 2).
     """
     start = x3t[0]
     prefix = contentline.lintoascii(start)
     return prefix + chr(x3t[1] - start)