Exemplo n.º 1
0
def urldbinsert(listsplit):
    """Insert every URL of ``listsplit`` into the URL database.

    For each item the hex MD5 digest is computed with ``hashlib.md5`` and the
    item and its digest are handed to a ``DataInsert`` object through its
    ``url`` and ``md5url`` attributes before ``inserturldb()`` is called.

    Args:
        listsplit: iterable of URL strings; every item must be acceptable
            to ``hashlib.md5``.

    Raises:
        Whatever ``hashlib.md5`` or the ``DataInsert`` methods raise; the
        database is closed before the exception propagates.
    """
    urlinsert = DataInsert()
    urlinsert.urldbinit()  # open the url database group
    try:
        for url in listsplit:
            md5url = hashlib.md5(url).hexdigest()
            urlinsert.url = url
            urlinsert.md5url = md5url
            urlinsert.inserturldb()
    finally:
        # Close the database even when hashing or an insert fails part-way,
        # so the handle opened by urldbinit() is never left dangling.
        urlinsert.urldbclose()
Exemplo n.º 2
0
 def __init__(self):
     """Reset the per-page fields and create the two storage helpers.

     ``uni``, ``title`` and ``content`` start as empty strings and
     ``md5urllist`` as an empty dict.  A ``Purecontent("c")`` object is
     kept in ``purei`` and a ``DataInsert`` object in ``urlinsert``;
     ``urldbinit()`` is called on the latter straight away.
     """
     # Per-page state, overwritten for every page that is processed.
     self.uni = self.title = self.content = ""
     self.md5urllist = {}

     # Storage helpers; the creation order matches the original code.
     self.purei = Purecontent("c")
     urldb = DataInsert()
     self.urlinsert = urldb
     urldb.urldbinit()
Exemplo n.º 3
0
def linesplitinster(md5urllist):
    """Split the stored content of every page in ``md5urllist`` into words.

    For each key of ``md5urllist`` the stored content lines are read through
    a ``Purecontent("r")`` object and each line is handed to a
    ``bngram.wordspliting`` worker.  The workers' ``companword`` lists are
    stored through ``DataInsert.anuworddb()``; their ``dicword`` and
    ``companword`` results are then passed to ``TextInsert.tempwurl()``.
    ``TextInsert.sync_wpage()`` runs whenever more than 8192 pages have been
    processed since the last sync, and once more after the last page.

    Progress is reported on ``stderr``: ``.`` per page, ``dbsync`` per sync
    and ``+`` whenever ``reloadxmlrpcd()`` returns a true value.

    Args:
        md5urllist: dict whose keys are the MD5 digests of the pages to
            process (the values are not read here).

    The ``TextInsert`` and ``Purecontent`` objects are closed even when
    processing fails part-way.
    """
    purei = Purecontent("r")
    try:
        wordi = TextInsert()
        try:
            wsynccount = 0
            for md5url in md5urllist:
                workers = _startsplitters(purei, md5url)
                if _storecompanwords(workers):
                    # New out-of-dictionary words were stored, so have
                    # wordi reload before the word/url pass below.
                    wordi.anureload()
                _storewordurls(wordi, workers)
                wsynccount += 1
                if wsynccount > 8192:
                    _syncwordpages(wordi)
                    wsynccount = 0
                stderr.write(".")
            _syncwordpages(wordi)
        finally:
            wordi.closedicdb()
    finally:
        purei.close()


def _startsplitters(purei, md5url):
    """Start one ``bngram.wordspliting`` worker per content line of a page.

    Returns the list of started workers in line order (empty when
    ``purei.querycontentcount`` returns a false value for the page).
    """
    pureserial = purei.queryserial(md5url)
    # Query the count once; a false value means there is nothing to split.
    rawcount = purei.querycontentcount(pureserial)
    purecount = int(rawcount) + 1 if rawcount else 0
    workers = []
    for seri in xrange(purecount):
        querykey = pureserial + contentprocess.lintoascii(seri)
        # Throttle: wait while count_active() reports at least
        # config.splitercpu workers.
        while count_active(workers) >= config.splitercpu:
            time.sleep(0.5)
        worker = bngram.wordspliting(purei.querycontentinline(querykey), querykey)
        workers.append(worker)
        worker.start()  # execute worker.run()
    return workers


def _storecompanwords(workers):
    """Join every worker and store its non-empty ``companword`` list.

    Returns True when at least one list was passed to
    ``DataInsert.anuworddb()``, otherwise False.
    """
    dba = DataInsert()
    dba.outdicdbinit()  # open the word database which are out of dic
    try:
        dba.companwordcount = 0
        stored = False
        for worker in workers:
            # NOTE(review): join() is bounded by config.splitertimeout; if
            # the worker is thread-like and the timeout expires, its result
            # may still be incomplete when read below -- confirm.
            worker.join(config.splitertimeout)
            dba.wordlist = worker.companword
            if dba.wordlist:
                dba.anuworddb()
                stored = True
    finally:
        # Always release the out-of-dictionary database.
        dba.outdicdbclose()
    return stored


def _storewordurls(wordi, workers):
    """Pass each worker's ``dicword`` / ``companword`` result to ``wordi``."""
    for worker in workers:
        if worker.dicword:
            wordi.getdicdb = 1
            wordi.dicword = worker.dicword
            wordi.tempwurl(worker.querykey)
        if worker.companword:
            wordi.getdicdb = 2
            wordi.dicword = worker.companword
            wordi.tempwurl(worker.querykey)


def _syncwordpages(wordi):
    """Run ``wordi.sync_wpage()`` and ``reloadxmlrpcd()``, logging to stderr."""
    stderr.write("dbsync")
    wordi.sync_wpage()
    if reloadxmlrpcd():
        stderr.write("+")
Exemplo n.º 4
0
class Contentprocess(object):
    """Stores the cleaned text of pages and records their URLs.

    Pages are fed in through ``contentadd`` (or by setting ``uni``,
    ``title`` and ``content`` and calling ``contentinsert``).  When done,
    ``closeandreturn`` closes both stores and returns the accumulated
    ``{md5 hex digest: url}`` dict.
    """

    def __init__(self):
        """Create the storage helpers and empty per-page fields."""
        self.uni = ""  # URL of the page being processed (hashed with MD5)
        self.title = ""
        self.content = ""
        self.md5urllist = {}  # md5 hex digest -> url for every page seen
        self.purei = Purecontent("c")
        self.urlinsert = DataInsert()
        self.urlinsert.urldbinit()

    def closeandreturn(self):
        """Close both stores and return the ``{md5url: url}`` dict.

        ``urlinsert.urldbclose()`` runs even when ``purei.close()`` raises;
        the exception then propagates to the caller.
        """
        try:
            self.purei.close()
        finally:
            self.urlinsert.urldbclose()
        return self.md5urllist

    def contentadd(self, largeinsert):
        """Insert every page of ``largeinsert``.

        Args:
            largeinsert: mapping of URL to a sequence whose item 0 is the
                title and item 1 is the content of the page.
        """
        for uni in largeinsert.keys():
            cdata = largeinsert[uni]
            self.uni = uni
            self.title = cdata[0]
            self.content = cdata[1]
            self.contentinsert()

    def contentinsert(self):
        """Record the current page's URL and store its cleaned text.

        The MD5 hex digest of ``self.uni`` is added to ``md5urllist`` and
        inserted into the URL database unconditionally.  The text itself is
        stored only when ``self.purei.checkexist()`` returns a true value
        (NOTE(review): the exact meaning of that check is defined by
        ``Purecontent`` -- confirm).  In that case the content is stripped
        of control characters, split into lines, cleaned, UTF-8 encoded and
        passed to ``self.purei.insertPurecontent()``; a ``.`` is written to
        ``stderr`` afterwards.
        """
        md5url = hashlib.md5(self.uni).hexdigest()
        self.purei.url_md5 = md5url
        self.md5urllist[md5url] = self.uni
        # url db
        self.urlinsert.url = self.uni
        self.urlinsert.md5url = md5url
        self.urlinsert.inserturldb()
        stmk = stopmarks()

        if not self.purei.checkexist():
            return

        self.purei.title = self.title.encode("utf-8")
        context = self._filtercontrol(self.content)
        contline = self._splitlines(context, stmk)
        self.purei.purecotentinline = self._cleanlines(contline)
        self.purei.content = clearspace(context).encode("utf-8")
        self.purei.insertPurecontent()
        stderr.write(".")

    @staticmethod
    def _filtercontrol(word):
        """Return ``word`` without control characters plus a trailing space.

        Characters with a code below 32 are dropped, except tab (9), line
        feed (10) and carriage return (13).  Only the first 40000001
        characters are examined (original author's note: the document "may
        over 65535 line").
        """
        kept = [
            ch
            for ch in word[:40000001]
            if ord(ch) >= 32 or ord(ch) in (9, 10, 13)
        ]
        return "".join(kept) + chr(32)

    @staticmethod
    def _splitlines(context, stmk):
        """Split ``context`` into a list of raw lines at stop marks.

        A line is closed once its ``clearspace``-cleaned length exceeds
        ``msl`` and the current character is an "a type" stop mark or starts
        ``". "``, or once it exceeds ``msl + 20`` and the character is a
        "b type" stop mark or a line feed followed by a character whose code
        is below 65.  ``msl`` starts at 260 and is doubled on every
        odd-numbered new line while it is still <= 16640.

        Args:
            context: text that ends with a space (as produced by
                ``_filtercontrol``).
            stmk: object providing ``atypestopmarks`` / ``btypestopmarks``.
        """
        contline = [""]
        i = 0  # index of the line currently being filled
        x = 0  # position in context
        msl = 260
        while x < len(context):
            ordx = ord(context[x])
            contline[i] = contline[i] + context[x]
            sentencecount = len(clearspace(contline[i]))
            overmsl = sentencecount > msl
            wellover = sentencecount > msl + 20
            # context always ends with a space, so whenever context[x] is a
            # line feed the slice context[x + 1 : x + 2] is non-empty and
            # ord() on it is safe.
            if (
                overmsl
                and (stmk.atypestopmarks(ordx) or context[x : x + 2] == ". ")
                or wellover
                and (
                    stmk.btypestopmarks(ordx)
                    or (ordx == 10 and ord(context[x + 1 : x + 2]) < 65)
                )
            ):
                nextword = context[x + 1 : x + 2]
                if nextword and punctuationmarks(ord(nextword)):
                    # at some case, chinese word will use two marks.
                    x += 1
                    contline[i] = contline[i] + context[x]
                contline.append("")
                i = len(contline) - 1
                if msl <= 16640 and i % 2:
                    msl = msl + msl  # double it until it exceeds 16640
            x += 1
            if sentencecount < msl:
                # The line is still short: copy the next msl characters in
                # one step without examining them for stop marks.
                contline[i] = contline[i] + context[x : x + msl]
                x = x + msl
        return contline

    @staticmethod
    def _cleanlines(contline):
        """Clean, trim and UTF-8 encode the raw lines of ``contline``.

        Each line goes through ``clearspace``; when the result is longer
        than one character, a single leading and a single trailing space
        are removed.  Results of 65025 characters or more, and results
        that are exactly one space, are dropped.
        """
        contcleanline = []
        for line in contline:
            cont = clearspace(line)
            if len(cont) > 1:
                if cont[0] == chr(32) and cont[-1] == chr(32):
                    cont = cont[1:-1]
                elif cont[-1] == chr(32):
                    cont = cont[:-1]
                elif cont[0] == chr(32):
                    cont = cont[1:]
            if len(cont) < 65025 and cont != chr(32):
                contcleanline.append(cont.encode("utf-8"))
        return contcleanline