def _store_zip_pages(page_db, zip_path):
    """Store the members of the zip archive at zip_path; return the stored URLs.

    Members whose last "/"-separated component is "linkinfo" are skipped.
    Each remaining member name is mapped to a URL with httpsplit(); the
    member is written through page_db only when page_db.checkexist() is true.
    """
    stored_urls = []
    archive = ZipFile(zip_path, "r")
    try:
        for member in archive.namelist():
            if split(member, "/")[-1] == "linkinfo":
                continue
            url = httpsplit(member)
            page_db.url = url
            # NOTE(review): the reviewed copy had lost its indentation; the
            # append is assumed to belong to the checkexist() branch and the
            # progress dot to every non-"linkinfo" member -- confirm.
            if page_db.checkexist():
                page_db.content = archive.read(member)
                page_db.insertoriginalct()
                stored_urls.append(url)
            stderr.write(".")
    finally:
        # The archive handle used to be leaked; always release it.
        archive.close()
    return stored_urls


def _store_url_page(page_db, url):
    """Fetch url with openhtml() and store it; return the stored URLs."""
    stored_urls = []
    page_db.url = url
    if page_db.checkexist():
        page_db.content = openhtml(url)
        page_db.insertoriginalct()
        stored_urls.append(url)
    return stored_urls


def main():
    """Command-line entry point: store pages, then run the processing steps.

    Options, as read from sys.argv:
        --help | -h | --usage | -?   call help() and exit with status 0
        --zipfile | -z PATH          store the members of a zip archive
        --url | -u URL               fetch and store a single URL

    After storing, the collected URLs are passed to urldbinsert(), the page
    database is synced, and OriginalHTMLprocess() followed by
    linesplitinster() are run on them.

    Any invocation that cannot be carried out (no arguments, unknown option,
    missing URL, no existing archive path) calls help() and exits with
    status 2 instead of failing later with NameError or IndexError.
    """
    args = sys.argv[1:]
    if not args:
        help()
        sys.exit(2)  # common exit code for syntax error
    if args in (["--help"], ["-h"], ["--usage"], ["-?"]):
        help()
        sys.exit(0)

    option = args[0]
    zip_path = None
    url = None
    if option in ("--zipfile", "-z"):
        # Only look at the arguments after the flag. The old scan covered all
        # of sys.argv, so the script's own path could be opened as the
        # archive. As before, the last argument naming an existing path wins.
        existing = [arg for arg in args[1:] if os.path.exists(arg)]
        if existing:
            zip_path = existing[-1]
    elif option in ("--url", "-u"):
        if len(args) > 1:
            url = args[1]

    if zip_path is None and url is None:
        help()
        sys.exit(2)

    page_db = OriginalPage()
    try:
        if zip_path is not None:
            print("\nOriginalHTML Insert:")
            stored_urls = _store_zip_pages(page_db, zip_path)
        else:
            stored_urls = _store_url_page(page_db, url)
        urldbinsert(stored_urls)
        page_db.sync()
        print("\nOriginalHTML Process:")
        md5urllist = OriginalHTMLprocess(stored_urls)
        print("\nWordSplitting Process:")
        linesplitinster(md5urllist)
    finally:
        # Close the page database even when a processing step raises.
        page_db.close()
def OriginalHTMLprocess(listsplit):
    """Extract text from the stored HTML of each URL and store it split into lines.

    For every URL in ``listsplit`` the md5hex() digest of the URL is recorded
    as ``md5urllist[digest] = [url]``.  When ``purei.checkexist()`` is true
    for that digest, the stored HTML is read through OriginalPage, converted
    with langconvert(), parsed with html2txt() for text and title, filtered
    of control characters, cut into lines and written with
    ``purei.insertPurecontent()``.

    Returns the ``md5urllist`` dictionary.

    NOTE(review): indentation was lost in the reviewed copy of this function;
    the nesting below was reconstructed from syntax and data flow -- confirm
    against the original file.
    """
    OriginalHTMLdb = OriginalPage()
    ilog = infologger()
    purei = Purecontent("c")
    # Matches anything between "<" and ">", including embedded newlines.
    pat = re.compile("<([^>]|\n)*>")
    # NOTE(review): this pattern looks like an entity-decoded "&nbsp;" /
    # "&copy;" alternation -- confirm the literal against the original file.
    space = re.compile("\ \;|\©\;|\r|\t")
    stmk = stopmarks()
    md5urllist = {}
    for i in listsplit:
        md5url = md5hex(i)
        md5urllist[md5url] = [i]
        word = ""
        st = time.time()  # start time for the elapsed-time log entries below
        purei.url_md5 = md5url
        if purei.checkexist():
            OriginalHTMLdb.url = i
            # First pass over the raw stored page: only used to read the charset.
            parser = html2txt()
            try:
                parser.feed(OriginalHTMLdb.queryoriginalct())
                charset = parser.charset  # charset detector
                parser.close()
            except:
                charset = ""
            Originaltext = langconvert(OriginalHTMLdb.queryoriginalct(), charset)
            Originaltext = Originaltext.decode("utf-8")
            ilog.sentence_split_info(time.time() - st)
            try:  # If this page is normal html format
                parser = ""
                parser = html2txt()
                parser.feed(Originaltext)
                word = word + parser.text
                if len(word) == 0:
                    # Parser produced no text: replace tags and the "space"
                    # matches with blanks instead.
                    word = word + space.sub(chr(32), pat.sub(chr(32), Originaltext))
                contenttitle = clearspace(parser.title)
                parser.close()
                # print contenttitle,i,charset
                purei.title = contenttitle.encode("utf-8")
            except:
                # Parsing failed: retry for the title only, then fall back to
                # the regex-based tag removal for the text.
                try:
                    parser = html2txt()
                    parser.feed(Originaltext)
                    contenttitle = clearspace(parser.title)
                    parser.close()
                except:
                    contenttitle = ""
                purei.title = contenttitle.encode("utf-8")
                word = word + space.sub(chr(32), pat.sub(chr(32), Originaltext))
            context = ""
            ilog.sentence_split_info(time.time() - st)
            n = 0
            # Keep characters with code >= 32 plus tab (9), LF (10) and CR (13).
            for xw in word:
                if ord(xw) >= 32 or ord(xw) in [9, 10, 13]:
                    context = context + xw
                n += 1
                if n > 40000000:  # may exceed 65535 lines in a document.
                    break
            ilog.sentence_split_info(purei.title + str(len(context)) + i + charset)
            context = context + chr(32)
            contline = []
            contline.append("")
            # From here on "i" no longer holds the URL; the for statement
            # rebinds it on the next iteration.
            i = 0  # line of contline list
            # for x in xrange(len(context)):
            x = 0  # word number
            msl = 260  # length a line must exceed before it may be ended
            while x < len(context):
                ordx = ord(context[x])
                contline[i] = contline[i] + context[x]
                sentencecount = len(clearspace((contline[i])))
                # sentencecount=len(contline[i])
                # End the line once it is longer than msl and sits on an
                # "a-type" stop mark or the two-character literal below, or
                # is longer than msl + 20 and sits on a "b-type" stop mark or
                # on a line feed followed by a character with code < 65.
                # NOTE(review): the second literal contained a character that
                # appears only as a line break in the reviewed copy; it is
                # written as "\n" here -- confirm against the original file.
                if (
                    sentencecount > msl
                    and stmk.atypestopmarks(ordx)
                    or sentencecount > msl
                    and context[x : x + 2] == ". \n"
                    or sentencecount > msl + 20
                    and stmk.btypestopmarks(ordx)
                    or sentencecount > msl + 20
                    and ordx == 10
                    and ord(context[x + 1 : x + 2]) < 65
                ):
                    nextword = context[x + 1 : x + 2]
                    if nextword:
                        if punctuationmarks(ord(nextword)):
                            # In some cases Chinese text uses two marks in a
                            # row; keep the second mark on the same line.
                            x += 1
                            contline[i] = contline[i] + context[x]
                    contline.append("")
                    i += 1
                    if msl <= 16640 and i % 2:
                        # Double it, until this value is bigger than 4000.
                        # NOTE(review): the guard above is 16640, not 4000.
                        msl = msl + msl
                x += 1
                if sentencecount < msl:
                    # Line still shorter than the threshold: append the next
                    # msl characters in one step and skip past them.
                    contline[i] = contline[i] + context[x : x + msl]
                    x = x + msl
            contcleanline = []
            i = 0
            ilog.sentence_split_info(time.time() - st)
            for x in contline:
                cont = clearspace(x)
                if len(cont) > 1:
                    # Drop one leading and/or trailing blank.
                    if cont[0] == chr(32) and cont[-1] == chr(32):
                        cont = cont[1:-1]
                    elif cont[-1] == chr(32):
                        cont = cont[:-1]
                    elif cont[0] == chr(32):
                        cont = cont[1:]
                # Keep lines shorter than 65025 characters that are not a
                # single blank, stored UTF-8 encoded.
                if len(cont) < 65025 and cont != chr(32):
                    contcleanline.append(cont.encode("utf-8"))
                i = i + 1
            ilog.sentence_split_info(time.time() - st)
            purei.purecotentinline = contcleanline
            purei.content = clearspace(context).encode("utf-8")
            purei.insertPurecontent()
        stderr.write(".")  # progress dot
    OriginalHTMLdb.close()
    purei.close()
    return md5urllist