def work2JPG(filename, isPng=False):
    filepath = FileHelper.realpath(filename)
    filedir = FileHelper.dirname(filepath)
    name = FileHelper.basename(filepath)
    os.chdir(tpDir)
    # Re-encode the image in place as a quality-90 JPEG.
    jpgCMD = """%s -quality 90 %s %s """ % (convertBin, filepath, filepath)
    os.system(jpgCMD)
    tmpfilename = FileHelper.join(filedir, hashlib.md5(name.encode('utf-8')).hexdigest())
    isSuccess = True
    with open(tmpfilename, 'wb+') as tmpFile:
        try:
            # Container layout: 'MNG' magic, then the JPEG payload prefixed with its size.
            tmpFile.write(b'MNG')
            FileHelper.writeWithSize(tmpFile, filepath)
        except Exception:
            Log.printDetailln("error00 !!!", filename, "cannot convert.")
            isSuccess = False
    if isSuccess:
        # Replace the original file with the packed container.
        FileHelper.remove(filepath)
        FileHelper.rename(tmpfilename, filepath)
        return 5
    else:
        return 2
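# Hedged usage sketch for work2JPG (the path is hypothetical): the numeric
# return codes follow the convention used by the other work_* helpers in this
# module, 5 meaning "packed in place without gzip" and 2 meaning failure.
def _demo_work2JPG():
    ret = work2JPG('/tmp/example.png')
    if ret == 5:
        print 'image re-encoded and wrapped in the MNG container'
    else:
        print 'conversion failed'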
def __init__(self, language='en'):
    path = os.path.join('text', 'stopwords-%s.txt' % language)
    if language not in self._cached_stop_words:
        self._cached_stop_words[language] = \
            set(FileHelper.loadResourceFile(path).splitlines())
    self.STOP_WORDS = self._cached_stop_words[language]
    self.language = language
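# A minimal usage sketch, assuming this __init__ belongs to a StopWords-style
# class (hypothetical name) whose `_cached_stop_words` dict is a class-level
# attribute shared by all instances, so each stopwords-<language>.txt resource
# is read and parsed only once:
#
#   en = StopWords('en')          # loads text/stopwords-en.txt
#   en_again = StopWords('en')    # served from the class-level cache
#   print 'the' in en.STOP_WORDS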
def extract_all_links(infile, outfile, base_url=None, mode='a'):
    '''Extract links as train and test samples.'''
    content = FileHelper.readUTF8File(infile)
    if content is None:
        return None
    docstring = content.lower()
    doctree = HtmlHelper.create_doc(docstring, 'utf-8')
    if base_url is None:
        # Saved pages carry their source URL on the first line.
        try:
            base_url = content.splitlines()[0]
        except (IndexError, AttributeError):
            return None
    if doctree is None:
        return None
    doctree = HtmlHelper.pre_process_domtree(doctree)
    if doctree is None:
        return None
    url_items = list(get_link_word_by_pair(docstring, base_url, None))
    with open(outfile, mode) as fw:
        for item in url_items:
            anchor = item['anchor_text']
            url = item['url']
            tmp = anchor.decode('utf-8')
            print url, anchor
            if len(tmp) > 5 and isdigit(tmp):
                fw.write('%s\t%s\n' % (url, anchor))
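# Hedged usage sketch for extract_all_links (hypothetical paths): base_url can
# usually be omitted because the saved page's first line holds its source URL;
# mode='a' appends to an existing sample file.
def _demo_extract_all_links():
    extract_all_links('/tmp/pages/0.html', '/tmp/url_samples.txt', mode='a')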
def create_train(src_dirs, destfile):
    with open(destfile, 'a') as fw:
        for f in os.listdir(src_dirs):
            print f
            path = os.path.join(src_dirs, f)
            content = FileHelper.readUTF8File(path)
            doc = HtmlHelper.create_doc(content, 'utf-8')
            doc = HtmlHelper.pre_process_domtree(doc)
def get_document(name):
    home = '/mnt/UbutunShare/graduate/DataSet/PageClassification/%s/' % name
    json_dir = home + 'json/'
    mkdirs(home, json_dir)
    history = home + 'history.txt'
    visited_files = get_history(history)
    # Open in append mode so earlier history entries survive across runs.
    finput = open(history, 'a')
    yule = home + 'yule/'
    sport = home + 'sport/'
    finance = home + 'finance/'
    junshi = home + 'junshi/'
    for dir in [yule, sport, finance, junshi]:
        print dir
        for f in os.listdir(dir):
            if f in visited_files:
                continue
            print '========================================'
            print 'parse file: %s ....' % f
            t = dir + f
            content = FileHelper.readUTF8File(t)
            if content is None:
                continue
            try:
                base_url = content.splitlines()[0]
            except (IndexError, AttributeError):
                continue
            document = html2words(content, base_url, encoding='utf-8')
            # if document is None:
            #     os.remove(t)
            #     continue
            # div = document['para']
            # if len(div) < 50:
            #     os.remove(t)
            #     continue
            json_file = os.path.join(json_dir, f.split('.')[0] + '.json')
            save(document, json_file)
            finput.writelines(f + '\n')
    finput.close()
def predict(name):
    home = '/mnt/UbutunShare/graduate/DataSet/PageClassification/%s/' % name
    temp = home + 'temp/'
    mkdirs(temp)
    supervisior = MultiClassifier('test-3-topic')
    supervisior.load(path='/mnt/UbutunShare/TopicalCrawl/TopicalCrawl/classifier')
    cetd = home + 'plain-text/cetd/'
    wish_ce = home + 'plain-text/wish-ce/'
    remove_tag_ = home + 'plain-text/remove-tag/'
    history = temp + 'history.txt'
    visited_files = get_history(history)
    # Open in append mode so earlier history entries survive across runs.
    finput = open(history, 'a')
    document_class = {}
    for dir in [cetd, wish_ce, remove_tag_]:
        # Class labels per extractor: 0 = cetd, 1 = wish-ce, 2 = remove-tag.
        if 'cetd' in dir:
            class_ = '0'
        elif 'wish' in dir:
            class_ = '1'
        else:
            class_ = '2'
        print dir
        print '========================'
        document_class[class_] = {}
        tmpfile = open(temp + class_ + '.txt', 'w')
        for f in os.listdir(dir):
            if f in visited_files:
                continue
            print '==============================='
            print f
            t = dir + f
            content = FileHelper.readUTF8File(t)
            if content is None:
                continue
            predicted_y, precision = supervisior.predict(content)
            tmpfile.write('%s\t%s\n' % (predicted_y, f))
            document_class[class_][f] = predicted_y
            finput.writelines(f + '\n')
        tmpfile.close()
    finput.close()
    json.dump(document_class, open(temp + 'test_result.txt', 'w'))
def get_document(path):
    content = FileHelper.readUTF8File(path)
    if content is None:
        return None
    # Saved pages carry their source URL on the first line.
    try:
        base_url = content.splitlines()[0]
    except (IndexError, AttributeError):
        return None
    docstring = content.lower()
    doctree = HtmlHelper.create_doc(docstring, 'utf-8')
    if doctree is None:
        return None
    document = html2words(docstring, base_url)
    return document
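# Hedged usage sketch for get_document (the path is hypothetical): None is
# returned for unreadable files, missing first-line URLs, and unparsable
# markup; 'title' is one of the fields used elsewhere in this pipeline.
def _demo_get_document():
    document = get_document('/tmp/pages/0.html')
    if document is not None:
        print document['title'].encode('utf-8')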
def create_train_url(name):
    src_dir = '/mnt/UbutunShare/graduate/DataSet/scrapy_dataset/'
    supervisior = MultiClassifier('test-6-topic')
    supervisior.load(path='/mnt/UbutunShare/TopicalCrawl/TopicalCrawl/classifier')
    url_train_path = os.path.join(src_dir, 'url_train.txt')
    with open(url_train_path, 'w') as fw:
        for dir in ['qq/', 'sina/', 'ifeng/', 'sohu/']:
            original = src_dir + dir + 'original/'
            for f in os.listdir(original):
                print '========================================'
                print 'parse file: %s ....' % f
                t = os.path.join(original, f)
                content = FileHelper.readUTF8File(t)
                if content is None:
                    continue
                try:
                    base_url = content.splitlines()[0]
                except IndexError:
                    continue
                for link in get_link_word_by_pair(content, base_url, supervisior, 'utf-8'):
                    try:
                        fw.write('%s\t%s\t%s\t%s\t%.3f\n' % (
                            link['parent_url'], link['url'], link['anchor_text'],
                            str(link['label']), link['interestness']))
                    except UnicodeDecodeError:
                        continue
def get_page_para(name):
    home = '/mnt/UbutunShare/graduate/DataSet/PageClassification/%s/' % name
    cetd = home + 'plain-text/cetd/'
    wish_ce = home + 'plain-text/wish-ce/'
    remove_tag_ = home + 'plain-text/remove-tag/'
    mkdirs(cetd, wish_ce, remove_tag_)
    history = home + 'history.txt'
    visited_files = get_history(history)
    # Open in append mode so earlier history entries survive across runs.
    finput = open(history, 'a')
    yule = home + 'yule/'
    sport = home + 'sport/'
    finance = home + 'finance/'
    junshi = home + 'junshi/'
    for dir in [yule, sport, finance, junshi]:
        for f in os.listdir(dir):
            if f in visited_files:
                continue
            print '========================================'
            print 'parse file: %s ....' % f
            t = dir + f
            content = FileHelper.readUTF8File(t)
            if content is None:
                continue
            try:
                base_url = content.splitlines()[0]
            except (IndexError, AttributeError):
                continue
            docstring = content.lower()
            doctree = HtmlHelper.create_doc(docstring, 'utf-8')
            if doctree is None:
                continue  # skip unparsable pages instead of aborting the whole run
            cetd_doc = copy.deepcopy(doctree)
            try:
                # Extract the article three ways: CETD, WISH-CE, and plain tag removal.
                article_c = get_aricle_cetd(cetd_doc)
                doctree = HtmlHelper.pre_process_domtree(doctree)
                article_w, title = HtmlHelper.get_article(doctree)
                article_w = ' '.join([title, article_w])
                article_r = remove_tags(docstring)
                finput.writelines(f + '\n')
            except Exception:
                continue
            f = f.split('.')[0] + '.txt'
            FileHelper.WriteInUTF8(cetd + f, article_c)
            FileHelper.WriteInUTF8(wish_ce + f, article_w)
            FileHelper.WriteInUTF8(remove_tag_ + f, article_r)
    finput.close()
def create_train_samples(name, model='test-zh-topic'):
    supervisior, original, history, tree, json_dir, \
        url, url_sample_file, url_sample_pickle, url_sample, \
        document_class, document_class_file = init(name)
    visited_files = get_history(history)
    finput = open(history, 'a')
    list_file = os.listdir(original)
    # Process files in the numeric order of their "<n>.html" names.
    num = lambda x: int(x.split('.')[0])
    list_file.sort(cmp=lambda a, b: num(a) - num(b))
    for f in list_file:
        if f in visited_files:
            continue
        print '========================================'
        print 'parse file: %s ....' % f
        t = os.path.join(original, f)
        content = FileHelper.readUTF8File(t)
        if content is None:
            continue
        try:
            base_url = content.splitlines()[0]
        except (IndexError, AttributeError):
            continue
        document = html2words(content, base_url, encoding='utf-8', supervisior=supervisior)
        if document is None:
            os.remove(t)
            continue
        pure_text = '\t'.join([document['title'], document['meta_descrip'], document['para']])
        predicted_y, precision = supervisior.predict(pure_text)
        tree_file = os.path.join(tree, f)
        json_file = os.path.join(json_dir, f.split('.')[0] + '.json')
        save(document, json_file, tree_file)
        key = f.split('.')[0]
        urlitems_file = os.path.join(url, key)
        cPickle.dump(document['url_items'], open(urlitems_file, 'wb'), -1)
        item = gen_url_item(base_url, document['title'], predicted_y, precision)
        url_sample[key] = item
        document_class[key] = predicted_y
        # with open(url_sample_file, 'a') as fu:
        #     fu.write('%s\t%s\t%s\t%s\t%.3f\n' % (
        #         key, encode(item['url']), encode(item['anchor_text']),
        #         item['label'], item['interestness']))
        finput.writelines(f + '\n')
    finput.close()
    # cPickle.dump(url_sample, open(url_sample_pickle, 'wb'), -1)
    json.dump(document_class, open(document_class_file, 'w'))
# import lxml.etree as etree
# doc1 = etree.fromstring(html1)
# doc2 = etree.fromstring(html2)
# root1 = ElementHelper.get_root(doc1)
# root2 = ElementHelper.get_root(doc2)
# w = WISH()
# print w.similar_check(root1, root2)

# Alternative test inputs; only the last assignment takes effect.
# dir = '/mnt/UbutunShare/graduate/DataSet/scrapy_dataset/ifeng/original/other_neg_524.html'
# dir = '/mnt/UbutunShare/graduate/DataSet/PageClassification/Test1/yule/yule (55).html'
# dir = '/mnt/UbutunShare/graduate/DataSet/1.txt'
# dir = 'classifier/sample-data/1.html'
dir = '/mnt/UbutunShare/Work/CETD_DATA/Test/original/0.htm'

from api import HtmlHelper
from util import FileHelper

content = FileHelper.readUTF8File(dir)
doc = HtmlHelper.create_doc(content, 'utf-8')
doc = HtmlHelper.pre_process_domtree(doc)
article, title = HtmlHelper.get_article(doc, True)
print article.encode('utf-8')
def work_file_PVR(filename, isDTC4Module=False, isTC4=False):
    filepath = FileHelper.realpath(filename)
    filedir = FileHelper.dirname(filepath)
    sys.stdout.flush()
    os.chdir(toolsPath)
    isTC4 = True
    isAlphaJPG = False
    if isDTC4Module:
        isTC4 = False
    preCMD = ""  # " -p " (the preAlpha flag used in work_file_ETC) stays disabled here
    info = ImageInfo.size(filepath)
    # Only PNG textures are supported.
    if info[0] != 'PNG':
        return 2
    width = info[1]
    height = info[2]
    # Only square power-of-two textures are supported (parentheses required:
    # `!=` binds tighter than `&` in Python).
    if (width & (width - 1)) != 0 or width != height:
        return 2
    rgbCMD = """ %s -f PVRTC1_4_RGB %s -q pvrtcbest -i %s -o %s """ % (
        pvrTexToolBin, preCMD, filepath, filepath.replace(".png", ".pvr"))
    alphaCMD = """%s %s -alpha extract %s """ % (
        convertBin, filepath, filepath.replace(".png", ".alpha.jpg"))
    alphaJPGCMD = """ %s -f PVRTC1_4_RGB -q pvrtcbest -i %s -o %s """ % (
        pvrTexToolBin, filepath.replace(".png", ".alpha.jpg"),
        filepath.replace(".png", ".alpha.pvr"))
    if isTC4:
        # PVRTC1_4 keeps the alpha channel, so no separate alpha plane is produced.
        rgbCMD = """ %s -f PVRTC1_4 %s -q pvrtcbest -i %s -o %s """ % (
            pvrTexToolBin, preCMD, filepath, filepath.replace(".png", ".pvr"))
    try:
        FileHelper.remove(filepath.replace(".png", ".pkm"))
        FileHelper.remove(filepath.replace(".png", "_alpha.pkm"))
        os.system(rgbCMD)
        if not isTC4:
            os.system(alphaCMD)
        if not isAlphaJPG and not isTC4:
            os.system(alphaJPGCMD)
        if not FileHelper.exists(filepath.replace(".png", ".pvr")):
            return 2
        os.rename(filepath.replace(".png", ".pvr"), filepath.replace(".png", ".pkm"))
        if not isTC4:
            if isAlphaJPG:
                os.rename(filepath.replace(".png", ".alpha.jpg"),
                          filepath.replace(".png", "_alpha.pkm"))
            else:
                # Keep the PVRTC-compressed alpha plane (mirrors work_file_ETC).
                os.rename(filepath.replace(".png", ".alpha.pvr"),
                          filepath.replace(".png", "_alpha.pkm"))
            FileHelper.remove(filepath.replace(".png", ".alpha.jpg"))
            FileHelper.remove(filepath.replace(".png", ".alpha.pvr"))
    except Exception:
        Log.printError()
        return 2
    tmpfilename = filepath.replace(".png", ".tmp")
    FileHelper.remove(tmpfilename)
    isSuccess = True
    with open(tmpfilename, 'wb+') as tmpFile:
        try:
            # Container layout: 'MNG' magic, then each plane prefixed with its size.
            tmpFile.write(b'MNG')
            rgbname = filepath.replace(".png", ".pkm")
            tmpFile.write(pack("i", os.stat(rgbname).st_size))
            rgbfile = open(rgbname, "rb")
            tmpFile.write(rgbfile.read())
            rgbfile.close()
            alphaname = filepath.replace(".png", "_alpha.pkm")
            if not isTC4:
                tmpFile.write(pack("i", os.stat(alphaname).st_size))
                alphafile = open(alphaname, "rb")
                tmpFile.write(alphafile.read())
                alphafile.close()
            # if preAlpha:
            #     tmpFile.write('p')
            # else:
            #     tmpFile.write('P')
            if not isSaveTransFile:
                FileHelper.remove(rgbname)
                FileHelper.remove(alphaname)
        except Exception:
            t, v, tb = sys.exc_info()
            Log.printDetailln(t, v)
            isSuccess = False
    if isSuccess:
        if isUseGzip:
            gzip_cmd = gzipBin + tmpfilename + " -n -f -9"
            os.system(gzip_cmd)
            FileHelper.remove(tmpfilename.replace(".tmp", ".png"))
            FileHelper.rename(tmpfilename + ".gz", tmpfilename.replace(".tmp", ".png"))
            return 3
        else:
            FileHelper.remove(tmpfilename.replace(".tmp", ".png"))
            FileHelper.rename(tmpfilename, tmpfilename.replace(".tmp", ".png"))
            return 5
    else:
        return 2  # match the failure code returned by the other workers
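# A minimal reader sketch for the container the work_* helpers write: a b'MNG'
# magic, then one payload per texture plane, each prefixed with a native-endian
# int written by pack("i", ...). The helper name is hypothetical, and gzipped
# output (isUseGzip) would need to be decompressed first.
def _read_mng_container(path):
    from struct import unpack
    payloads = []
    with open(path, 'rb') as fp:
        assert fp.read(3) == b'MNG', 'not an MNG container'
        size_bytes = fp.read(4)
        while len(size_bytes) == 4:
            (size,) = unpack("i", size_bytes)
            payloads.append(fp.read(size))
            size_bytes = fp.read(4)
    return payloads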
def work_png(filename):
    filepath = FileHelper.realpath(filename)
    filedir = FileHelper.dirname(filepath)
    os.chdir(tpDir)
    isSaveTransFile = False
    isPng = True
    useGZIP = False
    if isPng:
        # Split the PNG into an RGB JPEG (black background) and an alpha-mask JPEG.
        jpgCMD = """%s %s -background black %s """ % (
            convertBin, filepath, filepath.replace(".png", ".rgb.jpg"))
        alphaCMD = """%s %s -alpha extract %s """ % (
            convertBin, filepath, filepath.replace(".png", ".alpha.jpg"))
        try:
            os.system(jpgCMD)
            os.system(alphaCMD)
        except Exception:
            Log.printDetailln("error33 !!!", filename, "cannot convert.")
            return 2
    tmpfilename = filepath.replace(".png", ".tmp")
    FileHelper.remove(tmpfilename)
    isSuccess = True
    with open(tmpfilename, 'wb+') as tmpFile:
        try:
            # Container layout: 'MNG' magic, then size-prefixed RGB and alpha planes.
            tmpFile.write(b'MNG')
            rgbname = filepath.replace(".png", ".rgb.jpg")
            FileHelper.writeWithSize(tmpFile, rgbname)
            alphaname = filepath.replace(".png", ".alpha.jpg")
            FileHelper.writeWithSize(tmpFile, alphaname)
            if not isSaveTransFile:
                FileHelper.remove(rgbname)
                FileHelper.remove(alphaname)
        except Exception:
            Log.printError()
            isSuccess = False
    if isSuccess:
        if useGZIP:
            gzip_cmd = gzipBin + tmpfilename + " -n -f -9"
            os.system(gzip_cmd)
            FileHelper.remove(tmpfilename.replace(".tmp", ".png"))
            FileHelper.rename(tmpfilename + ".gz", tmpfilename.replace(".tmp", ".png"))
            return 3
        else:
            FileHelper.remove(tmpfilename.replace(".tmp", ".png"))
            FileHelper.rename(tmpfilename, tmpfilename.replace(".tmp", ".png"))
            return 5
    else:
        return 2
def work_file_ETC(filename, isAlphaJPG=False, isFast=False):
    filepath = FileHelper.realpath(filename)
    filedir = FileHelper.dirname(filepath)
    sys.stdout.flush()
    # preAlpha = needPreAplha(filedir)
    preAlpha = False
    preCMD = " -p "
    if not preAlpha:
        preCMD = ""
    os.chdir(toolsPath)
    if filename.find(".png") != -1:
        isPng = True
    elif filename.find(".jpg") != -1:
        isPng = False
    else:
        return 2
    quality = 'etcfast' if isFast else 'etcslow'
    rgbCMD = """ %s -f ETC1 %s -q %s -i %s -o %s """ % (
        pvrTexToolBin, preCMD, quality, filepath, filepath.replace(".png", ".pvr"))
    alphaCMD = """%s %s -alpha extract %s """ % (
        convertBin, filepath, filepath.replace(".png", ".alpha.jpg"))
    alphaJPGCMD = """ %s -f ETC1 -q %s -i %s -o %s """ % (
        pvrTexToolBin, quality, filepath.replace(".png", ".alpha.jpg"),
        filepath.replace(".png", ".alpha.pvr"))
    try:
        if isPng:
            FileHelper.remove(filepath.replace(".png", ".pkm"))
            FileHelper.remove(filepath.replace(".png", "_alpha.pkm"))
            os.system(rgbCMD)
            os.system(alphaCMD)
            if not isAlphaJPG:
                os.system(alphaJPGCMD)
            FileHelper.rename(filepath.replace(".png", ".pvr"), filepath.replace(".png", ".pkm"))
            if isAlphaJPG:
                FileHelper.rename(filepath.replace(".png", ".alpha.jpg"),
                                  filepath.replace(".png", "_alpha.pkm"))
            else:
                FileHelper.rename(filepath.replace(".png", ".alpha.pvr"),
                                  filepath.replace(".png", "_alpha.pkm"))
            FileHelper.remove(filepath.replace(".png", ".alpha.jpg"))
            FileHelper.remove(filepath.replace(".png", ".alpha.pvr"))
        else:
            # JPEG input: a single opaque ETC1 plane, no alpha extraction.
            FileHelper.remove(filepath.replace(".jpg", ".pkm"))
            rgbCMD = """ %s -f ETC1 -p -q %s -i %s -o %s """ % (
                pvrTexToolBin, quality, filepath, filepath.replace(".jpg", ".pvr"))
            os.system(rgbCMD)
            FileHelper.rename(filepath.replace(".jpg", ".pvr"), filepath.replace(".jpg", ".pkm"))
    except Exception:
        t, v, tb = sys.exc_info()
        Log.printDetailln(t, v)
    if isPng:
        tmpfilename = filepath.replace(".png", ".tmp")
        FileHelper.remove(tmpfilename)
        isSuccess = True
        with open(tmpfilename, 'wb+') as tmpFile:
            try:
                # Container layout: 'MNG' magic, then size-prefixed RGB and alpha planes.
                tmpFile.write(b'MNG')
                rgbname = filepath.replace(".png", ".pkm")
                alphaname = filepath.replace(".png", "_alpha.pkm")
                FileHelper.writeWithSize(tmpFile, rgbname)
                FileHelper.writeWithSize(tmpFile, alphaname)
                # if preAlpha:
                #     tmpFile.write('p')
                # else:
                #     tmpFile.write('P')
                if not isSaveTransFile:
                    FileHelper.remove(rgbname)
                    FileHelper.remove(alphaname)
            except Exception:
                Log.printError()
                isSuccess = False
        if isSuccess:
            if isUseGzip:
                gzip_cmd = gzipBin + tmpfilename + " -n -f -9"
                os.system(gzip_cmd)
                FileHelper.remove(tmpfilename.replace(".tmp", ".png"))
                FileHelper.rename(tmpfilename + ".gz", tmpfilename.replace(".tmp", ".png"))
                return 3
            else:
                FileHelper.remove(tmpfilename.replace(".tmp", ".png"))
                FileHelper.rename(tmpfilename, tmpfilename.replace(".tmp", ".png"))
                return 5
        else:
            FileHelper.remove(tmpfilename)
            return 2
    else:
        tmpfilename = filepath.replace(".jpg", ".pkm")
        if not FileHelper.exists(tmpfilename):
            Log.printDetailln("error !!!", filepath, "cannot convert.")
            return 2
        if isUseGzip:
            gzip_cmd = gzipBin + tmpfilename + " -n -f -9"
            os.system(gzip_cmd)
            FileHelper.remove(tmpfilename.replace(".pkm", ".jpg"))
            FileHelper.rename(tmpfilename + ".gz", tmpfilename.replace(".pkm", ".jpg"))
            return 3
        else:
            FileHelper.remove(tmpfilename.replace(".pkm", ".jpg"))
            FileHelper.rename(tmpfilename, tmpfilename.replace(".pkm", ".jpg"))
            return 4
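# Hedged sketch of dispatching on the return codes used throughout this module
# (2 = failed or unsupported, 3 = packed and gzipped, 4 = JPEG converted in
# place, 5 = packed without gzip); the input path is hypothetical.
def _demo_work_file_ETC():
    ret = work_file_ETC('/tmp/example.png', isAlphaJPG=False, isFast=True)
    print 'work_file_ETC returned', ret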
def CleanTreeByMark(element):
    # CETD marks: 0 = noise (drop the subtree), 1 = content (keep it whole),
    # anything else = mixed, so recurse into the children.
    mark = long(element.get(kg_mark))
    if 0 == mark:
        ElementHelper.remove_element(element)
    elif 1 == mark:
        return
    else:
        # Iterate over a copy: recursion may remove children mid-iteration.
        for child in list(element):
            CleanTreeByMark(child)


def get_aricle_cetd(doctree):
    cetd_parse(doctree)
    body = ElementHelper.get_body(doctree)
    # ElementHelper.print_element(body)
    CleanTreeByMark(body)
    RemoveAttribute(body)
    return ElementHelper.element_text_content(body)


if __name__ == '__main__':
    # dir = '/mnt/UbutunShare/graduate/DataSet/scrapy_dataset/other_neg/original/42.html'
    # dir = '/mnt/UbutunShare/graduate/DataSet/original/0.htm'
    dir = '/mnt/UbutunShare/graduate/DataSet/scrapy_dataset/ifeng/image/24.html'
    from api import HtmlHelper
    from util import FileHelper
    content = FileHelper.readUTF8File(dir)
    doc = HtmlHelper.create_doc(content, 'utf-8')
    print get_aricle_cetd(doc)