def transDoc(self):
    """Convert the HTML source files in ``self.htmlph`` into document files.

    Every entry returned by ``os.listdir(self.htmlph)`` is read, its encoding
    is guessed with ``chardet``, and the content is decoded unless the guess
    is ``'utf-8'``.  The content is then handed to ``collector`` and the
    result of ``collec.xml(name).toxml()`` is written to a file with the same
    name under ``self.xmlph``.

    Decoding and XML generation are best-effort: a failure is reported on
    stdout and processing moves on, so one bad file does not stop the batch.
    """
    for hp in os.listdir(self.htmlph):
        # Single-argument print(...) behaves the same as the Python 2
        # statement form used elsewhere in this file.
        print(hp)

        # ``with`` closes the handle even if reading raises.
        with open(os.path.join(self.htmlph, hp)) as src:
            c = src.read()

        # Detect the encoding automatically and convert when needed.
        res = chardet.detect(c)
        print(res)
        coding = res['encoding']
        if coding != 'utf-8':
            try:
                c = c.decode(coding)
            except Exception:
                # Keep the undecoded content and carry on; ``except Exception``
                # (not a bare except) lets KeyboardInterrupt/SystemExit through.
                print('something wrong')

        collec = collector(c)  # start parsing

        # The output file is opened before the XML is generated, so a failed
        # conversion still leaves an (empty) file behind, as before.
        with open(os.path.join(self.xmlph, hp), 'w') as dst:
            try:
                dst.write(collec.xml(hp).toxml())  # write to the new file
            except Exception:
                print('can not trans xml')
def __init__(self, site_id, Name, runtime_queue, list, per_max_num, Flcok, home_urls):
    """Initialise the thread and reset the site's on-disk locations.

    site_id:       passed to ``path(...)`` to obtain the corresponding directory
    Name:          forwarded to ``threading.Thread`` as the thread name
    runtime_queue: stored as ``self.runtime_queue``
    list:          stored as ``self.list``
    per_max_num:   stored as ``self.maxnum``
    Flcok:         stored as ``self.Flcok`` (presumably a lock -- confirm with callers)
    home_urls:     stored as ``self.home_urls`` and passed to ``Urltest`` and ``collector``
    """
    threading.Thread.__init__(self, name=Name)
    self.runtime_queue = runtime_queue

    # Path management.
    site_path = path(site_id)
    self.path = site_path

    self.num, self.maxnum = 0, per_max_num
    self.list, self.Flcok = list, Flcok

    self.urltest = Urltest(home_urls)
    self.htmlparser = Collector()
    self.collector = collector(home_urls)

    # Initialise the home url list and the inbound queue.
    self.home_urls = home_urls
    self.inqueue = Queue()

    # Start cleaning up the original directories.
    # Create the site directory.
    site_path.mk_dir(site_path.g_site())
    # Remove the urltest file.
    site_path.rm_file(site_path.g_urltest())
    # Empty the document directory.
    site_path.clean_dir(site_path.g_document())
def __init__(self, site_id, Name, runtime_queue, list, per_max_num, Flcok, home_urls):
    """Initialise the thread and reset the site's on-disk locations.

    site_id:       passed to ``path(...)`` to obtain the corresponding directory
    Name:          forwarded to ``threading.Thread`` as the thread name
    runtime_queue: stored as ``self.runtime_queue``
    list:          stored as ``self.list``
    per_max_num:   stored as ``self.maxnum``
    Flcok:         stored as ``self.Flcok`` (presumably a lock -- confirm with callers)
    home_urls:     stored as ``self.home_urls`` and passed to ``Urltest`` and ``collector``
    """
    threading.Thread.__init__(self, name=Name)
    self.runtime_queue = runtime_queue

    # Path management.
    site_path = path(site_id)
    self.path = site_path

    self.num, self.maxnum = 0, per_max_num
    self.list, self.Flcok = list, Flcok

    self.urltest = Urltest(home_urls)
    self.htmlparser = Collector()
    self.collector = collector(home_urls)

    # Initialise the home url list and the inbound queue.
    self.home_urls = home_urls
    self.inqueue = Queue()

    # Start cleaning up the original directories.
    # Create the site directory.
    site_path.mk_dir(site_path.g_site())
    # Remove the urltest file.
    site_path.rm_file(site_path.g_urltest())
    # Empty the document directory.
    site_path.clean_dir(site_path.g_document())
def transDoc(self):
    """Convert the HTML source files in ``self.htmlph`` into document files.

    Every entry returned by ``os.listdir(self.htmlph)`` is read, its encoding
    is guessed with ``chardet``, and the content is decoded unless the guess
    is ``'utf-8'``.  The content is then handed to ``collector`` and the
    result of ``collec.xml(name).toxml()`` is written to a file with the same
    name under ``self.xmlph``.

    Decoding and XML generation are best-effort: a failure is reported on
    stdout and processing moves on, so one bad file does not stop the batch.
    """
    for hp in os.listdir(self.htmlph):
        # Single-argument print(...) behaves the same as the Python 2
        # statement form used elsewhere in this file.
        print(hp)

        # ``with`` closes the handle even if reading raises.
        with open(os.path.join(self.htmlph, hp)) as src:
            c = src.read()

        # Detect the encoding automatically and convert when needed.
        res = chardet.detect(c)
        print(res)
        coding = res['encoding']
        if coding != 'utf-8':
            try:
                c = c.decode(coding)
            except Exception:
                # Keep the undecoded content and carry on; ``except Exception``
                # (not a bare except) lets KeyboardInterrupt/SystemExit through.
                print('something wrong')

        collec = collector(c)  # start parsing

        # The output file is opened before the XML is generated, so a failed
        # conversion still leaves an (empty) file behind, as before.
        with open(os.path.join(self.xmlph, hp), 'w') as dst:
            try:
                dst.write(collec.xml(hp).toxml())  # write to the new file
            except Exception:
                print('can not trans xml')
def __init__(self, Name, runtime_queue, list, per_max_num, Flcok):
    """Initialise the thread and its bookkeeping attributes.

    Name:          forwarded to ``threading.Thread`` as the thread name
    runtime_queue: stored as ``self.runtime_queue``
    list:          stored as ``self.list``
    per_max_num:   stored as ``self.maxnum``
    Flcok:         stored as ``self.Flcok`` (presumably a lock -- confirm with callers)
    """
    threading.Thread.__init__(self, name=Name)
    self.runtime_queue = runtime_queue

    self.num, self.maxnum = 0, per_max_num
    self.list, self.Flcok = list, Flcok

    # Helper objects, all built with their default arguments.
    self.urltest = Urltest()
    self.htmlparser = Collector()
    self.collector = collector()

    # Initialise the home url list (empty at first) and the inbound queue.
    self.home_urls = []
    self.inqueue = Queue()
def __init__(self, Name, runtime_queue, list, per_max_num, Flcok):
    """Initialise the thread and its bookkeeping attributes.

    Name:          forwarded to ``threading.Thread`` as the thread name
    runtime_queue: stored as ``self.runtime_queue``
    list:          stored as ``self.list``
    per_max_num:   stored as ``self.maxnum``
    Flcok:         stored as ``self.Flcok`` (presumably a lock -- confirm with callers)
    """
    threading.Thread.__init__(self, name=Name)
    self.runtime_queue = runtime_queue

    self.num, self.maxnum = 0, per_max_num
    self.list, self.Flcok = list, Flcok

    # Helper objects, all built with their default arguments.
    self.urltest = Urltest()
    self.htmlparser = Collector()
    self.collector = collector()

    # Initialise the home url list (empty at first) and the inbound queue.
    self.home_urls = []
    self.inqueue = Queue()