def __init__(self, name, setting=None):
    """Initialize the statistics store.

    Remembers the configured port (if any), makes sure the ``db/``
    directory exists, formats the database path with *name*, and creates
    the request_tm table (named by ``self._tbName``) if it is missing.

    Args:
        name: logical store name, substituted into ``self._dbPath``.
        setting: optional dict of options; only ``'port'`` is read here.

    Fix: the default for ``setting`` was a shared mutable dict (``{}``);
    it is now ``None`` with an in-body fallback — same observable
    behavior, no mutable-default pitfall.
    """
    if setting is None:
        setting = {}
    if setting:
        if setting.get('port'):
            self._port = setting.get('port')
    if (not os.path.exists('db/')):
        os.mkdir('db/')
    self._name = name
    # _dbPath is a template attribute (defined on the class) until formatted
    self._dbPath = self._dbPath.format(name)
    conn = None
    try:
        conn = sqlite3.connect(self._dbPath)
        c = conn.cursor()
        c.execute('''CREATE TABLE IF NOT EXISTS {} (id INTEGER PRIMARY KEY AUTOINCREMENT, name VARCHAR(50) NOT NULL, method VARCHAR(10) NOT NULL, url TEXT NOT NULL, tmSpan REAL NOT NULL, concurrency INT NOT NULL DEFAULT 0, countPer10s INT NOT NULL DEFAULT 0, state INT NOT NULL DEFAULT 0, size INT NOT NULL DEFAULT 0, tm CHAR(20));'''.format(self._tbName))
        conn.commit()
    except Exception as err:
        printInfo('error', err)
    finally:
        # close unconditionally (the original closed after the except block,
        # which it also always reached; finally makes that explicit)
        if (conn):
            conn.close()
def downloadThread2(self, *args):
    # Worker entry point: download one url dict (args[0]) on behalf of a
    # spider instance (args[1]), hand the html to the spider, queue any
    # extracted data, and record per-request statistics.
    url = args[0]
    ssp = args[1]
    try:
        # NOTE(review): the concurrency counter is DECREMENTED on entry —
        # presumably the dispatcher incremented it before starting this
        # thread; confirm against the caller.
        self._concurrency -= 1
        printInfo(url['url'])
        self._downloadPageNum += 1
        startTm = time.time()
        url['_startTm'] = startTm
        html = execDownload(url, ssp)
        url['_endTm'] = time.time()
        tmSpan = url['_endTm'] - startTm
        state = 1
        if (html == "_error_"):
            # sentinel string signalling a failed download
            state = 0
            ssp.downloadError(url)
        else:
            data = ssp.saveHtml(url, html)
            # a dict/list result from saveHtml is handed to the extract stage
            if data and (isinstance(data, dict) or isinstance(data, list)):
                self._extractQueue.put((ssp, data))
        if self.statistics:
            # 10-second buckets alternate between index 0 and 1; report the
            # count from the bucket that just finished (1 - curIndex)
            curIndex = int(time.time() / 10) % 2
            self.statistics.addRecode(
                ssp, url, tmSpan, state, self._concurrency,
                self._downloadPagePer10s[1 - curIndex],
                len(html) if html and state else 0)
    except Exception as err:
        self.log(err, logging.ERROR)
def __init__(self, name):
    # File-backed URL store. Three append-mode files persist state across
    # runs: the url queue (one JSON object per line), the set of seen url
    # digests, and the index of the last consumed queue position.
    try:
        self._urlFilename=self._urlFilename.format(name)
        self._dicFilename=self._dicFilename.format(name)
        self._indexFilename=self._indexFilename.format(name)
        if(not os.path.exists('db/')):
            os.mkdir('db/')
        # "a+" keeps existing content and allows appends; seek(0) rewinds
        # each file so the previous run's data can be read back
        self._urlfile = io.open(self._urlFilename, "a+",encoding="utf-8")
        self._dicfile = io.open(self._dicFilename, "a+",encoding="utf-8")
        self._indexfile = io.open(self._indexFilename, "a+",encoding="utf-8")
        self._urlfile.seek(0)
        self._dicfile.seek(0)
        self._indexfile.seek(0)
        index = self._indexfile.read()
        if(index):
            # resume position persisted by a previous run
            self._i = int(index)
        line = 'start'
        while(line):
            line = self._dicfile.readline()
            if(line):
                # drop the trailing newline before recording the digest
                self._dic.add(line[:-1])
        i = 0
        line = 'start'
        while(line):
            line = self._urlfile.readline()
            # skip entries consumed in a previous run (positions < self._i)
            if(i<self._i):
                i=i+1
                continue
            if(line):
                self._urls.append(json.loads(line[:-1]))
    except Exception as err:
        printInfo(err)
def log(self, msg, level=logging.DEBUG):
    """Print *msg*, and additionally forward it to the root logger when
    *level* is ERROR.

    UnicodeEncodeError instances are printed with a marker and never
    forwarded to the logger.
    """
    if isinstance(msg, UnicodeEncodeError):
        printInfo('UnicodeEncodeError', msg)
        return
    printInfo(msg)
    if level != logging.ERROR:
        return
    logging.LoggerAdapter(logging.getLogger(), None).log(level, msg)
def clearRecode(self):
    """Delete every row from the request_tm table in ``self._dbPath``.

    Errors are reported via printInfo rather than raised.

    Fix: ``conn.close()`` moved into a ``finally`` block so a failure
    inside the except handler can no longer leak the connection.
    """
    conn = sqlite3.connect(self._dbPath)
    try:
        cursor = conn.cursor()
        cursor.execute("delete from request_tm")
        conn.commit()
    except Exception as err:
        printInfo('error', err)
    finally:
        conn.close()
def listImg(html, baseUrl=None, start=None, end=None, before=None):
    """Collect <img> tags from *html*.

    Args:
        html: page source to scan.
        baseUrl: when given, relative ``src`` urls are resolved against it.
        start/end/before: markers passed to getSection() to restrict the
            scanned region of the page.

    Returns:
        A list of Dict entries with ``url``, ``alt`` (and ``relativeUrl``
        when a relative src was resolved), de-duplicated by url, keeping
        the longest alt text seen for each.

    Fix: the mid-function failure paths previously returned None while the
    first guard (and the sibling listA()) return [] — every path now
    returns a list, so callers can iterate unconditionally.
    """
    if (not html or html.find("<img") < 0):
        return []
    # limit scanning to the region selected by start/end/before
    section = getSection(html, start, end, before)
    s = section[0]
    e = section[1]
    if (s < 0 or e < s):
        return []
    html = html[s:e]
    if (not html or html.find("<img") < 0):
        return []
    patternLst = _getRegex(u'<img[\s]+[^>]*>')
    patternUrl = _getRegex(u'src[\s]*=[\s\'"]*(?P<url>.*?)[\'"\s>]')
    patternTitle = _getRegex(u'alt[\s]*=[\s\'"]*(?P<title>.*?)[\'"\s>]')
    lstStr = patternLst.findall(html)
    dic = Dict()
    for i in lstStr:
        url = None
        title = None
        tmp = patternUrl.search(i)
        if tmp:
            url = tmp.group("url")
        tmp = patternTitle.search(i)
        if tmp:
            title = tmp.group("title")
        try:
            if (url):
                url = url.strip().lower()
                if (baseUrl and url[:7] != "http://" and url[:8] != "https://"):
                    # relative src: resolve against the page url
                    absUrl = absoluteUrl(baseUrl, url)
                    if (absUrl != url):
                        d = dic[absUrl]
                        if d:
                            # keep the longer alt text for duplicates
                            if not d.alt or (title and len(d.alt) < len(title)):
                                d['alt'] = title
                        else:
                            dic[absUrl] = Dict({
                                'url': absUrl,
                                'alt': title,
                                'relativeUrl': url
                            })
                else:
                    d = dic[url]
                    if d:
                        if not d.alt or (title and len(d.alt) < len(title)):
                            d['alt'] = title
                    else:
                        dic[url] = Dict({'url': url, 'alt': title})
        except Exception as ex:
            printInfo(ex)
    return list(dic.values())
def listA(html,baseUrl=None,start=None,end=None,before=None):
    # Extract <a> anchors from html, returning de-duplicated Dict entries
    # with 'url', 'title' (and 'relativeUrl' for resolved relative links).
    if(not html):
        return []
    # limit scanning to the region selected by start/end/before
    section = getSection(html,start,end,before)
    s = section[0]
    e = section[1]
    if(s < 0 or e < s):
        return []
    html = html[s:e]
    if(not html or html.find("<a")<0):
        return []
    patternLst = _getRegex(u'<a[\s]+[^>]*>[\s\S]*?</a>')
    patternUrl = _getRegex(u'href[\s]*=[\s\'"]*(?P<url>.*?)[\'"\s>]')
    patternTitle1 = _getRegex(u'title[\s]*=[\s\'"]*(?P<title>.*?)[\'"\s>]')
    patternTitle2 = _getRegex(u'<a[\s]+[^>]*>(?P<title>.*?)</a>')
    strA = patternLst.findall(html)
    dic = Dict()
    for i in strA:
        url = None
        title = None
        tmp = patternUrl.search(i)
        if tmp:
            url = tmp.group("url")
        # prefer the title= attribute; fall back to the anchor's inner text
        tmp = patternTitle1.search(i)
        if tmp:
            title = tmp.group("title")
        if(not title):
            tmp = patternTitle2.search(i)
            if tmp:
                title = tmp.group("title")
                # strip any nested tags from the inner text
                title = _getRegex('<[^<>]+>').sub('',title)
        try:
            if(url):
                url = url.strip().lower()
                if(baseUrl and url[:7]!="http://" and url[:8]!="https://"):
                    # relative link: resolve against baseUrl; on duplicates
                    # keep the longer title
                    absUrl = absoluteUrl(baseUrl,url)
                    if(absUrl!=url):
                        d = dic[absUrl]
                        if d:
                            if not d.title or (title and len(d.title)<len(title)):
                                d['title'] = title
                        else:
                            dic[absUrl] = Dict({'url':absUrl,'title':title,'relativeUrl':url})
                else:
                    # absolute url with no path slash after the scheme:
                    # normalize with a trailing '/'
                    if url.rfind('/')<9:
                        url += '/'
                    d = dic[url]
                    if d:
                        if not d.title or (title and len(d.title)<len(title)):
                            d['title'] = title
                    else:
                        dic[url] = Dict({'url':url,'title':title})
        except Exception as ex:
            printInfo(ex)
    return list(dic.values())
def _insertRT(self, datas):
    """Bulk-insert request-timing rows into request_tm.

    Each element of *datas* is a 9-tuple matching the column list
    (name, method, url, tmSpan, concurrency, countPer10s, state, size, tm).
    Errors are printed, not raised.

    Fix: ``conn.close()`` moved into ``finally`` so a failure inside the
    except handler cannot leak the connection.
    """
    conn = sqlite3.connect(self._dbPath)
    try:
        cursor = conn.cursor()
        cursor.executemany(
            "insert into request_tm(name,method,url,tmSpan,concurrency,countPer10s,state,size,tm) values(?,?,?,?,?,?,?,?,?)",
            tuple(datas))
        conn.commit()
    except Exception as err:
        printInfo('error', err)
    finally:
        conn.close()
def _select(self, sql):
    """Run a read-only query against ``self._dbPath``.

    Returns:
        ``(rows, header)`` — the fetched row tuples and the list of column
        names from the cursor description.  On error a partial/empty
        result is returned and the error is printed.

    Fix: ``conn.close()`` moved into ``finally`` so the connection is
    released even if the except handler itself fails.
    """
    conn = sqlite3.connect(self._dbPath)
    rows = []
    header = []
    try:
        cursor = conn.cursor().execute(sql)
        for i in cursor.description:
            header.append(i[0])
        for row in cursor:
            rows.append(row)
    except Exception as err:
        printInfo('error', err)
    finally:
        conn.close()
    return (rows, header)
def _dealRequestTmThread(self):
    """Background loop: once per second, drain up to ~1000 queued timing
    records and bulk-insert them via ``_insertRT``.

    Fix: ``Queue.empty()`` is only advisory — ``get_nowait()`` can still
    raise ``Empty`` if another consumer drains the queue between the check
    and the get.  The original let that exception escape to the outer
    handler and silently discard the batch collected so far; the batch is
    now always flushed.
    """
    while (self._runflag):
        datas = []
        try:
            i = 0
            while (True):
                i += 1
                if self._requestTmq.empty():
                    break
                try:
                    data = self._requestTmq.get_nowait()
                except Exception:
                    # queue drained concurrently; flush what we have
                    break
                datas.append(data)
                # cap the batch so a busy queue cannot starve the insert
                if i > 1000:
                    break
            if datas:
                self._insertRT(datas)
        except Exception as err:
            printInfo('error', err)
        time.sleep(1)
def extract(self, url, html, ssp):
    """Resolve the model names attached to *url* (falling back to the
    spider's own ``models``) into loaded model objects and delegate the
    actual extraction to the spider."""
    modelNames = url.get("model") or ssp.models
    loaded = []
    if modelNames:
        for name in modelNames:
            model = ExtractModel.get(name)
            if model:
                loaded.append(model)
            else:
                printInfo('no model ' + name)
    return ssp.extract(Dict(url), html, loaded, modelNames)
def _iniModels(self, rootdir):
    """Load every file in *rootdir* into the ExtractModel registry.

    Each regular file is parsed as JSON and stored under its file name
    minus the last five characters (i.e. without a ``.json`` suffix).
    Missing directory or parse errors are printed, not raised.

    Fixes: no longer shadows the builtin ``list``; iterates directory
    entries directly instead of indexing by ``range``; opens files with a
    context manager so handles are closed even when JSON parsing fails.
    """
    try:
        if (not os.path.isdir(rootdir)):
            return
        for name in os.listdir(rootdir):
            path = os.path.join(rootdir, name)
            if not os.path.isfile(path):
                continue
            with open(path, 'r') as f:
                if sys.version_info.major == 2:
                    ExtractModel[name[:-5]] = json.loads(f.read().decode('utf-8'))
                else:
                    ExtractModel[name[:-5]] = json.loads(f.read())
    except Exception as err:
        printInfo(err)
def resetUrls(self, urls):
    """Queue every given url unconditionally (no seen-set check): append it
    to the in-memory list, record its digest, persist it via _writeFile,
    and flush once at the end if anything was written.

    Each entry may be a plain url string or a dict containing 'url'.
    """
    self._lock.acquire()
    try:
        wrote = False
        for entry in urls:
            item = entry if isinstance(entry, dict) else {'url': entry}
            digest = md5(item['url'])
            self._urls.append(item)
            self._dic.add(digest)
            self._writeFile(item, digest)
            wrote = True
        if wrote:
            self._flushFile()
    except Exception as err:
        printInfo(err)
    finally:
        self._lock.release()
def __init__(self):
    """Load persisted cookies from the cookie file into ``self._cookies``
    (mapping the text before the first comma to the text after it), then
    rewrite the file de-duplicated via ``_refreshCookieFile``.

    Fix: a line without a comma previously raised ValueError from
    ``str.index()``, which the outer except caught — aborting the whole
    load.  ``str.find()`` returns -1 instead, so malformed lines are now
    simply skipped and the remaining cookies still load.
    """
    try:
        if (not os.path.exists('db/')):
            os.mkdir('db/')
        # "a+" keeps existing content; seek(0) rewinds so it can be read back
        self._cookiefile = io.open(self._cookieFilename, "a+", encoding="utf-8")
        self._cookiefile.seek(0)
        line = 'start'
        while (line):
            line = self._cookiefile.readline()
            if (line):
                line = line[:-1]  # drop trailing newline
                start = line.find(',')  # key,value separator
                if (start > 0):
                    self._cookies[line[0:start]] = line[start + 1:]
        self._refreshCookieFile()
    except Exception as err:
        printInfo(err)
def saveUrl(self, urls, i=None):
    """Queue urls that have not been seen before.

    Each entry may be a plain url string or a dict containing 'url'.  New
    urls are appended to the in-memory queue, their digest recorded in the
    seen-set, and persisted via ``_writeFile``; the files are flushed once
    at the end if anything was written.  (*i* is accepted for interface
    compatibility and unused here, as in the original.)

    Bug fix: the original called ``self._dic.add(md5)`` and
    ``self._writeFile(url, md5)`` — adding/persisting the ``md5`` function
    object itself instead of the computed digest.  The seen-set therefore
    never contained real digests (so every url was re-queued on later
    calls) and the digest file was corrupted.  The digest is now computed
    once and used consistently, matching ``resetUrls``.
    """
    self._lock.acquire()
    try:
        flag = False
        for url in urls:
            if (not isinstance(url, dict)):
                url = {'url': url}
            digest = md5(url['url'])
            if (digest not in self._dic):
                self._urls.append(url)
                self._dic.add(digest)
                self._writeFile(url, digest)
                flag = True
        if (flag):
            self._flushFile()
    except Exception as err:
        printInfo(err)
    finally:
        self._lock.release()
def popUrl(self):
    """Probe a randomly-chosen shard (collections ``tbName``,
    ``tbName1`` .. ``tbName9``) for one unprocessed url (state 0).

    If a document is found its state is set to 1 (claimed) and the shard's
    remaining-count is decremented; if the shard is empty it is remembered
    in ``self._tbCache`` so it is skipped next time.  Returns the url
    document, or None when the probed shard was empty / all ten shards
    have been attempted this round.

    Bug fix: the exhaustion check compared the bound method ``lst.count``
    to 10 — always False, so a fully-cached shard set made the loop spin
    forever.  It now checks ``len(tried)``.
    """
    db = self._connect()
    tried = []
    while (True):
        if (len(tried) == 10):
            # every shard attempted this round
            return None
        tbName = self._tbName
        i = random.randint(0, 9)
        printInfo('popUrl', i)
        if (i in tried):
            continue
        tried.append(i)
        if (i):
            # shard 0 uses the bare table name; others get a numeric suffix
            tbName = tbName + str(i)
        if (self._tbCache.get(tbName)):
            # shard previously observed empty
            continue
        url = db[tbName].find_one({"state": 0})
        if (url):
            # NOTE(review): collection.update() was removed in pymongo 4;
            # update_one() is the modern equivalent — confirm the pymongo
            # version before changing this call.
            db[tbName].update({"_id": url["_id"]}, {"$set": {"state": 1}})
            if (i in self._totalCount):
                self._totalCount[i] -= 1
        else:
            self._tbCache[tbName] = True
        return url
def __init__(self, name):
    """Prepare on-disk storage for downloaded pages.

    Creates the ``db/`` and ``htmls/`` directories plus the per-name html
    directory, formats the database path, and creates the ``htmls`` table
    if missing.  Errors are printed, not raised.

    Fix: the sqlite connection is now closed in a ``finally`` block, so it
    no longer leaks when CREATE TABLE or commit raises.
    """
    try:
        self._htmlPath = self._htmlPath.format(name)
        if (not os.path.exists('db/')):
            os.mkdir('db/')
        if (not os.path.exists('htmls/')):
            os.mkdir('htmls/')
        if (not os.path.exists(self._htmlPath)):
            os.mkdir(self._htmlPath)
        self._dbPath = self._dbPath.format(name)
        conn = sqlite3.connect(self._dbPath)
        try:
            c = conn.cursor()
            c.execute('''CREATE TABLE IF NOT EXISTS htmls (id INTEGER PRIMARY KEY autoincrement, json TEXT NOT NULL, state INT NOT NULL DEFAULT 0, tm TEXT);''')
            conn.commit()
        finally:
            conn.close()
    except Exception as err:
        printInfo(err)
def __init__(self):
    # Populate the extract-model registry from the models/ directory;
    # any failure is printed rather than raised.
    try:
        self._iniModels('models/')
    except Exception as err:
        printInfo(err)
def convert2Dic(html):
    # Parse the FIRST tag found in html into a Dict of attribute -> value
    # pairs plus a 'tag' key; fall back to ElementTree on failure, and
    # return None if both strategies fail.
    try:
        start = html.find('<')
        end = html.find('>')
        # keep only the tag's interior, dropping a trailing '/'
        html = html[start + 1:end].strip('/').strip()
        # normalize whitespace/&nbsp;, single quotes, and '= "' spacing
        html = re.sub('(\\s| )+', ' ', html, 0)
        html = re.sub('(\')+', '"', html, 0)
        html = re.sub('(=\s*")+', '="', html, 0)
        lstC = []  #list(html)
        N = len(html)
        i = 0
        first = False
        flag = False
        # char-level pass: wrap unquoted attribute values in double quotes
        while i < N:
            if html[i] == '"':
                lstC.append(html[i])
                first = not first  # toggles "inside a quoted value"
            elif not first and html[i] == '=' and html[i + 1] != '"':
                # unquoted value begins: open a quote right after '='
                lstC.append(html[i])
                lstC.append('"')
                flag = True
            elif not first and flag and html[i] == ' ':
                # unquoted value ends at the next space: close the quote
                flag = False
                lstC.append('"')
                lstC.append(html[i])
            else:
                lstC.append(html[i])
            i += 1
        html = ''.join(lstC)
        # after quoting, '"' splits the string into alternating
        # name-chunks and value-chunks
        paras = html.split('"')
        dic = Dict()
        lastP = None
        first = True
        for para in paras:
            if (first):
                # the first chunk carries the tag name (and possibly the
                # first attribute name)
                first = False
                tmp = para.split()
                dic['tag'] = tmp[0]
                if (len(tmp) > 1):
                    lastP = tmp[1].strip().strip('=').strip()
                continue
            if (lastP):
                # value chunk for the pending attribute name; repeated
                # values for the same name are space-joined
                if (not dic[lastP]):
                    dic[lastP] = para
                else:
                    dic[lastP] += ' '
                    dic[lastP] += para
                lastP = None
            elif para:
                if (para.find('=') > 0):
                    lastP = para.strip().strip('=').strip()
                else:
                    # bare attribute with no value
                    dic[para] = ''
        return dic
    except Exception as err:
        printInfo(err)
    # fallback: synthesize a closing tag if none exists and let
    # ElementTree parse the fragment
    try:
        tag = ''
        if (html.find('</') < 0 and html.find('/>') < 0):
            start = html.find('<')
            end = html.find(' ', start + 1)
            tag = '</' + html[start + 1:end] + '>'
        tree = ET.XML(html + tag)
        return XmlDictConfig(tree)
    except Exception as err:
        printInfo(err)
    return None
def __del__(self):
    """Close the three backing files when the store is garbage-collected."""
    for backing in (self._urlfile, self._dicfile, self._indexfile):
        backing.close()
    printInfo('__del__')
def log(err, data):
    """Print *err* and *data*, then mirror both to the root logger at
    ERROR level."""
    printInfo(err, data)
    adapter = logging.LoggerAdapter(logging.getLogger(), None)
    adapter.log(logging.ERROR, err)
    adapter.log(logging.ERROR, data)
def downloadError(self, url, err=None):
    # Report a failed download and mark the url record as failed (state 2)
    # in the url store.
    printInfo('error url:', url, err)
    self.url_store.updateState(url, 2)
def renderUrl(self, url, callback):
    # Stub: subclasses that need JS-rendered pages must override this and
    # invoke callback with the rendered html.
    printInfo('Need to implement method "renderUrl"')
def customDown(self, url):
    # Stub: subclasses that need a custom download strategy must override
    # this method.
    printInfo('Need to implement method "customDown"')
def log(self, msg, level=logging.ERROR):
    """Print *msg* and forward it to the root logger at *level*
    (ERROR by default)."""
    printInfo(msg)
    logging.LoggerAdapter(logging.getLogger(), None).log(level, msg)
def downloadRender(self, url, ssp):
    # Delegate download of a render-required page to the spider's
    # renderUrl, which hands the result back through self.down_callback.
    printInfo(url['url'])
    self._downloadPageNum += 1
    ssp.renderUrl(url, self.down_callback)