def __init__(self, startUrl=None):
    self.startUrl = startUrl
    path, file = os.path.split(configs.proxy.srcname)
    if not os.path.exists(path):
        os.makedirs(path)
    if startUrl is None:
        self.startUrl = 'http://www.baidu.com'
    if configs.proxy.rescrab:
        Logger.info('rescrab proxy list...')
        self.getFreeProxy()
    if configs.proxy.retest:
        Logger.info('retest proxy list...')
        self.testProxy()
    if not os.path.exists(configs.proxy.srcname):
        self.loadDefaultProxy()
    else:
        self.proxyList = self.loadProxy()
    # keep only proxies marked available that responded within 2 seconds
    self.proxyList = list(filter(lambda x: abs(int(x['available'])) == 1, self.proxyList))
    self.proxyList = list(filter(lambda x: float(x['ping']) < 2, self.proxyList))
    if len(self.proxyList) == 0:
        Logger.critical('There is no available proxy! Espider is shutting down...')
        exit(1)
    # unreachable proxies (ping == -1) sort to the end of the list
    self.proxyList.sort(key=lambda x: 1000 if float(x['ping']) == -1 else float(x['ping']))
    self.proxyCount = 0
def startParseContent(self):
    self.fileList = getFileList(self.contentPath)
    if len(self.fileList) == 0:
        Logger.warning('There is no %s file in %s, please have a check' % (self.contentType, self.contentPath))
        return
    self.fileListFilter()
    Logger.info('starting to parse content...')
    if len(self.fileList) == 0:
        return
    dataDict = []
    for i in range(len(self.fileList)):
        try:
            try:
                with open(self.fileList[i], self.openMethod, encoding=self.openEncoding) as f:
                    data = f.read()
                dataDict = self.parseContent(data)
            except OSError:
                Logger.error('an error occurred when opening %s' % self.fileList[i])
                continue
        except Exception:
            Logger.error('an error occurred when parsing content. If this happens very often, please check your parseContent()...')
            continue
        if not isinstance(dataDict, list):
            Logger.error('please make sure parseContent() returns a list-like object')
            continue
        if len(dataDict) == 0:
            continue
        for item in dataDict:
            if not isinstance(item, dict):
                Logger.error('please make sure every element returned by parseContent() is a dict-like object. If this happens often, please terminate the process...')
                continue  # skip malformed items instead of passing them to the pipeline
            self.addDataItem(item, self.primaryKey)
    self.saveData()
    Logger.info('parsing content done')
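# A minimal sketch of a parseContent() override that satisfies the contract
# checked by startParseContent() above: it must return a list whose elements
# are dicts, each carrying the primary key. The field names ('title', 'price')
# and the regex are illustrative assumptions, not part of the framework; `re`
# is assumed to be imported at module level as it is elsewhere in this project.
def parseContent(self, data):
    items = []
    # suppose each record looks like <h2>...</h2> ... <span class="price">...</span>
    for title, price in re.findall(r'<h2>(.*?)</h2>\s*<span class="price">(.*?)</span>', data):
        items.append({'title': title, 'price': price})
    return items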
def contentHandler(self, url, count):
    url = urljoin(self.host, url)
    Logger.info('(%s%%)get content data from %s' % (round(100 * count / len(self.contentDictList), 2), url))
    data = None
    type = ''
    name = None
    for i in range(configs.spider.retry):
        response = self.httpHandler.getResponseByUrl(url)
        if response is None:
            Logger.warning('cannot get url %s. please check httphandler...' % url)
            return ('disabled', 'disabled')
        response = EsResponse(response)
        try:
            data, type = self.contentResponseHandle(response)
            if data is None:
                Logger.debug('data is None')
                raise Exception
            name = self.contentFileName(response)
        except Exception:
            Logger.error('an error occurred in contentResponseHandle() or contentFileName(). If this happens very often, please check your code')
            self.httpHandler.nextHandler()
            if i == configs.spider.retry - 1:
                self.uncatchableUrlList.append(url)
                self.saveUncatchableUrl()
            continue
        break
    if data is None:
        return ('disabled', 'disabled')
    if name is None:
        name = '%s.' % count + type
    if not os.path.exists(configs.spider.contentdatapath):
        os.makedirs(configs.spider.contentdatapath)
    if self.parser is None:
        MD5 = buildMD5String(data)
    else:
        try:
            parsedData = '%s' % self.parser.parseContent(data)
            MD5 = buildMD5String(parsedData)
        except Exception:
            Logger.error('An error occurred in parseContent()! Please check your code. Espider will use the whole file to build the update MD5')
            MD5 = buildMD5String(data)
    filepath = configs.spider.contentdatapath + name
    try:
        # text-like content is written as utf-8 text, media and unknown types as raw bytes
        if type in ('html', 'xml', 'json', 'js', 'css'):
            with open(filepath, 'w+', encoding='utf8') as f:
                f.write(data)
            return (MD5, filepath)
        if type in ('jpg', 'tif', 'ico', 'png', 'bmp', 'mp3', 'avi', 'mp4'):
            with open(filepath, 'wb+') as f:
                f.write(data)
            return (MD5, filepath)
        with open(filepath, 'wb+') as f:
            f.write(data)
    except OSError:
        Logger.error('an error occurred when opening %s' % filepath)
    return (MD5, filepath)
def catalogueUrlRecursion(self, url):
    if configs.spider.catalogueLimit != 'inf':
        if self.catalogueCount >= configs.spider.catalogueLimit:
            return
    url = urljoin(self.host, url)
    urllistContent = []
    urllistCatalogue = []
    for i in range(configs.spider.retry):
        response = self.httpHandler.getResponseByUrl(url)
        if response is None:
            Logger.warning('cannot get url %s. please check httphandler...' % url)
            return
        response = EsResponse(response)
        try:
            urllistCatalogue, urllistContent = self.getUrlList(response)
            break
        except ValueError:
            Logger.critical('please verify that your getUrlList() returns two lists. espider is shutting down...')
            exit(1)
        except Exception:
            Logger.error('an error occurred in getUrlList(). If this happens often, please check your code')
            self.httpHandler.nextHandler()
            if i == configs.spider.retry - 1:
                self.uncatchableUrlList.append(url)
                self.saveUncatchableUrl()
    if len(urllistContent) != 0:
        for item in urllistContent:
            self.contentCount = self.contentCount + 1
            if configs.spider.contentLimit != 'inf':
                if self.contentCount > configs.spider.contentLimit:
                    break
            if not keyValueInDictList('contentUrl', item, self.contentDictList):
                Logger.debug('discover content url %s' % item)
                dictTemp = {}
                dictTemp['contentUrl'] = item
                self.contentDictList.append(dictTemp)
    if len(urllistCatalogue) == 0:
        return
    for item in urllistCatalogue:
        if item not in self.catalogueUrl:
            if configs.spider.catalogueLimit != 'inf':
                if self.catalogueCount >= configs.spider.catalogueLimit:
                    return
            Logger.info('get catalogue url %s' % item)
            self.catalogueUrl.add(item)
            self.catalogueCount = self.catalogueCount + 1
            time.sleep(random.random() * configs.http.sleeptime)
            self.catalogueUrlRecursion(item)
    return
def __init__(self):
    Logger.info('Espider %s initiating...' % self.espiderName)
    if self.startUrl == '':
        Logger.critical('Your espider should have a startUrl! Espider is shutting down...')
        exit(1)
    # normalize the start url and make sure it carries a scheme
    self.startUrl = urlunparse(urlparse(self.startUrl, 'http'))
    if urlparse(self.startUrl).hostname is None:
        Logger.critical('Illegal url! Please make sure the url looks like "http://www.baidu.com". Espider is shutting down...')
        exit(1)
    self.host = urlparse(self.startUrl).scheme + '://' + urlparse(self.startUrl).hostname
    self.checkUrlQuery()
    self.httpHandler = HttpHandler(self.host)
def catalogueUrlRecursion(self, param, path, level):
    if not os.path.exists(path):
        os.makedirs(path)
    Logger.info('(level %s)start to scrape param:%s' % (level, param))
    if not isinstance(self.queryList[level - 1], list):
        self.queryList[level - 1] = [self.queryList[level - 1]]
    for query in self.queryList[level - 1]:
        url = self.buildUrl(query, param)
        url, headers = self.buildExtraHeaders(url)
        response = self.httpHandler.getResponseByUrl(url, headers=headers)
        data, type = self.contentResponseHandle(response)
        with open(path + 'data_query=' + query + '.' + type, 'w+', encoding='utf8') as f:
            f.write(data)
        if level == self.level:
            return
        try:
            nextParamList = self.contentHandler(data)
        except Exception:
            Logger.error('an error occurred in contentHandler(). If this happens often, please shut espider down...')
            nextParamList = None
        if nextParamList is None or nextParamList == []:
            return
        if not isinstance(nextParamList, list):
            Logger.critical('contentHandler() should return a list. Espider is shutting down...')
            exit(1)
        if not isinstance(nextParamList[0], dict):
            Logger.critical('contentHandler() should return a list whose elements are dicts. Espider is shutting down...')
            exit(1)
        writeLinesFile(path + 'param_query=' + query + '.txt', nextParamList)
        for nextParam in nextParamList:
            for k, v in nextParam.items():
                if k in self.parameterList[level]:
                    nextParamDict = dict(param)
                    nextParamDict[k] = v
                    nextPath = path + k + '=' + v + '/'
                    time.sleep(random.random() * configs.http.sleeptime)
                    self.catalogueUrlRecursion(nextParamDict, nextPath, level + 1)
def startEspider(self):
    Logger.info('starting espider...')
    paramList = readLinesFile(configs.spider.contentdatapath + 'param.txt')
    if paramList is None:
        Logger.critical('You should create starting parameters in %s' % (configs.spider.contentdatapath + 'param.txt'))
        exit(1)
    for i in range(len(paramList)):
        # each line of param.txt is a JSON object of starting parameters
        paramList[i] = json.loads(paramList[i])
        for k, v in paramList[i].items():
            if k in self.parameterList[0]:
                param = {}
                param[k] = v
                path = configs.spider.contentdatapath + k + '=' + v + '/'
                self.catalogueUrlRecursion(param, path, 1)
            else:
                Logger.error('param.txt gives an incorrect key compared to self.parameterList...')
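# Hedged example of the param.txt layout expected by startEspider() above:
# one JSON object per line, every key listed in self.parameterList[0], and
# string values (they are spliced into directory names). The key 'city' and
# its values are made-up placeholders.
#
#     {"city": "beijing"}
#     {"city": "shanghai"}
#
# Such a file could be produced with the writeLinesFile() helper already used
# in this project, e.g.:
#     writeLinesFile(configs.spider.contentdatapath + 'param.txt',
#                    ['{"city": "beijing"}', '{"city": "shanghai"}'])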
def __init__(self):
    Logger.info('espider %s initiating...' % self.espiderName)
    if self.startUrl == '' or self.espiderName == '':
        Logger.critical('Your espider should have an espiderName and a startUrl! Espider is shutting down...')
        exit(1)
    self.startUrl = urlunparse(urlparse(self.startUrl, 'http'))
    if urlparse(self.startUrl).hostname is None:
        Logger.critical('Illegal url! Please make sure the url looks like "http://www.baidu.com". Espider is shutting down...')
        exit(1)
    self.host = urlparse(self.startUrl).scheme + '://' + urlparse(self.startUrl).hostname
    self.httpHandler = HttpHandler(self.host)
    if not os.path.exists(configs.spider.pipelinepath):
        os.makedirs(configs.spider.pipelinepath)
    self.catalogueUrl = set()
    self.catalogueCount = 0
    self.contentCount = 0
    self.contentDictList = []
    self.uncatchableUrlList = []
def getFreeProxy(self):
    """
    Fetch free proxies from one of two public proxy sites, selectable via
    configs.proxy.proxysrc. You can also plug in your own way of fetching.
    """
    Logger.info('get free proxy from the Internet...')
    proxyList = []
    if configs.proxy.proxysrc == 1:
        for i in range(configs.proxy.srcpage):
            Logger.info('get page %s...' % (i + 1))
            req = urllib.request.Request('http://www.kuaidaili.com/free/inha/%s/' % (i + 1))
            req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0')
            data = urllib.request.urlopen(req).read().decode('utf-8')
            proxy = re.findall(r'<tr>\s*?<td data-title="IP">(.*?)</td>\s*?<td data-title="PORT">(.*?)</td>\s*?<td data-title="匿名度">.*?</td>\s*?<td data-title="类型">(.*?)</td>', data)
            for item in proxy:
                if [item[2].lower(), item[0], item[1]] not in proxyList:
                    proxyList.append([item[2].lower(), item[0], item[1]])
    if configs.proxy.proxysrc == 2:
        for i in range(configs.proxy.srcpage):
            Logger.info('get page %s...' % (i + 1))
            req = urllib.request.Request('http://www.xicidaili.com/nn/%s' % (i + 1))
            req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0')
            data = urllib.request.urlopen(req).read().decode('utf-8')
            proxy = re.findall(r'<td class="country"><img [\s\S]*?<td>(.*?)</td>\s*?<td>(.*?)</td>\s*?<td>[\s\S]*?</td>\s*?<td class="country">.*?</td>\s*?<td>(.*)</td>', data)
            for item in proxy:
                if [item[2].lower(), item[0], item[1]] not in proxyList:
                    proxyList.append([item[2].lower(), item[0], item[1]])
    dataset = []
    for item in proxyList:
        # columns: type, ip, port, available (-1 = untested), ping (-1 = untested)
        dataset.append('%s\t%s\t%s\t-1\t-1' % (item[0], item[1], item[2]))
    writeLinesFile(configs.proxy.srcname, dataset)
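# For reference, getFreeProxy() writes configs.proxy.srcname as tab-separated
# lines in the order type, ip, port, available, ping, where -1/-1 marks an
# untested entry; testProxy() later overwrites the last two columns with the
# measured values. The addresses below are made-up examples:
#
#     http    1.2.3.4    8080    -1    -1
#     https   5.6.7.8    3128    -1    -1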
def testProxy(self):
    """
    Test the proxy connection performance against self.startUrl.
    """
    req = urllib.request.Request(self.startUrl)
    for k, v in configs.urlrequest.items():
        if isinstance(v, list):
            v = v[randint(0, len(v) - 1)]
        req.add_header(k, v)
    Logger.info('test proxy list in %s' % configs.proxy.srcname)
    data = readLinesFile(configs.proxy.srcname)
    proxyList = []
    for i in range(len(data)):
        proxyList.append(dict(zip(('type', 'ip', 'port'), data[i].split('\t'))))
        opener = urllib.request.build_opener(
            urllib.request.ProxyHandler({proxyList[i]['type']: proxyList[i]['ip'] + ':' + proxyList[i]['port']}),
            urllib.request.ProxyBasicAuthHandler())
        try:
            # time.perf_counter() replaces the original time.clock(), which was removed in Python 3.8
            begin = time.perf_counter()
            opener.open(req, timeout=configs.proxy.timeout)
            ping = time.perf_counter() - begin
            available = 1
            Logger.info('proxy %s is good...' % proxyList[i]['ip'])
        except Exception:
            Logger.info('proxy %s is not available...' % proxyList[i]['ip'])
            ping = -1
            available = 0
        proxyList[i]['available'] = available
        proxyList[i]['ping'] = ping
    dataset = []
    for i in range(len(proxyList)):
        dataset.append('%s\t%s\t%s\t%s\t%s' % (proxyList[i]['type'], proxyList[i]['ip'], proxyList[i]['port'],
                                               proxyList[i]['available'], proxyList[i]['ping']))
    writeLinesFile(configs.proxy.srcname, dataset)
    return
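# Hedged sketch of the proxy-related settings read by getFreeProxy(),
# testProxy() and the constructor above. Only the key names come from the
# code; the values and the underlying config format are assumptions.
#
#     configs.proxy.rescrab  = True               # re-scrape the free-proxy sites on startup
#     configs.proxy.retest   = True               # re-run testProxy() against startUrl
#     configs.proxy.proxysrc = 1                  # 1: kuaidaili.com, 2: xicidaili.com
#     configs.proxy.srcpage  = 5                  # listing pages to scrape per site
#     configs.proxy.srcname  = 'proxy/proxy.txt'  # tab-separated proxy cache file
#     configs.proxy.timeout  = 3                  # seconds before a proxy counts as unavailable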
def startEspider(self):
    if configs.spider.mode != 'override' and configs.spider.mode != 'update':
        Logger.error('Please verify that spider.mode is "override" or "update" in configs. Spider will run in the default mode (override)')
    if configs.spider.mode == 'update' and self.parser is None:
        Logger.error('Spider cannot run in update mode without a correct setParser() defined.')
    Logger.info('Espider running in %s mode' % ('override' if self.parser is None else 'update'))
    if self.parser is not None:
        # update mode: back up the previous run so changed pages can be detected
        self.backupUpdate()
        self.contentDictList = self.loadContentDictList()
    Logger.info('start to get catalogue urls...')
    if configs.spider.loadurllistfromfile:
        self.loadCatalogueList()
        self.contentDictList = self.loadContentDictList()
    else:
        self.catalogueUrlRecursion(self.startUrl)
        writeLinesFile(configs.spider.cataloguefilename, self.catalogueUrl, method='w+')
    count = 0
    for item in self.contentDictList:
        count = count + 1
        MD5, filepath = self.contentHandler(item['contentUrl'], count)
        item['filepath'] = filepath
        if 'MD5' in item:
            if self.parser is None:
                item['update'] = 'disabled'
            elif item['MD5'] == MD5:
                item['update'] = 'false'
            else:
                item['update'] = 'true'
            item['MD5'] = MD5
        else:
            if self.parser is None:
                item['update'] = 'disabled'
            else:
                item['update'] = 'true'
            item['MD5'] = MD5
        self.saveContentUrlDictList()
        self.saveContentUrlUpdate()
    Logger.info('espider completed the task!')
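# Hedged usage sketch for the catalogue/content spider driven by startEspider()
# above. The base class name (Spider) is a hypothetical placeholder; only the
# overridden hooks (startUrl, getUrlList, contentResponseHandle,
# contentFileName) are taken from the code in this module.
#
#     class NewsSpider(Spider):                     # hypothetical base class name
#         espiderName = 'news'
#         startUrl = 'http://www.example.com/list/1.html'
#
#         def getUrlList(self, response):
#             # must return two lists: (catalogue urls, content urls)
#             catalogue = [...]                     # e.g. "next page" links parsed from response
#             content = [...]                       # e.g. article links parsed from response
#             return catalogue, content
#
#         def contentResponseHandle(self, response):
#             # must return (data, type); type selects text vs binary writing in contentHandler()
#             data = ...                            # extract the page text from the EsResponse (API not shown here)
#             return data, 'html'
#
#         def contentFileName(self, response):
#             return None                           # None falls back to '<count>.<type>'
#
#     NewsSpider().startEspider()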