def getDocument(self, url, file_path, file_name, deep, error_count):
    # Stop if the recursion depth limit or the per-document error limit is hit.
    if 0 <= recursion_deep < deep or error_count > document_error_max:
        return
    url = urlTools.dealUrl2Request(url, url)
    if file_path == '' and file_name == '':
        file_name = 'index.html'
    try:
        req = self.requestGet(url)
        charset = self.getHTMLCharset(req.content)
        req.encoding = charset
        d = pq(req.text)
        # print charset
        # Rewrite stylesheet, script and image references to local paths.
        linkList1 = d('link')
        self.dealSourceLink(linkList1, Url(req.url), 'href')
        linkList2 = d('script')
        self.dealSourceLink(linkList2, Url(req.url), 'src')
        linkList3 = d('img')
        self.dealSourceLink(linkList3, Url(req.url), 'src')
        # Rewrite anchors and queue the linked pages for crawling.
        linkList4 = d('a')
        self.dealALink(linkList4, Url(req.url), 'href', deep)
        self.source_task.start()
        self.saveFile(file_path, file_name,
                      bytearray(source=d.outer_html(), encoding='utf-8'))
    except requests.exceptions.ConnectionError, e:
        print 'ConnectionError:', e
        # Requeue the page with an increased error count.
        self.queue_document.push(
            [url, file_path, file_name, deep, error_count + 1])
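# Note: getHTMLCharset is referenced above but its implementation is not shown
# in this section. A minimal standalone sketch of what such a helper might do
# (an assumption, not the project's actual code): look for a charset
# declaration in the raw response body and fall back to utf-8.
import re


def guess_html_charset(content):
    # Matches both <meta charset="..."> and
    # <meta http-equiv="Content-Type" content="text/html; charset=...">.
    m = re.search(r'<meta[^>]+charset=["\']?([\w-]+)', content, re.I)
    if m:
        return m.group(1).lower()
    return 'utf-8'

# guess_html_charset('<meta charset="GBK"><title>t</title>')  -> 'gbk'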
def convLongPath(self, file_path, file_name):
    # Hash over-long names so they fit filesystem name-length limits.
    if len(file_name) > 128:
        file_name = Url(hashlib.sha1(file_name).hexdigest())
    if len(file_path) > 128:
        # Not bothering to check whether there is a leading '/' here.
        file_path = file_path[0] + Url(hashlib.sha1(file_path).hexdigest())
    # path_dirs = file_path.split('/')
    # for i, it in enumerate(path_dirs):
    #     if len(it) > 250:
    #         path_dirs[i] = str(hashlib.sha1(it).hexdigest())
    # file_path = '/'.join(path_dirs)
    return file_path, file_name
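# The same idea as convLongPath in isolation: a path component longer than the
# limit is replaced by its 40-character SHA-1 hex digest so the saved file name
# stays within filesystem limits. The helper name and the standalone form are
# illustrative only.
import hashlib


def shorten_component(name, limit=128):
    if len(name) > limit:
        return hashlib.sha1(name).hexdigest()  # always 40 characters
    return name

# len(shorten_component('a' * 300 + '.html'))  -> 40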
def dealUrl2File(self, url, origin, host=None, is_req_url=False):
    """
    :param url: the URL to process
           origin: the URL of the page where the request occurred
           host: for URLs under this host, resources are stored under the
                 output root rather than a per-domain folder. No main host
                 is set by default.
           is_req_url: whether the URL has already been through dealUrl2Request
    :return: (file_path, file_name, url)
    """
    if not is_req_url:
        url = self.dealUrl2Request(url, origin)
    # url = self.simplifyUrl(url)
    # Strip the trailing '/' from the URL.
    url = url.delUrlEnd()
    if host is not None:
        # If the URL is under this site's own host, there is no need to
        # create a separate domain folder for it.
        if url.cmpHost(host):
            # Strip the host part; this may escape the output root.
            url = url.delHttp()
            url = url.delUrlStart()
            url = url.replace(host.getHost()[1], '')
    # Strip the scheme so the URL becomes a file path.
    url = url.delHttp()
    url = url.delUrlStart()
    for k, v in self.replaceChr.iteritems():
        if k in url:
            url = url.replace(k, v)
    file_name = Url(os.path.basename(url))
    file_path = Url(os.path.dirname(url))
    # In case the file name or file path is too long.
    file_path, file_name = self.convLongPath(file_path, file_name)
    # if file_path.startswith('/') or file_path.startswith('.'):
    #     file_path = file_path[1:]
    # Prefix the file name so a file and a folder in the same directory
    # cannot end up with the same name.
    if file_name != '':
        file_name = 'f_' + file_name
    url = file_path.addUrlEnd() + file_name
    url = url.addUrlStart()
    # An empty file_path means the current directory.
    return file_path, file_name, url
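# A rough end-to-end illustration of the mapping dealUrl2File performs, with a
# made-up replaceChr table (the real table is defined elsewhere in the module):
# strip the scheme, make unsafe characters filesystem-friendly, split into a
# directory and a file name, and prefix the file name with 'f_' so files and
# folders cannot collide.
import os

REPLACE_CHR = {'?': '%3F', ':': '%3A', '*': '%2A'}  # assumed mapping


def url_to_file(url):
    path = url.split('://', 1)[-1].rstrip('/')
    for k, v in REPLACE_CHR.iteritems():
        path = path.replace(k, v)
    file_path, file_name = os.path.dirname(path), os.path.basename(path)
    if file_name != '':
        file_name = 'f_' + file_name
    return file_path, file_name

# url_to_file('http://example.com/blog/post.html?page=2')
# -> ('example.com/blog', 'f_post.html%3Fpage=2')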
def dealUrl2Request(self, url, origin):
    origin = Url(origin)
    url = Url(url)
    if not url.startswith('http://') and not url.startswith('https://'):
        if url.startswith('//'):
            # Protocol-relative URL.
            url = 'http:' + url
        elif url.startswith('/'):
            # Root-relative URL: prepend the origin's host.
            url = origin.getHost()[2] + url
        else:
            # Plain relative URL: resolve against the origin's directory.
            origin = origin.getUrlDir()
            url = origin + url
    url = url.simplifyUrl()
    return url
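# The three branches above (protocol-relative '//', root-relative '/', and
# plain relative paths) mirror what the standard library's urljoin does, which
# also collapses '..' segments, presumably much like simplifyUrl. A small
# comparison sketch, not part of the crawler itself.
from urlparse import urljoin  # Python 2 standard library


def resolve_like_deal_url2request(url, origin):
    return urljoin(origin, url)

# origin = 'http://example.com/blog/post.html'
# resolve_like_deal_url2request('//cdn.example.com/a.js', origin)
#   -> 'http://cdn.example.com/a.js'
# resolve_like_deal_url2request('/css/site.css', origin)
#   -> 'http://example.com/css/site.css'
# resolve_like_deal_url2request('img/logo.png', origin)
#   -> 'http://example.com/blog/img/logo.png'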
def dealSourceLink(self, linkList, origin_url, attr):
    for li in linkList:
        url = pq(li).attr(attr)
        if url is not None:
            url = Url(url)
            request_url = urlTools.dealUrl2Request(url, origin_url)
            if request_url in self.set:
                file_path, file_name, html_url = self.set[request_url]
            else:
                file_path, file_name, html_url = urlTools.dealUrl2File(
                    request_url, origin_url, self.host, True)
                error_count = 0
                self.queue_resource.push(
                    [request_url, file_path, file_name, error_count])
                self.set[request_url] = [file_path, file_name, html_url]
            pq(li).attr(attr, html_url)
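# A minimal demonstration of the attribute rewriting used in dealSourceLink
# and dealALink below: pq(element).attr(name, value) updates the element in
# place, so the document later serialized with outer_html() points at the
# local copy. The URLs here are made up for illustration.
from pyquery import PyQuery as pq


def rewrite_img_src_demo():
    d = pq('<html><body><img src="http://example.com/a.png"></body></html>')
    for li in d('img'):
        pq(li).attr('src', 'example.com/f_a.png')  # rewritten local path
    return d.outer_html()

# rewrite_img_src_demo()
# -> serialized HTML whose <img> now points at example.com/f_a.png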
def dealALink(self, linkList, origin_url, attr, deep):
    for li in linkList:
        url = pq(li).attr(attr)
        if url is not None:
            url = Url(url)
            request_url = urlTools.dealUrl2Request(url, origin_url)
            # print 'A:', request_url
            if outsite_page or request_url.getHost()[1] == self.host:
                if request_url in self.set:
                    file_path, file_name, html_url = self.set[request_url]
                else:
                    file_path, file_name, html_url = urlTools.dealUrl2File(
                        request_url, origin_url, self.host, True)
                    self.queue_document.push(
                        [request_url, file_path, file_name, deep + 1, 0])
                    self.set[request_url] = [file_path, file_name, html_url]
                pq(li).attr(attr, html_url)
class Crawler:
    def __init__(self, url):
        self.main_url = Url(self.requestGet(url).url)
        self.host_option, self.host, self.host_url = self.main_url.getHost()
        self.queue_resource = None
        self.queue_document = None
        self.set = None
        self.document_task = None
        self.source_task = None

    def __del__(self):
        # del self.document_task
        # del self.queue_document
        # del self.source_task
        # del self.queue_resource
        print 'del c'

    def start(self, is_continue=False):
        if is_continue:
            self.queue_resource = Queue.load(logs_path + 'queue_resource.json')
            self.queue_document = Queue.load(logs_path + 'queue_document.json')
            self.set = UrlSet.load(logs_path + 'url_set.json')
        else:
            self.queue_resource = Queue(logs_path + 'queue_resource.json')
            self.queue_document = Queue(logs_path + 'queue_document.json')
            self.set = UrlSet(logs_path + 'url_set.json')
        self.document_task = Task(self.queue_document, doc_pool_max)
        self.document_task.initTaskWork(self.getDocument)
        self.source_task = Task(self.queue_resource, res_pool_max)
        self.source_task.initTaskWork(self.requestSource)
        self.main_url = urlTools.dealUrl2Request(self.main_url, self.main_url)
        print self.main_url, self.host
        file_path, file_name, html_url = urlTools.dealUrl2File(
            self.main_url, self.main_url, self.host, True)
        self.queue_document.push([self.main_url, file_path, file_name, 0, 0])
        print "file_path:" + file_path, "file_name:" + file_name, "html_url:" + html_url
        self.document_task.start()

    def requestGet(self, url):
        wait = random.random() * (wait_time[1] - wait_time[0])
        sleep(wait)
        timeout = Timeout(request_timeout)
        timeout.start()
        try:
            req = requests.get(url=url, verify=True, headers=headers,
                               proxies=proxies)
        except IncompleteRead:
            pass  # todo: unknown error, not yet investigated
        timeout.cancel()
        return req

    def saveFile(self, file_path, file_name, bytes):
        path = Url(output + file_path)
        path = path.addUrlEnd()
        if not os.path.exists(path):
            os.makedirs(path)
        try:
            f = open(path + file_name, "wb")
            f.write(bytes)
            f.close()
        except IOError, e:
            print 'save Error: ', e, 'path: ', path, 'name: ', file_name
    def stop(self):
        self.set.save()
        self.document_task.stop()
        self.source_task.stop()


def work():
    print 123
    c.start()
    time.sleep(10)
    print 321


if __name__ == '__main__':
    url0 = raw_input("input the Url:")
    c = Crawler(Url(url0))

    def work(is_continue):
        c.start(is_continue)
        print 'work stopped'

    print "Enter 's' to start, 'c' to continue, 'e' to stop."
    while True:
        char = raw_input()
        if char == 's':
            p = threading.Thread(target=work, args=(False, ))
            print 'Process will start.'
            p.start()
        elif char == 'c':
            p = threading.Thread(target=work, args=(True, ))
            print 'Process will start.'