def dealSourceLink(self, linkList, origin_url, attr):
    """Queue the resources referenced by ``linkList`` and relink each node.

    For every node whose ``attr`` attribute is present, the value is wrapped
    in ``Url`` and handed to ``urlTools.dealUrl2Request`` together with
    ``origin_url``.  A request URL not yet contained in ``self.set`` is mapped
    by ``urlTools.dealUrl2File``, pushed onto ``self.queue_resource`` with an
    error count of 0 and recorded in ``self.set``.  In both cases the node's
    attribute is overwritten with the ``html_url`` stored for that request
    URL.

    Args:
        linkList: Iterable of nodes accepted by ``pq``.
        origin_url: URL of the document the nodes were taken from.
        attr: Name of the attribute holding the link (presumably ``src`` or
            ``href`` -- confirm against the callers).
    """
    for node in linkList:
        element = pq(node)
        raw_url = element.attr(attr)
        if raw_url is None:
            # Node does not carry the attribute; nothing to rewrite.
            continue

        request_url = urlTools.dealUrl2Request(Url(raw_url), origin_url)
        if request_url not in self.set:
            # First sighting: compute its target, schedule the download
            # and remember the mapping so later references reuse it.
            file_path, file_name, html_url = urlTools.dealUrl2File(
                request_url, origin_url, self.host, True)
            self.queue_resource.push([request_url, file_path, file_name, 0])
            self.set[request_url] = [file_path, file_name, html_url]
        else:
            _, _, html_url = self.set[request_url]

        element.attr(attr, html_url)
def dealALink(self, linkList, origin_url, attr, deep):
    """Queue the documents referenced by ``linkList`` and relink each node.

    For every node whose ``attr`` attribute is present, the value is wrapped
    in ``Url`` and handed to ``urlTools.dealUrl2Request`` together with
    ``origin_url``.  The link is processed only when the module-level
    ``outsite_page`` flag is truthy or ``request_url.getHost()[1]`` equals
    ``self.host``.  A request URL not yet contained in ``self.set`` is mapped
    by ``urlTools.dealUrl2File``, pushed onto ``self.queue_document`` at depth
    ``deep + 1`` and recorded in ``self.set``.  The node's attribute is then
    overwritten with the ``html_url`` stored for that request URL.

    Args:
        linkList: Iterable of nodes accepted by ``pq``.
        origin_url: URL of the document the nodes were taken from.
        attr: Name of the attribute holding the link.
        deep: Depth of the document that contains the links.
    """
    for node in linkList:
        element = pq(node)
        raw_url = element.attr(attr)
        if raw_url is None:
            continue

        request_url = urlTools.dealUrl2Request(Url(raw_url), origin_url)
        if not (outsite_page or request_url.getHost()[1] == self.host):
            # Link is filtered out: leave the node untouched.
            continue

        if request_url not in self.set:
            file_path, file_name, html_url = urlTools.dealUrl2File(
                request_url, origin_url, self.host, True)
            # Trailing 0 is presumably the initial error count, mirroring
            # the resource queue entries -- confirm against the consumer.
            self.queue_document.push(
                [request_url, file_path, file_name, deep + 1, 0])
            self.set[request_url] = [file_path, file_name, html_url]
        else:
            _, _, html_url = self.set[request_url]

        element.attr(attr, html_url)
def dealSourceLink(self, linkList, origin_url, attr):
    """Queue the resources referenced by ``linkList`` and relink each node.

    For every node whose ``attr`` attribute is present, the value is wrapped
    in ``Url`` and handed to ``urlTools.dealUrl2Request`` together with
    ``origin_url``.  A request URL not yet contained in ``self.set`` is mapped
    by ``urlTools.dealUrl2File``, pushed onto ``self.queue_resource`` with an
    error count of 0 and recorded in ``self.set``.  In both cases the node's
    attribute is overwritten with the ``html_url`` stored for that request
    URL.

    Args:
        linkList: Iterable of nodes accepted by ``pq``.
        origin_url: URL of the document the nodes were taken from.
        attr: Name of the attribute holding the link (presumably ``src`` or
            ``href`` -- confirm against the callers).
    """
    for node in linkList:
        element = pq(node)
        raw_url = element.attr(attr)
        if raw_url is None:
            # Node does not carry the attribute; nothing to rewrite.
            continue

        request_url = urlTools.dealUrl2Request(Url(raw_url), origin_url)
        if request_url not in self.set:
            # First sighting: compute its target, schedule the download
            # and remember the mapping so later references reuse it.
            file_path, file_name, html_url = urlTools.dealUrl2File(
                request_url, origin_url, self.host, True)
            self.queue_resource.push([request_url, file_path, file_name, 0])
            self.set[request_url] = [file_path, file_name, html_url]
        else:
            _, _, html_url = self.set[request_url]

        element.attr(attr, html_url)
def dealCss(self, text, origin_url):
    """Rewrite the ``url(...)`` references of a stylesheet and queue them.

    Every ``url('...')``, ``url("...")`` or ``url(...)`` occurrence in
    ``text`` is visited once, left to right.  Empty references and ``data:``
    URIs are left untouched.  Any other reference is handed to
    ``urlTools.dealUrl2Request`` together with ``origin_url``; a request URL
    not yet contained in ``self.set`` is mapped by ``urlTools.dealUrl2File``,
    pushed onto ``self.queue_resource`` with an error count of 0 and recorded
    in ``self.set``.  The reference inside the ``url(...)`` wrapper is then
    replaced by the ``html_url`` stored for that request URL.

    Args:
        text: Stylesheet source to rewrite.
        origin_url: URL the stylesheet was loaded from.

    Returns:
        The stylesheet text with its ``url(...)`` references rewritten.
    """

    def relink(match):
        # Only one alternative of the pattern can match, so at most one of
        # the three capture groups holds a non-empty reference.
        for group_index, css_url in enumerate(match.groups(), 1):
            if css_url:
                break
        else:
            return match.group(0)

        # Inline ``data:`` URIs carry their own payload.  Testing for the
        # scheme (with the colon) keeps relative paths such as
        # ``data/img.png`` eligible for download.
        if css_url.startswith('data:'):
            return match.group(0)

        request_url = urlTools.dealUrl2Request(css_url, origin_url)
        if request_url in self.set:
            _, _, html_url = self.set[request_url]
        else:
            file_path, file_name, html_url = urlTools.dealUrl2File(
                request_url, origin_url, self.host, True)
            self.queue_resource.push([request_url, file_path, file_name, 0])
            self.set[request_url] = [file_path, file_name, html_url]

        # Splice the new reference into this match only.  A global
        # ``text.replace`` would also hit other occurrences of the same
        # substring and could rewrite an already rewritten reference.
        whole = match.group(0)
        offset = match.start()
        head = whole[:match.start(group_index) - offset]
        tail = whole[match.end(group_index) - offset:]
        # NOTE(review): ``encode()`` is kept from the original code, which
        # assumes ``text`` is a Python 2 byte string -- confirm.
        return head + html_url.encode() + tail

    return re.sub(
        r'url\(\'(.*?)\'\)|url\(\"(.*?)\"\)|url\((.*?)\)', relink, text)
def dealALink(self, linkList, origin_url, attr, deep):
    """Queue the documents referenced by ``linkList`` and relink each node.

    For every node whose ``attr`` attribute is present, the value is wrapped
    in ``Url`` and handed to ``urlTools.dealUrl2Request`` together with
    ``origin_url``.  The link is processed only when the module-level
    ``outsite_page`` flag is truthy or ``request_url.getHost()[1]`` equals
    ``self.host``.  A request URL not yet contained in ``self.set`` is mapped
    by ``urlTools.dealUrl2File``, pushed onto ``self.queue_document`` at depth
    ``deep + 1`` and recorded in ``self.set``.  The node's attribute is then
    overwritten with the ``html_url`` stored for that request URL.

    Args:
        linkList: Iterable of nodes accepted by ``pq``.
        origin_url: URL of the document the nodes were taken from.
        attr: Name of the attribute holding the link.
        deep: Depth of the document that contains the links.
    """
    for node in linkList:
        element = pq(node)
        raw_url = element.attr(attr)
        if raw_url is None:
            continue

        request_url = urlTools.dealUrl2Request(Url(raw_url), origin_url)
        if not (outsite_page or request_url.getHost()[1] == self.host):
            # Link is filtered out: leave the node untouched.
            continue

        if request_url not in self.set:
            file_path, file_name, html_url = urlTools.dealUrl2File(
                request_url, origin_url, self.host, True)
            # Trailing 0 is presumably the initial error count, mirroring
            # the resource queue entries -- confirm against the consumer.
            self.queue_document.push(
                [request_url, file_path, file_name, deep + 1, 0])
            self.set[request_url] = [file_path, file_name, html_url]
        else:
            _, _, html_url = self.set[request_url]

        element.attr(attr, html_url)
def dealCss(self, text, origin_url):
    """Rewrite the ``url(...)`` references of a stylesheet and queue them.

    Every ``url('...')``, ``url("...")`` or ``url(...)`` occurrence in
    ``text`` is visited once, left to right.  Empty references and ``data:``
    URIs are left untouched.  Any other reference is handed to
    ``urlTools.dealUrl2Request`` together with ``origin_url``; a request URL
    not yet contained in ``self.set`` is mapped by ``urlTools.dealUrl2File``,
    pushed onto ``self.queue_resource`` with an error count of 0 and recorded
    in ``self.set``.  The reference inside the ``url(...)`` wrapper is then
    replaced by the ``html_url`` stored for that request URL.

    Args:
        text: Stylesheet source to rewrite.
        origin_url: URL the stylesheet was loaded from.

    Returns:
        The stylesheet text with its ``url(...)`` references rewritten.
    """

    def relink(match):
        # Only one alternative of the pattern can match, so at most one of
        # the three capture groups holds a non-empty reference.
        for group_index, css_url in enumerate(match.groups(), 1):
            if css_url:
                break
        else:
            return match.group(0)

        # Inline ``data:`` URIs carry their own payload.  Testing for the
        # scheme (with the colon) keeps relative paths such as
        # ``data/img.png`` eligible for download.
        if css_url.startswith('data:'):
            return match.group(0)

        request_url = urlTools.dealUrl2Request(css_url, origin_url)
        if request_url in self.set:
            _, _, html_url = self.set[request_url]
        else:
            file_path, file_name, html_url = urlTools.dealUrl2File(
                request_url, origin_url, self.host, True)
            self.queue_resource.push([request_url, file_path, file_name, 0])
            self.set[request_url] = [file_path, file_name, html_url]

        # Splice the new reference into this match only.  A global
        # ``text.replace`` would also hit other occurrences of the same
        # substring and could rewrite an already rewritten reference.
        whole = match.group(0)
        offset = match.start()
        head = whole[:match.start(group_index) - offset]
        tail = whole[match.end(group_index) - offset:]
        # NOTE(review): ``encode()`` is kept from the original code, which
        # assumes ``text`` is a Python 2 byte string -- confirm.
        return head + html_url.encode() + tail

    return re.sub(
        r'url\(\'(.*?)\'\)|url\(\"(.*?)\"\)|url\((.*?)\)', relink, text)
def start(self, is_continue=False): if is_continue: self.queue_resource = Queue.load(logs_path + 'queue_resource.json') self.queue_document = Queue.load(logs_path + 'queue_document.json') self.set = UrlSet.load(logs_path + 'url_set.json') else: self.queue_resource = Queue(logs_path + 'queue_resource.json') self.queue_document = Queue(logs_path + 'queue_document.json') self.set = UrlSet(logs_path + 'url_set.json') self.document_task = Task(self.queue_document, doc_pool_max) self.document_task.initTaskWork(self.getDocument) self.source_task = Task(self.queue_resource, res_pool_max) self.source_task.initTaskWork(self.requestSource) self.main_url = urlTools.dealUrl2Request(self.main_url, self.main_url) print self.main_url, self.host file_path, file_name, html_url = urlTools.dealUrl2File(self.main_url, self.main_url, self.host, True) self.queue_document.push([self.main_url, file_path, file_name, 0, 0]) print "file_path:" + file_path, "file_name:" + file_name, "html_url:" + html_url self.document_task.start()
def start(self, is_continue=False): if is_continue: self.queue_resource = Queue.load(logs_path + 'queue_resource.json') self.queue_document = Queue.load(logs_path + 'queue_document.json') self.set = UrlSet.load(logs_path + 'url_set.json') else: self.queue_resource = Queue(logs_path + 'queue_resource.json') self.queue_document = Queue(logs_path + 'queue_document.json') self.set = UrlSet(logs_path + 'url_set.json') self.document_task = Task(self.queue_document, doc_pool_max) self.document_task.initTaskWork(self.getDocument) self.source_task = Task(self.queue_resource, res_pool_max) self.source_task.initTaskWork(self.requestSource) self.main_url = urlTools.dealUrl2Request(self.main_url, self.main_url) print self.main_url, self.host file_path, file_name, html_url = urlTools.dealUrl2File( self.main_url, self.main_url, self.host, True) self.queue_document.push([self.main_url, file_path, file_name, 0, 0]) print "file_path:" + file_path, "file_name:" + file_name, "html_url:" + html_url self.document_task.start()