Exemplo n.º 1
0
    def getDocument(self, url, file_path, file_name, deep, error_count):
        if 0 <= recursion_deep < deep or error_count > document_error_max:
            return
        url = urlTools.dealUrl2Request(url, url)

        if file_path == '' and file_name == '':
            file_name = 'index.html'
        try:
            req = self.requestGet(url)
            charset = self.getHTMLCharset(req.content)
            req.encoding = charset
            d = pq(req.text)
            # print charset

            linkList1 = d('link')
            self.dealSourceLink(linkList1, Url(req.url), 'href')

            linkList2 = d('script')
            self.dealSourceLink(linkList2, Url(req.url), 'src')

            linkList3 = d('img')
            self.dealSourceLink(linkList3, Url(req.url), 'src')

            linkList4 = d('a')
            self.dealALink(linkList4, Url(req.url), 'href', deep)

            self.source_task.start()

            self.saveFile(file_path, file_name, bytearray(source=d.outer_html(), encoding='utf-8'))
        except requests.exceptions.ConnectionError, e:
            print 'ConnectionError:', e
            self.queue_document.push([url, file_path, file_name, deep, error_count + 1])
Exemplo n.º 2
0
    def getDocument(self, url, file_path, file_name, deep, error_count):
        if 0 <= recursion_deep < deep or error_count > document_error_max:
            return
        url = urlTools.dealUrl2Request(url, url)

        if file_path == '' and file_name == '':
            file_name = 'index.html'
        try:
            req = self.requestGet(url)
            charset = self.getHTMLCharset(req.content)
            req.encoding = charset
            d = pq(req.text)
            # print charset

            linkList1 = d('link')
            self.dealSourceLink(linkList1, Url(req.url), 'href')

            linkList2 = d('script')
            self.dealSourceLink(linkList2, Url(req.url), 'src')

            linkList3 = d('img')
            self.dealSourceLink(linkList3, Url(req.url), 'src')

            linkList4 = d('a')
            self.dealALink(linkList4, Url(req.url), 'href', deep)

            self.source_task.start()

            self.saveFile(file_path, file_name,
                          bytearray(source=d.outer_html(), encoding='utf-8'))
        except requests.exceptions.ConnectionError, e:
            print 'ConnectionError:', e
            self.queue_document.push(
                [url, file_path, file_name, deep, error_count + 1])
Exemplo n.º 3
0
 def dealSourceLink(self, linkList, origin_url, attr):
     """Point each element's ``attr`` at the ``html_url`` of its resource.

     For every element in ``linkList`` that has ``attr`` set, the value is
     wrapped in ``Url`` and resolved with ``urlTools.dealUrl2Request``
     against ``origin_url``. A URL already present in ``self.set`` reuses the
     stored ``html_url``. Otherwise ``urlTools.dealUrl2File`` computes
     ``(file_path, file_name, html_url)``, a job with an error count of 0 is
     pushed onto ``self.queue_resource``, and the triple is recorded in
     ``self.set``. Elements without ``attr`` are left untouched.
     """
     for node in linkList:
         raw_value = pq(node).attr(attr)
         if raw_value is None:
             continue

         request_url = urlTools.dealUrl2Request(Url(raw_value), origin_url)
         if request_url not in self.set:
             # First sighting: compute where it will live and queue the download.
             file_path, file_name, html_url = urlTools.dealUrl2File(request_url, origin_url, self.host, True)
             self.queue_resource.push([request_url, file_path, file_name, 0])
             self.set[request_url] = [file_path, file_name, html_url]
         else:
             _, _, html_url = self.set[request_url]
         pq(node).attr(attr, html_url)
Exemplo n.º 4
0
 def dealALink(self, linkList, origin_url, attr, deep):
     """Rewrite anchor targets and queue the linked documents.

     For every element in ``linkList`` that has ``attr`` set, the value is
     wrapped in ``Url`` and resolved with ``urlTools.dealUrl2Request``
     against ``origin_url``. The link is processed only when the
     ``outsite_page`` flag is truthy or ``request_url.getHost()[1]`` equals
     ``self.host``; other links are left unchanged.

     A URL already in ``self.set`` reuses its stored ``html_url``. Otherwise
     ``urlTools.dealUrl2File`` computes ``(file_path, file_name, html_url)``,
     a job ``[request_url, file_path, file_name, deep + 1, 0]`` is pushed
     onto ``self.queue_document``, and the triple is recorded in
     ``self.set``. The element's ``attr`` is then set to ``html_url``.
     """
     for anchor in linkList:
         href = pq(anchor).attr(attr)
         if href is None:
             continue

         request_url = urlTools.dealUrl2Request(Url(href), origin_url)
         if not (outsite_page or request_url.getHost()[1] == self.host):
             continue

         if request_url not in self.set:
             file_path, file_name, html_url = urlTools.dealUrl2File(request_url, origin_url, self.host, True)
             # Linked pages sit one level deeper and start with zero errors.
             self.queue_document.push([request_url, file_path, file_name, deep + 1, 0])
             self.set[request_url] = [file_path, file_name, html_url]
         else:
             _, _, html_url = self.set[request_url]
         pq(anchor).attr(attr, html_url)
Exemplo n.º 5
0
 def dealSourceLink(self, linkList, origin_url, attr):
     """Replace resource URLs in ``linkList`` and queue unseen resources.

     Each element whose ``attr`` is not ``None`` has that value wrapped in
     ``Url`` and resolved via ``urlTools.dealUrl2Request`` relative to
     ``origin_url``. If the resolved URL is a key of ``self.set`` the stored
     ``html_url`` is reused. If not, ``urlTools.dealUrl2File`` supplies
     ``(file_path, file_name, html_url)``, the resource is pushed onto
     ``self.queue_resource`` with an initial error count of 0, and the
     triple is stored in ``self.set``. Finally ``attr`` is overwritten with
     ``html_url``.
     """
     for element in linkList:
         attr_value = pq(element).attr(attr)
         if attr_value is None:
             continue

         request_url = urlTools.dealUrl2Request(Url(attr_value), origin_url)
         if request_url not in self.set:
             file_path, file_name, html_url = urlTools.dealUrl2File(
                 request_url, origin_url, self.host, True)
             initial_errors = 0
             self.queue_resource.push(
                 [request_url, file_path, file_name, initial_errors])
             self.set[request_url] = [file_path, file_name, html_url]
         else:
             _, _, html_url = self.set[request_url]
         pq(element).attr(attr, html_url)
Exemplo n.º 6
0
    def dealCss(self, text, origin_url):
        """Rewrite ``url(...)`` references in CSS text and queue new resources.

        Every ``url('...')``, ``url("...")`` or ``url(...)`` token in ``text``
        whose target is non-empty and is not a ``data:`` URI is resolved with
        ``urlTools.dealUrl2Request`` against ``origin_url``. A URL already in
        ``self.set`` reuses the stored ``html_url``. Otherwise
        ``urlTools.dealUrl2File`` computes ``(file_path, file_name,
        html_url)``, a job with an error count of 0 is pushed onto
        ``self.queue_resource``, and the triple is recorded in ``self.set``.

        Only the matched target inside each ``url(...)`` token is replaced.
        The previous implementation used ``text.replace`` on the whole
        stylesheet, which also rewrote unrelated occurrences of the same
        substring and broke URLs that share a prefix (for example
        ``font.eot`` and ``font.eot?#iefix``).

        :param text: CSS source.
        :param origin_url: URL the stylesheet was loaded from.
        :return: ``text`` with the processed ``url(...)`` targets replaced by
            ``html_url.encode()``.
        """
        url_pattern = r'url\(\'(.*?)\'\)|url\(\"(.*?)\"\)|url\((.*?)\)'

        def rewrite(match):
            token = match.group(0)
            token_start = match.start(0)
            # The alternatives are mutually exclusive, so at most one group
            # holds a non-empty target.
            for index in (1, 2, 3):
                li = match.group(index)
                # 'data:' (with the colon) so relative paths such as
                # 'data/img.png' are still processed.
                if not li or li.startswith('data:'):
                    continue
                request_url = urlTools.dealUrl2Request(li, origin_url)
                if request_url in self.set:
                    file_path, file_name, html_url = self.set[request_url]
                else:
                    file_path, file_name, html_url = urlTools.dealUrl2File(request_url, origin_url, self.host, True)
                    error_count = 0
                    self.queue_resource.push([request_url, file_path, file_name, error_count])
                    self.set[request_url] = [file_path, file_name, html_url]
                # Splice the new target into this token only.
                head = token[:match.start(index) - token_start]
                tail = token[match.end(index) - token_start:]
                return head + html_url.encode() + tail
            return token

        return re.sub(url_pattern, rewrite, text)
Exemplo n.º 7
0
 def dealALink(self, linkList, origin_url, attr, deep):
     """Localise ``attr`` on each anchor and enqueue the target documents.

     Elements whose ``attr`` is ``None`` are skipped. The remaining values
     are wrapped in ``Url`` and resolved through
     ``urlTools.dealUrl2Request`` relative to ``origin_url``. A link is
     handled only if ``outsite_page`` is truthy or
     ``request_url.getHost()[1] == self.host``.

     Known URLs (keys of ``self.set``) reuse their stored ``html_url``. New
     ones get ``(file_path, file_name, html_url)`` from
     ``urlTools.dealUrl2File``, are pushed onto ``self.queue_document`` with
     depth ``deep + 1`` and error count 0, and are stored in ``self.set``.
     ``attr`` is then replaced with ``html_url``.
     """
     for link in linkList:
         target = pq(link).attr(attr)
         if target is None:
             continue

         request_url = urlTools.dealUrl2Request(Url(target), origin_url)
         allowed = outsite_page or request_url.getHost()[1] == self.host
         if not allowed:
             continue

         if request_url not in self.set:
             file_path, file_name, html_url = urlTools.dealUrl2File(
                 request_url, origin_url, self.host, True)
             next_job = [request_url, file_path, file_name, deep + 1, 0]
             self.queue_document.push(next_job)
             self.set[request_url] = [file_path, file_name, html_url]
         else:
             _, _, html_url = self.set[request_url]
         pq(link).attr(attr, html_url)
Exemplo n.º 8
0
    def dealCss(self, text, origin_url):
        """Replace ``url(...)`` targets in CSS and queue unseen resources.

        Scans ``text`` for ``url('...')``, ``url("...")`` and ``url(...)``
        tokens. Empty targets and ``data:`` URIs are left alone. Every other
        target is resolved with ``urlTools.dealUrl2Request`` against
        ``origin_url``. If the resolved URL is a key of ``self.set`` the
        stored ``html_url`` is used. Otherwise ``urlTools.dealUrl2File``
        supplies ``(file_path, file_name, html_url)``, the resource is pushed
        onto ``self.queue_resource`` with an error count of 0, and the triple
        is stored in ``self.set``.

        The substitution is done per token with ``re.sub``. The earlier
        ``text.replace(li, ...)`` approach replaced every occurrence of the
        target string in the stylesheet, so a target that was a substring of
        another (``a.png`` inside ``data/a.png``, ``font.eot`` inside
        ``font.eot?#iefix``) corrupted the longer one.

        :param text: CSS source.
        :param origin_url: URL the stylesheet was loaded from.
        :return: ``text`` with the processed targets replaced by
            ``html_url.encode()``.
        """
        url_pattern = r'url\(\'(.*?)\'\)|url\(\"(.*?)\"\)|url\((.*?)\)'

        def replace_target(match):
            whole = match.group(0)
            base = match.start(0)
            # Only one alternative of the pattern can match at a time, so at
            # most one capture group is non-empty.
            for group_no in (1, 2, 3):
                li = match.group(group_no)
                # Test for the 'data:' scheme, not just the 'data' prefix,
                # so paths like 'data/img.png' are not skipped by mistake.
                if not li or li.startswith('data:'):
                    continue
                request_url = urlTools.dealUrl2Request(li, origin_url)
                if request_url in self.set:
                    file_path, file_name, html_url = self.set[request_url]
                else:
                    file_path, file_name, html_url = urlTools.dealUrl2File(
                        request_url, origin_url, self.host, True)
                    error_count = 0
                    self.queue_resource.push(
                        [request_url, file_path, file_name, error_count])
                    self.set[request_url] = [
                        file_path, file_name, html_url
                    ]
                # Swap the captured span only, keeping 'url(' and quotes.
                before = whole[:match.start(group_no) - base]
                after = whole[match.end(group_no) - base:]
                return before + html_url.encode() + after
            return whole

        return re.sub(url_pattern, replace_target, text)
Exemplo n.º 9
0
    def start(self, is_continue=False):
        if is_continue:
            self.queue_resource = Queue.load(logs_path + 'queue_resource.json')
            self.queue_document = Queue.load(logs_path + 'queue_document.json')
            self.set = UrlSet.load(logs_path + 'url_set.json')
        else:
            self.queue_resource = Queue(logs_path + 'queue_resource.json')
            self.queue_document = Queue(logs_path + 'queue_document.json')
            self.set = UrlSet(logs_path + 'url_set.json')

        self.document_task = Task(self.queue_document, doc_pool_max)
        self.document_task.initTaskWork(self.getDocument)

        self.source_task = Task(self.queue_resource, res_pool_max)
        self.source_task.initTaskWork(self.requestSource)

        self.main_url = urlTools.dealUrl2Request(self.main_url, self.main_url)
        print self.main_url, self.host
        file_path, file_name, html_url = urlTools.dealUrl2File(self.main_url, self.main_url, self.host, True)
        self.queue_document.push([self.main_url, file_path, file_name, 0, 0])
        print "file_path:" + file_path, "file_name:" + file_name, "html_url:" + html_url
        self.document_task.start()
Exemplo n.º 10
0
    def start(self, is_continue=False):
        if is_continue:
            self.queue_resource = Queue.load(logs_path + 'queue_resource.json')
            self.queue_document = Queue.load(logs_path + 'queue_document.json')
            self.set = UrlSet.load(logs_path + 'url_set.json')
        else:
            self.queue_resource = Queue(logs_path + 'queue_resource.json')
            self.queue_document = Queue(logs_path + 'queue_document.json')
            self.set = UrlSet(logs_path + 'url_set.json')

        self.document_task = Task(self.queue_document, doc_pool_max)
        self.document_task.initTaskWork(self.getDocument)

        self.source_task = Task(self.queue_resource, res_pool_max)
        self.source_task.initTaskWork(self.requestSource)

        self.main_url = urlTools.dealUrl2Request(self.main_url, self.main_url)
        print self.main_url, self.host
        file_path, file_name, html_url = urlTools.dealUrl2File(
            self.main_url, self.main_url, self.host, True)
        self.queue_document.push([self.main_url, file_path, file_name, 0, 0])
        print "file_path:" + file_path, "file_name:" + file_name, "html_url:" + html_url
        self.document_task.start()