Пример #1
0
    def getDocument(self, url, file_path, file_name, deep, error_count):
        """Fetch one HTML page, rewrite its resource/anchor links to local
        paths, queue the discovered resources and pages, and save the page.

        :param url: page url (normalized again via dealUrl2Request below)
        :param file_path: directory, relative to the output root, to save into
        :param file_name: file name to save as; '' together with an empty
            file_path means the site root and defaults to 'index.html'
        :param deep: current page-following recursion depth
        :param error_count: how many times fetching this page already failed
        """
        # Stop past the configured depth (a negative recursion_deep
        # presumably means "unlimited" -- TODO confirm) or after too many
        # failed attempts for this page.
        if 0 <= recursion_deep < deep or error_count > document_error_max:
            return
        url = urlTools.dealUrl2Request(url, url)

        # The site root gets a default file name.
        if file_path == '' and file_name == '':
            file_name = 'index.html'
        try:
            req = self.requestGet(url)
            # Decode the body with the charset declared inside the HTML.
            charset = self.getHTMLCharset(req.content)
            req.encoding = charset
            d = pq(req.text)
            # print charset

            # Rewrite <link href>, <script src> and <img src> to local paths
            # and queue the referenced resources for download.
            linkList1 = d('link')
            self.dealSourceLink(linkList1, Url(req.url), 'href')

            linkList2 = d('script')
            self.dealSourceLink(linkList2, Url(req.url), 'src')

            linkList3 = d('img')
            self.dealSourceLink(linkList3, Url(req.url), 'src')

            # <a href> targets become new document tasks one level deeper.
            linkList4 = d('a')
            self.dealALink(linkList4, Url(req.url), 'href', deep)

            # Kick off the resource download task pool.
            self.source_task.start()

            # Save the page with all links already rewritten to local paths.
            self.saveFile(file_path, file_name,
                          bytearray(source=d.outer_html(), encoding='utf-8'))
        except requests.exceptions.ConnectionError, e:
            print 'ConnectionError:', e
            # Re-queue the page with an incremented error count for retry.
            self.queue_document.push(
                [url, file_path, file_name, deep, error_count + 1])
Пример #2
0
    def __init__(self, url):
        """Resolve the entry url (following redirects) and record its host.

        The queues, url set and task pools stay None until start() runs.
        """
        resolved = self.requestGet(url).url
        self.main_url = Url(resolved)
        host_info = self.main_url.getHost()
        self.host_option, self.host, self.host_url = host_info

        # Populated by start().
        self.queue_resource = None
        self.queue_document = None
        self.set = None
        self.document_task = None
        self.source_task = None
Пример #3
0
 def saveFile(self, file_path, file_name, bytes):
     path = Url(output + file_path)
     path = path.addUrlEnd()
     if not os.path.exists(path):
         os.makedirs(path)
     try:
         f = open(path + file_name, "wb")
         f.write(bytes)
         f.close()
     except IOError, e:
         print 'save Error: ', e, 'path: ', path, 'name: ', file_name
Пример #4
0
 def saveFile(self, file_path, file_name, bytes):
     path = Url(output + file_path)
     path = path.addUrlEnd()
     if not os.path.exists(path):
         os.makedirs(path)
     try:
         f = open(path + file_name, "wb")
         f.write(bytes)
         f.close()
     except IOError, e:
         print 'save Error: ', e, 'path: ', path, 'name: ', file_name
Пример #5
0
 def convLongPath(self, file_path, file_name):
     """Shorten over-long path components to fit filesystem name limits.

     A name or path longer than 128 characters is replaced by its SHA-1
     hex digest (40 chars). For the path, its first character (normally
     the leading '/') is kept so the result stays rooted the same way.

     :param file_path: directory part of the target path
     :param file_name: file name part of the target path
     :return: (file_path, file_name), each hashed if it was too long
     """
     if len(file_name) > 128:
         file_name = Url(hashlib.sha1(file_name).hexdigest())
     if len(file_path) > 128:
         # Keep the first char (usually '/') in front of the digest.
         file_path = file_path[0] + Url(hashlib.sha1(file_path).hexdigest())
     return file_path, file_name
Пример #6
0
    def dealUrl2File(self, url, origin, host=None, is_req_url=False):
        """Convert a url into a local (file_path, file_name, html_url) triple.

        :param
            url: the url to process
            origin: the url of the page where the request happened
            host: urls under this host are stored in the output root instead
                  of a per-domain folder; no main host is assumed by default
            is_req_url: whether url already went through dealUrl2Request

        :return
        """

        if not is_req_url:
            url = self.dealUrl2Request(url, origin)
        # url = self.simplifyUrl(url)
        # strip the trailing '/' from the url
        url = url.delUrlEnd()

        if host is not None:
            # if the url belongs to this site's own host, no per-domain
            # directory is needed
            if url.cmpHost(host):
                # strip the host part; this may escape the output root
                url = url.delHttp()
                url = url.delUrlStart()
                url = url.replace(host.getHost()[1], '')
        # strip the scheme, turning the url into a file path
        url = url.delHttp()
        url = url.delUrlStart()
        # replace characters that are illegal in file names
        for k, v in self.replaceChr.iteritems():
            if k in url:
                url = url.replace(k, v)


        file_name = Url(os.path.basename(url))
        file_path = Url(os.path.dirname(url))
        # hash down over-long file names / paths
        file_path, file_name = self.convLongPath(file_path, file_name)

        # if file_path.startswith('/') or file_path.startswith('.'):
        #     file_path = file_path[1:]
        # prefix so a file and a directory in the same parent never collide
        if file_name != '':
            file_name = 'f_'+file_name
        url = file_path.addUrlEnd() + file_name
        url = url.addUrlStart()

        # an empty file_path means the current directory
        return file_path, file_name, url
Пример #7
0
 def dealUrl2Request(self, url, origin):
     """Resolve *url* against *origin* into an absolute, simplified url."""
     origin = Url(origin)
     url = Url(url)
     is_absolute = url.startswith('http://') or url.startswith('https://')
     if not is_absolute:
         if url.startswith('//'):
             # protocol-relative: default to http
             url = 'http:' + url
         elif url.startswith('/'):
             # host-relative: prepend the origin's host url
             url = origin.getHost()[2] + url
         else:
             # document-relative: resolve against the origin's directory
             url = origin.getUrlDir() + url
     return url.simplifyUrl()
Пример #8
0
    def dealUrl2File(self, url, origin, host=None, is_req_url=False):
        """Convert a url into a local (file_path, file_name, html_url) triple.

        :param
            url: the url to process
            origin: the url of the page where the request happened
            host: urls under this host are stored in the output root instead
                  of a per-domain folder; no main host is assumed by default
            is_req_url: whether url already went through dealUrl2Request

        :return
        """

        if not is_req_url:
            url = self.dealUrl2Request(url, origin)
        # url = self.simplifyUrl(url)
        # strip the trailing '/' from the url
        url = url.delUrlEnd()

        if host is not None:
            # if the url belongs to this site's own host, no per-domain
            # directory is needed
            if url.cmpHost(host):
                # strip the host part; this may escape the output root
                url = url.delHttp()
                url = url.delUrlStart()
                url = url.replace(host.getHost()[1], '')
        # strip the scheme, turning the url into a file path
        url = url.delHttp()
        url = url.delUrlStart()
        # replace characters that are illegal in file names
        for k, v in self.replaceChr.iteritems():
            if k in url:
                url = url.replace(k, v)

        file_name = Url(os.path.basename(url))
        file_path = Url(os.path.dirname(url))
        # hash down over-long file names / paths
        file_path, file_name = self.convLongPath(file_path, file_name)

        # if file_path.startswith('/') or file_path.startswith('.'):
        #     file_path = file_path[1:]
        # prefix so a file and a directory in the same parent never collide
        if file_name != '':
            file_name = 'f_' + file_name
        url = file_path.addUrlEnd() + file_name
        url = url.addUrlStart()

        # an empty file_path means the current directory
        return file_path, file_name, url
Пример #9
0
    def __init__(self, url):
        """Resolve the entry url (following redirects) and record its host.

        The queues, url set and task pools stay None until start() runs.
        """
        resolved = self.requestGet(url).url
        self.main_url = Url(resolved)
        host_info = self.main_url.getHost()
        self.host_option, self.host, self.host_url = host_info

        # Populated by start().
        self.queue_resource = None
        self.queue_document = None
        self.set = None
        self.document_task = None
        self.source_task = None
Пример #10
0
 def dealSourceLink(self, linkList, origin_url, attr):
     """Rewrite each element's *attr* url to its local path, queueing any
     url not seen before as a new resource download."""
     for element in linkList:
         raw = pq(element).attr(attr)
         if raw is None:
             continue
         request_url = urlTools.dealUrl2Request(Url(raw), origin_url)
         if request_url in self.set:
             # already mapped: reuse the recorded local paths
             file_path, file_name, html_url = self.set[request_url]
         else:
             file_path, file_name, html_url = urlTools.dealUrl2File(
                 request_url, origin_url, self.host, True)
             # new resource: queue the download (error count starts at 0)
             # and remember the mapping
             self.queue_resource.push(
                 [request_url, file_path, file_name, 0])
             self.set[request_url] = [file_path, file_name, html_url]
         pq(element).attr(attr, html_url)
Пример #11
0
 def dealALink(self, linkList, origin_url, attr, deep):
     """Rewrite each anchor's *attr* url to its local path, queueing any
     unseen page as a new document task at depth deep + 1."""
     for element in linkList:
         raw = pq(element).attr(attr)
         if raw is None:
             continue
         request_url = urlTools.dealUrl2Request(Url(raw), origin_url)
         # skip off-site pages unless configured to follow them
         if not (outsite_page or request_url.getHost()[1] == self.host):
             continue
         if request_url in self.set:
             # already mapped: reuse the recorded local paths
             file_path, file_name, html_url = self.set[request_url]
         else:
             file_path, file_name, html_url = urlTools.dealUrl2File(
                 request_url, origin_url, self.host, True)
             # new page: queue it one level deeper with error count 0
             self.queue_document.push(
                 [request_url, file_path, file_name, deep + 1, 0])
             self.set[request_url] = [file_path, file_name, html_url]
         pq(element).attr(attr, html_url)
Пример #12
0
class Crawler:
    def __init__(self, url):
        self.main_url = Url(self.requestGet(url).url)
        self.host_option, self.host, self.host_url = self.main_url.getHost()

        self.queue_resource = None
        self.queue_document = None
        self.set = None
        self.document_task = None
        self.source_task = None

    def __del__(self):
        # del self.document_task
        # del self.queue_document
        # del self.source_task
        # del self.queue_resource
        print 'del c'

    def start(self, is_continue=False):
        if is_continue:
            self.queue_resource = Queue.load(logs_path + 'queue_resource.json')
            self.queue_document = Queue.load(logs_path + 'queue_document.json')
            self.set = UrlSet.load(logs_path + 'url_set.json')
        else:
            self.queue_resource = Queue(logs_path + 'queue_resource.json')
            self.queue_document = Queue(logs_path + 'queue_document.json')
            self.set = UrlSet(logs_path + 'url_set.json')

        self.document_task = Task(self.queue_document, doc_pool_max)
        self.document_task.initTaskWork(self.getDocument)

        self.source_task = Task(self.queue_resource, res_pool_max)
        self.source_task.initTaskWork(self.requestSource)

        self.main_url = urlTools.dealUrl2Request(self.main_url, self.main_url)
        print self.main_url, self.host
        file_path, file_name, html_url = urlTools.dealUrl2File(
            self.main_url, self.main_url, self.host, True)
        self.queue_document.push([self.main_url, file_path, file_name, 0, 0])
        print "file_path:" + file_path, "file_name:" + file_name, "html_url:" + html_url
        self.document_task.start()

    def requestGet(self, url):
        wait = random.random() * (wait_time[1] - wait_time[0])
        sleep(wait)
        timeout = Timeout(request_timeout)
        timeout.start()
        try:
            req = requests.get(url=url,
                               verify=True,
                               headers=headers,
                               proxies=proxies)
        except IncompleteRead:
            pass
            # todo:未知错误,暂还未查清
        timeout.cancel()
        return req

    def saveFile(self, file_path, file_name, bytes):
        path = Url(output + file_path)
        path = path.addUrlEnd()
        if not os.path.exists(path):
            os.makedirs(path)
        try:
            f = open(path + file_name, "wb")
            f.write(bytes)
            f.close()
        except IOError, e:
            print 'save Error: ', e, 'path: ', path, 'name: ', file_name
Пример #13
0
        self.set.save()
        self.document_task.stop()
        self.source_task.stop()


def work():
    # Debug marker before the run.
    print 123

    # 'c' is the Crawler instance created in the __main__ block.
    c.start()
    time.sleep(10)
    # Debug marker after the run.
    print 321


if __name__ == '__main__':
    url0 = raw_input("input the Url:")
    c = Crawler(Url(url0))

    # Shadows the module-level work(); runs one crawl in a worker thread.
    def work(is_continue):
        c.start(is_continue)
        print 'work stoped'

    # Simple interactive command loop driving the crawler.
    print "Enter 's' to start, 'c' to continue, 'e' to stop."
    while True:
        char = raw_input()
        if char == 's':
            # 's': fresh crawl in a background thread
            p = threading.Thread(target=work, args=(False, ))
            print 'Process will start.'
            p.start()
        elif char == 'c':
            # 'c': resume from the saved queue/url-set state
            # NOTE(review): snippet appears truncated here (no p.start())
            p = threading.Thread(target=work, args=(True, ))
            print 'Process will start.'
Пример #14
0
class Crawler:
    def __init__(self, url):
        self.main_url = Url(self.requestGet(url).url)
        self.host_option, self.host, self.host_url = self.main_url.getHost()

        self.queue_resource = None
        self.queue_document = None
        self.set = None
        self.document_task = None
        self.source_task = None

    def __del__(self):
        # del self.document_task
        # del self.queue_document
        # del self.source_task
        # del self.queue_resource
        print 'del c'

    def start(self, is_continue=False):
        if is_continue:
            self.queue_resource = Queue.load(logs_path + 'queue_resource.json')
            self.queue_document = Queue.load(logs_path + 'queue_document.json')
            self.set = UrlSet.load(logs_path + 'url_set.json')
        else:
            self.queue_resource = Queue(logs_path + 'queue_resource.json')
            self.queue_document = Queue(logs_path + 'queue_document.json')
            self.set = UrlSet(logs_path + 'url_set.json')

        self.document_task = Task(self.queue_document, doc_pool_max)
        self.document_task.initTaskWork(self.getDocument)

        self.source_task = Task(self.queue_resource, res_pool_max)
        self.source_task.initTaskWork(self.requestSource)

        self.main_url = urlTools.dealUrl2Request(self.main_url, self.main_url)
        print self.main_url, self.host
        file_path, file_name, html_url = urlTools.dealUrl2File(self.main_url, self.main_url, self.host, True)
        self.queue_document.push([self.main_url, file_path, file_name, 0, 0])
        print "file_path:" + file_path, "file_name:" + file_name, "html_url:" + html_url
        self.document_task.start()

    def requestGet(self, url):
        wait = random.random() * (wait_time[1] - wait_time[0])
        sleep(wait)
        timeout = Timeout(request_timeout)
        timeout.start()
        try:
            req = requests.get(url=url, verify=True, headers=headers, proxies=proxies)
        except IncompleteRead:
            pass
            # todo:未知错误,暂还未查清
        timeout.cancel()
        return req

    def saveFile(self, file_path, file_name, bytes):
        path = Url(output + file_path)
        path = path.addUrlEnd()
        if not os.path.exists(path):
            os.makedirs(path)
        try:
            f = open(path + file_name, "wb")
            f.write(bytes)
            f.close()
        except IOError, e:
            print 'save Error: ', e, 'path: ', path, 'name: ', file_name
Пример #15
0
 def dealUrl2Request(self, url, origin):
     """Resolve *url* against *origin* into an absolute, simplified url."""
     origin = Url(origin)
     url = Url(url)
     is_absolute = url.startswith('http://') or url.startswith('https://')
     if not is_absolute:
         if url.startswith('//'):
             # protocol-relative: default to http
             url = 'http:' + url
         elif url.startswith('/'):
             # host-relative: prepend the origin's host url
             url = origin.getHost()[2] + url
         else:
             # document-relative: resolve against the origin's directory
             url = origin.getUrlDir() + url
     return url.simplifyUrl()