Example #1
def dealUrl2Request(self, url, origin):
    # Method of the project's urlTools helper (see the Crawler example below):
    # resolve a possibly relative url against the page (origin) it was found on.
    origin = Url(origin)
    url = Url(url)
    if not url.startswith('http://') and not url.startswith('https://'):
        if url.startswith('//'):
            # Protocol-relative URL: default to the http scheme.
            url = 'http:' + url
        elif url.startswith('/'):
            # Root-relative URL: prepend the origin's host URL
            # (getHost() returns (host_option, host, host_url)).
            url = origin.getHost()[2] + url
        else:
            # Directory-relative URL: append to the origin's directory.
            origin = origin.getUrlDir()
            url = origin + url
    url = url.simplifyUrl()
    return url
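For reference, the same resolution rules (protocol-relative //, root-relative /, and directory-relative paths) can be sketched with the standard library's urljoin. This is only an illustration of the intended behaviour; the project's Url class, getHost() and simplifyUrl() are assumed from the surrounding code and may differ in details.

# Rough stand-alone equivalent of dealUrl2Request (illustration only).
from urlparse import urljoin

def resolve_url(url, origin):
    if url.startswith('//'):
        # Protocol-relative URL: default to http, as the method above does.
        return 'http:' + url
    # urljoin handles absolute, root-relative and directory-relative URLs.
    return urljoin(origin, url)

print resolve_url('//cdn.example.com/a.js', 'http://example.com/index.html')  # http://cdn.example.com/a.js
print resolve_url('/about.html', 'http://example.com/blog/post.html')         # http://example.com/about.html
print resolve_url('img/logo.png', 'http://example.com/blog/post.html')        # http://example.com/blog/img/logo.png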
Example #2
class Crawler:
    def __init__(self, url):
        # Request the entry page once so main_url reflects any redirect the site performs.
        self.main_url = Url(self.requestGet(url).url)
        self.host_option, self.host, self.host_url = self.main_url.getHost()

        # Queues, the visited-URL set and the task pools are created in start().
        self.queue_resource = None
        self.queue_document = None
        self.set = None
        self.document_task = None
        self.source_task = None

    def __del__(self):
        # del self.document_task
        # del self.queue_document
        # del self.source_task
        # del self.queue_resource
        print 'del Crawler'

    def start(self, is_continue=False):
        # Either resume the queues and visited-URL set from the JSON files left by a
        # previous run, or create fresh ones backed by the same files.
        if is_continue:
            self.queue_resource = Queue.load(logs_path + 'queue_resource.json')
            self.queue_document = Queue.load(logs_path + 'queue_document.json')
            self.set = UrlSet.load(logs_path + 'url_set.json')
        else:
            self.queue_resource = Queue(logs_path + 'queue_resource.json')
            self.queue_document = Queue(logs_path + 'queue_document.json')
            self.set = UrlSet(logs_path + 'url_set.json')

        # One task pool processes documents (getDocument), the other resources (requestSource).
        self.document_task = Task(self.queue_document, doc_pool_max)
        self.document_task.initTaskWork(self.getDocument)

        self.source_task = Task(self.queue_resource, res_pool_max)
        self.source_task.initTaskWork(self.requestSource)

        # Normalise the entry URL, map it to a local file and seed the document queue.
        self.main_url = urlTools.dealUrl2Request(self.main_url, self.main_url)
        print self.main_url, self.host
        file_path, file_name, html_url = urlTools.dealUrl2File(
            self.main_url, self.main_url, self.host, True)
        self.queue_document.push([self.main_url, file_path, file_name, 0, 0])
        print "file_path:" + file_path, "file_name:" + file_name, "html_url:" + html_url
        self.document_task.start()

    def requestGet(self, url):
        # Wait a random interval inside wait_time before each request to avoid hammering the site.
        wait = wait_time[0] + random.random() * (wait_time[1] - wait_time[0])
        sleep(wait)
        # Abort the request if it takes longer than request_timeout.
        timeout = Timeout(request_timeout)
        timeout.start()
        req = None
        try:
            req = requests.get(url=url,
                               verify=True,
                               headers=headers,
                               proxies=proxies)
        except IncompleteRead:
            # TODO: cause still unknown; swallow it so the caller can skip this URL (req stays None).
            pass
        finally:
            timeout.cancel()
        return req

    def saveFile(self, file_path, file_name, bytes):
        # Build the output directory from the configured output root, creating it if needed.
        path = Url(output + file_path)
        path = path.addUrlEnd()
        if not os.path.exists(path):
            os.makedirs(path)
        try:
            # Write the raw bytes; the with-block closes the handle even on error.
            with open(path + file_name, "wb") as f:
                f.write(bytes)
        except IOError as e:
            print 'save Error: ', e, 'path: ', path, 'name: ', file_name
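requestGet above depends on module-level settings (wait_time, request_timeout, headers, proxies) and a Timeout helper from the surrounding project. A minimal self-contained sketch of the same polite-delay-plus-timeout idea, using requests' own timeout parameter instead of the Timeout helper (the constants below are made-up stand-ins, not the project's values):

import random
import time
import requests

WAIT_RANGE = (0.5, 2.0)   # stand-in for the project's wait_time setting
REQUEST_TIMEOUT = 10      # stand-in for request_timeout, in seconds

def polite_get(url):
    # Sleep a random interval inside WAIT_RANGE before hitting the server.
    time.sleep(WAIT_RANGE[0] + random.random() * (WAIT_RANGE[1] - WAIT_RANGE[0]))
    try:
        # requests' timeout covers connect and read, playing the role of the Timeout above.
        return requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        # Mirror the original's behaviour of swallowing request errors; callers check for None.
        return None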
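saveFile goes through the project's Url/addUrlEnd helpers and an output setting. The underlying pattern of creating the directory tree and then writing bytes can be shown with the standard library alone (the directory and file names below are hypothetical):

import os

def save_bytes(directory, file_name, data):
    # Create the directory tree on first use.
    if not os.path.exists(directory):
        os.makedirs(directory)
    path = os.path.join(directory, file_name)
    try:
        # Binary mode so HTML, images and other resources are written byte-for-byte.
        with open(path, 'wb') as f:
            f.write(data)
    except IOError as e:
        print 'save error:', e, 'path:', path

save_bytes('output/example.com/img', 'logo.png', b'\x89PNG...')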