def _verify_get(self, url):
     # kwargs.setdefault("allow_redirects", False)
     response = self._http_client.get(url)
     if response.status_code == 200:
         pass
     elif response.status_code == 302:
         location = response.headers['Location']
         user_verify_url = urljoin("http://qiye.qianzhan.com/", location)
         is_success = self.do_verify(user_verify_url)
         if is_success:
             response = self._verify_get(url)
         else:
             is_success = self.login()
             if is_success:
                 response = self._http_client.get(url)
             else:
                 raise Error302()
     elif response.status_code == 403:
         raise Error403()
     elif response.status_code == 404:
         is_success = self.login()
         if is_success:
             response = self._http_client.get(url)
         else:
             raise Error404()
     else:
         raise ErrorStatusCode()
     return response
예제 #2
0
 def _verify_get(self,
                 url,
                 times=0,
                 headers=default_headers,
                 refresh_ip=False,
                 timeout=download_timeout):
     """GET ``url`` with this client's User-Agent and return a 200 response.

     Args:
         url: address to fetch.
         times: accepted for interface compatibility; not used here.
         headers: base request headers. They are copied, never modified.
         refresh_ip: accepted for interface compatibility; not used here.
         timeout: forwarded to the HTTP client.

     Returns:
         The response object when its status code is 200.

     Raises:
         Error302, Error403, Error404, Error502: for the matching status.
         ErrorStatusCode: for any other non-200 status (carries the code).
     """
     # Work on a copy: updating ``headers`` in place would permanently alter
     # the shared module-level default (and any dict the caller passed in).
     request_headers = dict(headers or {})
     request_headers['User-Agent'] = self._user_agent

     response = self._http_client.get(url,
                                      headers=request_headers,
                                      timeout=timeout)
     status = response.status_code
     if status == 200:
         logging.debug(response.headers)
         return response
     if status == 302:
         location = response.headers['Location']
         logging.debug("location: %s" % location)
         raise Error302()
     if status == 403:
         raise Error403()
     if status == 404:
         raise Error404()
     if status == 502:
         raise Error502()
     raise ErrorStatusCode(status)
예제 #3
0
    def _run(self):
        """Crawl the search page for pairwise combinations of ``self._txt``.

        For every index pair ``(i, j)`` with ``j >= i`` the two entries are
        concatenated into a search key. Keys already recorded in Redis are
        skipped; otherwise the search URL is fetched through
        ``self._get_search`` and the key is recorded on success.

        Raises:
            Error302: re-raised with the current ``(i, j)`` position.
            Error403: re-raised with the current ``(i, j)`` position.

        Any other exception from a single search is logged with its
        traceback and the loop moves on to the next key.
        """
        # NOTE(review): the outer loop starts at a hard-coded 968, presumably
        # a resume offset from an earlier run -- confirm before reuse.
        for i in range(968, len(self._txt)):
            for j in range(i, len(self._txt)):
                search_key = self._txt[i] + self._txt[j]
                if RedisClient.get_search_key_key(search_key):
                    continue
                logging.info(
                    "++++++crawl 1000:->i: %d, j: %d, len: %d, search_key: %s"
                    % (i, j, len(self._txt), search_key))
                url = "http://qiye.qianzhan.com/search/all/" + urllib.quote(
                    search_key.encode(
                        'utf-8')) + "?o=0&area=11&areaN=%E5%8C%97%E4%BA%AC"

                try:
                    self._get_search(url)
                    RedisClient.set_search_key_key(search_key)
                except Error302:
                    raise Error302(i, j)
                except Error403:
                    raise Error403(i, j)
                except Exception as e:
                    # Best effort: one failing key must not stop the crawl.
                    # Format the exception itself; ``e.message`` is deprecated
                    # and empty for exceptions built with several arguments.
                    logging.exception(
                        "_get_search:->i: %d, j: %d, len: %d, search_key: %s, %s"
                        % (i, j, len(self._txt), search_key, e))
예제 #4
0
    def _verify_post(self, url, data=None, json=None, times=0, headers=default_headers, timeout=download_timeout):
        """POST to ``url`` and return the response only when it is a 200.

        Args:
            url: address to post to.
            data: form payload forwarded to the HTTP client.
            json: JSON payload forwarded to the HTTP client.
            times: accepted for interface compatibility; not used here.
            headers: request headers, forwarded unchanged.
            timeout: forwarded to the HTTP client.

        Returns:
            The response object when its status code is 200.

        Raises:
            Error302, Error403, Error404, Error502, Error503: for the
                matching status code.
            ErrorStatusCode: for any other non-200 status (carries the code).
        """
        # Errors propagate unchanged; the former ``except Error403: raise err``
        # wrapper added nothing and discarded the traceback on Python 2.
        response = self._http_client.post(url=url, data=data, json=json, headers=headers, timeout=timeout)
        status = response.status_code
        if status == 200:
            logging.debug(response.headers)
            return response
        if status == 302:
            location = response.headers['Location']
            logging.debug("location: %s" % location)
            raise Error302()
        if status == 403:
            raise Error403()
        if status == 404:
            raise Error404()
        if status == 502:
            raise Error502()
        if status == 503:
            raise Error503()
        raise ErrorStatusCode(status)
예제 #5
0
    def _verify_post(self, url, data=None, json=None, times=0):
        try:
            response = self._http_client.post(url, data, json)
            if response.status_code == 200:
                pass
            elif response.status_code == 302:
                location = response.headers['Location']
                logging.debug("location: %s" % location)
                raise Error302()
            elif response.status_code == 403:
                raise Error403()
            elif response.status_code == 404:
                raise Error404()
            elif response.status_code == 502:
                raise Error502()
            elif response.status_code == 503:
                raise Error503()
            else:
                raise ErrorStatusCode(response.status_code)
            return response

        except HttpClientError, err:
            times += 1
            if times < 3:
                return self._verify_post(url, data=data, json=json, times=times)
            else:
                raise err
예제 #6
0
 def run(self):
     """Log in through ``self._qianzhan_client`` and run the crawl.

     A failed login is turned into ``Error302``. Any ``Error302`` -- from the
     login check or raised by ``self._run()`` -- is logged and swallowed;
     other exceptions propagate to the caller.
     """
     logging.info("+++++++++++++run++++++++++++++++")
     try:
         if not self._qianzhan_client.login():
             raise Error302()
         self._run()
         logging.info("++++++++++++++success finish!!!++++++++")
     except Error302 as err:
         # ``message`` is missing on Python 3 and empty for exceptions built
         # with zero or several arguments; fall back to repr so the log line
         # is never blank.
         logging.error(getattr(err, 'message', None) or repr(err))
예제 #7
0
 def _verify_get(self, url, **kwargs):
     kwargs.setdefault("allow_redirects", False)
     response = self._http_client.get(url, **kwargs)
     if response.status_code == 200:
         pass
     elif response.status_code == 302:
         location = response.headers['Location']
         logging.debug("location: %s" % location)
         raise Error302()
     elif response.status_code == 403:
         raise Error403()
     elif response.status_code == 404:
         raise Error404()
     else:
         raise ErrorStatusCode()
     return response
예제 #8
0
 def _check_response(self, response):
     if response.status_code == 200:
         logging.debug(response.headers)
         pass
     elif response.status_code == 302:
         location = response.headers['Location']
         logging.debug("location: %s" % location)
         raise Error302()
     elif response.status_code == 403:
         raise Error403()
     elif response.status_code == 404:
         raise Error404()
     elif response.status_code == 502:
         raise Error502()
     elif response.status_code == 503:
         raise Error503()
     else:
         raise ErrorStatusCode(response.status_code)
     return response
예제 #9
0
    def _run(self):
        """Crawl the search page for each company from ``ZhaopinDB``.

        Each item's ``company_name`` is used as the search key. Keys already
        recorded in Redis are skipped; otherwise the search URL is fetched
        through ``self._get_search`` and the key is recorded on success.

        Raises:
            Error302: re-raised as a fresh instance when the search hits one.
            Error403: re-raised as a fresh instance when the search hits one.
        """
        cur = ZhaopinDB.get_companys()
        for item in cur:
            search_key = item['company_name']
            if RedisClient.get_search_key_detail_key(search_key):
                continue
            logging.info("++++++crawl zhaopin:->search_key: %s" % search_key)
            url = "http://qiye.qianzhan.com/search/all/" + urllib.quote(
                search_key.encode(
                    'utf-8')) + "?o=0&area=11&areaN=%E5%8C%97%E4%BA%AC"

            try:
                self._get_search(url)
                RedisClient.set_search_key_detail_key(search_key)
            # ``except X as e`` / unbound form: the ``except X, e`` spelling is
            # Python 2-only, and the bound names were never used.
            except Error302:
                raise Error302()
            except Error403:
                raise Error403()