Пример #1
0
 def update_headers(cls):
     """Rebuild ``cls.headers`` from ``cls.curl_str`` with a fresh cookie.

     The headers are parsed out of the curl command stored on the class and
     their ``Cookie`` entry is replaced by the value of
     ``cls.get_cookie_str()``. While that value is ``None``, ``change_ip()``
     is called and the whole procedure is repeated; ``cls.headers`` is only
     assigned once a non-``None`` cookie has been obtained.

     The retry is written as a loop instead of self-recursion so that a long
     run of failed cookie fetches cannot end in ``RecursionError``.
     """
     while True:
         cls.logger.info('*********updating cookies*********')
         _, headers, _ = parse_curl_str(cls.curl_str)
         headers['Cookie'] = cls.get_cookie_str()
         if headers['Cookie'] is not None:
             cls.headers = headers
             return
         # No cookie could be obtained: switch IP and start over.
         change_ip()
Пример #2
0
 def update_headers(cls):
     """Refresh ``cls.headers`` using ``cls.curl_str`` and a new cookie string.

     Each attempt logs a marker line, parses the headers from
     ``cls.curl_str`` and sets their ``Cookie`` entry to
     ``cls.get_cookie_str()``. An attempt whose cookie is ``None`` is
     discarded: ``change_ip()`` is called and another attempt is made.
     ``cls.headers`` is assigned only by a successful attempt.

     Attempts are iterated rather than made through recursive calls, so the
     number of consecutive failures is not limited by the interpreter's
     recursion depth.
     """
     while True:
         cls.logger.info('*********updating cookies*********')
         _, headers, _ = parse_curl_str(cls.curl_str)
         headers['Cookie'] = cls.get_cookie_str()
         if headers['Cookie'] is None:
             # Cookie fetch failed: change IP before trying again.
             change_ip()
             continue
         cls.headers = headers
         return
Пример #3
0
 def update_headers(self, changeip=True):
     """Merge cookies returned by ``self.base_url`` into ``self.headers``.

     :param changeip: when truthy, ``change_ip()`` is called before the
         request is made.
     """
     if changeip:
         change_ip()
     reply = get(self.base_url)
     issued = cookie_dict_from_cookie_str(reply.headers.get('Set-Cookie'))
     merged = cookie_dict_from_cookie_str(self.headers['Cookie'])
     # Values from the response's Set-Cookie header override stored ones.
     merged.update(issued)
     # NOTE(review): what is stored back is the object returned by
     # cookie_dict_from_cookie_str (presumably a dict), not a cookie
     # string -- confirm that consumers of self.headers expect this.
     self.headers['Cookie'] = merged
     self.logger.info('headers: %s', pformat(self.headers))
Пример #4
0
 def handle_response(self, url, response):
     """Parse a fetched page and hand the result to ``bulk_update_to_mongo``.

     A 200 response is parsed with ``KuaidailiHtmlParser`` and the parse
     result is passed to ``self.bulk_update_to_mongo``. A 503 response
     triggers ``change_ip()`` and puts ``url`` back on ``self.urls`` so it
     is fetched again. Any other status is ignored.

     :param url: URL the response belongs to.
     :param response: response object for ``url``, or ``None`` when no
         response is available (presumably a ``requests`` response --
         confirm against the caller).
     """
     self.logger.info('handle url: %s', url)
     # Test for None explicitly instead of using truthiness: if this is a
     # requests.Response it is falsy for every 4xx/5xx status, so
     # ``not response`` would return before the 503 branch below could run.
     if response is None:
         return
     if response.status_code == 200:
         html = response.text
         html_parser = KuaidailiHtmlParser(url, html)
         ip_info_dict_yield = html_parser.parse()
         self.bulk_update_to_mongo(ip_info_dict_yield)
     elif response.status_code == 503:
         change_ip()
         self.urls.append(url)  # retry
Пример #5
0
    def handle_response(self, url, response):
        """Store the proxy IP information from a fetched page in MongoDB.

        A 200 response is parsed with ``XiciHtmlParser`` and the parse
        result is passed to ``self.bulk_update_to_mongo``. A 503 response
        triggers ``change_ip()`` and puts ``url`` back on ``self.urls`` so
        it is fetched again. Any other status is ignored.

        :param url: URL the response belongs to.
        :param response: requests.models.Response, or ``None`` when no
            response is available.
        """
        self.logger.info('handle url: %s', url)
        # Test for None explicitly instead of using truthiness: a
        # requests Response is falsy for every 4xx/5xx status, so
        # ``not response`` would return before the 503 branch below could
        # run.
        if response is None:
            return
        if response.status_code == 200:
            html = response.text
            html_parser = XiciHtmlParser(url, html)
            ip_info_dict_yield = html_parser.parse()
            self.bulk_update_to_mongo(ip_info_dict_yield)
        elif response.status_code == 503:
            change_ip()
            self.urls.append(url)  # retry
Пример #6
0
    def fetch_channel_json(self, channel_json_url):
        """Fetch a channel's message list and save the articles it contains.

        Sleeps a random 30-60 seconds, requests ``channel_json_url`` with
        ``self.headers`` and evaluates the stripped body as a Python literal.
        A falsy result is logged, followed by ``change_ip()``, and the call
        ends. Otherwise one article dict is built per message, plus one per
        entry of ``multi_app_msg_item_list`` when ``is_multi`` is truthy.
        The list then goes through ``get_remove_too_old_days_article`` and
        ``get_remove_mongodb_already_has_article``, every remaining ``link``
        is replaced via ``get_permanent_wechat_article_url``, and the result
        is logged and passed to ``save_article_dict_list``.

        :param channel_json_url: URL of the channel's message list.
        """
        time.sleep(random.randint(30, 60))
        self.logger.info(channel_json_url)
        response = get(channel_json_url, headers=self.headers)
        # The body is evaluated as a literal rather than parsed as JSON; see
        # http://stackoverflow.com/questions/24027589/how-to-convert-raw-javascript-object-to-python-dictionary
        body = response.text.strip()
        payload = ast.literal_eval(body)
        if not payload:
            self.logger.info(pprint.pformat(body))
            self.logger.info('fetch channel_json_url: %s failed',
                             channel_json_url)
            change_ip()
            return

        channel_name = payload['nick_name']
        # 'general_msg_list' is itself a string holding a literal.
        raw_messages = payload['general_msg_list']
        messages = ast.literal_eval(raw_messages)['list']

        collected = []
        for message in messages:
            ext_info = message['app_msg_ext_info']
            created_at = message['comm_msg_info']['datetime']
            collected.append(
                self._get_articel_info(ext_info, channel_name, created_at))
            if not ext_info['is_multi']:
                continue
            # Entries of a multi-article message reuse its creation time.
            collected.extend(
                self._get_articel_info(sub_item, channel_name, created_at)
                for sub_item in ext_info['multi_app_msg_item_list'])

        recent = self.get_remove_too_old_days_article(collected)
        pending = self.get_remove_mongodb_already_has_article(
            channel_name, recent)

        for entry in pending:
            permanent_url = self.get_permanent_wechat_article_url(
                entry['link'])
            entry['link'] = permanent_url
        self.logger.info(pprint.pformat(pending))
        self.save_article_dict_list(channel_name, pending)
Пример #7
0
    def fetch_channel_json(self, channel_json_url):
        """Download a channel's message list and persist its articles.

        After a random 60-120 second pause, ``channel_json_url`` is requested
        with ``self.headers`` and the stripped body is evaluated as a Python
        literal. When the result is falsy the body is logged at debug level,
        the failure is logged, ``change_ip()`` is called and the method
        returns. Otherwise an article dict is produced for each message and
        for each entry of its ``multi_app_msg_item_list`` (when ``is_multi``
        is truthy). The list is passed through
        ``get_remove_too_old_days_article`` and
        ``get_remove_mongodb_already_has_article``, logged, and handed to
        ``save_article_dict_list``.

        :param channel_json_url: URL of the channel's message list.
        """
        time.sleep(random.randint(60, 120))
        self.logger.info(channel_json_url)
        reply = get(channel_json_url, headers=self.headers)
        # Body is evaluated as a literal instead of being parsed as JSON:
        # http://stackoverflow.com/questions/24027589/how-to-convert-raw-javascript-object-to-python-dictionary
        raw_text = reply.text.strip()
        data = ast.literal_eval(raw_text)
        if not data:
            self.logger.debug(pprint.pformat(raw_text))
            self.logger.info(
                'fetch channel_json_url: %s failed', channel_json_url
            )
            change_ip()
            return

        nick = data['nick_name']
        msg_list_literal = data['general_msg_list']
        msg_items = ast.literal_eval(msg_list_literal)['list']

        articles = []
        for msg in msg_items:
            main_info = msg['app_msg_ext_info']
            common_info = msg['comm_msg_info']
            created = common_info['datetime']

            main_article = self._get_articel_info(main_info, nick, created)
            articles.append(main_article)
            if main_info['is_multi']:
                # Sub-articles share the creation time of their message.
                for sub_info in main_info['multi_app_msg_item_list']:
                    sub_article = self._get_articel_info(
                        sub_info, nick, created
                    )
                    articles.append(sub_article)

        articles = self.get_remove_too_old_days_article(articles)
        articles = self.get_remove_mongodb_already_has_article(
            nick, articles
        )

        self.logger.info(pprint.pformat(articles))
        self.save_article_dict_list(nick, articles)
Пример #8
0
        def _wrapper(*args, **kwargs):
            """Call ``func`` up to ``retries`` times and return the last result.

            Each attempt calls ``func(*args, **kwargs)`` and then decides:

            * block/check page detected by ``LagouCrawler``: back off for
              ``sleep ** index`` plus 1-10 random seconds and retry; once
              that delay would exceed 300 seconds, call ``change_ip()`` and
              retry without sleeping;
            * status 301, 302, 404 or 500: stop retrying;
            * any other non-200 status: call ``change_ip()`` when
              ``changeip`` is truthy, then retry;
            * status 200: stop retrying;
            * ``Timeout``: sleep ``sleep`` plus 10-15 random seconds (when
              ``sleep`` is not ``None``) and retry;
            * ``TooManyRedirects``: stop retrying;
            * any other exception: print the traceback and retry.

            :return: the response of the final attempt, or ``None`` when
                that attempt raised or no attempt was made.
            """
            # Bind up front so the final ``return`` is defined even when
            # the loop body never runs (``retries`` <= 0); previously that
            # case raised UnboundLocalError.
            response = None
            index = 0
            while index < retries:
                index += 1
                try:
                    response = func(*args, **kwargs)
                    if response and (
                        LagouCrawler.is_block_html(response.text) or
                        LagouCrawler.is_check_html(response.text)
                    ):
                        # NOTE(review): ``sleep`` is checked against None
                        # further down but not here; a None value raises
                        # TypeError, which the handler below swallows.
                        sleep_time = (sleep ** index + random.randint(1, 10))
                        if sleep_time > 300:   # 5 mins
                            change_ip()
                            continue
                        else:
                            print('sleep for %ds' % sleep_time)
                            time.sleep(sleep_time)
                            continue

                    # A ``None`` response raises AttributeError here, which
                    # the generic handler below turns into another attempt.
                    if response.status_code in (301, 302, 404, 500):
                        print('status_code', response.status_code)
                        break
                    elif response.status_code != 200:
                        print(response.status_code)
                        if changeip:
                            change_ip()
                        continue
                    else:
                        break
                except Exception as e:
                    traceback.print_exc()
                    response = None
                    if isinstance(e, Timeout):
                        if sleep is not None:
                            time.sleep(sleep + random.randint(10, 15))
                        continue
                    elif isinstance(e, TooManyRedirects):
                        break
                    # Every other exception falls through to the next
                    # iteration of the loop.

            return response