Exemplo n.º 1
0
    def _get_data(self, url, method=GET_METHOD, trim_chars=0, **kwargs):
        """Send a request to Google and return the JSON response as a Python object.

        :param url: the url to which the request will be sent
        :param method: the HTTP method ('get' or 'post')
        :param trim_chars: how many characters should be trimmed off the beginning of the content of the response
            before this is passed to the JSON parser
        :param kwargs: any extra key arguments passed to the request builder (usually query parameters or data)
        :return: the object produced by ``json.loads`` on the (trimmed) response body
        :raises exceptions.ResponseError: if the response does not announce a JSON/JavaScript content type
        """
        # A fresh session is created on every call; the context manager closes it
        # on every exit path so its pooled connections are not leaked.
        with requests.session() as s:
            if self.proxies != '':
                s.proxies.update(self.proxies)
            if method == TrendReq.POST_METHOD:
                response = s.post(url, cookies=self.cookies, **kwargs)
            else:
                response = s.get(url, cookies=self.cookies, **kwargs)

            # check if the response contains json and throw an exception otherwise
            # Google mostly sends 'application/json' in the Content-Type header,
            # but occasionally it sends 'application/javascript'
            # and sometimes even 'text/javascript'.
            # A response without any Content-Type header is treated as non-JSON
            # (ResponseError below) instead of failing with a bare KeyError.
            content_type = response.headers.get('Content-Type', '')
            accepted_types = ('application/json',
                              'application/javascript',
                              'text/javascript')
            if not any(accepted in content_type for accepted in accepted_types):
                # this is often the case when the amount of keywords in the payload for the IP
                # is not allowed by Google
                raise exceptions.ResponseError(
                    'The request failed: Google returned a '
                    'response with code {0}.'.format(response.status_code),
                    response=response)

            # trim initial characters
            # some responses start with garbage characters, like ")]}',"
            # these have to be cleaned before being passed to the json parser
            content = response.text[trim_chars:]

            # parse json
            return json.loads(content)
Exemplo n.º 2
0
    def _get_data(self, url, method=GET_METHOD, trim_chars=0, **kwargs):
        """Send a request to Google and return the JSON response as a Python object.

        The request is repeated in an unbounded loop: a 429 response, or one of
        the transport errors caught at the bottom of the loop, starts another
        attempt. Only a successful JSON response or a non-429 failure ends it.

        :param url: the url to which the request will be sent
        :param method: the HTTP method ('get' or 'post')
        :param trim_chars: how many characters should be trimmed off the beginning of the content of the response
            before this is passed to the JSON parser
        :param kwargs: any extra key arguments passed to the request builder (usually query parameters or data)
        :return: the object produced by ``json.loads`` on the (trimmed) response body
        :raises exceptions.ResponseError: if the response is neither a 200 with a JSON/JavaScript
            content type nor a 429
        """
        accepted_types = ('application/json',
                          'application/javascript',
                          'text/javascript')

        # A fresh session is created on every call; the context manager closes it
        # on every exit path so its pooled connections are not leaked.
        with requests.session() as s:
            # Retries mechanism. Activated when one of statements >0 (best used for proxy)
            if self.retries > 0 or self.backoff_factor > 0:
                # NOTE(review): `method_whitelist` was renamed `allowed_methods` in
                # newer urllib3 releases - confirm against the pinned version.
                retry = Retry(total=self.retries,
                              read=self.retries,
                              connect=self.retries,
                              backoff_factor=self.backoff_factor,
                              status_forcelist=TrendReq.ERROR_CODES,
                              raise_on_status=False,
                              method_whitelist=frozenset(['GET', 'POST']))
                s.mount('https://', HTTPAdapter(max_retries=retry))

            s.headers.update({
                'accept-language':
                self.hl,
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'
            })

            # NOTE(review): this loop has no attempt limit and no delay of its own;
            # with an empty proxy list a persistent 429 is retried immediately and
            # forever. Consider bounding it (e.g. by self.retries).
            while True:  # Keep retrying at 429 using new proxies
                if len(self.proxies) > 0:
                    # Cookies are re-fetched before every attempt made through a proxy.
                    self.cookies = self.GetGoogleCookie()
                    s.proxies.update({'https': self.proxies[self.proxy_index]})
                    print('Using proxy: {}'.format(
                        str(self.proxies[self.proxy_index])))
                try:
                    if method == TrendReq.POST_METHOD:
                        response = s.post(
                            url,
                            timeout=self.timeout,
                            cookies=self.cookies,
                            **kwargs,
                            **self.requests_args
                        )  # DO NOT USE retries or backoff_factor here
                    else:
                        response = s.get(
                            url,
                            timeout=self.timeout,
                            cookies=self.cookies,
                            **kwargs,
                            **self.requests_args
                        )  # DO NOT USE retries or backoff_factor here

                    # check if the response contains json and throw an exception otherwise
                    # Google mostly sends 'application/json' in the Content-Type header,
                    # but occasionally it sends 'application/javascript'
                    # and sometimes even 'text/javascript'.
                    # A missing header counts as "not JSON" rather than raising KeyError.
                    content_type = response.headers.get('Content-Type', '')
                    is_json = any(accepted in content_type
                                  for accepted in accepted_types)

                    # The status check must gate ALL accepted content types. Written
                    # as `status == 200 and A or B or C`, `and` binds tighter than
                    # `or`, so a 429/5xx answer served as JavaScript was parsed as
                    # if it had succeeded.
                    if response.status_code == 200 and is_json:
                        # trim initial characters
                        # some responses start with garbage characters, like ")]}',"
                        # these have to be cleaned before being passed to the json parser
                        content = response.text[trim_chars:]
                        # presumably advances the proxy used by the next call -
                        # confirm against GetNewProxy()
                        self.GetNewProxy()
                        # parse json
                        return json.loads(content)
                    elif response.status_code == 429:
                        # NOTE(review): nothing in this branch changes
                        # self.proxy_index; any proxy rotation has to come from
                        # GetGoogleCookie() at the top of the loop - confirm.
                        print('Google complains 429. Switch to another proxy.')
                        continue
                    else:
                        # error
                        raise exceptions.ResponseError(
                            'The request failed: Google returned a '
                            'response with code {0}.'.format(response.status_code),
                            response=response)
                except (RetryError, MaxRetryError, ProxyError, Timeout):
                    # Transport-level failure: deliberately swallowed so that the
                    # loop makes another attempt.
                    continue