def _get_data(self, url, method=GET_METHOD, trim_chars=0, **kwargs):
    """Send a request to Google and return the JSON response as a Python object

    :param url: the url to which the request will be sent
    :param method: the HTTP method ('get' or 'post'); anything other than
        ``TrendReq.POST_METHOD`` is sent as a GET
    :param trim_chars: how many characters should be trimmed off the
        beginning of the content of the response before this is passed to
        the JSON parser
    :param kwargs: any extra key arguments passed to the request builder
        (usually query parameters or data)
    :return: the object produced by ``json.loads`` on the trimmed body
    :raises exceptions.ResponseError: if the response has no Content-Type
        header, or one that is none of the JSON/JavaScript types below
    :raises ValueError: propagated from ``json.loads`` when the trimmed
        body is not valid JSON
    """
    # Google mostly sends 'application/json' in the Content-Type header,
    # but occasionally it sends 'application/javascript'
    # and sometimes even 'text/javascript'.
    json_content_types = (
        'application/json',
        'application/javascript',
        'text/javascript',
    )

    # The session is used for this single request only; the context manager
    # closes it (and its connection pool) on every exit path.
    with requests.session() as s:
        # A falsy value (the original sentinel is '') means "no proxies".
        if self.proxies:
            s.proxies.update(self.proxies)

        if method == TrendReq.POST_METHOD:
            response = s.post(url, cookies=self.cookies, **kwargs)
        else:
            response = s.get(url, cookies=self.cookies, **kwargs)

        # A missing header is treated like a non-JSON response instead of
        # surfacing as a KeyError.
        content_type = response.headers.get('Content-Type', '')
        if any(kind in content_type for kind in json_content_types):
            # trim initial characters
            # some responses start with garbage characters, like ")]}',"
            # these have to be cleaned before being passed to the json parser
            content = response.text[trim_chars:]
            return json.loads(content)

        # this is often the case when the amount of keywords in the payload
        # for the IP is not allowed by Google
        raise exceptions.ResponseError(
            'The request failed: Google returned a '
            'response with code {0}.'.format(response.status_code),
            response=response)
def _get_data(self, url, method=GET_METHOD, trim_chars=0, **kwargs):
    """Send a request to Google and return the JSON response as a Python object

    :param url: the url to which the request will be sent
    :param method: the HTTP method ('get' or 'post'); anything other than
        ``TrendReq.POST_METHOD`` is sent as a GET
    :param trim_chars: how many characters should be trimmed off the
        beginning of the content of the response before this is passed to
        the JSON parser
    :param kwargs: any extra key arguments passed to the request builder
        (usually query parameters or data)
    :return: the object produced by ``json.loads`` on the trimmed body
    :raises exceptions.ResponseError: if Google answers with anything other
        than a 200 JSON/JavaScript response, except for a 429 while proxies
        are configured (that case is retried)
    :raises ValueError: propagated from ``json.loads`` when the trimmed
        body is not valid JSON
    """
    # Google mostly sends 'application/json' in the Content-Type header,
    # but occasionally it sends 'application/javascript'
    # and sometimes even 'text/javascript'.
    json_content_types = (
        'application/json',
        'application/javascript',
        'text/javascript',
    )

    # The context manager closes the session (and its connection pool) on
    # every exit path: return, raise, or an unexpected exception.
    with requests.session() as s:
        # Retries mechanism. Activated when one of statements >0
        # (best used for proxy)
        if self.retries > 0 or self.backoff_factor > 0:
            retry_options = dict(
                total=self.retries,
                read=self.retries,
                connect=self.retries,
                backoff_factor=self.backoff_factor,
                status_forcelist=TrendReq.ERROR_CODES,
                raise_on_status=False,
            )
            retried_methods = frozenset(['GET', 'POST'])
            try:
                # urllib3 >= 1.26 name; the only one accepted by urllib3 2.x.
                retry = Retry(allowed_methods=retried_methods,
                              **retry_options)
            except TypeError:
                # Older urllib3 only knows the original keyword.
                retry = Retry(method_whitelist=retried_methods,
                              **retry_options)
            s.mount('https://', HTTPAdapter(max_retries=retry))

        s.headers.update({
            'accept-language': self.hl,
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'
        })

        while True:  # Keep retrying at 429 using new proxies
            if self.proxies:
                # Fresh cookies are fetched before every proxied attempt.
                self.cookies = self.GetGoogleCookie()
                s.proxies.update({'https': self.proxies[self.proxy_index]})
                print('Using proxy: {}'.format(
                    str(self.proxies[self.proxy_index])))

            try:
                send = s.post if method == TrendReq.POST_METHOD else s.get
                response = send(
                    url,
                    timeout=self.timeout,
                    cookies=self.cookies,
                    **kwargs,
                    **self.requests_args
                )  # DO NOT USE retries or backoff_factor here

                # A missing header is treated like a non-JSON response
                # instead of surfacing as a KeyError.
                content_type = response.headers.get('Content-Type', '')
                is_json = any(
                    kind in content_type for kind in json_content_types)

                # The status check must guard ALL accepted content types.
                # Written as `200 and A or B or C` it only guarded the
                # first, so non-200 JavaScript responses were parsed as
                # if they had succeeded.
                if response.status_code == 200 and is_json:
                    # trim initial characters
                    # some responses start with garbage characters,
                    # like ")]}',"
                    # these have to be cleaned before being passed to
                    # the json parser
                    content = response.text[trim_chars:]
                    self.GetNewProxy()
                    return json.loads(content)

                # Retrying a 429 only makes sense when there is a proxy
                # list to draw from; without one the loop would spin
                # forever against the same rate limit, so fall through
                # to the error below.
                if response.status_code == 429 and self.proxies:
                    # NOTE(review): nothing in this branch advances
                    # self.proxy_index, so the next attempt reuses the
                    # same index unless GetGoogleCookie changes it --
                    # confirm that is intended.
                    print('Google complains 429. Switch to another proxy.')
                    continue

                raise exceptions.ResponseError(
                    'The request failed: Google returned a '
                    'response with code {0}.'.format(response.status_code),
                    response=response)
            except (RetryError, MaxRetryError, ProxyError, Timeout):
                # NOTE(review): transport failures are retried without
                # any limit or delay, exactly as before -- confirm this
                # is the desired policy.
                continue