def _defaultFetcher(url):
    """Retrieve data from ``url``. css_parser default implementation of
    fetch URL function.

    Returns ``(encoding, string)`` or ``None``
    """
    try:
        request = urllib_Request(url)
        request.add_header('User-agent',
                           'css_parser %s (http://www.cthedot.de/css_parser/)' % VERSION)
        res = urllib_urlopen(request)
    except urllib_HTTPError as e:
        # HTTP error, e.g. 404; e can be raised
        log.warn('HTTPError opening url=%s: %s %s' % (url, e.code, e.msg),
                 error=e)
    except urllib_URLError as e:
        # URLError like mailto: or other IO errors; e can be raised
        log.warn('URLError, %s' % e.reason, error=e)
    except OSError as e:
        # e.g. if file URL and not found
        log.warn(e, error=OSError)
    except ValueError as e:
        # invalid url, e.g. "1"
        log.warn('ValueError, %s' % e.args[0], error=ValueError)
    else:
        if res:
            mimeType, encoding = encutils.getHTTPInfo(res)
            if mimeType != 'text/css':
                log.error('Expected "text/css" mime type for url=%r but found: %r'
                          % (url, mimeType), error=ValueError)
            content = res.read()
            if hasattr(res, 'close'):
                res.close()
            return encoding, content
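# A minimal usage sketch for the fetcher above, assuming the module-level
# compat aliases (urllib_Request, urllib_urlopen), `log`, `encutils` and
# VERSION are in scope; the URL is purely illustrative. The function returns
# None on any of the logged error paths.
def example_fetch():
    result = _defaultFetcher('https://example.com/style.css')
    if result is not None:
        encoding, content = result
        print('fetched %d bytes, declared encoding: %r' % (len(content), encoding))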
def _doRequest(self, url):
    """Do an HTTP request.

    Return (url, rawcontent); url might have been changed by the server
    due to redirects etc.
    """
    self._log.debug(' CSSCapture._doRequest\n * URL: %s' % url)
    req = urllib_Request(url)
    if self._ua:
        req.add_header('User-agent', self._ua)
        self._log.info(' * Using User-Agent: %s', self._ua)
    try:
        res = urllib_urlopen(req)
    except urllib_HTTPError as e:
        self._log.critical(' %s\n%s %s\n%s' % (
            e.geturl(), e.code, e.msg, e.headers))
        return None, None
    # get the real URL in case the server redirected
    if url != res.geturl():
        url = res.geturl()
        self._log.info(' URL retrieved: %s', url)
    return url, res
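# A hedged sketch of a caller for _doRequest; `capture` stands in for a
# CSSCapture-like instance (hypothetical setup, not from the source). A
# (None, None) return signals an HTTP error that was already logged.
def example_capture(capture, start_url):
    real_url, res = capture._doRequest(start_url)
    if res is None:
        return None
    return real_url, res.read()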
def download_url(url, header=None):
    # Retry up to 5 times before giving up; raising inside the loop on the
    # first failure would make the retries unreachable.
    for retries in range(0, 5):
        try:
            r = urllib_Request(url)
            r.add_header('User-Agent', UA)
            if header:
                for h_key, h_value in header.items():
                    r.add_header(h_key, h_value)
            http_handler = HTTPHandler(debuglevel=0)
            https_handler = HTTPSHandler(debuglevel=0)
            opener = build_opener(http_handler, https_handler)
            install_opener(opener)
            u = urlopen(r)
            contents = u.read()
            u.close()
            return contents
        except Exception:
            # try again; fail only after the last attempt
            continue
    raise RuntimeError('Could not open URL: {}'.format(url))
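# Example call with an extra request header; the header and URL are
# illustrative only. download_url() returns raw response bytes, so decode
# before treating the payload as text.
def example_download():
    raw = download_url('https://example.com/data.json',
                       header={'Accept': 'application/json'})
    return raw.decode('utf-8')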
def _fetch_and_parse(self, job_id, url, depth):
    """Fetch a webpage and parse it for links and images.

    Arguments:
        job_id: integer job id.
        url: string URL.
        depth: integer current depth.

    Returns:
        None.
    """
    html_parser = MyHtmlParser(url)
    request_headers = {'User-Agent': self.user_agent}
    request = urllib_Request(url, headers=request_headers)
    try:
        webpage = urlopen(request).read().decode()
    except Exception:
        data.redis.set(url, 'failed')
        return
    try:
        html_parser.feed(webpage)
    except HTMLParseError:
        data.redis.set(url, 'failed')
        return
    data.add_webpages(url, html_parser.hyperlinks, depth)
    data.redis.set(url, 'complete')
    data.complete_crawl(url)
    # Enqueue child links only while depth remains and the job is still active.
    if 0 < depth and self._active and not data.job_is_aborted(job_id):
        if html_parser.hyperlinks:
            data.redis.sadd('job' + str(job_id), *html_parser.hyperlinks)
            data.redis.publish('deploy', pickle.dumps(job_id))
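# A minimal sketch of what a link-collecting parser like MyHtmlParser could
# look like, built on the standard library's html.parser. Only the
# .hyperlinks attribute used by the crawler above is modelled; everything
# else here is an assumption, not the project's actual implementation.
from html.parser import HTMLParser
from urllib.parse import urljoin

class LinkCollector(HTMLParser):
    def __init__(self, base_url):
        super().__init__()
        self.base_url = base_url
        self.hyperlinks = []

    def handle_starttag(self, tag, attrs):
        # Collect absolute URLs from <a href> and <img src> attributes.
        for name, value in attrs:
            if value and ((tag == 'a' and name == 'href') or
                          (tag == 'img' and name == 'src')):
                self.hyperlinks.append(urljoin(self.base_url, value))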
tracert_line = re_sub(' +', ' ', tracert_line)
counter = 1
while len(tracert_line) > 5:
    # 'Ошибка передачи' is the Russian-locale tracert message "Transfer error".
    if 'Ошибка передачи' in tracert_line:
        print('Ошибка передачи')
        sys_exit(ERR_TRANSITION_ERROR)
    # 'Превышен интервал ожидания для запроса.' means "Request timed out."
    if "Превышен интервал ожидания для запроса." in tracert_line:
        print(str(counter) + "\t" + " ".join(tracert_line.split(' ')[5:]))
        break
    node_ip = tracert_line.split(' ')[-2]
    if node_ip.startswith('['):
        node_ip = node_ip[1:-1]
    url_iptoasn = 'https://api.iptoasn.com/v1/as/ip/' + node_ip
    request = urllib_Request(url_iptoasn,
                             headers={'User-Agent': 'Mozilla/5.0'})
    json_answer = {}
    try:
        json_answer = json_loads(urlopen(request).read().decode('utf-8'))
        time_sleep(0.5)
    except URLError as error:
        print("Internet connection problem occurred:")
        print(str(error))
        sys_exit(ERR_INTERNET_CONNECTION_ERROR)
    if counter == 1:
        print('№\tIP\t\tASN\tCOUNTRY\tPROVIDER')
    if 'announced' not in json_answer:
        print(f'{counter}\tIncorrect data in json_answer')
    elif not json_answer['announced']:
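# A hedged sketch of how the truncated branch above might print a hop row
# from an iptoasn.com answer. The field names (as_number, as_country_code,
# as_description) follow the api.iptoasn.com v1 response as commonly
# documented and should be treated as assumptions here.
def print_hop_row(counter, node_ip, json_answer):
    if not json_answer.get('announced'):
        # unannounced address space: no ASN/provider data to show
        print(f'{counter}\t{node_ip}\tnot announced')
    else:
        print(f"{counter}\t{node_ip}\t{json_answer['as_number']}\t"
              f"{json_answer['as_country_code']}\t{json_answer['as_description']}")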
def get_data(url, forceFetch=False, decrypt=False, useCache=True):
    if not url:
        return url
    start = datetime.datetime.now()
    tag = ''
    data = ''
    forceFetch = forceFetch or not useCache
    cache = common_cache.get(url)
    if cache:
        try:
            tag = cache.get('tag')
            data = cache.get('data')
        except Exception:
            # older cache entries stored the payload directly
            data = cache
    if data and not forceFetch:
        log('getData Cache (' +
            str(int((datetime.datetime.now() - start).total_seconds() * 1000)) +
            'ms) ' + str(url), 'Debug')
        return json.loads(data)
    new_headers = {}
    if tag != '':
        new_headers.update({'If-None-Match': tag})
    new_headers.update({'User-Agent': 'okhttp/3.10.0'})
    new_headers.update({'Accept-Encoding': 'gzip'})
    try:
        request = urllib_urlopen(urllib_Request(url, headers=new_headers))
    except urllib_HTTPError as e:
        if e.code == 304:
            # 304 Not Modified: the cached copy is still current
            log('getData 304 (' +
                str(int((datetime.datetime.now() - start).total_seconds() * 1000)) +
                'ms) ' + str(url), 'Debug')
            return json.loads(data)
        failure = str(e)
        if hasattr(e, 'code') or hasattr(e, 'reason'):
            log('get_data ERROR: ' + url + ' / ' + failure)
        log('getData RequestErr (' +
            str(int((datetime.datetime.now() - start).total_seconds() * 1000)) +
            'ms) ' + str(url), 'Debug')
        return json.loads(data)
    if request.info().get('Content-Encoding') == 'gzip':
        # gzip.GzipFile needs a bytes buffer (io.BytesIO), not a text StringIO
        buffer = BytesIO(request.read())
        deflatedContent = gzip.GzipFile(fileobj=buffer)
        data = deflatedContent.read()
    else:
        data = request.read()
    # if an ETag is set, use it and keep the cached entry much longer
    exp = datetime.timedelta(minutes=_cacheMinutes)
    if request.info().get('ETag'):
        tag = request.info().get('ETag')
        exp = datetime.timedelta(days=200)
    if decrypt:
        data = decryptBase64StringToStringss(data, _xxtea_key)
    common_cache.set(url, {'data': data, 'tag': tag}, expiration=exp)
    log('getData (' +
        str(int((datetime.datetime.now() - start).total_seconds() * 1000)) +
        'ms) ' + str(url), 'Debug')
    return json.loads(data)
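# Illustrative call pattern for get_data(); the URL is a placeholder. The
# function returns parsed JSON, serves the cached copy unless forceFetch is
# set, and revalidates with If-None-Match, treating a 304 as a cache hit.
def example_get_data():
    payload = get_data('https://example.com/api/items')
    return payload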