def download(self, url):
    scheme = urlparse(url)[0]
    ext = url[url.rfind("."):]
    urlpath = urlparse(url)[2]
    filename = unquote(urlpath.split("/")[-1])
    self.using_temp_file = True
    if scheme == 's3':
        client = boto3.client('s3')
        bucket_name, key = re.compile(r's3://([\w\d\-\.]+)/(.*)').search(url).groups()
        url = client.generate_presigned_url(
            'get_object',
            Params={'Bucket': bucket_name, 'Key': key.replace("+", " ")}
        )
    src = urlopen(url)
    dest_fd, self.path = tempfile.mkstemp(suffix=ext)
    try:
        with os.fdopen(dest_fd, 'wb') as dest:
            shutil.copyfileobj(src, dest)
    except:
        os.remove(self.path)
    finally:
        src.close()
    return filename

def validate_url(url, parent_url='http:'):
    """
    Validate a URL to be a string having an explicit recognized scheme.

    Arguments:
        url: string URL
        parent_url: optional string URL from which to inherit an implicit scheme.

    Returns:
        dict having:
            valid: boolean truth value.
            url: string modified URL.
    """
    if bytes == type(url):
        url = url.decode()
    parsed_url = urlparse(url)
    if 0 < len(parsed_url.path) and '/' == parsed_url.path[0]:
        url = urldefrag(urljoin(parent_url, url))[0]
    elif not parsed_url.scheme:
        parent_scheme = urlparse(parent_url).scheme or 'http'
        url = parent_scheme + ':' + url
    parsed_url = urlparse(url)
    valid = parsed_url.scheme in ('http', 'https', '') and \
        bool(parsed_url.netloc)
    return {'valid': valid, 'url': url}

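# A minimal usage sketch of validate_url above (assuming it is in scope); the URLs are
# made up, and the expected results follow from stdlib urljoin/urldefrag behaviour.
print(validate_url('/docs/intro#top', parent_url='https://example.com/a'))
# -> {'valid': True, 'url': 'https://example.com/docs/intro'}
print(validate_url('//example.com/docs', parent_url='https://example.com'))
# -> {'valid': True, 'url': 'https://example.com/docs'}
print(validate_url('not-a-url'))
# -> {'valid': False, 'url': 'http:not-a-url'}
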
def test_path_ended_with_pound(self):
    url = '//example.com:8042/over/there#name=ferret'
    a = fetch.urlparse(url).path
    b = request.urlparse(url).path
    self.assertEqual(a, b)
    a = fetch.urlparse(url).query
    b = request.urlparse(url).query
    self.assertEqual(a, b)

def test_no_netloc(self):
    url = 'www.example.com'
    a = fetch.urlparse(url).netloc
    b = request.urlparse(url).netloc
    self.assertEqual(a, b)
    a = fetch.urlparse(url).path
    b = request.urlparse(url).path
    self.assertEqual(a, b)

def test_path_with_query_and_fragment(self):
    url = '//example.com:8042/over/there?name=ferret#nose'
    a = fetch.urlparse(url).path
    b = request.urlparse(url).path
    self.assertEqual(a, b)
    a = fetch.urlparse(url).query
    b = request.urlparse(url).query
    self.assertEqual(a, b)
    a = fetch.urlparse(url).fragment
    b = request.urlparse(url).fragment
    self.assertEqual(a, b)

def gather_hyperlinks(self, url):
    """
    Opens a url and searches its content for hyperlinks.

    url = [ParseResult], the url to open and read
    returns [list(ParseResult)] list of found hyperlinks
    """
    links = []
    with request.urlopen(url.geturl()) as page:
        dom = lxml.html.fromstring(page.read().decode("utf-8"))
        self.host_requests[url.netloc] = time.time()
        for link in dom.xpath("//a/@href"):
            new_link = request.urlparse(link)
            if not new_link.scheme:
                new_link = request.urlparse(
                    url.scheme + "://" + url.netloc + new_link.path)
            if new_link.scheme == "http":
                links.append(new_link)
    return links

def upload(dist_filename, dist_type, package, config, sign=False):
    schema, netloc, url, params, query, fragments = urlparse(config.repository)
    if params or query or fragments:
        raise InvalidRepository("Incompatible url %s" % config.repository)
    if schema not in ('http', 'https'):
        raise InvalidRepository("unsupported schema " + schema)

    if sign:
        raise NotImplementedError()

    data = build_upload_post_data(dist_filename, dist_type, package)

    userpass = (config.username + ":" + config.password).encode("ascii")
    auth = six.b("Basic ") + base64.standard_b64encode(userpass)

    request = build_request(config.repository, data, auth)
    try:
        result = urlopen(request)
        status = result.getcode()
        reason = result.msg
    except HTTPError:
        e = extract_exception()
        status = e.code
        reason = e.msg

    if status != 200:
        raise PyPIError(
            "Could not upload to repository %r - error %s (server answered '%s')"
            % (config.repository, status, reason))

def __init__(self, endpoint, server=None, port=None, use_srv=True, wait=80,
             hold=4, requests=5, headers=None, PIPELINE=True, GZIP=True):
    PlugIn.__init__(self)
    self.DBG_LINE = 'bosh'
    self._exported_methods = [
        self.send,
        self.receive,
        self.disconnect,
    ]

    url = urlparse(endpoint)
    self._http_host = url.hostname
    self._http_path = url.path
    if url.port:
        self._http_port = url.port
    elif url.scheme == 'https':
        self._http_port = 443
    else:
        self._http_port = 80
    self._http_proto = url.scheme

    self._server = server
    self._port = port
    self.use_srv = use_srv
    self.Sid = None
    self._rid = 0
    self.wait = wait  # honour the caller's value instead of hard-coding 80
    self.hold = hold
    self.requests = requests
    self._pipeline = None
    self.PIPELINE = PIPELINE
    if self.PIPELINE:
        self._respobjs = []
    else:
        self._respobjs = {}
    self.headers = headers or self.default_headers
    self.GZIP = GZIP

def query_splitter(url):
    from attrdict import AttrDict
    from collections import OrderedDict

    query = attrgetter('query')(urlsplit(url))
    dic = urlparse(url)._asdict()
    _query = itemgetter('query')(dic)
    print(OrderedDict([q.split('=') for q in _query.split('&')])['text'])
    return AttrDict(OrderedDict([q.split('=') for q in query.split('&')]))

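# The same split-on-'&'-and-'=' parsing that query_splitter relies on, shown here without
# the attrdict dependency; the URL is hypothetical. Note that this naive split (unlike
# urllib.parse.parse_qs) breaks on values that themselves contain '='.
from collections import OrderedDict
from urllib.parse import urlsplit

url = 'https://example.com/search?text=hello&lang=en'
pairs = OrderedDict(q.split('=') for q in urlsplit(url).query.split('&'))
print(pairs['text'])  # -> 'hello'
print(pairs['lang'])  # -> 'en'
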
def get_nhlid_from_tablerow(tr):
    """Get player ID from href inside the row"""
    anchor_tag = tr.find(".//a[@href]")
    if anchor_tag is not None:
        href = anchor_tag.attrib['href']
        if re.match(r"^/ice/player.htm", href):
            qs = urlparse(href).query
            # parse_qs maps names to lists; guard against a missing "id" parameter
            # instead of subscripting None.
            ids = parse_qs(qs).get("id")
            if ids:
                return ids[0]

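# For reference, parse_qs maps each parameter name to a list of values, which is why the
# first element is taken above; the href below is hypothetical.
from urllib.parse import urlparse, parse_qs

href = '/ice/player.htm?id=8471675'
print(parse_qs(urlparse(href).query))           # -> {'id': ['8471675']}
print(parse_qs(urlparse(href).query)['id'][0])  # -> '8471675'
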
def httpServer(url):
    u = urlparse(url)
    host = u[1]
    page = u[2] or '/'
    s = socket.socket()
    port = 80
    s.connect((host, port))
    # Send a well-formed HTTP request line; send() requires bytes in Python 3.
    httpcmd = 'GET ' + page + ' HTTP/1.0\r\nHost: ' + host + '\r\n\r\n'
    s.sendall(httpcmd.encode('ascii'))
    s.close()

def update_img(ev):
    url = ev.get('img_url', None)
    if url:
        o = urlparse(url)
        fname = os.path.join(res_dir, os.path.basename(o.path))
        if not os.path.exists(res_dir):
            os.mkdir(res_dir)
        if not os.path.exists(fname):
            urlretrieve(url, fname)
        ev['img_cache'] = os.path.join(os.path.basename(res_dir),
                                       os.path.basename(o.path))
    return ev

def _unshorten_hrefli(self, uri):
    try:
        # Extract url from query
        parsed_uri = urlparse(uri)
        extracted_uri = parsed_uri.query
        if not extracted_uri:
            return uri, INVALID_URL_ERROR_CODE

        # Get url status code
        r = requests.head(extracted_uri, headers=self._headers,
                          timeout=self._timeout)
        return r.url, r.status_code
    except Exception as e:
        return uri, str(e)

def set_scheme(self, url):
    """
    Check whether the URL is missing a scheme and set one if needed.

    :param str url: The url address with scheme or without.
    """
    if not request.urlparse(url).scheme:
        if url.startswith('ftp.'):
            url = 'ftp://{0!s}'.format(url)
            self._options['href'] = url
        else:
            url = 'http://{0!s}'.format(url)
            self._options['href'] = url

def test_register_server(self):
    package = PackageDescription(name="foo")
    repository = "http://testpypi.python.org/pypi"
    realm = DEFAULT_REALM
    config = PyPIConfig(username="******", password="******",
                        repository=repository, realm=realm)

    auth = HTTPPasswordMgr()
    host = urlparse(config.repository)[0]
    auth.add_password(config.realm, host, config.username, config.password)

    post_data = build_post_data(package, "submit")
    code, msg = post_to_server(post_data, config, auth)
    self.assertEqual(code, 200)
    self.assertEqual(msg, "OK")

def uri_to_db(self, uri):
    parse_result = urlparse(uri)
    db = {}
    if 'postgres' in parse_result.scheme:
        db['ENGINE'] = 'django.db.backends.postgresql_psycopg2'
        db['NAME'] = os.path.split(parse_result.path)[-1]
        db['HOST'] = parse_result.hostname
        db['USER'] = parse_result.username
        db['PASSWORD'] = parse_result.password
        db['PORT'] = parse_result.port or ''
    else:
        db['ENGINE'] = 'django.db.backends.sqlite3'
        db['NAME'] = os.path.abspath(parse_result.path)
    return db

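# What urlparse yields for a typical Postgres-style URI, which is what uri_to_db above
# relies on; the credentials and database name are made up.
import os
from urllib.parse import urlparse

parsed = urlparse('postgres://alice:secret@db.example.com:5432/appdb')
print(parsed.scheme)                     # -> 'postgres'
print(parsed.hostname, parsed.port)      # -> 'db.example.com' 5432
print(parsed.username, parsed.password)  # -> 'alice' 'secret'
print(os.path.split(parsed.path)[-1])    # -> 'appdb'
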
def download_ftp_file(url, outfile):
    url_parsed = urlparse(url)
    assert url_parsed.scheme == 'ftp'
    ftp = FTP(url_parsed.hostname)
    ftp.login()
    with open(outfile, 'wb') as out_f:
        ftp.retrbinary('RETR %s' % url_parsed.path, out_f.write)

    # set the mtime to match remote ftp server
    response = ftp.sendcmd('MDTM ' + url_parsed.path)
    code, lastmodified = response.split()
    # an example: 'last-modified': '20121128150000'
    lastmodified = time.mktime(
        datetime.strptime(lastmodified, '%Y%m%d%H%M%S').timetuple())
    os.utime(outfile, (lastmodified, lastmodified))

def try_safely(remote_url):
    try:
        if urlparse(remote_url):
            try:
                log.info("Attempting to archive url.")
                archive = get(remote_url)
                log.info("Archive Created.")
                return archive
            except HTTPError:
                log.info("Cannot archive object, returning url.")
                return remote_url
    except ValueError as _e:
        log.info("No URL given")
        log.debug(_e)
    log.info("Not a valid URL")
    return remote_url

def run(self, context):
    o, a = context.get_parsed_arguments()
    if o.repository and (o.username or o.password or o.repository_url):
        raise bento.errors.UsageException(
            "Cannot specify repository and username/password/url at the same time")
    if not (o.repository or (o.username or o.password or o.repository_url)):
        # FIXME: why does distutils use DEFAULT_REPOSITORY (i.e. an url) here ?
        config = _read_pypirc(DEFAULT_REPOSITORY)
    elif o.repository:
        config = _read_pypirc(o.repository)
    else:
        config = PyPIConfig(o.username, o.password, o.repository_url)

    auth = HTTPPasswordMgr()
    host = urlparse(config.repository)[1]
    auth.add_password(config.realm, host, config.username, config.password)

    post_data = build_post_data(context.pkg, "submit")
    code, msg = post_to_server(post_data, config, auth)
    if code != 200:
        raise bento.errors.BentoError(
            "Error while submitting package metadata to server: %r" % msg)

def downloadImg(img):
    try:
        src = img.attrs['src']
        if not src.startswith("http"):
            print("Ignore img:", src)
            return
        print("Downloading image...:", src)
        resp = requests.request('get', src)
        o = urlparse(src)
        query = parse_qs(o.query)
        save_as = query.get("id")
        if save_as:
            save_as = save_as[0]
        else:
            save_as = os.path.basename(src)
        save_as = "/tmp/" + save_as
        # write the image body and close the file handle
        with open(save_as, 'wb') as f:
            f.write(resp.content)
        return save_as
    except Exception as e:
        print(e)

def facebook(request):
    if request.method == 'POST':
        json_acceptable_string = request.body.decode('utf-8').replace("'", "\"")
        json_data = json.loads(json_acceptable_string)
        try:
            fd = urlopen(json_data['photo_url'])
            image_name = urlparse(json_data['photo_url']).path.split('/')[-1]
            image_file = BytesIO(fd.read())
            try:
                user = ExtUser.objects.get(username=json_data['username'])
                Token.objects.get(user=user)
                return JsonResponse({
                    'message': 'User and token already exist.'
                })
            except ExtUser.DoesNotExist:
                json_data = check_json_data(json_data)
                new_user = ExtUser.objects.create_user(
                    username=json_data['username'],
                    email=json_data['email'],
                    location=json_data['location'],
                    orientation='S',
                    gender=json_data['gender'][0].upper(),
                    birthday=json_data['birthday'],
                    password=json_data['password'],
                )
                new_user.photo.save(image_name, File(image_file))
                new_user.save()
                token = Token.objects.create(user=new_user)
                if new_user:
                    return JsonResponse({
                        'message': 'User is created. Sign in please.',
                        'token': token.key
                    })
        except Exception as e:
            return HttpResponseBadRequest('Something went wrong.')
    return HttpResponseBadRequest('Only POST request.')

def downloadFile(url, download_dir, target_dir_name, sha1_hash=None,
                 force_download=False, user_agent=None):
    if not os.path.isdir(download_dir):
        os.mkdir(download_dir)

    p = urlparse(url)
    # replace special characters in the URL path
    url = urlunparse([p[0], p[1], quote(p[2]), p[3], p[4], p[5]])
    filename_rel = os.path.split(p.path)[1]  # get original filename
    target_filename = os.path.join(download_dir, filename_rel)

    # check SHA1 hash, if file already exists
    if os.path.exists(target_filename) and sha1_hash is not None and sha1_hash != "":
        hash_file = computeFileHash(target_filename)
        if hash_file != sha1_hash:
            log("Hash of " + target_filename + " (" + hash_file +
                ") does not match expected hash (" + sha1_hash + "); forcing download")
            force_download = True

    # download file
    if (not os.path.exists(target_filename)) or force_download:
        log("Downloading " + url + " to " + target_filename)
        if p.scheme == "ssh":
            downloadSCP(p.hostname, p.username, p.path, download_dir)
        else:
            if user_agent is not None:
                MyURLOpener.version = user_agent
                MyURLOpener().retrieve(url, target_filename)
            else:
                urlretrieve(url, target_filename)
    else:
        log("Skipping download of " + url + "; already downloaded")

    # check SHA1 hash
    if sha1_hash is not None and sha1_hash != "":
        hash_file = computeFileHash(target_filename)
        if hash_file != sha1_hash:
            raise RuntimeError("Hash of " + target_filename + " (" + hash_file +
                               ") differs from expected hash (" + sha1_hash + ")")

    return target_filename

import argparse

parser = argparse.ArgumentParser(description="Link Extractor Tool with Python")
parser.add_argument("url", help="The URL to extract links from.")
parser.add_argument("-m", "--max-urls",
                    help="Number of max URLs to crawl, default is 30.",
                    default=30, type=int)

args = parser.parse_args()
url = args.url
max_urls = args.max_urls

crawl(url, max_urls=max_urls)

print("[+] Total Internal links:", len(internal_urls))
print("[+] Total External links:", len(external_urls))
print("[+] Total URLs:", len(external_urls) + len(internal_urls))

domain_name = urlparse(url).netloc

# save the internal links to a file
with open(f"{domain_name}_internal_links.txt", "w") as f:
    for internal_link in internal_urls:
        print(internal_link.strip(), file=f)

# save the external links to a file
with open(f"{domain_name}_external_links.txt", "w") as f:
    for external_link in external_urls:
        print(external_link.strip(), file=f)

def get_domain(url):
    parsed_url = urlparse(url)
    return "{url.netloc}".format(url=parsed_url)

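# A quick usage sketch of get_domain above (hypothetical URLs): netloc is the
# host[:port] portion of the URL.
print(get_domain('https://blog.example.org/posts/1?ref=home'))  # -> 'blog.example.org'
print(get_domain('https://example.org:8080/'))                  # -> 'example.org:8080'
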
def test_no_scheme_but_port(self):
    url = '//example.com:8042'
    a = fetch.urlparse(url).scheme
    b = request.urlparse(url).scheme
    self.assertEqual(a, b)

def read_chunk(url, x0, x1, y0, y1, z0, z1, level=1, format="tiff"): """Read an arbitrary chunk of data :param url: Base URL of the precomputed data source :param x0: starting X coordinate, in the level's coordinate space :param x1: ending X coordinate (non-inclusive) :param y0: starting Y coordinate :param y1: ending Y cooridinate :param z0: starting Z coordinate :param z1: ending Z coordinate :param level: mipmap level :param format: the read format if it's a file URL. Defaults to tiff, but you can use "blockfs" :return: a Numpy array containing the data """ is_file = urlparse(url).scheme.lower() == "file" info = get_info(url) scale = info.get_scale(level) result = np.zeros((z1 - z0, y1 - y0, x1 - x0), info.data_type) shape = np.array(scale.shape) offset = np.array(scale.offset) stride = np.array(scale.chunk_sizes) end = offset + shape x0d = _chunk_start(x0, offset[0], stride[0]) x1d = _chunk_end(x1, offset[0], stride[0], end[0]) y0d = _chunk_start(y0, offset[1], stride[1]) y1d = _chunk_end(y1, offset[1], stride[1], end[1]) z0d = _chunk_start(z0, offset[2], stride[2]) z1d = _chunk_end(z1, offset[2], stride[2], end[2]) for x0c, y0c, z0c in itertools.product(range(x0d, x1d, stride[0]), range(y0d, y1d, stride[1]), range(z0d, z1d, stride[2])): x1c = min(x1d, x0c + stride[0]) y1c = min(y1d, y0c + stride[1]) z1c = min(z1d, z0c + stride[2]) chunk_url = url + "/" + scale.key + "/%d-%d_%d-%d_%d-%d" % ( x0c, x1c, y0c, y1c, z0c, z1c) if is_file: if format == "tiff": chunk_url += ".tiff" with urlopen(chunk_url) as fd: chunk = tifffile.imread(fd) elif format == "blockfs": from blockfs import Directory from .blockfs_stack import BlockfsStack directory_url = url + "/" + scale.key + "/" +\ BlockfsStack.DIRECTORY_FILENAME directory_parse = urlparse(directory_url) directory_path = os.path.join(directory_parse.netloc, unquote(directory_parse.path)) directory = Directory.open(directory_path) chunk = directory.read_block(x0c, y0c, z0c) elif format == 'ngff': group = get_ngff_group_from_url(url) key = str(int(np.log2(level))) dataset = group[key] dataset.read_only = True chunk = dataset[0, 0, z0c:z1c, y0c:y1c, x0c:x1c] elif format == 'zarr': zarr_url = url + "/" + scale.key zarr_parse = urlparse(zarr_url) zarr_path = os.path.join(zarr_parse.netloc, unquote(zarr_parse.path)) storage = zarr.NestedDirectoryStore(zarr_path) dataset = zarr.Array(storage) chunk = dataset[z0c:z1c, y0c:y1c, x0c:x1c] else: raise NotImplementedError("Can't read %s yet" % format) else: response = urlopen(chunk_url) data = response.read() chunk = np.frombuffer(data, info.data_type).reshape( (z1c - z0c, y1c - y0c, x1c - x0c)) if z0c < z0: chunk = chunk[z0 - z0c:] z0c = z0 if z1c > z1: chunk = chunk[:z1 - z0c] z1c = z1 if y0c < y0: chunk = chunk[:, y0 - y0c:] y0c = y0 if y1c > y1: chunk = chunk[:, :y1 - y0c] y1c = y1 if x0c < x0: chunk = chunk[:, :, x0 - x0c:] x0c = x0 if x1c > x1: chunk = chunk[:, :, :x1 - x0c] x1c = x1 result[z0c - z0:z0c - z0 + chunk.shape[0], y0c - y0:y0c - y0 + chunk.shape[1], x0c - x0:x0c - x0 + chunk.shape[2]] = chunk return result
# -*- coding: utf-8 -*-
import scrapy
import re
from mySpider.items import sudaMainItem
import pymysql
import copy
from urllib.request import urlparse
from urllib.parse import urljoin

url = "http://eng.suda.edu.cn/suda_news/sdyw/202002/0c620fb0-aad7-4168-a3a4-a7c07442df98.html"
# domain name
domain = urlparse(url).netloc
# scheme
scheme = urlparse(url).scheme + '://'
print(scheme + domain)


class SudaurlsSpider(scrapy.Spider):
    name = 'sudaurls'
    # allowed_domains = ['www.suda.edu.cn', 'aff.suda.edu.cn', 'eng.suda.edu.cn', 'file.suda.edu.cn',
    #                    'library.suda.edu.cn', 'mail.suda.edu.cn', 'csteaching.suda.edu.cn']
    start_urls = ['http://www.suda.edu.cn']
    basic_url = 'http://www.suda.edu.cn'
    table_count = 0
    url_pool = set()

    def parse(self, response):
        # self.count = self.count + 1
        # print('This is page number', self.count)
        print('Currently crawling page: ' + response.request.url.strip('*/'))
        print('Current URL pool size:', len(self.url_pool))
        titles = response.xpath('//a/@href').extract()

def build_url(self, local_path, **kwargs): # Make the path relative. local_path = local_path.strip('/') # We complain when we see non-normalized paths, as it is a good # indicator that unsanitized data may be getting through. # Mutating the scheme syntax to match is a little gross, but it works # for today. norm_path = os.path.normpath(local_path) if local_path.replace( '://', ':/') != norm_path or norm_path.startswith('../'): raise ValueError('path is not normalized') external = kwargs.pop('external', None) or kwargs.pop( '_external', None) scheme = kwargs.pop('scheme', None) if scheme and not external: raise ValueError('cannot specify scheme without external=True') if kwargs.get('_anchor'): raise ValueError('images have no _anchor') if kwargs.get('_method'): raise ValueError('images have no _method') # Remote URLs are encoded into the query. parsed = urlparse(local_path) if parsed.scheme or parsed.netloc: if parsed.scheme not in ALLOWED_SCHEMES: raise ValueError('scheme %r is not allowed' % parsed.scheme) kwargs['url'] = local_path local_path = '_' # Must be something. # Local ones are not. else: abs_path = self.find_img(local_path) if abs_path: kwargs['version'] = encode_int(int(os.path.getmtime(abs_path))) # Prep the cache flag, which defaults to True. cache = kwargs.pop('cache', True) if not cache: kwargs['cache'] = '' # Prep the enlarge flag, which defaults to False. enlarge = kwargs.pop('enlarge', False) if enlarge: kwargs['enlarge'] = '1' # Prep the transform, which is a set of delimited strings. transform = kwargs.get('transform') if transform: if isinstance(transform, basestring): transform = re.split(r'[,;:_ ]', transform) # We replace delimiters with underscores, and percent with p, since # these won't need escaping. kwargs['transform'] = '_'.join( str(x).replace('%', 'p') for x in transform) # Sign the query. public_kwargs = ((LONG_TO_SHORT.get(k, k), v) for k, v in kwargs.items() if v is not None and not k.startswith('_')) query = urlencode(sorted(public_kwargs), True) signer = Signer(current_app.secret_key) sig = signer.get_signature('%s?%s' % (local_path, query)).decode() url = '%s/%s?%s&s=%s' % ( current_app.config['IMAGES_URL'], urlquote(local_path), query, sig, ) if external: url = '%s://%s%s/%s' % (scheme or request.scheme, request.host, request.script_root, url.lstrip('/')) return url
def _filter(elem):
    parsed = urlparse(elem['href'])
    return bool(parsed.netloc) and bool(
        parsed.scheme) and "rust-lang.org" not in parsed.netloc

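# With the filter above in scope (hrefs made up): links without a scheme or host, and
# links into rust-lang.org, are rejected.
print(_filter({'href': 'https://crates.io/crates/serde'}))   # -> True
print(_filter({'href': 'https://doc.rust-lang.org/book/'}))  # -> False (rust-lang.org host)
print(_filter({'href': '/relative/path'}))                   # -> False (no scheme or netloc)
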
def test_content(self):
    self.assertGreater(len(self.json_response), 0)
    parsed_image_url = url_request.urlparse(self.json_response[0])
    self.assertEqual(parsed_image_url.scheme, 'http')
    self.assertEqual(parsed_image_url.netloc, '127.0.0.1:8000')
    self.assertRegex(parsed_image_url.path, r'^/\d+$')

def parse(self) -> urlparse:
    """Returns `urllib.request.urlparse` result for given URL"""
    return urlparse(self.url)

def __init__(self, path=None, url=None, perform_init=True): self.stream = None file_scheme = "file:" self.using_temp_file = False if url is not None: url = str(url) if url.lower().startswith(file_scheme): url = url2pathname(url[len(file_scheme):]) path = url self.path = path if path is None: if url.lower().startswith("omero:"): while True: # # We keep trying to contact the OMERO server via the # login dialog until the user gives up or we connect. # try: self.rdr = get_omero_reader() self.path = url if perform_init: self.init_reader() return except jutil.JavaException as e: je = e.throwable if jutil.is_instance_of( je, "loci/formats/FormatException"): je = jutil.call(je, "getCause", "()Ljava/lang/Throwable;") if jutil.is_instance_of( je, "Glacier2/PermissionDeniedException"): omero_logout() omero_login() else: logger.warn(e.message) for line in traceback.format_exc().split("\n"): logger.warn(line) if jutil.is_instance_of( je, "java/io/FileNotFoundException"): raise IOError( errno.ENOENT, "The file, \"%s\", does not exist." % path, path) e2 = IOError( errno.EINVAL, "Could not load the file as an image (see log for details)", path.encode('utf-8')) raise e2 else: # # Other URLS, copy them to a tempfile location # ext = url[url.rfind("."):] src = urlopen(url) dest_fd, self.path = tempfile.mkstemp(suffix=ext) try: dest = os.fdopen(dest_fd, 'wb') shutil.copyfileobj(src, dest) except: src.close() dest.close() os.remove(self.path) self.using_temp_file = True src.close() dest.close() urlpath = urlparse(url)[2] filename = unquote(urlpath.split("/")[-1]) else: if sys.platform.startswith("win"): self.path = self.path.replace("/", os.path.sep) filename = os.path.split(path)[1] if not os.path.isfile(self.path): raise IOError(errno.ENOENT, "The file, \"%s\", does not exist." % path, path) self.stream = jutil.make_instance( 'loci/common/RandomAccessInputStream', '(Ljava/lang/String;)V', self.path) self.rdr = None class_list = get_class_list() find_rdr_script = """ var classes = class_list.getClasses(); var rdr = null; var lc_filename = java.lang.String(filename.toLowerCase()); for (pass=0; pass < 3; pass++) { for (class_idx in classes) { var maybe_rdr = classes[class_idx].newInstance(); if (pass == 0) { if (maybe_rdr.isThisType(filename, false)) { rdr = maybe_rdr; break; } continue; } else if (pass == 1) { var suffixes = maybe_rdr.getSuffixes(); var suffix_found = false; for (suffix_idx in suffixes) { var suffix = java.lang.String(suffixes[suffix_idx]); suffix = suffix.toLowerCase(); if (lc_filename.endsWith(suffix)) { suffix_found = true; break; } } if (! suffix_found) continue; } if (maybe_rdr.isThisType(stream)) { rdr = maybe_rdr; break; } } if (rdr) break; } rdr; """ IFormatReader = make_iformat_reader_class() jrdr = jutil.run_script( find_rdr_script, dict(class_list=class_list, filename=filename, stream=self.stream)) if jrdr is None: raise ValueError("Could not find a Bio-Formats reader for %s", self.path) self.rdr = IFormatReader() self.rdr.o = jrdr if perform_init: self.init_reader()
def download_file(url, data_dir, resume=True, overwrite=False, verbose=0): """ Load requested file if needed or requested. Parameters ---------- url: str the url of the file to be downloaded. data_dir: str path of the data directory. resume: bool (optional, default True) if True, try to resume partially downloaded files overwrite: bool (optional, default False) if True and file already exists, delete it. verbose: int (optional, default 0) control the verbosity level. Returns ------- download_fname: str absolute path to the downloaded file. Note: If, for any reason, the download procedure fails, all downloaded files are removed. """ # Create the download directory if necessary if not os.path.exists(data_dir): os.makedirs(data_dir) # Determine filename using URL parse = urlparse(url) fname = os.path.basename(parse.path) # Generate the download file name download_fname = os.path.join(data_dir, fname) # Generate a temporary file for the download temp_fname = os.path.join(data_dir, fname + ".part") # If the file is already created remove it if the overwrite option is set # or return the file if os.path.exists(download_fname): if overwrite: os.remove(download_fname) else: return download_fname # If the temporary file is already created remove it if the overwrite # option is set if os.path.exists(temp_fname): if overwrite: os.remove(temp_fname) # Start a timer to evaluate the download time t0 = time.time() # Test if the dataset has been released try: urlopen(url) except: raise ValueError( "The '{0}' dataset has not been released yet.".format(url)) # Start downloading dataset local_file = None bytes_so_far = 0 try: # Prepare the download if verbose > 0: print("Downloading data from {0}...".format(url)) # Case 1: continue the downloading from an existing temporary file if resume and os.path.exists(temp_fname): url_opener = ResumeURLOpener() # Download has been interrupted, we try to resume it. local_file_size = os.path.getsize(temp_fname) # If the file exists, then only download the remainder url_opener.addheader("Range", "bytes={0}-".format(local_file_size)) try: data = url_opener.open(url) except HTTPError: # There is a problem that may be due to resuming # Restart the downloading from scratch return download_file(url, data_dir, resume=False, overwrite=False) local_file = open(temp_fname, "ab") bytes_so_far = local_file_size # Case 2: just download the file else: data = urlopen(url) local_file = open(temp_fname, "wb") # Get the total file size try: total_size = data.info().get_all("Content-Length")[0].strip() total_size = int(total_size) + bytes_so_far except Exception as e: if verbose > 0: print("Total size could not be determined.") total_size = "?" # Download data chunk_size = 8192 while True: # Read chunk chunk = data.read(chunk_size) # Stoping criterion if not chunk: break # Write to local file bytes_so_far += len(chunk) local_file.write(chunk) # Write report status and print a progress bar if isinstance(total_size, int): ratio = float(bytes_so_far) / float(total_size) else: ratio = 0 progress_bar(ratio, title=os.path.basename(url)) print() # Temporary file must be closed prior to the move if not local_file.closed: local_file.close() shutil.move(temp_fname, download_fname) # Get process duration and print it dt = time.time() - t0 exit_message = ("Download was done in {0} minutes, {1: .2f} " "seconds").format(int(numpy.floor(dt / 60)), dt % 60) if verbose > 0: print(exit_message) except HTTPError as e: raise Exception("{0}\nError while downloading file '{1}'. 
" "Dataset download aborted.".format(e, fname)) finally: # Temporary file must be closed if local_file is not None: if not local_file.closed: local_file.close() return download_fname
def test_path(self):
    url = '//example.com:8042/over/there'
    a = fetch.urlparse(url).path
    b = request.urlparse(url).path
    self.assertEqual(a, b)

def main(argv): ssl._create_default_https_context = ssl._create_unverified_context help_message = 'A Link Scrapper in Python \n\n Usage: python scrapper.py [option] [argument] \n\n -u, --url = url to crawl \n -c, --crawl [on/off] = turn on or off crawl, default=on \n -f, --file [filepath] = a file path to parse, crawling deactivated in this option \n -l --lfiles = list of files to parse (each line of the file must be a different file) \n -w --lwebsite = list of websites to check (each line of the file must be a different website), crawling deactivated in this option. If localhost, the crawling is deactivated in this option\n -S --stdin [option] for accept stdin input. Available options are: "f" to pipe the content of an html file, "p" for a list of files, with "w" a list of websites, example: "cat listofwebsites.txt | python scrapper.py -S w"' badargument_message_url = "The only option to be use with -u, --url is --crawl, -c" badargument_message_lwebsite = "The only option -l, --lwebsite is provide a list of websites, shouldnt be used with other parameter" badargument_message_stdin = "Stdin cannot be used with this options" try: opts, args = getopt.getopt(argv, "h:u:c:f:w:S:l:", ['help', 'url=', 'crawl=', 'file=', 'lfiles=', 'stdin=', 'lwebsite=']) except getopt.GetoptError: printandexit(message=help_message) port: int = 3000 crawl: bool = 0 lwebsite: bool = 0 lfiles: bool = 0 urlselected: bool = 0 fselect: bool = 0 stdin: bool = 0 given_url: str = "http://localhost" for opt, arg in opts: if opt == '-h': # help message printandexit(message=help_message) elif opt in ("-c", "--crawl"): # activate/deactivate crawling if arg == "on": crawl = 1 if arg == "off": crawl = 0 elif opt in ("-f", "--file"): # File path to parse fselect = 1 crawl = 0 # There is no crawling here, since there is no domain file_path = arg fname = ntpath.basename(arg) try: geturls(url=file_path, domain_name="", crawl=crawl, is_file=1) except IOError: print("Please choose a valid file path") sys.exit() elif opt in ("-u", "--url"): # url to crawl, decide whether it's a normal website or localhost urlselected = 1 given_url = arg domain_name = urlparse(given_url).netloc elif opt in ("-S", "--Stdin"): stdin = 1 option = "stdin_file" if arg == "f": option = "stdin_file" if arg == "w": option = "lsites" if arg == "p": option = "path" elif opt in ("-w", "--lwebsite"): input_file = arg lwebsite = 1 elif opt in ("-l", "--lfiles"): input_file = arg lfiles = 1 else: print("Parameter not recognized: %s !\n" % opt) print(help_message) #Options validations if (urlselected == 1 and fselect == 1) and (urlselected == 1 and lwebsite == 1) and (urlselected == 1 and stdin == 1): printandexit(message=badargument_message_url) if (lwebsite == 1 and stdin == 1) and (lwebsite == 1 and fselect == 1): printandexit(message=badargument_message_lwebsite) if (lfiles == 1 and stdin == 1) and (stdin == 1 and fselect == 1): printandexit(message=badargument_message_stdin) #Process the selected options if lwebsite == 1: process_lwebsites(input_file=input_file, crawl=crawl) if lfiles == 1: crawl = 0 process_lfiles(input_file=input_file, crawl=crawl, is_file=1) if fselect == 1: crawl = 0 if stdin == 1: process_stdin(stdin=sys.stdin, option=option, crawl=crawl) if urlselected == 1: if "localhost" not in given_url: geturls(url=given_url, domain_name=domain_name, crawl=crawl, is_file=0) else: # For localhost, it needs to pass the port. 
Eg: time python3 scrapper.py -u http://localhost:3000 crawl = 0 geturls(url=given_url, domain_name=given_url, crawl=crawl, is_file=0)
def retrieve_url(url, filename, *, logger=None, uncompress=False, transmit_compressed=True, update=False, check_certificates=True, name=None, timeout=60): """Return requested URL in filename :param url: the URL to retrive :param filename: where to save the contents of the URL :param name: string to use to identify the data in status messages :param logger: logger instance to use for status and warning messages :param uncompress: if true, then uncompress the content :param update: if true, then existing file is okay if newer than web version :param check_certificates: if true :returns: None if an existing file, otherwise the content type :raises urllib.request.URLError or EOFError: if unsuccessful If 'update' and the filename already exists, fetch the HTTP headers for the URL and check the last modified date to see if there is a newer version or not. If there isn't a newer version, return the filename. If there is a newer version, or if the filename does not exist, save the URL in the filename, and set the file's modified date to the HTTP last modified date, and return the filename. """ import os import time from urllib.request import Request, urlopen, urlparse, URLError from chimerax import app_dirs from .errors import UserError if name is None: name = os.path.basename(filename) hostname = urlparse(url).hostname if _timeout_cache: if hostname in _timeout_cache: cur_time = time.time() prev_time = _timeout_cache[hostname] if prev_time + TIMEOUT_CACHE_VALID < cur_time: del _timeout_cache[hostname] else: raise UserError(f'{hostname} failed to respond') headers = {"User-Agent": html_user_agent(app_dirs)} request = Request(url, unverifiable=True, headers=headers) last_modified = None if update and os.path.exists(filename): if logger: logger.status('check for newer version of %s' % name, secondary=True) info = os.stat(filename) request.method = 'HEAD' try: with urlopen(request, timeout=timeout) as response: d = response.headers['Last-modified'] last_modified = _convert_to_timestamp(d) if last_modified is None and logger: logger.warning('Invalid date "%s" for %s' % (d, request.full_url)) if last_modified is None or last_modified <= info.st_mtime: return except URLError: pass request.method = 'GET' try: request.headers['Accept-encoding'] = 'gzip, identity' if transmit_compressed else 'identity' if check_certificates: ssl_context = None else: import ssl ssl_context = ssl.create_default_context() ssl_context.check_hostname = False ssl_context.verify_mode = ssl.CERT_NONE with urlopen(request, timeout=timeout, context=ssl_context) as response: compressed = uncompress ct = response.headers['Content-Type'] if not compressed: ce = response.headers['Content-Encoding'] if ce: compressed = ce.casefold() in ('gzip', 'x-gzip') if ct: compressed = compressed or ct.casefold() in ( 'application/gzip', 'application/x-gzip') ct = 'application/octet-stream' if logger: logger.info('Fetching%s %s from %s' % ( " compressed" if compressed else "", name, request.get_full_url())) d = response.headers['Last-modified'] last_modified = _convert_to_timestamp(d) content_length = response.headers['Content-Length'] if content_length is not None: content_length = int(content_length) with open(filename, 'wb') as f: if compressed: read_and_uncompress(response, f, name, content_length, logger) else: read_and_report_progress(response, f, name, content_length, logger) if last_modified is not None: os.utime(filename, (last_modified, last_modified)) if logger: logger.status('%s fetched' % name, secondary=True, blank_after=5) return ct 
except Exception as err: if os.path.exists(filename): os.remove(filename) if logger: logger.status('Error fetching %s' % name, secondary=True, blank_after=15) if isinstance(err, URLError) and isinstance(err.reason, TimeoutError): _timeout_cache[hostname] = time.time() raise UserError(f'{hostname} failed to respond') raise
def test_netloc_case(self):
    url = 'foo://EXAMPLE.com'
    a = fetch.urlparse(url).netloc
    b = request.urlparse(url).netloc
    self.assertEqual(a, b)

def handle_request(self, path): # Verify the signature. query = dict(request.args.items()) old_sig = query.pop('s', None) if not old_sig: abort(404) signer = Signer(current_app.secret_key) new_sig = signer.get_signature( '%s?%s' % (path, urlencode(sorted(query.items()), True))).decode() if not constant_time_compare(old_sig.encode(), new_sig.encode()): abort(404) # Expand kwargs. query = dict((SHORT_TO_LONG.get(k, k), v) for k, v in query.items()) remote_url = query.get('url') if remote_url: # This is redundant for newly built URLs, but not for those which # have already been generated and cached. parsed = urlparse(remote_url) if parsed.scheme not in ALLOWED_SCHEMES: abort(404) # Download the remote file. makedirs(current_app.config['IMAGES_CACHE']) path = os.path.join( current_app.config['IMAGES_CACHE'], hashlib.md5(remote_url).hexdigest() + os.path.splitext(parsed.path)[1]) if not os.path.exists(path): log.info('downloading %s' % remote_url) tmp_path = path + '.tmp-' + str(os.getpid()) try: remote_file = urlopen(remote_url).read() except HTTPError as e: # abort with remote error code (403 or 404 most times) # log.debug('HTTP Error: %r' % e) abort(e.code) else: fh = open(tmp_path, 'wb') fh.write(remote_file) fh.close() call(['mv', tmp_path, path]) else: path = self.find_img(path) if not path: abort(404) # Not found. raw_mtime = os.path.getmtime(path) mtime = datetime.datetime.utcfromtimestamp(raw_mtime).replace( microsecond=0) # log.debug('last_modified: %r' % mtime) # log.debug('if_modified_since: %r' % request.if_modified_since) if request.if_modified_since and request.if_modified_since >= mtime: return '', 304 mode = query.get('mode') transform = query.get('transform') transform = re.split(r'[;,_/ ]', transform) if transform else None background = query.get('background') width = query.get('width') width = int(width) if width else None height = query.get('height') height = int(height) if height else None quality = query.get('quality') quality = int(quality) if quality else 75 format = (query.get('format', '') or os.path.splitext(path)[1][1:] or 'jpeg').lower() format = {'jpg': 'jpeg'}.get(format, format) has_version = 'version' in query use_cache = query.get('cache', True) enlarge = query.get('enlarge', False) sharpen = query.get('sharpen') sharpen = re.split(r'[;,_/ ]', sharpen) if sharpen else None if use_cache: # The parts in this initial list were parameters cached in version 1. # In order to avoid regenerating all images when a new feature is # added, we append (feature_name, value) tuples to the end. cache_key_parts = [ path, mode, width, height, quality, format, background ] if transform: cache_key_parts.append(('transform', transform)) if sharpen: cache_key_parts.append(('sharpen', sharpen)) if enlarge: cache_key_parts.append(('enlarge', enlarge)) cache_key = hashlib.md5(repr( tuple(cache_key_parts)).encode()).hexdigest() cache_dir = os.path.join(current_app.config['IMAGES_CACHE'], cache_key[:2]) cache_path = os.path.join(cache_dir, cache_key + '.' 
+ format) cache_mtime = os.path.getmtime(cache_path) if os.path.exists( cache_path) else None mimetype = 'image/%s' % format cache_timeout = 31536000 if has_version else current_app.config[ 'IMAGES_MAX_AGE'] if not use_cache or not cache_mtime or cache_mtime < raw_mtime: log.info('resizing %r for %s' % (path, query)) image = Image.open(path) image = self.resize( image, background=background, enlarge=enlarge, height=height, mode=mode, transform=transform, width=width, ) image = self.post_process( image, sharpen=sharpen, ) if not use_cache: fh = StringIO() image.save(fh, format, quality=quality) return fh.getvalue(), 200, [ ('Content-Type', mimetype), ('Cache-Control', str(cache_timeout)), ] makedirs(cache_dir) cache_file = open(cache_path, 'wb') image.save(cache_file, format, quality=quality) cache_file.close() return send_file(cache_path, mimetype=mimetype, cache_timeout=cache_timeout)
def filterinput(input):
    x = urlparse(input)
    return x.hostname

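# Unlike netloc, the hostname attribute lower-cases the host and strips any port or
# credentials; the URL below is made up.
from urllib.parse import urlparse

print(urlparse('https://user@WWW.Example.COM:8443/path').hostname)  # -> 'www.example.com'
print(urlparse('https://user@WWW.Example.COM:8443/path').netloc)    # -> 'user@WWW.Example.COM:8443'
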
def netloc(self):
    return '{0.scheme}://{0.netloc}'.format(urlparse(self.orig_url))

def getcms(keyword): # , language, resPeople result = keyWordsCollection.find_one({"originKey": keyword}) language = result["language"] resPeople = result["resPeople"] part = result["part"] station = result["station"] word = words.get(language) if not word: logging.info("没有适配语言:{}".format(language)) # 改变关键词获取状态 updateStatusKeyWord(keyword, part) return url = "http://api.serpprovider.com/5bfdf4cd7d33d1d77b9875d1/google/en-us/{}/{}".format( word, keyword) logging.info("请求数据,关键字:{},url:{}".format(keyword, url)) html = sendRequest(url) # 请求 try: datas = json.loads(html) except Exception as e: return reslist = jsonpath.jsonpath(datas, "$..res") if reslist: reslist = reslist[0] else: logging.error("google搜索后没有数据:{}".format(url)) # 改变关键词获取状态 updateStatusKeyWord(keyword, part) return if not reslist: # 改变关键词获取状态 updateStatusKeyWord(keyword, part) logging.error("google搜索后没有数据:{}".format(url)) return for data in reslist: # 协议 scheme = urlparse(data['url']).scheme # 域名 domain = urlparse(data['url']).netloc if not scheme or not domain: continue link = scheme + '://' + domain # 拼接链接 # 判断是否在缓存中 if part == "GB": domainList = domainListGB else: domainList = domainListCL if domain in domainList: logging.warn("该域名已经获取,存在缓存中,domain:{}".format(domain)) continue # 判断是否在数据库中 result = googleUrlCollection.find_one({"domain": domain}) if result: logging.warn("该域名已经获取,存在数据库中中,domain:{}".format(domain)) if result["part"] != part: webresultList = list(webResourcescollection.find({"url": link})) for result in webresultList: if part == "clothes": result["_id"] = result["_id"].replace( "_GB_", "_clothes_") else: result["_id"] = result["_id"].replace( "_clothes_", "_GB_") try: result["resPeople"] = resPeople result["part"] = part result["station"] = station mongoResult = webResourcescollection.find_one( {"_id": result["_id"]}) if not mongoResult: webResourcescollection.insert(result) logging.info("加入成功:{},_id:{}".format( part, result["_id"])) except Exception as e: logging.error(e) continue # 查询是否在GB中 title = data['title'] # 获取标题 describition = data['desc'] # 获取描述 domainList.append(domain) sourceUrl = data["url"] insertItem(domain, link, sourceUrl, scheme, keyword, language, resPeople, title, describition, word, part, station) # 改变关键词获取状态 updateStatusKeyWord(keyword, part)
def main(argv): global BASE_DIR, SRC_DIR, ARCHIVE_DIR, DEBUG_OUTPUT, FALLBACK_URL, USE_TAR, USE_UNZIP global TOOL_COMMAND_PYTHON, TOOL_COMMAND_GIT, TOOL_COMMAND_HG, TOOL_COMMAND_SVN, TOOL_COMMAND_PATCH, TOOL_COMMAND_TAR, TOOL_COMMAND_UNZIP try: opts, args = getopt.getopt( argv, "ln:N:cCb:h", ["list", "name=", "name-file=", "clean", "clean-all", "base-dir", "bootstrap-file=", "local-bootstrap-file=", "use-tar", "use-unzip", "repo-snapshots", "fallback-url=", "force-fallback", "debug-output", "help"]) except getopt.GetoptError: printOptions() return 0 opt_names = [] name_files = [] opt_clean = False opt_clean_archives = False list_libraries = False default_bootstrap_filename = "bootstrap.json" bootstrap_filename = os.path.abspath(os.path.join(BASE_DIR, default_bootstrap_filename)) local_bootstrap_filename = "" create_repo_snapshots = False force_fallback = False base_dir_path = "" for opt, arg in opts: if opt in ("-h", "--help"): printOptions() return 0 if opt in ("-l", "--list"): list_libraries = True if opt in ("-n", "--name"): opt_names.append(arg) if opt in ("-N", "--name-file"): name_files.append(os.path.abspath(arg)) if opt in ("-c", "--clean"): opt_clean = True if opt in ("-C", "--clean-all"): opt_clean = True opt_clean_archives = True if opt in ("-b", "--base-dir"): base_dir_path = os.path.abspath(arg) BASE_DIR = base_dir_path SRC_DIR = os.path.join(BASE_DIR, SRC_DIR_BASE) ARCHIVE_DIR = os.path.join(BASE_DIR, ARCHIVE_DIR_BASE) bootstrap_filename = os.path.join(BASE_DIR, default_bootstrap_filename) log("Using " + arg + " as base directory") if opt in ("--bootstrap-file",): bootstrap_filename = os.path.abspath(arg) log("Using main bootstrap file " + bootstrap_filename) if opt in ("--local-bootstrap-file",): local_bootstrap_filename = os.path.abspath(arg) log("Using local bootstrap file " + local_bootstrap_filename) if opt in ("--use-tar",): USE_TAR = True if opt in ("--use-unzip",): USE_UNZIP = True if opt in ("--repo-snapshots",): create_repo_snapshots = True log("Will create repository snapshots") if opt in ("--fallback-url",): FALLBACK_URL = arg if opt in ("--force-fallback",): force_fallback = True log("Using fallback URL to fetch all libraries") if opt in ("--debug-output",): DEBUG_OUTPUT = True if platform.system() is not "Windows": # Unfortunately some IDEs do not have a proper PATH environment variable set, # so we search manually for the required tools in some obvious locations. 
paths_to_search = os.environ["PATH"].split(":") + ["/usr/local/bin", "/opt/local/bin", "/usr/bin"] TOOL_COMMAND_PYTHON = findToolCommand(TOOL_COMMAND_PYTHON, paths_to_search, required = True) TOOL_COMMAND_GIT = findToolCommand(TOOL_COMMAND_GIT, paths_to_search, required = True) TOOL_COMMAND_HG = findToolCommand(TOOL_COMMAND_HG, paths_to_search, required = True) TOOL_COMMAND_SVN = findToolCommand(TOOL_COMMAND_SVN, paths_to_search, required = True) TOOL_COMMAND_PATCH = findToolCommand(TOOL_COMMAND_PATCH, paths_to_search, required = True) TOOL_COMMAND_TAR = findToolCommand(TOOL_COMMAND_TAR, paths_to_search, required = USE_TAR) TOOL_COMMAND_UNZIP = findToolCommand(TOOL_COMMAND_UNZIP, paths_to_search, required = USE_UNZIP) if base_dir_path: os.chdir(base_dir_path) if name_files: for name_file in name_files: try: with open(name_file) as f: opt_names_local = [l for l in (line.strip() for line in f) if l] opt_names_local = [l for l in opt_names_local if l[0] is not '#'] opt_names += opt_names_local dlog("Name file contains: " + ", ".join(opt_names_local)) except: log("ERROR: cannot parse name file " + name_file) return -1 if force_fallback and not FALLBACK_URL: log("Error: cannot force usage of the fallback location without specifying a fallback URL") return -1; state_filename = os.path.join(os.path.dirname(os.path.splitext(bootstrap_filename)[0]), \ "." + os.path.basename(os.path.splitext(bootstrap_filename)[0])) \ + os.path.splitext(bootstrap_filename)[1] dlog("bootstrap_filename = " + bootstrap_filename) dlog("state_filename = " + state_filename) # read canonical libraries data data = readJSONData(bootstrap_filename) if data is None: return -1; # some sanity checking for library in data: if library.get('name', None) is None: log("ERROR: Invalid schema: library object does not have a 'name'") return -1 # read local libraries data, if available local_data = None if local_bootstrap_filename: local_data = readJSONData(local_bootstrap_filename) if local_data is None: return -1; # some sanity checking for local_library in local_data: if local_library.get('name', None) is None: log("ERROR: Invalid schema: local library object does not have a 'name'") return -1 # merge canonical and local library data, if applicable; local libraries take precedence if local_data is not None: for local_library in local_data: local_name = local_library.get('name', None) found_canonical_library = False for n, library in enumerate(data): name = library.get('name', None) if local_name == name: data[n] = local_library # overwrite library found_canonical_library = True if not found_canonical_library: data.append(local_library) if list_libraries: listLibraries(data) return 0 sdata = [] if os.path.exists(state_filename): sdata = readJSONData(state_filename) # create source directory if not os.path.isdir(SRC_DIR): log("Creating directory " + SRC_DIR) os.mkdir(SRC_DIR) # create archive files directory if not os.path.isdir(ARCHIVE_DIR): log("Creating directory " + ARCHIVE_DIR) os.mkdir(ARCHIVE_DIR) failed_libraries = [] for library in data: name = library.get('name', None) source = library.get('source', None) post = library.get('postprocess', None) if (opt_names) and (not name in opt_names): continue lib_dir = os.path.join(SRC_DIR, name) dlog("********** LIBRARY " + name + " **********") dlog("lib_dir = " + lib_dir + ")") # compare against cached state cached_state_ok = False if not opt_clean: for slibrary in sdata: sname = slibrary.get('name', None) if sname is not None and sname == name and slibrary == library and 
os.path.exists(lib_dir): cached_state_ok = True break if cached_state_ok: log("Cached state for " + name + " equals expected state; skipping library") continue else: # remove cached state for library sdata[:] = [s for s in sdata if not (lambda s, name : s.get('name', None) is not None and s['name'] == name)(s, name)] # create library directory, if necessary if opt_clean: log("Cleaning directory for " + name) if os.path.exists(lib_dir): shutil.rmtree(lib_dir) if not os.path.exists(lib_dir): os.mkdir(lib_dir) try: # download source if source is not None: if 'type' not in source: log("ERROR: Invalid schema for " + name + ": 'source' object must have a 'type'") return -1 if 'url' not in source: log("ERROR: Invalid schema for " + name + ": 'source' object must have a 'url'") return -1 src_type = source['type'] src_url = source['url'] if src_type == "sourcefile": sha1 = source.get('sha1', None) user_agent = source.get('user-agent', None) try: if force_fallback: raise RuntimeError downloadFile(src_url, ARCHIVE_DIR, name, sha1, force_download = opt_clean_archives, user_agent = user_agent) filename_rel = os.path.basename(src_url) shutil.copyfile( os.path.join(ARCHIVE_DIR, filename_rel), os.path.join(lib_dir, filename_rel) ) except: if FALLBACK_URL: if not force_fallback: log("WARNING: Downloading of file " + src_url + " failed; trying fallback") p = urlparse(src_url) filename_rel = os.path.split(p.path)[1] # get original filename p = urlparse(FALLBACK_URL) fallback_src_url = urlunparse([p[0], p[1], p[2] + "/" + ARCHIVE_DIR_BASE + "/" + filename_rel, p[3], p[4], p[5]]) downloadFile(fallback_src_url, ARCHIVE_DIR, name, sha1, force_download = True) shutil.copyfile( os.path.join(ARCHIVE_DIR, filename_rel), os.path.join(lib_dir, filename_rel) ) else: shutil.rmtree(lib_dir) raise elif src_type == "archive": sha1 = source.get('sha1', None) user_agent = source.get('user-agent', None) try: if force_fallback: raise RuntimeError downloadAndExtractFile(src_url, ARCHIVE_DIR, name, sha1, force_download = opt_clean_archives, user_agent = user_agent) except: if FALLBACK_URL: if not force_fallback: log("WARNING: Downloading of file " + src_url + " failed; trying fallback") p = urlparse(src_url) filename_rel = os.path.split(p.path)[1] # get original filename p = urlparse(FALLBACK_URL) fallback_src_url = urlunparse([p[0], p[1], p[2] + "/" + ARCHIVE_DIR_BASE + "/" + filename_rel, p[3], p[4], p[5]]) downloadAndExtractFile(fallback_src_url, ARCHIVE_DIR, name, sha1, force_download = True) else: raise else: revision = source.get('revision', None) archive_name = name + ".tar.gz" # for reading or writing of snapshot archives if revision is not None: archive_name = name + "_" + revision + ".tar.gz" try: if force_fallback: raise RuntimeError cloneRepository(src_type, src_url, name, revision) if create_repo_snapshots: log("Creating snapshot of library repository " + name) repo_dir = os.path.join(SRC_DIR, name) archive_filename = os.path.join(SNAPSHOT_DIR, archive_name) dlog("Snapshot will be saved as " + archive_filename) createArchiveFromDirectory(repo_dir, archive_filename, revision is None) except: if FALLBACK_URL: if not force_fallback: log("WARNING: Cloning of repository " + src_url + " failed; trying fallback") # copy archived snapshot from fallback location p = urlparse(FALLBACK_URL) fallback_src_url = urlunparse([p[0], p[1], p[2] + "/" + SNAPSHOT_DIR_BASE + "/" + archive_name, p[3], p[4], p[5]]) dlog("Looking for snapshot " + fallback_src_url + " of library repository " + name) # create snapshots files directory 
downloadAndExtractFile(fallback_src_url, SNAPSHOT_DIR, name, force_download = True) # reset repository state to particular revision (only using local operations inside the function) cloneRepository(src_type, src_url, name, revision, True) else: raise else: # set up clean directory for potential patch application shutil.rmtree(lib_dir) os.mkdir(lib_dir) # post-processing if post is not None: if 'type' not in post: log("ERROR: Invalid schema for " + name + ": 'postprocess' object must have a 'type'") return -1 if 'file' not in post: log("ERROR: Invalid schema for " + name + ": 'postprocess' object must have a 'file'") return -1 post_type = post['type'] post_file = post['file'] if post_type == "patch": applyPatchFile(post_file, name, post.get('pnum', DEFAULT_PNUM)) elif post_type == "script": runPythonScript(post_file) else: log("ERROR: Unknown post-processing type '" + post_type + "' for " + name) return -1 # add to cached state sdata.append(library) # write out cached state writeJSONData(sdata, state_filename) except: log("ERROR: Failure to bootstrap library " + name + " (reason: " + str(sys.exc_info()[0]) + ")") traceback.print_exc() failed_libraries.append(name) if failed_libraries: log("***************************************") log("FAILURE to bootstrap the following libraries:") log(', '.join(failed_libraries)) log("***************************************") return -1 log("Finished") return 0
def is_valid(url):
    # Checks whether `url` is a valid URL.
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

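# How the scheme-plus-netloc check behaves on a few hypothetical inputs.
from urllib.parse import urlparse

for candidate in ('https://example.com/page', 'example.com/page', 'mailto:alice@example.com'):
    parsed = urlparse(candidate)
    print(candidate, bool(parsed.netloc) and bool(parsed.scheme))
# https://example.com/page True
# example.com/page False          (no scheme, so the host ends up in .path)
# mailto:alice@example.com False  (scheme but no netloc)
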
def valid_url(url):
    parsed_url = urlparse(url)
    return bool(parsed_url.scheme)

def test_netloc(self):
    url = 'foo://example.com'
    a = fetch.urlparse(url).netloc
    b = request.urlparse(url).netloc
    self.assertEqual(a, b)

def is_valid(url):
    '''
    Check whether 'url' is a valid URL.
    '''
    if 'None' in url:
        return False
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

def modify_urls(self, url_list):
    return [
        self._base_url + urlparse(url).query.split("=")[1].rstrip() + "/"
        for url in url_list
    ]

def sanitize_url(url):
    p = urlparse(url)
    # quote special characters in the path
    url = urlunparse([p[0], p[1], quote(p[2]), p[3], p[4], p[5]])
    return url

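# quote() percent-encodes everything in the path except '/' by default, so a made-up URL
# containing spaces comes back cleaned while the scheme, host and query are untouched.
from urllib.parse import urlparse, urlunparse, quote

p = urlparse('https://example.com/files/annual report 2020.pdf?dl=1')
print(urlunparse([p[0], p[1], quote(p[2]), p[3], p[4], p[5]]))
# -> https://example.com/files/annual%20report%202020.pdf?dl=1
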
if __name__ == '__main__':
    import re
    from urllib.request import urlopen
    from urllib.request import urlparse

    # Download the page and decode it from bytes to a string
    text = urlopen(input()).read().decode()
    urls = set()
    # Search the text for links
    for url in re.findall(r'''<a.+href=["'](.+?)['"].*>''', text, flags=re.MULTILINE):
        # Split the url string into its components
        result = urlparse(url)
        result = result.path if not result.netloc else result.netloc
        # Drop the port, if any
        if ':' in result:
            result = result.split(':')[0]
        # Skip relative links
        if result.startswith('../'):
            continue
        urls.add(result)
    for url in sorted(urls):
        print(url)

def getFilename(url):
    a = urlparse(url)
    b = os.path.basename(a.path)
    print(b)
    return b

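# With a hypothetical URL, the basename of the parsed path is the trailing file name,
# independent of any query string.
import os
from urllib.parse import urlparse

url = 'https://example.com/downloads/archive.tar.gz?token=abc'
print(os.path.basename(urlparse(url).path))  # -> 'archive.tar.gz'
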
def get_links(page_url):
    host = urlparse(page_url)[1]
    page = download_page(page_url)
    links = extract_links(page)
    return [link for link in links if urlparse(link)[1] == host]

# import libraries
from __future__ import print_function
import urllib.request as urllib_request
from bs4 import BeautifulSoup
from builtins import input
from txt2pdf import txt2pdf

book = ""

# testing default
# quote_page = 'http://fullbooks.net/a-court-of-mist-and-fury/page-1-1076467.html'

# asks for the url of the book
quote_page = input("Insert the URL of the first page, example: "
                   "'https://novel22.net/a-court-of-thorns-and-roses/page-1-1076370.html': \n")

# parses the url to get information out
url = urllib_request.urlparse(quote_page)
data = url.path.split("/")

# get the book name out of data
name_of_book = data[1]

# gets the important numbers of the url
info = (data[2].replace(".html", "")).split("-")
hash_number = int(info[-1]) - 1
page_1 = 1

total_pages = int(input("Insert the total of pages:"))

print("Downloading {}".format((name_of_book.replace("-", " ")).title()))

parser.add_argument("-m", "--max-urls", help="Number of max URLs to crawl, default is 30.", default=30, type=int) args = parser.parse_args() url = args.url max_urls = args.max_urls crawl(url, max_urls) extract = tldextract.extract(url) #print the output: TLD, Domain, Hostname, Path, Links print('TLD: ' + extract.suffix) if extract.subdomain == '': print('Domain: ' + urlparse(url).netloc) else: print('Domain: ' + extract.domain + '.' + extract.suffix) print('Hostname: ' + urlparse(url).netloc) print('Path: ' + urlparse(url).path) print('LINKS:') print('\t' + "Same hostname: ") for link in same_host: print('\t\t' + link) print('\n') print('\t' + "Same domain: ") for link in same_domain: print('\t\t' + link) print('\n') print('\t' + "Different domain: ") for link in different_domain:
def is_valid_url(self, url):  # pylint: disable=no-self-use
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

def test_netloc_no_scheme(self):
    url = '//example.com'
    a = fetch.urlparse(url).netloc
    b = request.urlparse(url).netloc
    self.assertEqual(a, b)

def is_valid(url):
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
}

colorama.init()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
print(pyfiglet.figlet_format("Makdi", font='whimsy'))
print("The program is configured to send the discovered links to the proxy, "
      "posting minimal output to stdout \n\n")

get_domain = str(input("Input a URL with http/https prepended: "))
domain = urlparse(get_domain).netloc  # extract domain

# variables that store final and buffer values
ilink = []
elink = []
in_links = set()  # stores internal links
ex_links = set()  # stores external links through passive discovery


def valid_url():
    # url validation
    try:
        global ip
        initial = requests.get(get_domain, headers=headers, verify=False)
        print("Connected. Proceeding further \n\n")
        crawl(get_domain)

def getQueryContent(alamatURL, strQuery):
    parsed = urlparse(alamatURL)
    QueryContent = str(urllib.parse.parse_qs(parsed.query)[strQuery][0])
    # QueryContent = str(urlparse.parse_qs(parsed.query)[strQuery][0])
    return QueryContent

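# For a hypothetical URL, parse_qs keyed by the parameter name yields a list whose first
# element is the value getQueryContent above returns.
from urllib.parse import urlparse, parse_qs

url = 'https://example.com/watch?v=abc123&t=42'
print(parse_qs(urlparse(url).query)['v'][0])  # -> 'abc123'
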
def process(in_file, out_file): http = urllib3.PoolManager() url = request.pathname2url(in_file) mimetype = mimetypes.guess_type(url)[0] basename = ntpath.basename(in_file) with open(in_file, mode='rb') as fp: file_data = fp.read() r = http.request('POST', UPLOAD_URL, fields={'files': (basename, file_data, mimetype)}) json_txt = r.data.decode('utf-8') json_obj = json.loads(json_txt) width = json_obj['dim']['cols'] height = json_obj['dim']['rows'] file_id = json_obj['id'] im = Image.open(in_file) im = im.convert('RGB') draw = ImageDraw.Draw(im) width_ratio = im.width / width height_ratio = im.height / height for i in range(json_obj['balloonCount']): ballon = json_obj[str(i)] ballon_url = request.urlparse(ballon['originalURL']) fname = ntpath.basename(ballon_url.path) r = http.request('POST', TRANSLATE_URL, fields={ 'fname': fname, 'id': file_id, 'lang': 'ja' }) json_translated = json.loads(r.data.decode('utf-8')) translatedText = json_translated['translatedText'] if not translatedText: continue boundingRect = ballon['boundingRect'] x0 = boundingRect['x'] y0 = boundingRect['y'] x1 = x0 + boundingRect['width'] y1 = y0 + boundingRect['height'] textRectCount = ballon['textRectCount'] for rect in range(textRectCount): textRect = ballon['textRect'][str(rect)] x0 = math.floor(width_ratio * textRect['x'] + .5) y0 = math.floor(height_ratio * textRect['y'] + .5) x1 = x0 + math.floor(width_ratio * textRect['width'] + .5) y1 = y0 + math.floor(height_ratio * textRect['height'] + .5) draw.rectangle((x0, y0, x1, y1), fill=(255, 255, 255)) currentTextRect = 0 textRect = ballon['textRect'][str(currentTextRect)] target_x = math.floor(textRect['x'] * width_ratio + .5) target_y = math.floor(textRect['y'] * height_ratio + .5) target_width = math.floor(textRect['width'] * width_ratio + .5) target_height = math.floor(textRect['height'] * height_ratio + .5) start_x = target_x + target_width start_y = target_y linemaxsize = 0 for ch in translatedText: ch_w, ch_h = draw.textsize(ch, spacing=0, font=FONT) if linemaxsize < ch_w: linemaxsize = ch_w if start_y + ch_h > target_y + target_height: start_x -= linemaxsize + LINE_SPACING start_y = target_y if start_x - ch_w < target_x: currentTextRect += 1 if currentTextRect >= textRectCount: break textRect = ballon['textRect'][str(currentTextRect)] target_x = math.floor(textRect['x'] * width_ratio + .5) target_y = math.floor(textRect['y'] * height_ratio + .5) target_width = math.floor(textRect['width'] * width_ratio + .5) target_height = math.floor(textRect['height'] * height_ratio + .5) start_x = target_x + target_width start_y = target_y linemaxsize = ch_w ch_x = start_x - ch_w ch_y = start_y start_y += ch_h + SPACING draw.text((ch_x, ch_y), ch, font=FONT, fill=(0, 0, 0)) im.save(out_file)