def find_pingback_urls(self, urls): """Find the pingback urls of each urls""" pingback_urls = {} for url in urls: try: page = urlopen(url) headers = page.info() if 'text/' not in headers.get('Content-Type', '').lower(): continue server_url = headers.get('X-Pingback') if not server_url: server_url = self.find_pingback_href(page.read()) if server_url: server_url_splitted = urlsplit(server_url) if not server_url_splitted.netloc: url_splitted = urlsplit(url) server_url = '%s://%s%s' % (url_splitted.scheme, url_splitted.netloc, server_url) pingback_urls[url] = server_url except IOError: pass return pingback_urls
def __init__(
        self, url=DEFAULT_URI, name=None, ssl_required=False, verbose=False,
        pedantic=False, socket_keepalive=False):
    self._connect_timeout = None
    self._socket_keepalive = socket_keepalive
    self._socket = None
    self._socket_file = None
    self._subscriptions = {}
    self._next_sid = 1
    self._server = None
    self._server_index = 0
    if isinstance(url, (list, tuple)):
        urls = [urlparse.urlsplit(x) for x in url]
    else:
        urls = [urlparse.urlsplit(url)]
    self._options = {
        'url': urls,
        'name': name,
        'ssl_required': ssl_required,
        'verbose': verbose,
        'pedantic': pedantic
    }
def __init__(self, registry, url="", auth=None, verify=False, api_timeout=None):
    # Registry ip:port
    self.registry = urlsplit(registry).netloc
    # Service url, ip:port
    self.url = url
    # Authentication (user, password) or None. Used by request to do
    # basicauth
    self.auth = auth
    # Timeout for HTTP request
    self.api_timeout = api_timeout
    # Desired scope is the scope needed for the next operation on the
    # registry
    self.desired_scope = ""
    # Scope of the token we have
    self.scope = ""
    # Token used to authenticate
    self.token = ""
    # Boolean to enforce https checks. Used by request
    self.verify = verify
    # If we have no url then tokens are not required. get_new_token will not
    # be called
    if url:
        split = urlsplit(url)
        # user in url will take precedence over the given username
        if split.username and split.password:
            self.auth = (split.username, split.password)
        self.token_required = True
    else:
        self.token_required = False
def extract_password_row(self, row):
    res = ''
    hostname_split = urlparse.urlsplit(row[0])
    website = urlparse.urlunsplit((hostname_split.scheme, hostname_split.netloc, "", "", "")).strip('\n')
    username = ''
    password = ''
    form_url = ''
    user_field = ''
    pass_field = ''
    form_url_split = urlparse.urlsplit(row[1])
    form_url = urlparse.urlunsplit((form_url_split.scheme, form_url_split.netloc, "", "", "")).strip('\n')
    # print('\nusername = ', row[3], ' password RAW = ', row[5])
    password = self.decode_password(row[5])
    try:
        username = row[3]
        try:
            password = self.decode_password(row[5])
            self.num_passwords += 1
            pass
        except:
            # the print literals here were scrubbed in the source; reconstructed approximately
            print('ERROR - password = ', password)
    except:
        print('non password entry (blacklists - ignoring)')
    res = self.format_list_csv([website, username, form_url, user_field, pass_field, password])
    return res
def main(GET):
    parser = argparse.ArgumentParser(description='Scrape a simple site.')
    parser.add_argument('url', help='the URL at which to begin')
    start_url = parser.parse_args().url
    starting_netloc = urlsplit(start_url).netloc
    url_filter = (lambda url: urlsplit(url).netloc == starting_netloc)
    scrape((GET, start_url), url_filter)
def zoom_article(self, ticket_id, article_id): art_descr = self.__db.article_description(article_id) if art_descr[4] & ART_TEXT: return eval(self.__db.article_message(article_id)) self.echo("Zoom article:", ticket_id, article_id) url_beg = urlsplit(self.runtime.get("site"))[:3] params = ( ("Action", "AgentTicketZoom"), ("Subaction", "ArticleUpdate"), ("TicketID", ticket_id), ("ArticleID", article_id), ("OTRSAgentInterface", self.runtime["OTRSAgentInterface"])) url = urlunsplit(url_beg + (urlencode(params), "")) pg = TicketsPage(self.core) page = pg.load(url) if page is None: return mail_header = page.get("mail_header", []) if "mail_src" in page: url = urlunsplit(url_beg[:2] + urlsplit(page["mail_src"])[2:]) self.echo("Get message:", url) pg = MessagePage(self.core) try: mail_text = pg.load(url) except LoginError: mail_text = pg.login() else: mail_text = page["message_text"] if mail_header: mail_text.insert(0, ("\n",)) for i in reversed(mail_header): mail_text.insert(0, ("%s\t%s\n" % i,)) shrink_tupled_text(mail_text) self.__db.article_message(article_id, repr(mail_text)) return mail_text
def get_fetcher(url=None, *, item=dict()): RTMP_PROTOCOLS = {'rtmp', 'rtmpt', 'rtmpe', 'rtmpte'} url = item.get("url", url) if urlsplit(url).scheme in RTMP_PROTOCOLS: return RtmpFetcher(url, live=True) auth = comm.get_auth() protocol = urlsplit(auth['server']).scheme if protocol in RTMP_PROTOCOLS: (url, ext) = url.rsplit('.', 1) # strip the extension (.flv or .mp4) url = auth['playpath_prefix'] + url if ext == 'mp4': url = 'mp4:' + url rtmp_url = auth['rtmp_url'] token = auth.get('token') if token: # Cannot use urljoin() because # the RTMP scheme would have to be added to its whitelist rtmp_url += '?auth=' + token return RtmpFetcher(rtmp_url, playpath=url) else: return HdsFetcher(url, auth)
def https_open(self, request): """ Send an HTTP request, which can be either GET or POST, depending on req.has_data() Args: request - instance of urllib2.Request """ full_url = request.get_full_url() url_parts = parse.urlsplit(full_url) robo = None if url_parts.netloc in self.robots: robo = self.robots[url_parts.netloc] else: # Getting request url, for checking robots.txt host = parse.urlsplit(full_url)[1] rurl = parse.urlunparse(("http", host, "/robots.txt", "", "")) robo = reppy.cache.RobotsCache() robo.fetch(rurl, self.agent_name) self.robots[url_parts.netloc] = robo # Is url allow for crawler in robots.txt if robo.allowed(full_url, self.agent_name): # Return result of request return request.HTTPHandler.https_open(self, request) else: raise RuntimeError('Forbidden by robots.txt')
def _url(self, hashed_name_func, name, force=False, hashed_files=None): """ Return the non-hashed URL in DEBUG mode. """ if settings.DEBUG and not force: hashed_name, fragment = name, '' else: clean_name, fragment = urldefrag(name) if urlsplit(clean_name).path.endswith('/'): # don't hash paths hashed_name = name else: args = (clean_name,) if hashed_files is not None: args += (hashed_files,) hashed_name = hashed_name_func(*args) final_url = super().url(hashed_name) # Special casing for a @font-face hack, like url(myfont.eot?#iefix") # http://www.fontspring.com/blog/the-new-bulletproof-font-face-syntax query_fragment = '?#' in name # [sic!] if fragment or query_fragment: urlparts = list(urlsplit(final_url)) if fragment and not urlparts[4]: urlparts[4] = fragment if query_fragment and not urlparts[3]: urlparts[2] += '?' final_url = urlunsplit(urlparts) return unquote(final_url)
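# A minimal standalone sketch of the "?#" special case handled above (not
# Django's actual code; file names are made up): urlsplit() treats the query in
# "myfont.eot?#iefix" as empty, so after hashing the name the bare "?" has to be
# glued back onto the path before the fragment is re-attached.
from urllib.parse import urlsplit, urlunsplit

name = 'myfont.eot?#iefix'
hashed = 'myfont.abc123.eot'            # stand-in for the hashed file name

parts = list(urlsplit(hashed))
fragment = urlsplit(name).fragment      # 'iefix'
if fragment and not parts[4]:
    parts[4] = fragment
if '?#' in name and not parts[3]:
    parts[2] += '?'                     # keep the empty-query marker for the @font-face hack
print(urlunsplit(parts))                # -> myfont.abc123.eot?#iefix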
def test_flow(self): url = self.sp.make_auth_req() status, headers, _ = self.getPage(url) assert status == '303 See Other' url = self.get_redirect_location(headers) req = parse_qs(urlsplit(url).query) assert 'SAMLRequest' in req assert 'RelayState' in req action, body = self.idp.handle_auth_req(req['SAMLRequest'][0], req['RelayState'][0], BINDING_HTTP_REDIRECT, 'test1') status, headers, body = self.getPage(action, method='POST', body=urlencode(body)) assert status == '302 Found' url = self.get_redirect_location(headers) req = parse_qs(urlsplit(url).query) assert 'SAMLResponse' in req assert 'RelayState' in req resp = self.sp.parse_authn_request_response(req['SAMLResponse'][0], BINDING_HTTP_REDIRECT) identity = resp.ava assert identity["displayName"][0] == "Test1" assert identity["sn"][0] == "test1@valueA" assert identity['o'][0] == "Small university"
def forwards(apps, schema_editor): MenuItem = apps.get_model('external_services', 'MenuItem') items = (MenuItem.objects.all() .exclude(service=None) .exclude(menu_url=None) .exclude(menu_url='')) errors = [] for item in items: uri1 = urlsplit(item.menu_url) uri2 = urlsplit(item.service.url) if uri1.netloc and uri1.netloc != uri2.netloc: errors.append(item) if errors: print() msg = ['Database is in inconsistent state.'] for item in errors: msg.append(" MenuItem(pk=%s): %s <> %s" % (item.pk, item.menu_url, item.service.url)) msg.append("For above menuitems, domain in MenuItem.menu_url doesn't match domain in MenuItem.service.url.") msg.append("Database is in inconsistent state. Manual fixing is required.") raise RuntimeError('\n'.join(msg)) for item in items: uri = urlsplit(item.menu_url) url = uri._replace(scheme='', netloc='').geturl() item.menu_url = url item.save(update_fields=['menu_url'])
def url(self, name, force=False): """ Returns the real URL in DEBUG mode. """ if settings.DEBUG and not force: hashed_name, fragment = name, '' else: clean_name, fragment = urldefrag(name) if urlsplit(clean_name).path.endswith('/'): # don't hash paths hashed_name = name else: cache_key = self.cache_key(name) hashed_name = self.cache.get(cache_key) if hashed_name is None: hashed_name = self.hashed_name(clean_name).replace('\\', '/') # set the cache if there was a miss # (e.g. if cache server goes down) self.cache.set(cache_key, hashed_name) final_url = super(CachedFilesMixin, self).url(hashed_name) # Special casing for a @font-face hack, like url(myfont.eot?#iefix") # http://www.fontspring.com/blog/the-new-bulletproof-font-face-syntax query_fragment = '?#' in name # [sic!] if fragment or query_fragment: urlparts = list(urlsplit(final_url)) if fragment and not urlparts[4]: urlparts[4] = fragment if query_fragment and not urlparts[3]: urlparts[2] += '?' final_url = urlunsplit(urlparts) return unquote(final_url)
def clean_url(value):
    """
    Taken from Django's URLField, this helps to normalize URLs. Raises a
    ValueError if an invalid url is passed.

    Example:

    >>> clean_url("www.google.com")
    "http://www.google.com"

    >>> clean_url("_.com")
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
    ValueError: Enter a valid URL.
    """
    if value:
        value = value.strip()
        value = value.encode('ascii', 'ignore').decode("utf-8")
        url_fields = list(urlsplit(value))
        if not url_fields[0]:
            # If no URL scheme given, assume http://
            url_fields[0] = 'http'
        if not url_fields[1]:
            # Assume that if no domain is provided, that the path segment
            # contains the domain.
            url_fields[1] = url_fields[2]
            url_fields[2] = ''
            # Rebuild the url_fields list, since the domain segment may now
            # contain the path too.
            url_fields = list(urlsplit(urlunsplit(url_fields)))
        if not url_fields[2]:
            # the path portion may need to be added before query params
            url_fields[2] = '/'
        value = urlunsplit(url_fields)
    return value
def assertRedirects(self, response, expected_url, status_code=302, target_status_code=200, host=None): """Asserts that a response redirected to a specific URL, and that the redirect URL can be loaded. Note that assertRedirects won't work for external links since it uses TestClient to do a request. """ self.assertEqual(response.status_code, status_code, ("Response didn't redirect as expected: Response code was %d" " (expected %d)" % (response.status_code, status_code))) url = response['Location'] scheme, netloc, path, query, fragment = urlsplit(url) e_scheme, e_netloc, e_path, e_query, e_fragment = urlsplit(expected_url) if not (e_scheme or e_netloc): expected_url = urlunsplit(('http', host or 'testserver', e_path, e_query, e_fragment)) self.assertEqual(url, expected_url, "Response redirected to '%s', expected '%s'" % (url, expected_url)) # Get the redirection page, using the same client that was used # to obtain the original response. redirect_response = response.client.get(path, QueryDict(query)) self.assertEqual(redirect_response.status_code, target_status_code, ("Couldn't retrieve redirection page '%s': response code was %d" " (expected %d)") % (path, redirect_response.status_code, target_status_code))
def sitelinks(self, html_page, url): """Finds all links in the provided html page""" bs = BeautifulSoup(html_page) links = set() urlpart = urlsplit(url) try: for anchor in bs.find_all('a'): linkpart = list(urlsplit(anchor['href'])) linkpart[4] = '' #remove the fragment if linkpart[0] == '': linkpart[0] = urlpart.scheme if linkpart[1] == '': linkpart[1] = urlpart.netloc if linkpart[0] == urlpart.scheme and linkpart[1] == urlpart.netloc: if linkpart[2].startswith('/'): links.add(urlunsplit(linkpart)) elif linkpart[2] != '': #relative URL. links.add(urljoin(url, linkpart[2])) except KeyError: pass return links
def hashed_name(self, name, content=None, filename=None): # `filename` is the name of file to hash if `content` isn't given. # `name` is the base name to construct the new hashed filename from. parsed_name = urlsplit(unquote(name)) clean_name = parsed_name.path.strip() filename = (filename and urlsplit(unquote(filename)).path.strip()) or clean_name opened = content is None if opened: if not self.exists(filename): raise ValueError("The file '%s' could not be found with %r." % (filename, self)) try: content = self.open(filename) except IOError: # Handle directory paths and fragments return name try: file_hash = self.file_hash(clean_name, content) finally: if opened: content.close() path, filename = os.path.split(clean_name) root, ext = os.path.splitext(filename) if file_hash is not None: file_hash = ".%s" % file_hash hashed_name = os.path.join(path, "%s%s%s" % (root, file_hash, ext)) unparsed_name = list(parsed_name) unparsed_name[2] = hashed_name # Special casing for a @font-face hack, like url(myfont.eot?#iefix") # http://www.fontspring.com/blog/the-new-bulletproof-font-face-syntax if '?#' in name and not unparsed_name[3]: unparsed_name[2] += '?' return urlunsplit(unparsed_name)
def __form_data(text, formid, params, soup=None, form_url=None): if type(params) is not dict: raise TypeError('Params must be a dict') if soup is None: soup = BeautifulSoup(text, 'html.parser') form = soup.find('form', attrs={'id': formid}) action = form.attrs.get('action') if not urlsplit(action).netloc: if form_url is None or not urlsplit(form_url).netloc: raise ValueError('kwarg form_url must be specified if form ' 'action lacks a host') action = urljoin(form_url, action) inputs = form.find_all('input') + form.find_all('textarea') for i in inputs: try: name = i.attrs['name'] type_ = i.attrs['type'] value = params.get(name) if type_ == 'submit': continue elif type_ == 'hidden': value = i.attrs['value'] if value is None else value elif value is None: raise ValueError('kwarg params dictionary is missing a ' 'value for a non-hidden field') except KeyError: pass else: params[name] = value return Session.FormInfo(params=params, post_url=action)
def oauth(self, req, credentials = None, params = {}): #NOTE: While flickr supports HTTPS in its oauth endpoints, flickr #thinks that the HTTPS endpoints are being accessed via HTTP, and thus #constructs the signature base string accordingly, which #will hence not match the signature base string generated by #pyoauth1client. We solve this by replacing HTTPS with HTTP #when generating the signature base string, and then revert the change #after the base string is generated. This way the signature #base string will match the one generated by flickr even though #we are accessing the endpoints via HTTPS for ADDED SECURITY!!!111one x = urlsplit(req.url) if x.scheme == "https": #Remove the HTTPS Scheme https = True x = x._replace(scheme = "http") req = req._replace(url = urlunsplit(x)) else: https = False y = super().oauth(req, credentials, params) if https: #Add back the HTTPS scheme x = urlsplit(y.url) x = x._replace(scheme = "https") y = y._replace(url = urlunsplit(x)) return y
def run(self): while True: # grabs url from queue level, u = self.input_q.get() main = '{0.scheme}://{0.netloc}/'.format(urlsplit(u)) # fetching urls if level < MAX_URL_LEVEL: html = _get_content(u) if not isinstance(html, list): soup = bs(html) for link in soup.find_all('a'): href = link.get('href') if not href or len(href) < 2: continue # Check if URL is relative elif not urlsplit(href)[0] and not urlsplit(href)[1]: self.output_q.put((level+1, _url_discard(urljoin(u, href)))) elif href.startswith(main): self.output_q.put((level+1, _url_discard(href))) else: # Place for possible error logs (: pass # signals to queue job is done self.input_q.task_done()
def _main(): base_url = sys.argv[1] soup = bs4.BeautifulSoup(urlopen(base_url), from_encoding="windows-1252") index_urls = [urljoin(base_url, h3("a")[0]["href"]) for h3 in soup("h3")] for index_url in index_urls: try: resp = urlopen(index_url) except HTTPError as err: print(err, err.url, file=sys.stderr) print("Skipping..", file=sys.stderr) continue index_soup = bs4.BeautifulSoup(resp, from_encoding="iso-8859-1") index_path = urlsplit(index_url).path index_filepath = os.path.normpath("." + index_path) try: os.makedirs(os.path.dirname(index_filepath)) except OSError as e: if e.errno != errno.EEXIST: raise e for issue_url in iter_issue_urls(index_soup): issue_url = urljoin(index_url, issue_url) try: resp = urlopen(issue_url) except HTTPError as err: print(err, err.url, file=sys.stderr) print("Skipping..", file=sys.stderr) continue issue_soup = bs4.BeautifulSoup(resp, from_encoding="windows-1252") issue_path = urlsplit(issue_url).path issue_filepath = os.path.normpath("." + issue_path) with open(issue_filepath, "w") as f: print(klupu.clean_soup(issue_soup), file=f) with open(index_filepath, "w") as f: print(klupu.clean_soup(index_soup), file=f)
def main(GET): global mail,error,error_list parser = argparse.ArgumentParser(description='Scrape a simple site.') parser.add_argument('url', help='the URL at which to begin') start_url = parser.parse_args().url starting_netloc = urlsplit(start_url).netloc url_filter = (lambda url: urlsplit(url).netloc == starting_netloc) scrape((GET, start_url), url_filter) print ("\n\nresult--------------------------------\nerror:%d" %(error)) count = 1; for url in error_list: print(url) print("\n") for url in mail: print("[%d]url:%s" %(count,url)) data = mail[url][0] if data: tmp = [] for val in data: if not val in tmp: print (val) tmp.append(val) else: print("None") print ("") count+=1
def parse_url(link): """Say Website Title information in channel""" baseurl = '{uri.scheme}://{uri.netloc}'.format(uri=urlsplit(link)) path = urlsplit(link).path query = '?{uri.query}'.format(uri=urlsplit(link)) try: headers = {'Accept-Encoding': 'utf-8', 'User-Agent': 'Mozilla/5.0'} response = get(baseurl + path + query, headers=headers) except: return if response.headers["Content-Type"] and "text/html" in response.headers["Content-Type"]: try: URL = BeautifulSoup(response.text, "html.parser") except: return if not URL.title: return if URL.title.string is None: return if len(URL.title.string) > 250: title=URL.title.string[0:250] + '…' else: title=URL.title.string return title.replace('\n', ' ').strip() + " (" + urlsplit(link).netloc + ")" else: return
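# The '{uri.scheme}://{uri.netloc}' format idiom above (reused in several later
# snippets) just pulls pieces back out of the SplitResult; for a made-up link it
# expands like this. Note that an empty query would still leave a bare '?'.
from urllib.parse import urlsplit

link = 'https://example.com/some/page?id=42#top'     # made-up link
baseurl = '{uri.scheme}://{uri.netloc}'.format(uri=urlsplit(link))
path = urlsplit(link).path
query = '?{uri.query}'.format(uri=urlsplit(link))
print(baseurl + path + query)                        # -> https://example.com/some/page?id=42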
def find_pingback_urls(self, urls): """ Find the pingback URL for each URLs. """ pingback_urls = {} for url in urls: try: page = urlopen(url) headers = page.info() server_url = headers.get('X-Pingback') if not server_url: content_type = headers.get('Content-Type', '').split( ';')[0].strip().lower() if content_type in ['text/html', 'application/xhtml+xml']: server_url = self.find_pingback_href( page.read(5 * 1024)) if server_url: server_url_splitted = urlsplit(server_url) if not server_url_splitted.netloc: url_splitted = urlsplit(url) server_url = '%s://%s%s' % (url_splitted.scheme, url_splitted.netloc, server_url) pingback_urls[url] = server_url except IOError: pass return pingback_urls
def sendall(self, data, *args, **kw): self._sent_data.append(data) hostnames = [getattr(i.info, 'hostname', None) for i in HTTPretty._entries.keys()] self.fd.seek(0) try: requestline, _ = data.split(b'\r\n', 1) method, path, version = parse_requestline(requestline) is_parsing_headers = True except ValueError: is_parsing_headers = False if not is_parsing_headers: if len(self._sent_data) > 1: headers, body = map(utf8, self._sent_data[-2:]) method, path, version = parse_requestline(headers) split_url = urlsplit(path) info = URIInfo(hostname=self._host, port=self._port, path=split_url.path, query=split_url.query) # If we are sending more data to a dynamic response entry, # we need to call the method again. if self._entry and self._entry.dynamic_response: self._entry.body(info, method, body, headers) try: return HTTPretty.historify_request(headers, body, False) except Exception as e: logging.error(traceback.format_exc(e)) return self._true_sendall(data, *args, **kw) # path might come with s = urlsplit(path) POTENTIAL_HTTP_PORTS.append(int(s.port or 80)) headers, body = map(utf8, data.split(b'\r\n\r\n', 1)) request = HTTPretty.historify_request(headers, body) info = URIInfo(hostname=self._host, port=self._port, path=s.path, query=s.query, last_request=request) entries = [] for matcher, value in HTTPretty._entries.items(): if matcher.matches(info): entries = value break if not entries: self._true_sendall(data) return self._entry = matcher.get_next_entry(method) self._request = (info, body, headers)
def is_external_url(self, url, site_url):
    """
    Check if the URL is an external URL.
    """
    url_splitted = urlsplit(url)
    if not url_splitted.netloc:
        return False
    return url_splitted.netloc != urlsplit(site_url).netloc
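# Quick usage sketch of the check above, with made-up URLs and a standalone copy
# of the method (minus self): relative URLs count as internal because urlsplit()
# gives them an empty netloc.
from urllib.parse import urlsplit

def is_external(url, site_url):
    parts = urlsplit(url)
    if not parts.netloc:
        return False
    return parts.netloc != urlsplit(site_url).netloc

site = 'https://example.com/blog/'
print(is_external('https://other.org/page', site))     # True: different host
print(is_external('https://example.com/about', site))  # False: same host
print(is_external('/relative/path', site))             # False: no netloc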
def urlFileName(url: str) -> str:
    from os import path
    r = path.basename(parse.urlsplit(url).path)
    if r:
        return r
    r = path.basename(parse.urlsplit(url).query)
    assert r
    return r
def assertRedirects(self, response, expected_url, status_code=302, target_status_code=200, host=None, msg_prefix=''): """Asserts that a response redirected to a specific URL, and that the redirect URL can be loaded. Note that assertRedirects won't work for external links since it uses TestClient to do a request. """ if msg_prefix: msg_prefix += ": " if hasattr(response, 'redirect_chain'): # The request was a followed redirect self.assertTrue(len(response.redirect_chain) > 0, msg_prefix + "Response didn't redirect as expected: Response" " code was %d (expected %d)" % (response.status_code, status_code)) self.assertEqual(response.redirect_chain[0][1], status_code, msg_prefix + "Initial response didn't redirect as expected:" " Response code was %d (expected %d)" % (response.redirect_chain[0][1], status_code)) url, status_code = response.redirect_chain[-1] self.assertEqual(response.status_code, target_status_code, msg_prefix + "Response didn't redirect as expected: Final" " Response code was %d (expected %d)" % (response.status_code, target_status_code)) else: # Not a followed redirect self.assertEqual(response.status_code, status_code, msg_prefix + "Response didn't redirect as expected: Response" " code was %d (expected %d)" % (response.status_code, status_code)) url = response.url scheme, netloc, path, query, fragment = urlsplit(url) redirect_response = response.client.get(path, QueryDict(query)) # Get the redirection page, using the same client that was used # to obtain the original response. self.assertEqual(redirect_response.status_code, target_status_code, msg_prefix + "Couldn't retrieve redirection page '%s':" " response code was %d (expected %d)" % (path, redirect_response.status_code, target_status_code)) e_scheme, e_netloc, e_path, e_query, e_fragment = urlsplit( expected_url) if not (e_scheme or e_netloc): expected_url = urlunsplit(('http', host or 'testserver', e_path, e_query, e_fragment)) self.assertEqual(url, expected_url, msg_prefix + "Response redirected to '%s', expected '%s'" % (url, expected_url))
def parse_apisrv_url(scheme, apisrv):
    if apisrv.startswith('http://') or apisrv.startswith('https://'):
        return urlsplit(apisrv)[0:2]
    elif scheme is not None:
        # the split/join is needed to get a proper url (e.g. without a trailing slash)
        return urlsplit(urljoin(scheme, apisrv))[0:2]
    else:
        msg = 'invalid apiurl \'%s\' (specify the protocol (http:// or https://))' % apisrv
        raise URLError(msg)
def link_clean(link_row):
    link = link_row['links']
    try:
        urlsplit(link)
    except ValueError:
        clean_link = 'BROKEN'
    else:
        clean_link = link.decode("utf-8")
    return clean_link
def main(GET):
    parser = argparse.ArgumentParser(description='Scrape a simple site.')
    parser.add_argument('url', help='the URL at which to begin')
    parser.add_argument("-n", "--number", type=int,
                        help="the number of reachable websites", default=15)
    numEXECUTE = parser.parse_args().number
    # print(numEXECUTE)
    start_url = parser.parse_args().url
    starting_netloc = urlsplit(start_url).netloc
    url_filter = (lambda url: urlsplit(url).netloc == starting_netloc)
    scrape((GET, start_url), url_filter, numEXECUTE)
def set_base_url(self):
    self.base_url = urlsplit(self.start_url)._replace(path="", query="").geturl()
def to_ip(url):
    value = parse.urlsplit(url)
    if value.netloc != "":
        return FQDN().to_ip(value.netloc)
    return None
def _request_favicon(self):
    scheme, netloc, *_ = urlsplit(self.reply_url)
    favicon_response = urllib.request.urlopen(
        f"{scheme}://{netloc}/favicon.ico")
    assert favicon_response.read() == b"Favicon is not provided."
def handle(self): if self.args.languages and not self.args.locale_dir: self.subparser.error( '--locale-dir is required if --languages is set.') if self.args.versions_dir: versions_directory = Path(self.args.versions_dir) data = {} languages = {'en'} localedir = self.args.locale_dir headers = ['Title', 'Description', 'Extension'] if localedir: available_translations = [ n for n in os.listdir(localedir) if os.path.isdir(os.path.join(localedir, n)) ] if self.args.languages: for language in self.args.languages.split(','): if language in available_translations: languages.add(language) else: self.subparser.error( f'translations to {language} are not available') else: languages.update(available_translations) for version in self.versions(): public_download_url = version.download_url if self.args.versions_dir: version.download_url = (versions_directory / version.id / version.version).as_uri() # Add the extension's data. data.setdefault( version.id, { 'id': version.id, 'category': version.category, 'core': version.core, 'name': {}, 'description': {}, 'latest_version': None, 'versions': {}, }) # Add the version's metadata. version_data = { 'id': version.id, 'date': version.date, 'version': version.version, 'base_url': version.base_url, 'download_url': public_download_url, 'publisher': { 'name': version.repository_user, 'url': version.repository_user_page, }, 'metadata': version.metadata, 'schemas': {}, 'codelists': {}, 'readme': {}, } parsed = urlsplit(version_data['publisher']['url']) if parsed.netloc == 'github.com' and 'OCDS_GITHUB_ACCESS_TOKEN' in os.environ: api_url = f"https://api.github.com/users/{version_data['publisher']['name']}" headers = { 'Authorization': f"token {os.getenv('OCDS_GITHUB_ACCESS_TOKEN')}" } version_data['publisher']['name'] = session.get( api_url, headers=headers).json()['name'] for language in sorted(languages): # Update the version's metadata and add the version's schema. translator = _translator(version, 'schema', localedir, language) translation = translate_extension_metadata_data( version.metadata, translator, lang=language) for key in TRANSLATABLE_EXTENSION_METADATA_KEYWORDS: version_data['metadata'][key][language] = translation[key][ language] for name in ('record-package-schema.json', 'release-package-schema.json', 'release-schema.json'): version_data['schemas'].setdefault(name, {}) if name in version.schemas: translation = translate_schema_data( version.schemas[name], translator) version_data['schemas'][name][language] = translation # Add the version's codelists. if version.codelists: translator = _translator(version, 'codelists', localedir, language) for name in sorted(version.codelists): version_data['codelists'].setdefault(name, {}) codelist = version.codelists[name] version_data['codelists'][name][language] = {} translation = [ translator.gettext(fieldname) for fieldname in codelist.fieldnames ] version_data['codelists'][name][language][ 'fieldnames'] = translation translation = translate_codelist_data( codelist, translator, headers) version_data['codelists'][name][language][ 'rows'] = translation # Add the version's readme and documentation. translator = _translator(version, 'docs', localedir, language) translation = translate_markdown_data( 'README.md', version.remote('README.md'), translator) version_data['readme'][language] = translation data[version.id]['versions'][version.version] = version_data for _id in data: # Determine the latest version. See ocdsextensionregistry.util.get_latest_version(). 
versions = data[_id]['versions'] if len(versions) == 1: latest_version = list(versions)[0] elif 'master' in versions: latest_version = 'master' elif default_minor_version in versions: latest_version = default_minor_version else: dated = [kv for kv in versions.items() if kv[1]['date']] if dated: latest_version = max(dated, key=lambda kv: kv[1]['date'])[0] else: raise CommandError( f"Couldn't determine latest version of {_id}") # Apply the latest version. data[_id]['latest_version'] = latest_version for field in ('name', 'description'): data[_id][field] = data[_id]['versions'][latest_version][ 'metadata'][field] json_dump(data, sys.stdout)
def get_ftp_date(url):
    import ftputil
    pr = parse.urlsplit(url)
    with ftputil.FTPHost(pr.netloc, 'anonymous', '') as host:
        return host.path.getmtime(pr.path)
def scrape(): browser = init_browser() mars_facts_data = {} nasa = "https://mars.nasa.gov/news/" browser.visit(nasa) time.sleep(2) html = browser.html soup = bs(html,"html.parser") #scrapping latest news about mars from nasa news_title = soup.find("div",class_="content_title").text news_paragraph = soup.find("div", class_="article_teaser_body").text mars_facts_data['news_title'] = news_title mars_facts_data['news_paragraph'] = news_paragraph #Mars Featured Image nasa_image = "https://www.jpl.nasa.gov/spaceimages/?search=&category=featured#submit" browser.visit(nasa_image) time.sleep(2) from urllib.parse import urlsplit base_url = "{0.scheme}://{0.netloc}/".format(urlsplit(nasa_image)) xpath = "//*[@id=\"page\"]/section[3]/div/ul/li[1]/a/div/div[2]/img" #Use splinter to click on the mars featured image #to bring the full resolution image results = browser.find_by_xpath(xpath) img = results[0] img.click() time.sleep(2) #get image url using BeautifulSoup html_image = browser.html soup = bs(html_image, "html.parser") img_url = soup.find("img", class_="fancybox-image")["src"] full_img_url = base_url + img_url mars_facts_data["featured_image"] = full_img_url # #### Mars Weather #get mars weather's latest tweet from the website url_weather = "https://twitter.com/marswxreport?lang=en" browser.visit(url_weather) html_weather = browser.html soup = bs(html_weather, "html.parser") mars_weather = soup.find("p", class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text").text mars_facts_data["mars_weather"] = mars_weather # #### Mars Facts url_facts = "https://space-facts.com/mars/" time.sleep(2) table = pd.read_html(url_facts) table[0] df_mars_facts = table[0] df_mars_facts.columns = ["Parameter", "Values"] clean_table = df_mars_facts.set_index(["Parameter"]) mars_html_table = clean_table.to_html() mars_html_table = mars_html_table.replace("\n", "") mars_facts_data["mars_facts_table"] = mars_html_table # #### Mars Hemisperes url_hemisphere = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars" browser.visit(url_hemisphere) #Getting the base url hemisphere_base_url = "{0.scheme}://{0.netloc}/".format(urlsplit(url_hemisphere)) hemisphere_img_urls = [] hemisphere_img_urls #1 Hemisphere hemisphere_img_urls = [] results = browser.find_by_xpath( "//*[@id='product-section']/div[2]/div[1]/a/img").click() time.sleep(2) cerberus_open_click = browser.find_by_xpath( "//*[@id='wide-image-toggle']").click() time.sleep(1) cerberus_image = browser.html soup = bs(cerberus_image, "html.parser") cerberus_url = soup.find("img", class_="wide-image")["src"] cerberus_img_url = hemisphere_base_url + cerberus_url #print(cerberus_img_url) cerberus_title = soup.find("h2",class_="title").text #print(cerberus_title) back_button = browser.find_by_xpath("//*[@id='splashy']/div[1]/div[1]/div[3]/section/a").click() cerberus = {"image title":cerberus_title, "image url": cerberus_img_url} hemisphere_img_urls.append(cerberus) #2 Hemisphere results1 = browser.find_by_xpath( "//*[@id='product-section']/div[2]/div[2]/a/img").click() time.sleep(2) schiaparelli_open_click = browser.find_by_xpath( "//*[@id='wide-image-toggle']").click() time.sleep(1) schiaparelli_image = browser.html soup = bs(schiaparelli_image, "html.parser") schiaparelli_url = soup.find("img", class_="wide-image")["src"] schiaparelli_img_url = hemisphere_base_url + schiaparelli_url #print(schiaparelli_img_url) schiaparelli_title = soup.find("h2",class_="title").text #print(schiaparelli_title) back_button = 
browser.find_by_xpath("//*[@id='splashy']/div[1]/div[1]/div[3]/section/a").click() schiaparelli = {"image title":schiaparelli_title, "image url": schiaparelli_img_url} hemisphere_img_urls.append(schiaparelli) #3 Hemisphere results1 = browser.find_by_xpath( "//*[@id='product-section']/div[2]/div[3]/a/img").click() time.sleep(2) syrtis_major_open_click = browser.find_by_xpath( "//*[@id='wide-image-toggle']").click() time.sleep(1) syrtis_major_image = browser.html soup = bs(syrtis_major_image, "html.parser") syrtis_major_url = soup.find("img", class_="wide-image")["src"] syrtis_major_img_url = hemisphere_base_url + syrtis_major_url #print(syrtis_major_img_url) syrtis_major_title = soup.find("h2",class_="title").text #print(syrtis_major_title) back_button = browser.find_by_xpath("//*[@id='splashy']/div[1]/div[1]/div[3]/section/a").click() syrtis_major = {"image title":syrtis_major_title, "image url": syrtis_major_img_url} hemisphere_img_urls.append(syrtis_major) #4 Hemisphere results1 = browser.find_by_xpath( "//*[@id='product-section']/div[2]/div[4]/a/img").click() time.sleep(2) valles_marineris_open_click = browser.find_by_xpath( "//*[@id='wide-image-toggle']").click() time.sleep(1) valles_marineris_image = browser.html soup = bs(valles_marineris_image, "html.parser") valles_marineris_url = soup.find("img", class_="wide-image")["src"] valles_marineris_img_url = hemisphere_base_url + syrtis_major_url #print(valles_marineris_img_url) valles_marineris_title = soup.find("h2",class_="title").text #print(valles_marineris_title) back_button = browser.find_by_xpath("//*[@id='splashy']/div[1]/div[1]/div[3]/section/a").click() valles_marineris = {"image title":valles_marineris_title, "image url": valles_marineris_img_url} hemisphere_img_urls.append(valles_marineris) mars_facts_data["hemisphere_img_url"] = hemisphere_img_urls return mars_facts_data
def url_to_repo_name(url):
    """@todo"""
    path = urlsplit(url).path
    path = path[1:]
    (base, ext) = os.path.splitext(path)
    return base
def _get_base_path(self) -> str:
    return cast(str, urlsplit(self.location).path)
def base_path(self) -> str:
    if self.base_url:
        return urlsplit(self.base_url).path
    return self._get_base_path()
def UnpackUserTarball(): tarballs = [] userFiles = [] if len(sys.argv) > 1: tarballs = sys.argv[1].split(',') if len(sys.argv) > 2: userFiles = sys.argv[2].split(',') jobDir = os.environ['WMAGENTJOBDIR'] for tarball in tarballs: splitResult = urlsplit(tarball) tarFile = os.path.join(jobDir, os.path.basename(tarball)) # Is it a URL or a file that exists in the jobDir? if splitResult[0] in ['xrootd', 'root']: logging.info("Fetching tarball %s through xrootd", tarball) try: subprocess.check_call( ['xrdcp', '-d', '1', '-f', tarball, 'TEMP_TARBALL.tgz']) subprocess.check_call(['tar', 'xf', 'TEMP_TARBALL.tgz']) except subprocess.CalledProcessError: logging.error("Couldn't retrieve/extract file from xrootd") raise finally: if os.path.exists('TEMP_TARBALL.tgz'): os.unlink('TEMP_TARBALL.tgz') elif splitResult[0] in ['http', 'https'] and splitResult[1]: retriever = getRetriever(splitResult[0]) with tempfile.NamedTemporaryFile() as tempFile: if setHttpProxy(tarball): try: logging.info( 'Fetching URL tarball %s through proxy server', tarball) fileName, headers = retriever(tarball, tempFile.name) except (RuntimeError, IOError): del os.environ['http_proxy'] logging.warning( 'Fetching URL tarball %s after proxy server failure', tarball) fileName, headers = retriever(tarball, tempFile.name) else: logging.info( 'Fetching URL tarball %s without proxy server', tarball) fileName, headers = retriever(tarball, tempFile.name) try: subprocess.check_call(['tar', 'xf', fileName]) except subprocess.CalledProcessError: raise RuntimeError('Error extracting %s' % tarball) elif os.path.isfile(tarFile): logging.info("Untarring %s", tarFile) subprocess.check_call(['tar', 'xf', tarFile]) else: raise IOError('%s does not exist' % tarFile) for userFile in userFiles: if userFile: logging.info("Moving '%s' to execution directory.", userFile) shutil.move(userFile, '..') return 0
def _enrich_layer_metadata(self, geonode_layer): workspace, layername = geonode_layer.name.split( ":") if ":" in geonode_layer.name else (None, geonode_layer.name) url = urlsplit(self.url) base_url = '%s://%s/' % (url.scheme, url.netloc) response = requests.get('%sapi/layers/?name=%s' % (base_url, layername), {}, timeout=10, verify=False) content = response.content status = response.status_code content_type = response.headers['Content-Type'] if status == 200 and 'application/json' == content_type: try: if isinstance(content, bytes): content = content.decode('UTF-8') _json_obj = json.loads(content) if _json_obj['meta']['total_count'] == 1: _layer = _json_obj['objects'][0] if _layer: r_fields = {} # Update plain fields for field in GeoNodeServiceHandler.LAYER_FIELDS: if field in _layer and _layer[field]: r_fields[field] = _layer[field] if r_fields: Layer.objects.filter(id=geonode_layer.id).update( **r_fields) geonode_layer.refresh_from_db() # Update Thumbnail if "thumbnail_url" in _layer and _layer[ "thumbnail_url"]: thumbnail_remote_url = _layer["thumbnail_url"] _url = urlsplit(thumbnail_remote_url) if not _url.scheme: thumbnail_remote_url = "{}{}".format( geonode_layer.remote_service.service_url, _url.path) resp, image = http_client.request( thumbnail_remote_url) if 'ServiceException' in str(image) or \ resp.status_code < 200 or resp.status_code > 299: msg = 'Unable to obtain thumbnail: %s' % image logger.debug(msg) # Replace error message with None. image = None if image is not None: thumbnail_name = 'layer-%s-thumb.png' % geonode_layer.uuid geonode_layer.save_thumbnail(thumbnail_name, image=image) else: self._create_layer_thumbnail(geonode_layer) else: self._create_layer_thumbnail(geonode_layer) # Add Keywords if "keywords" in _layer and _layer["keywords"]: keywords = _layer["keywords"] if keywords: geonode_layer.keywords.clear() geonode_layer.keywords.add(*keywords) # Add Regions if "regions" in _layer and _layer["regions"]: (regions_resolved, regions_unresolved) = resolve_regions( _layer["regions"]) if regions_resolved: geonode_layer.regions.clear() geonode_layer.regions.add(*regions_resolved) # Add Topic Category if "category__gn_description" in _layer and _layer[ "category__gn_description"]: try: categories = TopicCategory.objects.filter( Q(gn_description__iexact=_layer[ "category__gn_description"])) if categories: geonode_layer.category = categories[0] except BaseException: traceback.print_exc() except BaseException: traceback.print_exc() finally: geonode_layer.save()
def crawlLinks(links): articlesContent = pd.DataFrame() for link in tqdm(list(links)): try: rq = requests.get(link) domain = "{0.netloc}".format(urlsplit(link)) category = re.search(domain + '/([^/]+)', link).group(1) if rq.status_code == 200: page = bs4.BeautifulSoup(rq.text, features="html.parser") if page.find({'class': 'article-post'}): body = page.select('.article-post')[0] headline = body.select('h1')[0].text if len(body.select('h1')) else '' subtitle = None #metadata location = body.select('.location')[0].text if len(body.select('.location')) else '' articleDate = body.select('.fa-calendar')[0].text if len(body.select('.fa-calendar')) else '' views = body.select('.fa-eye')[0].text if len(body.select('.fa-eye')) else '' comments = body.select('.fa-comments-o')[0].text if len(body.select('.fa-comments-o')) else '' comments = comments.split(" ")[0] if comments != '' else '' tags = ' - '.join([tag['a'].text for tag in body.select('.tags').select('li')]) else: headline = page.select('.post-title')[0].text if len(page.select('.post-title')) else '' subtitle = page.select('.post-subtitle')[0].text if len(page.select('.post-subtitle')) else '' #metadata simpleShare = page.select('.simple-share')[0] if len(page.select('.simple-share')) > 0 else '' li = simpleShare.find_all('li') location = li[0].text if len(li) > 0 else '' articleDate = li[1].text if len(li) > 1 else '' views = li[2].text if len(li) > 2 else '' views = views.split(" ")[0] if views != '' else '' comments = li[3].text if len(li) > 3 else '' comments = comments.split(" ")[0] if comments != '' else '' tags = ' - '.join([tag.a.text for tag in page.select('.tags-widget')[0].select('li')[1:]]) if len(page.select('.tags-widget')) > 0 else '' # 30 Дек. 2019, 16:13 if articleDate != '': month_name = re.search('([а-яА-Я]+)', articleDate) if month_name is not None: month_name = month_name.group(1) articleDate = articleDate.replace(month_name, replace_month_with_digit(month_name)) articleDate = pd.to_datetime(articleDate, format='%d %m %Y, %H:%M') article_text = clean_text(page.select('.post-content')[0].select('div')[2].text) if len(page.select('.post-content')) > 0 else '' articlesContent = articlesContent.append({'link': link, 'title': clean_text(headline), 'subtitle': clean_text(subtitle), 'location': clean_text(location), 'comments': clean_text(comments), 'date': articleDate, 'views': clean_text(views), 'category': category, 'tags': clean_text(tags), 'article_text': article_text}, ignore_index=True) except: continue return articlesContent
def determine_file_path(asset_url, site_directory):
    folder = site_directory + aass.determine_storage_location(asset_url)
    filename = urlsplit(asset_url).path
    file_path = folder + '\\' + basename(filename)
    create_directory(folder)
    return file_path
def to_domain_name(url):
    value = parse.urlsplit(url)
    if value.netloc != "" and not IPAddress.is_valid(value.netloc):
        return value.netloc
    return None
def prepare_url(self, product):
    search = quote(product).replace('%20', '-')
    url = urljoin(self.base_url, search)
    return urlsplit(url)._replace(query="").geturl()
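# The _replace(query="").geturl() trick above (also used by set_base_url earlier)
# is plain SplitResult namedtuple API; a tiny illustration with a made-up URL:
from urllib.parse import urlsplit

url = 'https://shop.example.com/search/blue-widgets?page=2&sort=price'
print(urlsplit(url)._replace(query="").geturl())
# -> https://shop.example.com/search/blue-widgets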
def _get_conn(self): '''Obtain connection to server and authentication token''' log.debug('started') if 'no-ssl' in self.options: ssl_context = None else: ssl_context = self.ssl_context headers = CaseInsensitiveDict() headers['X-Auth-User'] = self.login headers['X-Auth-Key'] = self.password with HTTPConnection(self.hostname, self.port, proxy=self.proxy, ssl_context=ssl_context) as conn: conn.timeout = int(self.options.get('tcp-timeout', 20)) for auth_path in ('/v1.0', '/auth/v1.0'): log.debug('GET %s', auth_path) conn.send_request('GET', auth_path, headers=headers) resp = conn.read_response() if resp.status in (404, 412): log.debug('auth to %s failed, trying next path', auth_path) conn.discard() continue elif resp.status == 401: raise AuthorizationError(resp.reason) elif resp.status > 299 or resp.status < 200: raise HTTPError(resp.status, resp.reason, resp.headers) # Pylint can't infer SplitResult Types #pylint: disable=E1103 self.auth_token = resp.headers['X-Auth-Token'] o = urlsplit(resp.headers['X-Storage-Url']) self.auth_prefix = urllib.parse.unquote(o.path) if o.scheme == 'https': ssl_context = self.ssl_context elif o.scheme == 'http': ssl_context = None else: # fall through to scheme used for authentication pass # mock server can only handle one connection at a time # so we explicitly disconnect this connection before # opening the feature detection connection # (mock server handles both - storage and authentication) conn.disconnect() self._detect_features(o.hostname, o.port, ssl_context) conn = HTTPConnection(o.hostname, o.port, proxy=self.proxy, ssl_context=ssl_context) conn.timeout = int(self.options.get('tcp-timeout', 20)) return conn raise RuntimeError('No valid authentication path found')
# Created on 2018-08-14 15:53
"""
urlparse and urlsplit both split a URL; the difference is that urlparse also
returns a params component (the ';params' part of the path), which urlsplit
does not. params is rarely used.
"""
from urllib import parse

url = "https://www.baidu.com/p?wd=123&s=abc#a"  # example
url2 = "https://www.baidu.com/p;AAA?wd=123&s=abc#a"
result_0 = parse.urlparse(url)
result_1 = parse.urlsplit(url)
print("--- urlparse ---", parse.urlparse(url2), end="\n\n")
print("--- urlparse ---", result_0, end="\n\n")
print("--- urlsplit ---", result_1, end="\n\n")
print("type:", type(result_1), end="\n\n")
print("--------------- urlparse ---------------")
print("scheme: ", result_0.scheme)
print("netloc: ", result_0.netloc)
print("path: ", result_0.path)
print("params: ", result_0.params)
print("query: ", result_0.query)
print("fragment:", result_0.fragment)
if __name__ == "__main__": urlsplit_fuzzer = GrammarCoverageFuzzer(webbrowser_grammar, start_symbol="<urlsplit>") for i in range(5): print(urlsplit_fuzzer.fuzz()) from urllib.parse import urlsplit if __package__ is None or __package__ == "": from Timer import Timer else: from .Timer import Timer if __name__ == "__main__": with Timer() as urlsplit_timer: urlsplit('http://www.fuzzingbook.org/', 'http', True) urlsplit_timer.elapsed_time() if __name__ == "__main__": with Timer() as webbrowser_timer: webbrowser("http://www.fuzzingbook.org") webbrowser_timer.elapsed_time() if __name__ == "__main__": webbrowser_timer.elapsed_time() / urlsplit_timer.elapsed_time() # ## Synopsis if __name__ == "__main__": print('\n## Synopsis')
def url_join(base, *args): """ Helper function to join an arbitrary number of url segments together. """ # Python2 urlsplit can't handle bytearray (TypeError: unhashable type) if isinstance(base, bytearray): base = bytes(base) if isinstance(base, bytes): needbytes = True else: needbytes = False try: scheme, netloc, path, query, fragment = urlsplit(base) except UnicodeDecodeError: # PY3 urlsplit uses implicit (ASCII) encoding to decode bytes, but we # use latin1 since re-encoding after urlsplit exactly reverses decode # for any ASCII superset (needed for posixpath.join to work, # since EBCDIC codes / as \x61 [a]). This "trick" allows use of ASCII # supersets for bytes URL in the original base. base = base.decode('latin1') scheme, netloc, path, query, fragment = (x.encode('latin1') for x in urlsplit(base)) if not len(path): if needbytes: path = b('/') else: path = u('/') newargs = [] try: for x in args: if needbytes: # Although they don't need conversion, bytes args must be ASCII # as we cannot know they use the same encoding as base URL. if isinstance(x, bytes) or isinstance(x, bytearray): newargs.append(x.decode('ascii').encode('ascii')) else: if not isinstance(x, text_type): x = '%s' % x newargs.append(x.encode('ascii')) else: if isinstance(x, bytes) or isinstance(x, bytearray): newargs.append(x.decode('ascii')) else: if not isinstance(x, text_type): x = '%s' % x newargs.append(x) path = posixpath.join(path, *newargs) if PY3 and needbytes: # PY3 urlunsplit uses implicit (ASCII) encoding to decode bytes, # but we use latin1 since re-encoding after urlunsplit exactly # reverses decode for any ASCII superset (needed for posixpath.join # to work, since EBCDIC codes / as \x61 [a]). This "trick" allows # use of ASCII supersets for bytes URLs (but not args, for safety). return urlunsplit([x.decode('latin1') for x in [scheme, netloc, path, query, fragment]] ).encode('latin1') except UnicodeError: raise TypeError("Can't mix non-ASCII bytes and strings in URL paths.") return urlunsplit([scheme, netloc, path, query, fragment])
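# A short usage sketch of url_join() above (illustrative values only, and assuming
# the module-level helpers it references -- PY3, text_type, b, u -- are in scope).
# The return type follows the type of `base`.
print(url_join('https://example.com/api', 'v1', 'items', 42))
# -> https://example.com/api/v1/items/42
print(url_join(b'https://example.com/api', 'v1', 'items'))
# -> b'https://example.com/api/v1/items'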
def run(self, pool_size): """ this function manages the parallel processing of the url list using the python Pool class the function first reads the list of urls out of the page_lists directory, cleans it for known issues (eg common binary files), and issues with idna encoding (tricky!) then the page list is mapped to the process_url function and executed in parallell pool_size is defined in the run_webxray.py file, see details there """ # the list of url MUST be in the page_lists directory! try: url_list = open( os.path.dirname(os.path.abspath(__file__)) + '/../page_lists/' + self.pages_file_name, 'r') except: print( 'File "%s" does not exist, file must be in ./page_lists directory. Exiting.' % self.pages_file_name) exit() # set up sql connection used to determine if items are already in the db if self.db_engine == 'mysql': from webxray.MySQLDriver import MySQLDriver sql_driver = MySQLDriver(self.db_name) elif self.db_engine == 'postgres': from webxray.PostgreSQLDriver import PostgreSQLDriver sql_driver = PostgreSQLDriver(self.db_name) elif self.db_engine == 'sqlite': from webxray.SQLiteDriver import SQLiteDriver sql_driver = SQLiteDriver(self.db_name) # this list gets mapped to the Pool, very important! urls_to_process = set() # simple counter used solely for updates to CLI count = 0 print('\t------------------------') print('\t Building List of Pages ') print('\t------------------------') for url in url_list: # skip lines that are comments if "#" in url[0]: continue count += 1 # only do lines starting with https?:// if not (re.match('^https?://.+', url)): print("\t\t%s | %-50s Not a valid address, Skipping." % (count, url[:50])) continue # non-ascii domains will crash phantomjs, so we need to convert them to # idna/ascii/utf-8 # this requires splitting apart the url, converting the domain to idna, # and pasting it all back together split_url = urlsplit(url.strip()) idna_fixed_netloc = split_url.netloc.encode('idna').decode('utf-8') url = urlunsplit( (split_url.scheme, idna_fixed_netloc, split_url.path, split_url.query, split_url.fragment)) # if it is a m$ office or other doc, skip if re.match('.+(pdf|ppt|pptx|doc|docx|txt|rtf|xls|xlsx)$', url): print("\t\t%s | %-50s Not an HTML document, Skipping." % (count, url[:50])) continue # skip if in db already unless we are doing a timeseries if self.allow_timeseries == False: if sql_driver.page_exists(url): print("\t\t%s | %-50s Exists in DB, Skipping." % (count, url[:50])) continue # only add if not in list already if url not in urls_to_process: print("\t\t%s | %-50s Adding." % (count, url[:50])) urls_to_process.add(url) else: print("\t\t%s | %-50s Already queued, Skipping." % (count, url[:50])) # close the db connection sql_driver.close() print('\t----------------------------------') print('\t%s addresses will now be webXray\'d' % len(urls_to_process)) print('\t\tBrowser(s) are %s' % self.browser_types) print('\t\tBrowser wait time is %s seconds' % self.browser_wait) print('\t\t...you can go take a walk. ;-)') print('\t----------------------------------') # for macOS (darwin) we must specify start method as 'forkserver' # this is essentially voodoo to ward off evil spirits which # appear when large pool sizes are used on macOS # get_start_method must be set to 'allow_none', otherwise upon # checking the method it gets set (!) 
- and if we then get/set again # we get an error if sys.platform == 'darwin' and multiprocessing.get_start_method( allow_none=True) != 'forkserver': multiprocessing.set_start_method('forkserver') myPool = multiprocessing.Pool(pool_size) myPool.map(self.process_url, urls_to_process) # FYI self.print_runtime()
def url(self, url):
    # Note: urlsplit() returns '' (not None) for a missing netloc, so this
    # guard never triggers as written.
    if urlparse.urlsplit(url).netloc is None:
        return self.url(url)
    body = {"url": url}
    return self.send_session_command("POST", "url", body)
print(result) result = urlparse('http://www.baidu.com/index.html#comment', allow_fragments=False) print(result) result = urlparse('http://www.baidu.com/index.html#comment', allow_fragments=False) print(result.scheme, result[0], result.netloc, result[1], sep='\n') # url unparse data = ['http', 'www.baidu.com', 'index.html', 'user', 'id=6', 'comment'] print(urlunparse(data)) # url split result = urlsplit('www.baidu.com/index.html;user?id=5#comment') print(result) # url unsplit data = ['http', 'www.baidu.com', 'index.html', 'id=6', 'comment'] print(urlunsplit(data)) print(urljoin('http://www.baidu.com', 'FAQ.html')) print(urljoin('http://www.baidu.com', 'https://cuiqingcai.com/FAQ.html')) print( urljoin('http://www.baidu.com/about.html', 'https://cuiqingcai.com/FAQ.html')) print( urljoin('http://www.baidu.com/about.html', 'https://cuiqingcai.com/FAQ.html?question=2')) print(
# set of already crawled urls for email processed_urls = set() # a set of fetched emails emails = set() # process urls one by one from unprocessed_url queue until queue is empty while len(unprocessed_urls): # move next url from the queue to the set of processed urls url = unprocessed_urls.popleft() processed_urls.add(url) # extract base url to resolve relative links parts = urlsplit(url) base_url = "{0.scheme}://{0.netloc}".format(parts) path = url[:url.rfind("/") + 1] if "/" in parts.path else url # get url's content print("Crawling URL %s" % url) try: response = requests.get(url) except (requests.exceptions.MissingSchema, requests.exceptions.ConnectionError): # ignore pages with errors and continue with next url continue # extract all email addresses and add them into the resulting set # You may edit the regular expression as per your requirement new_emails = set(
body=db_patch_body, instance=INSTANCE_NAME, database=DB_NAME, task_id='sql_db_patch_task', ) sql_db_patch_task2 = CloudSQLPatchInstanceDatabaseOperator( body=db_patch_body, instance=INSTANCE_NAME, database=DB_NAME, task_id='sql_db_patch_task2') # [END howto_operator_cloudsql_db_patch] # ############################################## # # ### EXPORTING SQL FROM INSTANCE 1 ############ # # ############################################## # export_url_split = urlsplit(EXPORT_URI) # For export to work we need to add the Cloud SQL instance's Service Account # write access to the destination GCS bucket. # [START howto_operator_cloudsql_export_gcs_permissions] sql_gcp_add_bucket_permission_task = GCSBucketCreateAclEntryOperator( entity="user-{{ task_instance.xcom_pull(" "'sql_instance_create_task', key='service_account_email') " "}}", role="WRITER", bucket=export_url_split[1], # netloc (bucket) task_id='sql_gcp_add_bucket_permission_task', ) # [END howto_operator_cloudsql_export_gcs_permissions] # [START howto_operator_cloudsql_export]
from urllib import parse
# url = input("enter the full url")
from utils import get_lower_case_name

url = input("Enter the full URL: ")
model = input("Enter the model name: ")
qs = parse.parse_qs(parse.urlsplit(url).query).keys()
# qs = parse.parse_qs(parse.urlsplit(url).query).keys()
remove_list = ["pageIndex", "pageSize"]
remain_qs = [one for one in qs if one not in remove_list]
row_list = []
for one in remain_qs:
    # if get_lower_case_name(one) != one:
    one = f"""    {one} = filters.CharFilter(field_name="{get_lower_case_name(one)}", lookup_expr='icontains')\n"""
    row_list.append(one)

txt = f"""
class {model}Filter(filters.FilterSet):
{"".join(row_list)}
    class Meta:
        model = {model}
        fields = {remain_qs}

filter_class = {model}Filter
"""
print(txt)
def domain(self) -> str:
    url = urlsplit(self.url)
    return '{}://{}'.format(url.scheme, url.netloc)
def main(apiurl, opts, argv): repo = argv[0] arch = argv[1] build_descr = argv[2] xp = [] build_root = None cache_dir = None build_uid = '' vm_memory = config['build-memory'] vm_type = config['build-type'] vm_telnet = None build_descr = os.path.abspath(build_descr) build_type = os.path.splitext(build_descr)[1][1:] if os.path.basename(build_descr) == 'PKGBUILD': build_type = 'arch' if os.path.basename(build_descr) == 'build.collax': build_type = 'collax' if os.path.basename(build_descr) == 'appimage.yml': build_type = 'appimage' if os.path.basename(build_descr) == 'snapcraft.yaml': build_type = 'snapcraft' if build_type not in ['spec', 'dsc', 'kiwi', 'arch', 'collax', 'livebuild', 'snapcraft', 'appimage']: raise oscerr.WrongArgs( 'Unknown build type: \'%s\'. Build description should end in .spec, .dsc, .kiwi, or .livebuild. Or being named PKGBUILD, build.collax, appimage.yml or snapcraft.yaml' \ % build_type) if not os.path.isfile(build_descr): raise oscerr.WrongArgs('Error: build description file named \'%s\' does not exist.' % build_descr) buildargs = [] if not opts.userootforbuild: buildargs.append('--norootforbuild') if opts.clean: buildargs.append('--clean') if opts.noinit: buildargs.append('--noinit') if opts.nochecks: buildargs.append('--no-checks') if not opts.no_changelog: buildargs.append('--changelog') if opts.root: build_root = opts.root if opts.target: buildargs.append('--target=%s' % opts.target) if opts.threads: buildargs.append('--threads=%s' % opts.threads) if opts.jobs: buildargs.append('--jobs=%s' % opts.jobs) elif config['build-jobs'] > 1: buildargs.append('--jobs=%s' % config['build-jobs']) if opts.icecream or config['icecream'] != '0': if opts.icecream: num = opts.icecream else: num = config['icecream'] if int(num) > 0: buildargs.append('--icecream=%s' % num) xp.append('icecream') xp.append('gcc-c++') if opts.ccache: buildargs.append('--ccache') xp.append('ccache') if opts.linksources: buildargs.append('--linksources') if opts.baselibs: buildargs.append('--baselibs') if opts.debuginfo: buildargs.append('--debug') if opts._with: for o in opts._with: buildargs.append('--with=%s' % o) if opts.without: for o in opts.without: buildargs.append('--without=%s' % o) if opts.define: for o in opts.define: buildargs.append('--define=%s' % o) if config['build-uid']: build_uid = config['build-uid'] if opts.build_uid: build_uid = opts.build_uid if build_uid: buildidre = re.compile('^[0-9]{1,5}:[0-9]{1,5}$') if build_uid == 'caller': buildargs.append('--uid=%s:%s' % (os.getuid(), os.getgid())) elif buildidre.match(build_uid): buildargs.append('--uid=%s' % build_uid) else: print('Error: build-uid arg must be 2 colon separated numerics: "uid:gid" or "caller"', file=sys.stderr) return 1 if opts.vm_memory: vm_memory = opts.vm_memory if opts.vm_type: vm_type = opts.vm_type if opts.vm_telnet: vm_telnet = opts.vm_telnet if opts.alternative_project: prj = opts.alternative_project pac = '_repository' else: prj = store_read_project(os.curdir) if opts.local_package: pac = '_repository' else: pac = store_read_package(os.curdir) if opts.shell: buildargs.append("--shell") orig_build_root = config['build-root'] # make it possible to override configuration of the rc file for var in ['OSC_PACKAGECACHEDIR', 'OSC_SU_WRAPPER', 'OSC_BUILD_ROOT']: val = os.getenv(var) if val: if var.startswith('OSC_'): var = var[4:] var = var.lower().replace('_', '-') if var in config: print('Overriding config value for %s=\'%s\' with \'%s\'' % (var, config[var], val)) config[var] = val pacname = pac if pacname == 
'_repository': if not opts.local_package: try: pacname = store_read_package(os.curdir) except oscerr.NoWorkingCopy: opts.local_package = True if opts.local_package: pacname = os.path.splitext(os.path.basename(build_descr))[0] apihost = urlsplit(apiurl)[1] if not build_root: build_root = config['build-root'] if build_root == orig_build_root: # ENV var was not set build_root = config['api_host_options'][apiurl].get('build-root', build_root) try: build_root = build_root % {'repo': repo, 'arch': arch, 'project': prj, 'package': pacname, 'apihost': apihost} except: pass cache_dir = config['packagecachedir'] % {'apihost': apihost} extra_pkgs = [] if not opts.extra_pkgs: extra_pkgs = config['extra-pkgs'] elif opts.extra_pkgs != ['']: extra_pkgs = opts.extra_pkgs if xp: extra_pkgs += xp prefer_pkgs = {} build_descr_data = open(build_descr).read() # XXX: dirty hack but there's no api to provide custom defines if opts.without: s = '' for i in opts.without: s += "%%define _without_%s 1\n" % i build_descr_data = s + build_descr_data if opts._with: s = '' for i in opts._with: s += "%%define _with_%s 1\n" % i build_descr_data = s + build_descr_data if opts.define: s = '' for i in opts.define: s += "%%define %s\n" % i build_descr_data = s + build_descr_data cpiodata = None servicefile = os.path.join(os.path.dirname(build_descr), "_service") if not os.path.isfile(servicefile): servicefile = os.path.join(os.path.dirname(build_descr), "_service") if not os.path.isfile(servicefile): servicefile = None else: print('Using local _service file') buildenvfile = os.path.join(os.path.dirname(build_descr), "_buildenv." + repo + "." + arch) if not os.path.isfile(buildenvfile): buildenvfile = os.path.join(os.path.dirname(build_descr), "_buildenv") if not os.path.isfile(buildenvfile): buildenvfile = None else: print('Using local buildenv file: %s' % os.path.basename(buildenvfile)) if buildenvfile or servicefile: from .util import cpio if not cpiodata: cpiodata = cpio.CpioWrite() if opts.prefer_pkgs: print('Scanning the following dirs for local packages: %s' % ', '.join(opts.prefer_pkgs)) from .util import cpio if not cpiodata: cpiodata = cpio.CpioWrite() prefer_pkgs = get_prefer_pkgs(opts.prefer_pkgs, arch, build_type, cpiodata) if cpiodata: cpiodata.add(os.path.basename(build_descr), build_descr_data) # buildenv must come last for compatibility reasons... if buildenvfile: cpiodata.add("buildenv", open(buildenvfile).read()) if servicefile: cpiodata.add("_service", open(servicefile).read()) build_descr_data = cpiodata.get() # special handling for overlay and rsync-src/dest specialcmdopts = [] if opts.rsyncsrc or opts.rsyncdest : if not opts.rsyncsrc or not opts.rsyncdest: raise oscerr.WrongOptions('When using --rsync-{src,dest} both parameters have to be specified.') myrsyncsrc = os.path.abspath(os.path.expanduser(os.path.expandvars(opts.rsyncsrc))) if not os.path.isdir(myrsyncsrc): raise oscerr.WrongOptions('--rsync-src %s is no valid directory!' % opts.rsyncsrc) # can't check destination - its in the target chroot ;) - but we can check for sanity myrsyncdest = os.path.expandvars(opts.rsyncdest) if not os.path.isabs(myrsyncdest): raise oscerr.WrongOptions('--rsync-dest %s is no absolute path (starting with \'/\')!' % opts.rsyncdest) specialcmdopts = ['--rsync-src='+myrsyncsrc, '--rsync-dest='+myrsyncdest] if opts.overlay: myoverlay = os.path.abspath(os.path.expanduser(os.path.expandvars(opts.overlay))) if not os.path.isdir(myoverlay): raise oscerr.WrongOptions('--overlay %s is no valid directory!' 
% opts.overlay) specialcmdopts += ['--overlay='+myoverlay] bi_file = None bc_file = None bi_filename = '_buildinfo-%s-%s.xml' % (repo, arch) bc_filename = '_buildconfig-%s-%s' % (repo, arch) if is_package_dir('.') and os.access(osc.core.store, os.W_OK): bi_filename = os.path.join(os.getcwd(), osc.core.store, bi_filename) bc_filename = os.path.join(os.getcwd(), osc.core.store, bc_filename) elif not os.access('.', os.W_OK): bi_file = NamedTemporaryFile(prefix=bi_filename) bi_filename = bi_file.name bc_file = NamedTemporaryFile(prefix=bc_filename) bc_filename = bc_file.name else: bi_filename = os.path.abspath(bi_filename) bc_filename = os.path.abspath(bc_filename) try: if opts.noinit: if not os.path.isfile(bi_filename): raise oscerr.WrongOptions('--noinit is not possible, no local buildinfo file') print('Use local \'%s\' file as buildinfo' % bi_filename) if not os.path.isfile(bc_filename): raise oscerr.WrongOptions('--noinit is not possible, no local buildconfig file') print('Use local \'%s\' file as buildconfig' % bc_filename) elif opts.offline: if not os.path.isfile(bi_filename): raise oscerr.WrongOptions('--offline is not possible, no local buildinfo file') print('Use local \'%s\' file as buildinfo' % bi_filename) if not os.path.isfile(bc_filename): raise oscerr.WrongOptions('--offline is not possible, no local buildconfig file') else: print('Getting buildinfo from server and store to %s' % bi_filename) bi_text = ''.join(get_buildinfo(apiurl, prj, pac, repo, arch, specfile=build_descr_data, addlist=extra_pkgs)) if not bi_file: bi_file = open(bi_filename, 'w') # maybe we should check for errors before saving the file bi_file.write(bi_text) bi_file.flush() print('Getting buildconfig from server and store to %s' % bc_filename) bc = get_buildconfig(apiurl, prj, repo) if not bc_file: bc_file = open(bc_filename, 'w') bc_file.write(bc) bc_file.flush() except HTTPError as e: if e.code == 404: # check what caused the 404 if meta_exists(metatype='prj', path_args=(quote_plus(prj), ), template_args=None, create_new=False, apiurl=apiurl): pkg_meta_e = None try: # take care, not to run into double trouble. pkg_meta_e = meta_exists(metatype='pkg', path_args=(quote_plus(prj), quote_plus(pac)), template_args=None, create_new=False, apiurl=apiurl) except: pass if pkg_meta_e: print('ERROR: Either wrong repo/arch as parameter or a parse error of .spec/.dsc/.kiwi file due to syntax error', file=sys.stderr) else: print('The package \'%s\' does not exist - please ' \ 'rerun with \'--local-package\'' % pac, file=sys.stderr) else: print('The project \'%s\' does not exist - please ' \ 'rerun with \'--alternative-project <alternative_project>\'' % prj, file=sys.stderr) sys.exit(1) else: raise bi = Buildinfo(bi_filename, apiurl, build_type, list(prefer_pkgs.keys())) if bi.debuginfo and not (opts.disable_debuginfo or '--debug' in buildargs): buildargs.append('--debug') if opts.release: bi.release = opts.release if bi.release: buildargs.append('--release=%s' % bi.release) if opts.build_opt: buildargs += opts.build_opt # real arch of this machine # vs. # arch we are supposed to build for if bi.hostarch != None: if hostarch != bi.hostarch and not bi.hostarch in can_also_build.get(hostarch, []): print('Error: hostarch \'%s\' is required.' 
% (bi.hostarch), file=sys.stderr) return 1 elif hostarch != bi.buildarch: if not bi.buildarch in can_also_build.get(hostarch, []): # OBSOLETE: qemu_can_build should not be needed anymore since OBS 2.3 if vm_type != "emulator" and not bi.buildarch in qemu_can_build: print('Error: hostarch \'%s\' cannot build \'%s\'.' % (hostarch, bi.buildarch), file=sys.stderr) return 1 print('WARNING: It is guessed to build on hostarch \'%s\' for \'%s\' via QEMU.' % (hostarch, bi.buildarch), file=sys.stderr) rpmlist_prefers = [] if prefer_pkgs: print('Evaluating preferred packages') for name, path in prefer_pkgs.items(): if bi.has_dep(name): # We remove a preferred package from the buildinfo, so that the # fetcher doesn't take care about them. # Instead, we put it in a list which is appended to the rpmlist later. # At the same time, this will make sure that these packages are # not verified. bi.remove_dep(name) rpmlist_prefers.append((name, path)) print(' - %s (%s)' % (name, path)) print('Updating cache of required packages') urllist = [] if not opts.download_api_only: # transform 'url1, url2, url3' form into a list if 'urllist' in config: if isinstance(config['urllist'], str): re_clist = re.compile('[, ]+') urllist = [ i.strip() for i in re_clist.split(config['urllist'].strip()) ] else: urllist = config['urllist'] # OBS 1.5 and before has no downloadurl defined in buildinfo if bi.downloadurl: urllist.append(bi.downloadurl + '/%(extproject)s/%(extrepository)s/%(arch)s/%(filename)s') if opts.disable_cpio_bulk_download: urllist.append( '%(apiurl)s/build/%(project)s/%(repository)s/%(repoarch)s/%(repopackage)s/%(repofilename)s' ) fetcher = Fetcher(cache_dir, urllist = urllist, api_host_options = config['api_host_options'], offline = opts.noinit or opts.offline, http_debug = config['http_debug'], enable_cpio = not opts.disable_cpio_bulk_download, cookiejar=cookiejar) if not opts.trust_all_projects: # implicitly trust the project we are building for check_trusted_projects(apiurl, [ i for i in bi.projects.keys() if not i == prj ]) imagefile = '' imagesource = '' imagebins = [] if (not config['no_preinstallimage'] and not opts.nopreinstallimage and bi.preinstallimage and not opts.noinit and not opts.offline and (opts.clean or (not os.path.exists(build_root + "/installed-pkg") and not os.path.exists(build_root + "/.build/init_buildsystem.data")))): (imagefile, imagesource, imagebins) = get_preinstall_image(apiurl, arch, cache_dir, bi.preinstallimage) if imagefile: # remove binaries from build deps which are included in preinstall image for i in bi.deps: if i.name in imagebins: bi.remove_dep(i.name) # now update the package cache fetcher.run(bi) old_pkg_dir = None if opts.oldpackages: old_pkg_dir = opts.oldpackages if not old_pkg_dir.startswith('/') and not opts.offline: data = [ prj, pacname, repo, arch] if old_pkg_dir == '_link': p = osc.core.findpacs(os.curdir)[0] if not p.islink(): raise oscerr.WrongOptions('package is not a link') data[0] = p.linkinfo.project data[1] = p.linkinfo.package repos = osc.core.get_repositories_of_project(apiurl, data[0]) # hack for links to e.g. Factory if not data[2] in repos and 'standard' in repos: data[2] = 'standard' elif old_pkg_dir != '' and old_pkg_dir != '_self': a = old_pkg_dir.split('/') for i in range(0, len(a)): data[i] = a[i] destdir = os.path.join(cache_dir, data[0], data[2], data[3]) old_pkg_dir = None try: print("Downloading previous build from %s ..." 
% '/'.join(data)) binaries = get_binarylist(apiurl, data[0], data[2], data[3], package=data[1], verbose=True) except Exception as e: print("Error: failed to get binaries: %s" % str(e)) binaries = [] if binaries: class mytmpdir: """ temporary directory that removes itself""" def __init__(self, *args, **kwargs): self.name = mkdtemp(*args, **kwargs) _rmtree = staticmethod(shutil.rmtree) def cleanup(self): self._rmtree(self.name) def __del__(self): self.cleanup() def __exit__(self): self.cleanup() def __str__(self): return self.name old_pkg_dir = mytmpdir(prefix='.build.oldpackages', dir=os.path.abspath(os.curdir)) if not os.path.exists(destdir): os.makedirs(destdir) for i in binaries: fname = os.path.join(destdir, i.name) os.symlink(fname, os.path.join(str(old_pkg_dir), i.name)) if os.path.exists(fname): st = os.stat(fname) if st.st_mtime == i.mtime and st.st_size == i.size: continue get_binary_file(apiurl, data[0], data[2], data[3], i.name, package = data[1], target_filename = fname, target_mtime = i.mtime, progress_meter = True) if old_pkg_dir != None: buildargs.append('--oldpackages=%s' % old_pkg_dir) # Make packages from buildinfo available as repos for kiwi if build_type == 'kiwi': if os.path.exists('repos'): shutil.rmtree('repos') os.mkdir('repos') for i in bi.deps: if not i.extproject: # remove bi.deps.remove(i) continue # project pdir = str(i.extproject).replace(':/', ':') # repo rdir = str(i.extrepository).replace(':/', ':') # arch adir = i.repoarch # project/repo prdir = "repos/"+pdir+"/"+rdir # project/repo/arch pradir = prdir+"/"+adir # source fullfilename sffn = i.fullfilename filename = sffn.split("/")[-1] # target fullfilename tffn = pradir+"/"+filename if not os.path.exists(os.path.join(pradir)): os.makedirs(os.path.join(pradir)) if not os.path.exists(tffn): print("Using package: "+sffn) if opts.linksources: os.link(sffn, tffn) else: os.symlink(sffn, tffn) if prefer_pkgs: for name, path in prefer_pkgs.items(): if name == filename: print("Using prefered package: " + path + "/" + filename) os.unlink(tffn) if opts.linksources: os.link(path + "/" + filename, tffn) else: os.symlink(path + "/" + filename, tffn) # Is a obsrepositories tag used? 
try: tree = ET.parse(build_descr) except: print('could not parse the kiwi file:', file=sys.stderr) print(open(build_descr).read(), file=sys.stderr) sys.exit(1) root = tree.getroot() # product for xml in root.findall('instsource'): if xml.find('instrepo').find('source').get('path') == 'obsrepositories:/': print("obsrepositories:/ for product builds is not yet supported in osc!") sys.exit(1) # appliance expand_obsrepos=None for xml in root.findall('repository'): if xml.find('source').get('path') == 'obsrepositories:/': expand_obsrepos=True if expand_obsrepos: buildargs.append('--kiwi-parameter') buildargs.append('--ignore-repos') for xml in root.findall('repository'): if xml.find('source').get('path') == 'obsrepositories:/': for path in bi.pathes: if not os.path.isdir("repos/"+path): continue buildargs.append('--kiwi-parameter') buildargs.append('--add-repo') buildargs.append('--kiwi-parameter') buildargs.append("dir://./repos/"+path) buildargs.append('--kiwi-parameter') buildargs.append('--add-repotype') buildargs.append('--kiwi-parameter') buildargs.append('rpm-md') if xml.get('priority'): buildargs.append('--kiwi-parameter') buildargs.append('--add-repoprio='+xml.get('priority')) else: m = re.match(r"obs://[^/]+/([^/]+)/(\S+)", xml.find('source').get('path')) if not m: # short path without obs instance name m = re.match(r"obs://([^/]+)/(.+)", xml.find('source').get('path')) project=m.group(1).replace(":", ":/") repo=m.group(2) buildargs.append('--kiwi-parameter') buildargs.append('--add-repo') buildargs.append('--kiwi-parameter') buildargs.append("dir://./repos/"+project+"/"+repo) buildargs.append('--kiwi-parameter') buildargs.append('--add-repotype') buildargs.append('--kiwi-parameter') buildargs.append('rpm-md') if xml.get('priority'): buildargs.append('--kiwi-parameter') buildargs.append('--add-repopriority='+xml.get('priority')) if vm_type == "xen" or vm_type == "kvm" or vm_type == "lxc": print('Skipping verification of package signatures due to secure VM build') elif bi.pacsuffix == 'rpm': if opts.no_verify: print('Skipping verification of package signatures') else: print('Verifying integrity of cached packages') verify_pacs(bi) elif bi.pacsuffix == 'deb': if opts.no_verify or opts.noinit: print('Skipping verification of package signatures') else: print('WARNING: deb packages get not verified, they can compromise your system !') else: print('WARNING: unknown packages get not verified, they can compromise your system !') for i in bi.deps: if i.hdrmd5: from .util import packagequery hdrmd5 = packagequery.PackageQuery.queryhdrmd5(i.fullfilename) if not hdrmd5: print("Error: cannot get hdrmd5 for %s" % i.fullfilename) sys.exit(1) if hdrmd5 != i.hdrmd5: print("Error: hdrmd5 mismatch for %s: %s != %s" % (i.fullfilename, hdrmd5, i.hdrmd5)) sys.exit(1) print('Writing build configuration') if build_type == 'kiwi': rpmlist = [ '%s %s\n' % (i.name, i.fullfilename) for i in bi.deps if not i.noinstall ] else: rpmlist = [ '%s %s\n' % (i.name, i.fullfilename) for i in bi.deps ] for i in imagebins: rpmlist.append('%s preinstallimage\n' % i) rpmlist += [ '%s %s\n' % (i[0], i[1]) for i in rpmlist_prefers ] if imagefile: rpmlist.append('preinstallimage: %s\n' % imagefile) if imagesource: rpmlist.append('preinstallimagesource: %s\n' % imagesource) rpmlist.append('preinstall: ' + ' '.join(bi.preinstall_list) + '\n') rpmlist.append('vminstall: ' + ' '.join(bi.vminstall_list) + '\n') rpmlist.append('runscripts: ' + ' '.join(bi.runscripts_list) + '\n') if build_type != 'kiwi' and bi.noinstall_list: 
rpmlist.append('noinstall: ' + ' '.join(bi.noinstall_list) + '\n') if build_type != 'kiwi' and bi.installonly_list: rpmlist.append('installonly: ' + ' '.join(bi.installonly_list) + '\n') rpmlist_file = NamedTemporaryFile(prefix='rpmlist.') rpmlist_filename = rpmlist_file.name rpmlist_file.writelines(rpmlist) rpmlist_file.flush() subst = { 'repo': repo, 'arch': arch, 'project' : prj, 'package' : pacname } vm_options = [] # XXX check if build-device present my_build_device = '' if config['build-device']: my_build_device = config['build-device'] % subst else: # obs worker uses /root here but that collides with the # /root directory if the build root was used without vm # before my_build_device = build_root + '/img' need_root = True if vm_type: if config['build-swap']: my_build_swap = config['build-swap'] % subst else: my_build_swap = build_root + '/swap' vm_options = [ '--vm-type=%s' % vm_type ] if vm_telnet: vm_options += [ '--vm-telnet=' + vm_telnet ] if vm_memory: vm_options += [ '--memory=' + vm_memory ] if vm_type != 'lxc': vm_options += [ '--vm-disk=' + my_build_device ] vm_options += [ '--vm-swap=' + my_build_swap ] vm_options += [ '--logfile=%s/.build.log' % build_root ] if vm_type == 'kvm': if os.access(build_root, os.W_OK) and os.access('/dev/kvm', os.W_OK): # so let's hope there's also an fstab entry need_root = False if config['build-kernel']: vm_options += [ '--vm-kernel=' + config['build-kernel'] ] if config['build-initrd']: vm_options += [ '--vm-initrd=' + config['build-initrd'] ] build_root += '/.mount' if config['build-vmdisk-rootsize']: vm_options += [ '--vmdisk-rootsize=' + config['build-vmdisk-rootsize'] ] if config['build-vmdisk-swapsize']: vm_options += [ '--vmdisk-swapsize=' + config['build-vmdisk-swapsize'] ] if config['build-vmdisk-filesystem']: vm_options += [ '--vmdisk-filesystem=' + config['build-vmdisk-filesystem'] ] if config['build-vm-user']: vm_options += [ '--vm-user=' + config['build-vm-user'] ] if opts.preload: print("Preload done for selected repo/arch.") sys.exit(0) print('Running build') cmd = [ config['build-cmd'], '--root='+build_root, '--rpmlist='+rpmlist_filename, '--dist='+bc_filename, '--arch='+bi.buildarch ] cmd += specialcmdopts + vm_options + buildargs cmd += [ build_descr ] if need_root: sucmd = config['su-wrapper'].split() if sucmd[0] == 'su': if sucmd[-1] == '-c': sucmd.pop() cmd = sucmd + ['-s', cmd[0], 'root', '--' ] + cmd[1:] else: cmd = sucmd + cmd # change personality, if needed if hostarch != bi.buildarch and bi.buildarch in change_personality: cmd = [ change_personality[bi.buildarch] ] + cmd try: rc = run_external(cmd[0], *cmd[1:]) if rc: print() print('The buildroot was:', build_root) sys.exit(rc) except KeyboardInterrupt as i: print("keyboard interrupt, killing build ...") cmd.append('--kill') run_external(cmd[0], *cmd[1:]) raise i pacdir = os.path.join(build_root, '.build.packages') if os.path.islink(pacdir): pacdir = os.readlink(pacdir) pacdir = os.path.join(build_root, pacdir) if os.path.exists(pacdir): (s_built, b_built) = get_built_files(pacdir, bi.buildtype) print() if s_built: print(s_built) print() print(b_built) if opts.keep_pkgs: for i in b_built.splitlines() + s_built.splitlines(): shutil.copy2(i, os.path.join(opts.keep_pkgs, os.path.basename(i))) if bi_file: bi_file.close() if bc_file: bc_file.close() rpmlist_file.close()
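# Illustrative only: the main() above expands config templates such as
# 'build-root' and 'packagecachedir' with Python %-style dict substitution.
# The template string and sample values below are assumptions chosen to show
# the mechanism, not values taken from any real osc configuration.
build_root_template = '/var/tmp/build-root/%(repo)s-%(arch)s'
subst = {'repo': 'openSUSE_Tumbleweed', 'arch': 'x86_64',
         'project': 'home:user', 'package': 'hello', 'apihost': 'api.example.org'}
print(build_root_template % subst)
# -> /var/tmp/build-root/openSUSE_Tumbleweed-x86_64 (unused keys are simply ignored)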
def _set_network_proxy():
    if conf.proxy:
        debug_msg = "setting the HTTP/SOCKS proxy for all network requests"
        logger.debug(debug_msg)
        try:
            _ = urlsplit(conf.proxy)
        except Exception as ex:
            err_msg = "invalid proxy address '{0}' ('{1}')".format(conf.proxy, str(ex))
            raise PocsuiteSyntaxException(err_msg)
        hostname_port = _.netloc.split(":")
        scheme = _.scheme.upper()
        hostname = hostname_port[0]
        port = None
        username = None
        password = None
        if len(hostname_port) == 2:
            try:
                port = int(hostname_port[1])
            except Exception:
                pass
        if not all((scheme, hasattr(PROXY_TYPE, scheme), hostname, port)):
            err_msg = "proxy value must be in format '({0})://address:port'".format(
                "|".join(_[0].lower() for _ in get_public_type_members(PROXY_TYPE)))
            raise PocsuiteSyntaxException(err_msg)
        if conf.proxy_cred:
            _ = re.search(r"\A(.*?):(.*?)\Z", conf.proxy_cred)
            if not _:
                err_msg = "proxy authentication credentials "
                err_msg += "value must be in format username:password"
                raise PocsuiteSyntaxException(err_msg)
            else:
                username = _.group(1)
                password = _.group(2)
        if scheme in (PROXY_TYPE.SOCKS4, PROXY_TYPE.SOCKS5, PROXY_TYPE.SOCKS5H):
            socks.set_default_proxy(
                socks.PROXY_TYPE_SOCKS4 if scheme == PROXY_TYPE.SOCKS4 else socks.PROXY_TYPE_SOCKS5,
                hostname, port, username=username, password=password,
                rdns=True if scheme == PROXY_TYPE.SOCKS5H else False,
            )
            socket.socket = socks.socksocket
            conf.proxies = {
                "http": conf.proxy,
                "https": conf.proxy,
            }
        else:
            if conf.proxy_cred:
                proxy_string = "{0}@".format(conf.proxy_cred)
            else:
                proxy_string = ""
            proxy_string = "{0}{1}:{2}".format(proxy_string, hostname, port)
            conf.proxies = {"http": proxy_string, "https": proxy_string}
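# Illustrative only: a standalone sketch of the scheme/host/port parsing that
# _set_network_proxy() performs with urlsplit; the sample proxy address is an
# assumption, not a value from any real configuration.
from urllib.parse import urlsplit

proxy = "socks5://127.0.0.1:1080"
parts = urlsplit(proxy)
hostname, _, port = parts.netloc.partition(":")
print(parts.scheme.upper(), hostname, int(port) if port else None)
# -> SOCKS5 127.0.0.1 1080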
def make_abs_url(url):
    """Normalise the URL path via os.path.abspath and drop any query and fragment."""
    pr = parse.urlsplit(url)
    return parse.urlunsplit(parse.SplitResult(pr.scheme, pr.netloc, path.abspath(pr.path), '', ''))
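# Illustrative only: a self-contained check of the behaviour of make_abs_url()
# above on a POSIX system; the example URL is an assumption.
from os import path
from urllib import parse


def make_abs_url(url):
    pr = parse.urlsplit(url)
    return parse.urlunsplit(parse.SplitResult(pr.scheme, pr.netloc, path.abspath(pr.path), '', ''))


print(make_abs_url('https://example.org/a/b/../c?x=1#frag'))
# -> https://example.org/a/c  ('..' is collapsed; the query and fragment are dropped)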
from urllib import parse

url = 'https://www.baidu.com/s?wd=python'
result = parse.urlparse(url)
print(result)
result = parse.urlsplit(url)
print(result)
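# Expected output of the snippet above: urlparse() returns a 6-field
# ParseResult with a separate params field, while urlsplit() returns a
# 5-field SplitResult without it.
# ParseResult(scheme='https', netloc='www.baidu.com', path='/s', params='', query='wd=python', fragment='')
# SplitResult(scheme='https', netloc='www.baidu.com', path='/s', query='wd=python', fragment='')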