def __call__(self, value):
    """
    Validate that ``value`` looks like an email address.

    The text before the last '@' must match ``self.user_regex``. The text
    after it must either be listed in ``self.domain_allowlist`` or be
    accepted by ``self.validate_domain_part()``, as given or after
    conversion with ``punycode()``.

    Raise ValidationError (``self.message`` / ``self.code``, with the
    offending value in ``params``) in every other case.
    """
    def rejection():
        return ValidationError(self.message, code=self.code, params={'value': value})

    if not value or '@' not in value:
        raise rejection()

    # Split on the last '@' only; the user part may itself contain '@'.
    local, _, host = value.rpartition('@')
    if not self.user_regex.match(local):
        raise rejection()

    if host in self.domain_allowlist or self.validate_domain_part(host):
        return

    # The literal domain was rejected; try it as a possible IDN domain-part.
    try:
        ascii_host = punycode(host)
    except UnicodeError:
        ascii_host = None
    if ascii_host is not None and self.validate_domain_part(ascii_host):
        return
    raise rejection()
def smart_urlquote(url):
    """Quote a URL if it isn't already quoted."""
    # Tilde is part of RFC3986 Unreserved Characters
    # https://tools.ietf.org/html/rfc3986#section-2.3
    # See also https://bugs.python.org/issue16285
    safe_chars = RFC3986_SUBDELIMS + RFC3986_GENDELIMS + '~'

    def requote(text):
        # Unquote first so already-quoted input is not quoted twice.
        return quote(unquote(text), safe=safe_chars)

    # Handle IDN before quoting.
    try:
        parts = urlsplit(url)
    except ValueError:
        # invalid IPv6 URL (normally square brackets in hostname part).
        return requote(url)
    try:
        ace_netloc = punycode(parts.netloc)  # IDN -> ACE
    except UnicodeError:  # invalid domain part
        return requote(url)

    query = parts.query
    if query:
        # Unquote keys and values separately so that separators embedded in
        # query values are not mistaken for querystring separators.
        # See #22267.
        pairs = [
            (unquote(key), unquote(val))
            for key, val in parse_qsl(query, keep_blank_values=True)
        ]
        # urlencode will take care of quoting
        query = urlencode(pairs)

    return urlunsplit((
        parts.scheme,
        ace_netloc,
        requote(parts.path),
        query,
        requote(parts.fragment),
    ))
def sanitize_address(addr, encoding):
    """
    Format a pair of (name, address) or an email address string.

    ``addr`` is either a ``(name, address)`` tuple or anything
    ``force_str()`` can turn into a single mailbox string. The display name
    is encoded with ``Header(nm, encoding)``; the local part is only
    Header-encoded when it is not pure ASCII; the domain goes through
    ``punycode()``.

    Raise ValueError if a string address cannot be parsed completely as one
    mailbox, or if any address part contains a CR or LF character.
    """
    # Imported locally so this block does not rely on a module-level import.
    from email.utils import formataddr

    if isinstance(addr, tuple):
        nm, address = addr
        localpart, domain = address.rsplit("@", 1)
    else:
        addr = force_str(addr)
        try:
            token, rest = parser.get_mailbox(addr)
        except (HeaderParseError, ValueError, IndexError):
            raise ValueError('Invalid address "%s"' % addr)
        if rest:
            # The entire email address must be parsed.
            raise ValueError(
                'Invalid address; only %s could be parsed from "%s"' % (token, addr)
            )
        nm = token.display_name or ""
        localpart = token.local_part
        domain = token.domain or ""

    # Reject raw newlines up front (header injection). This has to happen
    # before encoding because Header.encode() may itself fold long values
    # with a newline.
    address_parts = nm + localpart + domain
    if "\n" in address_parts or "\r" in address_parts:
        raise ValueError("Invalid address; address parts cannot contain newlines.")

    nm = Header(nm, encoding).encode()
    # Avoid UTF-8 encode, if it's possible.
    try:
        localpart.encode("ascii")
    except UnicodeEncodeError:
        localpart = Header(localpart, encoding).encode()
    domain = punycode(domain)

    # Only the addr-spec goes through Address(): it rejects CR/LF in its
    # arguments, which would make a folded (long) encoded display name fail.
    parsed_address = Address(username=localpart, domain=domain)
    return formataddr((nm, parsed_address.addr_spec))
def __call__(self, value):
    """
    Validate ``value`` as a URL; raise ValidationError if it is not one.

    Checks, in order: the value is a ``str`` without any of
    ``self.unsafe_chars``; its scheme is in ``self.schemes``; it can be
    split by ``urlsplit()``; it passes the parent validator, either as given
    or with the netloc converted by ``punycode()``; a bracketed host is a
    valid IPv6 address; and the hostname exists and is at most 253
    characters long.
    """
    def invalid():
        return ValidationError(self.message, code=self.code, params={'value': value})

    if not isinstance(value, str) or self.unsafe_chars.intersection(value):
        raise invalid()

    # Check if the scheme is valid.
    if value.partition('://')[0].lower() not in self.schemes:
        raise invalid()

    # Then check full URL
    try:
        parts = urlsplit(value)
    except ValueError:
        raise invalid()

    try:
        super().__call__(value)
    except ValidationError as first_error:
        # Trivial case failed. Try for possible IDN domain
        if not value:
            raise
        try:
            ace_netloc = punycode(parts.netloc)  # IDN -> ACE
        except UnicodeError:  # invalid domain part
            raise first_error
        super().__call__(urlunsplit(parts._replace(netloc=ace_netloc)))
    else:
        # Now verify IPv6 in the netloc part
        bracketed = re.search(r'^\[(.+)\](?::\d{1,5})?$', parts.netloc)
        if bracketed:
            try:
                validate_ipv6_address(bracketed[1])
            except ValidationError:
                raise invalid()

    # The maximum length of a full host name is 253 characters per RFC 1034
    # section 3.1. It's defined to be 255 bytes or less, but this includes
    # one byte for the length of the name and one byte for the trailing dot
    # that's used to indicate absolute names in DNS.
    hostname = parts.hostname
    if hostname is None or len(hostname) > 253:
        raise invalid()
def handle_word(
    self,
    word,
    *,
    safe_input,
    trim_url_limit=None,
    nofollow=False,
    autoescape=False,
):
    """
    Return ``word`` turned into an HTML link if it looks like a URL or an
    email address; otherwise return it as plain (optionally escaped) text.

    ``safe_input`` says the word is already safe HTML, ``autoescape`` asks
    for escaping of unsafe text, ``nofollow`` adds ``rel="nofollow"`` to URL
    links (never to mailto links), and ``trim_url_limit`` is forwarded to
    ``self.trim_url()`` for the link text.

    An email-like word whose domain cannot be converted by ``punycode()``
    is not linked and goes through the same escaping as any other
    non-link word.
    """
    if "." in word or "@" in word or ":" in word:
        # lead: Punctuation trimmed from the beginning of the word.
        # middle: State of the word.
        # trail: Punctuation trimmed from the end of the word.
        lead, middle, trail = self.trim_punctuation(word)
        # Make URL we want to point to.
        url = None
        nofollow_attr = ' rel="nofollow"' if nofollow else ""
        if self.simple_url_re.match(middle):
            url = smart_urlquote(html.unescape(middle))
        elif self.simple_url_2_re.match(middle):
            url = smart_urlquote("http://%s" % html.unescape(middle))
        elif ":" not in middle and self.is_email_simple(middle):
            local, domain = middle.rsplit("@", 1)
            try:
                domain = punycode(domain)
            except UnicodeError:
                # Not linkable. Leave url unset so the word still goes
                # through the escaping below instead of being returned raw.
                pass
            else:
                url = self.mailto_template.format(local=local, domain=domain)
                nofollow_attr = ""
        # Make link.
        if url:
            trimmed = self.trim_url(middle, limit=trim_url_limit)
            if autoescape and not safe_input:
                lead, trail = escape(lead), escape(trail)
                trimmed = escape(trimmed)
            middle = self.url_template.format(
                href=escape(url),
                attrs=nofollow_attr,
                url=trimmed,
            )
            return mark_safe(f"{lead}{middle}{trail}")

    # Not a link: pass the word through, honoring the escaping settings.
    if safe_input:
        return mark_safe(word)
    if autoescape:
        return escape(word)
    return word
def __call__(self, value):
    """
    Validate ``value`` as a URL; raise ValidationError if it is not one.

    Checks, in order: when the value contains '://', its scheme is in
    ``self.schemes``; it can be split by ``urlsplit()``; it passes the
    parent validator, either as given or with the netloc converted by
    ``punycode()``; a bracketed host is a valid IPv6 address; and the
    hostname, when there is one, is at most 253 characters long.

    Every failure is reported as ValidationError carrying the offending
    value in ``params``.
    """
    def invalid():
        return ValidationError(self.message, code=self.code, params={'value': value})

    # Check first if the scheme is valid (only when there is a scheme).
    scheme, separator, _ = value.partition('://')
    if separator and scheme.lower() not in self.schemes:
        raise invalid()

    # Split once, up front, so that a ValueError from urlsplit() (for
    # example, "Invalid IPv6 URL") never escapes as a non-validation error.
    try:
        split_url = urlsplit(value)
    except ValueError:
        raise invalid()

    # Then check full URL
    try:
        super().__call__(value)
    except ValidationError as e:
        # Trivial case failed. Try for possible IDN domain
        if not value:
            raise
        try:
            netloc = punycode(split_url.netloc)  # IDN -> ACE
        except UnicodeError:  # invalid domain part
            raise e
        super().__call__(urlunsplit(split_url._replace(netloc=netloc)))
    else:
        # Now verify IPv6 in the netloc part. The optional port may have a
        # single digit, so allow 1 to 5 digits.
        host_match = re.search(r'^\[(.+)\](?::\d{1,5})?$', split_url.netloc)
        if host_match:
            try:
                validate_ipv6_address(host_match[1])
            except ValidationError:
                raise invalid()

    # The maximum length of a full host name is 253 characters per RFC 1034
    # section 3.1. It's defined to be 255 bytes or less, but this includes
    # one byte for the length of the name and one byte for the trailing dot
    # that's used to indicate absolute names in DNS.
    # Measure the host name itself: the netloc also contains any
    # credentials and port, which are not part of the host name.
    hostname = split_url.hostname
    if hostname is not None and len(hostname) > 253:
        raise invalid()
def sanitize_address(addr, encoding):
    """
    Format a pair of (name, address) or an email address string.

    Raise ValueError if a string address cannot be parsed completely as one
    mailbox, or if any address part contains a newline character.
    """
    if isinstance(addr, tuple):
        display_name, mailbox = addr
        local, host = mailbox.rsplit('@', 1)
    else:
        addr = force_str(addr)
        try:
            token, rest = parser.get_mailbox(addr)
        except (HeaderParseError, ValueError, IndexError):
            raise ValueError('Invalid address "%s"' % addr)
        if rest:
            # The entire email address must be parsed.
            raise ValueError(
                'Invalid address; only %s could be parsed from "%s"'
                % (token, addr)
            )
        display_name = token.display_name or ''
        local = token.local_part
        host = token.domain or ''

    joined = display_name + local + host
    if any(newline in joined for newline in ('\n', '\r')):
        raise ValueError('Invalid address; address parts cannot contain newlines.')

    # Avoid UTF-8 encode, if it's possible.
    try:
        display_name.encode('ascii')
    except UnicodeEncodeError:
        display_name = Header(display_name, encoding).encode()
    else:
        display_name = Header(display_name).encode()

    try:
        local.encode('ascii')
    except UnicodeEncodeError:
        local = Header(local, encoding).encode()

    mailbox_spec = Address(username=local, domain=punycode(host)).addr_spec
    return formataddr((display_name, mailbox_spec))
def handle_word(self, word):
    """
    Return ``word`` turned into an HTML link if it looks like a URL or an
    email address; otherwise return it as plain (optionally escaped) text.

    Reads ``self.safe_input``, ``self.autoescape`` and ``self.nofollow``.
    URL links get ``rel="nofollow"`` when ``self.nofollow`` is set; mailto
    links never do.

    An email-like word whose domain cannot be converted by ``punycode()``
    is not linked and goes through the same escaping as any other
    non-link word.
    """
    if '.' in word or '@' in word or ':' in word:
        # lead: Punctuation trimmed from the beginning of the word.
        # middle: State of the word.
        # trail: Punctuation trimmed from the end of the word.
        lead, middle, trail = self.trim_punctuation(word)
        # Make URL we want to point to.
        url = None
        nofollow_attr = ' rel="nofollow"' if self.nofollow else ''
        if self.simple_url_re.match(middle):
            url = smart_urlquote(html.unescape(middle))
        elif self.simple_url_2_re.match(middle):
            url = smart_urlquote('http://%s' % html.unescape(middle))
        elif ':' not in middle and self.is_email_simple(middle):
            local, domain = middle.rsplit('@', 1)
            try:
                domain = punycode(domain)
            except UnicodeError:
                # Not linkable. Leave url unset so the word still goes
                # through the escaping below instead of being returned raw.
                pass
            else:
                url = self.mailto_template.format(local=local, domain=domain)
                nofollow_attr = ''
        # Make link.
        if url:
            trimmed = self.trim_url(middle)
            if self.autoescape and not self.safe_input:
                lead, trail = escape(lead), escape(trail)
                trimmed = escape(trimmed)
            middle = self.url_template.format(
                href=escape(url),
                attrs=nofollow_attr,
                url=trimmed,
            )
            return mark_safe(f'{lead}{middle}{trail}')

    # Not a link: pass the word through, honoring the escaping settings.
    if self.safe_input:
        return mark_safe(word)
    if self.autoescape:
        return escape(word)
    return word
def __call__(self, value):
    """
    Validate that ``value`` looks like an email address.

    The text before the last "@" must match ``self.user_regex``. The text
    after it must either be listed in ``self.domain_whitelist`` or be
    accepted by ``self.validate_domain_part()``, as given or after
    conversion with ``punycode()``.

    Raise ValidationError (``self.message`` / ``self.code``) in every other
    case. The offending value is passed in ``params`` so that a message
    containing ``%(value)s`` can be formatted.
    """
    def invalid():
        return ValidationError(self.message, code=self.code, params={"value": value})

    if not value or "@" not in value:
        raise invalid()

    user_part, domain_part = value.rsplit("@", 1)

    if not self.user_regex.match(user_part):
        raise invalid()

    if domain_part in self.domain_whitelist or self.validate_domain_part(domain_part):
        return

    # Try for possible IDN domain-part
    try:
        domain_part = punycode(domain_part)
    except UnicodeError:
        pass
    else:
        if self.validate_domain_part(domain_part):
            return
    raise invalid()
def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
    """
    Convert any URLs in text into clickable links.

    Works on http://, https://, www. links, and also on links ending in one of
    the original seven gTLDs (.com, .edu, .gov, .int, .mil, .net, and .org).
    Links can have trailing punctuation (periods, commas, close-parens) and
    leading punctuation (opening parens) and it'll still do the right thing.

    If trim_url_limit is not None, truncate the URLs in the link text longer
    than this limit to trim_url_limit - 1 characters and append an ellipsis.

    If nofollow is True, give the links a rel="nofollow" attribute.

    If autoescape is True, autoescape the link text and URLs.

    An email-like word whose domain cannot be converted by punycode() is not
    linked and is escaped like any other non-link word.
    """
    safe_input = isinstance(text, SafeData)

    def trim_url(x, limit=trim_url_limit):
        if limit is None or len(x) <= limit:
            return x
        return '%s…' % x[:max(0, limit - 1)]

    def trim_punctuation(lead, middle, trail):
        """
        Trim trailing and wrapping punctuation from `middle`. Return the items
        of the new state.
        """
        # Continue trimming until middle remains unchanged.
        trimmed_something = True
        while trimmed_something:
            trimmed_something = False
            # Trim wrapping punctuation.
            for opening, closing in WRAPPING_PUNCTUATION:
                if middle.startswith(opening):
                    middle = middle[len(opening):]
                    lead += opening
                    trimmed_something = True
                # Keep parentheses at the end only if they're balanced.
                if (middle.endswith(closing) and
                        middle.count(closing) == middle.count(opening) + 1):
                    middle = middle[:-len(closing)]
                    trail = closing + trail
                    trimmed_something = True
            # Trim trailing punctuation (after trimming wrapping punctuation,
            # as encoded entities contain ';'). Unescape entities to avoid
            # breaking them by removing ';'.
            middle_unescaped = html.unescape(middle)
            stripped = middle_unescaped.rstrip(TRAILING_PUNCTUATION_CHARS)
            if middle_unescaped != stripped:
                trail = middle[len(stripped):] + trail
                middle = middle[:len(stripped) - len(middle_unescaped)]
                trimmed_something = True
        return lead, middle, trail

    def is_email_simple(value):
        """Return True if value looks like an email address."""
        # An @ must be in the middle of the value.
        if '@' not in value or value.startswith('@') or value.endswith('@'):
            return False
        try:
            p1, p2 = value.split('@')
        except ValueError:
            # value contains more than one @.
            return False
        # Dot must be in p2 (e.g. example.com)
        if '.' not in p2 or p2.startswith('.'):
            return False
        return True

    words = word_split_re.split(str(text))
    for i, word in enumerate(words):
        if '.' in word or '@' in word or ':' in word:
            # lead: Current punctuation trimmed from the beginning of the word.
            # middle: Current state of the word.
            # trail: Current punctuation trimmed from the end of the word.
            lead, middle, trail = '', word, ''
            # Deal with punctuation.
            lead, middle, trail = trim_punctuation(lead, middle, trail)

            # Make URL we want to point to.
            url = None
            nofollow_attr = ' rel="nofollow"' if nofollow else ''
            if simple_url_re.match(middle):
                url = smart_urlquote(html.unescape(middle))
            elif simple_url_2_re.match(middle):
                url = smart_urlquote('http://%s' % html.unescape(middle))
            elif ':' not in middle and is_email_simple(middle):
                local, domain = middle.rsplit('@', 1)
                try:
                    domain = punycode(domain)
                except UnicodeError:
                    # Not linkable. Leave url unset so the word still goes
                    # through the escaping below instead of staying raw.
                    pass
                else:
                    url = 'mailto:%s@%s' % (local, domain)
                    nofollow_attr = ''

            # Make link.
            if url:
                trimmed = trim_url(middle)
                if autoescape and not safe_input:
                    lead, trail = escape(lead), escape(trail)
                    trimmed = escape(trimmed)
                middle = '<a href="%s"%s>%s</a>' % (escape(url), nofollow_attr, trimmed)
                words[i] = mark_safe('%s%s%s' % (lead, middle, trail))
                continue

        # Not a link: keep the word, honoring the escaping settings.
        if safe_input:
            words[i] = mark_safe(word)
        elif autoescape:
            words[i] = escape(word)
    return ''.join(words)
def get_fqdn(self):
    """
    Return the value of ``punycode(socket.getfqdn())``, computed on the
    first call and cached on the instance as ``_fqdn`` afterwards.
    """
    if hasattr(self, '_fqdn'):
        return self._fqdn
    hostname = socket.getfqdn()
    self._fqdn = punycode(hostname)
    return self._fqdn
def handle(self, *args, **kwargs):
    """
    Scan every Site and store what was found.

    For each Site (ordered by name) the URL is passed through ``punycode()``
    and then:

    * ``whatis_query()`` is called and its JSON result is stored app by app
      with ``Framework.objects.update_or_create()``;
    * ``GetHostProvider`` is queried and its provider / ip / source are
      stored with ``Provider.objects.update_or_create()``;
    * ``ping_geo()`` is called and its result is stored with
      ``GeoInfo.objects.update_or_create()``.

    A failure to store one record is printed and does not stop the run.
    Progress is printed to stdout; logging is configured to write to
    ``siteinfo.log``.

    NOTE(review): presumably the entry point of a Django management
    command; the enclosing class is not visible here.
    """
    # read sites from site
    sites = Site.objects.all().order_by('name')
    print("Found {0} sites".format(len(sites)))

    logfile = 'siteinfo.log'
    logging.basicConfig(filename=logfile, level=logging.INFO)
    logging.info('Starting logfile')

    # loop through sites and scan
    for querysite in sites:
        print("Url: {0}".format(querysite.url))
        url_encoded = punycode(querysite.url)
        print("Encoded url: {0}".format(url_encoded))

        # enter whatis_query result into database
        scan_result = whatis_query(url_encoded)
        print("Count {0}, type: {1}".format(len(scan_result), type(scan_result)))
        scan_json = json.loads(scan_result)
        for key, value in scan_json.items():
            print("Checking site: {0}".format(key))
            print("Type: {0}".format(type(value)))
            for app in value:
                # check if value already exists, then update existing
                insertdata = app.copy()
                # A missing or empty version is stored as "N/A".
                if not insertdata.get('ver'):
                    insertdata['ver'] = "N/A"
                print(insertdata)
                try:
                    obj, created = Framework.objects.update_or_create(
                        app=app['app'], site=querysite, defaults=insertdata)
                    if created:
                        print('created new databaseentry')
                    elif obj:
                        print('updated existing databasentry')
                    else:
                        print("Couldn't update at all")
                # Exception, not BaseException: Ctrl-C and SystemExit must
                # still be able to stop the command.
                except Exception as e:
                    print("Couldn't insert: {0}. \n Cause: {1}".format(
                        insertdata, e))

        # check provider
        print("Checking provider for {0}".format(url_encoded))
        hostprovider = GetHostProvider(address=url_encoded)
        print("Provider: {0}, Ip: {1}, source: {2}".format(
            hostprovider.provider, hostprovider.ip, hostprovider.source))

        # add provider to database
        provider_insertdata = {
            'provider': hostprovider.provider,
            'ip': hostprovider.ip,
            'source': hostprovider.source
        }
        try:
            provider_obj, provider_created = Provider.objects.update_or_create(
                site=querysite, defaults=provider_insertdata)
            if provider_created:
                print('created new providerdatabase for {0}'.format(
                    url_encoded))
            else:
                print('Updated existing providerentry')
        except Exception as e:
            print("Couldn't insert {0}. \n Cause: {1}".format(
                provider_insertdata, e))

        # check geodata
        geo_insertdata = ping_geo(url_encoded)

        # add geodata to database
        print(geo_insertdata)
        try:
            geo_obj, geo_created = GeoInfo.objects.update_or_create(
                site=querysite, defaults=geo_insertdata)
            # Report on the geo record itself, not on the provider record
            # stored above.
            if geo_created:
                print('created new geodatabase for {0}'.format(
                    url_encoded))
            else:
                print('Updated existing geoentry')
        except Exception as e:
            print("Couldn't insert {0}. \n Cause: {1}".format(
                geo_insertdata, e))