def get_domain(self, address):
    try:
        dom = re.search(r'(//|^)([a-z0-9.-]*[a-z]\.[a-z][a-z-]*?(?:[/:].*|$))', address).group(2)
        if not self.in_whitelist(dom):
            if get_tld(url_normalize(dom, **self.url_kwargs), fail_silently=True):
                return url_normalize(dom, **self.url_kwargs)
        return None
    except AttributeError:
        return None
    except UnicodeError:
        # url_normalize's error; happens when something weird matches the regex
        self.logger.info("Caught UnicodeError on %r.", address)
        return None
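For reference, a minimal standalone sketch of what the domain-extraction regex above matches; only the `re` module is assumed, and the whitelist/TLD checks are elided:

import re

# Same pattern as in get_domain() above; group 2 captures the host plus any
# trailing port or path, with or without a scheme prefix.
_DOMAIN_RE = r'(//|^)([a-z0-9.-]*[a-z]\.[a-z][a-z-]*?(?:[/:].*|$))'

for address in ("https://example.com/page", "example.org:8080/x"):
    match = re.search(_DOMAIN_RE, address)
    print(match.group(2) if match else None)
# -> example.com/page
# -> example.org:8080/x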
def delete(self):
    '''Removes a feed.'''
    feedurl = self.request.body
    query = Feed.gql("WHERE link = :1", url_normalize(feedurl))
    feed = query.fetch(1)
    db.delete(feed)
    self.response.out.write("Deleted\n")
def save(kv, db):
    url = request.forms['url']
    if not url:
        return {'err': 'Please enter a URL'}
    url = url_normalize(url)
    if not url:
        return {'err': 'Please enter a valid URL'}
    surl = urlsplit(url)
    if surl.netloc.endswith(BLACKLIST):
        return {'err': 'Unsupported domain'}
    code = hashto62(url)
    key = base62_encode(code)
    sql = """
        INSERT INTO `taobb_urls`
            (`id`, `key`, `url`, `gmt_create`, `gmt_modified`)
        VALUES
            (:id, :key, :url, now(), now())
        ON DUPLICATE KEY UPDATE `gmt_modified` = now()
    """
    sp = SQLParams('named', 'format')
    sql, params = sp.format(sql, {
        'id': code,
        'key': key,
        'url': url,
    })
    if db.execute(sql, params) and kv.set(key, url):
        return {'key': key, 'err': None}
    else:
        return {'err': 'Internal error'}
def save(db):
    url = request.forms['url']
    if not url:
        return {'err': 'Please enter a URL'}
    url = url_normalize(url)
    if not url:
        return {'err': 'Please enter a valid URL'}
    surl = urlsplit(url)
    hostname = surl.hostname
    if hostname.endswith(BLACKLIST):
        return {'err': 'Unsupported domain'}
    if len(WHITELIST) > 0 and not ('.' + hostname).endswith(WHITELIST):
        return {'err': 'Only Alibaba-affiliated site domains are supported'}
    code = hashto62(url)
    try:
        key = insert(db, code, url)
        return {'key': key, 'err': None}
    except Exception:
        pass
    return {'err': 'Internal error'}
def test_url_normalize_changes():
    """Assert url_normalize does not change a URI when no change is required.

    http://www.intertwingly.net/wiki/pie/PaceCanonicalIds
    """
    for value in NO_CHANGES_EXPECTED:
        assert url_normalize(value) == value
def _validate(self):
    """Validate the request."""
    if self.client_id is None:
        raise InvalidRequest('No client_id')
    try:
        self.client = Client.objects.get(key=self.client_id)
    except Client.DoesNotExist:
        raise InvalidClient("client_id %s doesn't exist" % self.client_id)
    # Redirect URI
    if self.redirect_uri is None:
        if self.client.redirect_uri is None:
            raise MissingRedirectURI("No redirect_uri provided or registered.")
    elif self.client.redirect_uri is not None:
        if url_normalize(self.redirect_uri) != url_normalize(self.client.redirect_uri):
            self.redirect_uri = self.client.redirect_uri
            raise InvalidRequest("Registered redirect_uri doesn't match provided redirect_uri.")
    self.redirect_uri = self.redirect_uri or self.client.redirect_uri
    # Check response type
    if self.response_type is None:
        raise InvalidRequest('response_type is a required parameter.')
    if self.response_type not in ["code", "token"]:
        raise InvalidRequest("No such response type %s" % self.response_type)
    # Response type
    if self.authorized_response_type & RESPONSE_TYPES[self.response_type] == 0:
        raise UnauthorizedClient("Response type %s not allowed." % self.response_type)
    if not absolute_http_url_re.match(self.redirect_uri):
        raise InvalidRequest('Absolute URI required for redirect_uri')
    # Scope
    if self.authorized_scope is not None and self.scope is None:
        self.scope = self.authorized_scope
    if self.scope is not None:
        if self.client.all_scopes_allowable:
            self.access_ranges = AccessRange.objects.filter(key__in=self.scope)
        else:
            self.access_ranges = self.client.allowable_scopes.filter(key__in=self.scope)
        access_ranges = set(self.access_ranges.values_list('key', flat=True))
        difference = access_ranges.symmetric_difference(self.scope)
        if len(difference) != 0:
            raise InvalidScope("Following access ranges do not exist: %s" % ', '.join(difference))
        if self.authorized_scope is not None:
            new_scope = self.scope - self.authorized_scope
            if len(new_scope) > 0:
                raise InvalidScope("Invalid scope: %s" % ','.join(new_scope))
def test_url_normalize_with_http_scheme():
    """Assert we can use "http" as the default scheme."""
    url = "//www.foo.com/"
    expected = "http://www.foo.com/"
    actual = url_normalize(url, default_scheme='http')
    assert actual == expected
def normalize(url):
    """Normalizes a URL.

    **Args:**

    * *url:* URL string.

    *Returns str*
    """
    return url_normalize(url)
def getOrCreateLink(url):
    url = url_normalize(url)
    l = Link.objects.filter(url=url)
    if not l:
        l = Link(url=url)
        l.save()
    else:
        l = l[0]
    return l
def normalize_url_lossy(url: str) -> Optional[str]:
    """Do some simple transformations on a URL to make it match other equivalent URLs as well as possible.

    Normalization is "lossy": it makes the whole URL lowercase and, in some cases, removes subdomain parts
    such as "m.", "data.", "news.", ...

    WARNING: You MUST set media.normalized_url = null for all possibly impacted media if you edit this
    function. If in doubt, set normalized_url = null for all media. See mediawords.tm.media.lookup_medium
    for more details.
    """
    url = decode_object_from_bytes_if_needed(url)
    if url is None:
        return None
    if len(url) == 0:
        return None

    url = fix_common_url_mistakes(url)
    url = url.lower()

    # Make archive.is links look like the destination link
    url = re.sub(r'^https://archive.is/[a-z0-9]/[a-z0-9]+/(.*)', r'\1', url, flags=re.I)

    if not url.startswith('http'):
        url = 'http://' + url

    # r2.ly redirects through the hostname, ala http://543.r2.ly
    if 'r2.ly' not in url:
        url = re.sub(
            r'^(https?://)(m|beta|media|data|image|www?|cdn|topic|article|news|archive|blog|video|search|preview|'
            r'login|shop|sports?|act|donate|press|web|photos?|\d+?).?\.(.*\.)',
            r"\1\3", url, flags=re.I)

    # Collapse the vast array of http://pronkraymond83483.podomatic.com/ URLs into http://pronkpops.podomatic.com/
    url = re.sub(r'http://.*pron.*\.podomatic\.com', 'http://pronkpops.podomatic.com', url)

    # Get rid of anchor text
    url = re.sub(r'#.*', '', url)

    # Get rid of multiple slashes in a row
    url = re.sub(r'(//.*/)/+', r"\1", url)

    url = re.sub(r'^https:', 'http:', url)

    # url_normalize might raise an encoding error if the URL is invalid; just skip the normalization step in that case
    # noinspection PyBroadException
    try:
        url = url_normalize.url_normalize(url)
    except Exception as ex:
        log.warning("Unable to get canonical URL for URL %s: %s" % (url, str(ex),))

    # Add a trailing slash
    if re.search(r'https?://[^/]*$', url):
        url += '/'

    return url
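A hedged illustration of what this lossy normalization aims to produce. `fix_common_url_mistakes` and `decode_object_from_bytes_if_needed` are project helpers assumed to be benign here, so the outputs are indicative rather than guaranteed:

normalize_url_lossy("HTTPS://WWW.Example.com/Path#frag")
# -> "http://example.com/path"  (lowercased, "www." stripped, https -> http,
#                                fragment dropped)
normalize_url_lossy("m.example.com/news")
# -> "http://example.com/news"  (scheme added, "m." subdomain removed)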
def compare(url1, url2):
    url1 = url1.strip()
    url2 = url2.strip()
    if len(url1) == 0:
        if len(url2) == 0:
            return 0
        else:
            return -1
    elif len(url2) == 0:
        return 1
    url1 = url_normalize.url_normalize(url1)
    url2 = url_normalize.url_normalize(url2)
    if url1 < url2:
        return -1
    elif url1 == url2:
        return 0
    else:
        return 1
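A quick illustration of the comparator's contract; the outputs assume url-normalize's defaults (e.g. hosts are case-insensitive):

compare("http://EXAMPLE.com/a", "http://example.com/a")    # 0 (equal once normalized)
compare("http://a.example.com/", "http://b.example.com/")  # -1 (lexicographic order)
compare("", "http://example.com/")                         # -1 (empty sorts first)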
def __init__(self, url=None, key=None):
    obj = {}
    self._r = red
    if url:  # create new
        self._url = url_normalize(url)
        self._r.incr('last_url_id')  # increment the global counter
        self._key = key = self._find_key(self._url)
        obj[key] = self._url
        obj['%s:created_at' % key] = datetime.now().strftime(self.time_f)
        self._r.mset(obj)
    if key:  # load existing
        self._key = key
def is_http_url(url: str) -> bool:
    """Returns true if URL is in the "http" ("https") scheme."""
    url = decode_object_from_bytes_if_needed(url)
    if url is None:
        log.debug("URL is None")
        return False
    if len(url) == 0:
        log.debug("URL is empty")
        return False

    log.debug("Testing if URL '%s' is HTTP(s) URL" % url)

    if not re.search(__URL_REGEX, url):
        log.debug("URL '%s' does not match URL's regexp" % url)
        return False

    try:
        uri = furl(url)

        # Try stringifying the URL back from the furl() object to try out all of its accessors
        str(uri)

        # Some URLs become invalid when normalized (which is what "requests" will do), e.g.:
        #
        # http://michigan-state-football-sexual-assault-charges-arrest-players-names -- valid
        # http://michigan-state-football-sexual-assault-charges-arrest-players-names/ -- invalid (decoding error)
        #
        # ...so try the same with the normalized URL
        normalized_url = url_normalize.url_normalize(url)
        normalized_uri = furl(normalized_url)
        str(normalized_uri)

    except Exception as ex:
        log.debug("Cannot parse URL: %s" % str(ex))
        return False

    if not uri.scheme:
        log.debug("Scheme is undefined for URL %s" % url)
        return False
    if not uri.scheme.lower() in ['http', 'https']:
        log.debug("Scheme is not HTTP(s) for URL %s" % url)
        return False
    if not uri.host:
        log.debug("Host is undefined for URL %s" % url)
        return False

    return True
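A hedged usage sketch; the exact rejections depend on the module's __URL_REGEX and on furl's parser:

is_http_url("http://example.com/")    # True
is_http_url("gopher://example.com/")  # False -- scheme is not HTTP(s)
is_http_url("not a url")              # False -- fails the regex/parse checks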
def cert_chain_url_valid(cert_url):
    """
    Ensure that the provided URL for the certificate chain is valid, by
    checking that:

    * it's HTTPS
    * the host is s3.amazonaws.com
    * the port, if specified, is 443
    * the path starts with '/echo.api/'
    """
    normalized = url_normalize(cert_url)
    parsed = urlparse(normalized)
    url_checks = {
        'scheme': parsed.scheme == 'https',
        'hostname': parsed.hostname == 's3.amazonaws.com',
        'port': parsed.port in (443, None),
        'path': parsed.path.startswith('/echo.api/'),
    }
    all_checks_pass = all(url_checks.values())
    return all_checks_pass
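A usage sketch for the validator above; the sample URLs are made up, and the expected results follow directly from the four checks:

cert_chain_url_valid("https://s3.amazonaws.com/echo.api/echo-api-cert.pem")       # True
cert_chain_url_valid("http://s3.amazonaws.com/echo.api/echo-api-cert.pem")        # False (not HTTPS)
cert_chain_url_valid("https://s3.amazonaws.com:8443/echo.api/echo-api-cert.pem")  # False (wrong port)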
def _prepareURL(self, apiQueryURI):
    """
    If the URI (actually just a partial URL, usually the path part) doesn't
    begin with the base URL for the API, concatenate the two into a new URL
    and return it.

    :param apiQueryURI: URI (actually, just a partial URL, usually the path part) for an API entry point.
    :type apiQueryURI: str
    :return: URL for the API query, ready for use
    :rtype: str
    """
    assert isinstance(apiQueryURI, str)
    assert not util.stringContainsAllCharacters(apiQueryURI, '{}'), \
        'apiQueryURI contains unformatted arguments: "%s"' % apiQueryURI

    if apiQueryURI.startswith(self.apiBaseURL):
        return apiQueryURI
    return url_normalize(self.apiBaseURL + '/' + apiQueryURI)
def fetch(self, method, endpoint, params):
    api_endpoint = url_normalize(self.api_base + endpoint)
    if method.lower() in ['get', 'delete']:
        content = self.oauth.request(
            method,
            api_endpoint,
            params=params,
            headers={'User-Agent': 'Semantics3 Python Lib/0.2'},
            timeout=self.timeout
        )
    else:
        content = self.oauth.request(
            method,
            api_endpoint,
            data=json.dumps(params),
            headers={'User-Agent': 'Semantics3 Python Lib/0.2',
                     'Content-Type': 'application/json'},
            timeout=self.timeout
        )
    return content
def main():
    '''Takes in a file containing a list of URLs and, for each URL, outputs
    its validity, its canonicalized form, and whether the raw and
    canonicalized URLs are unique within the file.'''
    (parser, opts, args) = controller()
    if not opts.filename:
        parser.print_help()
        sys.exit(1)
    filename = opts.filename
    try:
        f = open(filename, 'r')
        raw_url_list = reader.read_file(f)
    except IOError as e:
        handle_io_exception(filename, e)

    unique_raw_urls = set()
    unique_canonicalized_urls = set()
    is_raw_valid = False
    is_raw_unique = False
    is_canonical_unique = False
    canonicalized_url = ""

    for raw_url in raw_url_list:
        print("Source: " + raw_url)

        is_raw_valid = url_validator.is_valid(raw_url)
        print("Valid: " + str(is_raw_valid))

        canonicalized_url = url_normalize.url_normalize(raw_url)
        print("Canonical: " + canonicalized_url)

        is_raw_unique = raw_url not in unique_raw_urls
        if is_raw_unique:
            unique_raw_urls.add(raw_url)
        print("Source unique: " + str(is_raw_unique))

        is_canonical_unique = canonicalized_url not in unique_canonicalized_urls
        if is_canonical_unique:
            unique_canonicalized_urls.add(canonicalized_url)
        print("Canonicalized URL unique: " + str(is_canonical_unique))
def add(self, url, load_bad_url=False, always=False):
    url = url_normalize(url)
    if always:
        state = self._URL_TASK
    else:
        state = self.get_index_state(url)
    if state == self._URL_TASK:  # currently being crawled
        return
    if state == self._URL_DONE:
        return
    if not load_bad_url and state == self._URL_BAD:
        return
    host = urlparse.urlparse(url).netloc
    if not host:
        return
    if host in self._pool:
        self._pool[host].add(url)
    else:
        self._pool[host] = set([url])
    self.url_count += 1
    self._urlindex.Put(url, self._URL_TASK)
def fetch(self, url=False, armor=False):
    if not url:
        return False
    normalizedurl = url_normalize.url_normalize(url)
    if self.debug:
        print(normalizedurl)
    urlhash = self._hash(v=normalizedurl)
    if self.debug:
        print(urlhash)
    self._hashdir(urlhash)
    if not self._archived(v=urlhash):
        fetcher = requests.get(normalizedurl)
        if fetcher.status_code == 200:
            meta = fetcher.headers
            meta['url_archiver:url'] = normalizedurl
            meta['url_archiver:urlhash'] = urlhash
            meta['url_archiver:version'] = __version__
            self._store(raw=fetcher.text, u=normalizedurl, meta=meta)
            return self._get(v=urlhash, armor=armor)
    else:
        return self._get(v=urlhash, armor=armor)
def canonical_url(url: str) -> str:
    """Make URL canonical (lowercase scheme and host, remove default port, etc.)"""
    # FIXME maybe merge with normalize_url() as both do pretty much the same thing

    url = decode_object_from_bytes_if_needed(url)

    if url is None:
        raise McCanonicalURLException("URL is None.")
    if len(url) == 0:
        raise McCanonicalURLException("URL is empty.")

    url = fix_common_url_mistakes(url)

    if not is_http_url(url):
        raise McCanonicalURLException("URL is not HTTP(s): %s" % url)

    try:
        can_url = url_normalize.url_normalize(url)
    except Exception as ex:
        raise McCanonicalURLException("Failed to create canonical URL from URL %s: %s" % (url, str(ex),))

    return can_url
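For reference, the kind of RFC 3986 normalization url-normalize applies under the hood (illustrative output; exact behavior depends on the library version):

from url_normalize import url_normalize

# Lowercase scheme and host, drop the default port, and percent-decode
# unreserved characters such as "~".
url_normalize("HTTP://Example.COM:80/path/%7Euser/")
# -> "http://example.com/path/~user/"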
def get_sites(queries, args):
    status_code = 200
    url = args.get("url", None)
    if url:
        normalized_url = url_normalize(url)
        result = queries.get_site_by_url(url=normalized_url)
        if result:
            response_body = {
                "message": f"Returning site with url '{normalized_url}'",
                "result": result,
            }
        else:
            response_body = {
                "message": f"No site having url '{normalized_url}'",
                "result": {},
            }
            status_code = 404
    else:
        result = list(queries.get_all_sites())
        response_body = {"message": "Returning all sites", "result": result}
    return {"body": response_body, "status_code": status_code}
def handle_image(self, image, pageLink):
    image_link = image.get('src')
    hostname = pageLink.split('/')[2:3][0]
    try:
        # Relative link: make it absolute against the page URL.
        # (The original condition `if not "https://" or not "http://" in image_link`
        # had a precedence bug; this is the intended scheme check.)
        if "https://" not in image_link and "http://" not in image_link:
            image_link = url_normalize(pageLink + "/" + image_link)
            path = urllib.request.urlopen(image_link)
            if '@@images' in path.url:
                filename = path.url.split('/')[-4]
            else:
                filename = path.url.split('/')[-1]
            if filename in ('thumb', 'preview', 'mini'):
                print('url not getting translated: ' + path.url)
            image['alt'] = filename
            while os.path.isfile(filename):
                (root, ext) = os.path.splitext(filename)
                filename = root + "(1)" + ext
            urllib.request.urlretrieve(image_link, filename=filename)
        elif hostname in image_link:
            path = urllib.request.urlopen(image_link)
            if '@@images' in path.url:
                filename = path.url.split('/')[-3]
            else:
                filename = path.url.split('/')[-1]
            if filename in ('thumb', 'preview', 'mini'):
                print('url not getting translated: ' + path.url)
            image['alt'] = filename
            while os.path.isfile(filename):
                (root, ext) = os.path.splitext(filename)
                filename = root + "(1)" + ext
            urllib.request.urlretrieve(image_link, filename=filename)
    except Exception as e:
        print(e)
        print("image link not working: " + str(pageLink) + ": " + str(image_link))
        self.errors.write(str(pageLink) + ": " + str(image_link) + "\n")
def _extract_links(soup, page):
    """Extract links from a webpage and normalize them.

    Returns a list of (link, dns) tuples."""
    extracted_links = re.findall('"((http)s?://.*?)"', page)
    # findall returns (full_url, scheme_group) tuples; keep only the URL.
    extracted_links = [url for url, _ in extracted_links]
    links = []
    for i in range(len(extracted_links)):
        # Normalize the url link by converting it to canonical form.
        # For more info, refer to https://pypi.python.org/pypi/urlnorm
        # url_normalize raises on some malformed links; skip those.
        try:
            extracted_links[i] = url_normalize(extracted_links[i])
        except Exception:
            continue
        # Restore the ":" character.
        extracted_links[i] = extracted_links[i].replace("%3A", ":")
        if Fetcher._check_ext_html(extracted_links[i]):
            links.append((extracted_links[i],
                          Fetcher.extract_dns(extracted_links[i])))
    return links
def parse_url(url):
    # Normalizing the URL converts all domain characters to lower case and
    # ensures non-alphanumeric characters are properly formatted.
    from url_normalize import url_normalize
    try:
        url_parsed = urlparse(url_normalize(url))
    except Exception:
        url_parsed = urlparse(url)

    # remove trailing slash from url if present
    #path = url_parsed.path
    #if len(path) > 0 and path[-1] == '/':
        #path = path[:-1]

    # This check is necessary for when url=''
    hostname = url_parsed.hostname
    if hostname is None:
        hostname = ''

    # Store the port as -1 if it's the default port for the scheme.
    # We must wrap this in a try/except block in case port numbers are out of range.
    try:
        port = url_parsed.port
    except ValueError:
        port = None
    if port is None:
        port = -1

    return {
        'scheme': url_parsed.scheme,
        'hostname': hostname,
        'port': port,
        'path': url_parsed.path,
        'params': url_parsed.params,
        'query': url_parsed.query,
        'fragment': url_parsed.fragment,
        'other': '',
    }
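An illustrative call; the output assumes url-normalize's defaults, which add a scheme and lowercase the host while leaving the path's case alone:

parse_url("WWW.Example.COM/Path?q=1")
# -> {'scheme': 'https', 'hostname': 'www.example.com', 'port': -1,
#     'path': '/Path', 'params': '', 'query': 'q=1', 'fragment': '', 'other': ''}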
async def post(self):
    # Extract long URL
    url_long = self.get_body_argument('url_long')

    if not url_long:
        # Empty url_long. Send back the default HTML.
        if self.service.verbosity > 1:
            self.verbose_message(200)
        self.finish(self.generate_html())
        return

    # Long URL received. Canonicalize it.
    try:
        url_long = url_normalize.url_normalize(url_long)
    except UnicodeError:
        await self.send_error_emulate(400, f'Failed to interpret URL "{url_long}"')
        return

    # Get short URL and send back response
    url_short = await self.service.lookup(url_long, tablename='long2short')

    # Assemble full version of short URL
    url_short = f'{self.service.get_host_url(self.request)}/{url_short}'
    if self.service.verbosity > 1:
        self.verbose_message(200)
    self.finish(self.generate_html(url_long, url_short))
def get_google_search_urls(query):
    '''
    input:
        query - user-defined query; should be a string
    returns:
        all_urls - the URLs on the Google search result page, a list
        urls_info - info related to the URLs, a list
    '''
    url = url_normalize(BASE_URL + query)
    all_urls, urls_info, par_urls_info = get_parent_child_info(
        url, starting_regex='/url?q=', google=True, depth=0, top_ten=False)

    for sw in STOP_WORDS:
        front, back = 0, len(all_urls) - 1
        while front < back:
            if sw in all_urls[front]:
                del all_urls[front]
                del urls_info[front]
                back -= 1
            else:
                front += 1

    # New code
    for url, url_info in zip(all_urls, urls_info):
        heap_list = [float('-inf'), next(COUNTER), url_info[0], url_info[1],
                     par_urls_info[2], []]
        VISITED_URLS_PRIOR_DICT[url] = heap_list

    return all_urls, urls_info
def set_camunda_configuration(self, configuration: dict):
    if 'host' not in configuration.keys():
        raise ValueError(
            f"Incomplete configuration. Configuration must include at least the Camunda host url:\t{configuration}"
        )

    # Weird things happen when the dictionary is not copied and the keyword is
    # called repeatedly: somehow Robot or Python remembers the configuration
    # from the previous call.
    camunda_config = configuration.copy()

    host = configuration['host']
    camunda_config['host'] = url_normalize(f'{host}/engine-rest')

    if 'api_key' in configuration.keys():
        api_key = configuration['api_key']
        camunda_config['api_key'] = {'default': api_key}

    if 'api_key_prefix' in configuration.keys():
        api_key_prefix = configuration['api_key_prefix']
        camunda_config['api_key_prefix'] = {'default': api_key_prefix}

    logger.debug(f"New configuration for Camunda client:\t{camunda_config}")
    self._shared_resources.client_configuration = Configuration(**camunda_config)
def create(request):
    request_json = json.loads(request.body.decode('utf-8'))
    print(request_json)
    if 'url' in request_json and len(request_json['url'].strip()) > 0:
        val = URLValidator()
        try:
            long_url = url_normalize(request_json['url'].strip())
            val(long_url)
            existing_link = Link.objects.filter(url=long_url)
            if existing_link.count() == 0:
                link = Link(url=long_url, date_created=timezone.now())
                link.save()
            else:
                link = existing_link[0]
            return JsonResponse({
                'short_url': request.build_absolute_uri(
                    reverse('shortener:goto', args=[link.short_url()]))
            })
        except ValidationError:
            return JsonResponse({'error': 'URL is not valid.'}, status=400)
    else:
        return JsonResponse({'error': 'No URL given.'}, status=400)
def parse(xml):
    '''Parses a blob of XML, e.g., from a fetcher or from PuSH.'''
    parsed = feedparser.parse(xml)

    # Fall back to the feed link if no rel="self" link is found.
    selflink = parsed.feed.link
    for l in parsed.feed.links:
        if l.rel == "self":
            selflink = l.href

    query = Feed.gql("WHERE link = :1", url_normalize(selflink))
    feed = query.fetch(1)[0]

    for entry in parsed['entries']:
        item = Item.get_or_insert(
            Item.makekeyname(entry.link),
            title=entry.title,
            link=entry.link,
            retrieved=datetime.now(),
            content=entry.content[0].value,
            #summary=entry.summary,
            version=1,
            created=datetime(*(entry.published_parsed[:6])),
            feed=feed,
            private=feed.private,
        )
def add(self, url, load_bad_url=False, always=False):
    url = url_normalize(url)
    if always:
        state = self._URL_TASK
    else:
        state = self.get_index_state(url)
    if state == self._URL_TASK:  # currently being crawled
        return
    if state == self._URL_DONE:
        return
    if not load_bad_url and state == self._URL_BAD:
        return
    host = urlparse.urlparse(url).netloc
    if not host:
        return
    if host in self._pool:
        if url not in self._pool[host]:
            self._pool[host].add(url)
            self.url_count += 1
    else:
        self._pool[host] = set([url])
        self.url_count += 1
    self._urlindex.Put(url, self._URL_TASK)
def __init__(self, xml):
    url = xml.find("loc")
    lastmod = xml.find("lastmod")
    title = xml.find("news:title")
    description = xml.find("news:description")
    keywords = xml.find("news:keywords")
    publication_date = xml.find("news:publication_date")
    if not title:
        title = xml.find("video:title")
    if not description:
        description = xml.find("video:description")
    self.url = format_text(url_normalize(url.text.strip().lower()))
    self.html = ""
    self.tree = None
    parsed = urlparse(self.url)
    self.site = parsed.netloc
    self.path = parsed.path
    try:
        pardir = "/".join(re.sub(r"(/)$", "", self.path).split("/")[:-2])
    except Exception:
        pardir = "/"
    self.base_url = f"{parsed.scheme}://{parsed.netloc}{pardir}"
    self.lastmod = parse_timestamp(format_text(lastmod.text)) if lastmod else None
    self.headline = format_text(title.text.strip()) if title else ""
    self.keywords = ([format_text(kw) for kw in keywords.text.split(",")]
                     if keywords else [])
    self.publication_date = (format_text(publication_date.text)
                             if publication_date else "")
    self.description = format_text(description.text) if description else ""
    self.xml = format_text(xml.__repr__())
    self.metadata = {"schemata": [], "errors": []}
    self.has_metadata = False
    self.seen = self.url in seen
    # seen.add(self.url)
    self.articlebody = ""
    self.visited = False
def load_sitemap_urls(fp="lib/newspapers.tsv"):
    fp = os.path.abspath(fp)
    news = load_csv(fp)
    loaded = []
    for row in list(news):
        resolved_urls = []
        for k, v in list(row.items()):
            if not v:
                continue
            elif k.startswith("sitemap_url_template"):
                # Fill strftime() placeholders (e.g. %Y/%m) with today's date.
                resolved = datetime.datetime.now().strftime(v)
                resolved_urls.append(resolved)
            elif k.startswith("sitemap_url"):
                resolved_urls.append(v)
        url = url_normalize(row["url"]).strip().lower()
        parsed = urlparse(url)
        row["url"] = url
        row["site"] = parsed.netloc
        row["sitemap_urls"] = resolved_urls
        loaded.append(row)
    return loaded
def getLinks(self, url):
    url = url_normalize(url)
    self.links = []
    # Remember the base URL, which will be important when creating
    # absolute URLs
    self.baseUrl = url
    # Use the urlopen function from the standard Python 3 library
    response = urlopen(url)
    head = response.info()
    # Note: the original used the Python 2 gettype(); get_content_type()
    # is the Python 3 equivalent.
    isHtml = head.get_content_type().startswith('text/html')
    # Make sure that we are looking at HTML and not other things that
    # are floating around on the internet (such as
    # JavaScript files, CSS, or PDFs, for example)
    if isHtml:
        htmlBytes = response.read()
        # Note that feed() handles strings well, but not bytes
        # (a change from Python 2.x to Python 3.x)
        htmlString = htmlBytes.decode("utf-8")
        self.feed(htmlString)
        return htmlString, self.links
    else:
        return "", []
def get_canonical_form(url, parent_url):
    try:
        can_url = url_normalize(url)
        can_url = str(can_url).replace(":80", "").replace(":443", "").replace("https", "http")
        if "#" in can_url:
            can_url = can_url.split("#")[0]
        split_array = re.split(r"^\.\./", can_url)
        if len(split_array) > 1:
            # Executes if '../' is present at the start of the URL
            extension = split_array[1]
            can_url = urljoin(parent_url, extension)
        if can_url.startswith("/"):
            can_url = urljoin(parent_url, can_url)
        url_array = urlparse(can_url)
        # e.g. ParseResult(scheme='http', netloc='www.browse-tutorials.net',
        #                  path='/tutorial/get-self-base-url-appengine-urlparse',
        #                  params='', query='', fragment='')
        base = url_array.scheme + "://" + url_array.netloc
        # Lowercase only the scheme and host; leave the path's case intact.
        can_url = can_url[:len(base)].lower() + can_url[len(base):]
    except Exception:
        return ""
    return can_url
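A sketch of the intended behavior with hypothetical URLs. Note that the blanket replace("https", "http") above rewrites "https" wherever it occurs in the string, not just in the scheme, which is worth keeping in mind:

get_canonical_form("HTTPS://Example.COM:443/Docs/Page#sec2", "http://example.com/")
# -> "http://example.com/Docs/Page"  (scheme/host lowercased, :443 and the
#                                     fragment removed, path case preserved)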
async def async_step_ssdp(
        self, discovery_info: ssdp.SsdpServiceInfo) -> FlowResult:
    """Handle SSDP initiated config flow."""
    await self.async_set_unique_id(discovery_info.upnp[ssdp.ATTR_UPNP_UDN])
    self._abort_if_unique_id_configured()

    # Attempt to distinguish from other non-LTE Huawei router devices; at
    # least some of the ones we are interested in have a "Mobile Wi-Fi"
    # friendlyName.
    if ("mobile" not in
            discovery_info.upnp.get(ssdp.ATTR_UPNP_FRIENDLY_NAME, "").lower()):
        return self.async_abort(reason="not_huawei_lte")

    if TYPE_CHECKING:
        assert discovery_info.ssdp_location
    url = url_normalize(
        discovery_info.upnp.get(
            ssdp.ATTR_UPNP_PRESENTATION_URL,
            f"http://{urlparse(discovery_info.ssdp_location).hostname}/",
        ))

    if serial_number := discovery_info.upnp.get(ssdp.ATTR_UPNP_SERIAL):
        await self.async_set_unique_id(serial_number)
        self._abort_if_unique_id_configured()
async def async_step_ssdp(self, discovery_info):
    """Handle SSDP initiated config flow."""
    # Attempt to distinguish from other non-LTE Huawei router devices; at
    # least some of the ones we are interested in have a "Mobile Wi-Fi"
    # friendlyName.
    if "mobile" not in discovery_info.get(ssdp.ATTR_UPNP_FRIENDLY_NAME, "").lower():
        return self.async_abort(reason="not_huawei_lte")

    # https://github.com/PyCQA/pylint/issues/3167
    url = self.context[CONF_URL] = url_normalize(  # pylint: disable=no-member
        discovery_info.get(
            ssdp.ATTR_UPNP_PRESENTATION_URL,
            f"http://{urlparse(discovery_info[ssdp.ATTR_SSDP_LOCATION]).hostname}/",
        ))

    if any(url == flow["context"].get(CONF_URL)
           for flow in self._async_in_progress()):
        return self.async_abort(reason="already_in_progress")

    user_input = {CONF_URL: url}
    if self._already_configured(user_input):
        return self.async_abort(reason="already_configured")

    return await self._async_show_user_form(user_input)
async def async_setup(hass: HomeAssistant, config: ConfigType) -> bool:
    """Set up Huawei LTE component."""

    # dicttoxml (used by huawei-lte-api) has a uselessly verbose INFO level.
    # https://github.com/quandyfactory/dicttoxml/issues/60
    logging.getLogger("dicttoxml").setLevel(logging.WARNING)

    # Arrange our YAML config to a dict with normalized URLs as keys
    domain_config: dict[str, dict[str, Any]] = {}
    if DOMAIN not in hass.data:
        hass.data[DOMAIN] = HuaweiLteData(hass_config=config, config=domain_config)
    for router_config in config.get(DOMAIN, []):
        domain_config[url_normalize(router_config.pop(CONF_URL))] = router_config

    def service_handler(service: ServiceCall) -> None:
        """Apply a service."""
        routers = hass.data[DOMAIN].routers
        if url := service.data.get(CONF_URL):
            router = routers.get(url)
        elif not routers:
            _LOGGER.error("%s: no routers configured", service.service)
            return
async def async_step_user(self, user_input: dict[str, Any] | None = None
                          ) -> FlowResult:
    """Handle user initiated config flow."""
    if user_input is None:
        return await self._async_show_user_form()

    errors = {}

    # Normalize URL
    user_input[CONF_URL] = url_normalize(user_input[CONF_URL],
                                         default_scheme="http")
    if "://" not in user_input[CONF_URL]:
        errors[CONF_URL] = "invalid_url"
        return await self._async_show_user_form(user_input=user_input,
                                                errors=errors)

    conn: AuthorizedConnection

    def logout() -> None:
        try:
            conn.user.logout()
        except Exception:  # pylint: disable=broad-except
            _LOGGER.debug("Could not logout", exc_info=True)

    def try_connect(user_input: dict[str, Any]) -> AuthorizedConnection:
        """Try connecting with given credentials."""
        username = user_input.get(CONF_USERNAME) or ""
        password = user_input.get(CONF_PASSWORD) or ""
        conn = AuthorizedConnection(
            user_input[CONF_URL],
            username=username,
            password=password,
            timeout=CONNECTION_TIMEOUT,
        )
        return conn

    def get_device_info() -> tuple[GetResponseType, GetResponseType]:
        """Get router info."""
        client = Client(conn)
        try:
            device_info = client.device.information()
        except Exception:  # pylint: disable=broad-except
            _LOGGER.debug("Could not get device.information", exc_info=True)
            try:
                device_info = client.device.basic_information()
            except Exception:  # pylint: disable=broad-except
                _LOGGER.debug("Could not get device.basic_information",
                              exc_info=True)
                device_info = {}
        try:
            wlan_settings = client.wlan.multi_basic_settings()
        except Exception:  # pylint: disable=broad-except
            _LOGGER.debug("Could not get wlan.multi_basic_settings",
                          exc_info=True)
            wlan_settings = {}
        return device_info, wlan_settings

    try:
        conn = await self.hass.async_add_executor_job(try_connect, user_input)
    except LoginErrorUsernameWrongException:
        errors[CONF_USERNAME] = "incorrect_username"
    except LoginErrorPasswordWrongException:
        errors[CONF_PASSWORD] = "incorrect_password"
    except LoginErrorUsernamePasswordWrongException:
        errors[CONF_USERNAME] = "invalid_auth"
    except LoginErrorUsernamePasswordOverrunException:
        errors["base"] = "login_attempts_exceeded"
    except ResponseErrorException:
        _LOGGER.warning("Response error", exc_info=True)
        errors["base"] = "response_error"
    except Timeout:
        _LOGGER.warning("Connection timeout", exc_info=True)
        errors[CONF_URL] = "connection_timeout"
    except Exception:  # pylint: disable=broad-except
        _LOGGER.warning("Unknown error connecting to device", exc_info=True)
        errors[CONF_URL] = "unknown"
    if errors:
        await self.hass.async_add_executor_job(logout)
        return await self._async_show_user_form(user_input=user_input,
                                                errors=errors)

    info, wlan_settings = await self.hass.async_add_executor_job(get_device_info)
    await self.hass.async_add_executor_job(logout)

    if not self.unique_id:
        if serial_number := info.get("SerialNumber"):
            await self.async_set_unique_id(serial_number)
            self._abort_if_unique_id_configured()
        else:
            await self._async_handle_discovery_without_unique_id()
def normalize_url_domain(url):
    url = url_normalize(url)
    return str(url) if str(url).endswith('/') else str(url) + '/'
def t(self, in_url, ex_url):
    self.assertEqual(url_normalize(in_url), ex_url)
def __init__(self, url):
    self.url = url
    self.cleaned_url = url_normalize(url)
def normalize_url(self, url):
    return url_normalize.url_normalize(url)
def normalize_url(url):
    return url_normalize(url)
def __init__(self, dkubeURL, token):
    configuration.host = url_normalize('{}/dkube/v2/controller'.format(dkubeURL))
    configuration.api_key['Authorization'] = token
    configuration.verify_ssl = False
def check_domain(self, item):
    """Check subdomains and check whether the URL is a store (by keywords)."""
    counter = 0
    leader_phone = ''
    leader_phone_from_team = ''
    all_pages_phone = ''
    main_page_phone = ''
    phone = ''
    leader_email = ''
    leader_email_from_team = ''
    all_pages_email = ''
    main_page_email = ''
    email = ''
    leader_phone_without_sitemap = ''
    leader_email_without_sitemap = ''
    try:
        domain = item['Internet-Adresse']
        subdomains = []
        subdomains_list = list()
        domain_is_shop = False
        domain = str(domain)
        if domain != 'nan':
            # Take a domain
            target = self.clear_url(domain)
            # Make a request to an external service
            req = requests.get(
                "https://crt.sh/?q=%.{d}&output=json".format(d=target),
                headers=self.headers)
            if req.status_code != 200:
                print("[X] Information not available!")
            else:
                for (key, value) in enumerate(req.json()):
                    subdomains.append(value['name_value'])
                subdomains = sorted(set(subdomains))
                # Select the required subdomains
                for subdomain in subdomains:
                    if 'shop' in subdomain or 'store' in subdomain:
                        domain_is_shop = True
                        if '\n' in subdomain:
                            s = subdomain.split(sep='\n')
                            for v in s:
                                if 'shop' in v or 'store' in v:
                                    subdomains_list.append(url_normalize(v))
                                    print(f'subdomain_m: {v}')
                        else:
                            subdomains_list.append(url_normalize(subdomain))
                            print(f'subdomain_o: {subdomain}')
            is_shop, main_page_phone, main_page_email, phone, email = self.is_shop_and_main_page(
                domain, domain_is_shop)  # noqa
            leader_phone_without_sitemap = phone
            leader_email_without_sitemap = email
            if is_shop is True:
                domain_is_shop = True
            if domain_is_shop is True:
                # Check the quantity of goods
                common_list = [link for link in subdomains_list]
                subdomains_list = self.normalize_urls_list(common_list)
                common_list.append(domain)
                common_list = self.normalize_urls_list(common_list)
                sitemap_tree = self.get_sitemap_tree(common_list)
                if sitemap_tree:
                    leader_phone, leader_email = self.get_leader_phone_and_email_from_sitemap(
                        sitemap_tree)
                    leader_phone_from_team, leader_email_from_team = \
                        self.get_leader_phone_and_email_from_sitemap_section_team(sitemap_tree)
                    counter, all_pages_phone, all_pages_email = \
                        self.check_phones_emails_on_every_page_and_count_the_quantity_of_goods(sitemap_tree)
                else:
                    pass
                phone, leader_phone_without_sitemap, leader_phone, leader_phone_from_team, main_page_phone,\
                    all_pages_phone = self.phone(
                        leader_phone_without_sitemap=leader_phone_without_sitemap,
                        leader_phone=leader_phone,
                        leader_phone_from_team=leader_phone_from_team,
                        main_page_phone=main_page_phone,
                        all_pages_phone=all_pages_phone
                    )
                email, leader_email_without_sitemap, leader_email, leader_email_from_team, main_page_email,\
                    all_pages_email = self.email(
                        leader_email_without_sitemap=leader_email_without_sitemap,
                        leader_email=leader_email,
                        leader_email_from_team=leader_email_from_team,
                        main_page_email=main_page_email,
                        all_pages_email=all_pages_email
                    )
                self.write_to_file(
                    item, is_shop=domain_is_shop, number_of_goods=counter,
                    shop_domain=subdomains_list, phone=phone,
                    main_page_phone=main_page_phone,
                    leader_phone_without_sitemap=leader_phone_without_sitemap,
                    all_pages_phone=all_pages_phone, leader_phone=leader_phone,
                    leader_phone_from_team=leader_phone_from_team, email=email,
                    leader_email_without_sitemap=leader_email_without_sitemap,
                    leader_email=leader_email,
                    leader_email_from_team=leader_email_from_team,
                    main_page_email=main_page_email,
                    all_pages_email=all_pages_email)
                self.open_db()
                self.cur.execute(
                    """INSERT INTO Domains_and_subdomains (
                        DUNS, Handelsregister_Nummer, UID, Internet_Adresse, subdomains, Rechtsform,
                        Filiale_Indikator, Mitarbeiter, Mitarbeiter_Gruppe, is_shop, number_of_goods,
                        phone, phone_main_page, leader_phone_without_sitemap, phones_all_pages,
                        leader_phone_sitemap, leader_phone_from_team_sitemap, email, email_main_page,
                        leader_email_without_sitemap, emails_all_pages, leader_email_sitemap,
                        leader_email_from_team_sitemap
                    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",  # noqa
                    (item['DUNS'], item['Handelsregister-Nummer'], item['UID'],
                     item['Internet-Adresse'], str(subdomains_list),
                     item['Rechtsform'], item['Filiale Indikator'],
                     item['Mitarbeiter'], item['Mitarbeiter Gruppe'],
                     domain_is_shop, counter, str(phone), str(main_page_phone),
                     str(leader_phone_without_sitemap), str(all_pages_phone),
                     str(leader_phone), str(leader_phone_from_team), str(email),
                     str(main_page_email), str(leader_email_without_sitemap),
                     str(all_pages_email), str(leader_email),
                     str(leader_email_from_team)))
                self.connection.commit()
                self.close_db()
            else:
                self.write_to_file(
                    item, is_shop=False, number_of_goods=0, shop_domain='',
                    phone=phone, main_page_phone=main_page_phone,
                    leader_phone_without_sitemap=leader_phone_without_sitemap,
                    all_pages_phone=all_pages_phone, leader_phone=leader_phone,
                    leader_phone_from_team=leader_phone_from_team, email=email,
                    leader_email_without_sitemap=leader_email_without_sitemap,
                    leader_email=leader_email,
                    leader_email_from_team=leader_email_from_team,
                    main_page_email=main_page_email,
                    all_pages_email=all_pages_email)
                self.open_db()
                self.cur.execute(
                    """INSERT INTO Domains_and_subdomains (
                        DUNS, Handelsregister_Nummer, UID, Internet_Adresse, subdomains, Rechtsform,
                        Filiale_Indikator, Mitarbeiter, Mitarbeiter_Gruppe, is_shop, number_of_goods,
                        phone, phone_main_page, leader_phone_without_sitemap, phones_all_pages,
                        leader_phone_sitemap, leader_phone_from_team_sitemap, email, email_main_page,
                        leader_email_without_sitemap, emails_all_pages, leader_email_sitemap,
                        leader_email_from_team_sitemap
                    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",  # noqa
                    (item['DUNS'], item['Handelsregister-Nummer'], item['UID'],
                     item['Internet-Adresse'], '', item['Rechtsform'],
                     item['Filiale Indikator'], item['Mitarbeiter'],
                     item['Mitarbeiter Gruppe'], False, 0,
                     '', '', '', '', '', '', '', '', '', '', '', ''))
                self.connection.commit()
                self.close_db()
    except Exception as e:
        print(f'check_domain: {e}')
        self.write_to_file(
            item, is_shop=False, number_of_goods=0, shop_domain='',
            phone=phone, main_page_phone=main_page_phone,
            leader_phone_without_sitemap=leader_phone_without_sitemap,
            all_pages_phone=all_pages_phone, leader_phone=leader_phone,
            leader_phone_from_team=leader_phone_from_team, email=email,
            leader_email_without_sitemap=leader_email_without_sitemap,
            leader_email=leader_email,
            leader_email_from_team=leader_email_from_team,
            main_page_email=main_page_email,
            all_pages_email=all_pages_email)
        self.open_db()
        self.cur.execute(
            """INSERT INTO Domains_and_subdomains (
                DUNS, Handelsregister_Nummer, UID, Internet_Adresse, subdomains, Rechtsform,
                Filiale_Indikator, Mitarbeiter, Mitarbeiter_Gruppe, is_shop, number_of_goods,
                phone, phone_main_page, leader_phone_without_sitemap, phones_all_pages,
                leader_phone_sitemap, leader_phone_from_team_sitemap, email, email_main_page,
                leader_email_without_sitemap, emails_all_pages, leader_email_sitemap,
                leader_email_from_team_sitemap
            ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",  # noqa
            (item['DUNS'], item['Handelsregister-Nummer'], item['UID'],
             item['Internet-Adresse'], '', item['Rechtsform'],
             item['Filiale Indikator'], item['Mitarbeiter'],
             item['Mitarbeiter Gruppe'], False, 0,
             '', '', '', '', '', '', '', '', '', '', '', ''))
        self.connection.commit()
        self.close_db()
async def async_step_user(self, user_input=None):
    """Handle user initiated config flow."""
    if user_input is None:
        return await self._async_show_user_form()

    errors = {}

    # Normalize URL
    user_input[CONF_URL] = url_normalize(user_input[CONF_URL],
                                         default_scheme="http")
    if "://" not in user_input[CONF_URL]:
        errors[CONF_URL] = "invalid_url"
        return await self._async_show_user_form(user_input=user_input,
                                                errors=errors)

    if self._already_configured(user_input):
        return self.async_abort(reason="already_configured")

    conn = None

    def logout():
        if hasattr(conn, "user"):
            try:
                conn.user.logout()
            except Exception:  # pylint: disable=broad-except
                _LOGGER.debug("Could not logout", exc_info=True)

    def try_connect(username: Optional[str], password: Optional[str]) -> Connection:
        """Try connecting with given credentials."""
        if username or password:
            conn = AuthorizedConnection(
                user_input[CONF_URL],
                username=username,
                password=password,
                timeout=CONNECTION_TIMEOUT,
            )
        else:
            try:
                conn = AuthorizedConnection(
                    user_input[CONF_URL],
                    username="",
                    password="",
                    timeout=CONNECTION_TIMEOUT,
                )
                user_input[CONF_USERNAME] = ""
                user_input[CONF_PASSWORD] = ""
            except ResponseErrorException:
                _LOGGER.debug(
                    "Could not login with empty credentials, proceeding unauthenticated",
                    exc_info=True,
                )
                conn = Connection(user_input[CONF_URL], timeout=CONNECTION_TIMEOUT)
                del user_input[CONF_USERNAME]
                del user_input[CONF_PASSWORD]
        return conn

    def get_router_title(conn: Connection) -> str:
        """Get title for router."""
        title = None
        client = Client(conn)
        try:
            info = client.device.basic_information()
        except Exception:  # pylint: disable=broad-except
            _LOGGER.debug("Could not get device.basic_information", exc_info=True)
        else:
            title = info.get("devicename")
        if not title:
            try:
                info = client.device.information()
            except Exception:  # pylint: disable=broad-except
                _LOGGER.debug("Could not get device.information", exc_info=True)
            else:
                title = info.get("DeviceName")
        return title or DEFAULT_DEVICE_NAME

    username = user_input.get(CONF_USERNAME)
    password = user_input.get(CONF_PASSWORD)

    try:
        conn = await self.hass.async_add_executor_job(
            try_connect, username, password)
    except LoginErrorUsernameWrongException:
        errors[CONF_USERNAME] = "incorrect_username"
    except LoginErrorPasswordWrongException:
        errors[CONF_PASSWORD] = "incorrect_password"
    except LoginErrorUsernamePasswordWrongException:
        errors[CONF_USERNAME] = "incorrect_username_or_password"
    except LoginErrorUsernamePasswordOverrunException:
        errors["base"] = "login_attempts_exceeded"
    except ResponseErrorException:
        _LOGGER.warning("Response error", exc_info=True)
        errors["base"] = "response_error"
    except Timeout:
        _LOGGER.warning("Connection timeout", exc_info=True)
        errors[CONF_URL] = "connection_timeout"
    except Exception:  # pylint: disable=broad-except
        _LOGGER.warning("Unknown error connecting to device", exc_info=True)
        errors[CONF_URL] = "unknown_connection_error"
    if errors:
        await self.hass.async_add_executor_job(logout)
        return await self._async_show_user_form(user_input=user_input,
                                                errors=errors)

    title = await self.hass.async_add_executor_job(get_router_title, conn)
    await self.hass.async_add_executor_job(logout)

    return self.async_create_entry(title=title, data=user_input)
def normalize_url(url: Optional[str]) -> Optional[str]:
    if url is None:
        return url
    return url_normalize.url_normalize(url)
def __init__(self, url):
    self.url = url
    self.normalized = url_normalize(url)
def test_url_normalize_results():
    """Assert url_normalize returns the expected results."""
    for value, expected in EXPECTED_RESULTS.items():
        assert expected == url_normalize(value), value
def validate(self, value):
    value = db.LinkProperty().validate(value)
    value = url_normalize(value)
    return value
def canonical(url):
    norm = url_normalize(url)
    return norm.split('#', 1)[0]
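For example (the output assumes url-normalize's default https scheme):

canonical("Example.com/page#section-2")
# -> "https://example.com/page"  (normalized, fragment stripped)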
def getNormalized(self):
    return url_normalize(self.url)
from betamax import Betamax
from betamax_serializers.pretty_json import PrettyJSONSerializer
from click.testing import CliRunner
from ddi.cli import initiate_session
from ddi.ipv4 import *

import base64
import os
import pytest
import url_normalize

ddi_host = os.environ.get('DDI_HOST', 'ddi-test-host.example.com')
ddi_password = os.environ.get('DDI_PASSWORD', 'test_password')
ddi_server = os.environ.get('DDI_SERVER', 'https://ddi.example.com')
ddi_site_name = os.environ.get('DDI_SITE_NAME', 'EXAMPLE')
ddi_url = url_normalize.url_normalize(ddi_server)
ddi_username = os.environ.get('DDI_USERNAME', 'test_user')
domain_name = os.environ.get('DOMAINNAME', 'example.com')
errant_ddi_host = 'bad-host.example.com'

# Test variables:
errant_ipv4_address = '1.1.1.1'
errant_subnet = '1.1.1.0'
ipv4_address = '172.23.23.4'
subnet = '172.23.23.0'

# Makes the output more readable
Betamax.register_serializer(PrettyJSONSerializer)
config = Betamax.configure()
config.cassette_library_dir = 'tests/cassettes'
numberOfPages = int(raw_input('n: '))
print "Start time: " + str(time.asctime(time.localtime(time.time())))
start = time.time()
userinput = urllib.urlencode({'q': userinput})

# Extract the first 8 results from the Google search engine
response = urllib.urlopen(
    'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&rsz=8&' + userinput
).read()
json = myjson.loads(response)
results = json['responseData']['results']

RepeatedLinkCheckdict = dict()  # dictionary to check whether a link is repeated, for crawling
dictValue = 1

from collections import deque
initqueue = deque()  # queue of valid URLs to be crawled, in breadth-first-search order

for result in results:
    url = result['url']
    normalUrl = url_normalize.url_normalize(url)
    if normalUrl in RepeatedLinkCheckdict:
        pass  # if the current URL has already been crawled, discard it
    else:
        RepeatedLinkCheckdict[normalUrl] = dictValue
        dictValue = dictValue + 1
        initqueue.append(url)

PageDataList = []  # content of the pages successfully crawled
LinksParsedInOrder = []  # successfully crawled links, in crawl order, with related statistics
lengthOfQueue = len(initqueue)
outputData = ''
UrlCode = 0
i = -1
TotalSizeOfPagesDownloaded = 0
while i < numberOfPages:
def normalize(self):
    # url_normalize() returns the normalized URL; without a return the
    # result would be silently discarded.
    return url_normalize(self.url.geturl())
async def async_setup(hass: HomeAssistantType, config) -> bool:
    """Set up Huawei LTE component."""

    # Arrange our YAML config to a dict with normalized URLs as keys
    domain_config = {}
    if DOMAIN not in hass.data:
        hass.data[DOMAIN] = HuaweiLteData(hass_config=config, config=domain_config)
    for router_config in config.get(DOMAIN, []):
        domain_config[url_normalize(router_config.pop(CONF_URL))] = router_config

    def service_handler(service) -> None:
        """Apply a service."""
        url = service.data.get(CONF_URL)
        routers = hass.data[DOMAIN].routers
        if url:
            router = routers.get(url)
        elif not routers:
            _LOGGER.error("%s: no routers configured", service.service)
            return
        elif len(routers) == 1:
            router = next(iter(routers.values()))
        else:
            _LOGGER.error(
                "%s: more than one router configured, must specify one of URLs %s",
                service.service,
                sorted(routers),
            )
            return
        if not router:
            _LOGGER.error("%s: router %s unavailable", service.service, url)
            return

        if service.service == SERVICE_CLEAR_TRAFFIC_STATISTICS:
            if router.suspended:
                _LOGGER.debug("%s: ignored, integration suspended", service.service)
                return
            result = router.client.monitoring.set_clear_traffic()
            _LOGGER.debug("%s: %s", service.service, result)
        elif service.service == SERVICE_REBOOT:
            if router.suspended:
                _LOGGER.debug("%s: ignored, integration suspended", service.service)
                return
            result = router.client.device.reboot()
            _LOGGER.debug("%s: %s", service.service, result)
        elif service.service == SERVICE_RESUME_INTEGRATION:
            # Login will be handled automatically on demand
            router.suspended = False
            _LOGGER.debug("%s: %s", service.service, "done")
        elif service.service == SERVICE_SUSPEND_INTEGRATION:
            router.logout()
            router.suspended = True
            _LOGGER.debug("%s: %s", service.service, "done")
        else:
            _LOGGER.error("%s: unsupported service", service.service)

    for service in ADMIN_SERVICES:
        hass.helpers.service.async_register_admin_service(
            DOMAIN,
            service,
            service_handler,
            schema=SERVICE_SCHEMA,
        )

    for url, router_config in domain_config.items():
        hass.async_create_task(
            hass.config_entries.flow.async_init(
                DOMAIN,
                context={"source": SOURCE_IMPORT},
                data={
                    CONF_URL: url,
                    CONF_USERNAME: router_config.get(CONF_USERNAME),
                    CONF_PASSWORD: router_config.get(CONF_PASSWORD),
                },
            ))

    return True
def makekeyname(url):
    return hashlib.sha224(url_normalize(url)).hexdigest()
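Since the key is derived from the normalized URL, different spellings of the same address map to the same key (Python 2 semantics as written; under Python 3 the normalized URL would need to be encoded to bytes before hashing):

makekeyname("HTTP://Example.com/feed")
makekeyname("http://example.com/feed")  # same sha224 digest as above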