Example #1
 def get_domain(self, address):
     try:
         dom = re.search(r'(//|^)([a-z0-9.-]*[a-z]\.[a-z][a-z-]*?(?:[/:].*|$))', address).group(2)
         if not self.in_whitelist(dom):
             if get_tld(url_normalize(dom, **self.url_kwargs), fail_silently=True):
                 return url_normalize(dom, **self.url_kwargs)
         return None
     except AttributeError:
         return None
     except UnicodeError:  # url_normalize's error, happens when something weird matches regex
         self.logger.info("Caught UnicodeError on %r.", address)
         return None
Example #2
 def delete(self):
     '''Removes a feed.'''
     feedurl = self.request.body
     query = Feed.gql("WHERE link = :1", url_normalize(feedurl))
     feed = query.fetch(1)
     db.delete(feed)
     self.response.out.write("Deleted\n")
Example #3
def save(kv, db):

    url = request.forms['url']
    if not url:
        return {'err': '请输入URL'}  # "Please enter a URL"

    url = url_normalize(url)
    if not url:
        return {'err': '请输入有效的 URL'}  # "Please enter a valid URL"

    surl = urlsplit(url)
    if surl.netloc.endswith(BLACKLIST):
        return {'err': '不支持的域名'}  # "Unsupported domain"

    code = hashto62(url)
    key = base62_encode(code)

    sql = """
    INSERT INTO `taobb_urls` (`id`, `key`, `url`, `gmt_create`, `gmt_modified`) VALUES (:id, :key, :url, now(), now())
    ON DUPLICATE KEY UPDATE `gmt_modified` = now()
    """

    sp = SQLParams('named', 'format')
    sql, params = sp.format(sql, { 
        'id' : code,
        'key' : key,
        'url' : url,
    })

    if db.execute(sql, params) and kv.set(key, url):
        return {'key': key, 'err': None}
    else:
        return {'err': '内部错误'}  # "Internal error"
Example #4
def save(db):
    url = request.forms['url']
    if not url:
        return {'err': '请输入URL'}  # "Please enter a URL"

    url = url_normalize(url)
    if not url:
        return {'err': '请输入有效的 URL'}  # "Please enter a valid URL"

    surl = urlsplit(url)
    hostname = surl.hostname
    if hostname.endswith(BLACKLIST):
        return {'err': '不支持的域名'}  # "Unsupported domain"

    if len(WHITELIST) > 0 and not ('.' + hostname).endswith(WHITELIST):
        return {'err': '仅支持阿里巴巴旗下网站域名'}  # "Only domains of Alibaba-family sites are supported"

    code = hashto62(url)

    try:
        key = insert(db, code, url)
        return {'key': key, 'err': None}
    except Exception:
        pass

    return {'err': '内部错误'}  # "Internal error"
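The BLACKLIST and WHITELIST checks above rely on str.endswith accepting a tuple of suffixes. A minimal sketch with illustrative values (the real constants are not part of the snippet):

# Hypothetical suffix tuples; the actual BLACKLIST/WHITELIST values are not shown above.
BLACKLIST = ('.example-spam.com',)
WHITELIST = ('.taobao.com', '.alibaba.com')

hostname = 'shop.taobao.com'
print(hostname.endswith(BLACKLIST))           # False
print(('.' + hostname).endswith(WHITELIST))   # True: '.shop.taobao.com' ends with '.taobao.com'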
Example #5
def test_url_normalize_changes():
    """Assert url_normalize do not change URI if not required.

    http://www.intertwingly.net/wiki/pie/PaceCanonicalIds
    """
    for value in NO_CHANGES_EXPECTED:
        assert url_normalize(value) == value
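The NO_CHANGES_EXPECTED fixture is not shown in this snippet; per the PaceCanonicalIds reference it would hold URLs that are already canonical, along the lines of:

# Illustrative fixture only; the real list in the test suite may differ.
NO_CHANGES_EXPECTED = (
    "http://example.com/",
    "http://example.com/path/to/resource",
    "http://example.com/?q=1",
)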
Example #6
 def _validate(self):
     """Validate the request."""
     if self.client_id is None:
         raise InvalidRequest('No client_id')
     try:
         self.client = Client.objects.get(key=self.client_id)
     except Client.DoesNotExist:
         raise InvalidClient("client_id %s doesn't exist" % self.client_id)
     # Redirect URI
     if self.redirect_uri is None:
         if self.client.redirect_uri is None:
             raise MissingRedirectURI("No redirect_uri"
                 "provided or registered.")
     elif self.client.redirect_uri is not None:
         if url_normalize(self.redirect_uri) != url_normalize(self.client.redirect_uri):
             self.redirect_uri = self.client.redirect_uri
             raise InvalidRequest("Registered redirect_uri doesn't "
                 "match provided redirect_uri.")
     self.redirect_uri = self.redirect_uri or self.client.redirect_uri
     # Check response type
     if self.response_type is None:
         raise InvalidRequest('response_type is a required parameter.')
     if self.response_type not in ["code", "token"]:
         raise InvalidRequest("No such response type %s" % self.response_type)
     # Response type
     if self.authorized_response_type & RESPONSE_TYPES[self.response_type] == 0:
         raise UnauthorizedClient("Response type %s not allowed." %
             self.response_type)
     if not absolute_http_url_re.match(self.redirect_uri):
         raise InvalidRequest('Absolute URI required for redirect_uri')
     # Scope
     if self.authorized_scope is not None and self.scope is None:
         self.scope = self.authorized_scope
     if self.scope is not None:
         if self.client.all_scopes_allowable:
             self.access_ranges = AccessRange.objects.filter(key__in=self.scope)
         else:
             self.access_ranges = self.client.allowable_scopes.filter(key__in=self.scope)
         access_ranges = set(self.access_ranges.values_list('key', flat=True))
         difference = access_ranges.symmetric_difference(self.scope)
         if len(difference) != 0:
             raise InvalidScope("Following access ranges do not "
                 "exist: %s" % ', '.join(difference))
         if self.authorized_scope is not None:
             new_scope = self.scope - self.authorized_scope
             if len(new_scope) > 0:
                 raise InvalidScope("Invalid scope: %s" % ','.join(new_scope))
Example #7
def test_url_normalize_with_http_scheme():
    """Assert we could use http scheme as default."""
    url = "//www.foo.com/"
    expected = "http://www.foo.com/"

    actual = url_normalize(url, default_scheme='http')

    assert actual == expected
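For scheme-relative input the library's default scheme is "https", which is why the test passes default_scheme explicitly. A sketch of both behaviors (assuming the current url-normalize default):

from url_normalize import url_normalize

# Assuming url-normalize's default scheme of "https" for scheme-relative URLs.
print(url_normalize("//www.foo.com/"))                         # https://www.foo.com/
print(url_normalize("//www.foo.com/", default_scheme="http"))  # http://www.foo.com/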
Example #8
def normalize(url):
    """Normalizes URL.

    **Args:**

    * *url:* URL string.

    *Returns str*"""
    return url_normalize(url)
Example #9
def getOrCreateLink(url):
    url = url_normalize(url)
    l = Link.objects.filter(url=url)
    if not l:
        l = Link(url=url)
        l.save()
    else:
        l = l[0]
    return l
Example #10
def normalize_url_lossy(url: str) -> Optional[str]:
    """Do some simple transformations on a URL to make it match other equivalent URLs as well as possible.

    Normalization is "lossy" (makes the whole URL lowercase, removes subdomain parts "m.", "data.", "news.", ...
    in some cases).

    WARNING: You MUST set media.normalized_url = null for all possibly impacted media if you edit this
    function.  If in doubt, set normalized_url = null for all media.  See mediawords.tm.media.lookup_medium for
    more details.
    """
    url = decode_object_from_bytes_if_needed(url)

    if url is None:
        return None
    if len(url) == 0:
        return None

    url = fix_common_url_mistakes(url)

    url = url.lower()

    # make archive.is links look like the destination link
    url = re.sub(r'^https://archive\.is/[a-z0-9]/[a-z0-9]+/(.*)', r'\1', url, flags=re.I)
    if not url.startswith('http'):
        url = 'http://' + url

    # r2.ly redirects through the hostname, ala http://543.r2.ly
    if 'r2.ly' not in url:
        url = re.sub(
            r'^(https?://)(m|beta|media|data|image|www?|cdn|topic|article|news|archive|blog|video|search|preview|'
            r'login|shop|sports?|act|donate|press|web|photos?|\d+?).?\.(.*\.)',
            r"\1\3", url, flags=re.I)  # flags must be passed by keyword; positionally it would be the count argument

    # collapse the vast array of http://pronkraymond83483.podomatic.com/ urls into http://pronkpops.podomatic.com/
    url = re.sub(r'http://.*pron.*\.podomatic\.com', 'http://pronkpops.podomatic.com', url)

    # get rid of anchor text
    url = re.sub(r'#.*', '', url)

    # get rid of multiple slashes in a row
    url = re.sub(r'(//.*/)/+', r"\1", url)

    url = re.sub(r'^https:', 'http:', url)

    # canonical_url might raise an encoding error if the URL is invalid; in that case just skip the canonicalization step
    # noinspection PyBroadException
    try:
        url = url_normalize.url_normalize(url)
    except Exception as ex:
        log.warning("Unable to get canonical URL for URL %s: %s" % (url, str(ex),))

    # add trailing slash
    if re.search(r'https?://[^/]*$', url):
        url += '/'

    return url
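Taken together, the steps above fold scheme, common-subdomain, and fragment differences into a single representative form. A sketch of the intended effect (illustrative inputs; exact outputs depend on fix_common_url_mistakes and the installed url_normalize):

normalize_url_lossy('HTTPS://M.Example.com/page#section')  # -> 'http://example.com/page'
normalize_url_lossy('example.com')                         # -> 'http://example.com/'
normalize_url_lossy('http://news.example.com/a/b')         # -> 'http://example.com/a/b'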
Example #11
def compare(url1, url2):
  url1 = url1.strip()
  url2 = url2.strip()
  
  if len(url1) == 0:
    if len(url2) == 0:
      return 0
    else:
      return -1
  elif len(url2) == 0:
    return 1
  
  url1 = url_normalize.url_normalize(url1)
  url2 = url_normalize.url_normalize(url2)
  if url1 < url2:
    return -1
  elif url1 == url2:
    return 0
  else:
    return 1
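The function follows the old cmp() convention (-1/0/1) with empty strings sorting first, so under Python 3 it has to go through functools.cmp_to_key to be used for sorting:

import functools

urls = ['HTTP://B.example.com', '', 'http://a.example.com/']
print(sorted(urls, key=functools.cmp_to_key(compare)))
# ['', 'http://a.example.com/', 'HTTP://B.example.com']  (originals, ordered by their normalized forms)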
Example #12
    def __init__(self, url=None, key=None):
        obj = {}
        self._r = red
        if url:
            # create new
            self._url = url_normalize(url)

            self._r.incr('last_url_id') # inc global counter
            self._key = key = self._find_key(self._url)

            obj[key] = self._url
            obj['%s:created_at' % key] = datetime.now().strftime(self.time_f)
            self._r.mset(obj)
        if key:
            # load existing entry
            self._key = key
Example #13
def is_http_url(url: str) -> bool:
    """Returns true if URL is in the "http" ("https") scheme."""
    url = decode_object_from_bytes_if_needed(url)
    if url is None:
        log.debug("URL is None")
        return False
    if len(url) == 0:
        log.debug("URL is empty")
        return False

    log.debug("Testing if URL '%s' is HTTP(s) URL" % url)

    if not re.search(__URL_REGEX, url):
        log.debug("URL '%s' does not match URL's regexp" % url)
        return False

    try:
        uri = furl(url)

        # Try stringifying URL back from the furl() object to try out all of its accessors
        str(uri)

        # Some URLs become invalid when normalized (which is what "requests" will do), e.g.:
        #
        #     http://michigan-state-football-sexual-assault-charges-arrest-players-names -- valid
        #     http://michigan-state-football-sexual-assault-charges-arrest-players-names/ -- invalid (decoding error)
        #
        # ...so try the same with normalized URL
        normalized_url = url_normalize.url_normalize(url)
        normalized_uri = furl(normalized_url)
        str(normalized_uri)

    except Exception as ex:
        log.debug("Cannot parse URL: %s" % str(ex))
        return False

    if not uri.scheme:
        log.debug("Scheme is undefined for URL %s" % url)
        return False
    if uri.scheme.lower() not in ('http', 'https'):
        log.debug("Scheme is not HTTP(s) for URL %s" % url)
        return False
    if not uri.host:
        log.debug("Host is undefined for URL %s" % url)
        return False

    return True
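Assuming __URL_REGEX and decode_object_from_bytes_if_needed behave as their names suggest (neither is shown here), the expected outcomes look like:

# Illustrative calls against the helper above.
is_http_url("https://example.com/page")  # True
is_http_url("ftp://example.com/file")    # False: scheme is not http(s)
is_http_url("not a url")                 # False: fails the URL regexp
is_http_url("")                          # False: empty string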
Example #14
def cert_chain_url_valid(cert_url):
    """
    Ensure that the provided URL for the certificate chain is valid, by checking that:
    * it's HTTPS
    * the host is s3.amazonaws.com
    * the port, if specified, is 443
    * the path starts with '/echo.api/'
    """
    normalized = url_normalize(cert_url)
    parsed = urlparse(normalized)
    url_checks = {
        'scheme': parsed.scheme == 'https',
        'hostname': parsed.hostname == 's3.amazonaws.com',
        'port': parsed.port in (443, None),
        'path': parsed.path.startswith('/echo.api/'),
    }
    all_checks_pass = all(url_checks.values())
    return all_checks_pass
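These mirror the checks Amazon documents for Alexa signature certificate chain URLs. A quick sanity check with illustrative URLs:

# Sample URLs in the format Amazon documents for the Alexa certificate chain.
cert_chain_url_valid('https://s3.amazonaws.com/echo.api/echo-api-cert.pem')      # True
cert_chain_url_valid('https://s3.amazonaws.com:443/echo.api/echo-api-cert.pem')  # True: 443 is allowed
cert_chain_url_valid('http://s3.amazonaws.com/echo.api/echo-api-cert.pem')       # False: not HTTPS
cert_chain_url_valid('https://s3.amazonaws.com/other/echo-api-cert.pem')         # False: wrong path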
Example #15
    def _prepareURL(self, apiQueryURI):
        """
        If the URI (actually just a partial URL, usually the path part) doesn't begin with
        the base URL for the API, concatenate the two into a new URL and return it.

        :param apiQueryURI: URI (actually, just a partial URL, usually the path part) for an API entry point.
        :type apiQueryURI: str
        :return: URL for the API query, ready for use
        :rtype: str
        """
        assert isinstance(apiQueryURI, str)
        assert not util.stringContainsAllCharacters(apiQueryURI, '{}'), \
            'apiQueryURI contains unformatted arguments: "%s"' % apiQueryURI

        if apiQueryURI.startswith(self.apiBaseURL):
            return apiQueryURI

        return url_normalize(self.apiBaseURL + '/' + apiQueryURI)
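The url_normalize call is what makes the naive '/' join safe: a doubled separator gets collapsed during path normalization (assuming url-normalize's usual removal of empty path segments). A sketch with a hypothetical base URL:

from url_normalize import url_normalize

# Hypothetical base URL; note the doubled slash produced by the naive join.
api_base = 'https://api.example.com/v2/'
print(url_normalize(api_base + '/' + 'items/42'))  # https://api.example.com/v2/items/42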
Example #16
 def fetch(self, method, endpoint, params):
     api_endpoint = url_normalize(self.api_base + endpoint)
     if method.lower() in ['get', 'delete']:
         content = self.oauth.request(
                     method,
                     api_endpoint,
                     params = params,
                     headers={'User-Agent':'Semantics3 Python Lib/0.2'},
                     timeout=self.timeout
                   )
     else:
         content = self.oauth.request(
                     method,
                     api_endpoint,
                     data = json.dumps(params),
                     headers={'User-Agent':'Semantics3 Python Lib/0.2', 'Content-Type':'application/json'},
                     timeout=self.timeout
                   )
     return content
Example #17
def main():
  '''
  Takes in a list of URLs from a file and outputs each URL's
  validity, its canonicalized form, and the uniqueness of both
  the raw URL and the canonicalized URL.
  '''
  (parser, opts, args) = controller()
  if not opts.filename:
    parser.print_help()
    sys.exit(1)

  filename = opts.filename

  try:
    f = open(filename, 'r')
    raw_url_list = reader.read_file(f)
  except IOError as e:
    handle_io_exception(filename, e)
  
  unique_raw_urls = set()
  unique_canonicalized_urls = set()
  is_raw_valid = False
  is_raw_unique = False
  is_canonical_unique = False
  canonicalized_url = ""
  
  for raw_url in raw_url_list:
    print("Source: " + raw_url)
    is_raw_valid = url_validator.is_valid(raw_url)
    print("Valid: " + str(is_raw_valid))
    canonicalized_url = url_normalize.url_normalize(raw_url)
    print("Canonical: " + canonicalized_url)
    
    is_raw_unique = raw_url not in unique_raw_urls
    if is_raw_unique:
      unique_raw_urls.add(raw_url)
    print("Source unique: " + str(is_raw_unique))
    
    is_canonical_unique = canonicalized_url not in unique_canonicalized_urls
    if is_canonical_unique:
      unique_canonicalized_urls.add(canonicalized_url)
    print("Canonicalized URL unique: " + str(is_canonical_unique))
Example #18
 def add(self, url, load_bad_url=False, always=False):
     url = url_normalize(url)
     if always:
         state = self._URL_TASK
     else:
         state = self.get_index_state(url)
         if state == self._URL_TASK:
             ## crawling
             return
         if state == self._URL_DONE:
             return
         if not load_bad_url and state == self._URL_BAD:
             return
     host = urlparse.urlparse(url).netloc
     if not host: return
     if host in self._pool:
         self._pool[host].add(url)
     else:
         self._pool[host] = set([url])
     self.url_count += 1
     self._urlindex.Put(url, self._URL_TASK)
Example #19
 def fetch(self, url=False, armor=False):
     if not url:
         return False
     normalizedurl = url_normalize.url_normalize(url)
     if self.debug:
         print (normalizedurl)
     urlhash = self._hash(v=normalizedurl)
     if self.debug:
         print (urlhash)
     self._hashdir(urlhash)
     if not self._archived(v=urlhash):
         fetcher = requests.get(normalizedurl)
         if fetcher.status_code == 200:
             meta = fetcher.headers
             meta['url_archiver:url'] = normalizedurl
             meta['url_archiver:urlhash'] = urlhash
             meta['url_archiver:version'] = __version__
             self._store(raw=fetcher.text, u=normalizedurl, meta=meta)
             return self._get(v=urlhash, armor=armor)
     else:
          return self._get(v=urlhash, armor=armor)
Example #20
def canonical_url(url: str) -> str:
    """Make URL canonical (lowercase scheme and host, remove default port, etc.)"""
    # FIXME maybe merge with normalize_url() as both do pretty much the same thing

    url = decode_object_from_bytes_if_needed(url)

    if url is None:
        raise McCanonicalURLException("URL is None.")
    if len(url) == 0:
        raise McCanonicalURLException("URL is empty.")

    url = fix_common_url_mistakes(url)

    if not is_http_url(url):
        raise McCanonicalURLException("URL is not HTTP(s): %s" % url)

    try:
        can_url = url_normalize.url_normalize(url)
    except Exception as ex:
        raise McCanonicalURLException("Failed to create canonical URL from URL %s: %s" % (url, str(ex),))

    return can_url
Example #21
def get_sites(queries, args):
    status_code = 200
    url = args.get("url", None)
    if url:
        normalized_url = url_normalize(url)
        result = queries.get_site_by_url(url=normalized_url)
        if result:
            response_body = {
                "message": f"Returning site with url '{normalized_url}'",
                "result": result,
            }
        else:
            response_body = {
                "message": f"No site having url '{normalized_url}'",
                "result": {},
            }
            status_code = 404

    else:
        result = list(queries.get_all_sites())
        response_body = {"message": "Returning all sites", "result": result}
    return {"body": response_body, "status_code": status_code}
Example #22
    def handle_image(self, image, pageLink):
        image_link = image.get('src')
        hostname = pageLink.split('/')[2:3][0]

        try:
            if not "https://" or not "http://" in image_link:
                image_link = url_normalize(pageLink + "/" + image_link)
                path = urllib.request.urlopen(image_link)
                if '@@images' in path.url:
                    filename = path.url.split('/')[-4]
                else:
                    filename = path.url.split('/')[-1]
                if filename in ('thumb', 'preview', 'mini'):
                    print('url not getting translated: ' + path.url)
                image['alt'] = filename
                while os.path.isfile(filename):
                    (root, ext) = os.path.splitext(filename)
                    filename = root + "(1)" + ext
                urllib.request.urlretrieve(image_link, filename=filename)

            elif hostname in image_link:
                path = urllib.request.urlopen(image_link)
                if '@@images' in path.url:
                    filename = path.url.split('/')[-3]
                else:
                    filename = path.url.split('/')[-1]
                if filename in ('thumb', 'preview', 'mini'):
                    print('url not getting translated: ' + path.url)
                image['alt'] = filename
                while os.path.isfile(filename):
                    (root, ext) = os.path.splitext(filename)
                    filename = root + "(1)" + ext
                urllib.request.urlretrieve(image_link, filename=filename)

        except Exception as e:
            print(e)
            print("image link not working: " + str(pageLink) + ": " +
                  str(image_link))
            self.errors.write(str(pageLink) + ": " + str(image_link) + "\n")
Example #23
    def _extract_links(soup, page):
        """Extract links from a webpage and normalize those links. Returns a list of (link,dns) tuple."""
        extracted_links = re.findall('"((http)s?://.*?)"', page)
        extracted_links = [url for url, _ in extracted_links]

        links = []
        for i in range(len(extracted_links)):
            # Normalize the url link by converting it to canonical form.
            # For more info, refer to https://pypi.python.org/pypi/urlnorm
            # url_normalize occasionally raises on malformed links; skip those.
            try:
                extracted_links[i] = url_normalize(extracted_links[i])
            except Exception:
                continue
            extracted_links[i] = extracted_links[i].replace(
                "%3A", ":")  # Restore the ":" character back.
            if Fetcher._check_ext_html(extracted_links[i]):
                links.append((extracted_links[i],
                              Fetcher.extract_dns(extracted_links[i])))
        return links
Example #24
def parse_url(url):
    # normalizing the url converts all domain characters
    # into lower case and ensures non-alphanumeric characters
    # are properly formatted
    from url_normalize import url_normalize
    try:
        url_parsed = urlparse(url_normalize(url))
    except:
        url_parsed = urlparse(url)

    # remove trailing slash from url if present
    #path=url_parsed.path
    #if len(path)>0 and path[-1]=='/':
    #path=path[:-1]

    # this check is necessary for when url=''
    hostname = url_parsed.hostname
    if hostname is None:
        hostname = ''

    # store port numbers as -1 if it's the default port for a scheme
    # we must wrap this in a try/except block in case port numbers are out of range
    try:
        port = url_parsed.port
    except ValueError:
        port = None
    if port is None:
        port = -1

    return {
        'scheme': url_parsed.scheme,
        'hostname': hostname,
        'port': port,
        'path': url_parsed.path,
        'params': url_parsed.params,
        'query': url_parsed.query,
        'fragment': url_parsed.fragment,
        'other': '',
    }
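For a typical URL the returned dictionary looks like this (assuming url_normalize strips the default port, so urlparse reports no port and the function stores -1):

parse_url('HTTP://Example.com:80/a/b;p?q=1#frag')
# {'scheme': 'http', 'hostname': 'example.com', 'port': -1,
#  'path': '/a/b', 'params': 'p', 'query': 'q=1',
#  'fragment': 'frag', 'other': ''}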
Example #26
 async def post(self):
     # Extract long URL
     url_long = self.get_body_argument('url_long')
     if not url_long:
         # Empty url_long. Send back default HTML.
         if self.service.verbosity > 1:
             self.verbose_message(200)
         self.finish(self.generate_html())
         return
     # Long URL received. Canonicalize it.
     try:
         url_long = url_normalize.url_normalize(url_long)
     except UnicodeError:
         await self.send_error_emulate(400, f'Failed to interpret URL "{url_long}"')
         return
     # Get short URL and send back response
     url_short = await self.service.lookup(url_long, tablename='long2short')
     # Assemble full version of short URL
     url_short = f'{self.service.get_host_url(self.request)}/{url_short}'
     if self.service.verbosity > 1:
         self.verbose_message(200)
     self.finish(self.generate_html(url_long, url_short))
Example #27
def get_google_search_urls(query):
    '''
    input:\n
    query - User-Defined query, should be a string

    returns:\n
    all_urls - the URLS on the Google Search Result, a list \n
    urls_info - Related info to the URLs, a list
    '''

    url = url_normalize(BASE_URL+query)

    all_urls, urls_info, par_urls_info = get_parent_child_info(
        url, starting_regex='/url?q=', google=True, depth=0, top_ten=False)

    for sw in STOP_WORDS:

        front, back = 0, len(all_urls) - 1

        while front < back:

            if sw in all_urls[front]:
                del all_urls[front]
                del urls_info[front]
                back -= 1

            else:
                front += 1

    # New Code
    for url, url_info in zip(all_urls, urls_info):

        heap_list = [float('-inf'), next(COUNTER),
                     url_info[0], url_info[1], par_urls_info[2], []]

        VISITED_URLS_PRIOR_DICT[url] = heap_list

    return all_urls, urls_info
Example #28
    def set_camunda_configuration(self, configuration: dict):
        if 'host' not in configuration.keys():
            raise ValueError(
                f"Incomplete configuration. Configuration must include at least the Camunda host url:\t{configuration}"
            )

        # Copy the dict: if it is not copied and the keyword is called repeatedly,
        # the configuration from the previous call leaks into the next one.
        camunda_config = configuration.copy()

        host = configuration['host']
        camunda_config['host'] = url_normalize(f'{host}/engine-rest')

        if 'api_key' in configuration.keys():
            api_key = configuration['api_key']
            camunda_config['api_key'] = {'default': api_key}
            if 'api_key_prefix' in configuration.keys():
                api_key_prefix = configuration['api_key_prefix']
                camunda_config['api_key_prefix'] = {'default': api_key_prefix}

        logger.debug(
            f"New configuration for Camunda client:\t{camunda_config}")
        self._shared_resources.client_configuration = Configuration(
            **camunda_config)
Example #29
def create(request):
    request_json = json.loads(request.body.decode('utf-8'))
    print(request_json)
    if 'url' in request_json and len(request_json['url'].strip()) > 0:
        val = URLValidator()
        try:
            long_url = url_normalize(request_json['url'].strip())
            val(long_url)
            existing_link = Link.objects.filter(url=long_url)
            if existing_link.count() == 0:
                link = Link(url=long_url, date_created=timezone.now())
                link.save()
            else:
                link = existing_link[0]
            return JsonResponse({
                'short_url':
                request.build_absolute_uri(
                    reverse('shortener:goto', args=[link.short_url()]))
            })
        except ValidationError:
            return JsonResponse({'error': 'URL is not valid.'}, status=400)
    else:
        return JsonResponse({'error': 'No URL given.'}, status=400)
Example #30
    def parse(xml):
        '''Parses a blob of XML, e.g., from a fetcher or from PuSH.'''
        parsed = feedparser.parse(xml)
        selflink = parsed.feed.link # Fallback in case we can't find a self link
        for l in parsed.feed.links:
            if l.rel == "self":
                selflink = l.href

        query = Feed.gql("WHERE link = :1", url_normalize(selflink))
        feed = query.fetch(1)[0]
        
        for entry in parsed['entries']:
            item = Item.get_or_insert(
                Item.makekeyname(entry.link),
                title=entry.title,
                link=entry.link,
                retrieved=datetime.now(),
                content=entry.content[0].value,
                #summary = entry.summary,
                version=1,
                created=datetime(*(entry.published_parsed[:6])),
                feed=feed,
                private=feed.private,
            )
Example #31
 def add(self, url, load_bad_url=False, always=False):
     url = url_normalize(url)
     if always:
         state = self._URL_TASK
     else:
         state = self.get_index_state(url)
         if state == self._URL_TASK:
             ## crawling
             return
         if state == self._URL_DONE:
             return
         if not load_bad_url and state == self._URL_BAD:
             return
     host = urlparse.urlparse(url).netloc
     if not host: return
     if host in self._pool:
         if url not in self._pool[host]:
             self._pool[host].add(url)
             self.url_count += 1
     else:
         self._pool[host] = set([url])
         self.url_count += 1
     self._urlindex.Put(url, self._URL_TASK)
Example #32
 def __init__(self, xml):
     url = xml.find("loc")
     lastmod = xml.find("lastmod")
     title = xml.find("news:title")
     description = xml.find("news:description")
     keywords = xml.find("news:keywords")
     publication_date = xml.find("news:publication_date")
     if not title:
         title = xml.find("video:title")
     if not description:
         description = xml.find("video:description")
     self.url = format_text(url_normalize(url.text.strip().lower()))
     self.html = ""
     self.tree = None
     parsed = urlparse(self.url)
     self.site = parsed.netloc
     self.path = parsed.path
     try:
         pardir = "/".join(re.sub(r"(/)$", "", self.path).split("/")[:-2])
      except Exception:
         pardir = "/"
     self.base_url = f"{parsed.scheme}://{parsed.netloc}{pardir}"
     self.lastmod = parse_timestamp(format_text(
         lastmod.text)) if lastmod else None
     self.headline = format_text(title.text.strip()) if title else ""
     self.keywords = ([format_text(kw) for kw in keywords.text.split(",")]
                      if keywords else [])
     self.publication_date = (format_text(publication_date.text)
                              if publication_date else "")
     self.description = format_text(description.text) if description else ""
     self.xml = format_text(xml.__repr__())
     self.metadata = {"schemata": [], "errors": []}
     self.has_metadata = False
     self.seen = self.url in seen
     # seen.add(self.url)
     self.articlebody = ""
     self.visited = False
Example #33
def load_sitemap_urls(fp="lib/newspapers.tsv"):
    fp = os.path.abspath(fp)
    news = load_csv(fp)
    loaded = []
    for row in list(news):
        resolved_urls = []
        for k, v in list(row.items()):
            if not v:
                continue
            elif k.startswith("sitemap_url_template"):
                resolved = datetime.datetime.now().strftime(v)
                resolved_urls.append(resolved)
            elif k.startswith("sitemap_url"):
                resolved_urls.append(v)
        print("rrresolved_urls")
        print(resolved_urls)
        print(row)
        url = url_normalize(row["url"]).strip().lower()
        parsed = urlparse(url)
        row["url"] = url
        row["site"] = parsed.netloc
        row["sitemap_urls"] = resolved_urls
        loaded.append(row)
    return loaded
Example #34
 def getLinks(self, url):
     url = url_normalize(url)
     
     self.links = []
     # Remember the base URL which will be important when creating
     # absolute URLs
     self.baseUrl = url
     # Use the urlopen function from the standard Python 3 library
     response = urlopen(url)
     head = response.info()
      isUrl = head.get_content_type().startswith('text/html')  # gettype() is Python 2 only; use get_content_type() on Python 3
     # print(isUrl)
     # Make sure that we are looking at HTML and not other things that
     # are floating around on the internet (such as
     # JavaScript files, CSS, or .PDFs for example)
     if isUrl:
         htmlBytes = response.read()
         # Note that feed() handles Strings well, but not bytes
         # (A change from Python 2.x to Python 3.x)
         htmlString = htmlBytes.decode("utf-8")
         self.feed(htmlString)
         return htmlString, self.links
     else:
         return "",[]
Example #35
def get_canonical_form(url, parent_url):
    try:
        can_url = url_normalize(url)

        can_url = str(can_url).replace(":80", "").replace(":443", "").replace("https", "http")
        if "#" in can_url:
            can_url = can_url.split("#")[0]

        split_array = re.split(r"^\.\./", can_url)
        if len(split_array) > 1:  # will execute if '../' is present at the url start
            extension = split_array[1]
            can_url = urljoin(parent_url, extension)

        if can_url.startswith("/"):
            can_url = urljoin(parent_url, can_url)

        url_array = urlparse(can_url)
        # ParseResult(scheme='http', netloc='www.browse-tutorials.net', path='/tutorial/get-self-base-url-appengine-urlparse',
        #             params='', query='', fragment='')
        base = url_array.scheme + "://" + url_array.netloc
        can_url = can_url[:len(base)].lower() + can_url[len(base):]
    except Exception:
        return ""
    return can_url
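The net effect is a scheme- and port-insensitive form with fragments stripped and relative links resolved against the parent. Illustrative calls (hypothetical URLs; the first also assumes url_normalize passes a bare path through unchanged):

get_canonical_form('/about', 'http://example.com/dir/index.html')
# -> 'http://example.com/about'
get_canonical_form('HTTPS://Example.com:443/Page#top', 'http://example.com/')
# -> 'http://example.com/Page'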
Example #36
    async def async_step_ssdp(
            self, discovery_info: ssdp.SsdpServiceInfo) -> FlowResult:
        """Handle SSDP initiated config flow."""
        await self.async_set_unique_id(discovery_info.upnp[ssdp.ATTR_UPNP_UDN])
        self._abort_if_unique_id_configured()

        # Attempt to distinguish from other non-LTE Huawei router devices, at least
        # some ones we are interested in have "Mobile Wi-Fi" friendlyName.
        if ("mobile"
                not in discovery_info.upnp.get(ssdp.ATTR_UPNP_FRIENDLY_NAME,
                                               "").lower()):
            return self.async_abort(reason="not_huawei_lte")

        if TYPE_CHECKING:
            assert discovery_info.ssdp_location
        url = url_normalize(
            discovery_info.upnp.get(
                ssdp.ATTR_UPNP_PRESENTATION_URL,
                f"http://{urlparse(discovery_info.ssdp_location).hostname}/",
            ))

        if serial_number := discovery_info.upnp.get(ssdp.ATTR_UPNP_SERIAL):
            await self.async_set_unique_id(serial_number)
            self._abort_if_unique_id_configured()
Example #37
    async def async_step_ssdp(self, discovery_info):
        """Handle SSDP initiated config flow."""
        # Attempt to distinguish from other non-LTE Huawei router devices, at least
        # some ones we are interested in have "Mobile Wi-Fi" friendlyName.
        if "mobile" not in discovery_info.get(ssdp.ATTR_UPNP_FRIENDLY_NAME,
                                              "").lower():
            return self.async_abort(reason="not_huawei_lte")

        # https://github.com/PyCQA/pylint/issues/3167
        url = self.context[CONF_URL] = url_normalize(  # pylint: disable=no-member
            discovery_info.get(
                ssdp.ATTR_UPNP_PRESENTATION_URL,
                f"http://{urlparse(discovery_info[ssdp.ATTR_SSDP_LOCATION]).hostname}/",
            ))

        if any(url == flow["context"].get(CONF_URL)
               for flow in self._async_in_progress()):
            return self.async_abort(reason="already_in_progress")

        user_input = {CONF_URL: url}
        if self._already_configured(user_input):
            return self.async_abort(reason="already_configured")

        return await self._async_show_user_form(user_input)
Example #38
async def async_setup(hass: HomeAssistant, config: ConfigType) -> bool:
    """Set up Huawei LTE component."""

    # dicttoxml (used by huawei-lte-api) has uselessly verbose INFO level.
    # https://github.com/quandyfactory/dicttoxml/issues/60
    logging.getLogger("dicttoxml").setLevel(logging.WARNING)

    # Arrange our YAML config to dict with normalized URLs as keys
    domain_config: dict[str, dict[str, Any]] = {}
    if DOMAIN not in hass.data:
        hass.data[DOMAIN] = HuaweiLteData(hass_config=config,
                                          config=domain_config)
    for router_config in config.get(DOMAIN, []):
        domain_config[url_normalize(
            router_config.pop(CONF_URL))] = router_config

    def service_handler(service: ServiceCall) -> None:
        """Apply a service."""
        routers = hass.data[DOMAIN].routers
        if url := service.data.get(CONF_URL):
            router = routers.get(url)
        elif not routers:
            _LOGGER.error("%s: no routers configured", service.service)
            return
Example #39
    async def async_step_user(self,
                              user_input: dict[str, Any] | None = None
                              ) -> FlowResult:
        """Handle user initiated config flow."""
        if user_input is None:
            return await self._async_show_user_form()

        errors = {}

        # Normalize URL
        user_input[CONF_URL] = url_normalize(user_input[CONF_URL],
                                             default_scheme="http")
        if "://" not in user_input[CONF_URL]:
            errors[CONF_URL] = "invalid_url"
            return await self._async_show_user_form(user_input=user_input,
                                                    errors=errors)

        conn: AuthorizedConnection

        def logout() -> None:
            try:
                conn.user.logout()
            except Exception:  # pylint: disable=broad-except
                _LOGGER.debug("Could not logout", exc_info=True)

        def try_connect(user_input: dict[str, Any]) -> AuthorizedConnection:
            """Try connecting with given credentials."""
            username = user_input.get(CONF_USERNAME) or ""
            password = user_input.get(CONF_PASSWORD) or ""
            conn = AuthorizedConnection(
                user_input[CONF_URL],
                username=username,
                password=password,
                timeout=CONNECTION_TIMEOUT,
            )
            return conn

        def get_device_info() -> tuple[GetResponseType, GetResponseType]:
            """Get router info."""
            client = Client(conn)
            try:
                device_info = client.device.information()
            except Exception:  # pylint: disable=broad-except
                _LOGGER.debug("Could not get device.information",
                              exc_info=True)
                try:
                    device_info = client.device.basic_information()
                except Exception:  # pylint: disable=broad-except
                    _LOGGER.debug("Could not get device.basic_information",
                                  exc_info=True)
                    device_info = {}
            try:
                wlan_settings = client.wlan.multi_basic_settings()
            except Exception:  # pylint: disable=broad-except
                _LOGGER.debug("Could not get wlan.multi_basic_settings",
                              exc_info=True)
                wlan_settings = {}
            return device_info, wlan_settings

        try:
            conn = await self.hass.async_add_executor_job(
                try_connect, user_input)
        except LoginErrorUsernameWrongException:
            errors[CONF_USERNAME] = "incorrect_username"
        except LoginErrorPasswordWrongException:
            errors[CONF_PASSWORD] = "incorrect_password"
        except LoginErrorUsernamePasswordWrongException:
            errors[CONF_USERNAME] = "invalid_auth"
        except LoginErrorUsernamePasswordOverrunException:
            errors["base"] = "login_attempts_exceeded"
        except ResponseErrorException:
            _LOGGER.warning("Response error", exc_info=True)
            errors["base"] = "response_error"
        except Timeout:
            _LOGGER.warning("Connection timeout", exc_info=True)
            errors[CONF_URL] = "connection_timeout"
        except Exception:  # pylint: disable=broad-except
            _LOGGER.warning("Unknown error connecting to device",
                            exc_info=True)
            errors[CONF_URL] = "unknown"
        if errors:
            await self.hass.async_add_executor_job(logout)
            return await self._async_show_user_form(user_input=user_input,
                                                    errors=errors)

        info, wlan_settings = await self.hass.async_add_executor_job(
            get_device_info)
        await self.hass.async_add_executor_job(logout)

        if not self.unique_id:
            if serial_number := info.get("SerialNumber"):
                await self.async_set_unique_id(serial_number)
                self._abort_if_unique_id_configured()
            else:
                await self._async_handle_discovery_without_unique_id()
Example #40
 def normalize_url_domain(url):
     url = url_normalize(url)
     return str(url) if str(url).endswith('/') else str(url) + '/'
Example #41
 def t(self, in_url, ex_url):
     self.assertEqual(url_normalize(in_url), ex_url)
Example #42
 def __init__(self, url):
   self.url = url
   self.cleaned_url = url_normalize(url)
Example #43
 def normalize_url(self, url):
     return url_normalize.url_normalize(url)
Example #44
def normalize_url(url):
    return url_normalize(url)
Example #45
 def __init__(self, dkubeURL, token):
     configuration.host = url_normalize('{}/dkube/v2/controller'.format(dkubeURL))
     configuration.api_key['Authorization'] = token
     configuration.verify_ssl = False
Example #46
    def check_domain(self, item):
        """check subdomains and check if url is a store (by keywords)"""
        counter = 0
        leader_phone = ''
        leader_phone_from_team = ''
        all_pages_phone = ''
        main_page_phone = ''
        phone = ''
        leader_email = ''
        leader_email_from_team = ''
        all_pages_email = ''
        main_page_email = ''
        email = ''
        leader_phone_without_sitemap = ''
        leader_email_without_sitemap = ''

        try:
            domain = item['Internet-Adresse']
            subdomains = []
            subdomains_list = list()
            domain_is_shop = False
            domain = str(domain)
            if domain != 'nan':

                # take a domain
                target = self.clear_url(domain)

                # make a request to an external service
                req = requests.get(
                    "https://crt.sh/?q=%.{d}&output=json".format(d=target),
                    headers=self.headers)

                if req.status_code != 200:
                    print("[X] Information not available!")

                else:
                    for (key, value) in enumerate(req.json()):
                        subdomains.append(value['name_value'])

                    subdomains = sorted(set(subdomains))

                    # select the required subdomains
                    for subdomain in subdomains:
                        if 'shop' in subdomain or 'store' in subdomain:
                            domain_is_shop = True
                            if '\n' in subdomain:
                                s = subdomain.split(sep='\n')
                                for v in s:
                                    if 'shop' in v or 'store' in v:
                                        subdomains_list.append(
                                            url_normalize(v))
                                        print(f'subdomain_m: {v}')
                            else:
                                subdomains_list.append(
                                    url_normalize(subdomain))
                                print(f'subdomain_o: {subdomain}')

                is_shop, main_page_phone, main_page_email, phone, email = self.is_shop_and_main_page(
                    domain, domain_is_shop)  # noqa
                leader_phone_without_sitemap = phone
                leader_email_without_sitemap = email

                if is_shop is True:
                    domain_is_shop = True

                if domain_is_shop is True:
                    # check the quantity of goods
                    common_list = list(subdomains_list)
                    subdomains_list = self.normalize_urls_list(common_list)
                    common_list.append(domain)
                    common_list = self.normalize_urls_list(common_list)
                    sitemap_tree = self.get_sitemap_tree(common_list)
                    if sitemap_tree:
                        leader_phone, leader_email = self.get_leader_phone_and_email_from_sitemap(
                            sitemap_tree)
                        leader_phone_from_team, leader_email_from_team = \
                            self.get_leader_phone_and_email_from_sitemap_section_team(sitemap_tree)
                        counter, all_pages_phone, all_pages_email = \
                            self.check_phones_emails_on_every_page_and_count_the_quantity_of_goods(sitemap_tree)
                    else:
                        pass

                phone, leader_phone_without_sitemap, leader_phone, leader_phone_from_team, main_page_phone,\
                    all_pages_phone = self.phone(
                        leader_phone_without_sitemap=leader_phone_without_sitemap, leader_phone=leader_phone,
                        leader_phone_from_team=leader_phone_from_team,
                        main_page_phone=main_page_phone, all_pages_phone=all_pages_phone
                    )

                email, leader_email_without_sitemap, leader_email, leader_email_from_team, main_page_email,\
                    all_pages_email = self.email(
                        leader_email_without_sitemap=leader_email_without_sitemap, leader_email=leader_email,
                        leader_email_from_team=leader_email_from_team,
                        main_page_email=main_page_email, all_pages_email=all_pages_email
                    )

                self.write_to_file(
                    item,
                    is_shop=domain_is_shop,
                    number_of_goods=counter,
                    shop_domain=subdomains_list,
                    phone=phone,
                    main_page_phone=main_page_phone,
                    leader_phone_without_sitemap=leader_phone_without_sitemap,
                    all_pages_phone=all_pages_phone,
                    leader_phone=leader_phone,
                    leader_phone_from_team=leader_phone_from_team,
                    email=email,
                    leader_email_without_sitemap=leader_email_without_sitemap,
                    leader_email=leader_email,
                    leader_email_from_team=leader_email_from_team,
                    main_page_email=main_page_email,
                    all_pages_email=all_pages_email)

                self.open_db()
                self.cur.execute(
                    """INSERT INTO Domains_and_subdomains (
                     DUNS, Handelsregister_Nummer, UID, Internet_Adresse, subdomains, Rechtsform, Filiale_Indikator,
                     Mitarbeiter, Mitarbeiter_Gruppe, is_shop, number_of_goods, phone, phone_main_page,
                     leader_phone_without_sitemap, phones_all_pages, leader_phone_sitemap, 
                     leader_phone_from_team_sitemap, email, email_main_page, leader_email_without_sitemap,  
                     emails_all_pages, leader_email_sitemap, leader_email_from_team_sitemap
                     )
                     VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
                    (  # noqa
                        item['DUNS'], item['Handelsregister-Nummer'],
                        item['UID'], item['Internet-Adresse'],
                        str(subdomains_list), item['Rechtsform'],
                        item['Filiale Indikator'], item['Mitarbeiter'],
                        item['Mitarbeiter Gruppe'], domain_is_shop, counter,
                        str(phone), str(main_page_phone),
                        str(leader_phone_without_sitemap),
                        str(all_pages_phone), str(leader_phone),
                        str(leader_phone_from_team), str(email),
                        str(main_page_email),
                        str(leader_email_without_sitemap),
                        str(all_pages_email), str(leader_email),
                        str(leader_email_from_team)))
                self.connection.commit()
                self.close_db()

            else:
                self.write_to_file(
                    item,
                    is_shop=False,
                    number_of_goods=0,
                    shop_domain='',
                    phone=phone,
                    main_page_phone=main_page_phone,
                    leader_phone_without_sitemap=leader_phone_without_sitemap,
                    all_pages_phone=all_pages_phone,
                    leader_phone=leader_phone,
                    leader_phone_from_team=leader_phone_from_team,
                    email=email,
                    leader_email_without_sitemap=leader_email_without_sitemap,
                    leader_email=leader_email,
                    leader_email_from_team=leader_email_from_team,
                    main_page_email=main_page_email,
                    all_pages_email=all_pages_email)
                self.open_db()
                self.cur.execute(
                    """INSERT INTO Domains_and_subdomains (
                     DUNS, Handelsregister_Nummer, UID, Internet_Adresse, subdomains, Rechtsform, Filiale_Indikator,
                     Mitarbeiter, Mitarbeiter_Gruppe, is_shop, number_of_goods, phone, phone_main_page,
                     leader_phone_without_sitemap, phones_all_pages, leader_phone_sitemap, 
                     leader_phone_from_team_sitemap, email, email_main_page, leader_email_without_sitemap,  
                     emails_all_pages, leader_email_sitemap, leader_email_from_team_sitemap
                     )
                     VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
                    (  # noqa
                        item['DUNS'], item['Handelsregister-Nummer'],
                        item['UID'], item['Internet-Adresse'], '',
                        item['Rechtsform'], item['Filiale Indikator'],
                        item['Mitarbeiter'], item['Mitarbeiter Gruppe'], False,
                        0, '', '', '', '', '', '', '', '', '', '', '', ''))
                self.connection.commit()
                self.close_db()

        except Exception as e:
            print(f'check_domain: {e}')
            self.write_to_file(
                item,
                is_shop=False,
                number_of_goods=0,
                shop_domain='',
                phone=phone,
                main_page_phone=main_page_phone,
                leader_phone_without_sitemap=leader_phone_without_sitemap,
                all_pages_phone=all_pages_phone,
                leader_phone=leader_phone,
                leader_phone_from_team=leader_phone_from_team,
                email=email,
                leader_email_without_sitemap=leader_email_without_sitemap,
                leader_email=leader_email,
                leader_email_from_team=leader_email_from_team,
                main_page_email=main_page_email,
                all_pages_email=all_pages_email)
            self.open_db()
            self.cur.execute(
                """INSERT INTO Domains_and_subdomains (
                     DUNS, Handelsregister_Nummer, UID, Internet_Adresse, subdomains, Rechtsform, Filiale_Indikator,
                     Mitarbeiter, Mitarbeiter_Gruppe, is_shop, number_of_goods, phone, phone_main_page,
                     leader_phone_without_sitemap, phones_all_pages, leader_phone_sitemap, 
                     leader_phone_from_team_sitemap, email, email_main_page, leader_email_without_sitemap,  
                     emails_all_pages, leader_email_sitemap, leader_email_from_team_sitemap
                     )
                     VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
                (  # noqa
                    item['DUNS'], item['Handelsregister-Nummer'], item['UID'],
                    item['Internet-Adresse'], '', item['Rechtsform'],
                    item['Filiale Indikator'], item['Mitarbeiter'],
                    item['Mitarbeiter Gruppe'], False, 0, '', '', '', '', '',
                    '', '', '', '', '', '', ''))
            self.connection.commit()
            self.close_db()
Example #47
    async def async_step_user(self, user_input=None):
        """Handle user initiated config flow."""
        if user_input is None:
            return await self._async_show_user_form()

        errors = {}

        # Normalize URL
        user_input[CONF_URL] = url_normalize(user_input[CONF_URL],
                                             default_scheme="http")
        if "://" not in user_input[CONF_URL]:
            errors[CONF_URL] = "invalid_url"
            return await self._async_show_user_form(user_input=user_input,
                                                    errors=errors)

        if self._already_configured(user_input):
            return self.async_abort(reason="already_configured")

        conn = None

        def logout():
            if hasattr(conn, "user"):
                try:
                    conn.user.logout()
                except Exception:  # pylint: disable=broad-except
                    _LOGGER.debug("Could not logout", exc_info=True)

        def try_connect(username: Optional[str],
                        password: Optional[str]) -> Connection:
            """Try connecting with given credentials."""
            if username or password:
                conn = AuthorizedConnection(
                    user_input[CONF_URL],
                    username=username,
                    password=password,
                    timeout=CONNECTION_TIMEOUT,
                )
            else:
                try:
                    conn = AuthorizedConnection(
                        user_input[CONF_URL],
                        username="",
                        password="",
                        timeout=CONNECTION_TIMEOUT,
                    )
                    user_input[CONF_USERNAME] = ""
                    user_input[CONF_PASSWORD] = ""
                except ResponseErrorException:
                    _LOGGER.debug(
                        "Could not login with empty credentials, proceeding unauthenticated",
                        exc_info=True,
                    )
                    conn = Connection(user_input[CONF_URL],
                                      timeout=CONNECTION_TIMEOUT)
                    del user_input[CONF_USERNAME]
                    del user_input[CONF_PASSWORD]
            return conn

        def get_router_title(conn: Connection) -> str:
            """Get title for router."""
            title = None
            client = Client(conn)
            try:
                info = client.device.basic_information()
            except Exception:  # pylint: disable=broad-except
                _LOGGER.debug("Could not get device.basic_information",
                              exc_info=True)
            else:
                title = info.get("devicename")
            if not title:
                try:
                    info = client.device.information()
                except Exception:  # pylint: disable=broad-except
                    _LOGGER.debug("Could not get device.information",
                                  exc_info=True)
                else:
                    title = info.get("DeviceName")
            return title or DEFAULT_DEVICE_NAME

        username = user_input.get(CONF_USERNAME)
        password = user_input.get(CONF_PASSWORD)
        try:
            conn = await self.hass.async_add_executor_job(
                try_connect, username, password)
        except LoginErrorUsernameWrongException:
            errors[CONF_USERNAME] = "incorrect_username"
        except LoginErrorPasswordWrongException:
            errors[CONF_PASSWORD] = "incorrect_password"
        except LoginErrorUsernamePasswordWrongException:
            errors[CONF_USERNAME] = "incorrect_username_or_password"
        except LoginErrorUsernamePasswordOverrunException:
            errors["base"] = "login_attempts_exceeded"
        except ResponseErrorException:
            _LOGGER.warning("Response error", exc_info=True)
            errors["base"] = "response_error"
        except Timeout:
            _LOGGER.warning("Connection timeout", exc_info=True)
            errors[CONF_URL] = "connection_timeout"
        except Exception:  # pylint: disable=broad-except
            _LOGGER.warning("Unknown error connecting to device",
                            exc_info=True)
            errors[CONF_URL] = "unknown_connection_error"
        if errors:
            await self.hass.async_add_executor_job(logout)
            return await self._async_show_user_form(user_input=user_input,
                                                    errors=errors)

        title = await self.hass.async_add_executor_job(get_router_title, conn)
        await self.hass.async_add_executor_job(logout)

        return self.async_create_entry(title=title, data=user_input)
Example #48
def normalize_url(url: Optional[str]) -> Optional[str]:
    if url is None:
        return url

    return url_normalize.url_normalize(url)
Example #49
 def __init__(self, url):
   self.url = url
   self.normalized = url_normalize(url)
Example #50
def test_url_normalize_results():
    """Assert url_normalize return expected results."""
    for value, expected in EXPECTED_RESULTS.items():
        assert expected == url_normalize(value), value
Example #51
 def validate(self, value):
     value = db.LinkProperty().validate(value)
     value = url_normalize(value)
     return value
Example #52
def canonical(url):
  norm = url_normalize(url)
  return norm.split('#', 1)[0]
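A quick check of the fragment stripping (illustrative input):

print(canonical('http://Example.com/page#section'))  # http://example.com/page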
Example #53
File: Url.py Project: tobyk100/Url
 def getNormalized(self):
   return url_normalize(self.url)
Example #54
from betamax import Betamax
from betamax_serializers.pretty_json import PrettyJSONSerializer
from click.testing import CliRunner
from ddi.cli import initiate_session
from ddi.ipv4 import *

import base64
import os
import pytest
import url_normalize

ddi_host = os.environ.get('DDI_HOST', 'ddi-test-host.example.com')
ddi_password = os.environ.get('DDI_PASSWORD', 'test_password')
ddi_server = os.environ.get('DDI_SERVER', 'https://ddi.example.com')
ddi_site_name = os.environ.get('DDI_SITE_NAME', 'EXAMPLE')
ddi_url = url_normalize.url_normalize(ddi_server)
ddi_username = os.environ.get('DDI_USERNAME', 'test_user')
domain_name = os.environ.get('DOMAINNAME', 'example.com')
errant_ddi_host = 'bad-host.example.com'

# Test variables:
errant_ipv4_address = '1.1.1.1'
errant_subnet = '1.1.1.0'
ipv4_address = '172.23.23.4'
subnet = '172.23.23.0'

# Makes the output more readable
Betamax.register_serializer(PrettyJSONSerializer)

config = Betamax.configure()
config.cassette_library_dir = 'tests/cassettes'
Example #55
numberOfPages = int(raw_input('n: '))
print "Start time:" + str(time.asctime(time.localtime(time.time())))
start = time.time()
userinput = urllib.urlencode({'q': userinput})
response = urllib.urlopen('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&rsz=8&' + userinput).read()  # extract the first 8 results from the Google search engine
json = myjson.loads(response)
results = json['responseData']['results']
RepeatedLinkCheckdict=dict()  # dictionary to check whether a link is repeated for crawling.
dictValue = 1

from collections import deque
initqueue = deque()           # This is the queue that will contain valid urls to be crawled, in Breadth First Search order.

for result in results:
    url = result['url']   
    normalUrl=url_normalize.url_normalize(url)
    if (normalUrl in RepeatedLinkCheckdict): pass   # If the current url is already crawled discard
    else:
      RepeatedLinkCheckdict[normalUrl]=dictValue
      dictValue=dictValue+1
      initqueue.append(url)
   

PageDataList=[]            # This list will store the content of the pages successfully crawled
LinksParsedInOrder=[]      # This list will store the successfully crawled links in the order they are crawled and the related statistical information
lengthOfQueue=len(initqueue)
outputData=''
UrlCode=0
i=-1
TotalSizeOfPagesDownloaded=0
while i<numberOfPages:
Example #56
 def normalize(self):
     return url_normalize(self.url.geturl())
Example #57
async def async_setup(hass: HomeAssistantType, config) -> bool:
    """Set up Huawei LTE component."""

    # Arrange our YAML config to dict with normalized URLs as keys
    domain_config = {}
    if DOMAIN not in hass.data:
        hass.data[DOMAIN] = HuaweiLteData(hass_config=config,
                                          config=domain_config)
    for router_config in config.get(DOMAIN, []):
        domain_config[url_normalize(
            router_config.pop(CONF_URL))] = router_config

    def service_handler(service) -> None:
        """Apply a service."""
        url = service.data.get(CONF_URL)
        routers = hass.data[DOMAIN].routers
        if url:
            router = routers.get(url)
        elif not routers:
            _LOGGER.error("%s: no routers configured", service.service)
            return
        elif len(routers) == 1:
            router = next(iter(routers.values()))
        else:
            _LOGGER.error(
                "%s: more than one router configured, must specify one of URLs %s",
                service.service,
                sorted(routers),
            )
            return
        if not router:
            _LOGGER.error("%s: router %s unavailable", service.service, url)
            return

        if service.service == SERVICE_CLEAR_TRAFFIC_STATISTICS:
            if router.suspended:
                _LOGGER.debug("%s: ignored, integration suspended",
                              service.service)
                return
            result = router.client.monitoring.set_clear_traffic()
            _LOGGER.debug("%s: %s", service.service, result)
        elif service.service == SERVICE_REBOOT:
            if router.suspended:
                _LOGGER.debug("%s: ignored, integration suspended",
                              service.service)
                return
            result = router.client.device.reboot()
            _LOGGER.debug("%s: %s", service.service, result)
        elif service.service == SERVICE_RESUME_INTEGRATION:
            # Login will be handled automatically on demand
            router.suspended = False
            _LOGGER.debug("%s: %s", service.service, "done")
        elif service.service == SERVICE_SUSPEND_INTEGRATION:
            router.logout()
            router.suspended = True
            _LOGGER.debug("%s: %s", service.service, "done")
        else:
            _LOGGER.error("%s: unsupported service", service.service)

    for service in ADMIN_SERVICES:
        hass.helpers.service.async_register_admin_service(
            DOMAIN,
            service,
            service_handler,
            schema=SERVICE_SCHEMA,
        )

    for url, router_config in domain_config.items():
        hass.async_create_task(
            hass.config_entries.flow.async_init(
                DOMAIN,
                context={"source": SOURCE_IMPORT},
                data={
                    CONF_URL: url,
                    CONF_USERNAME: router_config.get(CONF_USERNAME),
                    CONF_PASSWORD: router_config.get(CONF_PASSWORD),
                },
            ))

    return True
Example #58
 def makekeyname(url):
     return hashlib.sha224(url_normalize(url)).hexdigest()
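As written this is Python 2 code: on Python 3, hashlib.sha224 requires bytes, so the normalized URL needs encoding first. A Python 3 sketch of the same key derivation:

import hashlib
from url_normalize import url_normalize

def makekeyname(url):
    # Encode to UTF-8 because hashlib requires bytes on Python 3.
    return hashlib.sha224(url_normalize(url).encode('utf-8')).hexdigest()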