Exemplo n.º 1
0
def reduce(url):
    """Cut ``url`` down to the part captured by the ``regex['end']`` pattern.

    Args:
        url: The url to reduce; it must pass ``requester.validate_url``.

    Returns:
        Group 1 of the first ``regex['end']`` match found in ``url``, or
        ``url`` itself when the pattern does not match.

    Raises:
        InvalidUrlError: If ``url`` fails validation.
    """
    if not requester.validate_url(url):
        raise InvalidUrlError('Cannot reduce invalid url: ' + str(url))
    match = re.search(regex['end'], url)
    return match.group(1) if match else url
Exemplo n.º 2
0
def remove_schema(url):
    """Return ``url`` without its scheme.

    The result is the network location followed by the path and, when one
    is present, ``?`` plus the query string. No other component of the
    parsed url is included.

    Args:
        url: The url to strip; it must pass ``requester.validate_url``.

    Returns:
        The scheme-less url as a string.

    Raises:
        InvalidUrlError: If ``url`` fails validation.
    """
    if not requester.validate_url(url):
        raise InvalidUrlError('Cannot remove schema of invalid url: ' +
                              str(url))
    parts = urlparse(url)
    stripped = parts.netloc + parts.path
    if not parts.query:
        return stripped
    return stripped + '?' + parts.query
Exemplo n.º 3
0
def partial(url, host):
    """Resolve a possibly relative ``url`` against ``host``.

    Args:
        url: The url (absolute or relative) to resolve; must be a ``str``.
        host: The base url to resolve against; it must pass
            ``requester.validate_url``.

    Returns:
        The result of ``urljoin(host, url)``.

    Raises:
        InvalidInputError: If ``url`` is not a string (this includes None).
        InvalidHostError: If ``host`` fails validation.
    """
    # isinstance already rejects None, so no separate None check is needed.
    if not isinstance(url, str):
        raise InvalidInputError('Cannot fix partiality of invalid url: ' +
                                str(url))
    if not requester.validate_url(host):
        # Report the value that was actually rejected: the host, not the url.
        raise InvalidHostError('Cannot fix partiality with invalid host: ' +
                               str(host))
    return urljoin(host, url)
Exemplo n.º 4
0
    def add_to_database_by_ip_address(self,
                                      ip_address,
                                      netloc,
                                      host,
                                      working_link,
                                      title=None,
                                      playable_url=None):
        """Insert or update the document stored for ``ip_address``.

        When ``document_from_ip_address`` finds nothing, a new document is
        inserted. Otherwise ``host`` is passed to ``add_to_hosts`` and then
        either ``netloc`` is pushed onto ``network_locations`` (when it is
        new) or the stream status is refreshed (when it is already listed).
        In both cases ``title`` is finally handed to ``add_to_titles``.

        Args:
            ip_address: IP address keying the document. ``None`` skips the
                ``regex['ip']`` format check.
            netloc: Network location of the stream; must be a valid url.
            host: Url of the page linking to the stream; must be a valid url.
            working_link: Truthy stores "Working", falsy stores "Broken" on a
                newly inserted document.
            title: Optional title forwarded to ``add_to_titles``.
            playable_url: Optional url stored in ``playable_urls`` of a newly
                inserted document.

        Raises:
            InvalidInputError: If ``ip_address`` is not None and does not
                match ``regex['ip']``.
            InvalidUrlError: If ``netloc`` or ``host`` fails validation.
        """
        ip_is_malformed = (ip_address is not None
                           and not re.search(regex['ip'], ip_address))
        if ip_is_malformed:
            raise InvalidInputError(
                'Cannot add to database with invalid IP address: %s' %
                ip_address)
        if not requester.validate_url(netloc):
            raise InvalidUrlError(
                'Cannot add to database with an invalid url: %s' % netloc)
        if not requester.validate_url(host):
            raise InvalidUrlError(
                'Cannot add to database with an invalid host: %s' % host)

        existing = self.document_from_ip_address(ip_address)
        if existing:
            self.add_to_hosts(ip_address, host)
            if netloc in existing["network_locations"]:
                # Known location: only its stream status needs refreshing.
                self.update_stream_status(ip_address, working_link)
            else:
                self._collection.update(
                    {'ip_address': ip_address},
                    {'$push': {'network_locations': netloc}})
        else:
            # First sighting of this IP address: create its document.
            record = {
                'ip_address': ip_address,
                'network_locations': [netloc],
                'titles': [],
                'linked_by': [host],
                'stream_status': "Working" if working_link else "Broken",
                'playable_urls': [playable_url] if playable_url else [],
            }
            self._collection.insert(record)
        self.add_to_titles(ip_address, title)
Exemplo n.º 5
0
 def update_running(self, host, running):
     """Set the ``running`` flag of the hosts entry belonging to ``host``.

     Args:
         host: Url of the host whose entry is updated; must be a valid url.
         running: New value stored under the entry's ``running`` key.

     Raises:
         InvalidUrlError: If ``host`` fails validation.
         EntryNotInDatabaseError: If ``entry_from_host`` finds no entry.
     """
     if not requester.validate_url(host):
         raise InvalidUrlError(
             'Cannot update running entry with an invalid host: %s' % host)
     if self.entry_from_host(host) is None:
         raise EntryNotInDatabaseError(
             'The host %s is not in the hosts database' % host)
     # add_to_hosts stores, and entry_from_host queries, the prepared
     # network location. Filtering on the raw host would silently match
     # nothing whenever the two differ, so normalise it the same way here.
     self._db.hosts.update({'host': um.prepare_netloc(host)},
                           {'$set': {'running': running}})
Exemplo n.º 6
0
def deport(url):
    """Remove the port from ``url`` using the ``regex['port']`` pattern.

    Args:
        url: The url to process; it must pass ``requester.validate_url``.

    Returns:
        Group 1 of the match followed by group 3 when group 3 is non-empty,
        group 1 alone otherwise, or ``url`` unchanged when the pattern does
        not match. Presumably group 1 is the text before the port and
        group 3 the text after it; the pattern is defined elsewhere.

    Raises:
        InvalidUrlError: If ``url`` fails validation.
    """
    if not requester.validate_url(url):
        raise InvalidUrlError('Cannot remove port of invalid url: ' + str(url))
    match = re.search(regex['port'], url)
    if not match:
        return url
    head, tail = match.group(1), match.group(3)
    return head + tail if tail else head
Exemplo n.º 7
0
def remove_identifier(url):
    """Remove the identifier matched by ``regex['identity']`` from ``url``.

    Args:
        url: The url to process; it must pass ``requester.validate_url``.

    Returns:
        Groups 1 and 3 of the match joined together, or ``url`` unchanged
        when the pattern does not match.

    Raises:
        InvalidUrlError: If ``url`` fails validation.
    """
    if not requester.validate_url(url):
        raise InvalidUrlError(
            'Cannot remove the identifier of an invalid url: %s' % url)
    match = re.search(regex['identity'], url)
    if not match:
        return url
    # Group 2 holds the identifier itself, so only its neighbours are kept.
    before, after = match.group(1), match.group(3)
    return before + after
Exemplo n.º 8
0
def expand(url):
    """Expand ``url`` when it matches the ``regex['short']`` pattern.

    Args:
        url: The url to expand; it must pass ``requester.validate_url``.

    Returns:
        The ``url`` attribute of the object returned by
        ``requester.make_request(url)`` when ``url`` matches the pattern and
        the request returns something other than None; otherwise ``url``
        unchanged.

    Raises:
        InvalidUrlError: If ``url`` fails validation.
    """
    if not requester.validate_url(url):
        raise InvalidUrlError('Cannot fix shortness of invalid url: ' +
                              str(url))
    if not re.search(regex['short'], url):
        return url
    response = requester.make_request(url)
    return url if response is None else response.url
Exemplo n.º 9
0
def remove_top(url):
    """Split the network location of ``url`` into two parts.

    The identifier and port are removed from ``url`` before its network
    location is examined.

    Args:
        url: The url to process; it must pass ``requester.validate_url``.

    Returns:
        ``(netloc, 'ip')`` when the network location matches ``regex['ip']``;
        otherwise groups 1 and 2 of the ``regex['top']`` match, presumably
        the domain and its top-level part (the pattern is defined elsewhere).

    Raises:
        InvalidUrlError: If ``url`` fails validation.
    """
    if not requester.validate_url(url):
        raise InvalidUrlError('Cannot remove top of invalid url: ' + str(url))
    netloc = urlparse(deport(remove_identifier(url))).netloc
    if re.search(regex['ip'], netloc):
        return netloc, 'ip'
    match = re.search(regex['top'], netloc)
    # NOTE(review): when regex['top'] does not match, ``match`` is None and
    # the next line raises AttributeError; confirm whether that can happen.
    return match.group(1), match.group(2)
Exemplo n.º 10
0
 def add_to_hosts(self, host, running=False):
     """Insert a new entry for ``host`` into the hosts collection.

     Args:
         host: Url of the host to add; must be a valid url. It is stored as
             ``um.prepare_netloc(host)``.
         running: Value stored under the entry's ``running`` key.

     Raises:
         InvalidUrlError: If ``host`` fails validation.
         EntryInDatabaseError: If ``entry_from_host`` already finds an entry.
     """
     if not requester.validate_url(host):
         raise InvalidUrlError(
             'Cannot add a url to streams with an invalid host: %s' % host)
     if self.entry_from_host(host):
         raise EntryInDatabaseError(
             'The following host is already in the database: %s' % host)
     self._db.hosts.insert({
         'host': um.prepare_netloc(host),
         'running': running,
     })
Exemplo n.º 11
0
 def entry_from_host(self, host):
     """Return the hosts entry stored for ``host``, if there is one.

     Args:
         host: Url of the host to look up; must be a valid url. The lookup
             key is ``um.prepare_netloc(host)``.

     Returns:
         The single matching entry, or None when no entry matches.

     Raises:
         InvalidUrlError: If ``host`` fails validation.
         MultipleEntriesInDatabaseError: If more than one entry matches.
     """
     if not requester.validate_url(host):
         raise InvalidUrlError(
             'Cannot retrieve entry with an invalid host: %s' % host)
     cursor = self._db.hosts.find({'host': um.prepare_netloc(host)})
     # Count once: every count() call is a separate query, and two calls
     # could disagree if the collection changes between them.
     matches = cursor.count()
     if matches > 1:
         raise MultipleEntriesInDatabaseError(
             'There are multiple entries in the hosts database with the same host: %s'
             % host)
     if matches == 1:
         return cursor[0]  # the one entry stored for this host
     return None  # no entry stored for this host
Exemplo n.º 12
0
def test_validate_url():
    """Check which candidate values ``requester.validate_url`` accepts."""
    candidates = [
        'https://www.reddit.com',
        'www.reddit.com',
        None,
        'dog',
        6,
        [],
        'https://www.reddit.com',
        'http://reddit.com/r/all',
        'http://leaderpro.pt:25461/live/Andreia/Andreia/15182.ts',
        'http://streamer1.streamhost.org:1935/salive/GMIalfah/chunklist.m3u8?',
        'https://192.12.15.0/play/.ts',
    ]
    # Positions of the candidates that are expected to pass validation.
    expected = [candidates[index] for index in (0, 6, 7, 8, 9)]

    accepted = [
        candidate for candidate in candidates
        if requester.validate_url(candidate)
    ]

    assert accepted == expected
Exemplo n.º 13
0
def prepare_netloc(url):
    """Normalise ``url`` to ``scheme://netloc``.

    The identifier is removed first, then the port, and the result is passed
    through ``reduce`` before it is parsed.

    Args:
        url: The url to normalise; it must pass ``requester.validate_url``.

    Returns:
        The scheme and network location of the cleaned url joined by
        ``://``.

    Raises:
        InvalidUrlError: If ``url`` fails validation.
    """
    if not requester.validate_url(url):
        raise InvalidUrlError('Cannot prepare_netloc invalid url: ' + str(url))
    without_identifier = remove_identifier(url)
    without_port = deport(without_identifier)
    parts = urlparse(reduce(without_port))
    return parts.scheme + '://' + parts.netloc
Exemplo n.º 14
0
 def add_to_streams(self, url, host, ext_title=None):
     """Record the stream at ``url`` for every IP address of its network location.

     The network location is ``um.prepare_netloc(url)``. While it is in
     neither ``broken_stream_links`` nor ``working_stream_links``, its IP
     addresses are resolved (and cached in ``self.ip_addresses``). For each
     address whose attempt counter currently equals a value in ``self.fibs``
     the stream is evaluated and the outcome is written through
     ``add_to_database_by_ip_address``. Once the network location is in
     ``working_stream_links``, only ``ext_title`` is added to the titles of
     its cached IP addresses.

     Args:
         url: Url of the stream; must be a valid url.
         host: Url of the page linking to the stream; must be a valid url.
         ext_title: Optional title forwarded to the database helpers.

     Raises:
         InvalidUrlError: If ``url`` or ``host`` fails validation.
     """
     if not requester.validate_url(url):
         raise InvalidUrlError('Cannot add an invalid url to streams: %s' %
                               url)
     if not requester.validate_url(host):
         raise InvalidUrlError(
             'Cannot add a url to streams with an invalid host: %s' % host)
     netloc = um.prepare_netloc(url)
     if netloc not in self.broken_stream_links and netloc not in self.working_stream_links:  # Note that network locations are only
         # added to broken_stream_links or working_stream_links once their working link status is known. Also note that visitor
         # classes are unique to crawler classes, so each visitor class only deals with one host. Therefore, once a stream has
         # been added to the database from a given visitor class with a known working link status, it no longer needs to be
         # evaluated; doing so would lead to redundant requests to the database.
         if netloc not in self.ip_addresses:  # no IP addresses are cached for this network location yet
             try:
                 ip_addresses = socket.gethostbyname_ex(
                     um.remove_schema(netloc)
                 )[2]  # element 2 of the result is the list of IP addresses
                 self.ip_addresses[netloc] = ip_addresses
             except socket.gaierror:  # resolution failed; the network location is treated as down
                 # NOTE(review): nothing is cached on failure, so the next
                 # call for this network location resolves it again.
                 ip_addresses = None
         else:
             ip_addresses = self.ip_addresses[netloc]
         if ip_addresses:
             # Outcome per url within this call, so the stream is evaluated
             # at most once even when several IP addresses are processed.
             stream_statuses = {}
             for ip_address in ip_addresses:
                 playable_url = False
                 # The attempt counter of an (IP address, netloc) pair starts at 1.
                 if (ip_address, netloc) not in self.connection_attempts:
                     self.connection_attempts[(ip_address, netloc)] = 1
                 # Only act when the counter equals a value in self.fibs
                 # (presumably Fibonacci numbers, i.e. a back-off schedule;
                 # self.fibs is defined elsewhere).
                 if self.connection_attempts[(ip_address,
                                              netloc)] in self.fibs:
                     if url not in stream_statuses:
                         try:
                             stream_status = requester.evaluate_stream(url)
                         except StreamTimedOutError:
                             stream_statuses[url] = working_link = False
                         else:
                             if stream_status:
                                 stream_statuses[url] = working_link = True
                                 r = requester.make_request(url)
                                 if r and r.ok:
                                     playable_url = True
                             elif self.connection_attempts[(
                                     ip_address, netloc)] == self.fibs[-1]:
                                 # Last scheduled attempt and still not working: mark as broken.
                                 stream_statuses[url] = working_link = False
                             else:
                                 # Undetermined: neither working nor yet declared broken.
                                 stream_statuses[url] = working_link = None
                     else:
                         # NOTE(review): on this reuse path playable_url stays
                         # False, so the url is not passed as playable for
                         # later IP addresses even if the stream works.
                         working_link = stream_statuses[url]
                     if not playable_url:
                         self.add_to_database_by_ip_address(
                             ip_address, netloc, host, working_link,
                             ext_title)
                     else:
                         self.add_to_database_by_ip_address(
                             ip_address, netloc, host, working_link,
                             ext_title, url)
                     # A None status leaves both sets untouched, so the
                     # network location is examined again on a later call.
                     if working_link:
                         self.working_stream_links.add(netloc)
                     elif working_link is False:
                         self.broken_stream_links.add(netloc)
                 self.connection_attempts[(ip_address, netloc)] += 1
     elif netloc in self.working_stream_links:
         # Known working network location: only record the title for each cached IP address.
         ip_addresses = self.ip_addresses[netloc]
         for ip_address in ip_addresses:
             self.add_to_titles(ip_address, ext_title)