def error_check_prepare_netloc():
    """Check that ``um.prepare_netloc`` rejects malformed input.

    Each candidate below (``None``, an integer, a ``//``-prefixed address
    without a scheme and a bare hostname) must make ``um.prepare_netloc``
    raise ``InvalidUrlError``.
    """
    invalid_inputs = (None, 23, '//www.reddit.com', 'www.google.com')
    for candidate in invalid_inputs:
        with pytest.raises(InvalidUrlError):
            um.prepare_netloc(candidate)
def internal(url, host):
    """Return whether *url* and *host* share the same network location.

    Both arguments are checked with ``validate_url`` (``url`` first) and then
    reduced with ``um.prepare_netloc``; the result is the equality of the two
    prepared values.

    Raises:
        InvalidUrlError: if ``url`` or ``host`` fails ``validate_url``.
    """
    if not validate_url(url):
        message = 'Cannot define internal status of invalid url: ' + str(url)
        raise InvalidUrlError(message)

    if not validate_url(host):
        message = 'Cannot define internal status with invalid base: ' + str(host)
        raise InvalidUrlError(message)

    return um.prepare_netloc(url) == um.prepare_netloc(host)
def add_hosts_to_database():
    """Add every entry of the module-level ``hosts`` to ``host_list``.

    Each host is first reduced with ``um.prepare_netloc``; it is only added
    when ``host_list.entry_from_host`` reports no existing entry for it.
    """
    for raw_host in hosts:
        netloc = um.prepare_netloc(raw_host)
        if host_list.entry_from_host(netloc) is not None:
            # An entry already exists for this network location.
            continue
        host_list.add_to_hosts(netloc)
def test_document_from_ip_address():
    """Check that ``Streamer.document_from_ip_address`` finds inserted documents.

    For each sample stream URL the network location is resolved to its IP
    addresses (this performs a real DNS lookup), a document is inserted
    directly into the streamer's collection for every address, and the
    document is then read back through ``document_from_ip_address``.

    The streamer is deleted in a ``finally`` clause so that a failed
    assertion or lookup does not leave the test collection behind.
    """
    streamer = Streamer('test_document_from_ip_address')
    host = 'http://list-iptv.com'
    url1 = 'http://62.210.245.19:8000/live/testapp/testapp/2.ts'
    url2 = 'http://clientportalpro.com:2500/live/VE5DWv4Ait/7KHLqRRZ9E/2160.ts'
    url3 = 'http://ndasat.pro:8000/live/exch/exch/1227.ts'
    try:
        for url in [url1, url2, url3]:
            netloc = um.prepare_netloc(url)
            # Index 2 of gethostbyname_ex's result is the list of IP addresses.
            ip_addresses = socket.gethostbyname_ex(um.remove_schema(netloc))[2]
            for ip_address in ip_addresses:
                data = {
                    'ip_address': ip_address,
                    'network_locations': [
                        SON([('network_location', netloc),
                             ('linked_by', [host]),
                             ('working_link', True)])
                    ]
                }
                streamer.collection().insert(data)
                doc = streamer.document_from_ip_address(ip_address)
                assert doc['ip_address'] == ip_address
                assert doc['network_locations'] == [
                    SON([('network_location', netloc),
                         ('linked_by', [host]),
                         ('working_link', True)])
                ]
    finally:
        # Always remove the test data, even when an assertion above fails.
        streamer.delete()
def add_to_hosts(self, host, running=False):
    """Insert a new host entry into the ``hosts`` collection.

    Args:
        host: URL of the host; stored after ``um.prepare_netloc`` is applied.
        running: value stored under the ``'running'`` key of the new entry.

    Raises:
        InvalidUrlError: if ``host`` fails ``requester.validate_url``.
        EntryInDatabaseError: if ``entry_from_host`` already returns an entry
            for ``host``.
    """
    host_is_valid = requester.validate_url(host)
    if not host_is_valid:
        raise InvalidUrlError(
            'Cannot add a url to streams with an invalid host: %s' % host)

    if self.entry_from_host(host):
        raise EntryInDatabaseError(
            'The following host is already in the database: %s' % host)

    # Entry layout: the prepared network location plus its running flag.
    self._db.hosts.insert({
        'host': um.prepare_netloc(host),
        'running': running,
    })
def entry_from_host(self, host):
    """Return the ``hosts`` entry stored for *host*, or ``None``.

    The lookup key is ``um.prepare_netloc(host)``.

    Args:
        host: URL of the host to look up.

    Returns:
        The single matching document, or ``None`` when nothing matches.

    Raises:
        InvalidUrlError: if ``host`` fails ``requester.validate_url``.
        MultipleEntriesInDatabaseError: if more than one document matches.
    """
    if not requester.validate_url(host):
        raise InvalidUrlError(
            'Cannot retrieve entry with an invalid host: %s' % host)

    cursor = self._db.hosts.find({'host': um.prepare_netloc(host)})
    # Count the matches once: both checks below then see the same value,
    # and the second count query is avoided.
    match_count = cursor.count()
    if match_count > 1:
        raise MultipleEntriesInDatabaseError(
            'There are multiple entries in the hosts database with the same host: %s'
            % host)
    if match_count == 1:
        return cursor[0]  # the only document matching the host
    return None  # no document matches the host
def test_add_to_database_by_ip_address():
    """Check that ``Streamer.add_to_database_by_ip_address`` stores documents.

    For each sample stream URL the network location is resolved to its IP
    addresses (this performs a real DNS lookup); every address is added
    through ``add_to_database_by_ip_address`` and read back with
    ``document_from_ip_address``.

    The streamer is deleted in a ``finally`` clause so that a failed
    assertion or lookup does not leave the test collection behind.
    """
    streamer = Streamer('test_add_to_database_by_ip_address')
    host = 'http://list-iptv.com'
    url1 = 'http://62.210.245.19:8000/live/testapp/testapp/2.ts'
    url2 = 'http://clientportalpro.com:2500/live/VE5DWv4Ait/7KHLqRRZ9E/2160.ts'
    url3 = 'http://ndasat.pro:8000/live/exch/exch/1227.ts'
    try:
        for url in [url1, url2, url3]:
            netloc = um.prepare_netloc(url)
            # Index 2 of gethostbyname_ex's result is the list of IP addresses.
            ip_addresses = socket.gethostbyname_ex(um.remove_schema(netloc))[2]
            for ip_address in ip_addresses:
                streamer.add_to_database_by_ip_address(ip_address, netloc,
                                                       host, None)
                doc = streamer.document_from_ip_address(ip_address)
                assert doc['ip_address'] == ip_address
                for entry in doc['network_locations']:
                    # NOTE(review): this compares each stored entry with the
                    # bare netloc string; if entries are mappings with a
                    # 'network_location' key this can never hold -- confirm
                    # against add_to_database_by_ip_address.
                    assert entry == netloc
    finally:
        # Always remove the test data, even when an assertion above fails.
        streamer.delete()
def test_prepare_netloc():
    """Check the values ``um.prepare_netloc`` returns for sample URLs.

    The expectations below show that the port, the path, the query string
    and a leading ``www.`` are absent from the result while the scheme and
    the remaining host are kept.
    """
    raw_urls = (
        'http://s4.bossna-caffe.com:80/hls/1200.m3u8?channelId=1200&deviceMac=00:1A:79:3A:2D:B9&uid=35640',
        'http://www.s4.bossna-caffe.com:80/hls/1200.m3u8?channelId=1200&deviceMac=00:1A:79:3A:2D:B9&uid=35640/',
        'http://192.68.132.1:800',
        'http://192.68.132.1:400',
        'http://www.soledge7.dogannet.tv/S1/HLS_LIVE/tv2/1000/prog_index.m3u8',
        'http://soledge7.dogannet.tv/S1/HLS_LIVE/tv2/1000/prog_index.m3u8',
        'http://reddit.com',
    )
    # Normalise every URL up front, in order, before any assertion runs.
    (bossna_plain, bossna_www, ip_port_800, ip_port_400, dogan_www,
     dogan_plain, reddit) = [um.prepare_netloc(raw) for raw in raw_urls]

    assert bossna_plain == bossna_www == 'http://s4.bossna-caffe.com'
    assert ip_port_800 == ip_port_400 == 'http://192.68.132.1'
    assert dogan_www == dogan_plain == 'http://soledge7.dogannet.tv'
    assert reddit == 'http://reddit.com'
def add_to_streams(self, url, host, ext_title=None):
    """Evaluate a stream URL and record it against each of its IP addresses.

    The URL is reduced to its network location. While that location's
    working status is still unknown, its IP addresses are resolved (and
    cached in ``self.ip_addresses``), the stream is evaluated on attempts
    whose number appears in ``self.fibs``, and the outcome is stored through
    ``add_to_database_by_ip_address``. When the location is already known to
    work, only ``add_to_titles`` is called for each cached IP address.

    Args:
        url: stream URL to evaluate.
        host: URL of the page that linked to the stream.
        ext_title: optional title passed on to the database helpers.

    Raises:
        InvalidUrlError: if ``url`` or ``host`` fails
            ``requester.validate_url``.
    """
    if not requester.validate_url(url):
        raise InvalidUrlError('Cannot add an invalid url to streams: %s' % url)
    if not requester.validate_url(host):
        raise InvalidUrlError(
            'Cannot add a url to streams with an invalid host: %s' % host)
    netloc = um.prepare_netloc(url)
    if netloc not in self.broken_stream_links and netloc not in self.working_stream_links:
        # Network locations are only added to broken_stream_links or
        # working_stream_links once their working link status is known.
        # Visitor classes are unique to crawler classes, so each visitor
        # class only deals with one host. Once a stream has been added to
        # the database from a given visitor class with a known working link
        # status it no longer needs to be evaluated; doing so would lead to
        # redundant requests to the database.
        if netloc not in self.ip_addresses:  # no IP addresses cached for this network location yet
            try:
                ip_addresses = socket.gethostbyname_ex(
                    um.remove_schema(netloc)
                )[2]  # fetches all IP addresses of the network location
                self.ip_addresses[netloc] = ip_addresses
            except socket.gaierror:  # resolution failed: the network location is treated as down
                ip_addresses = None
        else:
            ip_addresses = self.ip_addresses[netloc]
        if ip_addresses:
            # Per-call cache so the stream is evaluated at most once even
            # when the network location has several IP addresses.
            stream_statuses = {}
            for ip_address in ip_addresses:
                playable_url = False
                if (ip_address, netloc) not in self.connection_attempts:
                    self.connection_attempts[(ip_address, netloc)] = 1
                # Only evaluate on attempts whose number is listed in
                # self.fibs (presumably Fibonacci numbers, acting as a
                # back-off schedule -- TODO confirm).
                if self.connection_attempts[(ip_address, netloc)] in self.fibs:
                    if url not in stream_statuses:
                        try:
                            stream_status = requester.evaluate_stream(url)
                        except StreamTimedOutError:
                            stream_statuses[url] = working_link = False
                        else:
                            if stream_status:
                                stream_statuses[url] = working_link = True
                                r = requester.make_request(url)
                                if r and r.ok:
                                    playable_url = True
                            elif self.connection_attempts[(
                                    ip_address, netloc)] == self.fibs[-1]:
                                # Last scheduled attempt and still not
                                # working: mark the link as broken.
                                stream_statuses[url] = working_link = False
                            else:
                                # Status stays unknown until a later attempt.
                                stream_statuses[url] = working_link = None
                    else:
                        # NOTE(review): playable_url stays False on this
                        # path, so only the first IP address of a call can
                        # receive the playable url -- confirm this is
                        # intended.
                        working_link = stream_statuses[url]
                    if not playable_url:
                        self.add_to_database_by_ip_address(
                            ip_address, netloc, host, working_link, ext_title)
                    else:
                        self.add_to_database_by_ip_address(
                            ip_address, netloc, host, working_link, ext_title,
                            url)
                    if working_link:
                        self.working_stream_links.add(netloc)
                    elif working_link is False:
                        self.broken_stream_links.add(netloc)
                # Count this attempt whether or not it was evaluated.
                self.connection_attempts[(ip_address, netloc)] += 1
    elif netloc in self.working_stream_links:
        # Known working location: only record the title for each cached IP.
        ip_addresses = self.ip_addresses[netloc]
        for ip_address in ip_addresses:
            self.add_to_titles(ip_address, ext_title)