def parse_proxyList(self):
    curr_proxy_list = []
    try:
        # Parse all proxy pages -> format: /list/{num}.htm
        # Get the pageRange from the 'pagination' table
        page_set = self.get_pagination_set()
        logger.debug("Pages: {}".format(page_set))
        # One JS unpacker per provider (not per page)
        self.js_unpacker = self.init_js_unpacker()
        for page in page_set:
            response = requests.get("{0}{1}".format(self.get_url(), page),
                                    timeout=self.timeout)
            if not response.ok:
                # Could not parse ANY page - Let user know
                if not curr_proxy_list:
                    logger.warning("Proxy Provider url failed: {}".format(self.get_url()))
                # Return proxies parsed so far
                return curr_proxy_list
            content = response.content
            soup = BeautifulSoup(content, "html.parser", from_encoding="iso-8859-1")
            table = soup.find("div", attrs={"id": "proxylist"})
            # The first tr contains the field names.
            headings = [th.get_text() for th in table.find("tr").find_all("th")]
            # Skip the last 'Select All' row
            for row in table.find_all("tr")[1:-1]:
                td_row = row.find("td")
                portKey = td_row.find('span', attrs={'class': True}).get('class')[0]
                port = self.js_unpacker.get_port(portKey)
                proxy_obj = self.create_proxy_object(row, port)
                # Make sure it is a Valid Proxy Address
                if proxy_obj is not None and UrlParser.valid_ip(proxy_obj.ip) \
                        and UrlParser.valid_port(port):
                    curr_proxy_list.append(proxy_obj)
                else:
                    # Guard against proxy_obj being None when logging the reject
                    logger.debug("Proxy Invalid: {}".format(
                        proxy_obj.to_str() if proxy_obj else row.text.strip()))
    except AttributeError as e:
        logger.error("Provider {0} failed with Attribute error: {1}".format(self.id, e))
    except KeyError as e:
        logger.error("Provider {0} failed with Key error: {1}".format(self.id, e))
    except Exception as e:
        logger.error("Provider {0} failed with Unknown error: {1}".format(self.id, e))
    finally:
        return curr_proxy_list
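The port lookup above goes through a JS unpacker that init_js_unpacker() builds from the provider's packed JavaScript; that class is not shown here. As a rough, hypothetical stand-in for the interface the loop relies on (a CSS-class key mapped to a port string):

# Hypothetical stand-in for the JS unpacker used above; the real class derives
# its key -> port mapping from the provider's obfuscated JavaScript.
class StubJsUnpacker(object):
    def __init__(self, key_to_port):
        # e.g. {"xf4": "8080"} - keys are the span CSS classes, values are port strings
        self.key_to_port = key_to_port

    def get_port(self, port_key):
        # Behaves like a plain lookup table; unknown keys yield None
        return self.key_to_port.get(port_key)

# Usage sketch:
unpacker = StubJsUnpacker({"xf4": "8080"})
assert unpacker.get_port("xf4") == "8080"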
class TestBaseProxyParsers(unittest.TestCase):
    def setUp(self):
        self.normal_parser = UrlParser("proxy-test", "http://proxy-test.com",
                                       bandwidth_KBs=50)
        self.no_bdwidthParser = UrlParser("slow-proxy", "http://slow-proxy.com")

    def test_normal_parser(self):
        self.assertEqual(self.normal_parser.get_url(), "http://proxy-test.com",
                         "incorrect parser URL")
        self.assertEqual(self.normal_parser.get_min_bandwidth(), 50,
                         "incorrect parser bandwidth")

    def test_no_bandwidth_parser(self):
        self.assertEqual(self.no_bdwidthParser.get_url(), "http://slow-proxy.com",
                         "incorrect parser URL")
        self.assertEqual(self.no_bdwidthParser.get_min_bandwidth(), 150,
                         "incorrect parser bandwidth")
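If this test module is executed directly, the usual stdlib entry point (assumed here, since the module's footer is not shown) runs both cases:

if __name__ == '__main__':
    # Let the standard unittest runner pick up the cases above
    unittest.main()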
def parse_proxyList(self):
    curr_proxy_list = []
    try:
        response = requests.get(self.get_URl(), timeout=self.timeout)
        if not response.ok:
            logger.warn("Proxy Provider url failed: {}".format(self.get_URl()))
            return []
        content = response.content
        soup = BeautifulSoup(content, "html.parser")
        table = soup.find("table", attrs={"class": "proxy_list"})
        # The first tr contains the field names.
        headings = [th.get_text() for th in table.find("tr").find_all("th")]
        datasets = []
        for row in table.find_all("tr")[1:]:
            dataset = zip(headings, (td.get_text() for td in row.find_all("td")))
            datasets.append(dataset)
        for dataset in datasets:
            # Check Field[0] for tags and field[1] for values!
            address = ""
            proxy_straggler = False
            for field in dataset:
                # Discard slow proxies! Speed is in KB/s
                if field[0] == 'Speed':
                    if float(field[1]) < self.get_min_bandwidth():
                        proxy_straggler = True
                if field[0] == 'IP':
                    # Make sure it is a Valid IP
                    if not UrlParser.valid_ip(field[1]):
                        logger.debug("IP with Invalid format: {}".format(field[1]))
                        break
                    else:
                        address += field[1] + ':'
                elif field[0] == 'Port':
                    address += field[1]
                # print "{0:<10}: {1}".format(field[0], field[1])
            # Avoid straggler proxies and make sure it is a Valid Proxy Address
            if not proxy_straggler and UrlParser.valid_ip_port(address):
                proxy = "http://" + address
                curr_proxy_list.append(proxy.__str__())
        # print "ALL: ", curr_proxy_list
    except:
        pass
    return curr_proxy_list
def parse_proxyList(self, use_top15k=False):
    curr_proxy_list = []
    try:
        response = requests.get(self.get_URl() + "/" + self.top_proxy_path,
                                timeout=self.timeout)
        if not response.ok:
            logger.warn("Proxy Provider url failed: {}".format(self.get_URl()))
            return []
        content = response.content
        soup = BeautifulSoup(content, "html.parser")
        table = soup.find("div", attrs={"class": "paragraph",
                                        'style': "text-align:left;"}).find(
            'font', attrs={'color': '#33a27f'})
        # Parse the Top Proxy List page
        for row in [x for x in table.contents if getattr(x, 'name', None) != 'br']:
            # Make sure it is a Valid Proxy Address
            if UrlParser.valid_ip_port(row):
                proxy = "http://" + row
                curr_proxy_list.append(proxy.__str__())
            else:
                logger.debug("Address with Invalid format: {}".format(row))
        # Usually these proxies are stale
        if use_top15k:
            # Parse the 15k Nodes text file (named *-all-*.txt)
            content = requests.get(self.get_URl() + "/" + self.txt_proxy_path).content
            soup = BeautifulSoup(content, "html.parser")
            table = soup.find("div", attrs={"class": "wsite-multicol-table-wrap"})
            for link in table.findAll('a'):
                current_link = link.get('href')
                if current_link is not None and "all" in current_link:
                    self.txt_proxy_path = current_link
            more_content = requests.get(self.get_URl() + self.txt_proxy_path).text
            for proxy_address in more_content.split():
                if UrlParser.valid_ip_port(proxy_address):
                    curr_proxy_list.append(proxy_address)
    except:
        pass
    return curr_proxy_list
def parse_proxyList(self):
    curr_proxy_list = []
    response = requests.get(self.get_URl(), timeout=self.timeout)
    if not response.ok:
        logger.warn("Proxy Provider url failed: {}".format(self.get_URl()))
        return []
    content = response.content
    soup = BeautifulSoup(content, "html.parser")
    table = soup.find("table", attrs={"class": "display fpltable"})
    if table is None:
        return curr_proxy_list
    # The first tr contains the field names.
    headings = [th.get_text() for th in table.find("tr").find_all("th")]
    datasets = []
    for row in table.find_all("tr")[1:]:
        dataset = zip(headings, (td.get_text() for td in row.find_all("td")))
        if dataset:
            datasets.append(dataset)
    for dataset in datasets:
        # Check Field[0] for tags and field[1] for values!
        address = ""
        for field in dataset:
            if field[0] == 'IP Address':
                # Make sure it is a Valid IP
                if not UrlParser.valid_ip(field[1]):
                    logger.debug("IP with Invalid format: {}".format(field[1]))
                    break
                else:
                    address += field[1] + ':'
            elif field[0] == 'Port':
                address += field[1]
        # Make sure it is a Valid Proxy Address
        if UrlParser.valid_ip_port(address):
            proxy = "http://" + address
            curr_proxy_list.append(proxy.__str__())
        else:
            logger.debug("Address with Invalid format: {}".format(address))
    # print "{0:<10}: {1}".format(field[0], field[1])
    # print "ALL: ", curr_proxy_list
    return curr_proxy_list
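These parsers all defer to UrlParser.valid_ip and UrlParser.valid_ip_port, whose implementations are not shown here. A minimal sketch of what such validators can look like, using the stdlib ipaddress module rather than whatever the project actually uses, is:

import ipaddress

def valid_ip(candidate):
    # Accept only well-formed IPv4 addresses such as "10.0.0.1"
    try:
        ipaddress.IPv4Address(candidate)
        return True
    except ValueError:
        return False

def valid_ip_port(candidate):
    # Accept "ip:port" strings whose port falls in the 1-65535 range
    ip, sep, port = candidate.partition(":")
    return bool(sep) and port.isdigit() and 0 < int(port) <= 65535 and valid_ip(ip)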
def create_proxy_object(self, dataset):
    ip = ""
    port = None
    anonymity = AnonymityLevel.UNKNOWN
    country = None
    # Check Field[0] for tags and field[1] for values!
    for field in dataset:
        # Discard slow proxies! Speed is in KB/s
        if field[0] == 'Speed':
            if float(field[1]) < self.get_min_bandwidth():
                logger.debug("Proxy with low bandwidth: {}".format(float(field[1])))
                return None
        if field[0] == 'IP':
            ip = field[1].strip()
            # Make sure it is a Valid IP
            if not UrlParser.valid_ip(ip):
                logger.debug("IP with Invalid format: {}".format(ip))
                return None
        elif field[0] == 'Port':
            port = field[1].strip()
        elif field[0] == 'Anon':
            anonymity = AnonymityLevel.get(field[1].strip())
        elif field[0] == 'Country':
            country = field[1].strip()
    return ProxyObject(source=self.id, ip=ip, port=port,
                       anonymity_level=anonymity, country=country)
def create_proxy_object(self, dataset):
    # Check Field[0] for tags and field[1] for values!
    ip = ""
    port = None
    anonymity = AnonymityLevel.UNKNOWN
    country = None
    for field in dataset:
        if field[0] == 'IP Address':
            ip = field[1].strip()
            # Make sure it is a Valid IP
            if not UrlParser.valid_ip(ip):
                logger.debug("IP with Invalid format: {}".format(ip))
                return None
        elif field[0] == 'Port':
            port = field[1].strip()
        elif field[0] == 'Anonymity':
            anonymity = AnonymityLevel.get(field[1].strip())
        elif field[0] == 'Country':
            country = field[1].strip()
    return ProxyObject(source=self.id, ip=ip, port=port,
                       anonymity_level=anonymity, country=country)
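Both create_proxy_object variants map the provider's anonymity label through AnonymityLevel.get. The project's real enumeration is not shown here; a minimal sketch of the idea (a scraped label in, an enum member out, falling back to UNKNOWN, with member names that are only assumptions) looks like:

from enum import Enum

class AnonymityLevel(Enum):
    # Member names are illustrative; the project's own enum may differ
    UNKNOWN = 0
    TRANSPARENT = 1
    ANONYMOUS = 2
    ELITE = 3

    @classmethod
    def get(cls, label):
        # Map a scraped label such as "elite proxy" to a level, defaulting to UNKNOWN
        label = (label or "").strip().lower()
        for member in cls:
            if member.name.lower() in label:
                return member
        return cls.UNKNOWN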
def parse_proxyList(self):
    curr_proxy_list = []
    try:
        # Parse all proxy pages -> format: /list/{num}.htm
        # Get the pageRange from the 'pagination' table
        page_set = self.get_pagination_set()
        logger.debug("Pages: {}".format(page_set))
        for page in page_set:
            response = requests.get("{0}{1}".format(self.get_url(), page),
                                    timeout=self.timeout)
            if not response.ok:
                # Could not parse ANY page - Let user know
                if not curr_proxy_list:
                    logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
                # Return proxies parsed so far
                return curr_proxy_list
            content = response.content
            soup = BeautifulSoup(content, "html.parser")
            # css provides the port number so we reverse it
            # for href in soup.findAll('link'):
            #     if '/styles/' in href.get('href'):
            #         style = "http://www.samair.ru" + href.get('href')
            #         break
            # css = requests.get(style).content.split('\n')
            # css.pop()
            # ports = {}
            # for l in css:
            #     p = l.split(' ')
            #     key = p[0].split(':')[0][1:]
            #     value = p[1].split('\"')[1]
            #     ports[key] = value
            table = soup.find("div", attrs={"id": "proxylist"})
            # The first tr contains the field names.
            headings = [th.get_text() for th in table.find("tr").find_all("th")]
            for row in table.find_all("tr")[1:]:
                td_row = row.find("td")
                # curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])
                proxy_obj = self.create_proxy_object(row)
                # Make sure it is a Valid Proxy Address
                if proxy_obj is not None and UrlParser.valid_ip_port(td_row.text):
                    curr_proxy_list.append(proxy_obj)
                else:
                    logger.debug("Proxy Invalid: {}".format(td_row.text))
    except AttributeError as e:
        logger.error("Provider {0} failed with Attribute error: {1}".format(self.id, e))
    except KeyError as e:
        logger.error("Provider {0} failed with Key error: {1}".format(self.id, e))
    except Exception as e:
        logger.error("Provider {0} failed with Unknown error: {1}".format(self.id, e))
    finally:
        return curr_proxy_list
def create_proxy_object(self, address, country, anonymity):
    # Make sure it is a Valid IP
    ip = address.strip().split(":")[0]
    if not UrlParser.valid_ip(ip):
        logger.debug("IP with Invalid format: {}".format(ip))
        return None
    port = address.strip().split(":")[1]
    country = country.strip()
    anonymity = AnonymityLevel.get(anonymity.strip())
    return ProxyObject(source=self.id, ip=ip, port=port,
                       anonymity_level=anonymity, country=country)
def create_proxy_object(self, row):
    for td_row in row.findAll("td"):
        if td_row.attrs['data-label'] == 'IP:port ':
            text = td_row.text.strip()
            ip = text.split(":")[0]
            # Make sure it is a Valid IP
            if not UrlParser.valid_ip(ip):
                logger.debug("IP with Invalid format: {}".format(ip))
                return None
            port = text.split(":")[1]
        elif td_row.attrs['data-label'] == 'Anonymity Type: ':
            anonymity = AnonymityLevel.get(td_row.text.strip())
        elif td_row.attrs['data-label'] == 'Country: ':
            country = td_row.text.strip()
    return ProxyObject(source=self.id, ip=ip, port=port,
                       anonymity_level=anonymity, country=country)
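Every create_proxy_object above returns a ProxyObject whose get_address() and to_str() methods the parse loops rely on. The class itself is defined elsewhere in the project; a minimal sketch consistent with how it is used in these snippets would be:

class ProxyObject(object):
    # Minimal sketch matching how the parsers use the class;
    # the project's real implementation may carry more fields and behaviour.
    def __init__(self, source, ip, port, anonymity_level, country=None):
        self.source = source
        self.ip = ip
        self.port = port
        self.anonymity_level = anonymity_level
        self.country = country

    def get_address(self):
        # "ip:port" form checked by UrlParser.valid_ip_port in the callers
        return "{0}:{1}".format(self.ip, self.port)

    def to_str(self):
        return "{0} ({1}, {2}) from {3}".format(
            self.get_address(), self.country, self.anonymity_level, self.source)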
def parse_proxyList(self):
    curr_proxy_list = []
    # Parse all proxy pages -> format: /list/{num}.htm
    # TODO: get the pageRange from the 'pagination' table
    for page in range(1, 21):
        response = requests.get("{0}{num:02d}.htm".format(self.get_URl(), num=page),
                                timeout=self.timeout)
        if not response.ok:
            # Could not parse ANY page - Let user know
            if not curr_proxy_list:
                logger.warn("Proxy Provider url failed: {}".format(self.get_URl()))
            # Return proxies parsed so far
            return curr_proxy_list
        content = response.content
        soup = BeautifulSoup(content, "html.parser")
        # css provides the port number so we reverse it
        # for href in soup.findAll('link'):
        #     if '/styles/' in href.get('href'):
        #         style = "http://www.samair.ru" + href.get('href')
        #         break
        # css = requests.get(style).content.split('\n')
        # css.pop()
        # ports = {}
        # for l in css:
        #     p = l.split(' ')
        #     key = p[0].split(':')[0][1:]
        #     value = p[1].split('\"')[1]
        #     ports[key] = value
        table = soup.find("div", attrs={"id": "proxylist"})
        # The first tr contains the field names.
        headings = [th.get_text() for th in table.find("tr").find_all("th")]
        for row in table.find_all("tr")[1:]:
            td_row = row.find("td")
            # curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])
            # Make sure it is a Valid Proxy Address
            if UrlParser.valid_ip_port(td_row.text):
                curr_proxy_list.append('http://' + td_row.text)
            else:
                logger.debug("Address with Invalid format: {}".format(td_row.text))
    return curr_proxy_list
def parse_proxyList(self):
    curr_proxy_list = []
    try:
        response = requests.get(self.get_url(), timeout=self.timeout)
        if not response.ok:
            logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
            return []
        content = response.content
        soup = BeautifulSoup(content, "html.parser")
        table = soup.find("table", attrs={"id": "proxylisttable"})
        # The first tr contains the field names.
        headings = [th.get_text() for th in table.find("tr").find_all("th")]
        datasets = []
        for row in table.find_all("tr")[1:]:
            dataset = zip(headings, (td.get_text() for td in row.find_all("td")))
            if dataset:
                datasets.append(dataset)
        for dataset in datasets:
            proxy_obj = self.create_proxy_object(dataset)
            # Make sure it is a Valid Proxy Address
            if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
                curr_proxy_list.append(proxy_obj)
            else:
                logger.debug("Proxy Invalid: {}".format(dataset))
    except AttributeError as e:
        logger.error("Provider {0} failed with Attribute error: {1}".format(self.id, e))
    except KeyError as e:
        logger.error("Provider {0} failed with Key error: {1}".format(self.id, e))
    except Exception as e:
        logger.error("Provider {0} failed with Unknown error: {1}".format(self.id, e))
    finally:
        return curr_proxy_list
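For reference, each dataset handed to create_proxy_object above is a sequence of (heading, value) pairs built by zipping the table headers with a row's cells; the values below are illustrative:

# Shape of one dataset consumed by create_proxy_object (values are made up):
dataset = [
    ("IP Address", "10.0.0.1"),
    ("Port", "8080"),
    ("Anonymity", "elite proxy"),
    ("Country", "Germany"),
]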
def parse_proxyList(self):
    curr_proxy_list = []
    try:
        response = requests.get(self.get_URl(), timeout=self.timeout)
        if not response.ok:
            logger.warn("Proxy Provider url failed: {}".format(self.get_URl()))
            return []
        content = response.content
        soup = BeautifulSoup(content, "html.parser")
        # css provides the port number so we reverse it
        # for href in soup.findAll('link'):
        #     if '/styles/' in href.get('href'):
        #         style = "http://www.samair.ru" + href.get('href')
        #         break
        # css = requests.get(style).content.split('\n')
        # css.pop()
        # ports = {}
        # for l in css:
        #     p = l.split(' ')
        #     key = p[0].split(':')[0][1:]
        #     value = p[1].split('\"')[1]
        #     ports[key] = value
        table = soup.find("table", attrs={"id": "proxylist"})
        # The first tr contains the field names.
        headings = [th.get_text() for th in table.find("tr").find_all("th")]
        for row in table.find_all("tr")[1:]:
            td_row = row.find("td")
            # curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])
            # Make sure it is a Valid Proxy Address
            if UrlParser.valid_ip_port(td_row.text):
                curr_proxy_list.append('http://' + td_row.text)
            else:
                logger.debug("Address with Invalid format: {}".format(td_row.text))
    except:
        pass
    return curr_proxy_list
def __init__(self, web_url, bandwidth=None, timeout=None):
    UrlParser.__init__(self, web_url, bandwidth, timeout)
def __init__(self, web_url, timeout=None):
    UrlParser.__init__(self, web_url, timeout)
def __init__(self, id, web_url, timeout=None):
    self.top_proxy_path = "proxy-list.html"
    self.txt_proxy_path = "txt-lists.html"
    UrlParser.__init__(self, id=id, web_url=web_url, timeout=timeout)
def parse_proxyList(self, use_top15k=False):
    curr_proxy_list = []
    try:
        response = requests.get(self.get_url() + "/" + self.top_proxy_path,
                                timeout=self.timeout)
        if not response.ok:
            logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
            return []
        content = response.content
        soup = BeautifulSoup(content, "html.parser")
        all_divs = soup.findAll("div", attrs={"class": "paragraph",
                                              'style': "text-align:left;"})
        # address_table = soup.find("div", attrs={"class": "paragraph", 'style': "text-align:left;"})
        #     .find('font', attrs={'color': '#33a27f'})
        # Parse the Top Proxy List page
        address_list = []
        country_list = []
        anonymity_list = []
        for div in all_divs:
            address_div = div.find('font', attrs={'color': '#33a27f'})
            if address_div is not None:
                for row in [x for x in address_div.contents
                            if getattr(x, 'name', None) != 'br']:
                    address_list.append(str(row))
            curr_div = div.findAll('font', attrs={'size': '2'})
            if curr_div[0] is not None:
                row_data = []
                # font -> strong -> font
                title = curr_div[0].contents[0].contents[0].contents[0]
                for row in [x for x in curr_div[-1].contents
                            if getattr(x, 'name', None) != 'br']:
                    row_data.append(str(row))
                if 'Country' in str(title):
                    country_list.extend(row_data)
                if 'Status' in str(title):
                    anonymity_list.extend(row_data)
        for address, country, anonymity in zip(address_list, country_list, anonymity_list):
            # Make sure it is a Valid Proxy Address
            proxy_obj = self.create_proxy_object(address, country, anonymity)
            if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
                curr_proxy_list.append(proxy_obj)
            else:
                logger.debug("Proxy Invalid: {}".format(address))
        # Usually these proxies are stale
        if use_top15k:
            # Parse the 15k Nodes text file (named *-all-*.txt)
            content = requests.get(self.get_url() + "/" + self.txt_proxy_path).content
            soup = BeautifulSoup(content, "html.parser")
            table = soup.find("div", attrs={"class": "wsite-multicol-table-wrap"})
            for link in table.findAll('a'):
                current_link = link.get('href')
                if current_link is not None and "all" in current_link:
                    self.txt_proxy_path = current_link
            more_content = requests.get(self.get_url() + self.txt_proxy_path).text
            for proxy_address in more_content.split():
                if UrlParser.valid_ip_port(proxy_address):
                    # The plain-text list only provides "ip:port";
                    # country and anonymity are not available here (placeholders)
                    proxy_obj = self.create_proxy_object(proxy_address, '', '')
                    curr_proxy_list.append(proxy_obj)
    except AttributeError as e:
        logger.error("Provider {0} failed with Attribute error: {1}".format(self.id, e))
    except KeyError as e:
        logger.error("Provider {0} failed with Key error: {1}".format(self.id, e))
    except Exception as e:
        logger.error("Provider {0} failed with Unknown error: {1}".format(self.id, e))
    finally:
        return curr_proxy_list
def setUp(self):
    self.normal_parser = UrlParser("proxy-test", "http://proxy-test.com",
                                   bandwidth_KBs=50)
    self.no_bdwidthParser = UrlParser("slow-proxy", "http://slow-proxy.com")
def __init__(self, id, web_url, timeout=None):
    self.base_url = web_url
    web_url += "/list/"
    # Ports decoded by the JS unpacker
    self.js_unpacker = None
    UrlParser.__init__(self, id=id, web_url=web_url, timeout=timeout)
def __init__(self, id, web_url, bandwidth=None, timeout=None):
    UrlParser.__init__(self, id=id, web_url=web_url,
                       bandwidth_KBs=bandwidth, timeout=timeout)
def __init__(self, web_url, timeout=None):
    web_url += "/list/"
    UrlParser.__init__(self, web_url, timeout)
def __init__(self, id, web_url, timeout=None):
    UrlParser.__init__(self, id=id, web_url=web_url, timeout=timeout)