def parse_proxyList(self):
    curr_proxy_list = []
    try:
        # Parse all proxy pages -> format: /list/{num}.htm
        # Get the pageRange from the 'pagination' table
        page_set = self.get_pagination_set()
        logger.debug("Pages: {}".format(page_set))
        for page in page_set:
            response = requests.get("{0}{1}".format(self.get_url(), page), timeout=self.timeout)
            if not response.ok:
                # Could not parse ANY page - Let user know
                if not curr_proxy_list:
                    logger.warning("Proxy Provider url failed: {}".format(self.get_url()))
                # Return proxies parsed so far
                return curr_proxy_list
            content = response.content
            soup = BeautifulSoup(content, "html.parser")
            # css provides the port number so we reverse it
            # for href in soup.findAll('link'):
            #     if '/styles/' in href.get('href'):
            #         style = "http://www.samair.ru" + href.get('href')
            #         break
            # css = requests.get(style).content.split('\n')
            # css.pop()
            # ports = {}
            # for l in css:
            #     p = l.split(' ')
            #     key = p[0].split(':')[0][1:]
            #     value = p[1].split('\"')[1]
            #     ports[key] = value
            table = soup.find("div", attrs={"id": "proxylist"})
            # The first tr contains the field names.
            headings = [th.get_text() for th in table.find("tr").find_all("th")]
            for row in table.find_all("tr")[1:]:
                td_row = row.find("td")
                # curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])
                proxy_obj = self.create_proxy_object(row)
                # Make sure it is a Valid Proxy Address
                if proxy_obj is not None and UrlParser.valid_ip_port(td_row.text):
                    curr_proxy_list.append(proxy_obj)
                else:
                    logger.debug("Proxy Invalid: {}".format(td_row.text))
    except AttributeError as e:
        logger.error("Provider {0} failed with Attribute error: {1}".format(self.id, e))
    except KeyError as e:
        logger.error("Provider {0} failed with Key error: {1}".format(self.id, e))
    except Exception as e:
        logger.error("Provider {0} failed with Unknown error: {1}".format(self.id, e))
    finally:
        return curr_proxy_list
def create_proxy_object(self, address, country, anonymity):
    # Make sure it is a Valid IP
    ip = address.strip().split(":")[0]
    if not UrlParser.valid_ip(ip):
        logger.debug("IP with Invalid format: {}".format(ip))
        return None
    port = address.strip().split(":")[1]
    country = country.strip()
    anonymity = AnonymityLevel.get(anonymity.strip())

    return ProxyObject(source=self.id, ip=ip, port=port, anonymity_level=anonymity, country=country)
def create_proxy_object(self, row):
    for td_row in row.findAll("td"):
        if td_row.attrs['data-label'] == 'IP:port ':
            text = td_row.text.strip()
            ip = text.split(":")[0]
            # Make sure it is a Valid IP
            if not UrlParser.valid_ip(ip):
                logger.debug("IP with Invalid format: {}".format(ip))
                return None
            port = text.split(":")[1]
        elif td_row.attrs['data-label'] == 'Anonymity Type: ':
            anonymity = AnonymityLevel.get(td_row.text.strip())
        elif td_row.attrs['data-label'] == 'Country: ':
            country = td_row.text.strip()

    return ProxyObject(source=self.id, ip=ip, port=port, anonymity_level=anonymity, country=country)
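# A minimal, self-contained sketch of the data-label lookup used in
# create_proxy_object above. The markup below is made up for illustration; the
# real page's labels carry trailing spaces (e.g. 'IP:port '), which is why the
# comparisons above keep them verbatim.
from bs4 import BeautifulSoup

SAMPLE_ROW = """
<table><tr>
  <td data-label="IP:port ">10.0.0.1:8080</td>
  <td data-label="Anonymity Type: ">high-anonymous</td>
  <td data-label="Country: ">Example</td>
</tr></table>
"""

def demo_row_to_fields(html=SAMPLE_ROW):
    row = BeautifulSoup(html, "html.parser").find("tr")
    fields = {}
    for td in row.findAll("td"):
        # Each cell names its column in the data-label attribute
        fields[td.attrs['data-label'].strip().rstrip(':')] = td.text.strip()
    return fields

# demo_row_to_fields() ->
# {'IP:port': '10.0.0.1:8080', 'Anonymity Type': 'high-anonymous', 'Country': 'Example'}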
def create_proxy_object(self, dataset):
    # Check Field[0] for tags and field[1] for values!
    ip = ""
    port = None
    anonymity = AnonymityLevel.UNKNOWN
    country = None

    for field in dataset:
        if field[0] == 'IP Address':
            ip = field[1].strip()  # String strip()
            # Make sure it is a Valid IP
            if not UrlParser.valid_ip(ip):
                logger.debug("IP with Invalid format: {}".format(ip))
                return None
        elif field[0] == 'Port':
            port = field[1].strip()  # String strip()
        elif field[0] == 'Anonymity':
            anonymity = AnonymityLevel.get(field[1].strip())  # String strip()
        elif field[0] == 'Country':
            country = field[1].strip()  # String strip()

    return ProxyObject(source=self.id, ip=ip, port=port, anonymity_level=anonymity, country=country)
def parse_proxyList(self):
    curr_proxy_list = []
    try:
        response = requests.get(self.get_url(), timeout=self.timeout)
        if not response.ok:
            logger.warning("Proxy Provider url failed: {}".format(self.get_url()))
            return []
        content = response.text.split('\n')
        for i, line in enumerate(content):
            # Skip the header lines at the top of the list
            if i > 3:
                if not line:
                    break
                proxy_obj = self.create_proxy_object(line.split())
                # Avoid straggler proxies and make sure it is a Valid Proxy Address
                if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
                    curr_proxy_list.append(proxy_obj)
                else:
                    logger.debug("Proxy Invalid: {}".format(line))
    except AttributeError as e:
        logger.error("Provider {0} failed with Attribute error: {1}".format(self.id, e))
    except KeyError as e:
        logger.error("Provider {0} failed with Key error: {1}".format(self.id, e))
    except Exception as e:
        logger.error("Provider {0} failed with Unknown error: {1}".format(self.id, e))
    finally:
        return curr_proxy_list
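# A minimal sketch of the plain-text format handled by parse_proxyList above,
# assuming a few header lines followed by whitespace-separated records whose
# first field is "ip:port". The sample text and the helper name are
# illustrative only.
SAMPLE_TEXT = """Proxy list
Updated daily
IP:Port Country Type
---------------------
10.0.0.1:3128 EX anonymous
10.0.0.2:8080 EX elite
"""

def demo_parse_text_list(text=SAMPLE_TEXT):
    records = []
    for i, line in enumerate(text.split('\n')):
        if i > 3:              # mirror the "if i > 3" header skip above
            if not line:       # stop at the first blank line
                break
            records.append(line.split())  # e.g. ['10.0.0.1:3128', 'EX', 'anonymous']
    return records

# demo_parse_text_list() ->
# [['10.0.0.1:3128', 'EX', 'anonymous'], ['10.0.0.2:8080', 'EX', 'elite']]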
def parse_proxyList(self):
    curr_proxy_list = []
    try:
        response = requests.get(self.get_url(), timeout=self.timeout)
        if not response.ok:
            logger.warning("Proxy Provider url failed: {}".format(self.get_url()))
            return []
        content = response.content
        soup = BeautifulSoup(content, "html.parser")
        table = soup.find("table", attrs={"id": "proxylisttable"})

        # The first tr contains the field names.
        headings = [th.get_text() for th in table.find("tr").find_all("th")]

        datasets = []
        for row in table.find_all("tr")[1:]:
            # Materialise the pairs as a list so they can be truth-tested,
            # consumed by create_proxy_object and still logged below
            # (a bare zip iterator is exhausted after one pass on Python 3).
            dataset = list(zip(headings, (td.get_text() for td in row.find_all("td"))))
            if dataset:
                datasets.append(dataset)

        for dataset in datasets:
            proxy_obj = self.create_proxy_object(dataset)
            # Make sure it is a Valid Proxy Address
            if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
                curr_proxy_list.append(proxy_obj)
            else:
                logger.debug("Proxy Invalid: {}".format(dataset))
    except AttributeError as e:
        logger.error("Provider {0} failed with Attribute error: {1}".format(self.id, e))
    except KeyError as e:
        logger.error("Provider {0} failed with Key error: {1}".format(self.id, e))
    except Exception as e:
        logger.error("Provider {0} failed with Unknown error: {1}".format(self.id, e))
    finally:
        return curr_proxy_list
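# A minimal, self-contained sketch of the heading/zip technique used in
# parse_proxyList above, showing the shape of the (heading, value) pairs that
# create_proxy_object(dataset) receives. The HTML snippet is illustrative only,
# not the provider's real markup.
from bs4 import BeautifulSoup

SAMPLE_TABLE = """
<table id="proxylisttable">
  <tr><th>IP Address</th><th>Port</th><th>Anonymity</th><th>Country</th></tr>
  <tr><td>10.0.0.1</td><td>8080</td><td>elite proxy</td><td>Example</td></tr>
</table>
"""

def demo_table_to_datasets(html=SAMPLE_TABLE):
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", attrs={"id": "proxylisttable"})
    # The first tr holds the column names
    headings = [th.get_text() for th in table.find("tr").find_all("th")]
    datasets = []
    for row in table.find_all("tr")[1:]:
        # Pair every cell with its column name, e.g. ('IP Address', '10.0.0.1')
        datasets.append(list(zip(headings, (td.get_text() for td in row.find_all("td")))))
    return datasets

# demo_table_to_datasets() ->
# [[('IP Address', '10.0.0.1'), ('Port', '8080'), ('Anonymity', 'elite proxy'), ('Country', 'Example')]]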
def __init__(self, id, web_url, timeout=None):
    UrlParser.__init__(self, id=id, web_url=web_url, timeout=timeout)
def __init__(self, id, web_url, timeout=None):
    web_url += "/list/"
    UrlParser.__init__(self, id=id, web_url=web_url, timeout=timeout)
def parse_proxyList(self, use_top15k=False):
    curr_proxy_list = []
    try:
        response = requests.get(self.get_url() + "/" + self.top_proxy_path, timeout=self.timeout)
        if not response.ok:
            logger.warning("Proxy Provider url failed: {}".format(self.get_url()))
            return []
        content = response.content
        soup = BeautifulSoup(content, "html.parser")
        all_divs = soup.findAll("div", attrs={"class": "paragraph", 'style': "text-align:left;"})
        # address_table = soup.find("div", attrs={"class": "paragraph", 'style': "text-align:left;"})
        #     .find('font', attrs={'color': '#33a27f'})

        # Parse Top Proxy List page
        address_list = []
        country_list = []
        anonymity_list = []
        for div in all_divs:
            address_div = div.find('font', attrs={'color': '#33a27f'})
            if address_div is not None:
                for row in [x for x in address_div.contents if getattr(x, 'name', None) != 'br']:
                    address_list.append(str(row))
            curr_div = div.findAll('font', attrs={'size': '2'})
            if curr_div[0] is not None:
                row_data = []
                # font -> strong -> font
                title = curr_div[0].contents[0].contents[0].contents[0]
                for row in [x for x in curr_div[-1].contents if getattr(x, 'name', None) != 'br']:
                    row_data.append(str(row))
                if 'Country' in str(title):
                    country_list.extend(row_data)
                if 'Status' in str(title):
                    anonymity_list.extend(row_data)

        for address, country, anonymity in zip(address_list, country_list, anonymity_list):
            # Make sure it is a Valid Proxy Address
            proxy_obj = self.create_proxy_object(address, country, anonymity)
            if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
                curr_proxy_list.append(proxy_obj)
            else:
                logger.debug("Proxy Invalid: {}".format(address))

        # Usually these proxies are stale
        if use_top15k:
            # Parse 15k Nodes Text file (named *-all-*.txt)
            content = requests.get(self.get_url() + "/" + self.txt_proxy_path).content
            soup = BeautifulSoup(content, "html.parser")
            table = soup.find("div", attrs={"class": "wsite-multicol-table-wrap"})
            for link in table.findAll('a'):
                current_link = link.get('href')
                if current_link is not None and "all" in current_link:
                    self.txt_proxy_path = current_link
            more_content = requests.get(self.get_url() + self.txt_proxy_path).text
            for proxy_address in more_content.split():
                if UrlParser.valid_ip_port(proxy_address):
                    # The text file only lists addresses, so country and anonymity are unknown here
                    ip, port = proxy_address.split(":")
                    proxy_obj = ProxyObject(source=self.id, ip=ip, port=port,
                                            anonymity_level=AnonymityLevel.UNKNOWN, country=None)
                    curr_proxy_list.append(proxy_obj)
    except AttributeError as e:
        logger.error("Provider {0} failed with Attribute error: {1}".format(self.id, e))
    except KeyError as e:
        logger.error("Provider {0} failed with Key error: {1}".format(self.id, e))
    except Exception as e:
        logger.error("Provider {0} failed with Unknown error: {1}".format(self.id, e))
    finally:
        return curr_proxy_list
def __init__(self, id, web_url, timeout=None):
    self.top_proxy_path = "proxy-list.html"
    self.txt_proxy_path = "txt-lists.html"
    UrlParser.__init__(self, id=id, web_url=web_url, timeout=timeout)
def __init__(self, id, web_url, bandwidth=None, timeout=None):
    UrlParser.__init__(self, id=id, web_url=web_url, bandwidth_KBs=bandwidth, timeout=timeout)