def parse_proxyList(self):
    curr_proxy_list = []
    try:
        # Parse all proxy pages -> format: /list/{num}.htm
        # Get the pageRange from the 'pagination' table
        page_set = self.get_pagination_set()
        logger.debug("Pages: {}".format(page_set))
        for page in page_set:
            response = requests.get("{0}{1}".format(self.get_url(), page), timeout=self.timeout)
            if not response.ok:
                # Could not parse ANY page - Let user know
                if not curr_proxy_list:
                    logger.warning("Proxy Provider url failed: {}".format(self.get_url()))
                # Return proxies parsed so far
                return curr_proxy_list
            content = response.content
            soup = BeautifulSoup(content, "html.parser")
            # css provides the port number so we reverse it
            # for href in soup.findAll('link'):
            #     if '/styles/' in href.get('href'):
            #         style = "http://www.samair.ru" + href.get('href')
            #         break
            # css = requests.get(style).content.split('\n')
            # css.pop()
            # ports = {}
            # for l in css:
            #     p = l.split(' ')
            #     key = p[0].split(':')[0][1:]
            #     value = p[1].split('\"')[1]
            #     ports[key] = value
            table = soup.find("div", attrs={"id": "proxylist"})
            # The first tr contains the field names.
            headings = [th.get_text() for th in table.find("tr").find_all("th")]
            for row in table.find_all("tr")[1:]:
                td_row = row.find("td")
                # curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])
                proxy_obj = self.create_proxy_object(row)
                # Make sure it is a Valid Proxy Address
                if proxy_obj is not None and UrlParser.valid_ip_port(td_row.text):
                    curr_proxy_list.append(proxy_obj)
                else:
                    logger.debug("Proxy Invalid: {}".format(td_row.text))
    except AttributeError as e:
        logger.error("Provider {0} failed with Attribute error: {1}".format(self.id, e))
    except KeyError as e:
        logger.error("Provider {0} failed with Key error: {1}".format(self.id, e))
    except Exception as e:
        logger.error("Provider {0} failed with Unknown error: {1}".format(self.id, e))
    finally:
        return curr_proxy_list
def create_proxy_object(self, address, country, anonymity):
    # Make sure it is a Valid IP
    ip = address.strip().split(":")[0]
    if not UrlParser.valid_ip(ip):
        logger.debug("IP with Invalid format: {}".format(ip))
        return None
    port = address.strip().split(":")[1]
    country = country.strip()
    anonymity = AnonymityLevel.get(anonymity.strip())

    return ProxyObject(source=self.id, ip=ip, port=port, anonymity_level=anonymity, country=country)
def create_proxy_object(self, row):
    for td_row in row.findAll("td"):
        if td_row.attrs['data-label'] == 'IP:port ':
            text = td_row.text.strip()
            ip = text.split(":")[0]
            # Make sure it is a Valid IP
            if not UrlParser.valid_ip(ip):
                logger.debug("IP with Invalid format: {}".format(ip))
                return None
            port = text.split(":")[1]
        elif td_row.attrs['data-label'] == 'Anonymity Type: ':
            anonymity = AnonymityLevel.get(td_row.text.strip())
        elif td_row.attrs['data-label'] == 'Country: ':
            country = td_row.text.strip()

    return ProxyObject(source=self.id, ip=ip, port=port, anonymity_level=anonymity, country=country)
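# A minimal, self-contained sketch of the data-label lookup used in
# create_proxy_object above. The markup below is made up for illustration; the
# real page's labels carry trailing spaces (e.g. 'IP:port '), which is why the
# comparisons above keep them verbatim.
from bs4 import BeautifulSoup

SAMPLE_ROW = """
<table><tr>
  <td data-label="IP:port ">10.0.0.1:8080</td>
  <td data-label="Anonymity Type: ">high-anonymous</td>
  <td data-label="Country: ">Example</td>
</tr></table>
"""

def demo_row_to_fields(html=SAMPLE_ROW):
    row = BeautifulSoup(html, "html.parser").find("tr")
    fields = {}
    for td in row.findAll("td"):
        # Each cell names its column in the data-label attribute
        fields[td.attrs['data-label'].strip().rstrip(':')] = td.text.strip()
    return fields

# demo_row_to_fields() ->
# {'IP:port': '10.0.0.1:8080', 'Anonymity Type': 'high-anonymous', 'Country': 'Example'}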
def create_proxy_object(self, dataset):
    # Check Field[0] for tags and field[1] for values!
    ip = ""
    port = None
    anonymity = AnonymityLevel.UNKNOWN
    country = None

    for field in dataset:
        if field[0] == 'IP Address':
            ip = field[1].strip()  # String strip()
            # Make sure it is a Valid IP
            if not UrlParser.valid_ip(ip):
                logger.debug("IP with Invalid format: {}".format(ip))
                return None
        elif field[0] == 'Port':
            port = field[1].strip()  # String strip()
        elif field[0] == 'Anonymity':
            anonymity = AnonymityLevel.get(field[1].strip())  # String strip()
        elif field[0] == 'Country':
            country = field[1].strip()  # String strip()

    return ProxyObject(source=self.id, ip=ip, port=port, anonymity_level=anonymity, country=country)
def parse_proxyList(self):
    curr_proxy_list = []
    try:
        response = requests.get(self.get_url(), timeout=self.timeout)
        if not response.ok:
            logger.warning("Proxy Provider url failed: {}".format(self.get_url()))
            return []
        content = response.text.split('\n')
        for i, line in enumerate(content):
            # Skip the header lines at the top of the list
            if i > 3:
                if not line:
                    break
                proxy_obj = self.create_proxy_object(line.split())
                # Avoid straggler proxies and make sure it is a Valid Proxy Address
                if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
                    curr_proxy_list.append(proxy_obj)
                else:
                    logger.debug("Proxy Invalid: {}".format(line))
    except AttributeError as e:
        logger.error("Provider {0} failed with Attribute error: {1}".format(self.id, e))
    except KeyError as e:
        logger.error("Provider {0} failed with Key error: {1}".format(self.id, e))
    except Exception as e:
        logger.error("Provider {0} failed with Unknown error: {1}".format(self.id, e))
    finally:
        return curr_proxy_list
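# A minimal sketch of the plain-text format handled by parse_proxyList above,
# assuming a few header lines followed by whitespace-separated records whose
# first field is "ip:port". The sample text and the helper name are
# illustrative only.
SAMPLE_TEXT = """Proxy list
Updated daily
IP:Port Country Type
---------------------
10.0.0.1:3128 EX anonymous
10.0.0.2:8080 EX elite
"""

def demo_parse_text_list(text=SAMPLE_TEXT):
    records = []
    for i, line in enumerate(text.split('\n')):
        if i > 3:              # mirror the "if i > 3" header skip above
            if not line:       # stop at the first blank line
                break
            records.append(line.split())  # e.g. ['10.0.0.1:3128', 'EX', 'anonymous']
    return records

# demo_parse_text_list() ->
# [['10.0.0.1:3128', 'EX', 'anonymous'], ['10.0.0.2:8080', 'EX', 'elite']]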
def parse_proxyList(self):
    curr_proxy_list = []
    try:
        response = requests.get(self.get_url(), timeout=self.timeout)
        if not response.ok:
            logger.warning("Proxy Provider url failed: {}".format(self.get_url()))
            return []
        content = response.content
        soup = BeautifulSoup(content, "html.parser")
        table = soup.find("table", attrs={"id": "proxylisttable"})

        # The first tr contains the field names.
        headings = [th.get_text() for th in table.find("tr").find_all("th")]

        datasets = []
        for row in table.find_all("tr")[1:]:
            # Materialise the pairs as a list so they can be truth-tested,
            # consumed by create_proxy_object and still logged below
            # (a bare zip iterator is exhausted after one pass on Python 3).
            dataset = list(zip(headings, (td.get_text() for td in row.find_all("td"))))
            if dataset:
                datasets.append(dataset)

        for dataset in datasets:
            proxy_obj = self.create_proxy_object(dataset)
            # Make sure it is a Valid Proxy Address
            if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
                curr_proxy_list.append(proxy_obj)
            else:
                logger.debug("Proxy Invalid: {}".format(dataset))
    except AttributeError as e:
        logger.error("Provider {0} failed with Attribute error: {1}".format(self.id, e))
    except KeyError as e:
        logger.error("Provider {0} failed with Key error: {1}".format(self.id, e))
    except Exception as e:
        logger.error("Provider {0} failed with Unknown error: {1}".format(self.id, e))
    finally:
        return curr_proxy_list
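# A minimal, self-contained sketch of the heading/zip technique used in
# parse_proxyList above, showing the shape of the (heading, value) pairs that
# create_proxy_object(dataset) receives. The HTML snippet is illustrative only,
# not the provider's real markup.
from bs4 import BeautifulSoup

SAMPLE_TABLE = """
<table id="proxylisttable">
  <tr><th>IP Address</th><th>Port</th><th>Anonymity</th><th>Country</th></tr>
  <tr><td>10.0.0.1</td><td>8080</td><td>elite proxy</td><td>Example</td></tr>
</table>
"""

def demo_table_to_datasets(html=SAMPLE_TABLE):
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", attrs={"id": "proxylisttable"})
    # The first tr holds the column names
    headings = [th.get_text() for th in table.find("tr").find_all("th")]
    datasets = []
    for row in table.find_all("tr")[1:]:
        # Pair every cell with its column name, e.g. ('IP Address', '10.0.0.1')
        datasets.append(list(zip(headings, (td.get_text() for td in row.find_all("td")))))
    return datasets

# demo_table_to_datasets() ->
# [[('IP Address', '10.0.0.1'), ('Port', '8080'), ('Anonymity', 'elite proxy'), ('Country', 'Example')]]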
def __init__(self, id, web_url, timeout=None):
    UrlParser.__init__(self, id=id, web_url=web_url, timeout=timeout)
def __init__(self, id, web_url, timeout=None):
    web_url += "/list/"
    UrlParser.__init__(self, id=id, web_url=web_url, timeout=timeout)
def parse_proxyList(self, use_top15k=False):
    curr_proxy_list = []
    try:
        response = requests.get(self.get_url() + "/" + self.top_proxy_path, timeout=self.timeout)
        if not response.ok:
            logger.warning("Proxy Provider url failed: {}".format(self.get_url()))
            return []
        content = response.content
        soup = BeautifulSoup(content, "html.parser")
        all_divs = soup.findAll("div", attrs={"class": "paragraph", 'style': "text-align:left;"})
        # address_table = soup.find("div", attrs={"class": "paragraph", 'style': "text-align:left;"})
        #     .find('font', attrs={'color': '#33a27f'})

        # Parse Top Proxy List page
        address_list = []
        country_list = []
        anonymity_list = []
        for div in all_divs:
            address_div = div.find('font', attrs={'color': '#33a27f'})
            if address_div is not None:
                for row in [x for x in address_div.contents if getattr(x, 'name', None) != 'br']:
                    address_list.append(str(row))
            curr_div = div.findAll('font', attrs={'size': '2'})
            if curr_div[0] is not None:
                row_data = []
                # font -> strong -> font
                title = curr_div[0].contents[0].contents[0].contents[0]
                for row in [x for x in curr_div[-1].contents if getattr(x, 'name', None) != 'br']:
                    row_data.append(str(row))
                if 'Country' in str(title):
                    country_list.extend(row_data)
                if 'Status' in str(title):
                    anonymity_list.extend(row_data)

        for address, country, anonymity in zip(address_list, country_list, anonymity_list):
            # Make sure it is a Valid Proxy Address
            proxy_obj = self.create_proxy_object(address, country, anonymity)
            if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
                curr_proxy_list.append(proxy_obj)
            else:
                logger.debug("Proxy Invalid: {}".format(address))

        # Usually these proxies are stale
        if use_top15k:
            # Parse 15k Nodes Text file (named *-all-*.txt)
            content = requests.get(self.get_url() + "/" + self.txt_proxy_path).content
            soup = BeautifulSoup(content, "html.parser")
            table = soup.find("div", attrs={"class": "wsite-multicol-table-wrap"})
            for link in table.findAll('a'):
                current_link = link.get('href')
                if current_link is not None and "all" in current_link:
                    self.txt_proxy_path = current_link
            more_content = requests.get(self.get_url() + self.txt_proxy_path).text
            for proxy_address in more_content.split():
                if UrlParser.valid_ip_port(proxy_address):
                    # The text file only lists addresses, so country and anonymity are unknown here
                    ip, port = proxy_address.split(":")
                    proxy_obj = ProxyObject(source=self.id, ip=ip, port=port,
                                            anonymity_level=AnonymityLevel.UNKNOWN, country=None)
                    curr_proxy_list.append(proxy_obj)
    except AttributeError as e:
        logger.error("Provider {0} failed with Attribute error: {1}".format(self.id, e))
    except KeyError as e:
        logger.error("Provider {0} failed with Key error: {1}".format(self.id, e))
    except Exception as e:
        logger.error("Provider {0} failed with Unknown error: {1}".format(self.id, e))
    finally:
        return curr_proxy_list
def __init__(self, id, web_url, timeout=None):
    self.top_proxy_path = "proxy-list.html"
    self.txt_proxy_path = "txt-lists.html"
    UrlParser.__init__(self, id=id, web_url=web_url, timeout=timeout)
def __init__(self, id, web_url, bandwidth=None, timeout=None):
    UrlParser.__init__(self, id=id, web_url=web_url, bandwidth_KBs=bandwidth, timeout=timeout)