def extract(self):
    for setting_name, setting in self.__setting.items():
        for base_url, selector, pattern in itertools.zip_longest(
                setting["base_url"], setting["selector"], setting["pattern"]):
            if base_url:
                # zip_longest pads the shorter lists with None; fall back to
                # the last configured pattern/selector so one entry can cover
                # every remaining base_url.
                if pattern is None:
                    pattern = setting["pattern"][-1]
                if selector is None:
                    selector = setting["selector"][-1]
                time.sleep(self.interval)
                self.__web.set_target(base_url)
                try:
                    html = self.__web.start_request()
                    dom = htmldom.HtmlDom().createDom(html)
                    table = dom.find(selector)
                    if table.len == 0:
                        raise SettingReader.ProxySettingError("invalid selector!", selector)
                    td = threading.Thread(target=self._find_proxy, args=(table, pattern))
                    td.start()
                    self.__tasks.append(td)
                    # self._find_proxy(table, pattern)
                except Exception as err:
                    logger.log(logger.BASIC, str(err))
    for task in self.__tasks:
        task.join()
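# --- Example: the per-site setting shape extract() iterates ---
# A minimal sketch; the key names match the code above, but the URLs,
# selector, and pattern are hypothetical placeholders, not the project's
# real settings.
import itertools

example_setting = {
    "base_url": ["https://example.com/proxies?page=1",
                 "https://example.com/proxies?page=2"],
    "selector": ["table.proxy-list tr"],  # one selector reused for all pages
    "pattern": [r"(\d{1,3}(?:\.\d{1,3}){3})\D+?(\d{2,5})"],
}

for base_url, selector, pattern in itertools.zip_longest(
        example_setting["base_url"], example_setting["selector"],
        example_setting["pattern"]):
    # zip_longest pads the shorter lists with None; extract() then falls
    # back to the last configured value, so a single selector/pattern
    # applies to every remaining base_url.
    if selector is None:
        selector = example_setting["selector"][-1]
    if pattern is None:
        pattern = example_setting["pattern"][-1]
    print(base_url, "->", selector, pattern)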
def __extract_baseurl_mode2(self, base_url):
    """
    :return: extracted base urls list.
    """
    urls = []
    for item_url, selector, pattern, attr in itertools.zip_longest(
            base_url["base_url"], base_url["selector"],
            base_url["pattern"], base_url["container_attr"]):
        if item_url:
            # Same fallback rule as extract(): reuse the last configured
            # value when the lists are shorter than base_url.
            if pattern is None:
                pattern = base_url["pattern"][-1]
            if selector is None:
                selector = base_url["selector"][-1]
            if attr is None:
                attr = base_url["container_attr"][-1]
            time.sleep(self.interval)
            self.__web.set_target(item_url)
            try:
                html = self.__web.start_request()
                dom = htmldom.HtmlDom().createDom(html)
                table = dom.find(selector)
                for i in base_url["sequence"]:
                    if attr:
                        # Take the link from the container's attribute.
                        urls.append(absolute_url(item_url, table[int(i)].attr(attr)))
                    else:
                        # No attribute configured: pull the link out of the
                        # raw HTML with the pattern's first capture group.
                        url_sm = re.search(pattern, table[int(i)].html())
                        urls.append(absolute_url(item_url, url_sm.group(1)))
            except Exception as err:
                logger.log(logger.BASIC, str(err))
    return urls
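# --- Example: a plausible absolute_url() helper ---
# absolute_url() is not defined in this section; a minimal implementation
# consistent with its use above resolves a scraped (possibly relative)
# link against the page it came from via urllib.parse.urljoin.
import urllib.parse

def absolute_url(page_url, link):
    # urljoin handles absolute links, root-relative paths, and
    # relative paths in one call.
    return urllib.parse.urljoin(page_url, link)

print(absolute_url("https://example.com/list/page1.html", "/detail/5.html"))
# -> https://example.com/detail/5.html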
def check(self):
    tmp = []
    try:
        for item in self.candidates:
            timeout = self.get_delay_time(item["ip"])
            if timeout:
                item["responsetime"] = float(timeout)
                tmp.append(item)
                logger.log(logger.DETAIL,
                           "got a candidate ip:{0} delay:{1}s".format(item["ip"], item["responsetime"]))
    except Exception as err:
        logger.log(logger.BASIC, str(err))
    # Keep only the candidates that answered within the delay check.
    self.candidates = tmp
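# --- Example: a stand-in for get_delay_time() ---
# get_delay_time() is not shown in this section; this sketch times a TCP
# connect to the candidate and returns None on failure, which check()
# treats as "drop the candidate". The port and timeout are assumptions.
import socket
import time

def get_delay_time(ip, port=80, timeout=3):
    start = time.monotonic()
    try:
        with socket.create_connection((ip, port), timeout=timeout):
            return time.monotonic() - start
    except OSError:
        return None  # unreachable candidates get filtered out by check()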
def get_candidate(self, host_index):
    wc = webclient.WebClient()
    test_conf = self._test_conf[host_index]
    host_url, key = self._host[host_index]
    wc.set_target(host_url, timeout=self.timeout)
    for item in self.key_list:
        response = wc.start_request(test_conf[0], test_conf[1], item)
        if response:
            result = self.parse_response(response, host_index)
            if result != {} and result not in self.candidates:
                self.candidates.append(result)
                logger.log(logger.DETAIL, "got an alpha ip:{0}".format(result["ip"]))
def hunt(self, host_index):
    """
    host_index must be self.super_ping or self.chinaz.
    :param host_index:
    :return: a list of checked ips.
    """
    wc = webclient.WebClient()
    host_url, key = self._host[host_index]
    wc.set_target(host_url, timeout=self.timeout)
    result = wc.start_request(key[1], data=key[0].format(self.target))
    try:
        dom = htmldom.HtmlDom().createDom(result)
        for item in dom.find("script"):
            if self.found_key_data(item.text(), host_index):
                self._parser[host_index](item.text())
    except Exception as err:
        logger.log(logger.BASIC, str(err))
    self.get_candidate(host_index)
    self.check()
    return self.candidates
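# --- Example: the <script> scan hunt() performs ---
# A self-contained sketch using the same htmldom API as above; the HTML
# payload and the key string stand in for found_key_data() and the real
# host response.
from htmldom import htmldom

sample_html = """<html><body>
<script>var other = 1;</script>
<script>var ip_data = [{"ip": "203.0.113.10", "node": "us"}];</script>
</body></html>"""

dom = htmldom.HtmlDom().createDom(sample_html)
for item in dom.find("script"):
    if "ip_data" in item.text():  # stand-in for found_key_data()
        print("found payload:", item.text().strip())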
def start_request(self, method="GET",data=None): """ :param method: :param data: data to send to server. :return: str """ raw_data = None try: self.__client.setup(post=data) answer = self.__client.request() if answer.code != 200: logger.warning( "failed to get response from server! error: {0}\n".format(http.client.responses[answer.code])) raw_data = answer.unicode_body() except Exception as err: logger.log(logger.DETAIL, str(err) + " when request to {0}\n".format( self.__target.netloc + self.__target.path)) # when error happened, it often # means there is still a connection # thus just ignore it. return raw_data
def _find_proxy(self, ip_table, pattern):
    """
    :param ip_table: proxy table rows mixed with junk data.
    :param pattern: regex to capture ip and port.
    :return:
    """
    valid_pattern = False
    for ip_item in ip_table:
        try:
            # Strip all whitespace so the pattern can match across line
            # breaks. The original chained str.replace("\n", ...) and
            # str.replace("\s", ...); the latter only removes the literal
            # two characters, so re.sub does what was intended.
            ip_item = re.sub(r"\s", "", ip_item.html())
            while re.search(pattern, ip_item):
                valid_pattern = True
                ip_port_sm = re.search(pattern, ip_item)
                if is_validate(*ip_port_sm.groups(), target=self.target, check=self.check):
                    if ip_port_sm.groups() not in self.__proxies:
                        self.__proxies.append(ip_port_sm.groups())
                        print(self.format.format(*ip_port_sm.groups()),
                              end="", file=self.out, flush=True)
                # Resume the scan after the end of the port capture group.
                ip_item = ip_item[ip_port_sm.end(2):]
        except Exception as err:
            logger.log(logger.BASIC, str(err))
    if valid_pattern is False:
        raise SettingReader.ProxySettingError("invalid pattern!", pattern)
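# --- Example: a two-group pattern of the kind _find_proxy() expects ---
# Group 1 captures the IP, group 2 the port, and the scan resumes after
# the end of group 2 exactly as in _find_proxy(). The pattern is
# illustrative, not taken from the project's settings.
import re

demo_pattern = r"(\d{1,3}(?:\.\d{1,3}){3})\D{1,10}?(\d{2,5})"
row = "<td>198.51.100.7</td><td>8080</td><td>198.51.100.8</td><td>3128</td>"

while re.search(demo_pattern, row):
    sm = re.search(demo_pattern, row)
    print("ip={0} port={1}".format(*sm.groups()))
    row = row[sm.end(2):]  # same advance step as _find_proxy()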
def start_request(self, method="GET", url="", data=""): """ :param method: :param url: :param data: data to send to server. :return: str """ url = self.__target.path + url raw_data = None try: self.__client.connect() if method == "GET": data = url + data data = str.replace(data, r'//', r'/') self.__client.putrequest(method, data) data = "" else: self.__client.putrequest(method, url) data = urllib.parse.quote(data, '=&') self.__client.putheader('Content-Length', len(data)) self.__put_def_header() self.__put_client_header() self.__client.endheaders() self.__client.send(data.encode()) anwser = self.__client.getresponse() if anwser.status != 200: logger.warning( "failed to get response from server! error: {0}".format(http.client.responses[anwser.status])) raw_data = str(anwser.read().decode()) except Exception as err: logger.log(logger.DETAIL, str(err) + " when request to {0}".format( self.__target.netloc + self.__target.path)) # when error happened, it often # means there is still a connection # thus just ignore it. self.__client.close() return raw_data