def parse(self, soup):
    """Parse a XiCiDaiLi listing table into proxy dicts; skips the header row.

    Column layout per row: td[2]=ip, td[3]=port, td[4]=optional location
    anchor (-> "info"), td[5]=anonymity label.
    type: 0 unknown, 1 transparent, 2 anonymous, 3 elite.
    """
    level_by_label = {"透明": 1, "匿名": 2, "高匿": 3}
    result = []
    for row in soup.find("table").find_all("tr")[1:]:
        try:
            cells = row.find_all("td")
            entry = {
                "ip": cells[2].string,
                "port": cells[3].string,
                "info": "",
                "type": level_by_label.get(cells[5].string, 0),
            }
            anchor = cells[4].find("a")
            if anchor:
                entry["info"] = anchor.string
            result.append(entry)
        except Exception as e:
            # Malformed rows are logged and skipped, never fatal.
            logger.error('XiCiDaiLi parse error: %s', e)
    return result
def parse(self, soup):
    """Extract proxies from a XiCiDaiLi table, one dict per data row.

    The first <tr> is a header and is skipped. Fields read per row:
    td[2] ip, td[3] port, td[4] optional <a> location, td[5] anonymity
    label mapped to type (1 transparent / 2 anonymous / 3 elite / 0 other).
    """
    result = []
    rows = soup.find("table").find_all("tr")
    for row in rows[1:]:
        try:
            cells = row.find_all("td")
            label = cells[5].string
            if label == "透明":
                level = 1
            elif label == "匿名":
                level = 2
            elif label == "高匿":
                level = 3
            else:
                level = 0
            anchor = cells[4].find("a")
            result.append({
                "ip": cells[2].string,
                "port": cells[3].string,
                "info": anchor.string if anchor else "",
                "type": level,
            })
        except Exception as e:
            # Bad rows are skipped; the rest of the table still parses.
            logger.error('XiCiDaiLi parse error: %s', e)
    return result
def parse(self, soup):
    """Parse a KuaiDaiLi blog post body into proxy dicts.

    Entries live in the second <p> of div.entry-content, one per text
    fragment, formatted as "ip:port\xa0\xa0label\xa0\xa0info".
    type: 1 transparent / 2 anonymous / 3 elite / 0 unknown.

    Fixes over the previous version:
    - `except Exception, e` is Python-2-only syntax (a SyntaxError on
      Python 3; the rest of the file uses `as e`) — corrected.
    - the method never returned `result`, so the caller (`get`) always
      received None — added the missing `return`.
    """
    result = []
    paragraphs = soup.find("div", class_="entry-content").find_all("p")
    for fragment in paragraphs[1].stripped_strings:
        try:
            # Fields are separated by double non-breaking spaces.
            rst = fragment.split(u"\xa0\xa0", 2)
            if len(rst) != 3:
                continue
            ip = {
                "ip": rst[0].split(":")[0],
                "port": rst[0].split(":")[1],
                "info": rst[2],
            }
            if rst[1] == "透明":
                ip["type"] = 1
            elif rst[1] == "匿名":
                ip["type"] = 2
            elif rst[1] == "高匿名":
                ip["type"] = 3
            else:
                ip["type"] = 0
            result.append(ip)
        except Exception as e:
            # Malformed fragments are logged and skipped.
            logger.error('KuaiDaiLi parse error: %s', e)
    return result
def parse(self, soup):
    """Parse proxies out of a KuaiDaiLi blog article.

    The second <p> inside div.entry-content holds one entry per text
    fragment, shaped "ip:port\xa0\xa0label\xa0\xa0info"; the label maps
    to type 1/2/3 (transparent/anonymous/elite), 0 otherwise.

    Fixes: `except Exception, e` (Python-2 syntax, a SyntaxError under
    Python 3 — the file otherwise uses `as e`) and the missing
    `return result`, which made the caller always receive None.
    """
    result = []
    body = soup.find("div", class_="entry-content").find_all("p")
    for d in body[1].stripped_strings:
        try:
            rst = d.split(u"\xa0\xa0", 2)  # double NBSP separates the 3 fields
            if len(rst) != 3:
                continue
            ip = {
                "ip": rst[0].split(":")[0],
                "port": rst[0].split(":")[1],
                "info": rst[2],
            }
            if rst[1] == "透明":
                ip["type"] = 1
            elif rst[1] == "匿名":
                ip["type"] = 2
            elif rst[1] == "高匿名":
                ip["type"] = 3
            else:
                ip["type"] = 0
            result.append(ip)
        except Exception as e:
            logger.error('KuaiDaiLi parse error: %s', e)
    return result
def run(self, proxyips):
    # Validate the crawled proxies per configured type and persist results.
    #
    # proxyips: crawled proxy entries; self.classify() groups them into
    # sets keyed by proxy type (exact schema defined elsewhere — TODO confirm).
    # Results go to a file (when SNIFFER['OUTPUT'] is set) and/or to a
    # redis backend (when SNIFFER['BACKEND'] is non-empty).
    result = {}
    proxy_set = self.classify(proxyips)
    for proxy_type in self.proxy_type:
        proxy_list = list(proxy_set.get(proxy_type, set()))
        logger.info('sniffer start, proxy_type: %s, proxy_ip: %s', proxy_type, len(proxy_list))
        # Validation is fanned out across processes; only usable proxies survive.
        result[proxy_type] = self.validator.run_in_multiprocess(proxy_list)
        logger.info('sniffer finish, proxy_type: %s, avail_ip: %s', proxy_type, len(result[proxy_type]))
    if SNIFFER['OUTPUT']:
        try:
            self.save2file(result)
        except Exception as e:
            # File output is best-effort: log and continue to the backend step.
            logger.error("Write file fail, error: %s", e)
    if SNIFFER['BACKEND'] != '':
        try:
            # BACKEND looks like "host:port"; the split is splatted into
            # StrictRedis positional args — presumably (host, port). TODO confirm.
            self.redis = redis.StrictRedis(*SNIFFER['BACKEND'].split(':'))
            self.redis.ping()
        except Exception as e:
            # Redis unreachable: log and bail out without touching the backend.
            logger.error("Backend redis error: %s", e)
            return
        # NOTE(review): 'reflesh' [sic] — name defined elsewhere in the class.
        self.reflesh_redis()
        self.save2redis(result)
def parse(self, soup):
    """Collect proxy entries from every table on a CNProxy page.

    The first two rows of each table are header rows and are skipped;
    each remaining row's <td> list is delegated to self._parse().
    """
    result = []
    for table in soup.find_all("table"):
        rows = table.find_all("tr")
        for row in rows[2:]:
            try:
                result.append(self._parse(row.find_all("td")))
            except Exception as e:
                # A bad row is logged and skipped; the scan continues.
                logger.error('CNProxy parse error: %s', e)
    return result
def parse(self, soup):
    """Walk all proxy tables on a CNProxy page and parse each data row.

    Rows 0 and 1 of every table are headers; self._parse() converts the
    remaining rows' cells into proxy dicts.
    """
    result = []
    tables = soup.find_all("table")
    for tbl in tables:
        for data_row in tbl.find_all("tr")[2:]:
            cells = None
            try:
                cells = data_row.find_all("td")
                result.append(self._parse(cells))
            except Exception as e:
                # Keep going on malformed rows — just record the failure.
                logger.error('CNProxy parse error: %s', e)
    return result
def crawl(self):
    """Crawl the two most recent KuaiDaiLi blog posts for proxies.

    Returns a list of proxy dicts; an empty list when the blog index
    cannot be fetched or yields a non-OK status.
    """
    base = "http://blog.kuaidaili.com/"
    proxyip = []
    try:
        r = requests.get(base)
    except Exception as e:
        # Previously a connection error/timeout propagated and killed the
        # whole crawl run; log and return, matching the style of get().
        logger.error("KuaiDaiLi crawl root fail, error: %s", e)
        return proxyip
    if r.status_code == requests.codes.ok:
        soup = BeautifulSoup(r.text, "html5lib")
        # Each <article> links one blog post; only the newest two are fresh.
        for s in soup.find_all("article")[:2]:
            proxyip.extend(self.get(s.find("a")["href"]))
    else:
        logger.error("KuaiDaiLi crawl root fail, HTTP Response Code: %s", r.status_code)
    return proxyip
def crawl(self):
    """Fetch the KuaiDaiLi blog index and crawl its two newest posts.

    Returns the combined proxy list from both posts, or [] when the
    index request fails (network error or non-OK HTTP status).
    """
    base = "http://blog.kuaidaili.com/"
    proxyip = []
    try:
        r = requests.get(base)
    except Exception as e:
        # Fix: network failures used to propagate out of crawl(); handle
        # them here like the shared get() helper does.
        logger.error("KuaiDaiLi crawl root fail, error: %s", e)
        return proxyip
    if r.status_code == requests.codes.ok:
        soup = BeautifulSoup(r.text, "html5lib")
        for s in soup.find_all("article")[:2]:  # two most recent posts
            proxyip.extend(self.get(s.find("a")["href"]))
    else:
        logger.error("KuaiDaiLi crawl root fail, HTTP Response Code: %s", r.status_code)
    return proxyip
def parse(self, soup):
    """Extract proxies from the CZ88 #boxright list.

    Each <li> (after the header item) carries up to four text fields
    mapped positionally to ip/port/type/info; the parsed type field is
    then unconditionally overridden to 1 (transparent).
    """
    keys = ("ip", "port", "type", "info")
    items = soup.find("div", id="boxright").find_all("li")
    result = []
    for item in items[1:]:
        try:
            ip = {}
            for idx, val in enumerate(item.stripped_strings):
                ip[keys[idx]] = val  # >4 fields -> IndexError, row is skipped
            ip['type'] = 1
            result.append(ip)
        except Exception as e:
            logger.error('CZ88 parse error: %s', e)
    return result
def parse(self, soup):
    """Parse the CZ88 proxy list found under div#boxright.

    Text fields of each <li> map positionally onto ip/port/type/info;
    the type is then forced to 1 (transparent) regardless of the text.
    The first <li> is the header and is skipped.
    """
    result = []
    field_names = ["ip", "port", "type", "info"]
    entries = soup.find("div", id="boxright").find_all("li")
    for entry in entries[1:]:
        try:
            proxy = {}
            for pos, text in enumerate(entry.stripped_strings):
                # More than four fields raises IndexError -> row skipped.
                proxy[field_names[pos]] = text
            proxy['type'] = 1
            result.append(proxy)
        except Exception as e:
            logger.error('CZ88 parse error: %s', e)
    return result
def parse(self, soup):
    """Parse the 66ip API plain-text body: one "ip:port" per text node.

    Only non-empty fragments whose first character is a digit are taken
    as entries; tags and blank nodes are ignored. info/type are not
    provided by this source and default to ""/0.
    """
    result = []
    for node in soup.find('body').contents:
        try:
            text = str(node).strip()
            if not text or not text[0].isdigit():
                continue
            parts = text.split(':')
            result.append({
                "ip": parts[0],
                "port": parts[1],  # missing colon -> IndexError, node skipped
                "info": "",
                "type": 0,
            })
        except Exception as e:
            logger.error('IP66API parse error: %s', e)
    return result
def parse(self, soup):
    """Read "ip:port" lines out of the 66ip API response body.

    Each child of <body> is stringified and stripped; fragments that
    start with a digit are split on ':' into ip/port. Everything else
    (markup, empty nodes) is ignored.
    """
    result = []
    body_nodes = soup.find('body').contents
    for node in body_nodes:
        try:
            d = str(node).strip()
            if d != '' and d[0].isdigit():
                host, port = d.split(':')[0], d.split(':')[1]
                result.append({"ip": host, "port": port, "info": "", "type": 0})
        except Exception as e:
            # e.g. a fragment without ':' — log it and carry on.
            logger.error('IP66API parse error: %s', e)
    return result
def get(self, url, encoding=None, headers=None):
    """Fetch *url*, parse the response HTML and return the proxy list.

    encoding: forced response encoding, for servers that report a wrong one.
    headers: optional extra HTTP headers for the request.
    Returns [] on any network or parse failure (logged, never raised).
    """
    logger.info('crawl: %s', url)
    try:
        if headers:
            r = requests.get(url, headers=headers)
        else:
            r = requests.get(url)
        if encoding:
            r.encoding = encoding
        if r.status_code != requests.codes.ok:
            raise Exception("HTTP Response Code: %s" % r.status_code)
        return self.parse(BeautifulSoup(r.text, "html5lib"))
    except Exception as e:
        logger.error('Crawl error: %s', e)
        return []
def parse(self, soup):
    """Parse an IP002 proxy table; one proxy dict per <tr>.

    Columns: td[0] ip, td[1] port, td[2] anonymity label, td[3] info.
    type: 1 transparent, 3 elite, 0 otherwise. Rows without enough
    cells (e.g. a header of <th>) raise and are skipped with a log line.
    """
    result = []
    for row in soup.find("table").find_all("tr"):
        try:
            cells = row.find_all("td")
            label = cells[2].string
            result.append({
                "ip": cells[0].string,
                "port": cells[1].string,
                "info": cells[3].string,
                "type": 1 if label == "透明" else 3 if label == "高匿" else 0,
            })
        except Exception as e:
            logger.error('IP002 parse error: %s', e)
    return result
def parse(self, soup):
    """Extract proxies from the IP002 table.

    Every <tr> is attempted (there is no header skip in this source);
    td[0]/td[1]/td[3] feed ip/port/info, and td[2]'s label selects the
    type: "透明" -> 1, "高匿" -> 3, anything else -> 0.
    """
    result = []
    rows = soup.find("table").find_all("tr")
    for row in rows:
        try:
            w = row.find_all("td")
            entry = {"ip": w[0].string, "port": w[1].string, "info": w[3].string, "type": 0}
            if w[2].string == "透明":
                entry['type'] = 1
            elif w[2].string == "高匿":
                entry['type'] = 3
            result.append(entry)
        except Exception as e:
            # Rows with missing cells are logged and dropped.
            logger.error('IP002 parse error: %s', e)
    return result
def parse(self, soup):
    """Parse the kuaidaili.com listing table; skips the header row.

    Columns: td[0] ip, td[1] port, td[2] anonymity label, td[5] info.
    type: 1 transparent / 2 anonymous / 3 elite / 0 unknown.
    """
    anonymity_levels = {"透明": 1, "匿名": 2, "高匿名": 3}
    result = []
    for row in soup.find("table").find_all("tr")[1:]:
        try:
            cells = row.find_all("td")
            result.append({
                "ip": cells[0].string,
                "port": cells[1].string,
                "info": cells[5].string,
                "type": anonymity_levels.get(cells[2].string, 0),
            })
        except Exception as e:
            # Rows with missing cells are logged and skipped.
            logger.error('KuaiDaiLi2 parse error: %s', e)
    return result
def parse(self, soup):
    """Turn the kuaidaili.com table rows (after the header) into proxy dicts.

    Per row: td[0] ip, td[1] port, td[5] info; td[2]'s label yields the
    anonymity type (1 transparent, 2 anonymous, 3 elite, 0 otherwise).
    """
    result = []
    data_rows = soup.find("table").find_all("tr")[1:]
    for data_row in data_rows:
        try:
            d = data_row.find_all("td")
            label = d[2].string
            if label == "透明":
                level = 1
            elif label == "匿名":
                level = 2
            elif label == "高匿名":
                level = 3
            else:
                level = 0
            result.append({"ip": d[0].string, "port": d[1].string, "info": d[5].string, "type": level})
        except Exception as e:
            logger.error('KuaiDaiLi2 parse error: %s', e)
    return result