def fetch_xici(self, num):
    """Crawl proxies from http://www.xicidaili.com/ (roughly 10% usable)."""
    collected = []
    page_no = 1
    # Stop after 2 pages, or once enough validated proxies were gathered.
    while len(collected) <= num and page_no <= 2:
        url = "http://www.xicidaili.com/nn/%s" % page_no
        resp = requests.get(url, headers=self.headers)
        doc = etree.HTML(resp.text)
        for row in doc.xpath('//tr[@class]'):
            cells = row.xpath('td/text()')
            ip, port = cells[0], cells[1]
            # Speed / latency titles end with a unit character; strip it.
            speed = row.xpath('td[7]/div/@title')[0][:-1]
            latency = row.xpath('td[8]/div/@title')[0][:-1]
            # Only live-check reasonably fast candidates.
            if float(speed) < 3 and float(latency) < 1:
                proxy = "%s:%s" % (ip, port)
                checked = self.proxy_vaild({'http': proxy, 'https': proxy})
                if checked[0]:
                    collected.append(checked[1])
        logger.info('抓取 xicidaili 第 %d 页,有效代理 %d 个' % (page_no, len(collected)))
        page_no += 1
    return collected
def scheduler(self):
    """Dispatch to a status handler based on the API infocode."""
    handled = (
        '10000', '10001', '10003', '10004', '10016',
        '10020', '10021', '10022', '10023',
    )
    skipped = ('20800', '20801', '20802', '20803', '20003')
    # infocode -> handler; rebuilt each call so bound methods stay fresh.
    self.status_dict = {
        "10000": self.status_ok,
        "10001": self.status_change_key,
        "10003": self.status_change_key,
        "10004": self.status_change_user_agent,
        "10010": self.status_change_proxy,
        "10016": self.status_change_user_agent,
        "10020": self.status_change_key,
        "10021": self.status_change_proxy,
        "10022": self.status_change_proxy,
        "10023": self.status_change_key,
    }
    status = self.respond['status']
    infocode = self.respond['infocode']
    if infocode in handled:
        return self.status_dict[infocode]()
    if infocode in skipped:
        logger.info('出现 %s 跳过的网址 %s' % (infocode, self.req_url))
        self.status_pass()
    else:
        print(infocode)
        logger.info(infocode)
        self.status_invalid_request()
def get_count(self):
    """Fire the request and return the integer 'count' field, 0 if absent."""
    self.requestor()
    raw = self.respond.get("count")
    if not raw:
        logger.info('没有count字段的网址是 %s' % self.req_url)
        return 0
    return int(raw)
def status_sleep_try(self):
    """Pause 5 seconds and retry the request; give up after 10 retries."""
    if self.repeat_times > 10:
        logger.info("重试超过 10 次, 跳过 %s" % self.req.url)
        self.status_pass()
        return
    time.sleep(5)
    print('=====================休息5秒======================')
    self.repeat_times += 1
    return self.process()
def status_ok(self):
    """Parse every entry of respond['results']; return the parsed list."""
    results = self.respond.get('results')
    if not results:
        logger.info('结果为空 %s' % self.req.url)
        return
    parsed = [self.parser(item) for item in results]
    print('%s 地址获取成功' % self.params['address'])
    return parsed
def scheduler(self):
    """Route a response: re-request non-dicts, surface API errors, else parse."""
    if not isinstance(self.respond, dict):
        return self.requestor()
    if not self.respond.get("Error"):
        return self.parser()
    message = self.respond["Error"]["Message"]
    self.req_stat(message)
    logger.info('%s %s-%s %s' % (self.DDate, self.Dcity, self.Acity, message))
def parser(self):
    """Build a DataFrame from the respond payload, tagging city and date."""
    if not self.respond:
        print(self.req_url)
        logger.info('%s 没有数据' % self.params['city_name'])
        return
    frame = pd.DataFrame(self.respond)
    frame['city'] = self.params['city_name']
    frame['date'] = self.params['date_end']
    return frame
def status_ok(self):
    """Parse each POI in the respond; log URLs that yielded nothing."""
    pois = self.respond.get('pois')
    if not pois:
        logger.info('没有值的连接是 %s' % self.req_url)
        return
    parsed = []
    for poi in pois:
        if poi:
            parsed.append(self.parser(poi))
        else:
            logger.info('没有值的连接是 %s' % self.req_url)
    return parsed
def proxy_vaild(self, proxy_dict):
    """Live-check a proxy against a whats-my-ip service.

    Returns (True, proxy_dict) when the proxy answers with an IP different
    from our own; otherwise returns a (False,) tuple.
    """
    probe_url = "http://ip.chinaz.com/getip.aspx"  # echoes the caller's IP
    try:
        resp = requests.get(probe_url, proxies=proxy_dict,
                            headers=self.headers, timeout=3,
                            allow_redirects=False)
        if resp.status_code == 200 and resp.text != Fetch_proxy.local_ip:
            print(resp.text)
            return (True, proxy_dict)
        logger.info('_______%s 无效代理________' % resp.status_code)
        return (False, )
    except (req_e.ReadTimeout, req_e.ConnectTimeout, req_e.ProxyError,
            req_e.ConnectionError, req_e.ChunkedEncodingError):
        logger.info('_______连接超时 无效代理________')
        return (False, )
def status_ok(self):
    """Parse results unless pagination says the query matched more than 20."""
    if 'next_page_token' in self.respond:
        # The API only sends next_page_token when more results exist.
        return '结果超出20个'
    results = self.respond.get('results')
    if not results:
        logger.info('结果为空 %s' % self.req_url)
        return
    parsed = [self.parser(item) for item in results]
    print('%s %s 采集成功' % (self.params['location'], self.params['types']))
    return parsed
def expand(self, distance, lng1=0, lat1=0, lng2=0, lat2=0):
    """Grow this rectangle outward by `distance` meters on every side.

    Any explicitly supplied non-zero corner coordinate overrides the
    computed expansion for that coordinate.  Returns a new Rectangle.
    """
    # Approximate degrees per meter for this latitude band.
    lng_per_meter = 0.00001141
    lat_per_meter = 0.00000899
    ex_lng1 = round(lng1 if lng1 != 0 else self.lng1 - lng_per_meter * distance, 6)
    ex_lng2 = round(lng2 if lng2 != 0 else self.lng2 + lng_per_meter * distance, 6)
    ex_lat1 = round(lat1 if lat1 != 0 else self.lat1 - lat_per_meter * distance, 6)
    ex_lat2 = round(lat2 if lat2 != 0 else self.lat2 + lat_per_meter * distance, 6)
    logger.info('拓展矩形为(%s,%s,%s,%s)' % (ex_lng1, ex_lat1, ex_lng2, ex_lat2))
    return Rectangle(ex_lng1, ex_lat1, ex_lng2, ex_lat2)
def fetch_new_proxyes(self, num):
    """Run each crawler in turn until `num` validated proxies are pooled."""
    crawlers = [self.fetch_ip181, self.fetch_66ip,
                self.fetch_xici, self.fetch_kxdaili]
    pooled = []
    remaining = num
    for crawler in crawlers:
        fresh = crawler(remaining)
        logger.info('_______抓取新代理%s________' % len(fresh))
        pooled += fresh
        remaining -= len(fresh)
        if remaining <= 0:
            logger.info('_______代理抓取完毕,共%s________' % len(pooled))
            # self.save_proxy(pooled)
            break
    return pooled
def scheduler_by_statuscode(self, status_code):
    """Dispatch on the HTTP status code of the last request.

    On 200 the body is decoded: first as JSON; failing that, the raw text
    (a list-like payload with missing elements, e.g. ",," or "[,") is
    patched with empty strings and parsed as a Python literal.  Redirect /
    throttle codes rotate the proxy; client-error codes mark the URL as
    having no data; everything else rotates the user agent.
    """
    if status_code == 200:
        try:
            self.respond = self.req.json()
            if self.proxys['proxies']:
                # Recycle the proxy that is still working.
                Fetch_proxy.proxy_pool.append(self.proxys['proxies'])
        except Exception:
            content = self.req.text
            # Fill the holes the upstream leaves in its list syntax.
            while ",," in content:
                content = content.replace(',,', ',"",')
            while "[," in content:
                content = content.replace("[,", '["",')
            try:
                # literal_eval instead of eval: this is untrusted network
                # data and must never be executed as code.
                import ast
                content = ast.literal_eval(content)
            except Exception:
                pass
            if isinstance(content, list):
                self.respond = content
            else:
                # Keep the undecodable payload around for debugging.
                self._respond = content
                self.respond = None
    elif status_code in (301, 302, 429, 502, 403):
        # Original list contained 302 twice; membership is unchanged.
        self.status_change_proxy()
    elif status_code in (400, 401, 402, 404):
        logger.info('%s_%s 没有信息' % (self.url, status_code))
        self.respond = None
    elif status_code in (202, 204):
        print(status_code)
        time.sleep(2)
        self.status_change_user_agent()
    else:
        # 500 and anything unexpected: rotate the user agent and retry.
        print(status_code)
        self.status_change_user_agent()
def fetch_66ip(self, num):
    """Crawl the http://www.66ip.cn/ bulk API (roughly 25% usable)."""
    collected = []
    url = ("http://www.66ip.cn/nmtq.php?getnum=100&isp=0&anonymoustype=3"
           "&start=&ports=&export=&ipaddress=&area=1&proxytype=0&api=66ip")
    resp = requests.get(url, headers=self.headers)
    # Proxies are <br />-separated text after the first </script> tag.
    entries = resp.text.split("</script>")[1].split("<br />")
    for entry in entries[:-1]:
        candidate = entry.strip()
        if not candidate:
            continue
        checked = self.proxy_vaild({'http': candidate, 'https': candidate})
        if checked[0]:
            collected.append(checked[1])
        if len(collected) >= num:
            break
    logger.info('抓取 66ip,有效代理 %d 个' % (len(collected)))
    return collected
def fetch_ip181(self, num):
    """Crawl http://www.ip181.com/ (100 new every 10 min, ~55% usable)."""
    collected = []
    resp = requests.get('http://www.ip181.com/', headers=self.headers)
    doc = etree.HTML(resp.text)
    rows = doc.xpath('//tr')
    for row in rows[1:]:  # first row is the table header
        cells = row.xpath('td/text()')
        ip, port = cells[0], cells[1]
        latency = cells[4].split(' ')[0]
        # Only live-check low-latency candidates.
        if float(latency) < 0.5:
            proxy = "%s:%s" % (ip, port)
            checked = self.proxy_vaild({'http': proxy, 'https': proxy})
            if checked[0]:
                collected.append(checked[1])
                if len(collected) >= num:
                    break
    logger.info('抓取 ip181,有效代理 %d 个' % (len(collected)))
    return collected
def fetch_kxdaili(self, num):
    """Crawl http://www.kxdaili.com/ (roughly 5% usable)."""
    collected = []
    page_no = 1
    # Stop after 10 pages, or once enough validated proxies were gathered.
    while len(collected) <= num and page_no <= 10:
        url = "http://www.kxdaili.com/dailiip/1/%d.html" % page_no
        resp = requests.get(url, headers=self.headers)
        doc = etree.HTML(resp.text)
        for row in doc.xpath('//tr'):
            cells = row.xpath('td/text()')
            ip, port = cells[0], cells[1]
            latency = cells[4].split(' ')[0]
            # Only live-check low-latency candidates.
            if float(latency) < 0.5:
                proxy = "%s:%s" % (ip, port)
                checked = self.proxy_vaild({'http': proxy, 'https': proxy})
                if checked[0]:
                    collected.append(checked[1])
        logger.info('抓取 kxdaili 第 %d 页,有效代理 %d 个' % (page_no, len(collected)))
        page_no += 1
    return collected
def parser(self, json_dict):
    """Decode an easygo heat-map response into WGS-84 points.

    Handles the service's application codes: 0 = data (possibly empty),
    3 = account needs verification, -100 = account quota exhausted.
    """
    print(json_dict)
    datas = json_dict.get('data')
    codes = json_dict.get('code')
    if codes == 0 and len(datas) != 0:
        # Normalize counts against the smallest cell so they are relative.
        min_count = min(item['count'] for item in datas)
        points = []
        for item in datas:
            # Grid-to-coordinate formula comes from the easygo web client:
            # http://c.easygo.qq.com/eg_toc/js/map-55f0ea7694.bundle.js
            gcj_lng = 1e-6 * (250.0 * item['grid_x'] + 125.0)
            gcj_lat = 1e-6 * (250.0 * item['grid_y'] + 125.0)
            lng, lat = transCoordinateSystem.gcj02_to_wgs84(gcj_lng, gcj_lat)
            points.append({
                'gcj_lng': gcj_lng,
                'gcj_lat': gcj_lat,
                'lng': lng,
                'lat': lat,
                'count': item['count'] / min_count,
                'req_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            })
        # Remember working cookies at class level for later requests.
        Easygo_Clawer.cookies = self.cookies
        return points
    elif codes == 0 and len(datas) == 0:
        print("此区域没有点信息")
        logger.info("此区域没有点信息 %s" % self.req_url)
    elif codes == 3:
        logger.info("%s 账号需要验证" % self.qq_account)
        time.sleep(3)
        self.cookies = self.get_cookie()
        points = self.process()
        Easygo_Clawer.req_num = 1
        return points
    elif codes == -100:
        logger.info("%s 账号已用完" % self.qq_account)
        time.sleep(3)
        self.cookies = self.get_cookie()
        points = self.process()
        Easygo_Clawer.req_num = 1
        return points
    else:
        print(json_dict)
        logger.info("%s 账号出现未知错误" % self.qq_account)
def status_pass(self):
    """Log that this URL was skipped."""
    message = '已跳过 %s' % self.req.url
    logger.info(message)
def status_change_key(self):
    """Rotate to the next API key, then re-run the request pipeline."""
    message = '更换密钥 %s' % self.req.url
    logger.info(message)
    self.params.update_key()
    return self.process()
"从化区": [Rectangle(113.2738078, 23.37099304, 114.0565605, 23.93695479)], "番禺区": [Rectangle(113.2429326, 22.87177748, 113.5533215, 23.08258251)], "海珠区": [Rectangle(113.2333014, 23.04533721, 113.4122732, 23.11366537)], "花都区": [Rectangle(112.9540515, 23.24907373, 113.4694197, 23.61688869)], "荔湾区": [Rectangle(113.1706897, 23.0442161, 113.2693343, 23.15839047)], "黄埔区": [Rectangle(113.389631, 23.03409065, 113.6017962, 23.42672447)], "南沙区": [Rectangle(113.2911038, 22.56227328, 113.6843494, 22.90920969)], "天河区": [Rectangle(113.2922662, 23.09766052, 113.4391771, 23.24457675)], "越秀区": [Rectangle(113.2323543, 23.10463126, 113.3178628, 23.17175286)], "增城区": [Rectangle(113.5406707, 23.08627615, 113.9949777, 23.62208945)] } start_time = datetime.datetime.now().strftime('%y-%m-%d %I:%M:%S %p') info_dict = { '名称': 'Google POI 抓取工具V1.0', '邮箱': '*****@*****.**', '起始时间': start_time, '终止时间': '20180401' } logger.info(param_info(info_dict)) for region_name, rect_list in rect_dict.items(): main(region_name, rect_list) email_alarm = Email_alarm() end_time = datetime.datetime.now().strftime('%y-%m-%d %I:%M:%S %p') info_dict = { '名称': 'Google POI 抓取工具V1.0', '邮箱': '*****@*****.**', '起始时间': start_time, '终止时间': end_time } email_alarm.send_mail(param_info(info_dict))
def save_proxy(self, res_list):
    """Persist the validated proxy list to proxy.csv."""
    pd.DataFrame(res_list).to_csv('proxy.csv')
    logger.info('_______代理已储存________')
def status_invalid_request(self):
    """Log a malformed / rejected request."""
    message = '请求错误 %s' % self.req.url
    logger.info(message)
def status_unknown_error(self):
    """Log an error that has no dedicated handler."""
    message = '未知错误 %s' % self.req.url
    logger.info(message)
def param_info(info_dict):
    """Render *info_dict* as a two-column table and return it as a string.

    Fix: the original logged the table itself and returned None, so callers
    doing ``logger.info(param_info(...))`` logged ``None`` and
    ``send_mail(param_info(...))`` mailed an empty report.  Returning the
    rendered table (prefixed with a newline so it aligns in log output)
    lets each caller log or mail it as intended.
    """
    info_table = prettytable.PrettyTable(['项目', '描述'])
    for key, value in info_dict.items():
        info_table.add_row([key, value])
    info_table.align = 'l'  # left-align both columns
    return '\n' + str(info_table)