def process_item(self, item, collection_name, use_id=True):
    """Persist *item* into the given MongoDB collection.

    Args:
        item: mapping of document fields; must contain '_id' when
            ``use_id`` is True.
        collection_name: name of the target collection in ``self.db``.
        use_id: when True, upsert keyed on ``item['_id']`` so saving the
            same id twice overwrites instead of duplicating; when False,
            do a plain insert.
    """
    collection = self.db[collection_name]
    msg = 'insert data into collection: [%s]' % collection_name
    logger.info(msg)
    if use_id:
        # BUG FIX: Collection.update(spec, doc, True) is deprecated and
        # removed in pymongo 4.x; replace_one(..., upsert=True) is the
        # supported equivalent for whole-document upserts.
        collection.replace_one({'_id': item['_id']}, dict(item), upsert=True)
    else:
        # insert_one supersedes the deprecated Collection.insert.
        collection.insert_one(dict(item))
def execute_spider():
    """Run the WorkSpider and block until every queued job has finished."""
    spider = WorkSpider()
    spider.run()
    # run() returns once dispatch is done; block here until both the
    # work queue and the save queue have fully drained.
    work_queue.join()
    save_queue.join()
    logger.info('All Job Finishing, Please Check!')
def _connect(self):
    """Open the MongoDB connection and bind ``self.db``.

    Raises:
        ConnectionFailure: when the server cannot be reached within
            ``database_connect_time_out`` milliseconds (bad config or
            connection timeout).
    """
    try:
        self.client = pymongo.MongoClient(
            self.host,
            self.port,
            serverSelectionTimeoutMS=database_connect_time_out,
            connectTimeoutMS=database_connect_time_out)
        # server_info() forces a round-trip so an unreachable server
        # fails here rather than on the first real query.
        self.client.server_info()
        # BUG FIX: the log messages formatted the bare names ``host`` /
        # ``port`` instead of the ``self.host`` / ``self.port`` actually
        # used to connect, so they could report stale/wrong values.
        msg = 'host: {} port: {} database_name : {} MongoDB数据库连接成功'.format(
            self.host, self.port, self.database)
        logger.info(msg)
        self.db = self.client[self.database]
    except ServerSelectionTimeoutError as e:
        msg = 'host: {} port: {} database_name : {} MongoDB数据库连接失败 原因: 可能配置文件出错或者连接超时 超时时间为: {} 毫秒'.format(
            self.host, self.port, self.database, database_connect_time_out)
        # Chain the original timeout error for debuggability.
        raise ConnectionFailure(msg) from e
def get_all_IP(self, collection_name):
    """Load every proxy IP from *collection_name* into memory.

    Returns the list of ip strings, ordered newest-first by insert_time
    with ties broken by ascending response_time; also caches the list
    on ``self.ipList``.
    """
    collection = self.db[collection_name]
    # BUG FIX: chained cursor.sort() calls replace each other in pymongo,
    # so only the last key ("response_time") was ever applied.  A single
    # compound sort keeps both ordering keys.
    data = collection.find().sort([
        ("insert_time", pymongo.DESCENDING),
        ("response_time", pymongo.ASCENDING),
    ])
    ips = [doc.get('ip') for doc in data]
    if not ips:
        logger.info("数据库内暂无IP")
        # Pool is empty: re-crawl fresh proxies.
        # NOTE(review): ``ips`` is still empty after this call and is
        # assigned to self.ipList below — presumably update_ip_pool()
        # repopulates self.ipList itself via update_ipList(); confirm.
        self.update_ip_pool()
    self.ipList = ips
    return ips
def update_ip_pool(self):
    """Refresh the proxy pool: validate, re-crawl new IPs, reload into memory."""
    logger.info("开始执行更新IP代理池中的IP并从网上抓取新的IP放入池中")
    start_time = time.time()
    check()
    execute_spider()
    logger.info("刷新数据库中IP带内存中来")
    self.update_ipList()
    # BUG FIX: end_time was captured before update_ipList(), so the
    # logged duration silently excluded the in-memory refresh step.
    end_time = time.time()
    logger.info("IP代理池更新完毕.. 使用时间为 {} 秒".format(end_time - start_time))
def save_ip(self, response):
    """Persist one verified proxy record extracted from *response*."""
    meta = response.get('meta')
    website_name = meta.get('website_name')
    target_url = meta.get('target_url')
    response_time = response.get('content')
    _ip = response.get('url')
    msg = '[{ip}] can visit the target url [{target_url}], source is [{source}]'.format(
        ip=_ip, target_url=target_url, source=website_name)
    logger.info(msg)
    # MongoDB document, keyed on ip+target_url so re-checking the same
    # proxy against the same target upserts rather than duplicating.
    insert_data = {
        '_id': _ip + '_' + target_url,
        'ip': _ip,
        'source': website_name,
        'response_time': response_time,
        'target_url': target_url,
        'insert_time': time.strftime('%Y-%m-%d %H:%M:%S'),
    }
    # Hand off to the pipeline for persistence.
    self.pipeline.process_item(insert_data, self.collection_name)
def request(self, _args, dont_filter):
    """Fetch a URL with retries, optional proxying and duplicate filtering.

    Args:
        _args: dict of per-request overrides (url, sleep_time, time_out,
            retry_times, use_proxy, ip, ua_type, diy_header, method,
            submit_data); missing keys fall back to the instance defaults.
        dont_filter: when False, already-crawled URLs are skipped and
            successful URLs are recorded in the dedup filter.

    Returns:
        (content, final_url) on success, ('HAS CRAWLED', url) when the
        URL was already seen, or (None, None) after all retries fail.
    """
    url = _args.get('url')
    # Per-request overrides; any falsy/missing value falls back to the
    # instance default (note: an explicit 0 also falls back).
    sleep_time = _args.get('sleep_time') if _args.get(
        'sleep_time') else self.sleep_time
    time_out = _args.get('time_out') if _args.get(
        'time_out') else self.time_out
    retry_times = _args.get('retry_times') if _args.get(
        'retry_times') else self.retry_times
    use_proxy = _args.get('use_proxy') if _args.get(
        'use_proxy') else self.use_proxy
    _ip = _args.get('ip') if _args.get('ip') else self.ip
    ua_type = _args.get('ua_type') if _args.get(
        'ua_type') else self.ua_type
    diy_header = _args.get('diy_header') if _args.get(
        'diy_header') else self.diy_header
    method = _args.get('method') if _args.get('method') else self.method
    post_data = _args.get('submit_data') if _args.get(
        'submit_data') else self.submit_data
    # Dedup check: bail out early if this URL has been crawled before.
    if not dont_filter:
        check_result = self.check(url)
        if not check_result:
            return 'HAS CRAWLED', url
        else:
            msg = 'new url'
            logger.info(msg)
    if not url.startswith('http'):
        raise ValueError('url has to be started with http or https')
    # Build headers: a caller-supplied header wins; otherwise pick a
    # random PC (or mobile) User-Agent and derive Host from the URL.
    if diy_header:
        header = diy_header
    else:
        host = parse.urlparse(url).netloc
        header = {
            'User-Agent': random.choice(PC_USER_AGENTS),
            'Host': host,
        }
        if ua_type == 'mobile':
            header = {
                'User-Agent': random.choice(MOBILE_USER_AGENTS),
                'Host': host
            }
    times = 0
    setting_time = retry_times  # original retry budget, for the give-up check
    con = None
    # Retry loop: each failed attempt decrements retry_times; success
    # returns from the try's else-clause below.
    while retry_times > 0:
        times += 1
        self.log.info('request %s, times: %d' % (url, times))
        try:
            if use_proxy:
                ip = _ip
                if ip:
                    # Both schemes are routed through the same http proxy.
                    proxy = {
                        'http': 'http://%s' % ip,
                        'https': 'http://%s' % ip
                    }
                    if method == 'get':
                        con = request_session.get(url,
                                                  headers=header,
                                                  proxies=proxy,
                                                  timeout=time_out,
                                                  params=post_data,
                                                  verify=False)
                    elif method == 'post':
                        if post_data and isinstance(post_data, dict):
                            con = request_session.post(url,
                                                       headers=header,
                                                       proxies=proxy,
                                                       timeout=time_out,
                                                       data=post_data,
                                                       verify=False)
                        else:
                            self.log.error(
                                'while method is post, post_data must be defined and defined as dict'
                            )
                    # If method was neither get nor post, con is still None
                    # here and this attribute access raises, which the
                    # except below turns into a retry.
                    if con.status_code not in self.status_code:
                        self.log.error('status code is %s' % con.status_code)
                        raise ValueError(
                            'status code not in the code in config.py, check your log'
                        )
                    time.sleep(sleep_time)
                else:
                    msg = 'ip can not be none while use_proxy is True'
                    self.log.error(msg)
                    # Hard-kills the whole process on misconfiguration.
                    os._exit(0)
            else:
                # Same request logic without a proxy.
                if method == 'get':
                    con = request_session.get(url,
                                              headers=header,
                                              timeout=time_out,
                                              params=post_data,
                                              verify=False)
                elif method == 'post':
                    if post_data and isinstance(post_data, dict):
                        con = request_session.post(url,
                                                   headers=header,
                                                   timeout=time_out,
                                                   data=post_data,
                                                   verify=False)
                    else:
                        self.log.error(
                            'while method is post, post_data must be defined and defined as dict'
                        )
                        os._exit(0)
                if con.status_code not in self.status_code:
                    self.log.error('status code is %s' % con.status_code)
                    raise ValueError(
                        'status code not in the code in config.py, check your log'
                    )
                time.sleep(sleep_time)
        except Exception as e:
            # Any failure (network error, bad status, None con) lands here
            # and consumes one retry.
            self.log.error(e)
            retry_times -= 1
            self.log.warning(
                'retrying request: [%s], times: %s retry_times: %s ' %
                (url, times, retry_times))
            if times == setting_time:
                self.log.error(
                    'give up retrying request: [%s], times: %s is bigger than setting'
                    % (url, times))
                return None, None
        else:
            # try/except/else: reached only when the attempt raised nothing.
            self.log.info('[%s] has requested successfully' % url)
            if con:
                if not dont_filter:
                    # Record the URL in the dedup filter — presumably sbf
                    # is a (scalable) bloom filter; confirm against its
                    # definition elsewhere in the project.
                    url = self.md5_url(url)
                    sbf.add(url)
                return con.content, con.url
            else:
                self.log.error('content is None, url is %s' % url)
                return None, None
source=website_name) logger.info(msg) # mongodb 集合名称 insert_data = {} insert_data['_id'] = _ip+'_'+target_url insert_data['ip'] = _ip insert_data['source'] = website_name insert_data['response_time'] = response_time insert_data['target_url'] = target_url insert_data['insert_time'] = time.strftime('%Y-%m-%d %H:%M:%S') # 保存数入库 self.pipeline.process_item(insert_data, self.collection_name) if __name__ == '__main__': # 测试代码 spidermain = SpiderMain() spidermain.run() # blocking work_queue.join() save_queue.join() # finishing crawl origin ip logger.info('available proxy has been saved in your database, please check!')
def _request_with_proxy(self, url, use_proxy):
    """GET *url*, rotating through pooled proxy IPs until one succeeds.

    Keeps retrying with a freshly drawn proxy (``self.getRandomOne()``)
    on every failure.  If the loop runs longer than ``proxy_timeout``
    seconds it sleeps, refreshes the whole pool, and raises ProxyError
    so the caller can restart the crawl.
    """
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "User-Agent": random.choice(PC_USER_AGENTS)
    }
    # Wall-clock start of the retry loop, used as a global time budget.
    start_time = time.time()
    while True:
        # Compare elapsed time against the budget; once exceeded, assume
        # the pool is exhausted/banned: rest, rebuild the pool, then raise
        # requests.exceptions.ProxyError for the caller.
        end_time = time.time()
        if int(end_time - start_time) > proxy_timeout:
            logger.info(
                "request with proxy 方法时间执行过长 可能原因: IP池内IP全部失效或被目标网站封掉IP其他异常错误 当前ip为 {} 程序进行休息状态 休息时长为: {} 秒"
                .format(self.current_ip, proxy_timeout))
            time.sleep(proxy_timeout)
            self.update_ip_pool()
            msg = "IP代理池休息完毕并更新 请重新进行数据抓取 可能原因: 查找历史日志 当前ip为 {}".format(
                self.current_ip)
            raise requests.exceptions.ProxyError(msg)
        # Route both schemes through the currently selected proxy ip.
        proxy = {'http': self.current_ip, 'https': self.current_ip}
        if use_proxy:
            try:
                response = requests.get(url,
                                        proxies=proxy,
                                        timeout=request_timeout,
                                        headers=headers)
                code = response.status_code
                msg = "doing http request successfully current proxy ip is {} status_code :{}".format(
                    self.current_ip, code)
                logger.info(msg)
                # 404 is treated as a valid answer (the target genuinely
                # has no such page) rather than a proxy failure.
                if code == 404:
                    msg = " 404 Client Error: Not Found for url:{}".format(
                        url)
                    logger.info(msg)
                    return response
                response.raise_for_status()
                # A 200 whose body contains custom_filter_str is treated as
                # a bad response (presumably a ban/interstitial page marker
                # from config — confirm); the bare Exception is deliberate
                # control flow that triggers IP rotation below.
                if code == 200 and custom_filter_str != '' and custom_filter_str in response.text:
                    raise Exception
                return response
            except requests.HTTPError as e:
                # Non-2xx from raise_for_status: log it and rotate the ip.
                logger.info(e)
                self.current_ip = self.getRandomOne()
                msg = "random pick a ip from ipList new ip is {}".format(
                    self.current_ip)
                logger.info(msg)
            except Exception as e:
                # Network errors, timeouts, or the filter-string sentinel:
                # rotate to a new random proxy and loop again.
                msg = "ip is {} can't use ".format(self.current_ip)
                logger.info(msg)
                self.current_ip = self.getRandomOne()
                msg = "random pick a ip from ipList new ip is {}".format(
                    self.current_ip)
                logger.info(msg)
        else:
            print("no use proxy")
            try:
                response = requests.get(url,
                                        timeout=request_timeout,
                                        headers=headers)
                return response
            except Exception as e:
                # NOTE(review): this branch does not use a proxy, yet it
                # still logs and rotates current_ip on failure, and loops
                # until the time budget trips — looks intentional but odd;
                # confirm.
                msg = "ip is {} can't use ".format(self.current_ip)
                logger.info(msg)
                self.current_ip = self.getRandomOne()
                msg = "random pick a ip from ipList new ip is {}".format(
                    self.current_ip)
                logger.info(msg)
# Override of run():
# if the request function is customized, it can be set in crawl() via
# request=your_request_function; defaults to the framework's request.
def run(self):
    """Start the framework, then begin crawling."""
    start()
    self.craw()


def execute_spider():
    """Run the WorkSpider and block until all queued work is finished."""
    work_spider = WorkSpider()
    work_spider.run()
    # Blocking: wait for both queues to drain.
    work_queue.join()
    save_queue.join()
    # Done
    logger.info('All Job Finishing, Please Check!')


if __name__ == '__main__':
    # CONSISTENCY FIX: the guard duplicated execute_spider()'s body
    # statement-for-statement; call the function instead.
    execute_spider()