def craw(self, request=aispider.request):
    """Enqueue crawl jobs for every active proxy-list website.

    Walks ``url_parse_dict``; for each entry whose ``status`` is
    ``'active'`` it builds one work item per URL (one per POST payload
    when ``submit_data`` is present) and puts it on ``work_queue``.
    ``get_and_check`` is attached as the follow-up for every item.

    :param request: downloader callable used as the work function
                    (defaults to ``aispider.request``).
    """
    # .items() behaves identically on Python 2 and 3; dict.iteritems()
    # is Python-2-only and breaks under Python 3.
    for website_name, value in url_parse_dict.items():
        if value.get('status') != 'active':
            continue
        website_urls = value.get('url')
        method = value.get('request_method')
        post_datas = value.get('submit_data')
        # 'system' selects the built-in parser; any other value is
        # assumed to be a user-supplied parse callable.
        parse_func = value.get('parse_func', 'system')
        parser = self.parse_to_get_ip if parse_func == 'system' else parse_func
        diy_header = value.get('header')
        for url in website_urls:
            if post_datas:
                # One work item per POST payload.
                for post_data in post_datas:
                    put_data = format_put_data(
                        args={"url": url, "method": method,
                              'submit_data': post_data,
                              'diy_header': diy_header},
                        work_func=request,
                        follow_func=self.get_and_check,
                        meta={'value': value,
                              'website_name': website_name,
                              'parser': parser})
                    work_queue.put(put_data)
            else:
                # NOTE(review): this branch uses the key 'data' (not
                # 'submit_data') and omits 'diy_header' — inconsistent
                # with the POST branch above; confirm what the work
                # function expects before unifying the keys.
                put_data = format_put_data(
                    args={"url": url, "method": method,
                          'data': post_datas},
                    work_func=request,
                    follow_func=self.get_and_check,
                    meta={'value': value,
                          'website_name': website_name,
                          'parser': parser})
                work_queue.put(put_data)
def start_check(self, ips, website_name):
    """Queue validity checks for each scraped proxy IP.

    For every (ip, target_url) pair one work item is enqueued that runs
    ``valid`` with a 5-second timeout and, via ``need_save``, persists
    the IP through ``self.save_ip``. Logs an error when *ips* is empty.

    :param ips: iterable of proxy IPs scraped from *website_name*.
    :param website_name: source site name, carried along in ``meta``.
    """
    if not ips:
        # Message fix: original read "There Are No Available From",
        # dropping the noun after "Available".
        msg = ('There Are No Available IPs From [{website_name}] '
               'Can Be Used To Check, Please Check!!!!!!!').format(
                   website_name=website_name)
        logger.error(msg)
        return
    for _ip in ips:
        for target_url in self.target_urls:
            # Build the queue item: check _ip against target_url and
            # save it when the check succeeds.
            put_data = format_put_data(
                args={"url": target_url, 'ip': _ip, 'time_out': 5},
                work_func=valid,
                need_save=True,
                save_func=self.save_ip,
                meta={'website_name': website_name,
                      'target_url': target_url})
            work_queue.put(put_data)
def craw(self, singer_mids, begin=0, num=1):
    """Enqueue one crawl job per singer mid.

    The original body referenced free variables ``begin``/``num``;
    they are now keyword parameters (defaults match the sibling
    ``craw(self, singer_mids, begin=0, num=1)`` implementation) so
    callers can page explicitly while existing call sites keep working.
    NOTE(review): if module-level ``begin``/``num`` globals held other
    values, confirm these defaults against them.

    :param singer_mids: iterable of singer mid identifiers.
    :param begin: offset of the first record to request.
    :param num: number of records to request.
    """
    for singer_mid in singer_mids:
        url = root_url_format.format(singermid=singer_mid,
                                     begin=begin, num=num)
        # Build the work item; get_total_num follows up on the response.
        put_data = format_put_data(
            args={'url': url, 'method': 'get', 'submit_data': None},
            work_func=self.downloader.request,
            follow_func=self.get_total_num)
        work_queue.put(put_data)
def craw(self, request=aispider.request):
    """Enqueue crawl jobs for every active proxy-list website.

    For each ``url_parse_dict`` entry marked ``'active'``, builds one
    queue item per URL (one per POST payload when ``submit_data`` is
    present), with ``get_and_check`` as the follow-up handler.

    :param request: downloader callable used as the work function
                    (defaults to ``aispider.request``).
    """
    # dict.iteritems() exists only on Python 2; .items() is portable.
    for website_name, value in url_parse_dict.items():
        if value.get('status') != 'active':
            continue
        website_urls = value.get('url')
        method = value.get('request_method')
        post_datas = value.get('submit_data')
        # 'system' means use the built-in parser; otherwise the entry
        # supplies its own parse callable.
        parse_func = value.get('parse_func', 'system')
        if parse_func == 'system':
            parser = self.parse_to_get_ip
        else:
            parser = parse_func
        diy_header = value.get('header')
        for url in website_urls:
            if post_datas:
                for post_data in post_datas:
                    # One work item per POST payload.
                    put_data = format_put_data(
                        args={"url": url, "method": method,
                              'submit_data': post_data,
                              'diy_header': diy_header},
                        work_func=request,
                        follow_func=self.get_and_check,
                        meta={'value': value,
                              'website_name': website_name,
                              'parser': parser})
                    work_queue.put(put_data)
            else:
                # NOTE(review): uses key 'data' (not 'submit_data') and
                # drops 'diy_header' — inconsistent with the POST
                # branch; verify the work function's contract before
                # unifying.
                put_data = format_put_data(
                    args={"url": url, "method": method,
                          'data': post_datas},
                    work_func=request,
                    follow_func=self.get_and_check,
                    meta={'value': value,
                          'website_name': website_name,
                          'parser': parser})
                work_queue.put(put_data)
def craw(self, singer_mids, begin=0, num=1):
    """Put one download task per singer mid onto the work queue.

    Each task downloads the formatted singer URL and hands the
    response to ``get_total_num`` for follow-up processing.

    :param singer_mids: iterable of singer mids to crawl.
    :param begin: paging offset substituted into the request URL.
    :param num: page size substituted into the request URL.
    """
    for mid in singer_mids:
        target = root_url_format.format(singermid=mid,
                                        begin=begin, num=num)
        # Compose the queue item and enqueue it.
        task = format_put_data(args={"url": target},
                               work_func=self.downloader.request,
                               follow_func=self.get_total_num)
        work_queue.put(task)
def start_check(self, ips, website_name):
    """Queue validity checks for each scraped proxy IP.

    Enqueues one work item per (ip, target_url) pair; each runs
    ``valid`` with a 5-second timeout and persists successful IPs
    through ``self.save_ip``. Logs an error when *ips* is empty.

    :param ips: iterable of proxy IPs scraped from *website_name*.
    :param website_name: source site name, carried along in ``meta``.
    """
    if not ips:
        # Message fix: the original sentence dropped the noun after
        # "Available".
        msg = ('There Are No Available IPs From [{website_name}] '
               'Can Be Used To Check, Please Check!!!!!!!').format(
                   website_name=website_name)
        logger.error(msg)
        return
    for _ip in ips:
        for target_url in self.target_urls:
            # Check _ip against target_url; save it when valid.
            put_data = format_put_data(
                args={"url": target_url, 'ip': _ip, 'time_out': 5},
                work_func=valid,
                need_save=True,
                save_func=self.save_ip,
                meta={'website_name': website_name,
                      'target_url': target_url})
            work_queue.put(put_data)
def get_total_num(self, response):
    """Follow-up for the initial singer request: re-enqueue for all records.

    Parses the JSON body and reads ``data.total``; when present, the
    URL is rebuilt to request all ``total`` records in one go and a
    second job is enqueued whose result is saved via ``self.save``.

    :param response: dict-like response with the raw body under
                     ``'content'``.
    """
    html_content = response.get('content')
    datas = parser.get_data_by_json(html_content)
    # 'data' may be absent or None on malformed payloads; defaulting to
    # {} skips quietly instead of raising AttributeError on .get().
    data = datas.get('data') or {}
    total_num = data.get('total', '')
    if total_num:
        singer_mid = data.get('singer_mid')
        url = root_url_format.format(singermid=singer_mid, begin=0,
                                     num=total_num)
        # Second pass: fetch everything and persist it.
        put_data = format_put_data(args={"url": url},
                                   work_func=self.downloader.request,
                                   need_save=True,
                                   save_func=self.save)
        work_queue.put(put_data)