def run(self, func, project_name, init_url):
    """Fetch ``self.current_url`` and pass the page to the user handler.

    :param func: handler; stored on ``self.handle_method`` and called with
        ``helper.S(self, body, urls, project_name, init_url)``.
    :param project_name: forwarded unchanged to ``helper.S``.
    :param init_url: forwarded unchanged to ``helper.S``.
    :return: a dict with a single ``'error'`` key when the status text does
        not start with ``'20'`` or ``'30'``; otherwise ``self.rest_result``,
        filled with ``'current_url'``, ``'http_code'`` and, when the handler
        returned a dict, ``'result'``.
    """
    self.handle_method = func
    fetched = self.http_helper.get(self.current_url)
    status = fetched[1]
    status_text = str(status)

    # Anything whose status text is not 20x / 30x is reported as an error.
    if not status_text.startswith(('20', '30')):
        message = ('URL: ' + self.current_url
                   + ' 获取失败 HTTP code: ' + status_text
                   + ' Runtime: ' + str(fetched[2]) + 'ms')
        return {'error': message}

    body = fetched[0]
    links = get_urls_form_html(self.current_url, body)
    self.rest_result['current_url'] = self.current_url
    self.rest_result['http_code'] = status

    # Remember the URL now: per the original note, self.current_url may be
    # rewritten by the code that runs below.
    page_url = self.current_url

    # A dict returned by the handler is attached to the result that is
    # reported back; any other return value is ignored.
    parsed = self.handle_method(
        helper.S(self, body, links, project_name, init_url))
    if isinstance(parsed, dict):
        parsed.setdefault('url', page_url)
        parsed.setdefault('runtime', fetched[2])
        self.rest_result['result'] = parsed
    return self.rest_result
def run(self, func, project_name, init_url):
    """Download ``self.current_url`` and hand the page to ``func``.

    :param func: handler; kept on ``self.handle_method`` and invoked with
        ``helper.S(self, body, urls, project_name, init_url)``.
    :param project_name: passed through to ``helper.S``.
    :param init_url: passed through to ``helper.S``.
    :return: ``{'error': <message>}`` when the status text starts with
        neither ``'20'`` nor ``'30'``; otherwise ``self.rest_result`` with
        ``'current_url'`` and ``'http_code'`` set, plus ``'result'`` when
        the handler returned a dict.
    """
    self.handle_method = func
    response = self.http_helper.get(self.current_url)
    code = response[1]
    code_text = str(code)

    succeeded = code_text.startswith('20') or code_text.startswith('30')
    if not succeeded:
        # Status text outside 20x / 30x: report the failure and stop.
        failure = 'URL: ' + self.current_url
        failure += ' 获取失败 HTTP code: ' + code_text
        failure += ' Runtime: ' + str(response[2]) + 'ms'
        return {'error': failure}

    html = response[0]
    found_urls = get_urls_form_html(self.current_url, html)
    report = self.rest_result
    report['current_url'] = self.current_url
    report['http_code'] = code

    # Keep a copy of the URL; the original note says self.current_url gets
    # rewritten by the code below.
    fetched_url = self.current_url

    # Only a dict coming back from the handler is added to the report.
    outcome = self.handle_method(
        helper.S(self, html, found_urls, project_name, init_url))
    if not isinstance(outcome, dict):
        return self.rest_result

    outcome.setdefault('url', fetched_url)
    outcome.setdefault('runtime', response[2])
    self.rest_result['result'] = outcome
    return self.rest_result
def run(self, func, current_url, project_name, init_url, gevent_id):
    """Fetch ``current_url``, run the handler on it and report via ``self.put_data``.

    :param func: handler; stored on ``self.handle_method`` and called with
        ``S(self, body, urls, project_name, init_url)``.
    :param current_url: URL to fetch; stored on ``self.current_url``. A falsy
        value (``None`` or ``''``) is reported as parsed and not fetched.
    :param project_name: forwarded unchanged to ``S``.
    :param init_url: forwarded unchanged to ``S``.
    :param gevent_id: identifier that only appears in the log output.
    :return: the result of ``self.put_data(urls_parsed=...)`` for a falsy
        URL; ``None`` in every other case.
    """
    self.handle_method = func
    # TODO: some form of rate limiting is still needed here (e.g. gevent.sleep).
    self.current_url = current_url

    log_prefix = ('gevent_id: ' + str(gevent_id)
                  + ' -- ' + self.project_name + ' -- ')
    # The URL is interpolated with %s rather than concatenated so that a None
    # URL is logged and reaches the guard below instead of raising TypeError.
    print('%s%s' % (log_prefix, self.current_url))

    if not self.current_url:
        return self.put_data(urls_parsed=[self.current_url, ])

    crawl_result = self.http_helper.get(self.current_url)
    status_text = str(crawl_result[1])

    # Anything whose status text is not 20x / 30x is logged and recorded as a
    # failed URL.
    if not status_text.startswith(('20', '30')):
        echo_err(log_prefix + 'URL: ' + self.current_url
                 + ' 获取失败 HTTP code: ' + status_text
                 + ' Runtime: ' + str(crawl_result[2]) + 'ms')
        self.put_data(
            urls_fail=(
                get_domain(self.current_url),
                # Non-numeric statuses are stored as 0.
                int(crawl_result[1] if status_text.isdigit() else 0),
                int(time.time()),
            ),
        )
        return

    # If the handler returns a dict, that dict is pushed through put_data.
    try:
        parse_result = self.handle_method(
            S(self, crawl_result[0],
              get_urls_form_html(self.current_url, crawl_result[0]),
              project_name, init_url))
    except Exception:
        # Handler errors are logged and this URL is dropped. BaseException
        # subclasses (KeyboardInterrupt, SystemExit, greenlet kill/timeout
        # signals) are deliberately left to propagate.
        print(traceback.format_exc())
        return

    if not isinstance(parse_result, dict):
        return

    parse_result.setdefault('url', self.current_url)
    self.put_data(save=parse_result)