class FengHuangSettings(UpdateSettings):
    def __init__(self):
        self.time_object = TimeOperate()

    def updatesettings(self, settings):
        settings['DOWNLOAD_DELAY'] = 0.25
        settings['RANDOMIZE_DOWNLOAD_DELAY'] = True
        settings['RETRY_PRIORITY_ADJUST'] = 2
        settings['DOWNLOAD_TIMEOUT'] = 180  # this site is special: once the crawl stops it will not resume
        settings['DOWNLOADER_MIDDLEWARES'][
            'scrapy.downloadermiddlewares.retry.RetryMiddleware'] = 550
        settings['COOKIES_ENABLED'] = False
        settings['RETRY_HTTP_CODES'] = [500, 503, 504, 400, 403, 404, 408]
        settings['RETRY_TIMES'] = 100
        settings['LOG_FILE'] = settings['GENERAL_LOG_FILE'] + str(
            self.time_object.gettoday())
        settings['WRONG_FILE'] = settings['WRONG_FILE'] + settings[
            'SPIDER_WRONG_FILE'] + str(self.time_object.gettoday())
        self.touchfile(settings['LOG_FILE'])
        self.touchfile(settings['WRONG_FILE'])
class SinaUpateSettings(UpdateSettings):
    def __init__(self):
        self.time_object = TimeOperate()
        self.settings = get_project_settings()

    def updatesettings(self):
        self.settings['COOKIES_ENABLED'] = False
        self.settings['LOG_FILE'] = self.settings['SINA_LOG_FILE'] + str(
            self.time_object.gettoday())
        self.settings['WRONG_FILE'] = self.settings[
            'WRONG_FILE'] + self.settings['SINA_WRONG_FILE'] + str(
                self.time_object.gettoday())
        self.settings['DOWNLOAD_DELAY'] = 0.25
        self.settings['RANDOMIZE_DOWNLOAD_DELAY'] = True
        self.touchfile(self.settings['LOG_FILE'])
        self.touchfile(self.settings['WRONG_FILE'])
        #self.settings['MONGODB_CONNECTION'] = MongoDBConnection()
        return self.settings
class GeneralUpdateSettings(UpdateSettings):
    def __init__(self):
        self.time_object = TimeOperate()
        #self.settings = get_project_settings()

    def updatesettings(self, settings):
        settings['LOG_FILE'] = settings['GENERAL_LOG_FILE'] + str(
            self.time_object.gettoday())
        settings['WRONG_FILE'] = settings['WRONG_FILE'] + settings[
            'SPIDER_WRONG_FILE'] + str(self.time_object.gettoday())
        self.touchfile(settings['LOG_FILE'])
        self.touchfile(settings['WRONG_FILE'])
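# The UpdateSettings base class that FengHuangSettings, SinaUpateSettings and
# GeneralUpdateSettings inherit from is not shown in this excerpt. A minimal
# sketch of what it presumably provides, i.e. an updatesettings() hook plus
# the touchfile() helper that pre-creates the LOG_FILE / WRONG_FILE paths; the
# implementation below is an assumption, not the project's actual code:
import os


class UpdateSettings(object):
    def updatesettings(self, settings):
        # each site-specific subclass overrides this
        raise NotImplementedError

    def touchfile(self, filepath):
        # create the parent directory and an empty file if they do not exist yet
        dirname = os.path.dirname(filepath)
        if dirname and not os.path.isdir(dirname):
            os.makedirs(dirname)
        if not os.path.exists(filepath):
            open(filepath, 'a').close()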
class Re_Spider(object):
    def __init__(self, settings_file):
        self.filename = settings_file
        self.json_object = JsonLoad(settings_file)
        self.settings = self.json_object.getdata()
        #print("Config file: %s" % (self.settings))
        #self.db_operation = news_operation()
        self.db_operation = news_operation(self.settings['record_log'])
        self.spider_operation = Spider_Operation(self.settings)
        self.vector_generator = BasePreProcessItem()
        self.time_operation = TimeOperate()

    def day_by_day_update(self):
        try:
            if self.settings['start_time'] == "" or self.settings[
                    'stop_time'] == "":
                raise Exception("Please configure the start time and stop time")
            update_time = self.time_operation.str2date(
                self.settings['start_time'])
            while str(update_time) >= self.settings['stop_time']:
                self.settings['start_time'] = str(update_time)
                print("Progress: %s" % (update_time))
                self.settings['extract_condition'][
                    'article_publish_time'] = str(update_time)
                self.__spider_for_page()
                self.db_operation.reset_para()
                update_time = self.time_operation.getthepreviousday(
                    update_time)
            return True
        except BaseException as error:
            self.__error_email_info(error)
            return False
        finally:
def run_sina():
    try:
        #print settings['LOG_FILE']
        read_json_file = JsonLoad(settings['SINA_JSON_FILE'])
        configure_logging(settings)
        json_data = read_json_file.getdata()
        #runner = CrawlerRunner(settings)
        begin_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())  # start time
        #current_date = time_strftime("%Y-%m-%d",time.localtime())  # today's date
        logging.info('Sina spider start time: ' + begin_time)
        time_operation = TimeOperate()
        if json_data['start_time'] == '':
            yesterday_date = time_operation.getyesterdaydate()  # get yesterday's date
        else:
            yesterday_date = time_operation.str2date(json_data['start_time'])
        temp_begin_spider_date = yesterday_date
        if str(temp_begin_spider_date) == json_data['stop_time']:
            logging.info('Sina spider end time: ' +
                         time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
            os._exit(0)
        while True:
            if str(yesterday_date) <= json_data['stop_time'] or str(
                    yesterday_date) < settings['SINA_OLD_START_DATE']:
                # stop crawling Sina
                #print ''
                break
            #deal_class = ''
            web_config = ''
            if str(yesterday_date) >= settings['SINA_NEW_START_DATE']:
                web_config = json_data['new_version']
                #deal_class = json_data['new_version']
            else:
                web_config = json_data['old_version']
                #day = day+1
            deal_class = web_config['deal_class']
            #settings['PREPROCESS_CLASS'] = web_config['preprocess_class']
            logging.info('Start crawling date: ' + str(yesterday_date))
            print str(yesterday_date)
            #begin_at = begin_at + 1
            yield runner.crawl(globals()[deal_class],
                               website_config=web_config,
                               spider_date=yesterday_date,
                               settings=settings)
            yesterday_date = time_operation.getthepreviousday(yesterday_date)  # move the date back one day
        reactor.stop()
        json_data['stop_time'] = str(temp_begin_spider_date)  # update the stop time
        end_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())  # end time
        info_spider = ' begin at :' + begin_time + ' end at :' + end_time
        logging.info(info_spider)
        sendbody = "time:" + end_time + " Sina spider finished" + "\n"
        email_object.send_information(sendbody, "Sina spider finished", True)
        os._exit(0)
    except BaseException as error:
        #date = datetime.datetime.now()
        time_object = TimeOperate()
        date = time_object.getnow()
        logging.exception(error)
        sendbody = "time:" + date.strftime("%Y-%m-%d %H:%M:%S") + "error:" + str(error) + "\n"
        #email_object = Email(settings)
        email_object.send_information(sendbody)
        raise CloseSpider('Sina spider failed')
        os._exit(1)
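# run_sina() above is written as a Twisted inline-callbacks generator: it
# yields runner.crawl(...) once per day and calls reactor.stop() when the
# date loop is done. The module-level wiring (the inlineCallbacks decorator,
# the CrawlerRunner instance named runner, the settings and email_object
# globals, and the reactor.run() call) is not part of this excerpt; the
# sketch below is an assumption about how it is presumably driven:
#
#   from twisted.internet import reactor, defer
#   from scrapy.crawler import CrawlerRunner
#   from scrapy.utils.project import get_project_settings
#
#   settings = get_project_settings()
#   runner = CrawlerRunner(settings)
#   email_object = Email(settings)   # project helper referenced in run_sina()
#
#   run_sina = defer.inlineCallbacks(run_sina)
#   run_sina()
#   reactor.run()   # blocks until reactor.stop() fires inside run_sina()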
class SinaOldSpider(CrawlSpider):
    def __init__(self, website_config, spider_date, settings):
        self.current_date = spider_date
        self.website_config = website_config
        self.settings = settings
        self.time_object = TimeOperate()
        self.start_urls = []
        self.start_urls.append(website_config['main_url'] +
                               self.time_object.getyear(spider_date) +
                               self.time_object.getmonth(spider_date) +
                               self.time_object.getday(spider_date) +
                               website_config['time_url'])
        self.begin_url = (website_config['main_url'] +
                          self.time_object.getyear(spider_date) +
                          self.time_object.getmonth(spider_date) +
                          self.time_object.getday(spider_date) +
                          self.website_config['js_file'])
        #print self.begin_url
        self.js = []
        self.html_url = []
        self.article = 0
        #self.inform = False
        self.page_num = 0

    def parse(self, response):
        print response.url
        if self.isnew(response.body):
            #parsejson()  # the response content is in JSON format
            for one_url in self.js:
                #print str(oneurl.replace('\"',''))
                yield scrapy.Request(one_url.replace('\"', ''),
                                     callback=self.parsejson)
        elif str(self.current_date) < self.settings[
                'SINA_MID_START_DATE'] and self.ishtml(response):
            for one_url in self.html_url:
                yield scrapy.Request(one_url, callback=self.parsehtml)
        else:
            entry_js = self.begin_url + str(
                self.page_num) + self.website_config['js_suffix']
            yield scrapy.Request(entry_js, callback=self.parse_page_js)

    def parse_page_js(self, response):
        #print response.url
        #if response.body == "banned":
        #    print "banned by server, begin use proxy"
        encode = chardet.detect(response.body)['encoding']
        #print encode
        response_body = response.body.decode(encode, 'ignore').encode('utf-8')
        #re.sub(r'[\\x00-\\x08\\x0b-\\x0c\\x0e-\\x1f]'.encode('utf-8'),'',response_body)
        response_body = re.search(r'var sinaRss = .*'.encode('utf-8'),
                                  response_body)
        if response_body:
            response_body = response_body.group()
            response_body = response_body.split('=', 1)[1]
            #response_body = list(response_body)
            #print type(response_body)
            #re.sub(r'=','',response_body)
            #index = response_body.rfind('=')
            #print index
            #response_body[index] = ''
            # strip the trailing ';' and anything after the closing ']'
            index = response_body.rfind(';')
            #print response_body
            if index != -1:
                response_body = response_body[:index]
            while response_body != "" and response_body[-1] != ']':
                response_body = response_body[:-1]
            #response_body = eval(response_body)  # eval can convert between list, tuple, dict and string
            try:
                #print response_body[75224]
                response_body = json.loads(response_body, strict=False)
                for one_response in response_body:
                    url = one_response[-2]
                    if re.match(r'^https?:/{2}\w.+$', url):
                        yield scrapy.Request(url, callback=self.parsehtml)
                if len(response_body) <= 0:
                    return
            except BaseException as error:
                self.__write_log(error)
            self.page_num = self.page_num + 1
            next_page_js = self.begin_url + str(
                self.page_num) + self.website_config['js_suffix']
            yield scrapy.Request(next_page_js, callback=self.parse_page_js)
        else: