Example #1
class FengHuangSettings(UpdateSettings):
    def __init__(self):

        self.time_object = TimeOperate()

    def updatesettings(self, settings):
        settings['DOWNLOAD_DELAY'] = 0.25

        settings['RANDOMIZE_DOWNLOAD_DELAY'] = True
        settings['RETRY_PRIORITY_ADJUST'] = 2
        settings['DOWNLOAD_TIMEOUT'] = 180  # this site is special: once the crawl stops, it will not resume, so allow a long timeout
        settings['DOWNLOADER_MIDDLEWARES'][
            'scrapy.downloadermiddlewares.retry.RetryMiddleware'] = 550
        settings['COOKIES_ENABLED'] = False
        settings['RETRY_HTTP_CODES'] = [500, 503, 504, 400, 403, 404, 408]
        settings['RETRY_TIMES'] = 100
        settings['LOG_FILE'] = settings['GENERAL_LOG_FILE'] + str(
            self.time_object.gettoday())

        settings['WRONG_FILE'] = settings['WRONG_FILE'] + settings[
            'SPIDER_WRONG_FILE'] + str(self.time_object.gettoday())

        self.touchfile(settings['LOG_FILE'])

        self.touchfile(settings['WRONG_FILE'])
Example #2
class SinaUpateSettings(UpdateSettings):
    def __init__(self):

        self.time_object = TimeOperate()
        self.settings = get_project_settings()

    def updatesettings(self):

        self.settings['COOKIES_ENABLED'] = False

        self.settings['LOG_FILE'] = self.settings['SINA_LOG_FILE'] + str(
            self.time_object.gettoday())

        self.settings['WRONG_FILE'] = self.settings[
            'WRONG_FILE'] + self.settings['SINA_WRONG_FILE'] + str(
                self.time_object.gettoday())

        self.settings['DOWNLOAD_DELAY'] = 0.25

        self.settings['RANDOMIZE_DOWNLOAD_DELAY'] = True

        self.touchfile(self.settings['LOG_FILE'])

        self.touchfile(self.settings['WRONG_FILE'])

        #self.settings['MONGODB_CONNECTION'] = MongoDBConnection()

        return self.settings
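A minimal usage sketch for the settings-updater pattern above, assuming SinaUpateSettings is importable from this project and runs inside a Scrapy project context (the spider class below is a placeholder):

from scrapy.crawler import CrawlerProcess

updater = SinaUpateSettings()
patched_settings = updater.updatesettings()  # patches and returns the project settings

process = CrawlerProcess(patched_settings)
# process.crawl(SinaSpider)  # placeholder spider class from this project
# process.start()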
Example #3
    def __init__(self, website_config, spider_date, settings):

        self.current_date = spider_date

        self.website_config = website_config

        self.settings = settings
        self.time_object = TimeOperate()
        self.start_urls = []

        self.start_urls.append(website_config['main_url'] +
                               self.time_object.getyear(spider_date) +
                               self.time_object.getmonth(spider_date) +
                               self.time_object.getday(spider_date) +
                               website_config['time_url'])
        self.begin_url = (website_config['main_url'] +
                          self.time_object.getyear(spider_date) +
                          self.time_object.getmonth(spider_date) +
                          self.time_object.getday(spider_date) +
                          self.website_config['js_file'])
        #print self.begin_url
        self.js = []
        self.html_url = []
        self.article = 0
        #self.inform = False
        self.page_num = 0
Example #4
class GeneralUpdateSettings(UpdateSettings):
    def __init__(self):

        self.time_object = TimeOperate()
        #self.settings = get_project_settings()
    def updatesettings(self, settings):
        settings['LOG_FILE'] = settings['GENERAL_LOG_FILE'] + str(
            self.time_object.gettoday())

        settings['WRONG_FILE'] = settings['WRONG_FILE'] + settings[
            'SPIDER_WRONG_FILE'] + str(self.time_object.gettoday())

        self.touchfile(settings['LOG_FILE'])

        self.touchfile(settings['WRONG_FILE'])
Example #5
    def __init__(self, settings_file):

        self.filename = settings_file
        self.json_object = JsonLoad(settings_file)

        self.settings = self.json_object.getdata()
        #print("配置文件:%s" %(self.settings))
        #self.db_operation = news_operation()
        self.db_operation = news_operation(self.settings['record_log'])

        self.spider_operation = Spider_Operation(self.settings)

        self.vector_generator = BasePreProcessItem()

        self.time_operation = TimeOperate()
Example #6
class Re_Spider(object):
    def __init__(self, settings_file):

        self.filename = settings_file
        self.json_object = JsonLoad(settings_file)

        self.settings = self.json_object.getdata()
        #print("配置文件:%s" %(self.settings))
        #self.db_operation = news_operation()
        self.db_operation = news_operation(self.settings['record_log'])

        self.spider_operation = Spider_Operation(self.settings)

        self.vector_generator = BasePreProcessItem()

        self.time_operation = TimeOperate()

    def day_by_day_update(self):

        try:
            if self.settings['start_time'] == "" or self.settings[
                    'stop_time'] == "":
                raise Exception("请配置起始时间")
            update_time = self.time_operation.str2date(
                self.settings['start_time'])

            while str(update_time) >= self.settings['stop_time']:

                self.settings['start_time'] = str(update_time)
                print("进度:%s" % (update_time))

                self.settings['extract_condition'][
                    'article_publish_time'] = str(update_time)

                self.__spider_for_page()

                self.db_operation.reset_para()

                update_time = self.time_operation.getthepreviousday(
                    update_time)
            return True

        except BaseException, error:
            self.__error_email_info(error)
            return False
        finally:
            pass
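A minimal usage sketch for Re_Spider, assuming it is importable from this project; the settings file path is a placeholder:

spider_runner = Re_Spider('re_spider_settings.json')  # placeholder path to the JSON settings file
if not spider_runner.day_by_day_update():
    print 'day-by-day update failed; see the error email for details'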
Example #7
    def __init__(self):

        self.time_object = TimeOperate()
Example #8
    def __init__(self):

        self.time_object = TimeOperate()
        self.settings = get_project_settings()
Example #9
def run_sina():

    try:
    
        #print settings['LOG_FILE']
        read_json_file = JsonLoad(settings['SINA_JSON_FILE'])
        configure_logging(settings) 

        json_data = read_json_file.getdata()
        


        #runner = CrawlerRunner(settings)
        
        begin_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())  # start time

        #current_date = time.strftime("%Y-%m-%d", time.localtime())  # today's date
        logging.info('Sina spider start time: ' + begin_time)
        time_operation = TimeOperate()
        if json_data['start_time'] == '':

            yesterday_date = time_operation.getyesterdaydate()  # get yesterday's date
        else:
            yesterday_date = time_operation.str2date(json_data['start_time'])

        temp_begin_spider_date = yesterday_date
        if str(temp_begin_spider_date) == json_data['stop_time']:
            logging.info('Sina spider end time: ' + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
            os._exit(0)
    
        while True:
            
            if str(yesterday_date) <= json_data['stop_time'] or str(yesterday_date) < settings['SINA_OLD_START_DATE']: #结束新浪的爬取
                #print ''
                    break
            #deal_class = ''
            web_config = ''
            if str(yesterday_date) >= settings['SINA_NEW_START_DATE']:
                web_config = json_data['new_version']

                #deal_class = json_data['new_version']
            else:
                web_config = json_data['old_version']
            #day = day+1
            deal_class = web_config['deal_class']
            #settings['PREPROCESS_CLASS'] = web_config['preprocess_class']
            logging.info('Crawl start date: ' + str(yesterday_date))
            print str(yesterday_date)
            #begin_at = begin_at + 1
            yield runner.crawl(globals()[deal_class], website_config=web_config, spider_date=yesterday_date, settings=settings)
            yesterday_date = time_operation.getthepreviousday(yesterday_date)  # move the date back one day
        reactor.stop()
        
        json_data['stop_time'] = str(temp_begin_spider_date)  # update the stop time
        end_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())  # end time
        info_spider = ' begin at :' + begin_time + ' end at :' + end_time
        logging.info(info_spider)

        sendbody = "time:"+ end_time + "新浪网爬虫结束" +"\n"

        email_object.send_information(sendbody,"新浪网爬虫结束",True)
        os._exit(0)
    except BaseException, error:
        #date = datetime.datetime.now()
        time_object = TimeOperate()
        date = time_object.getnow()
        logging.exception(error)
        sendbody = "time:" + date.strftime("%Y-%m-%d %H:%M:%S") + "error:" + str(error) + "\n"
        #email_object = Email(settings)
        email_object.send_information(sendbody)
        raise CloseSpider('Sina spider failed')
        os._exit(1)
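run_sina above references runner, settings, and email_object without defining them, and it yields runner.crawl(...) before calling reactor.stop(), which suggests a CrawlerRunner / inlineCallbacks setup at module level. A minimal sketch of that wiring, under those assumptions (only the Scrapy and Twisted names are real APIs; the rest follows the commented-out hints in the snippet):

from twisted.internet import defer, reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
runner = CrawlerRunner(settings)
# email_object = Email(settings)  # project-specific helper, per the commented-out line above

run_sina = defer.inlineCallbacks(run_sina)  # drive the generator as an inlineCallbacks coroutine

if __name__ == '__main__':
    run_sina()
    reactor.run()  # blocks until run_sina() calls reactor.stop()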
Example #10
class SinaOldSpider(CrawlSpider):
    def __init__(self, website_config, spider_date, settings):

        self.current_date = spider_date

        self.website_config = website_config

        self.settings = settings
        self.time_object = TimeOperate()
        self.start_urls = []

        self.start_urls.append(website_config['main_url'] +
                               self.time_object.getyear(spider_date) +
                               self.time_object.getmonth(spider_date) +
                               self.time_object.getday(spider_date) +
                               website_config['time_url'])
        self.begin_url = (website_config['main_url'] +
                          self.time_object.getyear(spider_date) +
                          self.time_object.getmonth(spider_date) +
                          self.time_object.getday(spider_date) +
                          self.website_config['js_file'])
        #print self.begin_url
        self.js = []
        self.html_url = []
        self.article = 0
        #self.inform = False
        self.page_num = 0

    def parse(self, response):
        print response.url
        if self.isnew(response.body):
            #parsejson()  # the response body is JSON
            for one_url in self.js:
                #print str(oneurl.replace('\"',''))
                yield scrapy.Request(one_url.replace('\"', ''),
                                     callback=self.parsejson)
        elif str(self.current_date) < self.settings['SINA_MID_START_DATE'] and self.ishtml(response):
            for one_url in self.html_url:
                yield scrapy.Request(one_url, callback=self.parsehtml)

        else:

            entry_js = self.begin_url + str(
                self.page_num) + self.website_config['js_suffix']

            yield scrapy.Request(entry_js, callback=self.parse_page_js)

    def parse_page_js(self, response):
        #print response.url
        #if response.body == "banned":
        #print "banned by server, begin use proxy"
        encode = chardet.detect(response.body)['encoding']
        #print encode
        response_body = response.body.decode(encode, 'ignore').encode('utf-8')
        #re.sub(r'[\\x00-\\x08\\x0b-\\x0c\\x0e-\\x1f]'.encode('utf-8'),'',response_body)
        response_body = re.search(r'var sinaRss = .*'.encode('utf-8'),
                                  response_body)

        if response_body:
            response_body = response_body.group()
            response_body = response_body.split('=', 1)[1]
            #response_body = list(response_body)
            #print type(response_body)
            #re.sub(r'=','',response_body)
            #index = response_body.rfind('=')
            #print index
            #response_body[index] = ''
            index = response_body.rfind(';')
            #print response_body
            if index != -1:
                response_body = response_body[:index]

                while response_body != "" and response_body[-1] != ']':
                    response_body = response_body[:-1]
                #response_body = eval(response_body)  # eval can convert between list, tuple, dict, and string

                try:
                    #print response_body[75224]
                    response_body = json.loads(response_body, strict=False)

                    for one_response in response_body:
                        url = one_response[-2]

                        if re.match(r'^https?:/{2}\w.+$', url):
                            yield scrapy.Request(url, callback=self.parsehtml)
                    if len(response_body) <= 0:
                        return
                except BaseException, error:
                    self.__write_log(error)

                self.page_num = self.page_num + 1
                next_page_js = self.begin_url + str(
                    self.page_num) + self.website_config['js_suffix']
                yield scrapy.Request(next_page_js, callback=self.parse_page_js)
        else: