Example No. 1
    def __init__(self):

        # Read the list of User-Agent strings from the file named in the Scrapy project settings.
        user_agent_file = get_project_settings()['USER_AGENT_FILE']
        json_obj = JsonLoad(user_agent_file)
        self.agents = json_obj.getlist()
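Every snippet on this page builds a JsonLoad object from a JSON file path and reads it back with getlist() or getdata(). The class itself is not shown anywhere in these examples; a minimal sketch of what it could look like, assuming it is just a thin wrapper around json.load, is:

import json

class JsonLoad(object):
    """Hypothetical minimal version of the JsonLoad helper used in these examples."""

    def __init__(self, filename):
        # Parse the whole JSON file once, at construction time.
        with open(filename) as f:
            self.data = json.load(f)

    def getdata(self):
        # Used for settings files, where the JSON document is an object (dict).
        return self.data

    def getlist(self):
        # Used for the user-agent files, where the JSON document is an array (list).
        return self.data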
Example No. 2
    def __init__(self, settings):
        self.settings = settings
        self.log_obj = write_record(self.settings['record_log'])  # object used for writing log records
        json_object = JsonLoad(self.settings['user_agent_file'])
        self.agent_list = json_object.getlist()
        self.timeout = self.settings['timeout']
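A sketch of the settings dict this constructor expects; the key names are taken from the code above, while the values and the owning class name are illustrative only:

settings = {
    'record_log': 'spider.log',         # file handed to write_record for logging
    'user_agent_file': 'agents.json',   # JSON array of User-Agent strings
    'timeout': 30,                      # download timeout in seconds
}
downloader = SomeDownloader(settings)   # hypothetical class that owns this __init__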
Example No. 3
    def get_sina_comment_2(self, article_item):
        refer = article_item['article_url']
        article_item['article_discuss'] = article_item['article_discuss'].split('?')
        article_item['article_discuss_number'] = 0
        article_item['article_attend_number'] = 0
        if len(article_item['article_discuss']) != 2:
            article_item['article_discuss'] = []
            return
        comment_page = 1
        lefturl = article_item['article_discuss'][1]
        # Collected comments go here.
        cmntlist = []
        article_item['article_discuss'] = []
        json_object = JsonLoad(self.settings['USER_AGENT_FILE'])
        agent_list = json_object.getlist()
        while comment_page == 1 or cmntlist != []:
            one_user_agent = random.choice(agent_list)
            headers = {'User-Agent': one_user_agent, 'Referer': refer}
            comment_url = self.website_config['comment_url'] + '&' + lefturl + "&page=" + str(comment_page)
            try:
                request = urllib2.Request(comment_url, headers=headers)
                comment_content = urllib2.urlopen(request, timeout=self.settings['DOWNLOAD_TIMEOUT']).read()
                if comment_content is None:
                    break
                # The endpoint returns JSONP ("callback={...}"); keep everything from the "{" onwards.
                find_str = "={"
                extract_contain = comment_content[comment_content.index(find_str) + len(find_str) - 1:]
                # Make the payload evaluable as a Python literal, then turn it into a dict.
                extract_contain = extract_contain.replace('null', 'None')
                real_content = eval(extract_contain)
                if 'cmntlist' in real_content['result']:
                    cmntlist = real_content['result']['cmntlist']
                else:
                    cmntlist = []
                if cmntlist != []:
                    article_item['article_discuss'].append(cmntlist)
                if comment_page == 1 and ('count' in real_content['result']):
                    article_item['article_discuss_number'] = int(real_content['result']['count']['show'])
                    article_item['article_attend_number'] = int(real_content['result']['count']['total'])
                comment_page = comment_page + 1
            except BaseException as error:
                date = datetime.datetime.now()
                sendbody = "time:" + date.strftime("%Y-%m-%d %H:%M:%S") + " comment_url:" + comment_url + " " + str(error) + "\n"
                filename = self.settings['WRONG_FILE']
                with open(filename, 'a') as f:
                    f.write(sendbody)
                break
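Both this method and Example No. 5 strip a JSONP-style "callback={...}" prefix by searching for "={" and then eval() the remainder after rewriting null. A safer sketch of the same parsing step, assuming the response really has the form name={...};, would use json.loads instead of eval:

import json

def parse_jsonp(comment_content):
    # Keep everything starting from the "{" that follows the "=" separator.
    payload = comment_content[comment_content.index("={") + 1:]
    # Drop a trailing ";" if the endpoint appends one.
    payload = payload.rstrip().rstrip(";")
    # json.loads maps null/false/true to None/False/True without executing anything.
    return json.loads(payload)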
Example No. 4
    def __init__(self, settings_file):

        self.filename = settings_file
        self.json_object = JsonLoad(settings_file)
        self.settings = self.json_object.getdata()
        self.db_operation = news_operation(self.settings['record_log'])
        self.spider_operation = Spider_Operation(self.settings)
        self.vector_generator = BasePreProcessItem()
        self.time_operation = TimeOperate()
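This constructor only loads a JSON settings file and hands the resulting dict to its collaborators. A sketch of such a file, using only keys that actually appear in these examples (all values are illustrative):

{
    "record_log": "spider.log",
    "USER_AGENT_FILE": "agents.json",
    "DOWNLOAD_TIMEOUT": 30,
    "WRONG_FILE": "errors.log",
    "start_time": "2016-03-01",
    "stop_time": "2016-01-01",
    "extract_condition": {"article_publish_time": ""}
}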
Example No. 5
    def get_fenghuangwang_comment(self, article_item):

        comment_url = self.website_config['discuss_url'] + article_item['article_url']

        json_object = JsonLoad(self.settings['USER_AGENT_FILE'])
        agent_list = json_object.getlist()
        one_user_agent = random.choice(agent_list)
        headers = {'User-Agent': one_user_agent, 'Referer': article_item['article_url']}
        article_item['article_discuss'] = []
        article_item['article_discuss_number'] = 0
        article_item['article_attend_number'] = 0

        try:
            request = urllib2.Request(comment_url, headers=headers)
            comment_content = urllib2.urlopen(request, timeout=self.settings['DOWNLOAD_TIMEOUT']).read()

            if not comment_content:
                raise Exception("failed to fetch comments")

            # The response is JSONP ("callback={...};"); keep everything from the "{" onwards.
            find_str = "={"
            extract_contain = comment_content[comment_content.index(find_str) + len(find_str) - 1:]
            # Make the payload evaluable as a Python literal.
            extract_contain = extract_contain.replace('null', 'None')
            extract_contain = extract_contain.replace('false', 'None')
            real_content = extract_contain[0:len(extract_contain) - 1]  # strip the trailing ";"
            real_content = eval(real_content)  # turn the payload into a dict
            article_item['article_discuss'] = real_content['comments']
            article_item['article_discuss_number'] = int(real_content['count'])
            article_item['article_attend_number'] = int(real_content['join_count'])

        except Exception as error:
            date = datetime.datetime.now()
            sendbody = "time:" + date.strftime("%Y-%m-%d %H:%M:%S") + " comment_url:" + comment_url + " " + str(error) + "\n"
            filename = self.settings['WRONG_FILE']
            with open(filename, 'a') as f:
                f.write(sendbody)
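Both comment fetchers mutate an article_item mapping in place (in Example No. 3, article_discuss initially holds the comment URL and is split on "?" before being replaced by the comment list). A sketch of the fields they read and write, with field names taken from the code and purely illustrative values:

article_item = {
    'article_url': 'http://news.example.com/some-article.shtml',  # illustrative URL, also used as Referer
    'article_discuss': [],           # filled with the fetched comment list(s)
    'article_discuss_number': 0,     # displayed comment count
    'article_attend_number': 0,      # participation count
}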
Example No. 6
class Re_Spider(object):
    def __init__(self, settings_file):

        self.filename = settings_file
        self.json_object = JsonLoad(settings_file)
        self.settings = self.json_object.getdata()
        self.db_operation = news_operation(self.settings['record_log'])
        self.spider_operation = Spider_Operation(self.settings)
        self.vector_generator = BasePreProcessItem()
        self.time_operation = TimeOperate()

    def day_by_day_update(self):

        try:
            if self.settings['start_time'] == "" or self.settings['stop_time'] == "":
                raise Exception("start_time and stop_time must be configured")
            update_time = self.time_operation.str2date(self.settings['start_time'])

            # Walk backwards one day at a time, from start_time down to stop_time.
            while str(update_time) >= self.settings['stop_time']:

                self.settings['start_time'] = str(update_time)
                print("progress: %s" % (update_time))

                self.settings['extract_condition']['article_publish_time'] = str(update_time)

                self.__spider_for_page()
                self.db_operation.reset_para()

                update_time = self.time_operation.getthepreviousday(update_time)
            return True

        except BaseException as error:
            self.__error_email_info(error)
            return False
        finally:
            pass

Example No. 7
import sys
reload(sys)                      # Python 2 only: re-expose setdefaultencoding
sys.setdefaultencoding('utf8')   # use UTF-8 as the default string encoding

if __name__ == '__main__':

    try:
        fenghuang_update_object = FengHuangSettings()
        settings = get_project_settings()
        fenghuang_update_object.updatesettings(settings)
        email_object = Email(settings)
        configure_logging(settings)
        read_json_file = JsonLoad(settings['SPLIT_JSON_FILE'])
        json_data = read_json_file.getdata()

        runner = CrawlerRunner(settings)
        # Record when crawling starts so the total run time can be reported.
        begin_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        for json_key in json_data:

            website_config = json_data[json_key]  # per-website configuration
            website_url = website_config['url']
            #website_urls = website_config['urls']  # per-website URL list (one entry per location)
            #logging.info('starting crawl of site ' + json_key + ' - ' + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
            # (the example breaks off here; a sketch of the usual continuation follows)
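The snippet is cut off inside the for loop, so the remaining steps are not shown. Under the usual Scrapy CrawlerRunner pattern, which Example No. 8 below also follows, the continuation would be roughly as sketched here; the spider class name is hypothetical and the real code may differ:

            # NewsSpider is a hypothetical spider class; in the real project it
            # would be selected based on website_config.
            runner.crawl(NewsSpider, website_config=website_config, settings=settings)

        d = runner.join()                    # Deferred that fires when every crawl finishes
        d.addBoth(lambda _: reactor.stop())  # then stop the Twisted reactor
        reactor.run()                        # block here until the crawls are done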
Example No. 8
def run_sina():

    try:
        read_json_file = JsonLoad(settings['SINA_JSON_FILE'])
        configure_logging(settings)

        json_data = read_json_file.getdata()

        begin_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())  # crawl start time
        logging.info('Sina crawl started at: ' + begin_time)
        time_operation = TimeOperate()
        if json_data['start_time'] == '':
            yesterday_date = time_operation.getyesterdaydate()  # default to yesterday's date
        else:
            yesterday_date = time_operation.str2date(json_data['start_time'])

        temp_begin_spider_date = yesterday_date
        if str(temp_begin_spider_date) == json_data['stop_time']:
            logging.info('Sina crawl finished at: ' + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
            os._exit(0)

        while True:

            # Stop once the date walks past stop_time or before the oldest supported Sina layout.
            if str(yesterday_date) <= json_data['stop_time'] or str(yesterday_date) < settings['SINA_OLD_START_DATE']:
                break
            web_config = ''
            if str(yesterday_date) >= settings['SINA_NEW_START_DATE']:
                web_config = json_data['new_version']
            else:
                web_config = json_data['old_version']
            deal_class = web_config['deal_class']
            logging.info('crawling date: ' + str(yesterday_date))
            print(str(yesterday_date))
            yield runner.crawl(globals()[deal_class], website_config=web_config, spider_date=yesterday_date, settings=settings)
            yesterday_date = time_operation.getthepreviousday(yesterday_date)  # step the date back one day
        reactor.stop()

        json_data['stop_time'] = str(temp_begin_spider_date)  # record the new stop time
        end_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())  # crawl end time
        info_spider = ' begin at :' + begin_time + ' end at :' + end_time
        logging.info(info_spider)

        sendbody = "time:" + end_time + " Sina crawl finished" + "\n"
        email_object.send_information(sendbody, "Sina crawl finished", True)
        os._exit(0)
    except BaseException as error:
        time_object = TimeOperate()
        date = time_object.getnow()
        logging.exception(error)
        sendbody = "time:" + date.strftime("%Y-%m-%d %H:%M:%S") + "error:" + str(error) + "\n"
        email_object.send_information(sendbody)
        raise CloseSpider('Sina crawl failed')
        os._exit(1)  # note: unreachable after the raise above
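run_sina is written as a generator that yields runner.crawl(...) Deferreds, which is the shape Twisted's inlineCallbacks expects; the decorator, the module-level settings, runner, and email_object objects, and the final reactor.run() call are all outside the snippet. A sketch of how such a function is typically wired up and started, assuming that pattern:

from twisted.internet import defer, reactor

# Wrap the generator above so that each yielded runner.crawl(...) Deferred
# is waited on before the loop continues.
run_sina = defer.inlineCallbacks(run_sina)

run_sina()      # schedule the per-day crawls
reactor.run()   # block until reactor.stop() is called inside run_sina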