Example #1
    def start_getweiboinfo(self,response):
        db = MysqlStore()
        conn = db.get_connection()
        sql1 = "select * from t_user_follow where contentstate = 0" 
        cursor1 = db.select_operation(conn,sql1)
        
        sql2 = "select count(*) from t_user_follow where contentstate = 0"
        cursor2 = db.select_operation(conn,sql2)
        count = cursor2.fetchone()
        for i in range(10):        # range(count[0]): count[0] is the number of users still to crawl
            for result in cursor1.fetchmany(1):
                if result[1]:
                    mainpageurl = 'http://weibo.com/u/'+str(result[1])+'?from=otherprofile&wvr=3.6&loc=tagweibo'
                    GetWeibopage.data['uid'] = result[1]
                    getweibopage = GetWeibopage()
                    for page in range(WeiboSpider.page_num): 
                        GetWeibopage.data['page'] = page+1
                        firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
                        yield  Request(url=firstloadurl,meta={'cookiejar':response.meta['cookiejar'],'uid':result[1]},callback=self.parse_firstload)

                        secondloadurl = mainpageurl + getweibopage.get_secondloadurl()
                        yield  Request(url=secondloadurl,meta={'cookiejar':response.meta['cookiejar'],'uid':result[1]},callback=self.parse_secondload)
           
                        thirdloadurl = mainpageurl + getweibopage.get_thirdloadurl()
                        yield  Request(url=thirdloadurl,meta={'cookiejar':response.meta['cookiejar'],'uid':result[1]},callback=self.parse_thirdload)
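Every example in this section leans on a MysqlStore helper that is never shown. A minimal sketch of the interface the calls above assume (get_connection, select_operation, update_operation, close_connection) might look like the following; the connection parameters are placeholders, not the original configuration:

# Hedged sketch of the MysqlStore helper these examples assume.
# Host/user/password/db below are placeholders, not the original config.
import MySQLdb

class MysqlStore(object):
    def get_connection(self):
        return MySQLdb.connect(host='127.0.0.1', user='root', passwd='secret',
                               db='weibo', charset='utf8')

    def select_operation(self, conn, sql, params=None):
        cursor = conn.cursor()
        cursor.execute(sql, params)    # run a SELECT and hand the cursor back
        return cursor

    def update_operation(self, conn, sql, params=None):
        cursor = conn.cursor()
        n = cursor.execute(sql, params)
        conn.commit()                  # persist UPDATE/INSERT immediately
        return n

    def close_connection(self, conn, cursor=None):
        if cursor is not None:
            cursor.close()
        conn.close()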
    def search_from_keywordDB(self,response):
        db = MysqlStore()
        conn = db.get_connection()
        main_url = "http://s.weibo.com/weibo/"
        getsearchpage = GetSearchpage()
     
        sql1 = "select keyword from cauc_keyword_test"
        cursor = db.select_operation(conn,sql1)
        for keyword in cursor.fetchall():
            print "this is the keyword:",keyword

        keywords = ['机场 炸弹','飞机 炸弹']  # hard-coded test keywords ('airport bomb', 'airplane bomb')
        for i in range(15):
            for keyword in keywords:
                sql3 = "select max(publish_time) from cauc_keyword_info where keyword = '%s'" % keyword  #检查是否爬取过该关键词
                cursor = db.select_operation(conn,sql3)
                newest_time = cursor.fetchone()[0]
                if newest_time is not None:  # already crawled: fetch content in the [newest_time, current_time] window
                    current_time = get_current_time()
                    newest_time = format_time(newest_time)
                    print "爬取过的关键词:%s,搜索时间段%s~%s间的内容" % (keyword,newest_time,current_time)
                    search_url = main_url + getsearchpage.get_searchurl_time(keyword,newest_time,current_time)
                    yield Request(url=search_url,meta={'cookiejar':response.meta['cookiejar'],'search_url':search_url,'keyword':keyword},callback=self.parse_total_page)
                else:   # keyword not crawled before
                    print "keyword not crawled before: %s" % keyword
                    search_url = main_url + getsearchpage.get_searchurl(keyword)
                    yield Request(url=search_url,meta={'cookiejar':response.meta['cookiejar'],'search_url':search_url,'keyword':keyword},callback=self.parse_total_page)
            time.sleep(100000)  # pause between crawl rounds (~28 hours)
                
        conn.close()
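Note that sql3 above splices the keyword into the statement with %, which breaks on embedded quotes and invites SQL injection. Assuming select_operation forwards a parameter tuple to cursor.execute (as in the MysqlStore sketch earlier), a parameterized variant would be:

# Parameterized form of sql3; MySQLdb escapes the keyword itself.
sql3 = "select max(publish_time) from cauc_keyword_info where keyword = %s"
cursor = db.select_operation(conn, sql3, (keyword,))
newest_time = cursor.fetchone()[0]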
def clear_flag():
    '''Reset the flag parameter to 0.'''
    db = MysqlStore()
    conn = db.get_connection()
    cursor = conn.cursor()
    sql = "update cauc_parameters set param_value = 0 where param_key = 'flag'"
    n = cursor.execute(sql)
    conn.commit()
    db.close_connection(conn,cursor)
    if n:
        logger.info('clear_flag success!')
def update_cookies(username,cookies):
    '''Write this account's cookies to the database.'''
    db = MysqlStore()
    conn = db.get_connection()
    timestamp = int(time.time())
    cursor = conn.cursor()
    cursor.execute("insert into cauc_login_cookie_info(USERNAME,SUHB,SUB,SUBP,SUE,SUS,SUP,timestamp) values (%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE SUHB=%s,SUB=%s,SUBP=%s,SUE=%s,SUS=%s,SUP=%s,timestamp=%s",(md5(username),cookies['SUHB'],cookies['SUB'],cookies['SUBP'],cookies['SUE'],cookies['SUS'],cookies['SUP'],timestamp,cookies['SUHB'],cookies['SUB'],cookies['SUBP'],cookies['SUE'],cookies['SUS'],cookies['SUP'],timestamp))
    conn.commit()
    db.close_connection(conn,cursor) 
    logger.info('Update cookies into database...')
def allcookie_fetch():
    '''Fetch all cookie rows that have not yet expired.'''
    db = MysqlStore()
    conn = db.get_connection()
    cursor = conn.cursor(MySQLdb.cursors.DictCursor)
    nums = cursor.execute('SELECT SUHB,SUB,SUBP,SUE,SUS,SUP,USERNAME FROM cauc_login_cookie_info WHERE (unix_timestamp()-cast(timestamp as signed)) < (%s)',(EXPIRES,))
    rows = cursor.fetchallDict()
    conn.commit()
    db.close_connection(conn,cursor) 
    return nums,rows
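Several spiders below pass cookies=random.choice(COOKIES) to each Request, but the COOKIES list itself never appears. One plausible way to build it from allcookie_fetch() — this glue is an assumption, not original code:

# Hypothetical glue: turn the fetched rows into the COOKIES list that the
# spiders sample from with random.choice(COOKIES).
COOKIE_KEYS = ('SUHB', 'SUB', 'SUBP', 'SUE', 'SUS', 'SUP')
nums, rows = allcookie_fetch()
COOKIES = [dict((k, row[k]) for k in COOKIE_KEYS) for row in rows]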
def set_flag():
    '''Set the flag parameter to 1.'''
    db = MysqlStore()
    conn = db.get_connection()
    cursor = conn.cursor()
    sql = "update cauc_parameters set param_value = 1 where param_key = 'flag'"
    n = cursor.execute(sql)
    conn.commit()
    db.close_connection(conn,cursor) 
    if n:
        logger.info('Set flag success!')
    else:
        logger.error('Set flag failed, flag is already 1!')
def user_fetch():
    '''Fetch account credentials that are not marked deleted.'''
    db = MysqlStore()
    conn = db.get_connection()
    cursor = conn.cursor(MySQLdb.cursors.DictCursor)
    sql = 'SELECT id,username,password FROM cauc_login_account_info where is_delete = 0'
    nums = cursor.execute(sql)
    users = cursor.fetchallDict()
    conn.commit()
    db.close_connection(conn,cursor)
    if users:
        logger.info('User fetch success!')
        return users
    else:
        logger.error('There is no user in database!')
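How these bookkeeping helpers fit together is not shown. A hedged sequencing sketch, where do_login is a hypothetical login routine that returns a cookie dict:

def refresh_all_cookies():
    '''Hypothetical orchestration of the helpers above.'''
    users = user_fetch()
    if not users:
        return
    set_flag()                   # mark the refresh as in progress
    try:
        for user in users:
            cookies = do_login(user['username'], user['password'])  # hypothetical
            update_cookies(user['username'], cookies)
    finally:
        clear_flag()             # always release the flag afterwards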
    def search_from_keywordDB(self, response):
        db = MysqlStore()
        main_url = "http://s.weibo.com/weibo/"
        getsearchpage = GetSearchpage()

        for round in range(1):  # number of passes over the database
            conn = db.get_connection()

            # select keywords whose is_search flag is 0
            sql1 = "select keyword from cauc_keyword_test_copy where is_search = 0 and is_delete = 0"
            cursor1 = db.select_operation(conn, sql1)

            # crawl the keywords whose is_search flag is 0
            for keyword in cursor1.fetchall():
                keyword = keyword[0]
                logger.info("this is the unsearched keyword:%s", keyword)
                search_url = main_url + getsearchpage.get_searchurl(keyword)
                yield Request(url=search_url,
                              cookies=random.choice(COOKIES),
                              meta={
                                  'search_url': search_url,
                                  'keyword': keyword
                              },
                              callback=self.parse_total_page)

            # select keywords whose is_search flag is 1
            sql2 = "select keyword from cauc_keyword_test_copy where is_search = 1 and is_delete = 0"
            cursor2 = db.select_operation(conn, sql2)

            # crawl the keywords whose is_search flag is 1
            for keyword in cursor2.fetchall():
                keyword = keyword[0]
                logger.info("this is the searched keyword:%s", keyword)

                end_time = get_current_time()
                #start_time = get_time_by_interval(int(time.time()),3600)  # crawl content from the past 3600 seconds (1 hour)
                start_time = get_time_by_interval(
                    int(time.time()), int(self.interval))  # crawl content from the past `interval` seconds

                search_url = main_url + getsearchpage.get_searchurl_time(
                    keyword, start_time, end_time)
                yield Request(url=search_url,
                              cookies=random.choice(COOKIES),
                              meta={
                                  'search_url': search_url,
                                  'keyword': keyword
                              },
                              callback=self.parse_total_page)

            # set the is_search flag to 1
            sql3 = "update cauc_keyword_test_copy set is_search = 1 where is_search = 0 and is_delete = 0"
            db.update_operation(conn, sql3)
            db.close_connection(conn)
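The time helpers get_current_time and get_time_by_interval are called throughout but never defined. A minimal sketch consistent with their call sites (no argument, 'day', or 'hour' as the unit; an interval in seconds); the exact format strings are assumptions:

# Hedged sketch of the time helpers, inferred from their call sites.
import time

_FORMATS = {None: '%Y-%m-%d %H:%M:%S',   # assumed default format
            'day': '%Y-%m-%d',
            'hour': '%Y-%m-%d %H'}

def get_current_time(unit=None):
    return time.strftime(_FORMATS[unit], time.localtime())

def get_time_by_interval(timestamp, interval, unit=None):
    # format the moment `interval` seconds before `timestamp`
    return time.strftime(_FORMATS[unit], time.localtime(timestamp - interval))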
Example #9
    def start_getweiboinfo(self, response):
        db = MysqlStore()
        conn = db.get_connection()
        sql1 = "select * from t_user_follow where contentstate = 0"
        cursor1 = db.select_operation(conn, sql1)

        sql2 = "select count(*) from t_user_follow where contentstate = 0"
        cursor2 = db.select_operation(conn, sql2)
        count = cursor2.fetchone()
        for i in range(count[0]):  # count[0] is the number of users still to crawl
            for result in cursor1.fetchmany(1):
                if result[1]:
                    mainpageurl = 'http://weibo.com/u/' + str(
                        result[1]) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
                    GetWeibopage.data['uid'] = result[1]
                    getweibopage = GetWeibopage()
                    for page in range(WeiboSpider.page_num):
                        GetWeibopage.data['page'] = page + 1
                        firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
                        yield Request(url=firstloadurl,
                                      meta={
                                          'cookiejar':
                                          response.meta['cookiejar'],
                                          'uid': result[1]
                                      },
                                      callback=self.parse_firstload)

                        secondloadurl = mainpageurl + getweibopage.get_secondloadurl()
                        yield Request(url=secondloadurl,
                                      meta={
                                          'cookiejar':
                                          response.meta['cookiejar'],
                                          'uid': result[1]
                                      },
                                      callback=self.parse_secondload)

                        thirdloadurl = mainpageurl + getweibopage.get_thirdloadurl()
                        yield Request(url=thirdloadurl,
                                      meta={
                                          'cookiejar':
                                          response.meta['cookiejar'],
                                          'uid': result[1]
                                      },
                                      callback=self.parse_thirdload)
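The range(count[0]) loop wrapped around fetchmany(1) walks the result set one row at a time. Since MySQLdb cursors are iterable, the same traversal can be written more directly; a behavior-preserving simplification, not the original code:

# Equivalent, simpler traversal of cursor1's result set.
for result in cursor1:
    if result[1]:        # result[1] is assumed to be the uid column
        pass             # ...build the three load URLs exactly as above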
Example #10
    def get_userinfo(self,response):
        db = MysqlStore()
        conn = db.get_connection()
        #sql1 = "select * from t_user_follow where infostate = 0 and contentstate = 0"
        sql1 = "select * from t_user_info where imagestate = 1 and imageurl = 1"
        cursor1 = db.select_operation(conn,sql1)

        sql2 = "select count(*) from t_user_follow where infostate = 0 and contentstate = 0"
        cursor2 = db.select_operation(conn,sql2)
        count = cursor2.fetchone()

        for i in range(1):     # range(count[0]) to process every remaining user
            for result in cursor1.fetchmany(1):
                if result[0]:
                    mainpageurl = 'http://weibo.com/u/'+str(result[0])+'?from=otherprofile&wvr=3.6&loc=tagweibo'
                    GetWeibopage.data['uid'] = result[0]   #result[1]
                    getweibopage = GetWeibopage()
                    GetWeibopage.data['page'] = 1
                    firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
                    yield  Request(url=firstloadurl,meta={'cookiejar':response.meta['cookiejar'],'uid':result[0]},callback=self.get_userurl)
    def search_from_keywordDB(self,response):
        db = MysqlStore()
        conn = db.get_connection()
        main_url = "http://s.weibo.com/weibo/"
        getsearchpage = GetSearchpage()
     
        sql1 = "select keyword from cauc_keyword_test_copy where is_search = 0"
        cursor = db.select_operation(conn,sql1)
        for round in range(3):
            for keyword in cursor.fetchall():
                keyword = keyword[0]
                print "this is the unsearched keyword:",keyword
                sql2 = "update cauc_keyword_test_copy set is_search = 1 where keyword = '%s'" % keyword
                db.update_operation(conn,sql2)
                search_url = main_url + getsearchpage.get_searchurl(keyword)
                #yield Request(url=search_url,meta={'cookiejar':response.meta['cookiejar'],'search_url':search_url,'keyword':keyword},callback=self.parse_total_page)

            print "current timestamp:",int(time.time())
            # interval between crawl rounds
            time.sleep(WeiboSpider.settings['KEYWORD_INTERVAL']) 

            sql3 = "select keyword from cauc_keyword_test_copy where is_search = 1"
            cursor = db.select_operation(conn,sql3)
            for keyword in cursor.fetchall():
                keyword = keyword[0]
                print "this is the searched keyword",keyword
                #yield Request(url=search_url,meta={'cookiejar':response.meta['cookiejar'],'search_url':search_url,'keyword':keyword},callback=self.parse_total_page)
        conn.close()
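WeiboSpider.settings['KEYWORD_INTERVAL'] (and the other *_INTERVAL keys used later) come from a spider-level dict that is not shown; a hypothetical shape appears below. Note also that time.sleep inside a Scrapy callback blocks the whole Twisted reactor, so a long interval stalls every other request the spider has in flight.

# Hypothetical settings dict these examples index into; the 900 s values
# echo the "15 min" comments and are not a confirmed configuration.
settings = {
    'KEYWORD_INTERVAL': 900,
    'WEIBOCONTENT_INTERVAL': 900,
    'FRIENDCIRCAL_INTERVAL': 900,
}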
Example #12
    def start_getweibo_info(self,response):
        db = MysqlStore()
        # fetch key persons not yet crawled (is_search = 0) and not deleted (is_delete = 0)
        GetWeibopage.data['page'] = 1
        getweibopage = GetWeibopage()

        #for round in range(1):  # number of passes over the database
        conn = db.get_connection()
        sql1 = "select user_id from cauc_warning_man a \
                where a.is_search = 0 and a.is_delete = 0"          
        cursor1 = db.select_operation(conn,sql1)
        for user_id in cursor1.fetchall():
            user_id = user_id[0]
            logger.info("this is the unsearched user_id:%s",user_id)
            
            # get the total number of pages to crawl
            start_time = self.start_time
            end_time = get_current_time('day')
            mainpage_url = "http://weibo.com/" + str(user_id) + "?is_ori=1&is_forward=1&is_text=1&is_pic=1&key_word=&start_time=" + start_time + "&end_time=" + end_time + "&is_search=1&is_searchadv=1&"
            GetWeibopage.data['uid'] = user_id
            thirdload_url = mainpage_url + getweibopage.get_thirdloadurl()
            yield  Request(url=thirdload_url,cookies=random.choice(COOKIES),meta={'mainpage_url':mainpage_url,'uid':user_id,'is_search':0},callback=self.parse_total_page)
            

        # fetch warning persons already crawled (is_search = 1) and not deleted (is_delete = 0)
        sql2 = "select user_id from cauc_warning_man a \
                where a.is_search = 1 and a.is_delete = 0"
        cursor2 = db.select_operation(conn,sql2)

        for user_id in cursor2.fetchall():
            user_id = user_id[0]
            logger.info("this is the searched user_id:%s",user_id)

            #start_time = get_time_by_interval(int(time.time()),86400,'day');end_time = get_current_time('day')  # one-day window (86400 s)
            start_time = get_time_by_interval(int(time.time()),int(self.interval),'day')
            end_time = get_current_time('day')  # window length is `interval` seconds, expressed in days
            mainpage_url = "http://weibo.com/" + str(user_id) + "?is_ori=1&is_forward=1&is_text=1&is_pic=1&key_word=&start_time=" + start_time + "&end_time=" + end_time + "&is_search=1&is_searchadv=1&"
            GetWeibopage.data['uid'] = user_id
            thirdload_url = mainpage_url + getweibopage.get_thirdloadurl()
            yield  Request(url=thirdload_url,cookies=random.choice(COOKIES),meta={'mainpage_url':mainpage_url,'uid':user_id,'is_search':1},callback=self.parse_total_page)

        # set the is_search flag to 1
        sql3 = "update cauc_warning_man set is_search = 1 where is_search = 0 and is_delete = 0"
        db.update_operation(conn,sql3)
        db.close_connection(conn)
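mainpage_url above is assembled by raw string concatenation. An equivalent construction with urllib.urlencode keeps the same parameters and order while handling escaping; a readability sketch, behavior otherwise unchanged:

# Same query string as the concatenated version, built with urlencode.
import urllib

params = [('is_ori', 1), ('is_forward', 1), ('is_text', 1), ('is_pic', 1),
          ('key_word', ''), ('start_time', start_time),
          ('end_time', end_time), ('is_search', 1), ('is_searchadv', 1)]
mainpage_url = 'http://weibo.com/%s?%s&' % (user_id, urllib.urlencode(params))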
    def search_from_keywordDB(self, response):
        db = MysqlStore()
        main_url = "http://s.weibo.com/weibo/"
        getsearchpage = GetSearchpage()

        for round in range(1):  #遍历数据库的轮数
            conn = db.get_connection()

            # crawl the keywords whose is_search flag is 0
            sql1 = "select keyword from cauc_keyword_test_copy where is_search = 0"
            cursor = db.select_operation(conn, sql1)
            for keyword in cursor.fetchall():
                keyword = keyword[0]
                logger.info("this is the unsearched keyword:%s", keyword)
                # set the is_search flag to 1
                sql2 = "update cauc_keyword_test_copy set is_search = 1 where keyword = '%s'" % keyword
                db.update_operation(conn, sql2)
                search_url = main_url + getsearchpage.get_searchurl(keyword)
                yield Request(url=search_url,
                              meta={
                                  'cookiejar': response.meta['cookiejar'],
                                  'search_url': search_url,
                                  'keyword': keyword
                              },
                              callback=self.parse_total_page)

            logger.info("current timestamp:%d", int(time.time()))
            # interval between crawl rounds
            time.sleep(WeiboSpider.settings['KEYWORD_INTERVAL'])  # e.g. a 15-minute interval

            # crawl the keywords whose is_search flag is 1
            sql3 = "select keyword from cauc_keyword_test_copy where is_search = 1"
            cursor = db.select_operation(conn, sql3)
            for keyword in cursor.fetchall():
                keyword = keyword[0]
                logger.info("this is the searched keyword:%s", keyword)

                end_time = get_current_time()
                start_time = get_time_by_interval(int(time.time()), 3600)  # crawl content from the past 3600 seconds (1 hour)

                search_url = main_url + getsearchpage.get_searchurl_time(
                    keyword, start_time, end_time)
                yield Request(url=search_url,
                              meta={
                                  'cookiejar': response.meta['cookiejar'],
                                  'search_url': search_url,
                                  'keyword': keyword
                              },
                              callback=self.parse_total_page)
            conn.close()
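parse_total_page is the callback for every Request in this section but is never shown. A heavily hedged sketch of its contract — read a total page count out of the search response, then fan out one request per page; the selector, the page parameter, and the follow-up callback are all assumptions:

def parse_total_page(self, response):
    # Hypothetical extraction; the real selector depends on Weibo's markup.
    total = response.xpath('//span[@class="list"]//a[last()]/text()').extract_first()
    page_num = int(total) if total else 1
    for page in range(1, page_num + 1):
        page_url = response.meta['search_url'] + '&page=%d' % page  # assumed parameter
        yield Request(url=page_url,
                      meta=response.meta,
                      callback=self.parse_search_content)  # hypothetical callback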
Example #14
    def start_getweibo_info(self,response):
        db = MysqlStore()
        # fetch key persons not yet crawled (is_search = 0) and not deleted (is_delete = 0)
        GetWeibopage.data['page'] = 1
        getweibopage = GetWeibopage()

        for round in range(1):  # number of passes over the database
            conn = db.get_connection()

            sql1 = "select user_id from cauc_warning_man_test a \
                    where a.is_search = 0 and a.is_delete = 0"          
            cursor = db.select_operation(conn,sql1)
            for user_id in cursor.fetchall():
                user_id = user_id[0]
                logger.info("this is the unsearched user_id:%s",user_id)
            
                # set the is_search flag to 1
                sql2 = "update cauc_warning_man_test set is_search = 1 where user_id = '%s'" % user_id
                db.update_operation(conn,sql2)
                
                # get the total number of pages to crawl
                start_time = self.start_time
                end_time = get_current_time('hour')
                mainpage_url = "http://weibo.com/" + str(user_id) + "?is_ori=1&is_forward=1&is_text=1&is_pic=1&key_word=&start_time=" + start_time + "&end_time=" + end_time + "&is_search=1&is_searchadv=1&"
                GetWeibopage.data['uid'] = user_id
                thirdload_url = mainpage_url + getweibopage.get_thirdloadurl()
                yield  Request(url=thirdload_url,meta={'cookiejar':response.meta['cookiejar'],'mainpage_url':mainpage_url,'uid':user_id,'is_search':0},callback=self.parse_total_page)
                
            logger.info("current timestamp:%d",int(time.time()))
            # interval between crawl rounds
            time.sleep(WeiboSpider.settings['WEIBOCONTENT_INTERVAL'])  # e.g. a 15-minute interval

            # fetch warning persons already crawled (is_search = 1) and not deleted (is_delete = 0)
            sql3 = "select user_id from cauc_warning_man_test a \
                    where a.is_search = 1 and a.is_delete = 0"
            cursor = db.select_operation(conn,sql3)

            for user_id in cursor.fetchall():
                user_id = user_id[0]
                logger.info("this is the searched user_id:%s",user_id)

                start_time = get_time_by_interval(int(time.time()),86400,'hour')
                end_time = get_current_time('hour')  # window between start and end is one day (86400 s)
                mainpage_url = "http://weibo.com/" + str(user_id) + "?is_ori=1&is_forward=1&is_text=1&is_pic=1&key_word=&start_time=" + start_time + "&end_time=" + end_time + "&is_search=1&is_searchadv=1&"
                GetWeibopage.data['uid'] = user_id
                thirdload_url = mainpage_url + getweibopage.get_thirdloadurl()
                #yield  Request(url=thirdload_url,meta={'cookiejar':response.meta['cookiejar'],'mainpage_url':mainpage_url,'uid':user_id,'is_search':1},callback=self.parse_total_page)
            conn.close()
    def search_from_keywordDB(self,response):
        db = MysqlStore()
        main_url = "http://s.weibo.com/weibo/"
        getsearchpage = GetSearchpage()
     
        for round in range(1):  # number of passes over the database
            conn = db.get_connection()

            # select keywords whose is_search flag is 0
            sql1 = "select keyword from cauc_keyword_test_copy where is_search = 0 and is_delete = 0" 
            cursor1 = db.select_operation(conn,sql1)

            # crawl the keywords whose is_search flag is 0
            for keyword in cursor1.fetchall():
                keyword = keyword[0]
                logger.info("this is the unsearched keyword:%s",keyword)
                search_url = main_url + getsearchpage.get_searchurl(keyword)
                yield Request(url=search_url,cookies=random.choice(COOKIES),meta={'search_url':search_url,'keyword':keyword},callback=self.parse_total_page)


            # select keywords whose is_search flag is 1
            sql2 = "select keyword from cauc_keyword_test_copy where is_search = 1 and is_delete = 0" 
            cursor2 = db.select_operation(conn,sql2)

            # crawl the keywords whose is_search flag is 1
            for keyword in cursor2.fetchall():
                keyword = keyword[0]
                logger.info("this is the searched keyword:%s",keyword)

                end_time = get_current_time()
                #start_time = get_time_by_interval(int(time.time()),3600)  # crawl content from the past 3600 seconds (1 hour)
                start_time = get_time_by_interval(int(time.time()),int(self.interval))  # crawl content from the past `interval` seconds
                
                search_url = main_url + getsearchpage.get_searchurl_time(keyword,start_time,end_time)
                yield Request(url=search_url,cookies=random.choice(COOKIES),meta={'search_url':search_url,'keyword':keyword},callback=self.parse_total_page)

            # set the is_search flag to 1
            sql3 = "update cauc_keyword_test_copy set is_search = 1 where is_search = 0 and is_delete = 0"
            db.update_operation(conn,sql3)
            db.close_connection(conn)
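Two cookie strategies appear in these examples: meta={'cookiejar': ...}, Scrapy's built-in support for keeping multiple cookie sessions, and explicit cookies=random.choice(COOKIES) dicts. For the cookiejar variant the session has to be seeded once per account; a hedged sketch of what that entry point could look like (start_requests is not in the original code):

# Hypothetical start_requests seeding one cookiejar per login account.
def start_requests(self):
    for i, user in enumerate(user_fetch() or []):
        yield Request('http://weibo.com/login',
                      meta={'cookiejar': i},      # separate session per account
                      callback=self.after_login)  # hypothetical login callback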
    def start_getweibo_info(self,response):
        db = MysqlStore()
        # fetch key persons not yet crawled (is_search = 0) and not deleted (is_delete = 0)
        GetWeibopage.data['page'] = 1
        getweibopage = GetWeibopage()

        for round in range(1):  # number of passes over the database
            conn = db.get_connection()

            sql1 = "select user_id from cauc_black_man_test a \
                    where a.is_search = 0 and a.is_delete = 0"          
            cursor = db.select_operation(conn,sql1)
            for user_id in cursor.fetchall():
                user_id = user_id[0]
                logger.info("this is the unsearched user_id:%s",user_id)
            
                # set the is_search flag to 1
                sql2 = "update cauc_black_man_test set is_search = 1 where user_id = '%s'" % user_id
                db.update_operation(conn,sql2)
                
                # get the total number of pages to crawl
                start_time = self.start_time
                end_time = get_current_time('hour')
                mainpage_url = "http://weibo.com/" + str(user_id) + "?is_ori=1&is_forward=1&is_text=1&is_pic=1&key_word=&start_time=" + start_time + "&end_time=" + end_time + "&is_search=1&is_searchadv=1&"
                GetWeibopage.data['uid'] = user_id
                thirdload_url = mainpage_url + getweibopage.get_thirdloadurl()
                yield  Request(url=thirdload_url,meta={'cookiejar':response.meta['cookiejar'],'mainpage_url':mainpage_url,'uid':user_id,'is_search':0},callback=self.parse_total_page)
                
            logger.info("current timestamp:%d",int(time.time()))
            # interval between crawl rounds
            time.sleep(WeiboSpider.settings['FRIENDCIRCAL_INTERVAL'])  # e.g. a 15-minute interval

            # fetch key persons already crawled (is_search = 1) and not deleted (is_delete = 0)
            sql3 = "select user_id from cauc_black_man_test a \
                    where a.is_search = 1 and a.is_delete = 0"
            cursor = db.select_operation(conn,sql3)

            for user_id in cursor.fetchall():
                user_id = user_id[0]
                logger.info("this is the searched user_id:%s",user_id)

                start_time = get_time_by_interval(int(time.time()),86400,'hour')
                end_time = get_current_time('hour')  # window between start and end is one day (86400 s)
                mainpage_url = "http://weibo.com/" + str(user_id) + "?is_ori=1&is_forward=1&is_text=1&is_pic=1&key_word=&start_time=" + start_time + "&end_time=" + end_time + "&is_search=1&is_searchadv=1&"
                GetWeibopage.data['uid'] = user_id
                thirdload_url = mainpage_url + getweibopage.get_thirdloadurl()
                #yield  Request(url=thirdload_url,meta={'cookiejar':response.meta['cookiejar'],'mainpage_url':mainpage_url,'uid':user_id,'is_search':1},callback=self.parse_total_page)
            conn.close()
    def search_from_keywordDB(self,response):
        db = MysqlStore()
        main_url = "http://s.weibo.com/weibo/"
        getsearchpage = GetSearchpage()
     
        for round in range(1):  # number of passes over the database
            conn = db.get_connection()

            # crawl the keywords whose is_search flag is 0
            sql1 = "select keyword from cauc_keyword_test_copy where is_search = 0"
            cursor = db.select_operation(conn,sql1)
            for keyword in cursor.fetchall():
                keyword = keyword[0]
                logger.info("this is the unsearched keyword:%s",keyword)
                # set the is_search flag to 1
                sql2 = "update cauc_keyword_test_copy set is_search = 1 where keyword = '%s'" % keyword
                db.update_operation(conn,sql2)
                search_url = main_url + getsearchpage.get_searchurl(keyword)
                yield Request(url=search_url,meta={'cookiejar':response.meta['cookiejar'],'search_url':search_url,'keyword':keyword},callback=self.parse_total_page)

            logger.info("current timestamp:%d",int(time.time()))
            # interval between crawl rounds
            time.sleep(WeiboSpider.settings['KEYWORD_INTERVAL'])  # e.g. a 15-minute interval

            # crawl the keywords whose is_search flag is 1
            sql3 = "select keyword from cauc_keyword_test_copy where is_search = 1"
            cursor = db.select_operation(conn,sql3)
            for keyword in cursor.fetchall():
                keyword = keyword[0]
                logger.info("this is the searched keyword:%s",keyword)

                end_time = get_current_time()
                start_time = get_time_by_interval(int(time.time()),3600)  # crawl content from the past 3600 seconds (1 hour)
                
                search_url = main_url + getsearchpage.get_searchurl_time(keyword,start_time,end_time)
                yield Request(url=search_url,meta={'cookiejar':response.meta['cookiejar'],'search_url':search_url,'keyword':keyword},callback=self.parse_total_page)
            conn.close()
Example #18
# quick smoke test: pull rows from t_user_follow in batches of five
from datamysql import MysqlStore
d = MysqlStore()
conn = d.get_connection()
sql = "select * from t_user_follow"
cursor = d.select_operation(conn,sql)

for i in range(100):
    print 'i:',i
    for result in cursor.fetchmany(5):
        if result[1]:
            print 'hhhhhhh'    # placeholder output: row has a non-empty second column