def start_getweiboinfo(self, response):
    db = MysqlStore()
    conn = db.get_connection()
    sql1 = "select * from t_user_follow where contentstate = 0"
    cursor1 = db.select_operation(conn, sql1)
    sql2 = "select count(*) from t_user_follow where contentstate = 0"
    cursor2 = db.select_operation(conn, sql2)
    count = cursor2.fetchone()
    for i in range(10):  # debug cap; use range(count[0]) to cover all users still to be crawled
        for result in cursor1.fetchmany(1):
            if result[1]:
                mainpageurl = 'http://weibo.com/u/' + str(result[1]) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
                GetWeibopage.data['uid'] = result[1]
                getweibopage = GetWeibopage()
                for page in range(WeiboSpider.page_num):
                    GetWeibopage.data['page'] = page + 1
                    firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
                    yield Request(url=firstloadurl, meta={'cookiejar': response.meta['cookiejar'], 'uid': result[1]}, callback=self.parse_firstload)
                    secondloadurl = mainpageurl + getweibopage.get_secondloadurl()
                    yield Request(url=secondloadurl, meta={'cookiejar': response.meta['cookiejar'], 'uid': result[1]}, callback=self.parse_secondload)
                    thirdloadurl = mainpageurl + getweibopage.get_thirdloadurl()
                    yield Request(url=thirdloadurl, meta={'cookiejar': response.meta['cookiejar'], 'uid': result[1]}, callback=self.parse_thirdload)
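# GetWeibopage is referenced throughout but never shown. From its use above it
# exposes a shared `data` dict ('uid', 'page') and three builders that produce
# the query strings for the profile page's three lazy-load segments. A minimal
# sketch under that assumption -- the `pagebar` parameter name is an
# illustrative guess, not the confirmed Weibo query string:
import urllib

class GetWeibopage(object):
    data = {}  # filled in by the spider: {'uid': ..., 'page': ...}

    def _loadurl(self, pagebar=None):
        params = dict(self.data)
        if pagebar is not None:
            params['pagebar'] = pagebar  # hypothetical lazy-load segment index
        return '&' + urllib.urlencode(params)

    def get_firstloadurl(self):
        return self._loadurl()

    def get_secondloadurl(self):
        return self._loadurl(pagebar=0)

    def get_thirdloadurl(self):
        return self._loadurl(pagebar=1)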
def search_from_keywordDB(self, response):
    db = MysqlStore()
    conn = db.get_connection()
    main_url = "http://s.weibo.com/weibo/"
    getsearchpage = GetSearchpage()
    sql1 = "select keyword from cauc_keyword_test"
    cursor = db.select_operation(conn, sql1)
    for keyword in cursor.fetchall():
        print "this is the keyword:", keyword
    # Hardcoded test keywords ('airport bomb', 'plane bomb'); the DB list above is only printed
    keywords = ['机场 炸弹', '飞机 炸弹']
    for i in range(15):
        for keyword in keywords:
            # Check whether this keyword has been crawled before
            sql3 = "select max(publish_time) from cauc_keyword_info where keyword = '%s'" % keyword
            cursor = db.select_operation(conn, sql3)
            newest_time = cursor.fetchone()[0]
            if newest_time is not None:
                # Already crawled: fetch content in the window from its newest publish time to now
                current_time = get_current_time()
                newest_time = format_time(newest_time)
                print "keyword already crawled: %s, searching content between %s and %s" % (keyword, newest_time, current_time)
                search_url = main_url + getsearchpage.get_searchurl_time(keyword, newest_time, current_time)
                yield Request(url=search_url, meta={'cookiejar': response.meta['cookiejar'], 'search_url': search_url, 'keyword': keyword}, callback=self.parse_total_page)
            else:
                # Not crawled yet: fetch everything for the keyword
                print "keyword not yet crawled: %s" % keyword
                search_url = main_url + getsearchpage.get_searchurl(keyword)
                yield Request(url=search_url, meta={'cookiejar': response.meta['cookiejar'], 'search_url': search_url, 'keyword': keyword}, callback=self.parse_total_page)
        time.sleep(100000)
    conn.close()
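# The time helpers used above and in the later spiders (get_current_time,
# format_time, get_time_by_interval) are assumed rather than shown. A minimal
# sketch, assuming Weibo's search accepts day- and hour-granular timestamps;
# the exact format strings are guesses, not confirmed by the source:
import time

def get_current_time(granularity='day'):
    # Current time, formatted at the requested granularity.
    fmt = '%Y-%m-%d' if granularity == 'day' else '%Y-%m-%d-%H'
    return time.strftime(fmt, time.localtime())

def get_time_by_interval(timestamp, interval, granularity='day'):
    # The moment `interval` seconds before `timestamp`, same formatting rule.
    fmt = '%Y-%m-%d' if granularity == 'day' else '%Y-%m-%d-%H'
    return time.strftime(fmt, time.localtime(timestamp - interval))

def format_time(db_time):
    # Assumes MySQL hands back a datetime object for max(publish_time).
    return db_time.strftime('%Y-%m-%d')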
def clear_flag():
    '''Reset the flag parameter to 0.'''
    db = MysqlStore()  # needed below for close_connection; the original left db undefined
    conn = db.get_connection()
    cursor = conn.cursor()
    sql = "update cauc_parameters set param_value = 0 where param_key = 'flag'"
    n = cursor.execute(sql)
    conn.commit()
    db.close_connection(conn, cursor)
    if n:
        logger.info('clear_flag success!')
def update_cookies(username, cookies):
    '''Write (or refresh) a user's cookies in the database.'''
    db = MysqlStore()
    conn = db.get_connection()
    timestamp = int(time.time())
    cursor = conn.cursor()
    cursor.execute("insert into cauc_login_cookie_info"
                   "(USERNAME,SUHB,SUB,SUBP,SUE,SUS,SUP,timestamp) "
                   "values (%s,%s,%s,%s,%s,%s,%s,%s) "
                   "ON DUPLICATE KEY UPDATE "
                   "SUHB=%s,SUB=%s,SUBP=%s,SUE=%s,SUS=%s,SUP=%s,timestamp=%s",
                   (md5(username), cookies['SUHB'], cookies['SUB'], cookies['SUBP'],
                    cookies['SUE'], cookies['SUS'], cookies['SUP'], timestamp,
                    cookies['SUHB'], cookies['SUB'], cookies['SUBP'], cookies['SUE'],
                    cookies['SUS'], cookies['SUP'], timestamp))
    conn.commit()
    db.close_connection(conn, cursor)
    logger.info('Updated cookies in the database...')
def allcookie_fetch():
    '''Fetch every cookie set that has not yet expired.'''
    db = MysqlStore()
    conn = db.get_connection()
    cursor = conn.cursor(MySQLdb.cursors.DictCursor)
    nums = cursor.execute('SELECT SUHB,SUB,SUBP,SUE,SUS,SUP,USERNAME '
                          'FROM cauc_login_cookie_info '
                          'WHERE (unix_timestamp()-cast(timestamp as signed)) < (%s)',
                          (EXPIRES,))
    rows = cursor.fetchall()  # DictCursor returns each row as a dict
    conn.commit()
    db.close_connection(conn, cursor)
    return nums, rows
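# The later spiders pass cookies=random.choice(COOKIES) into each Request. A
# minimal sketch of how that COOKIES list could be assembled from
# allcookie_fetch(); the module-level name and the field filtering are
# assumptions, not confirmed by the source:
def build_cookies_list():
    nums, rows = allcookie_fetch()
    cookie_keys = ('SUHB', 'SUB', 'SUBP', 'SUE', 'SUS', 'SUP')
    # Keep only the cookie fields; USERNAME is metadata, not a cookie.
    return [dict((k, row[k]) for k in cookie_keys) for row in rows]

COOKIES = build_cookies_list()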
def set_flag():
    '''Set the flag parameter to 1.'''
    db = MysqlStore()
    conn = db.get_connection()
    cursor = conn.cursor()
    sql = "update cauc_parameters set param_value = 1 where param_key = 'flag'"
    n = cursor.execute(sql)
    conn.commit()
    db.close_connection(conn, cursor)
    if n:
        logger.info('Set flag success!')
    else:
        logger.error('Set flag failed, flag is already 1!')
def user_fetch():
    '''Fetch the login accounts that are still active.'''
    db = MysqlStore()
    conn = db.get_connection()
    cursor = conn.cursor(MySQLdb.cursors.DictCursor)
    sql = 'SELECT id,username,password FROM cauc_login_account_info where is_delete = 0'
    nums = cursor.execute(sql)
    users = cursor.fetchall()  # DictCursor returns each row as a dict
    conn.commit()
    db.close_connection(conn, cursor)
    if users:
        logger.info('User fetch success!')
        return users
    else:
        logger.error('There is no user in database!')
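# A plausible wiring of the account/cookie helpers above: drop the flag while
# cookies are being refreshed, log every active account in, persist the fresh
# cookies, then raise the flag for waiting spiders. login_weibo() is
# hypothetical -- a stand-in for whatever login routine the project actually uses.
def refresh_all_cookies():
    clear_flag()  # consumers should not read cookies mid-refresh
    for user in (user_fetch() or []):
        cookies = login_weibo(user['username'], user['password'])  # hypothetical login helper
        if cookies:
            update_cookies(user['username'], cookies)
    set_flag()  # fresh cookies are now available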
def search_from_keywordDB(self, response):
    db = MysqlStore()
    main_url = "http://s.weibo.com/weibo/"
    getsearchpage = GetSearchpage()
    for round in range(1):  # number of passes over the database
        conn = db.get_connection()
        # Select keywords with is_search = 0 (not yet crawled)
        sql1 = "select keyword from cauc_keyword_test_copy where is_search = 0 and is_delete = 0"
        cursor1 = db.select_operation(conn, sql1)
        # Crawl the keywords with is_search = 0
        for keyword in cursor1.fetchall():
            keyword = keyword[0]
            logger.info("this is the unsearched keyword:%s", keyword)
            search_url = main_url + getsearchpage.get_searchurl(keyword)
            yield Request(url=search_url, cookies=random.choice(COOKIES),
                          meta={'search_url': search_url, 'keyword': keyword},
                          callback=self.parse_total_page)
        # Select keywords with is_search = 1 (already crawled)
        sql2 = "select keyword from cauc_keyword_test_copy where is_search = 1 and is_delete = 0"
        cursor2 = db.select_operation(conn, sql2)
        # Crawl the keywords with is_search = 1
        for keyword in cursor2.fetchall():
            keyword = keyword[0]
            logger.info("this is the searched keyword:%s", keyword)
            end_time = get_current_time()
            #start_time = get_time_by_interval(int(time.time()), 3600)  # content from the past 3600 s (1 hour)
            start_time = get_time_by_interval(int(time.time()), int(self.interval))  # content from the past `interval` seconds
            search_url = main_url + getsearchpage.get_searchurl_time(keyword, start_time, end_time)
            yield Request(url=search_url, cookies=random.choice(COOKIES),
                          meta={'search_url': search_url, 'keyword': keyword},
                          callback=self.parse_total_page)
        # Mark the newly crawled keywords: set the is_search flag to 1
        sql3 = "update cauc_keyword_test_copy set is_search = 1 where is_search = 0 and is_delete = 0"
        db.update_operation(conn, sql3)
        db.close_connection(conn)
def start_getweiboinfo(self, response):
    db = MysqlStore()
    conn = db.get_connection()
    sql1 = "select * from t_user_follow where contentstate = 0"
    cursor1 = db.select_operation(conn, sql1)
    sql2 = "select count(*) from t_user_follow where contentstate = 0"
    cursor2 = db.select_operation(conn, sql2)
    count = cursor2.fetchone()
    for i in range(count[0]):  # count[0] is the number of users still to be crawled
        for result in cursor1.fetchmany(1):
            if result[1]:
                mainpageurl = 'http://weibo.com/u/' + str(result[1]) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
                GetWeibopage.data['uid'] = result[1]
                getweibopage = GetWeibopage()
                for page in range(WeiboSpider.page_num):
                    GetWeibopage.data['page'] = page + 1
                    firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
                    yield Request(url=firstloadurl, meta={'cookiejar': response.meta['cookiejar'], 'uid': result[1]}, callback=self.parse_firstload)
                    secondloadurl = mainpageurl + getweibopage.get_secondloadurl()
                    yield Request(url=secondloadurl, meta={'cookiejar': response.meta['cookiejar'], 'uid': result[1]}, callback=self.parse_secondload)
                    thirdloadurl = mainpageurl + getweibopage.get_thirdloadurl()
                    yield Request(url=thirdloadurl, meta={'cookiejar': response.meta['cookiejar'], 'uid': result[1]}, callback=self.parse_thirdload)
def get_userinfo(self, response):
    db = MysqlStore()
    conn = db.get_connection()
    #sql1 = "select * from t_user_follow where infostate = 0 and contentstate = 0"
    sql1 = "select * from t_user_info where imagestate = 1 and imageurl = 1"
    cursor1 = db.select_operation(conn, sql1)
    sql2 = "select count(*) from t_user_follow where infostate = 0 and contentstate = 0"
    cursor2 = db.select_operation(conn, sql2)
    count = cursor2.fetchone()
    for i in range(1):  # debug cap; use range(count[0]) to cover all users
        for result in cursor1.fetchmany(1):
            if result[0]:
                mainpageurl = 'http://weibo.com/u/' + str(result[0]) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
                GetWeibopage.data['uid'] = result[0]  # result[1]
                getweibopage = GetWeibopage()
                GetWeibopage.data['page'] = 1
                firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
                yield Request(url=firstloadurl, meta={'cookiejar': response.meta['cookiejar'], 'uid': result[0]}, callback=self.get_userurl)
def search_from_keywordDB(self, response):
    db = MysqlStore()
    conn = db.get_connection()
    main_url = "http://s.weibo.com/weibo/"
    getsearchpage = GetSearchpage()
    sql1 = "select keyword from cauc_keyword_test_copy where is_search = 0"
    cursor = db.select_operation(conn, sql1)
    for round in range(3):
        for keyword in cursor.fetchall():
            keyword = keyword[0]
            print "this is the unsearched keyword:", keyword
            # Set the is_search flag to 1 for this keyword
            sql2 = "update cauc_keyword_test_copy set is_search = 1 where keyword = '%s'" % keyword
            db.update_operation(conn, sql2)
            search_url = main_url + getsearchpage.get_searchurl(keyword)
            #yield Request(url=search_url, meta={'cookiejar': response.meta['cookiejar'], 'search_url': search_url, 'keyword': keyword}, callback=self.parse_total_page)
        print "current timestamp:", int(time.time())
        # Pause between crawl rounds
        time.sleep(WeiboSpider.settings['KEYWORD_INTERVAL'])
        sql3 = "select keyword from cauc_keyword_test_copy where is_search = 1"
        cursor = db.select_operation(conn, sql3)
        for keyword in cursor.fetchall():
            keyword = keyword[0]
            print "this is the searched keyword:", keyword
            #yield Request(url=search_url, meta={'cookiejar': response.meta['cookiejar'], 'search_url': search_url, 'keyword': keyword}, callback=self.parse_total_page)
    conn.close()
def start_getweibo_info(self, response):
    db = MysqlStore()
    GetWeibopage.data['page'] = 1
    getweibopage = GetWeibopage()
    #for round in range(1):  # number of passes over the database
    conn = db.get_connection()
    # Fetch key persons not yet crawled (is_search = 0) and not deleted
    sql1 = "select user_id from cauc_warning_man a \
            where a.is_search = 0 and a.is_delete = 0"
    cursor1 = db.select_operation(conn, sql1)
    for user_id in cursor1.fetchall():
        user_id = user_id[0]
        logger.info("this is the unsearched user_id:%s", user_id)
        # Get the total number of pages that need to be crawled
        start_time = self.start_time
        end_time = get_current_time('day')
        mainpage_url = "http://weibo.com/" + str(user_id) + "?is_ori=1&is_forward=1&is_text=1&is_pic=1&key_word=&start_time=" + start_time + "&end_time=" + end_time + "&is_search=1&is_searchadv=1&"
        GetWeibopage.data['uid'] = user_id
        thirdload_url = mainpage_url + getweibopage.get_thirdloadurl()
        yield Request(url=thirdload_url, cookies=random.choice(COOKIES), meta={'mainpage_url': mainpage_url, 'uid': user_id, 'is_search': 0}, callback=self.parse_total_page)
    # Fetch warning-list persons already crawled (is_search = 1) and not deleted
    sql2 = "select user_id from cauc_warning_man a \
            where a.is_search = 1 and a.is_delete = 0"
    cursor2 = db.select_operation(conn, sql2)
    for user_id in cursor2.fetchall():
        user_id = user_id[0]
        logger.info("this is the searched user_id:%s", user_id)
        #start_time = get_time_by_interval(int(time.time()), 86400, 'day')  # 1-day window (86400 s): the past day's content
        start_time = get_time_by_interval(int(time.time()), int(self.interval), 'day')  # window of `interval` seconds, converted to days
        end_time = get_current_time('day')
        mainpage_url = "http://weibo.com/" + str(user_id) + "?is_ori=1&is_forward=1&is_text=1&is_pic=1&key_word=&start_time=" + start_time + "&end_time=" + end_time + "&is_search=1&is_searchadv=1&"
        GetWeibopage.data['uid'] = user_id
        thirdload_url = mainpage_url + getweibopage.get_thirdloadurl()
        yield Request(url=thirdload_url, cookies=random.choice(COOKIES), meta={'mainpage_url': mainpage_url, 'uid': user_id, 'is_search': 1}, callback=self.parse_total_page)
    # Mark the newly crawled persons: set the is_search flag to 1
    sql3 = "update cauc_warning_man set is_search = 1 where is_search = 0 and is_delete = 0"
    db.update_operation(conn, sql3)
    db.close_connection(conn)
def search_from_keywordDB(self, response):
    db = MysqlStore()
    main_url = "http://s.weibo.com/weibo/"
    getsearchpage = GetSearchpage()
    for round in range(1):  # number of passes over the database
        conn = db.get_connection()
        # Crawl the keywords with is_search = 0 (not yet crawled)
        sql1 = "select keyword from cauc_keyword_test_copy where is_search = 0"
        cursor = db.select_operation(conn, sql1)
        for keyword in cursor.fetchall():
            keyword = keyword[0]
            logger.info("this is the unsearched keyword:%s", keyword)
            # Set the is_search flag to 1 for this keyword
            sql2 = "update cauc_keyword_test_copy set is_search = 1 where keyword = '%s'" % keyword
            db.update_operation(conn, sql2)
            search_url = main_url + getsearchpage.get_searchurl(keyword)
            yield Request(url=search_url, meta={'cookiejar': response.meta['cookiejar'], 'search_url': search_url, 'keyword': keyword}, callback=self.parse_total_page)
        logger.info("current timestamp:%d", int(time.time()))
        # Pause between crawl rounds (e.g. a 15-minute interval)
        time.sleep(WeiboSpider.settings['KEYWORD_INTERVAL'])
        # Crawl the keywords with is_search = 1 (already crawled)
        sql3 = "select keyword from cauc_keyword_test_copy where is_search = 1"
        cursor = db.select_operation(conn, sql3)
        for keyword in cursor.fetchall():
            keyword = keyword[0]
            logger.info("this is the searched keyword:%s", keyword)
            end_time = get_current_time()
            start_time = get_time_by_interval(int(time.time()), 3600)  # content from the past 3600 s (1 hour)
            search_url = main_url + getsearchpage.get_searchurl_time(keyword, start_time, end_time)
            yield Request(url=search_url, meta={'cookiejar': response.meta['cookiejar'], 'search_url': search_url, 'keyword': keyword}, callback=self.parse_total_page)
        conn.close()
def start_getweibo_info(self, response):
    db = MysqlStore()
    GetWeibopage.data['page'] = 1
    getweibopage = GetWeibopage()
    for round in range(1):  # number of passes over the database
        conn = db.get_connection()
        # Fetch key persons not yet crawled (is_search = 0) and not deleted
        sql1 = "select user_id from cauc_warning_man_test a \
                where a.is_search = 0 and a.is_delete = 0"
        cursor = db.select_operation(conn, sql1)
        for user_id in cursor.fetchall():
            user_id = user_id[0]
            logger.info("this is the unsearched user_id:%s", user_id)
            # Set the is_search flag to 1 for this user
            sql2 = "update cauc_warning_man_test set is_search = 1 where user_id = '%s'" % user_id
            db.update_operation(conn, sql2)
            # Get the total number of pages that need to be crawled
            start_time = self.start_time
            end_time = get_current_time('hour')
            mainpage_url = "http://weibo.com/" + str(user_id) + "?is_ori=1&is_forward=1&is_text=1&is_pic=1&key_word=&start_time=" + start_time + "&end_time=" + end_time + "&is_search=1&is_searchadv=1&"
            GetWeibopage.data['uid'] = user_id
            thirdload_url = mainpage_url + getweibopage.get_thirdloadurl()
            yield Request(url=thirdload_url, meta={'cookiejar': response.meta['cookiejar'], 'mainpage_url': mainpage_url, 'uid': user_id, 'is_search': 0}, callback=self.parse_total_page)
        logger.info("current timestamp:%d", int(time.time()))
        # Pause between crawl rounds (e.g. a 15-minute interval)
        time.sleep(WeiboSpider.settings['WEIBOCONTENT_INTERVAL'])
        # Fetch warning-list persons already crawled (is_search = 1) and not deleted
        sql3 = "select user_id from cauc_warning_man_test a \
                where a.is_search = 1 and a.is_delete = 0"
        cursor = db.select_operation(conn, sql3)
        for user_id in cursor.fetchall():
            user_id = user_id[0]
            logger.info("this is the searched user_id:%s", user_id)
            start_time = get_time_by_interval(int(time.time()), 86400, 'hour')  # 1-day window (86400 s)
            end_time = get_current_time('hour')
            mainpage_url = "http://weibo.com/" + str(user_id) + "?is_ori=1&is_forward=1&is_text=1&is_pic=1&key_word=&start_time=" + start_time + "&end_time=" + end_time + "&is_search=1&is_searchadv=1&"
            GetWeibopage.data['uid'] = user_id
            thirdload_url = mainpage_url + getweibopage.get_thirdloadurl()
            #yield Request(url=thirdload_url, meta={'cookiejar': response.meta['cookiejar'], 'mainpage_url': mainpage_url, 'uid': user_id, 'is_search': 1}, callback=self.parse_total_page)
        conn.close()
def search_from_keywordDB(self, response):
    db = MysqlStore()
    main_url = "http://s.weibo.com/weibo/"
    getsearchpage = GetSearchpage()
    for round in range(1):  # number of passes over the database
        conn = db.get_connection()
        # Select keywords with is_search = 0 (not yet crawled)
        sql1 = "select keyword from cauc_keyword_test_copy where is_search = 0 and is_delete = 0"
        cursor1 = db.select_operation(conn, sql1)
        # Crawl the keywords with is_search = 0
        for keyword in cursor1.fetchall():
            keyword = keyword[0]
            logger.info("this is the unsearched keyword:%s", keyword)
            search_url = main_url + getsearchpage.get_searchurl(keyword)
            yield Request(url=search_url, cookies=random.choice(COOKIES), meta={'search_url': search_url, 'keyword': keyword}, callback=self.parse_total_page)
        # Select keywords with is_search = 1 (already crawled)
        sql2 = "select keyword from cauc_keyword_test_copy where is_search = 1 and is_delete = 0"
        cursor2 = db.select_operation(conn, sql2)
        # Crawl the keywords with is_search = 1
        for keyword in cursor2.fetchall():
            keyword = keyword[0]
            logger.info("this is the searched keyword:%s", keyword)
            end_time = get_current_time()
            #start_time = get_time_by_interval(int(time.time()), 3600)  # content from the past 3600 s (1 hour)
            start_time = get_time_by_interval(int(time.time()), int(self.interval))  # content from the past `interval` seconds
            search_url = main_url + getsearchpage.get_searchurl_time(keyword, start_time, end_time)
            yield Request(url=search_url, cookies=random.choice(COOKIES), meta={'search_url': search_url, 'keyword': keyword}, callback=self.parse_total_page)
        # Mark the newly crawled keywords: set the is_search flag to 1
        sql3 = "update cauc_keyword_test_copy set is_search = 1 where is_search = 0 and is_delete = 0"
        db.update_operation(conn, sql3)
        db.close_connection(conn)
def start_getweibo_info(self, response):
    db = MysqlStore()
    GetWeibopage.data['page'] = 1
    getweibopage = GetWeibopage()
    for round in range(1):  # number of passes over the database
        conn = db.get_connection()
        # Fetch key persons not yet crawled (is_search = 0) and not deleted
        sql1 = "select user_id from cauc_black_man_test a \
                where a.is_search = 0 and a.is_delete = 0"
        cursor = db.select_operation(conn, sql1)
        for user_id in cursor.fetchall():
            user_id = user_id[0]
            logger.info("this is the unsearched user_id:%s", user_id)
            # Set the is_search flag to 1 for this user
            sql2 = "update cauc_black_man_test set is_search = 1 where user_id = '%s'" % user_id
            db.update_operation(conn, sql2)
            # Get the total number of pages that need to be crawled
            start_time = self.start_time
            end_time = get_current_time('hour')
            mainpage_url = "http://weibo.com/" + str(user_id) + "?is_ori=1&is_forward=1&is_text=1&is_pic=1&key_word=&start_time=" + start_time + "&end_time=" + end_time + "&is_search=1&is_searchadv=1&"
            GetWeibopage.data['uid'] = user_id
            thirdload_url = mainpage_url + getweibopage.get_thirdloadurl()
            yield Request(url=thirdload_url, meta={'cookiejar': response.meta['cookiejar'], 'mainpage_url': mainpage_url, 'uid': user_id, 'is_search': 0}, callback=self.parse_total_page)
        logger.info("current timestamp:%d", int(time.time()))
        # Pause between crawl rounds (e.g. a 15-minute interval)
        time.sleep(WeiboSpider.settings['FRIENDCIRCAL_INTERVAL'])
        # Fetch key persons already crawled (is_search = 1) and not deleted
        sql3 = "select user_id from cauc_black_man_test a \
                where a.is_search = 1 and a.is_delete = 0"
        cursor = db.select_operation(conn, sql3)
        for user_id in cursor.fetchall():
            user_id = user_id[0]
            logger.info("this is the searched user_id:%s", user_id)
            start_time = get_time_by_interval(int(time.time()), 86400, 'hour')  # 1-day window (86400 s)
            end_time = get_current_time('hour')
            mainpage_url = "http://weibo.com/" + str(user_id) + "?is_ori=1&is_forward=1&is_text=1&is_pic=1&key_word=&start_time=" + start_time + "&end_time=" + end_time + "&is_search=1&is_searchadv=1&"
            GetWeibopage.data['uid'] = user_id
            thirdload_url = mainpage_url + getweibopage.get_thirdloadurl()
            #yield Request(url=thirdload_url, meta={'cookiejar': response.meta['cookiejar'], 'mainpage_url': mainpage_url, 'uid': user_id, 'is_search': 1}, callback=self.parse_total_page)
        conn.close()
def search_from_keywordDB(self, response):
    db = MysqlStore()
    main_url = "http://s.weibo.com/weibo/"
    getsearchpage = GetSearchpage()
    for round in range(1):  # number of passes over the database
        conn = db.get_connection()
        # Crawl the keywords with is_search = 0 (not yet crawled)
        sql1 = "select keyword from cauc_keyword_test_copy where is_search = 0"
        cursor = db.select_operation(conn, sql1)
        for keyword in cursor.fetchall():
            keyword = keyword[0]
            logger.info("this is the unsearched keyword:%s", keyword)
            # Set the is_search flag to 1 for this keyword
            sql2 = "update cauc_keyword_test_copy set is_search = 1 where keyword = '%s'" % keyword
            db.update_operation(conn, sql2)
            search_url = main_url + getsearchpage.get_searchurl(keyword)
            yield Request(url=search_url, meta={'cookiejar': response.meta['cookiejar'], 'search_url': search_url, 'keyword': keyword}, callback=self.parse_total_page)
        logger.info("current timestamp:%d", int(time.time()))
        # Pause between crawl rounds (e.g. a 15-minute interval)
        time.sleep(WeiboSpider.settings['KEYWORD_INTERVAL'])
        # Crawl the keywords with is_search = 1 (already crawled)
        sql3 = "select keyword from cauc_keyword_test_copy where is_search = 1"
        cursor = db.select_operation(conn, sql3)
        for keyword in cursor.fetchall():
            keyword = keyword[0]
            logger.info("this is the searched keyword:%s", keyword)
            end_time = get_current_time()
            start_time = get_time_by_interval(int(time.time()), 3600)  # content from the past 3600 s (1 hour)
            search_url = main_url + getsearchpage.get_searchurl_time(keyword, start_time, end_time)
            yield Request(url=search_url, meta={'cookiejar': response.meta['cookiejar'], 'search_url': search_url, 'keyword': keyword}, callback=self.parse_total_page)
        conn.close()
# Quick standalone smoke test of the MysqlStore fetch interface
from datamysql import MysqlStore

d = MysqlStore()
conn = d.get_connection()
sql = "select * from t_user_follow"
cursor = d.select_operation(conn, sql)
for i in range(100):
    print 'i:', i
    for result in cursor.fetchmany(5):
        if result[1]:
            print 'hhhhhhh'  # debug marker: row has a non-empty second column
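# MysqlStore (from datamysql) is assumed throughout but never shown. Its
# interface, read off the call sites above: get_connection, select_operation
# (run a SELECT and hand back the cursor), update_operation (execute + commit),
# close_connection. A minimal sketch under those assumptions; the connection
# settings are placeholders:
import MySQLdb

class MysqlStore(object):
    def get_connection(self):
        return MySQLdb.connect(host='localhost', user='root', passwd='',
                               db='weibo', charset='utf8')

    def select_operation(self, conn, sql):
        cursor = conn.cursor()
        cursor.execute(sql)
        return cursor

    def update_operation(self, conn, sql):
        cursor = conn.cursor()
        cursor.execute(sql)
        conn.commit()
        cursor.close()

    def close_connection(self, conn, cursor=None):
        # Called both as close_connection(conn) and close_connection(conn, cursor).
        if cursor is not None:
            cursor.close()
        conn.close()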