def crawler_twitter_init():
    config = read_config()
    print('<-----Initializing----->')
    twitter_crawler_queue = RedisQueue(name='twitter_users', redis_config=config['redis_config'])
    if twitter_crawler_queue.qsize() > 0:
        print('<-----%s tasks still pending----->' % twitter_crawler_queue.qsize())
    if twitter_crawler_queue.empty():
        with open(os.path.abspath('twitter_user_ids.json'), 'r') as f:
            user_ids = json.load(f)
        for id in user_ids['ids']:
            twitter_crawler_queue.put(id)
        print('<-----%s tasks queued----->' % twitter_crawler_queue.qsize())
    print('<-----twitter init complete----->')

def crawler_init(name='twitter'):
    print('<-----Initializing----->')
    if name == 'twitter':
        twitter_crawler_queue = RedisQueue(name='twitter')
        if twitter_crawler_queue.qsize() == 0:
            with open(os.path.abspath('twitter_user_ids.json'), 'r') as f:
                user_ids = json.load(f)
            for id in user_ids['ids']:
                twitter_crawler_queue.put(id)
    else:
        facebook_crawler_queue = RedisQueue(name='facebook')
        if facebook_crawler_queue.qsize() == 0:
            with open(os.path.abspath('facebook_user_ids.json'), 'r') as f:
                user_ids = json.load(f)
            for id in user_ids['ids']:
                facebook_crawler_queue.put(id)
    print('<-----init succeeded----->')

def update_facebook_users_count(self, crawler):
    # Updates the likes_num / fan_count fields of facebook user accounts.
    print('[Crawling facebook user infos]')
    db = self.create_mongo_conn(db='FaceBook', collection='facebook')
    weipa_count = 1  # empty-queue retry counter
    while True:
        try:
            facebook_users_queue = RedisQueue(
                name='facebook_users',
                redis_config=self.app_config['redis_config'])
            print('[=== %s ids left to crawl ===]' % facebook_users_queue.qsize())
            if facebook_users_queue.empty():
                if weipa_count >= 3:
                    print('<-----user crawl finished----->')
                    break
                else:
                    weipa_count += 1
                    print('[==Retry:%s==]' % (weipa_count - 1))
                    time.sleep(5)
                    continue
            id = facebook_users_queue.get()
            tweet = db.find_one({"id": id})
            print(tweet)
            if not tweet:
                continue
            if tweet['link'].endswith('/'):
                url = tweet['link'] + 'community/'
            else:
                url = tweet['link'] + '/' + 'community/'
            content = crawler.crawler_user_likes(url)
            for item in content:
                if "isLoginStatus" in item:
                    update_doc = db.find_one_and_update(
                        {"id": id},
                        {'$set': {
                            'likes_num': item['like_count'],
                            'fan_count': item['fan_count'],
                            "update_time": datetime.now(),
                            "isLoginStatus": item["isLoginStatus"]
                        }},
                        return_document=ReturnDocument.AFTER)
                else:
                    update_doc = db.find_one_and_update(
                        {"id": id},
                        {'$set': {
                            'likes_num': item['like_count'],
                            'fan_count': item['fan_count'],
                            "update_time": datetime.now(),
                        }},
                        return_document=ReturnDocument.AFTER)
                print('Updated document %s' % update_doc['id'])
            weipa_count = 1
        except Exception as e:
            raise e

def check_queue_isEmpty():
    try:
        queue = RedisQueue(name='twitter_replay', redis_config=app_config['redis_config'])
        if queue.qsize() > 0:
            twitter_every_day_update_count_job()
        else:
            print('[no tasks!!]')
            return
    except Exception as e:
        print(e)
        return

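# check_queue_isEmpty() only triggers the daily-update job when the twitter_replay
# queue still has work, which suggests it is polled on a schedule. A minimal sketch
# using the third-party `schedule` package (an assumption: the project may use cron
# or another scheduler, and the 30-minute interval is hypothetical):
import time
import schedule

schedule.every(30).minutes.do(check_queue_isEmpty)

while True:
    schedule.run_pending()
    time.sleep(60)
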
def crawler_init(name='twitter'):
    print('<-----Initializing----->')
    config = read_config()
    if name == 'twitter':
        twitter_crawler_queue = RedisQueue(name='twitter', redis_config=config['redis_config'])
        if twitter_crawler_queue.qsize() > 0:
            print('<-----%s tasks still pending----->' % twitter_crawler_queue.qsize())
        if twitter_crawler_queue.empty():
            with open(os.path.abspath('twitter_user_ids.json'), 'r') as f:
                user_ids = json.load(f)
            for id in user_ids['ids']:
                twitter_crawler_queue.put(id)
            print('<-----%s tasks queued----->' % twitter_crawler_queue.qsize())
        print('<-----twitter init complete----->')
    else:
        facebook_crawler_queue = RedisQueue(
            name='facebook', redis_config=config['redis_config'])
        if facebook_crawler_queue.qsize() > 0:
            print('<-----%s tasks still pending----->' % facebook_crawler_queue.qsize())
        if facebook_crawler_queue.empty():
            with open(os.path.abspath('facebook_user_ids.json'), 'r') as f:
                user_ids = json.load(f)
            for id in user_ids['ids']:
                facebook_crawler_queue.put(id)
            print('<-----%s tasks queued----->' % facebook_crawler_queue.qsize())
        print('<-----facebook init complete----->')

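# read_config() is called by the init functions above but not defined in these
# snippets. A minimal sketch, assuming it simply loads config.json the same way
# completion_twitter_text() does below; the redis_config key layout shown in the
# comment is an illustrative assumption, not the project's actual config:
import json
import os

def read_config(path='config.json'):
    # Load the application config used to build RedisQueue instances.
    with open(os.path.abspath(path), 'r') as f:
        return json.load(f)

# Assumed config.json shape (hypothetical values):
# {
#     "redis_config":  {"host": "127.0.0.1", "port": 6379, "db": 0, "password": null},
#     "redis_config2": {"host": "127.0.0.1", "port": 6379, "db": 1, "password": null}
# }
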
def completion_twitter_text(conn):
    # Re-fetch the full text of truncated tweets for every user id left on the
    # twitter_test_ids queue, then report each finished id back through `conn`.
    current = 0
    with open(os.path.abspath('config.json'), 'r') as f:
        app_config = json.load(f)
    twitter_crawler_queue = RedisQueue(name='twitter_test_ids',
                                       redis_config=app_config['redis_config2'])
    while True:
        try:
            if twitter_crawler_queue.empty():
                print('all users processed')
                break
            id = twitter_crawler_queue.get()
            print('[=== got id %s, %s left to crawl ===]' % (id, twitter_crawler_queue.qsize()))
            # (an earlier variant built monthly date-ranged URLs per category)
            es_url = 'http://narnia.idatage.com/stq/api/v1/rowlet/findEsTextByUserIdOrKeywords?startDate=2016-12-30&endDate=2018-02-12&category=tw&ids=%s' % (id,)
            es_body = requests.get(es_url)
            es_body_tw = json.loads(es_body.text)['tw']
            print(len(es_body_tw))

            def handle(x):
                # Flatten the ES hit and build the tweet URL from screen_name and id_str.
                x['_source']['index_name'] = x['_index']
                x['_source']['type_name'] = x['_type']
                x['_source']['id'] = x['_id']
                x['_source']['url'] = 'https://twitter.com/%s/status/%s' % (
                    x['_source']['user']['screen_name'], x['_source']['id_str'])
                return x['_source']

            es_body_tw_urls = list(map(handle, filter(lambda x: not x['_source']['truncated'], es_body_tw)))
            if len(es_body_tw_urls) > 200:
                # Split into batches of 200 and fetch them in parallel.
                pool = mp.Pool()
                res = pool.map(asynchronous_request,
                               (es_body_tw_urls[i:i + 200] for i in range(0, len(es_body_tw_urls), 200)))
                print('updated user %s' % id)
            elif 0 < len(es_body_tw_urls) <= 200:
                asynchronous_request(ops=es_body_tw_urls)
                print('updated user %s' % id)
            else:
                current += 1
                print('user %s needs no update' % id)
                print('count: %s' % current)
            conn.send(id)
        except Exception as e:
            current = 0
            raise e

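# completion_twitter_text() reports each finished user id through `conn`, which
# suggests it is meant to run in a worker process connected by a multiprocessing
# Pipe. A minimal driver sketch under that assumption (the real entry point is
# not shown in these snippets):
import multiprocessing as mp

if __name__ == '__main__':
    parent_conn, child_conn = mp.Pipe()
    worker = mp.Process(target=completion_twitter_text, args=(child_conn,))
    worker.start()
    # Print progress reports until the worker exits and the pipe is drained.
    while worker.is_alive() or parent_conn.poll():
        if parent_conn.poll(1):
            print('finished user: %s' % parent_conn.recv())
    worker.join()
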
def update_twitter_users_count(self, crawler):
    # Updates the list / moment / follower / statuses counts of twitter users.
    print('[Crawling twitter user infos]')
    db = self.create_mongo_conn(db='Twitter', collection='twitter')
    weipa_count = 1  # empty-queue retry counter
    while True:
        try:
            twitter_users_queue = RedisQueue(
                name='twitter_users',
                redis_config=self.app_config['redis_config'])
            print('[=== %s ids left to crawl ===]' % twitter_users_queue.qsize())
            if twitter_users_queue.empty():
                if weipa_count >= 3:
                    print('<-----user crawl finished----->')
                    break
                else:
                    weipa_count += 1
                    print('[==Retry:%s==]' % (weipa_count - 1))
                    time.sleep(5)
                    continue
            id = twitter_users_queue.get()
            tweet = db.find_one({"id_str": id})
            tweet_count, flowing_count, followers_count, favorites_count, list_count, moment_count = \
                crawler.crawler_list_count(tweet["screen_name"], user_id=id)
            print(tweet_count, flowing_count, followers_count, favorites_count, list_count, moment_count)
            after_doc = db.find_one_and_update(
                {"id_str": tweet['id_str']},
                {"$set": {
                    "list_num": list_count,
                    "moment_num": moment_count,
                    "followers_count": followers_count,
                    "friends_count": flowing_count,
                    "statuses_count": tweet_count,
                    "favourites_count": favorites_count,
                    "update_time": datetime.now(),
                }},
                return_document=ReturnDocument.AFTER)
            print('Updated document %s' % after_doc['_id'])
            weipa_count = 1
        except Exception as e:
            raise e

import sys, os, json

sys.path.append(".")
from src.redis_helper import RedisQueue

# Quick debugging script: drain the twitter queue, printing its size and each item.
if __name__ == '__main__':
    twitter_crawler_queue = RedisQueue(name='twitter')
    while True:
        print(twitter_crawler_queue.qsize())
        print(twitter_crawler_queue.get())

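# All of these snippets depend on src.redis_helper.RedisQueue, whose implementation
# is not included here. A minimal sketch of a Redis-backed FIFO queue with the same
# interface (name, redis_config, qsize/empty/put/get), assuming redis-py and
# JSON-serialised items; the real helper may differ:
import json
import redis

class RedisQueue(object):
    def __init__(self, name, namespace='queue', redis_config=None):
        # One Redis list per queue, keyed by namespace:name.
        self.key = '%s:%s' % (namespace, name)
        self.db = redis.StrictRedis(**(redis_config or {}))

    def qsize(self):
        return self.db.llen(self.key)

    def empty(self):
        return self.qsize() == 0

    def put(self, item):
        self.db.rpush(self.key, json.dumps(item))

    def get(self, block=True, timeout=None):
        # Pop from the head; optionally block until an item arrives.
        if block:
            item = self.db.blpop(self.key, timeout=timeout)
            item = item[1] if item else None
        else:
            item = self.db.lpop(self.key)
        return json.loads(item) if item else None
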
def crawler_tweets_replay_count(self, crawler, history=False):
    # Updates the replay_count / retweet_count / favorite_count fields in the tweets collection.
    print('<===== starting tweet_replay crawl =====>')
    db = self.create_mongo_conn()
    crawler_tweet_replay_queue = RedisQueue(
        name='twitter_replay',
        redis_config=self.app_config['redis_config'])
    weipa_count = 1
    err_count = 1
    if not history:
        es = Espusher()
    while True:
        try:
            print('[=== %s items left to crawl ===]' % crawler_tweet_replay_queue.qsize())
            # (The empty-queue retry/exit logic is disabled here; an earlier version
            # broke out of the loop after three consecutive empty polls, as the
            # user-count crawlers above still do.)
            urls = crawler_tweet_replay_queue.get()
            print("got %s urls" % len(urls))
            content = crawler.crawler_replay_num(urls)
            for item in content:
                print(item)
                if history:
                    update_doc = db.update_many(
                        {"id_str": item['url'].split('/')[-1], 'site': 'twitter'},
                        {'$set': {
                            'replay_count': item['reply_count'],
                            'retweet_count': item['retweet_count'],
                            'favorite_count': item['favorite_count'],
                            "update_status": True
                        }})
                    print('updated %s documents' % update_doc.modified_count)
                else:
                    # Move the finished tweet out of Mongo and push it to Elasticsearch.
                    data = db.find_one_and_delete(
                        {"id_str": item['url'].split('/')[-1], 'site': 'twitter'})
                    data['replay_count'] = item['reply_count']
                    data['favorite_count'] = item['favorite_count']
                    data['retweet_count'] = item['retweet_count']
                    es.twitter_pusher(data)
            weipa_count = 1
            err_count = 1
        except Exception as e:
            print(e)
            continue

def crawler_reactions(self, crawler, history=False):
    # Updates the comment / like / share counts in the facebook posts collection.
    print('<===== starting facebook_reactions crawl =====>')
    db = self.create_mongo_conn()
    crawler_reactions_queue = RedisQueue(
        name='facebook_reactions',
        redis_config=self.app_config['redis_config'])
    weipa_count = 1
    err_count = 1
    if not history:
        es = Espusher()
    while True:
        try:
            print('[=== %s items left to crawl ===]' % crawler_reactions_queue.qsize())
            # (The empty-queue retry/exit logic is disabled here, as in
            # crawler_tweets_replay_count above.)
            # Queue items look like {"url": 'https://facebook.com%s' % permalink_url, 'id': _id}.
            urls = crawler_reactions_queue.get()
            content = crawler.crawler_reactions_nums(urls)
            if not content:
                continue
            for item in content:
                print(item['reactions'])
                if not item['reactions']:
                    print(item)
                else:
                    if history:
                        print(objectid.ObjectId(item['url']['id']))
                        update_doc = db.find_one_and_update(
                            {"_id": objectid.ObjectId(item['url']['id'])},
                            {'$set': {
                                'comment_num': item['reactions']['comment_count'],
                                'likes_num': item['reactions']['likes_count'],
                                'share_count': item['reactions']["share_count"],
                                "update_status": True
                            }},
                            return_document=ReturnDocument.AFTER)
                        if update_doc is not None:
                            print('updated %s' % update_doc['_id'])
                    else:
                        # Move the finished post out of Mongo and push it to Elasticsearch.
                        data = db.find_one_and_delete(
                            {'_id': objectid.ObjectId(item['url']['id'])})
                        data['comment_num'] = item['reactions']['comment_count']
                        data['likes_num'] = item['reactions']['likes_count']
                        data['share_count'] = item['reactions']["share_count"]
                        es.facebook_pusher(data)
            weipa_count = 1
            err_count = 1
        except Exception as e:
            print(e)
            continue

def crawler_tweets(self, crawler, site='facebook', deadtime='2017-1-1'):
    print('<-----starting post crawl----->')
    weipa_count = 1  # empty-queue retry counter
    if site == 'twitter':
        twitter_crawler_queue = RedisQueue(
            name='twitter', redis_config=self.app_config['redis_config'])
        twitter_crawler_error_queue = RedisQueue(
            name='twitter_error', redis_config=self.app_config['redis_config'])
        while True:
            if twitter_crawler_error_queue.qsize() > 0:
                # Retry items that failed earlier, resuming from the recorded max id.
                err_item = twitter_crawler_error_queue.get()
                print('retrieved error item: %s' % err_item)
                current_max_id = err_item['current_max_id']
                id = err_item['user_id']
                crawler.fetch_user_tweets(user_id=id,
                                          current_max_id=current_max_id,
                                          deadline=deadtime)
            else:
                print('\n')
                print('[=== Info: %s, %s ids left to crawl ===]' %
                      (datetime.now(), twitter_crawler_queue.qsize()))
                if twitter_crawler_queue.empty():
                    if weipa_count >= 3:
                        print('<-----post crawl finished----->')
                        break
                    else:
                        weipa_count += 1
                        print('[==Retry:%s==]' % (weipa_count - 1))
                        time.sleep(5)
                        continue
                id = twitter_crawler_queue.get()
                crawler.fetch_user_tweets(user_id=id, deadline=deadtime)
                weipa_count = 1
    else:
        facebook_crawler_queue = RedisQueue(
            name='facebook', redis_config=self.app_config['redis_config'])
        facebook_crawler_error_queue = RedisQueue(
            name='facebook_error', redis_config=self.app_config['redis_config'])
        db = self.create_mongo_conn(db='FaceBook', collection='facebook')
        while True:
            if facebook_crawler_error_queue.qsize() > 0:
                err_item = facebook_crawler_error_queue.get()
                print('retrieved error item: %s' % err_item)
                id = err_item['id']
                url = err_item['url']
                if url:
                    crawler.fetch_user_tweets(id=id, urls=url, deadline=deadtime)
            else:
                print('\n')
                print('[=== Info: %s, %s ids left to crawl ===]' %
                      (datetime.now(), facebook_crawler_queue.qsize()))
                if facebook_crawler_queue.empty():
                    if weipa_count >= 3:
                        print('<-----post crawl finished----->')
                        break
                    else:
                        weipa_count += 1
                        print('[==Retry:%s==]' % (weipa_count - 1))
                        time.sleep(5)
                        continue
                id = facebook_crawler_queue.get()
                print(id)
                doc = db.find_one({"id": str(id)})
                crawler.fetch_user_tweets(id=id,
                                          urls=doc['link'] + 'posts/',
                                          deadline=deadtime)
                weipa_count = 1