Example No. 1
def crawler_init(name='twitter'):
    print('<----- Initializing ----->')
    config = read_config()
    if name == 'twitter':
        twitter_crawler_queue = RedisQueue(name='twitter',
                                           redis_config=config['redis_config'])
        if twitter_crawler_queue.qsize() > 0:
            print('<----- %s tasks still unfinished ----->' % twitter_crawler_queue.qsize())
        if twitter_crawler_queue.empty():
            with open(os.path.abspath('twitter_user_ids.json'), 'r') as f:
                user_ids = json.load(f)
                for id in user_ids['ids']:
                    twitter_crawler_queue.put(id)
                    # print(id)
        print('<----- %s tasks to complete ----->' % twitter_crawler_queue.qsize())
        print('<----- twitter initialization complete ----->')
    else:
        facebook_crawler_queue = RedisQueue(
            name='facebook', redis_config=config['redis_config'])
        if facebook_crawler_queue.qsize() > 0:
            print('<----- %s tasks still unfinished ----->' % facebook_crawler_queue.qsize())
        if facebook_crawler_queue.empty():
            with open(os.path.abspath('facebook_user_ids.json'), 'r') as f:
                user_ids = json.load(f)
                for id in user_ids['ids']:
                    facebook_crawler_queue.put(id)
        print('<----- %s tasks to complete ----->' % facebook_crawler_queue.qsize())
        print('<----- facebook initialization complete ----->')
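Example No. 1 depends on two inputs that are not part of this listing: the read_config() helper and the twitter_user_ids.json / facebook_user_ids.json seed files. Below is a minimal sketch of what they are assumed to look like, modelled on the explicit config.json load in Example No. 9; field names other than redis_config and ids are illustrative, not taken from the project.

import json
import os


def read_config():
    # Assumed helper: load config.json from the working directory,
    # mirroring the json.load in Example No. 9.
    # Expected shape (illustrative):
    #   {"redis_config": {"host": "127.0.0.1", "port": 6379, "db": 0}}
    with open(os.path.abspath('config.json'), 'r') as f:
        return json.load(f)


# The *_user_ids.json seed files are assumed to be shaped like:
#   {"ids": ["1234567890", "987654321"]}
# i.e. a single "ids" list that crawler_init() pushes item by item onto the queue.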
Example No. 2
    def __init__(self, args={}):
        #Base.__init__()
        super(TWitter, self).__init__()

        # The above shows two ways of calling the base-class constructor
        self.__consumer_key = 'c58jPuNxqLex5QttLkoVF621T'
        self.__consumer_secret = "qU2EfulVxZ9a9mSPVm0bww4HXDyC8qk4a2gQrq7bgy4dKOqfup"
        self.__access_token = "930249938012798978-BJCWSdIgciyVZ0IUKLXVXLlc1A3D2my"
        self.__access_secret = "HjDrf1nvRDZIT5NSXioGVeOeZoev26Ibi08hCBQMhMof4"
        super(Base, self).__init__(self.__consumer_key, self.__consumer_secret,
                                   self.__access_token, self.__access_secret)

        auth = tweepy.OAuthHandler(self.__consumer_key, self.__consumer_secret)
        auth.set_access_token(self.__access_token, self.__access_secret)
        self.__flag = 'twitter'
        self.api = tweepy.API(auth)
        self.args = args
        # self.crawler_list_queue = RedisQueue(name='twitter_list',redis_config=redis_config)
        self.crawler_tweets_err_queue = RedisQueue(
            name='twitter_error', redis_config=self.app_config['redis_config'])
        self.crawler_replay_queue = RedisQueue(
            name='twitter_replay',
            redis_config=self.app_config['redis_config'])
        self.crawler_tweets_queue = RedisQueue(
            name='twitter', redis_config=self.app_config['redis_config'])
        self.twitter_users_queue = RedisQueue(
            name='twitter_users', redis_config=self.app_config['redis_config'])
Example No. 3
def clear_queue(site='twitter'):
    config = read_config()
    if site == 'twitter':
        twitter_crawler_queue = RedisQueue(name='twitter',
                                           redis_config=config['redis_config'])
        twitter_crawler_queue.clear()
    else:
        facebook_crawler_queue = RedisQueue(
            name='facebook', redis_config=config['redis_config'])
        facebook_crawler_queue.clear()
Example No. 4
 def __init__(self,args={}):
     #Base.__init__()
     super(FaceBook,self).__init__()
     # The above shows two ways of calling the base-class constructor
     self.__username = "******"
     self.__password = "******"
     self.__access_token = "EAACEdEose0cBAAkdhoyXkFejburMPqbr7b773AxZCs7b1BORK7V2gUxVlmKkYydZCZBuyy4UcZA0QxThf7ii0tbDnsiCSzwFJ9DZAeGTcUCsGHQPTk7hPamWAZA2mN6IBjNXDsDQwwzrwet4h1piWTP5fuBnKjZCGm8ZCyXjCEWS7apZCoo1ZAuO5OBfoc9IDCgjSDfvc3pWKWGEPcICelHO456OUnZAxeDpLUZD"
     self.__flag = 'facebook'
     self.args = args
     self.crawler_tweets_err_queue = RedisQueue(name='facebook_error', redis_config=self.app_config['redis_config'])
     self.crawler_reactions_queue = RedisQueue(name='facebook_reactions',redis_config=self.app_config['redis_config'])
     self.crawler_tweets_queue = RedisQueue(name='facebook',redis_config=self.app_config['redis_config'])
     self.facebook_users_queue = RedisQueue(name='facebook_users', redis_config=self.app_config['redis_config'])
Example No. 5
def crawler_init(name='twitter'):
    print('<----- Initializing ----->')
    if name == 'twitter':
        twitter_crawler_queue = RedisQueue(name='twitter')
        if twitter_crawler_queue.qsize() == 0:
            with open(os.path.abspath('twitter_user_ids.json'), 'r') as f:
                user_ids = json.load(f)
                for id in user_ids['ids']:
                    twitter_crawler_queue.put(id)
    else:
        facebook_crawler_queue = RedisQueue(name='facebook')
        if facebook_crawler_queue.qsize() == 0:
            with open(os.path.abspath('facebook_user_ids.json'), 'r') as f:
                user_ids = json.load(f)
                for id in user_ids['ids']:
                    facebook_crawler_queue.put(id)
    print('<----- Initialization succeeded ----->')
Example No. 6
    def update_facebook_users_count(self, crawler):  # like_num field of facebook user accounts
        print('[Crawling facebook usersInfos]')
        db = self.create_mongo_conn(db='FaceBook', collection='facebook')
        weipa_count = 1
        while True:
            try:
                facebook_users_queue = RedisQueue(
                    name='facebook_users',
                    redis_config=self.app_config['redis_config'])
                print('[=== Number of ids not yet crawled: %s ===]' % facebook_users_queue.qsize())
                if facebook_users_queue.empty():
                    if weipa_count >= 3:
                        print('<----- User crawl complete ----->')
                        break
                    else:
                        weipa_count += 1
                        print('[==Retry:%s==]' % (weipa_count - 1))
                        time.sleep(5)
                        continue

                id = facebook_users_queue.get()
                tweet = db.find_one({"id": id})
                print(tweet)
                if not tweet:
                    continue
                if tweet['link'].endswith('/'):
                    url = tweet['link'] + 'community/'
                else:
                    url = tweet['link'] + '/' + 'community/'
                content = crawler.crawler_user_likes(url)
                for item in content:
                    # print(item)
                    if "isLoginStatus" in item:
                        update_doc = db.find_one_and_update(
                            {"id": id}, {
                                '$set': {
                                    'likes_num': item['like_count'],
                                    'fan_count': item['fan_count'],
                                    "update_time": datetime.now(),
                                    "isLoginStatus": item["isLoginStatus"]
                                }
                            },
                            return_document=ReturnDocument.AFTER)
                        print('Successfully updated document %s' % update_doc['id'])
                    else:
                        update_doc = db.find_one_and_update(
                            {"id": id}, {
                                '$set': {
                                    'likes_num': item['like_count'],
                                    'fan_count': item['fan_count'],
                                    "update_time": datetime.now(),
                                }
                            },
                            return_document=ReturnDocument.AFTER)
                        print('Successfully updated document %s' % update_doc['id'])
                weipa_count = 1
            except Exception as e:
                raise e
Example No. 7
def check_queue_isEmpty():
    try:
        queue = RedisQueue(name='twitter_replay',
                           redis_config=app_config['redis_config'])
        if queue.qsize() > 0:
            twitter_every_day_update_count_job()
        else:
            print('[No tasks!!]')
        return
    except Exception as e:
        print(e)
        return
Example No. 8
def crawler_twitter_init():
    config = read_config()
    print('<----- Initializing ----->')
    twitter_crawler_queue = RedisQueue(name='twitter_users',
                                       redis_config=config['redis_config'])
    if twitter_crawler_queue.qsize() > 0:
        print('<----- %s tasks still unfinished ----->' % twitter_crawler_queue.qsize())
    if twitter_crawler_queue.empty():
        with open(os.path.abspath('twitter_user_ids.json'), 'r') as f:
            user_ids = json.load(f)
            for id in user_ids['ids']:
                twitter_crawler_queue.put(id)
    print('<----- %s tasks to complete ----->' % twitter_crawler_queue.qsize())
    print('<----- twitter initialization complete ----->')
Example No. 9
def completion_twitter_text(conn):
    # with open('twitter_user_ids.json','r') as f:
    #     ids = json.load(f)
    current = 0
    # for id in ids['ids']:
    with open(os.path.abspath('config.json'), 'r') as f:
        app_config = json.load(f)
    twitter_crawler_queue = RedisQueue(name='twitter_test_ids', redis_config=app_config['redis_config2'])
    while True:
        try:
            # print(id)
            if twitter_crawler_queue.empty():
                print('All users have been processed')
                break
            id = twitter_crawler_queue.get()
            print('[=== Fetched ID %s, %s items still to crawl ===]' % (id, twitter_crawler_queue.qsize()))
            es_url = 'http://narnia.idatage.com/stq/api/v1/rowlet/findEsTextByUserIdOrKeywords?startDate=2016-12-30&endDate=2018-02-12&category=tw&ids=%s' % (id,)  # if i in [1,2,3,4,5,6,7,8] else 'http://narnia.idatage.com/stq/api/v1/rowlet/findEsTextByUserIdOrKeywords?startDate=2017-%s-01&endDate=2017-%s-01&category=%s&ids=%s' % (i,i+1,category,id)
            es_body = requests.get(es_url)
            # print('取出的内容为:')
            # print(es_body.text)
            es_body_tw = json.loads(es_body.text)['tw']
            print(len(es_body_tw))
            def handle(x):
                # print(x)
                x['_source']['index_name'] = x['_index']
                x['_source']['type_name'] = x['_type']
                x['_source']['id'] = x['_id']
                x['_source']['url'] = 'https://twitter.com/%s/status/%s' % (x['_source']['user']['screen_name'], x['_source']['id_str'])
                return x['_source']
            es_body_tw_urls = list(map(handle, filter(lambda x: not x['_source']['truncated'], es_body_tw)))
            # print(es_body_tw_urls)
            if len(es_body_tw_urls) >= 200:
                pool = mp.Pool()
                res = pool.map(asynchronous_request, (es_body_tw_urls[i:i + 200] for i in range(0, len(es_body_tw_urls), 200)))
                # current += 1
                print('Updated user %s' % id)
            elif 0 < len(es_body_tw_urls) < 200:
                asynchronous_request(ops=es_body_tw_urls)
                # current += 1
                print('Updated user %s' % id)
                # print('Processed %s so far' % current)
            else:
                current += 1
                print('User %s needs no update' % id)
                print('Processed %s so far' % current)
            conn.send(id)
        except Exception as e:
            current = 0
            # print(e)
            raise e
Example No. 10
    def update_twitter_users_count(self, crawler):  # list field of twitter users
        # with open(os.path.abspath('twitter_user_ids.json'), 'r') as f:
        #     user_ids = json.load(f)
        # count=1;
        print('[Crawling twitter usersInfos]')
        db = self.create_mongo_conn(db='Twitter', collection='twitter')
        weipa_count = 1
        while True:
            try:
                twitter_users_queue = RedisQueue(
                    name='twitter_users',
                    redis_config=self.app_config['redis_config'])
                print('[=== Number of ids not yet crawled: %s ===]' % twitter_users_queue.qsize())
                if twitter_users_queue.empty():
                    if weipa_count >= 3:
                        print('<----- User crawl complete ----->')
                        break
                    else:
                        weipa_count += 1
                        print('[==Retry:%s==]' % (weipa_count - 1))
                        time.sleep(5)
                        continue

                id = twitter_users_queue.get()
                tweet = db.find_one({"id_str": id})
                tweet_count, flowing_count, followers_count, favorites_count, list_count, moment_count = crawler.crawler_list_count(
                    tweet["screen_name"], user_id=id)
                print(tweet_count, flowing_count, followers_count,
                      favorites_count, list_count, moment_count)
                after_doc = db.find_one_and_update(
                    {"id_str": tweet['id_str']}, {
                        "$set": {
                            "list_num": list_count,
                            "moment_num": moment_count,
                            "followers_count": followers_count,
                            "friends_count": flowing_count,
                            "statuses_count": tweet_count,
                            "favourites_count": favorites_count,
                            "update_time": datetime.now(),
                        }
                    },
                    return_document=ReturnDocument.AFTER)
                print('Successfully updated document %s' % after_doc['_id'])
                # count += 1
                weipa_count = 1
            except Exception as e:
                raise e
Example No. 11
import sys, os, json
sys.path.append(".")

from src.redis_helper import RedisQueue


if __name__ == '__main__':
    twitter_crawler_queue = RedisQueue(name='twitter')
    while True:
        print(twitter_crawler_queue.qsize())
        print(twitter_crawler_queue.get())
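Every example above goes through the RedisQueue helper imported here from src.redis_helper; its implementation is not shown in this listing. What follows is a minimal sketch of a Redis-list-backed FIFO queue exposing the methods the examples use (put, get, qsize, empty, clear), assuming redis-py and a redis_config dict of connection keyword arguments. The key prefix and the JSON serialisation are illustrative choices, not the project's actual conventions.

import json

import redis  # redis-py, assumed dependency


class RedisQueue(object):
    # Sketch of a Redis-list-backed FIFO queue; not the project's real implementation.

    def __init__(self, name, redis_config=None):
        redis_config = redis_config or {'host': '127.0.0.1', 'port': 6379, 'db': 0}
        self.key = 'queue:%s' % name
        self.conn = redis.StrictRedis(**redis_config)

    def qsize(self):
        return self.conn.llen(self.key)

    def empty(self):
        return self.qsize() == 0

    def put(self, item):
        # JSON-encode so that plain ids and dicts (e.g. error items) both round-trip.
        self.conn.rpush(self.key, json.dumps(item))

    def get(self, block=True, timeout=0):
        if block:
            popped = self.conn.blpop(self.key, timeout=timeout)
            raw = popped[1] if popped else None
        else:
            raw = self.conn.lpop(self.key)
        return json.loads(raw) if raw is not None else None

    def clear(self):
        self.conn.delete(self.key)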
Example No. 12
 def crawler_tweets_replay_count(self,
                                 crawler,
                                 history=False):  # replay_count field in the tweets collection
     print('<===== Starting tweet_replay crawl =====>')
     db = self.create_mongo_conn()
     crawler_tweet_replay_queue = RedisQueue(
         name='twitter_replay',
         redis_config=self.app_config['redis_config'])
     weipa_count = 1
     err_count = 1
     if not history:
         es = Espusher()
     while True:
         try:
             print('[=== Number not yet crawled: %s ===]' %
                   crawler_tweet_replay_queue.qsize())
             # if crawler_tweet_replay_queue.empty():
             #     if weipa_count >=3:
             #         print('<----- twitter_replay crawl complete ----->')
             #         break
             #     else:
             #         weipa_count+=1
             #         print('[==Retry:%s==]' % (weipa_count-1))
             #         time.sleep(10)
             #         continue
             # print(weipa_count)
             urls = crawler_tweet_replay_queue.get()
             print("取出%s个" % len(urls))
             content = crawler.crawler_replay_num(urls)
             for item in content:
                 print(item)
                 # print(item['url'].split('/')[-1])
                 if history:
                     update_doc = db.update_many(
                         {
                             "id_str": item['url'].split('/')[-1],
                             'site': 'twitter'
                         }, {
                             '$set': {
                                 'replay_count': item['reply_count'],
                                 'retweet_count': item['retweet_count'],
                                 'favorite_count': item['favorite_count'],
                                 "update_status": True
                             }
                         })
                     print('Updated %s documents' % update_doc.modified_count)
                 else:
                     # print('push item to es')
                     # print(item)
                      data = db.find_one_and_delete({
                          "id_str": item['url'].split('/')[-1],
                          'site': 'twitter'
                      })
                     data['replay_count'] = item['reply_count']
                     data['favorite_count'] = item['favorite_count']
                     data['retweet_count'] = item['retweet_count']
                     es.twitter_pusher(data)
             weipa_count = 1
             err_count = 1
         except Exception as e:
             print(e)
             continue
Example No. 13
 def crawler_reactions(self, crawler, history=False):  # various engagement counts in the facebook posts collection
     print('<===== Starting facebook_reactions crawl =====>')
     db = self.create_mongo_conn()
     crawler_reactions_queue = RedisQueue(
         name='facebook_reactions',
         redis_config=self.app_config['redis_config'])
     weipa_count = 1
     err_count = 1
     if not history:
         es = Espusher()
     while True:
         try:
             # print(db.count({"site": "facebook","update_status":False}))
             # tweets = list(db.find({"site": "facebook","update_status":False}).limit(20))
             # if (len(tweets) == 0):
             #     print('Crawl fully complete')
             #     break
             print('[=== Number not yet crawled: %s ===]' % crawler_reactions_queue.qsize())
             # if crawler_reactions_queue.empty():
             #     if weipa_count >=3:
             #         print('<----- facebook_reactions crawl complete ----->')
             #         break
             #     else:
             #         weipa_count+=1
             #         print('[==Retry:%s==]' % (weipa_count-1))
             #         time.sleep(10)
             #         continue
             urls = crawler_reactions_queue.get()  # map(lambda x: {"url": 'https://facebook.com%s' % x['permalink_url'], 'id': x['_id']}, tweets)
             content = crawler.crawler_reactions_nums(urls)
             # print(content)
             if not content: continue
             for item in content:
                 # print(item)
                 print(item['reactions'])
                 # print(item['url'])
                 # print(url)
                 if not item['reactions']:
                     print(item)
                 else:
                     if history:
                         print(objectid.ObjectId(item['url']['id']))
                         update_doc = db.find_one_and_update(
                             {"_id": objectid.ObjectId(item['url']['id'])},
                             {
                                 '$set': {
                                     'comment_num':
                                     item['reactions']['comment_count'],
                                     'likes_num':
                                     item['reactions']['likes_count'],
                                     'share_count':
                                     item['reactions']["share_count"],
                                     "update_status":
                                     True
                                 }
                             },
                             return_document=ReturnDocument.AFTER)
                         if update_doc is not None:
                             print('Updated document %s' % update_doc['_id'])
                     else:
                         data = db.find_one_and_delete(
                             {'_id': objectid.ObjectId(item['url']['id'])})
                          data['comment_num'] = item['reactions']['comment_count']
                          data['likes_num'] = item['reactions']['likes_count']
                          data['share_count'] = item['reactions']["share_count"]
                         es.facebook_pusher(data)
             weipa_count = 1
             err_count = 1
         except Exception as e:
             print(e)
             continue
Example No. 14
    def crawler_tweets(self, crawler, site='facebook', deadtime='2017-1-1'):
        print('<----- Starting post crawl ----->')
        weipa_count = 1
        if site == 'twitter':
            twitter_crawler_queue = RedisQueue(
                name='twitter', redis_config=self.app_config['redis_config'])
            twitter_crawler_error_queue = RedisQueue(
                name='twitter_error',
                redis_config=self.app_config['redis_config'])
            while True:
                if twitter_crawler_error_queue.qsize() > 0:
                    err_item = twitter_crawler_error_queue.get()
                    print('Fetched error item: %s' % err_item)
                    current_max_id = err_item['current_max_id']
                    id = err_item['user_id']
                    crawler.fetch_user_tweets(user_id=id,
                                              current_max_id=current_max_id,
                                              deadline=deadtime)
                else:
                    print('\n')
                    print('[=== Info: %s, number of ids not yet crawled: %s ===]' %
                          (datetime.now(), twitter_crawler_queue.qsize()))
                    if twitter_crawler_queue.empty():
                        if weipa_count >= 3:
                            print('<----- Post crawl complete ----->')
                            break
                        else:
                            weipa_count += 1
                            print('[==Retry:%s==]' % (weipa_count - 1))
                            time.sleep(5)
                            continue

                    id = twitter_crawler_queue.get()
                    crawler.fetch_user_tweets(user_id=id, deadline=deadtime)
                weipa_count = 1
        else:
            facebook_crawler_queue = RedisQueue(
                name='facebook', redis_config=self.app_config['redis_config'])
            facebook_crawler_error_queue = RedisQueue(
                name='facebook_error',
                redis_config=self.app_config['redis_config'])
            db = self.create_mongo_conn(db='FaceBook', collection='facebook')
            while True:
                if facebook_crawler_error_queue.qsize() > 0:
                    err_item = facebook_crawler_error_queue.get()
                    print('Fetched error item: %s' % err_item)
                    id = err_item['id']
                    url = err_item['url']
                    if url:
                        crawler.fetch_user_tweets(id=id,
                                                  urls=url,
                                                  deadline=deadtime)
                else:
                    print('\n')
                    print('[=== Info: %s, number of ids not yet crawled: %s ===]' %
                          (datetime.now(), facebook_crawler_queue.qsize()))
                    if facebook_crawler_queue.empty():
                        if weipa_count >= 3:
                            print('<----- Post crawl complete ----->')
                            break
                        else:
                            weipa_count += 1
                            print('[==Retry:%s==]' % (weipa_count - 1))
                            time.sleep(5)
                            continue

                    id = facebook_crawler_queue.get()
                    print(id)
                    doc = db.find_one({"id": str(id)})
                    # print(doc)
                    crawler.fetch_user_tweets(id=id,
                                              urls=doc['link'] + 'posts/',
                                              deadline=deadtime)
                # print('Crawling fully complete')
                weipa_count = 1