예제 #1
0
    def update_watchlist(self, user, since_tweet_id, page_not_found):
        """Refresh the watchlist entry of `user` and reset its state to 0.

        The entry is searched among documents with `state` 1: first in
        `pre_watchlist` by `user.id_str`, then in `pre_watchlist` by
        `user.screen_name`, and finally in `watchlist` by `user.id_str`.
        When none of the lookups finds a document nothing is written.

        A document found in `pre_watchlist` is removed from that collection
        and loses its `_id` before it is written to `watchlist`. If the
        stored user carries `campaign_ids` they are copied onto `user`
        (the caller's dict is modified in place). The document is finally
        upserted into `watchlist`, keyed on `user.id_str`.
        """
        pending = self.motor_column.pre_watchlist
        active = self.motor_column.watchlist

        self.logger.debug(user)

        entry = yield pending.find_one({'user.id_str': user['id_str'],
                                        'state': 1})
        if not entry:
            # Fall back to the screen name for entries without a usable id.
            entry = yield pending.find_one(
                {'user.screen_name': user['screen_name'], 'state': 1})

        if not entry:
            entry = yield active.find_one({'user.id_str': user['id_str'],
                                           'state': 1})
            if not entry:
                return
        else:
            # Promote the entry: it leaves pre_watchlist and must not carry
            # the old _id into the watchlist collection.
            yield pending.remove({'_id': entry['_id']})
            del entry['_id']

        stored_user = entry['user']
        if 'campaign_ids' in stored_user:
            user['campaign_ids'] = stored_user['campaign_ids']

        refreshed_fields = (
            ('user', user),
            ('since_tweet_id', since_tweet_id),
            ('page_not_found', page_not_found),
            ('updated_at', now_in_drnj_time()),
            ('state', 0),
        )
        for field, value in refreshed_fields:
            entry[field] = value

        yield active.update({'user.id_str': user['id_str']}, entry,
                            upsert=True)
예제 #2
0
    def create_campaign(self, params):
        """Store a new campaign and pass its follow lists to the watchlist.

        `params` is modified in place: it receives a `created_at` stamp and
        loses its `user_id_strs_to_follow` and `user_screen_names_to_follow`
        entries. The string forms of those two entries are handed to
        `add_to_watchlist` together with `params['campaign_id']`, and the
        remaining document is inserted into the `campaigns` collection.
        """
        collection = self.motor_column.campaigns

        params['created_at'] = now_in_drnj_time()

        # Both follow lists are read before either is dropped; they are
        # used elsewhere and must not end up in the stored campaign.
        follow_keys = ("user_id_strs_to_follow", "user_screen_names_to_follow")
        id_strs, screen_names = [str(params[key]) for key in follow_keys]
        for key in follow_keys:
            params.pop(key, None)

        add_to_watchlist(params['campaign_id'], id_strs, screen_names)

        yield collection.insert(params)
예제 #3
0
    def create_campaign(self, params):
        """Store a new campaign and pass its follow lists to the watchlist.

        `params` is modified in place: it receives a `created_at` stamp and
        loses its `user_id_strs_to_follow` and `user_screen_names_to_follow`
        entries. The string forms of those two entries are handed to
        `add_to_watchlist` together with `params['campaign_id']`, and the
        remaining document is inserted into the `campaigns` collection.
        """
        collection = self.motor_column.campaigns

        params['created_at'] = now_in_drnj_time()

        # Both follow lists are read before either is dropped; they are
        # used elsewhere and must not end up in the stored campaign.
        follow_keys = ("user_id_strs_to_follow", "user_screen_names_to_follow")
        id_strs, screen_names = [str(params[key]) for key in follow_keys]
        for key in follow_keys:
            params.pop(key, None)

        add_to_watchlist(params['campaign_id'], id_strs, screen_names)

        yield collection.insert(params)
예제 #4
0
    def update_watchlist(self, user, since_tweet_id, page_not_found):
        """Refresh the watchlist entry of `user` and reset its state to 0.

        The entry is searched among documents with `state` 1: first in
        `pre_watchlist` by `user.id_str`, then in `pre_watchlist` by
        `user.screen_name`, and finally in `watchlist` by `user.id_str`.
        When none of the lookups finds a document nothing is written.

        A document found in `pre_watchlist` is removed from that collection
        and loses its `_id` before it is written to `watchlist`. If the
        stored user carries `campaign_ids` they are copied onto `user`
        (the caller's dict is modified in place). The document is finally
        upserted into `watchlist`, keyed on `user.id_str`.
        """
        pending = self.motor_column.pre_watchlist
        active = self.motor_column.watchlist

        self.logger.debug(user)

        entry = yield pending.find_one({'user.id_str': user['id_str'],
                                        'state': 1})
        if not entry:
            # Fall back to the screen name for entries without a usable id.
            entry = yield pending.find_one(
                {'user.screen_name': user['screen_name'], 'state': 1})

        if not entry:
            entry = yield active.find_one({'user.id_str': user['id_str'],
                                           'state': 1})
            if not entry:
                return
        else:
            # Promote the entry: it leaves pre_watchlist and must not carry
            # the old _id into the watchlist collection.
            yield pending.remove({'_id': entry['_id']})
            del entry['_id']

        stored_user = entry['user']
        if 'campaign_ids' in stored_user:
            user['campaign_ids'] = stored_user['campaign_ids']

        refreshed_fields = (
            ('user', user),
            ('since_tweet_id', since_tweet_id),
            ('page_not_found', page_not_found),
            ('updated_at', now_in_drnj_time()),
            ('state', 0),
        )
        for field, value in refreshed_fields:
            entry[field] = value

        yield active.update({'user.id_str': user['id_str']}, entry,
                            upsert=True)
예제 #5
0
    def prepare_hist_and_plot(self, n_tweets, users, n_bins, campaign_id):
        """Compute summary histograms for the users of a campaign.

        Parameters:
            n_tweets -- number of tweets in the campaign; stored unchanged
                in the result and compared against the size cut-off.
            users -- sequence of dicts, each with an `n_user_tweets` count
                and a `user` sub-document providing `created_at`,
                `default_profile_image` and `statuses_count`.
            n_bins -- forwarded as `bins` to `numpy.histogram`.
            campaign_id -- stored unchanged in the returned document.

        Returns a dict with the keys `campaign_id`, `histogram` and
        `created_at`, or None when `n_tweets` is larger than 200000.
        Histogram counts and bin edges are plain Python lists.
        """
        import numpy

        # Plotting is a switched-off debugging aid. matplotlib is imported
        # only when it is enabled so the statistics do not depend on it.
        plot_graphs = False
        if plot_graphs:
            import matplotlib.pyplot as plot

        hist = {
            'user_creation': {
                'data': None,
                'bins': None,
            },
            'user_n_tweets': {
                'data': None,
                'bins': None,
            },
            'user_n_tweets_overall': {
                'data': None,
                'bins': None,
            },
            'n_tweets': None,
            'n_unique_users': None,
            'n_default_profile_image': None,
            'n_lower_than_threshold': None,
        }

        self.logger.debug("How many tweets? %d", n_tweets)
        hist['n_tweets'] = n_tweets

        # Campaigns with more than 200000 tweets are not analysed.
        if n_tweets > 200000:
            return

        #
        # How many unique users?
        #
        n_unique_users = len(users)
        self.logger.debug("How many unique users? %d", n_unique_users)
        hist['n_unique_users'] = n_unique_users

        def percentage(count):
            # An empty user list must not abort the report with a
            # ZeroDivisionError; report 0% instead.
            if n_unique_users == 0:
                return 0.0
            return 100 * (float(count) / n_unique_users)

        ######
        sec_title = "Histogram of user creation dates?"
        #
        tmp_dates = []
        for x in users:
            tmp_date = x['user']['created_at']
            # Values that are not floats yet are converted first.
            if not isinstance(tmp_date, float):
                tmp_date = py_utc_time2drnj_time(tmp_date)
            tmp_dates.append(tmp_date)

        (hist['user_creation']['data'],
         hist['user_creation']['bins']) = numpy.histogram(tmp_dates,
                                                          bins=n_bins)

        if plot_graphs:
            bins = hist['user_creation']['bins'][:-1]
            width = (hist['user_creation']['bins'][1] - hist['user_creation']['bins'][0])/2
            plot.bar(bins, hist['user_creation']['data'], width=width, align='center')

            xticklabels = [time.strftime('%d %b %Y', time.gmtime(drnj_time2py_time(x))) for x in bins]

            plot.xticks(bins, xticklabels)
            plot.title(sec_title)
            plot.savefig('1.pdf', dpi=600)

        #####
        sec_title = "Histogram of number of tweets of each user in this campaign"
        tmp_counts = [int(x['n_user_tweets']) for x in users]
        #
        (hist['user_n_tweets']['data'],
         hist['user_n_tweets']['bins']) = numpy.histogram(tmp_counts,
                                                          bins=n_bins)

        if plot_graphs:
            bins = hist['user_n_tweets']['bins'][:-1]
            data = hist['user_n_tweets']['data']
            width = (hist['user_n_tweets']['bins'][1] - hist['user_n_tweets']['bins'][0])/2
            plot.bar(bins, data, width=width, align='center')

            xticklabels = bins

            plot.xticks(bins, xticklabels)
            plot.title(sec_title)
            plot.savefig('2.pdf', dpi=600)

        #####
        sec_title = "What percentage of them used the default profile image?"
        #
        n_default_profile_image = sum(
            1 for u in users if u['user']['default_profile_image'])

        hist['n_default_profile_image'] = n_default_profile_image
        self.logger.debug("%s: %0.2f%%", sec_title,
                          percentage(n_default_profile_image))

        #####
        sec_title = "Histogram of tweet counts of unique users"
        tmp_counts = [int(x['user']['statuses_count']) for x in users]

        (hist['user_n_tweets_overall']['data'],
         hist['user_n_tweets_overall']['bins']) = numpy.histogram(tmp_counts,
                                                                  bins=n_bins)

        if plot_graphs:
            bins = hist['user_n_tweets_overall']['bins'][:-1]
            data = hist['user_n_tweets_overall']['data']
            width = (hist['user_n_tweets_overall']['bins'][1] - hist['user_n_tweets_overall']['bins'][0])/2
            plot.bar(bins, data, width=width, align='center')

            xticklabels = bins

            plot.xticks(bins, xticklabels)
            plot.title(sec_title)
            plot.savefig('3.pdf', dpi=600)

        #
        sec_title = "What percentage of them have lower than 5 tweets?"
        n_lower_than_threshold = sum(
            1 for u in users if u['user']['statuses_count'] < 5)

        hist['n_lower_than_threshold'] = n_lower_than_threshold
        self.logger.debug("%s: %0.2f%%", sec_title,
                          percentage(n_lower_than_threshold))

        self.logger.debug(hist)

        # Convert numpy arrays to plain Python lists. tolist() also turns
        # the elements into native int/float values, which list() does not.
        for section in hist.values():
            if isinstance(section, dict):
                for key in list(section):
                    if isinstance(section[key], numpy.ndarray):
                        section[key] = section[key].tolist()

        hist = {'campaign_id': campaign_id,
                'histogram': hist,
                'created_at': now_in_drnj_time()}
        return hist
예제 #6
0
    def store_friends_or_followers(self, user_id, IDS, drnjID, fof):
        """Stores/updates list of drenaj user data using raw twitter data

        Coroutine body: every database call is yielded and the result is
        delivered by raising `Return`.

        user_id -- id of the user whose connections were reported
        IDS -- list of user ids client crawler reports
        drnjID -- identifier stored as `retrieved_by` on written documents
        fof -- 'friends' or 'followers'; selects the edge direction and the
               `<fof>_retrieved_at` queue field that is stamped

        Returns (via `Return`) a dict with `num_new_users` and
        `num_new_edges`. Any other `fof` value ends the coroutine without a
        result and without touching the database.
        """
        # Reject unknown modes before anything is written. Checking only
        # inside the loop left queue entries behind for an invalid mode.
        if fof not in ('friends', 'followers'):
            return

        # db = mongo_client[DRENAJ_DB[DRENAJ_APP_ENVIRONMENT]]
        queue_collection = self.application.db.motor_column.queue
        graph_collection = self.application.db.motor_column.graph

        num_new_discovered_users = 0
        num_edges_inserted = 0

        dt = now_in_drnj_time()
        queue_query = {"id": user_id}

        print(queue_query)

        # ATC: This mechanism requires finding the id twice
        # With indexing, this may not be a big problem
        # Alternative is trying to update and catching the pymongo.errors.OperationFailure exception
        n_ids = yield queue_collection.find(queue_query).count()

        if n_ids > 0:
            print('ID EXISTS')
            queue_document = {
                "$set": {
                    fof + "_retrieved_at": dt,
                    "retrieved_by": drnjID
                }
            }
            yield queue_collection.update(queue_query, queue_document)
        else:
            print('ID NOT EXISTS')
            fields = {
                "id": user_id,
                "id_str": str(user_id),
                "profile_retrieved_at": 0,
                "friends_retrieved_at": 0,
                "followers_retrieved_at": 0,
                "retrieved_by": drnjID
            }
            # Only the list that was just crawled gets a retrieval stamp.
            fields[fof + "_retrieved_at"] = dt
            queue_document = validate_document(new_queue_document(), fields)
            # str() is required: the document is not a string and cannot be
            # concatenated to the label directly.
            print("QUEUE DOCUMENT: " + str(queue_document))
            yield queue_collection.insert(queue_document)
            num_new_discovered_users += 1

        # process each user id in IDS
        for other_id in reversed(IDS):
            # Insert the newly discovered id into the queue.
            # NOTE(review): the original comment says the insert is rejected
            # when the entry already exists; that depends on an index not
            # visible here, and the counter is incremented either way.
            queue_document = validate_document(
                new_queue_document(), {
                    "id": other_id,
                    "id_str": str(other_id),
                    "profile_retrieved_at": 0,
                    "friends_retrieved_at": 0,
                    "followers_retrieved_at": 0,
                    "retrieved_by": drnjID
                })

            yield queue_collection.insert(queue_document)
            num_new_discovered_users += 1

            dt = now_in_drnj_time()
            # An edge always points from the follower (`id`) to the user
            # being followed (`friend_id`).
            if fof == 'friends':
                source, sink = user_id, other_id
            else:
                source, sink = other_id, user_id

            edge = yield graph_collection.find_one({
                "id": source,
                "friend_id": sink
            })

            if edge is None:
                doc = validate_document(
                    new_graph_document(), {
                        'id': source,
                        'friend_id': sink,
                        'id_str': str(source),
                        'friend_id_str': str(sink),
                        'record_retrieved_at': dt,
                        "retrieved_by": drnjID
                    })
                yield graph_collection.insert(doc)
                num_edges_inserted += 1

        # TODO: Handle unfollows: Find edges that no longer exist and move old record to graph_history and add unfollow record

        raise Return({
            'num_new_users': num_new_discovered_users,
            'num_new_edges': num_edges_inserted
        })
    def store_multiple_profiles(self, ids, S, drnjID, campaign_id):
        """Store a batch of user profiles and stamp their queue entries.

        Coroutine body: the queue update is yielded and the result is
        delivered by raising `Return`.

        ids -- list of integer user ids; the id of every profile in `S` is
               removed from it in place (ValueError if it is not present)
        S -- list of profile dicts; each needs an `id_str` and may carry a
             `status` sub-document, which is detached and stored as the
             tweet with the profile attached as its `user`
        drnjID -- identifier stored as `retrieved_by`
        campaign_id -- stored on every tweet document

        Returns (via `Return`) `ids` with the stored profiles removed.
        """
        # print "Received recent profile of ", v['name'], ' a.k.a. ', v['screen_name']

        db = self.application.db
        queue_coll = db.motor_column.queue

        # TODO: Replace this DB_TEST_VERSION with source code
        # version later
        DB_TEST_VERSION = 0.2

        print(S)
        for profile in S:
            # Profiles without an embedded status are stored under a
            # placeholder tweet that has no text.
            if 'status' in profile:
                status = profile.pop('status')
            else:
                status = {'text': None}

            profile['history'] = False
            status['user'] = profile

            tweet_dat = validate_document(new_tweet_template(), {
                "tweet": status,
                "drenaj_service_version": DB_TEST_VERSION,
                "campaign_id": campaign_id,
                "record_retrieved_at": drnj_time.now_in_drnj_time(),
                "retrieved_by": drnjID,
            }, fail=False)

            print(tweet_dat)

            user_id = profile['id_str']
            numeric_id = int(user_id)

            # Check Queue
            now = drnj_time.now_in_drnj_time()
            # The queue stores `id` as an integer, so the lookup must use
            # the integer as well; querying with the `id_str` string never
            # matches and makes the upsert insert a new entry every time.
            queue_query = {"id": numeric_id}
            # NOTE(review): `$setOnInsert` sits next to plain fields in this
            # document; confirm validate_document / update accept that mix.
            queue_document = validate_document(new_queue_document(), {
                "id": numeric_id,
                "id_str": user_id,
                "profile_retrieved_at": now,
                "$setOnInsert": {
                    "friends_retrieved_at": 0,
                    "followers_retrieved_at": 0,
                },
                "retrieved_by": drnjID
            })

            # creates entry if query does not exist
            yield queue_coll.update(queue_query, queue_document, upsert=True)

            # this call marks the current entries as history
            # maybe we won't need this for certain queries
            db.move_to_history(user_id)

            db.insert_tweet(tweet_dat)

            ids.remove(numeric_id)

        raise Return(ids)
예제 #8
0
    def store_multiple_profiles(self, ids, S, drnjID, campaign_id):
        """Store a batch of user profiles and stamp their queue entries.

        Coroutine body: the queue update is yielded and the result is
        delivered by raising `Return`.

        ids -- list of integer user ids; the id of every profile in `S` is
               removed from it in place (ValueError if it is not present)
        S -- list of profile dicts; each needs an `id_str` and may carry a
             `status` sub-document, which is detached and stored as the
             tweet with the profile attached as its `user`
        drnjID -- identifier stored as `retrieved_by`
        campaign_id -- stored on every tweet document

        Returns (via `Return`) `ids` with the stored profiles removed.
        """
        # print "Received recent profile of ", v['name'], ' a.k.a. ', v['screen_name']

        db = self.application.db
        queue_coll = db.motor_column.queue

        # TODO: Replace this DB_TEST_VERSION with source code
        # version later
        DB_TEST_VERSION = 0.2

        print(S)
        for profile in S:
            # Profiles without an embedded status are stored under a
            # placeholder tweet that has no text.
            if 'status' in profile:
                status = profile.pop('status')
            else:
                status = {'text': None}

            profile['history'] = False
            status['user'] = profile

            tweet_dat = validate_document(
                new_tweet_template(),
                {
                    "tweet": status,
                    "drenaj_service_version": DB_TEST_VERSION,
                    "campaign_id": campaign_id,
                    "record_retrieved_at": drnj_time.now_in_drnj_time(),
                    "retrieved_by": drnjID,
                },
                fail=False)

            print(tweet_dat)

            user_id = profile['id_str']
            numeric_id = int(user_id)

            # Check Queue
            now = drnj_time.now_in_drnj_time()
            # The queue stores `id` as an integer, so the lookup must use
            # the integer as well; querying with the `id_str` string never
            # matches and makes the upsert insert a new entry every time.
            queue_query = {"id": numeric_id}
            # NOTE(review): `$setOnInsert` sits next to plain fields in this
            # document; confirm validate_document / update accept that mix.
            queue_document = validate_document(
                new_queue_document(), {
                    "id": numeric_id,
                    "id_str": user_id,
                    "profile_retrieved_at": now,
                    "$setOnInsert": {
                        "friends_retrieved_at": 0,
                        "followers_retrieved_at": 0,
                    },
                    "retrieved_by": drnjID
                })

            # creates entry if query does not exist
            yield queue_coll.update(queue_query, queue_document, upsert=True)

            # this call marks the current entries as history
            # maybe we won't need this for certain queries
            db.move_to_history(user_id)

            db.insert_tweet(tweet_dat)

            ids.remove(numeric_id)

        raise Return(ids)
예제 #9
0
    def store_friends_or_followers(self, user_id, IDS, drnjID, fof):
        """Stores/updates list of drenaj user data using raw twitter data

        Coroutine body: every database call is yielded and the result is
        delivered by raising `Return`.

        user_id -- id of the user whose connections were reported
        IDS -- list of user ids client crawler reports
        drnjID -- identifier stored as `retrieved_by` on written documents
        fof -- 'friends' or 'followers'; selects the edge direction and the
               `<fof>_retrieved_at` queue field that is stamped

        Returns (via `Return`) a dict with `num_new_users` and
        `num_new_edges`. Any other `fof` value ends the coroutine without a
        result and without touching the database.
        """
        # Reject unknown modes before anything is written. Checking only
        # inside the loop left queue entries behind for an invalid mode.
        if fof not in ('friends', 'followers'):
            return

        # db = mongo_client[DRENAJ_DB[DRENAJ_APP_ENVIRONMENT]]
        queue_collection = self.application.db.motor_column.queue
        graph_collection = self.application.db.motor_column.graph

        num_new_discovered_users = 0
        num_edges_inserted = 0

        dt = now_in_drnj_time()
        queue_query = {"id": user_id}

        print(queue_query)

        # ATC: This mechanism requires finding the id twice
        # With indexing, this may not be a big problem
        # Alternative is trying to update and catching the pymongo.errors.OperationFailure exception
        n_ids = yield queue_collection.find(queue_query).count()

        if n_ids > 0:
            print('ID EXISTS')
            queue_document = {
                "$set": {
                    fof + "_retrieved_at": dt,
                    "retrieved_by": drnjID
                }
            }
            yield queue_collection.update(queue_query, queue_document)
        else:
            print('ID NOT EXISTS')
            fields = {
                "id": user_id,
                "id_str": str(user_id),
                "profile_retrieved_at": 0,
                "friends_retrieved_at": 0,
                "followers_retrieved_at": 0,
                "retrieved_by": drnjID
            }
            # Only the list that was just crawled gets a retrieval stamp.
            fields[fof + "_retrieved_at"] = dt
            queue_document = validate_document(new_queue_document(), fields)
            # str() is required: the document is not a string and cannot be
            # concatenated to the label directly.
            print("QUEUE DOCUMENT: " + str(queue_document))
            yield queue_collection.insert(queue_document)
            num_new_discovered_users += 1

        # process each user id in IDS
        for other_id in reversed(IDS):
            # Insert the newly discovered id into the queue.
            # NOTE(review): the original comment says the insert is rejected
            # when the entry already exists; that depends on an index not
            # visible here, and the counter is incremented either way.
            queue_document = validate_document(new_queue_document(), {
                "id": other_id,
                "id_str": str(other_id),
                "profile_retrieved_at": 0,
                "friends_retrieved_at": 0,
                "followers_retrieved_at": 0,
                "retrieved_by": drnjID})

            yield queue_collection.insert(queue_document)
            num_new_discovered_users += 1

            dt = now_in_drnj_time()
            # An edge always points from the follower (`id`) to the user
            # being followed (`friend_id`).
            if fof == 'friends':
                source, sink = user_id, other_id
            else:
                source, sink = other_id, user_id

            edge = yield graph_collection.find_one({"id": source, "friend_id": sink})

            if edge is None:
                doc = validate_document(new_graph_document(), {
                    'id': source,
                    'friend_id': sink,
                    'id_str': str(source),
                    'friend_id_str': str(sink),
                    'record_retrieved_at': dt,
                    "retrieved_by": drnjID
                })
                yield graph_collection.insert(doc)
                num_edges_inserted += 1

        # TODO: Handle unfollows: Find edges that no longer exist and move old record to graph_history and add unfollow record

        raise Return(
                {'num_new_users': num_new_discovered_users, 'num_new_edges': num_edges_inserted})
예제 #10
0
    def post(self, *args, **keywords):
        """
        Dispatch a request on the action captured from the URL (first
        positional argument) and write the response.

        `view`
            - params:
                - user_id: user_id of the twitter user that we want to
                retrieve all tweets (at most 100 are returned).
        `store`
            - params:
                - tweet_data: array of tweets in json format.
                - campaign_id: campaign of the tweets, 'default' if omitted.
                - watchlist_related: optional json object passed as keyword
                arguments to update_task_state_in_watchlist.
                - verbose: when non-empty the stored documents are echoed
                back instead of 'ok'.
        `retweets`
            - params:
                - tweet_id: `id` of the tweet that we want to
                retrieve all retweets (not implemented yet).
        `filter`
            - params:
                - campaign_id: campaign whose tweets are listed.
                - skip, limit: paging (defaults 0 and 100).
                - format: 'json' (default) or 'html'.
                - since_datetime, until_datetime: optional bounds on
                `tweet.created_at` (inclusive lower, exclusive upper).
                - sort_by_datetime: non-zero sorts ascending by
                `tweet.created_at`.
                - dbpedia_spotlight_result: 1 annotates every tweet.

        Raises HTTPError(500) when a required argument is missing or when
        `store` receives an empty tweet array, and HTTPError(404) for an
        unknown action.
        """

        # The action is the first captured group. `(action) = args` bound
        # the whole tuple, and an empty `args` raised IndexError.
        action = args[0] if args else None

        print(action)

        verbose_response = self.get_argument('verbose', '')

        if action == 'view':
            try:
                # TODO: we need to get here again.
                user_id = self.get_argument('user_id')
                # if no user_id is supplied.
                if user_id == '':
                    tmp = []
                else:
                    tweets_coll = self.application.db.motor_column.tweets
                    # running the query
                    cursor = tweets_coll.find({
                        'tweet.user.id_str': str(user_id),
                    })

                    tmp = yield cursor.to_list(length=100)

                self.write(bson.json_util.dumps({'results': tmp}))
                # set_header replaces the default Content-Type; add_header
                # would send a second, conflicting one.
                self.set_header('Content-Type', 'application/json')
                # Finish explicitly, like the other actions do.
                self.finish()
            except MissingArgumentError as e:
                # TODO: implement logging.
                raise HTTPError(500, "You didn't supply %s as an argument" % e.arg_name)
        elif action == 'store':
            try:
                tweet_data = self.get_argument('tweet_data')
                campaign_id = self.get_argument('campaign_id', 'default')
                watchlist_related = self.get_argument('watchlist_related', '')
                # Bound up front so the verbose response below can never
                # hit an undefined name.
                tmp = []
                if tweet_data:
                    #tweet_array = bson.json_util.loads(tweet_data, object_hook=self.datetime_hook)
                    tweet_array = bson.json_util.loads(tweet_data)
                    tmp_tweets = []
                    # TODO: Sanity check the data!
                    # For example, treat 'entities', 'user' specially.
                    DB_TEST_VERSION = 0.2
                    for tweet_obj in tweet_array:
                        tweet_obj['user']['history'] = False
                        tmp_tweets.append(validate_document(new_tweet_template(), {
                            "tweet": tweet_obj,
                            # TODO: Replace this DB_TEST_VERSION with source code
                            # version later
                            "drenaj_service_version": DB_TEST_VERSION,
                            # TODO: "retrieved_by": keywords['drnjID'],
                            "retrieved_by": "drenaj",
                            "campaign_id": campaign_id,
                            "record_retrieved_at": drnj_time.now_in_drnj_time(),
                        }, fail=False))
                        ### TODO: add these users later.
                        ### tmp_users.append(validate_document(new_user_template(), tweet_obj['user']))
                    if not tmp_tweets:
                        raise HTTPError(500, 'You tried to insert no tweets?!')

                    self.application.db.insert_tweet(tmp_tweets)
                    tmp = tmp_tweets
                    # WARN: This functionality is not available for now.
                    # campaign_id, user_objects = extract_arguments(tmp_tweets)
                    # res = app_object.send_task('init_user_to_graph_offline',
                    #                    [[ campaign_id, bson.json_util.dumps(user_objects) ]],
                    #                    queue="offline_jobs")
                    # logger.info(str(res))
                    if watchlist_related:
                        print(watchlist_related)
                        watchlist_related = bson.json_util.loads(watchlist_related)
                        print(watchlist_related)
                        #self.application.db.update_watchlist(**watchlist_related)
                        update_task_state_in_watchlist(**watchlist_related)

                if verbose_response:
                    self.write(bson.json_util.dumps({'results': tmp}))
                else:
                    self.write(bson.json_util.dumps({'results': 'ok'}))
                self.set_header('Content-Type', 'application/json')
                self.finish()
            except MissingArgumentError as e:
                # TODO: implement logging.
                raise HTTPError(500, "You didn't supply %s as an argument" % e.arg_name)
        elif action == 'retweets':
            try:
                # Only checks that the argument is present for now.
                self.get_argument('tweet_id')
                self.write('not implemented yet')
                self.finish()
            except MissingArgumentError as e:
                # TODO: implement logging.
                raise HTTPError(500, "You didn't supply %s as an argument" % e.arg_name)
        elif action == 'filter':
            try:
                campaign_id = self.get_argument('campaign_id')
                skip = int(self.get_argument('skip', 0))
                limit = int(self.get_argument('limit', 100))
                res_format = self.get_argument('format', 'json')
                since_datetime = self.get_argument('since_datetime', -1)
                until_datetime = self.get_argument('until_datetime', -1)
                # Supplied arguments arrive as strings, so the flags are
                # converted before they are compared with numbers.
                sort_by_datetime = int(self.get_argument('sort_by_datetime', 0))
                with_spotlight = int(self.get_argument('dbpedia_spotlight_result', 0)) == 1

                tweets_coll = self.application.db.motor_column.tweets

                query_string = {'campaign_id': '%s' % campaign_id}
                # -1 is the "not supplied" default of both bounds.
                created_at_range = {}
                if since_datetime != -1:
                    created_at_range['$gte'] = float(since_datetime)
                if until_datetime != -1:
                    created_at_range['$lt'] = float(until_datetime)
                if created_at_range:
                    query_string['tweet.created_at'] = created_at_range

                print("STARTED " + str(query_string))
                cursor = tweets_coll.find(query_string)
                if sort_by_datetime != 0:
                    cursor = cursor.sort([('tweet.created_at', pymongo.ASCENDING)])
                cursor = cursor.skip(skip).limit(limit)
                # TODO: removing because of complaint:
                # TypeError: if no direction is specified, key_or_list must be an instance of list
                # .sort({"$natural" : 1})\

                # NOTE(review): at most 100 documents are fetched even when
                # `limit` is larger; confirm whether that cap is intended.
                tmp = yield cursor.to_list(length=100)
                if with_spotlight:
                    for x in tmp:
                        x['tweet']['dbpedia_spotlight_result'] = self.produce_dbpedia_spotlight_result(x['tweet']['text'])
                print("ENDED " + str(query_string))

                DB_TEST_VERSION = 0.2
                if res_format == 'json':
                    self.write(bson.json_util.dumps({
                        'results': tmp,
                        # TODO: Replace this DB_TEST_VERSION with source code
                        # version later
                        "drenaj_service_version": DB_TEST_VERSION,
                        # TODO: "requested_by": keywords['drnjID'],
                        "requested_by": "drenaj",
                        "campaign_id": campaign_id,
                        "served_at": drnj_time.now_in_drnj_time(),
                        'skip': skip,
                        'limit': limit,
                    }))
                    self.set_header('Content-Type', 'application/json')
                elif res_format == 'html':
                    env = Environment(loader=FileSystemLoader('templates'))

                    template = env.get_template('statuses/filter.html')
                    result = template.render(statuses=[x['tweet'] for x in tmp])
                    self.write(result)

                self.finish()

            except MissingArgumentError as e:
                # TODO: implement logging.
                raise HTTPError(500, "You didn't supply %s as an argument" % e.arg_name)
            # db.tweets.find({'campaign_id' : 'syria'}).sort({$natural : 1}).skip(10).limit(14)
        else:
            # Tornado will make sure that this point will not be reached, if
            # it's regexp based router works correctly
            raise HTTPError(404, 'No such method')
예제 #11
0
    def post(self, *args, **keywords):
        """Dispatch a POST request to one of the tweet actions.

        The first positional argument selects the action:

        `view`
            - params:
                - user_id: user_id of the twitter user whose tweets we want
                  to retrieve (at most 100 documents are returned).
        `store`
            - params:
                - tweet_data: array of tweets in json format.
                - campaign_id: optional, defaults to 'default'.
                - watchlist_related: optional json object; it is decoded and
                  passed as keyword arguments to
                  `update_task_state_in_watchlist`.
        `retweets`
            - params:
                - tweet_id: `id` of the tweet whose retweets we want to
                  retrieve. Not implemented yet.
        `filter`
            - params:
                - campaign_id (required), skip, limit, format ('json' or
                  'html'), since_datetime, until_datetime, sort_by_datetime,
                  dbpedia_spotlight_result.

        The optional `verbose` argument makes `store` answer with the stored
        documents instead of the plain 'ok' marker.

        This is a generator: the futures returned by the Motor cursors are
        yielded to the caller. NOTE(review): it is presumably wrapped by a
        Tornado coroutine decorator that is not visible here - confirm.

        Raises:
            HTTPError: 500 when a required argument is missing or when
                `store` is asked to insert an empty tweet array; 404 when the
                action is unknown.
        """
        # Only the first captured URL group names the action. Binding the
        # whole tuple when several groups were captured could never match an
        # action name and always ended in the 404 branch.
        action = args[0] if args else None

        print(action)

        verbose_response = self.get_argument('verbose', '')

        # TODO: Replace this DB_TEST_VERSION with source code version later
        DB_TEST_VERSION = 0.2
        missing_argument = "You didn't supply %s as an argument"

        if action == 'view':
            try:
                # TODO: we need to get here again.
                user_id = self.get_argument('user_id')
                if user_id == '':
                    # an empty user_id matches nothing.
                    tmp = []
                else:
                    tweets_coll = self.application.db.motor_column.tweets
                    cursor = tweets_coll.find(
                        {'tweet.user.id_str': str(user_id)})
                    tmp = yield cursor.to_list(length=100)

                self.write(bson.json_util.dumps({'results': tmp}))
                self.add_header('Content-Type', 'application/json')
            except MissingArgumentError as e:
                # TODO: implement logging.
                raise HTTPError(500, missing_argument % e.arg_name)
        elif action == 'store':
            try:
                tweet_data = self.get_argument('tweet_data')
                campaign_id = self.get_argument('campaign_id', 'default')
                watchlist_related = self.get_argument('watchlist_related', '')

                # Documents echoed back in a verbose response. It has to be
                # bound on every path, otherwise a verbose store of real
                # tweets fails with a NameError.
                tmp = []
                if tweet_data:
                    tweet_array = bson.json_util.loads(tweet_data)
                    tmp_tweets = []
                    # TODO: Sanity check the data!
                    # For example, treat 'entities', 'user' specially.
                    for tweet_obj in tweet_array:
                        tweet_obj['user']['history'] = False
                        tmp_tweets.append(
                            validate_document(
                                new_tweet_template(),
                                {
                                    "tweet": tweet_obj,
                                    "drenaj_service_version": DB_TEST_VERSION,
                                    # TODO: "retrieved_by": keywords['drnjID'],
                                    "retrieved_by": "drenaj",
                                    "campaign_id": campaign_id,
                                    "record_retrieved_at":
                                    drnj_time.now_in_drnj_time(),
                                },
                                fail=False))
                        # TODO: add the users of these tweets later.

                    if not tmp_tweets:
                        raise HTTPError(500, 'You tried to insert no tweets?!')

                    self.application.db.insert_tweet(tmp_tweets)
                    tmp = tmp_tweets
                    # WARN: initialising the users in the graph through an
                    # offline job is not available for now.
                    if watchlist_related:
                        print(watchlist_related)
                        watchlist_related = bson.json_util.loads(
                            watchlist_related)
                        print(watchlist_related)
                        update_task_state_in_watchlist(**watchlist_related)

                if verbose_response:
                    self.write(bson.json_util.dumps({'results': tmp}))
                else:
                    self.write(bson.json_util.dumps({'results': 'ok'}))
                self.add_header('Content-Type', 'application/json')
                self.finish()
            except MissingArgumentError as e:
                # TODO: implement logging.
                raise HTTPError(500, missing_argument % e.arg_name)
        elif action == 'retweets':
            try:
                # The value is not used yet; the call only enforces that the
                # argument was supplied.
                self.get_argument('tweet_id')
                self.write('not implemented yet')
            except MissingArgumentError as e:
                # TODO: implement logging.
                raise HTTPError(500, missing_argument % e.arg_name)
        elif action == 'filter':
            try:
                campaign_id = self.get_argument('campaign_id')
                skip = int(self.get_argument('skip', 0))
                limit = int(self.get_argument('limit', 100))
                res_format = self.get_argument('format', 'json')
                since_datetime = self.get_argument('since_datetime', -1)
                until_datetime = self.get_argument('until_datetime', -1)
                # Supplied arguments arrive as strings, so the flags are
                # converted before being compared with numbers; comparing the
                # raw string with 0 made "sort_by_datetime=0" sort anyway.
                sort_by_datetime = int(
                    self.get_argument('sort_by_datetime', 0)) != 0
                with_dbpedia_spotlight = int(
                    self.get_argument('dbpedia_spotlight_result', 0)) == 1

                tweets_coll = self.application.db.motor_column.tweets

                created_at_range = {}
                if since_datetime != -1:
                    created_at_range['$gte'] = float(since_datetime)
                if until_datetime != -1:
                    created_at_range['$lt'] = float(until_datetime)

                query_string = {'campaign_id': '%s' % campaign_id}
                if created_at_range:
                    query_string['tweet.created_at'] = created_at_range

                print("STARTED " + str(query_string))
                cursor = tweets_coll.find(query_string)
                if sort_by_datetime:
                    # An explicit (key, direction) list is required; sorting
                    # by {"$natural": 1} was dropped because of
                    # "TypeError: if no direction is specified, key_or_list
                    # must be an instance of list".
                    cursor = cursor.sort(
                        [('tweet.created_at', pymongo.ASCENDING)])
                cursor = cursor.skip(skip).limit(limit)

                # NOTE(review): to_list caps the result at 100 documents even
                # when `limit` is larger - confirm whether that is intended.
                tmp = []
                for x in (yield cursor.to_list(length=100)):
                    if with_dbpedia_spotlight:
                        x['tweet']['dbpedia_spotlight_result'] = \
                            self.produce_dbpedia_spotlight_result(
                                x['tweet']['text'])
                    tmp.append(x)
                print("ENDED " + str(query_string))

                if res_format == 'json':
                    self.write(
                        bson.json_util.dumps({
                            'results': tmp,
                            "drenaj_service_version": DB_TEST_VERSION,
                            # TODO: "requested_by": keywords['drnjID'],
                            "requested_by": "drenaj",
                            "campaign_id": campaign_id,
                            "served_at": drnj_time.now_in_drnj_time(),
                            'skip': skip,
                            'limit': limit,
                        }))
                    self.add_header('Content-Type', 'application/json')
                elif res_format == 'html':
                    env = Environment(loader=FileSystemLoader('templates'))

                    template = env.get_template('statuses/filter.html')
                    result = template.render(
                        statuses=[x['tweet'] for x in tmp])
                    self.write(result)

                self.finish()

            except MissingArgumentError as e:
                # TODO: implement logging.
                raise HTTPError(500, missing_argument % e.arg_name)
        else:
            # Tornado will make sure that this point will not be reached, if
            # its regexp based router works correctly
            raise HTTPError(404, 'No such method')
예제 #12
0
    def prepare_hist_and_plot(self, n_tweets, users, n_bins, campaign_id,
                              plot_graphs=False):
        """Build histogram statistics about the users of a campaign.

        Args:
            n_tweets: number of tweets in the campaign.
            users: sequence of dicts, one per unique user. Each item must
                provide `item['n_user_tweets']` and, under `item['user']`,
                the keys `created_at`, `default_profile_image` and
                `statuses_count`.
            n_bins: `bins` argument passed to `numpy.histogram`.
            campaign_id: stored unchanged in the returned document.
            plot_graphs: when true, each histogram is also saved as a bar
                chart in '1.pdf', '2.pdf' and '3.pdf'. matplotlib is only
                imported in that case.

        Returns:
            None when `n_tweets` is greater than 200000. Otherwise a dict
            with the keys 'campaign_id', 'histogram' and 'created_at', where
            'histogram' holds three histograms ('user_creation',
            'user_n_tweets', 'user_n_tweets_overall', each with 'data' and
            'bins' as plain Python lists) and the counters 'n_tweets',
            'n_unique_users', 'n_default_profile_image' and
            'n_lower_than_threshold'.
        """
        import numpy

        def build_histogram(values):
            """Return the histogram of `values` as plain Python lists."""
            data, bins = numpy.histogram(values, bins=n_bins)
            # tolist() also converts the numpy scalars to native numbers,
            # which list() would leave as numpy types.
            return {'data': data.tolist(), 'bins': bins.tolist()}

        def save_bar_chart(histogram, title, filename, tick_labels=None):
            """Save one histogram as a bar chart in `filename`."""
            # Imported lazily so matplotlib is only needed when plotting.
            import matplotlib.pyplot as plot

            left_edges = histogram['bins'][:-1]
            width = (histogram['bins'][1] - histogram['bins'][0]) / 2
            if tick_labels is None:
                tick_labels = left_edges
            # One figure per chart; without it the bars of earlier charts
            # would also be drawn into the later files.
            plot.figure()
            plot.bar(left_edges, histogram['data'], width=width,
                     align='center')
            plot.xticks(left_edges, tick_labels)
            plot.title(title)
            plot.savefig(filename, dpi=600)
            plot.close()

        self.logger.debug("How many tweets? %d", n_tweets)

        # TODO: abort if there are more than 200000 tweets.
        if n_tweets > 200000:
            return

        #
        # How many unique users?
        #
        n_unique_users = len(users)
        self.logger.debug("How many unique users? %d", n_unique_users)

        def percentage(count):
            """Return `count` as a percentage of the unique users."""
            # An empty user list would otherwise divide by zero.
            if not n_unique_users:
                return 0.0
            return 100 * (float(count) / n_unique_users)

        hist = {
            'n_tweets': n_tweets,
            'n_unique_users': n_unique_users,
        }

        ######
        sec_title = "Histogram of user creation dates?"
        creation_dates = []
        for entry in users:
            created_at = entry['user']['created_at']
            # Values that are not floats yet are converted with
            # py_utc_time2drnj_time first.
            if not isinstance(created_at, float):
                created_at = py_utc_time2drnj_time(created_at)
            creation_dates.append(created_at)

        hist['user_creation'] = build_histogram(creation_dates)
        if plot_graphs:
            date_labels = [
                time.strftime('%d %b %Y', time.gmtime(drnj_time2py_time(x)))
                for x in hist['user_creation']['bins'][:-1]
            ]
            save_bar_chart(hist['user_creation'], sec_title, '1.pdf',
                           tick_labels=date_labels)

        #####
        sec_title = "Histogram of number of tweets of each user in this campaign"
        hist['user_n_tweets'] = build_histogram(
            [int(entry['n_user_tweets']) for entry in users])
        if plot_graphs:
            save_bar_chart(hist['user_n_tweets'], sec_title, '2.pdf')

        #####
        sec_title = "What percentage of them used the default profile image?"
        n_default_profile_image = sum(
            1 for entry in users if entry['user']['default_profile_image'])
        hist['n_default_profile_image'] = n_default_profile_image
        self.logger.debug("%s: %0.2f%%", sec_title,
                          percentage(n_default_profile_image))

        #####
        sec_title = "Histogram of tweet counts of unique users"
        hist['user_n_tweets_overall'] = build_histogram(
            [int(entry['user']['statuses_count']) for entry in users])
        if plot_graphs:
            save_bar_chart(hist['user_n_tweets_overall'], sec_title, '3.pdf')

        #
        sec_title = "What percentage of them have lower than 5 tweets?"
        n_lower_than_threshold = sum(
            1 for entry in users if entry['user']['statuses_count'] < 5)
        hist['n_lower_than_threshold'] = n_lower_than_threshold
        self.logger.debug("%s: %0.2f%%", sec_title,
                          percentage(n_lower_than_threshold))

        self.logger.debug(hist)

        return {
            'campaign_id': campaign_id,
            'histogram': hist,
            'created_at': now_in_drnj_time()
        }