Exemplo n.º 1
0
def advance_search_dataset(q, f, num, event_id):
    """Scrape tweets for a query and store the unseen ones in ``db.paper``.

    Each tweet whose ``id`` is not yet present as an ``_id`` in the
    collection is inserted together with the query, the tweet type and the
    decoded event reference.

    Args:
        q: Query string handed to ``TweetCriteria.setQuerySearch``; also
            stored under ``'q'`` on every inserted record.
        f: Tweet type handed to ``TweetCriteria.setTweetType``; also stored
            under ``'f'`` on every inserted record.
        num: Maximum number of tweets, handed to ``setMaxTweets``.
        event_id: JSON text decoded with ``json_util.object_hook`` and stored
            under ``'event_id'`` on every inserted record.

    Raises:
        ValueError: If ``event_id`` is not valid JSON (raised before any
            scraping starts).
    """
    _, db, _ = get_spider_config()
    try:
        collection = db.paper
        # The event reference is the same for every tweet, so decode it once
        # up front; a malformed value then fails before the scrape is run.
        event_ref = json.loads(event_id, object_hook=json_util.object_hook)
        criteria = (got.manager.TweetCriteria()
                    .setQuerySearch(q)
                    .setTweetType(f)
                    .setMaxTweets(num))
        for tweet in got.manager.TweetManager.getTweets(criteria):
            # Skip tweets that are already stored under the same _id.
            if collection.find_one({'_id': tweet['id']}) is None:
                collection.insert_one({
                    '_id': tweet['id'],
                    'tweet': tweet,
                    'event_id': event_ref,
                    'f': f,
                    'q': q,
                })
    finally:
        # Release the handle even when scraping or inserting fails.
        # NOTE(review): assumes the object returned by get_spider_config
        # exposes close(), as the original code did -- confirm.
        db.close()
Exemplo n.º 2
0
import re
import json
import fire
import time
from tqdm import tqdm
from datetime import datetime, timedelta
from collections import Counter
from Config import get_spider_config
# Only the second and third values returned by get_spider_config() are kept.
_, db, r = get_spider_config()

# Author screen name of every document in db.korea_missile; the projection
# asks the database for the screen-name field only.
users = [
    doc['tweet']['user']['screen_name']
    for doc in db.korea_missile.find({}, {"tweet.user.screen_name": 1})
]

# Screen names that occur at least five times, most frequent first.
freq_users = [
    name for name, count in Counter(users).most_common() if count >= 5
]


def get_query_str(loc, triggers, target):
    """Build a search query of three parenthesised, space-separated groups.

    The result has the form ``(loc) (t1 OR t2 ...) (target)``, where the
    middle group joins every entry of ``triggers`` with ``' OR '``.

    Args:
        loc: Location term (string).
        triggers: Iterable of trigger-word strings.
        target: Target term (string).

    Returns:
        The assembled query string.
    """
    trigger_clause = ' OR '.join(triggers)
    groups = (loc, trigger_clause, target)
    return ' '.join('(' + group + ')' for group in groups)


def get_task():
    locs = ["North Korea"]
    triggers = ["test", "launch", "fire"]
    targets = ["messile", "satellite", "rocket", "nuclear"]
    now = datetime.now()
    WAIT_TIME_MINUTES = 15
    while True: