Example #1
def find_keywords_and_groups(id_str, text, retweet_id_str):
    """Find the keywords and associated groups in the tweet."""
    # refresh the keyword list at most once an hour
    global keywords, keywords_sync_time
    if (time() - keywords_sync_time) > 60 * 60:
        keywords = get_keywords()
        keywords_sync_time = time()

    # for retweets, first check if the original tweet's analysis is already cached
    if retweet_id_str:
        key = "t:%s" % retweet_id_str
        rt = redis.get(key)
        if rt:
            kw, groups, tokens = json.loads(rt)
            insert_tweet.apply_async((id_str, kw, groups, tokens), queue="master")
            redis.expire(key, rt_cache_time)
            return

    frog = get_frog()
    # tokens contains a list of dictionaries with frog's analysis per token
    # each dict has the keys "index", "lemma", "pos", "posprob" and "text"
    # where "text" is the original text
    tokens = frog.process(text)
    kw = []
    groups = []
    for (i, t) in enumerate(tokens):
        lemma = t["lemma"]
        k = keywords.get(lemma, None)
        if k is not None:
            if t["posprob"] > posprob_minimum:
                if not t["pos"].startswith(k.pos + "("):
                    continue

            # if the matched token is the second-to-last one and the final
            # token is "…", the tweet was truncated mid-word, so this is a
            # false match on the cut-off word
            if i == (len(tokens) - 2):
                if tokens[-1]["text"] == "…":
                    continue

            kw.append(lemma)
            groups += k.groups
    kw, groups = list(set(kw)), list(set(groups))
    insert_tweet.apply_async((id_str, kw, groups, tokens), queue="master")

    # cache the analysis under the original tweet's id so later retweets can reuse it
    if retweet_id_str:
        data = [kw, groups, tokens]
        redis.set(key, json.dumps(data), ex=rt_cache_time)
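
For context, a minimal standalone sketch of the matching pass above, assuming the token shape described in the code comments; the Keyword namedtuple and the Dutch sample values are hypothetical stand-ins for whatever get_keywords() actually returns:

from collections import namedtuple

# hypothetical stand-in for the keyword objects with .pos and .groups
Keyword = namedtuple("Keyword", ["pos", "groups"])
keywords = {"tomaat": Keyword(pos="N", groups=["groente"])}
posprob_minimum = 0.8

tokens = [
    {"index": 1, "lemma": "tomaat", "pos": "N(soort,mv,basis)", "posprob": 0.99, "text": "tomaten"},
    {"index": 2, "lemma": "eten", "pos": "WW(pv,tgw,mv)", "posprob": 0.97, "text": "eten"},
]

kw, groups = [], []
for t in tokens:
    k = keywords.get(t["lemma"])
    if k is None:
        continue
    # the POS filter only applies when frog is confident about its tag
    if t["posprob"] > posprob_minimum and not t["pos"].startswith(k.pos + "("):
        continue
    kw.append(t["lemma"])
    groups += k.groups

print(list(set(kw)), list(set(groups)))  # ['tomaat'] ['groente']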
Example #2
from redis import StrictRedis

from hortiradar.clustering import Config, ExtendedTweet, Cluster, Stories
from hortiradar.clustering.util import round_time
from hortiradar.database import get_db, get_keywords

db = get_db()
groups = [
    g["name"]
    for g in db.groups.find({}, projection={
        "name": True,
        "_id": False
    })
]
storiesdb = db.stories
tweetsdb = db.tweets

keywords = get_keywords(local=True)

redis = StrictRedis()

spam_level = Config.getfloat("database:parameters", "spam_level")
tweet_threshold = Config.getfloat("storify:parameters", "tweet_threshold")


def is_spam(t):
    """Check whether the tweet's spam score exceeds the configured threshold."""
    return t.get("spam") is not None and t["spam"] > spam_level


def get_filt_tokens(tweet):
    """Return the lemmas of the tweet's tokens that survive the token filter."""
    return [t.lemma for t in tweet.tokens if not t.filter_token()]

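A quick self-contained illustration of the spam check, with an assumed threshold of 0.7 (the real value is read from Config above):

# usage sketch; spam_level = 0.7 is illustrative, not the project's setting
spam_level = 0.7

def is_spam(t):
    return t.get("spam") is not None and t["spam"] > spam_level

print(is_spam({"spam": 0.9}))  # True
print(is_spam({"spam": 0.2}))  # False
print(is_spam({}))             # False: tweets without a spam score pass through
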
Example #3
import os
from configparser import ConfigParser
from time import time
from typing import Sequence

from redis import StrictRedis
import ujson as json

from hortiradar.database import app, get_frog, get_keywords, insert_tweet, insert_lemma


# only worker processes need the keyword list, POS threshold and Redis connection
if os.environ.get("ROLE") == "worker":
    keywords = get_keywords()
    keywords_sync_time = time()

    config = ConfigParser()
    config.read(os.path.dirname(__file__) + "/tasks_workers.ini")
    posprob_minimum = config["workers"].getfloat("posprob_minimum")

    redis = StrictRedis()
    rt_cache_time = 60 * 60 * 6  # cache retweet analyses for 6 hours


@app.task
def find_keywords_and_groups(id_str, text, retweet_id_str):
    """Find the keywords and associated groups in the tweet."""
    # refresh the keyword list at most once an hour
    global keywords, keywords_sync_time
    if (time() - keywords_sync_time) > 60 * 60:
        keywords = get_keywords()
        keywords_sync_time = time()
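
The worker block above expects a tasks_workers.ini next to the module; a minimal sketch of the layout it reads, where the 0.8 value is an assumption rather than the project's actual setting:

from configparser import ConfigParser

# hypothetical contents of tasks_workers.ini; only the section and key names
# are taken from the code above
config = ConfigParser()
config.read_string("""
[workers]
posprob_minimum = 0.8
""")
print(config["workers"].getfloat("posprob_minimum"))  # 0.8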