Example #1
def main():
    """
    Main predictor function
    """

    args = init_parser()
    config = init_config(args)
    logger = get_logger(f'hawkes-{config["partition"]}',
                        broker_list=config["bootstrap_servers"],
                        debug=True)
    consumer = KafkaConsumer(bootstrap_servers=config["bootstrap_servers"])
    consumer.assign(
        [TopicPartition(config["consumer_topic"], config["partition"])])
    producer = KafkaProducer(
        bootstrap_servers=config["bootstrap_servers"],
        value_serializer=lambda v: json.dumps(v).encode("utf-8"),
        key_serializer=lambda v: json.dumps(v).encode("utf-8"))

    alpha = config["alpha"]
    mu = config["mu"]

    for message in consumer:
        # The upstream producer serializes the payload with str(), so rewrite
        # it into valid JSON (double quotes, brackets) and parse it instead of
        # calling eval on data read off the wire.
        mess = message.value.decode().replace("'", '"')
        mess = mess.replace('(', '[').replace(')', ']')
        mess = json.loads(mess)

        cascade = np.array(mess["tweets"])  # column 0 holds the event times
        tweet_id = mess["cid"]
        text = mess["msg"]
        T_obs = mess["T_obs"]

        # Log-likelihood at default (p, beta), then the MLE refit and the
        # quantities the size prediction needs (N, G1, n_star)
        p, beta = 0.02, 1 / 3600
        t = cascade[-1, 0]
        LL = loglikelihood((p, beta), cascade, t)
        LL_MLE, MLE = compute_MLE(cascade, t, alpha, mu)
        p_est, beta_est = MLE
        N, G1, n_star = prediction([p_est, beta_est], cascade, alpha, mu, t)

        messfinal = {
            "type": "parameters",
            "cid": tweet_id,
            "msg": text,
            "n_obs": len(cascade),
            "n_supp": N,
            "params": list(MLE),
            "G1": G1,
            "n_star": n_star
        }

        producer.send(config["producer_topic"],
                      key=T_obs,
                      value=messfinal,
                      partition=config["partition"])

        logger.info(
            "Predicted params p = {: .3f} and beta = {: .3f} for tweet {} at time {} on partition: {}"
            .format(p_est, beta_est, tweet_id, T_obs, config["partition"]))
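For reference, the fallback size estimate used in Example #5 combines G1 (the expected first unseen generation) and n_star (the branching factor) returned by the `prediction` call above. A minimal sketch assuming that same formula; the helper name and numbers are illustrative, not from the source:

def naive_size_estimate(n_obs, G1, n_star):
    # Analytical Hawkes estimate, valid only in the subcritical regime n_star < 1
    if n_star >= 1:
        raise ValueError("supercritical cascade: n_star must be < 1")
    return n_obs + G1 / (1 - n_star)

# e.g. 50 observed retweets, G1 = 20, n_star = 0.6 -> 50 + 20 / 0.4 = 100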
Example #2
def main():
    """
    Main learner function
    """

    args = init_parser()
    config = init_config(args)
    consumer = KafkaConsumer(config["consumer_topic"],
                             bootstrap_servers=config["bootstrap_servers"])
    producer = KafkaProducer(bootstrap_servers=config["bootstrap_servers"])

    regressors = defaultdict(RandomForestRegressor)
    train_X = defaultdict(list)
    train_y = defaultdict(list)

    # Re-train each per-window random forest every `update_size` new samples
    update_size = config["update_size"]

    logger = get_logger('learner',
                        broker_list=config["bootstrap_servers"],
                        debug=True)

    for message in consumer:

        t = message.key  # observation window the sample refers to

        # Samples are produced as JSON by the predictor; normalize any
        # str()-style quoting and parse instead of eval'ing untrusted input.
        value = message.value.decode().replace("'", '"')
        value = value.replace('(', '[').replace(')', ']')
        value = json.loads(value)
        inputs = value['X']  # (beta, n_star, G1)
        W = value['W']

        train_X[t].append(inputs)
        train_y[t].append(W)

        if len(train_X[t]) % update_size == 0:

            # Refit this window's forest on all samples collected so far
            regressors[t].fit(train_X[t], train_y[t])

            regressor_message = pickle.dumps({
                "type": "model",
                "regressor": regressors[t]
            })

            producer.send('models',
                          key=t,
                          value=regressor_message,
                          partition=message.partition)

            logger.info("Model {}s updated and sent".format(t))
Example #3
def main():
    """
    Alert consumer: print tweets whose predicted size exceeds the limit
    """

    args = init_parser()
    config = init_config(args)

    consumer = KafkaConsumer(
        config["consumer_topic"],
        bootstrap_servers=config["bootstrap_servers"],
        value_deserializer=lambda v: json.loads(v.decode('utf-8')))

    for message in consumer:

        n_tot = message.value["n_tot"]
        tweet_id = message.value["cid"]
        T_obs = message.value["T_obs"]

        if n_tot > config["retweet_limit"]:
            print(
                "Tweet {} may reach an important size, {: .3f} retweets predicted with {}s of observation"
                .format(tweet_id, n_tot, T_obs))
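The keys this consumer needs from init_config are visible above; a minimal illustrative config dict (values are placeholders, not from the source):

config = {
    "consumer_topic": "alerts",          # topic carrying the predicted sizes
    "bootstrap_servers": "localhost:9092",
    "retweet_limit": 500,                # threshold above which a tweet is printed
}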
Example #4
def main():
    """
    Monitoring consumer: print running mean ARE statistics
    """

    args = init_parser()
    config = init_config(args)

    consumer = KafkaConsumer(
        config["consumer_topic"],
        bootstrap_servers=config["bootstrap_servers"],
        value_deserializer=lambda v: json.loads(v.decode('utf-8')))

    are_list = []

    for message in consumer:

        are_list.append(message.value['ARE'])

        if len(are_list) % config["update_period"] == 0:
            print("Mean ARE: {}\nMear ARE on the {} last tweets: {}".format(
                average(are_list), config["update_period"],
                average(are_list[-config["update_period"]:])))
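The ARE values averaged here are produced by the predictor in Example #5 as the absolute relative error between predicted and real cascade size; a quick worked instance with illustrative numbers:

# ARE = |predicted_size - true_size| / true_size   (see Example #5)
true_size, pred_size = 200, 230
are = abs(pred_size - true_size) / true_size  # 30 / 200 = 0.15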
Example #5
def main():
    """
    Main predictor function
    """

    args = init_parser()
    config = init_config(args)

    partition = config["obs_map"][config["obs_window"]]
    consumer = KafkaConsumer(
        bootstrap_servers=config["bootstrap_servers"],
        key_deserializer=lambda v: v.decode(),
    )
    consumer.assign([
        TopicPartition(topic, partition) for topic in config["consumer_topic"]
    ])

    producer_samples = KafkaProducer(
        bootstrap_servers=config["bootstrap_servers"],
        value_serializer=lambda v: json.dumps(v).encode('utf-8'),
        key_serializer=str.encode)

    producer_alerts = KafkaProducer(
        bootstrap_servers=config["bootstrap_servers"],
        value_serializer=lambda v: json.dumps(v).encode('utf-8'))

    alpha = config["alpha"]
    mu = config["mu"]
    alert_limit = config["alert_limit"]

    regressor = RandomForestRegressor()

    sizes = defaultdict(dict)
    forest_inputs = {}
    logger = get_logger(f'predictor-{partition}',
                        broker_list=config["bootstrap_servers"],
                        debug=True)

    for message in consumer:
        # Parameter and size messages arrive as JSON text; model messages are
        # a pickled payload from the learner, so fall back to pickle.
        try:
            mess = message.value.decode().replace("'", '"')
            mess = json.loads(mess)
        except (UnicodeDecodeError, ValueError):
            mess = pickle.loads(message.value)

        ###################   MODEL
        if mess['type'] == 'model':
            regressor = mess['regressor']
            logger.info("Updated model received")

        ###################   SIZE
        t = message.key  # observation window (T_obs) carried as the message key
        if mess['type'] == 'size':
            # When we receive the final size of a cascade, we store it
            tweet_id = mess['cid']
            sizes[tweet_id]["real"] = mess['n_tot']

        if mess['type'] == "parameters":

            G1 = mess['G1']
            n_star = mess['n_star']
            tweet_id = mess['cid']
            p, beta = mess['params']
            msg = mess['msg']
            n_obs = mess['n_obs']

            try:
                # Once fitted, the forest predicts the corrective factor W it
                # was trained on, which is plugged into the Hawkes size
                # formula; predict() expects a 2-D input.
                sklearn.utils.validation.check_is_fitted(regressor)
                W_pred = regressor.predict([[beta, n_star, G1]])[0]
                n_tot = float(n_obs + W_pred * G1 / (1 - n_star))
            except Exception:
                # No fitted model yet: analytical estimate (equivalent to W = 1)
                n_tot = n_obs + G1 / (1 - n_star)

            sizes[tweet_id]["prediction"] = n_tot

            forest_inputs[tweet_id] = [beta, n_star, G1, n_obs]

            alert_message = {
                'type': 'alert',
                'cid': tweet_id,
                'msg': msg,
                'T_obs': t,
                'n_tot': n_tot,
            }

            producer_alerts.send('alerts', key=None, value=alert_message)
            producer_alerts.flush()
            logger.info("Alert produced for tweet {} at time {}".format(
                tweet_id, t))

            if n_tot > alert_limit:
                logger.warning(
                    "Tweet {} may create an important cascade with {} retweets predicted"
                    .format(tweet_id, n_tot))

        # Once both the real and the predicted size of a cascade are known,
        # emit the ARE statistic and a training sample for the learner.
        if mess['type'] in ('size', 'parameters') and len(sizes[tweet_id]) == 2:
            true_size = sizes[tweet_id]["real"]
            pred_size = sizes[tweet_id]["prediction"]
            are = abs(pred_size - true_size) / true_size

            stat_message = {
                'type': 'stat',
                'cid': tweet_id,
                'T_obs': t,
                'ARE': are
            }

            producer_alerts.send('stats', key=None, value=stat_message)
            producer_alerts.flush()
            beta, n_star, G1, n_obs = forest_inputs[tweet_id]

            # Corrective factor the learner regresses on (W = 1 would mean the
            # analytical estimate was exact)
            W = (true_size - n_obs) * (1 - n_star) / G1

            sample_message = {
                'type': 'sample',
                'cid': tweet_id,
                'X': (beta, n_star, G1),
                'W': W
            }

            producer_samples.send('samples',
                                  key=args.obs_window,
                                  value=sample_message)
            producer_samples.flush()
            logger.info(
                "Stats and sample produced for tweet {} at time {}".format(
                    tweet_id, t))
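To make the corrective-factor round trip explicit, here is a minimal sketch (helper names hypothetical) of how W is derived from a finished cascade and how a predicted W is mapped back to a size estimate, matching the formulas used above:

def target_W(true_size, n_obs, n_star, G1):
    # Regression target sent to the learner via the 'samples' topic
    return (true_size - n_obs) * (1 - n_star) / G1

def corrected_size(n_obs, n_star, G1, W):
    # Inverse mapping used at prediction time; W = 1 gives the naive estimate
    return n_obs + W * G1 / (1 - n_star)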