def main(): """ Main predictor function """ args = init_parser() config = init_config(args) logger = get_logger(f'hawkes-{config["partition"]}', broker_list=config["bootstrap_servers"], debug=True) consumer = KafkaConsumer(bootstrap_servers=config["bootstrap_servers"]) consumer.assign( [TopicPartition(config["consumer_topic"], config["partition"])]) producer = KafkaProducer( bootstrap_servers=config["bootstrap_servers"], value_serializer=lambda v: json.dumps(v).encode("utf-8"), key_serializer=lambda v: json.dumps(v).encode("utf-8")) alpha = config["alpha"] mu = config["mu"] for message in consumer: mess = message.value.decode().replace("'", '"').replace('(', '[').replace( ')', ']') mess = eval(mess) cascade = np.array(mess["tweets"]) tweet_id = mess["cid"] text = mess["msg"] T_obs = mess["T_obs"] p, beta = 0.02, 1 / 3600 t = cascade[-1, 0] LL = loglikelihood((p, beta), cascade, t) LL_MLE, MLE = compute_MLE(cascade, t, alpha, mu) p_est, beta_est = MLE N, G1, n_star = prediction([p_est, beta_est], cascade, alpha, mu, t) messfinal = { "type": "parameters", "cid": tweet_id, "msg": text, "n_obs": len(cascade), "n_supp": N, "params": list(MLE), "G1": G1, "n_star": n_star } producer.send(config["producer_topic"], key=T_obs, value=messfinal, partition=config["partition"]) logger.info( "Predicted params p = {: .3f} and beta = {: .3f} for tweet {} at time {} on partition: {}" .format(p_est, beta_est, tweet_id, T_obs, config["partition"]))
def main(): """ Main predictor function """ args = init_parser() config = init_config(args) consumer = KafkaConsumer(config["consumer_topic"], bootstrap_servers=config["bootstrap_servers"]) producer = KafkaProducer(bootstrap_servers=config["bootstrap_servers"]) regressors = defaultdict(RandomForestRegressor) train_X = defaultdict(list) train_y = defaultdict(list) # Set the frequence of trainings of each random forest update_size = config["update_size"] logger = get_logger('learner', broker_list=config["bootstrap_servers"], debug=True) for message in consumer: t = message.key value = message.value.decode().replace("'", '"').replace('(', '[').replace( ')', ']') value = eval(value) inputs = value['X'] # (beta, n_star, G1) W = value['W'] train_X[t].append(inputs) train_y[t].append(W) if not len(train_X[t]) % update_size: regressors[t].fit(train_X[t], train_y[t]) regressor_message = pickle.dumps({ "type": "model", "regressor": regressors[t] }) producer.send('models', key=t, value=regressor_message, partition=message.partition) logger.info("Model {}s updated and sent".format(t))
def main(): args = init_parser() config = init_config(args) consumer = KafkaConsumer( config["consumer_topic"], bootstrap_servers=config["bootstrap_servers"], value_deserializer=lambda v: json.loads(v.decode('utf-8'))) for message in consumer: n_tot = message.value["n_tot"] tweet_id = message.value["cid"] T_obs = message.value["T_obs"] if n_tot > config["retweet_limit"]: print( "Tweet {} may reach an important size, {: .3f} retweets predicted with {}s of observation" .format(tweet_id, n_tot, T_obs))
def main(): args = init_parser() config = init_config(args) consumer = KafkaConsumer( config["consumer_topic"], bootstrap_servers=config["bootstrap_servers"], value_deserializer=lambda v: json.loads(v.decode('utf-8'))) are_list = [] for message in consumer: are_list.append(message.value['ARE']) if len(are_list) % config["update_period"] == 0: print("Mean ARE: {}\nMear ARE on the {} last tweets: {}".format( average(are_list), config["update_period"], average(are_list[-config["update_period"]:])))
def main(): """ Main predictor function """ args = init_parser() config = init_config(args) partition = config["obs_map"][config["obs_window"]] consumer = KafkaConsumer( bootstrap_servers=config["bootstrap_servers"], key_deserializer=lambda v: v.decode(), ) consumer.assign([ TopicPartition(topic, partition) for topic in config["consumer_topic"] ]) producer_samples = KafkaProducer( bootstrap_servers=config["bootstrap_servers"], value_serializer=lambda v: json.dumps(v).encode('utf-8'), key_serializer=str.encode) producer_alerts = KafkaProducer( bootstrap_servers=config["bootstrap_servers"], value_serializer=lambda v: json.dumps(v).encode('utf-8')) alpha = config["alpha"] mu = config["mu"] alert_limit = config["alert_limit"] regressor = RandomForestRegressor() sizes = defaultdict(dict) forest_inputs = {} logger = get_logger(f'predictor-{partition}', broker_list=config["bootstrap_servers"], debug=True) for message in consumer: try: mess = message.value.decode().replace("'", '"') mess = json.loads(mess) except: mess = pickle.loads(message.value) ################### MODEL if mess['type'] == 'model': regressor = mess['regressor'] logger.info("Updated model received") ################### SIZE t = message.key if mess['type'] == 'size': # When we receive the final size of a cascade, we store it tweet_id = mess['cid'] sizes[tweet_id]["real"] = mess['n_tot'] if mess['type'] == "parameters": G1 = mess['G1'] n_star = mess['n_star'] tweet_id = mess['cid'] p, beta = mess['params'] msg = mess['msg'] n_obs = mess['n_obs'] try: sklearn.utils.validation.check_is_fitted(regressor) n_tot = regressor.predict((beta, n_star, G1)) except: n_tot = n_obs + G1 / (1 - n_star) sizes[tweet_id]["prediction"] = n_tot forest_inputs[tweet_id] = [beta, n_star, G1, n_obs] alert_message = { 'type': 'alert', 'cid': tweet_id, 'msg': msg, 'T_obs': t, 'n_tot': n_tot, } producer_alerts.send('alerts', key=None, value=alert_message) producer_alerts.flush() logger.info("Alert produced for tweet {} at time {}".format( tweet_id, t)) if n_tot > alert_limit: logger.warning( "Tweet {} may create an important cascade with {} retweets predicted" .format(tweet_id, n_tot)) if len(sizes[tweet_id].keys()) == 2: true_size = sizes[tweet_id]["real"] pred_size = sizes[tweet_id]["prediction"] are = abs(pred_size - true_size) / true_size stat_message = { 'type': 'stat', 'cid': tweet_id, 'T_obs': t, 'ARE': are } producer_alerts.send('stats', key=None, value=stat_message) producer_alerts.flush() beta, n_star, G1, n_obs = forest_inputs[tweet_id] W = (true_size - n_obs) * (1 - n_star) / G1 sample_message = { 'type': 'sample', 'cid': tweet_id, 'X': (beta, n_star, G1), 'W': W } producer_samples.send('samples', key=args.obs_window, value=sample_message) producer_samples.flush() logger.info( "Stats and sample produced for tweet {} at time {}".format( tweet_id, t))