def update_records(self, update_messages, human):
    """Replace stored encounter messages with copies carrying an updated risk.

    The update messages are grouped with ``self.group_by_received_at``. For
    every update message the original encounter message is rebuilt, encoded,
    and searched for among the clusters stored for its contact day. The
    record is then moved/rewritten through ``self.update_record``.

    Args:
        update_messages: Update messages exposing ``uid``, ``risk``,
            ``contact_day``, ``unobs_id``, ``has_app`` and ``new_risk``.
        human: Not used by this method; kept for interface compatibility.

    Returns:
        ``self``, so calls can be chained.
    """
    # if we're using naive tracing, we actually don't care which records we update
    if not update_messages:
        return self

    grouped_update_messages = self.group_by_received_at(update_messages)
    # Only the batches are needed; the received-at keys are ignored.
    for batch in grouped_update_messages.values():
        for update_message in batch:
            old_message_dec = Message(update_message.uid,
                                      update_message.risk,
                                      update_message.contact_day,
                                      update_message.unobs_id,
                                      update_message.has_app)
            old_message_enc = encode_message(old_message_dec)

            # Find the first cluster on the contact day that holds the
            # original message. Breaking directly (instead of testing the
            # truthiness of the cluster id) keeps a cluster id of 0 from
            # being mistaken for "not found".
            old_cluster = None
            day_clusters = self.clusters_by_day[update_message.contact_day]
            for cluster, messages in day_clusters.items():
                if any(message == old_message_enc for message in messages):
                    old_cluster = cluster
                    break

            # NOTE(review): when no stored message matches, old_cluster is
            # still None here and is passed through as before - confirm that
            # update_record tolerates that.
            updated_message = Message(old_message_dec.uid,
                                      update_message.new_risk,
                                      old_message_dec.day,
                                      old_message_dec.unobs_id,
                                      old_message_dec.has_app)
            new_cluster = hash_to_cluster(updated_message)
            self.update_record(old_cluster, new_cluster, old_message_dec,
                               updated_message)
    return self
def update_records(self, update_messages):
    """Update old encounter messages with a new risk.

    The update messages are grouped with ``self.group_by_received_at``. For
    every update message the original encounter message is rebuilt, encoded,
    and searched for among the clusters stored for its day. The record is
    then moved/rewritten through ``self.update_record``.

    Args:
        update_messages: Update messages exposing ``uid``, ``risk``, ``day``,
            ``unobs_id`` and ``new_risk``.

    Returns:
        ``self``, so calls can be chained.
    """
    if not update_messages:
        return self

    # TODO: implement a 24-hour cycle of message updates and batch the update messages by this.
    grouped_update_messages = self.group_by_received_at(update_messages)

    # for each batch of update messages (the received-at keys are not needed)
    for batch in grouped_update_messages.values():
        # for each update message in the batch
        for update_message in batch:
            old_message_dec = Message(update_message.uid,
                                      update_message.risk,
                                      update_message.day,
                                      update_message.unobs_id)
            old_message_enc = encode_message(old_message_dec)

            # TODO: don't use the secret info when finding the message to update. Also, optimize this loop.
            # Find the first cluster holding the message to update. Breaking
            # directly (instead of testing the truthiness of the cluster id)
            # keeps a cluster id of 0 from being mistaken for "not found".
            old_cluster = None
            day_clusters = self.clusters_by_day[update_message.day]
            for cluster, messages in day_clusters.items():
                if any(message == old_message_enc for message in messages):
                    old_cluster = cluster
                    break

            # Create a new encounter message with the updated risk and
            # replace the old encounter message.
            # NOTE(review): when no stored message matches, old_cluster is
            # still None here and is passed through as before - confirm that
            # update_record tolerates that.
            updated_message = Message(old_message_dec.uid,
                                      update_message.new_risk,
                                      old_message_dec.day,
                                      old_message_dec.unobs_id)
            new_cluster = hash_to_cluster(updated_message)
            self.update_record(old_cluster, new_cluster, old_message_dec,
                               updated_message)
    return self
def add_messages(self, messages, current_day, rng=None):
    """Cluster incoming messages by scoring each against stored messages.

    Every encoded message is decoded and handed to ``self.score_matches``.
    A non-negative score means the message joins the matched cluster;
    otherwise its cluster id comes from ``hash_to_cluster``. The encoded
    message is then appended to ``self.all_messages``, to
    ``self.clusters[cluster_id]`` and registered via
    ``self.add_to_clusters_by_day``.

    Args:
        messages: Iterable of encoded messages.
        current_day: Day index forwarded to ``score_matches``.
        rng: Optional random generator forwarded to ``score_matches``.
    """
    for encoded in messages:
        decoded = decode_message(encoded)

        # score against previous messages; the matched message is not needed
        matched_cluster, _matched_message, match_score = self.score_matches(
            decoded, current_day, rng=rng)

        if match_score < 0:
            target_cluster = hash_to_cluster(decoded)
        else:
            target_cluster = matched_cluster

        self.all_messages.append(encoded)
        self.clusters[target_cluster].append(encoded)
        self.add_to_clusters_by_day(target_cluster, decoded.day, encoded)
def score_matches(self, m_new, current_day, rng=None):
    """Score a new message against recently stored messages, brute force.

    Stored messages for ``current_day`` back to ``current_day - 3`` are
    scanned, most recent day first. Each stored message is graded against
    ``m_new``:

    * 3 - same uid and same day (the scan stops immediately),
    * 2 - ``compare_uids(..., 1)`` holds, stored one day earlier, same risk,
    * 1 - ``compare_uids(..., 2)`` holds, stored two days earlier,
    * 0 - ``compare_uids(..., 3)`` holds, stored three days earlier.

    The highest-scoring message wins; on ties the first one found is kept.

    Args:
        m_new: Decoded message exposing ``uid``, ``risk`` and ``day``.
        current_day: Most recent day index to scan.
        rng: Unused; kept for interface compatibility.

    Returns:
        ``(best_cluster, best_message, best_score)``. Without any match this
        is ``(hash_to_cluster(m_new), None, -1)``; otherwise ``best_message``
        is the decoded matching message.
    """
    best_cluster = hash_to_cluster(m_new)
    best_message = None
    best_score = -1

    # The previous bounds, range(current_day - 3, current_day + 1, -1), gave
    # an empty range, so nothing was ever scored. Walk the same four days in
    # descending order instead.
    for scan_day in range(current_day, current_day - 4, -1):
        # .get avoids a KeyError (or creating an entry) for days with no data
        day_clusters = self.clusters_by_day.get(scan_day, {})
        for cluster_id, messages in day_clusters.items():
            for m_enc in messages:
                obs_uid, risk, day, _unobs_uid, _has_app = decode_message(
                    m_enc)
                if m_new.uid == obs_uid and m_new.day == day:
                    # Exact match: nothing can beat it, stop scanning.
                    return cluster_id, decode_message(m_enc), 3
                if (compare_uids(m_new.uid, obs_uid, 1)
                        and m_new.day - 1 == day and m_new.risk == risk):
                    score = 2
                elif compare_uids(m_new.uid, obs_uid, 2) and m_new.day - 2 == day:
                    score = 1
                elif compare_uids(m_new.uid, obs_uid, 3) and m_new.day - 3 == day:
                    score = 0
                else:
                    # A non-matching message must not clobber an earlier,
                    # better match (the old else-branch reset the score).
                    continue
                if score > best_score:
                    best_cluster = cluster_id
                    best_message = m_enc
                    best_score = score

    if best_message is not None:
        best_message = decode_message(best_message)
    return best_cluster, best_message, best_score
def score_matches(self, m_new, current_day):
    """Pick the cluster a new risk message belongs to, brute force.

    The message's own hash (``hash_to_cluster``) is preferred when that
    cluster already holds messages on ``current_day``. Otherwise the
    candidates returned by ``hash_to_cluster_day`` are tried in order,
    each against the day ``current_day - day``, and the first non-empty
    one is returned. If none is found the message's own hash is returned.
    """
    candidates_by_day = hash_to_cluster_day(m_new)
    own_cluster = hash_to_cluster(m_new)

    # Fast path: the message's own cluster is already populated today.
    if self.clusters_by_day[current_day].get(own_cluster):
        return own_cluster

    for day_offset, candidate_ids in candidates_by_day.items():
        for candidate_id in candidate_ids:
            if self.clusters_by_day[current_day - day_offset].get(candidate_id):
                # first populated candidate wins
                return candidate_id

    return own_cluster
def update_records(self, update_messages, human):
    """Apply batches of risk-update messages to the stored clusters.

    Update messages are grouped with ``self.group_by_received_at``. For each
    batch a per-day message count (a "cardinality signature") is built for
    every cluster and for the batch itself:

    * If no cluster's signature equals the batch's, the cluster with the
      smallest ``dist`` to it is chosen, and messages are moved out of / into
      that cluster day by day (through ``self.update_record``) until its
      counts match the batch.
    * Otherwise ``self.score_clusters`` picks among the perfectly matching
      clusters.

    Finally, for every update message, the best-scoring message of the chosen
    cluster on that day is rewritten with the update's ``new_risk``.

    Args:
        update_messages: Update messages exposing ``day`` and ``new_risk``.
        human: Not used by this method; kept for interface compatibility.

    Returns:
        ``self``, so calls can be chained.
    """
    # if we're using naive tracing, we actually don't care which records we update
    if not update_messages:
        return self

    grouped_update_messages = self.group_by_received_at(update_messages)
    # Only the batches are needed; iterating under a new name also avoids
    # rebinding the `update_messages` parameter.
    for batch in grouped_update_messages.values():
        num_days = max(self.clusters_by_day.keys()) + 1

        # num days x num clusters
        cluster_cards = np.zeros((num_days, max(self.clusters.keys()) + 1))
        update_cards = np.zeros((num_days, 1))

        # figure out the cardinality of each day's message set
        for day, clusters in self.clusters_by_day.items():
            for cluster_id, messages in clusters.items():
                cluster_cards[day][cluster_id] = len(messages)

        for update_message in batch:
            update_cards[update_message.day] += 1

        # find the nearest cardinality cluster
        perfect_signatures = np.where(
            (cluster_cards == update_cards).all(axis=0))[0]

        # `perfect_signatures` holds cluster *indices*; test for emptiness
        # rather than truthiness so a sole match at index 0 is not ignored.
        if perfect_signatures.size == 0:
            # calculate the wasserstein distance between every signature
            target_signature = update_cards.reshape(-1)
            scores = [
                dist(cluster_cards[:, cluster_idx], target_signature)
                for cluster_idx in range(cluster_cards.shape[1])
            ]
            best_cluster = int(np.argmin(scores))

            # for each day
            for day in range(len(update_cards)):
                cur_cardinality = int(cluster_cards[day, best_cluster])
                # index the single column explicitly: int() on a 1-element
                # array is deprecated in recent NumPy releases
                target_cardinality = int(update_cards[day, 0])

                # if (and while) the cardinality is not what it should be,
                # as determined by the update messages
                while cur_cardinality != target_cardinality:
                    # if we need to remove messages from this cluster on this day,
                    if cur_cardinality > target_cardinality:
                        best_score = -1
                        best_message = None
                        new_cluster_id = None

                        # then for each message in that day/cluster,
                        for message in self.clusters_by_day[day][best_cluster]:
                            for cluster_id, messages in self.clusters_by_day[day].items():
                                if cluster_id == best_cluster:
                                    continue
                                # and for each alternative cluster on that day
                                for candidate_cluster_message in messages:
                                    # check if it's a good cluster to move this message to
                                    score = self.score_two_messages(
                                        decode_message(candidate_cluster_message),
                                        message)
                                    if score > best_score or best_message is None:
                                        # remember the score so that later,
                                        # weaker candidates cannot win
                                        best_score = score
                                        best_message = message
                                        new_cluster_id = cluster_id

                        # if there are no other clusters on that day make a new cluster
                        if best_message is None:
                            # `message` is the last message visited above
                            best_message = message
                            new_cluster_id = hash_to_cluster(
                                decode_message(message))

                        best_message = decode_message(best_message)

                        # for the message which best fits another cluster, move it there
                        self.update_record(best_cluster, new_cluster_id,
                                           best_message, best_message)
                        cur_cardinality -= 1

                    # otherwise we need to add messages to this cluster/day
                    else:
                        # so look for messages which closely match our update
                        # messages, and add them; use the first update
                        # message of this day (or the last one if none match)
                        for update_message in batch:
                            if update_message.day == day:
                                break

                        best_score = -2
                        best_message = None
                        old_cluster_id = None
                        for cluster_id, messages in self.clusters_by_day[day].items():
                            if cluster_id == best_cluster:
                                continue
                            for message in messages:
                                score = self.score_two_messages(
                                    update_message, message)
                                if score > best_score:
                                    best_score = score
                                    best_message = message
                                    old_cluster_id = cluster_id

                        # NOTE(review): if no other cluster holds a message on
                        # this day, best_message is still None here, exactly
                        # as before - confirm how decode_message handles that.
                        best_message = decode_message(best_message)
                        updated_message = Message(best_message.uid,
                                                  update_message.new_risk,
                                                  best_message.day,
                                                  best_message.unobs_id)
                        self.update_record(old_cluster_id, best_cluster,
                                           best_message, updated_message)
                        cur_cardinality += 1
        else:
            best_cluster = self.score_clusters(batch, perfect_signatures)

        # write each update's new risk onto the best-scoring message of the
        # chosen cluster for that day
        for update_message in batch:
            day_messages = self.clusters_by_day[update_message.day][best_cluster]
            best_score = -1
            best_message = day_messages[0]
            for risk_message in day_messages:
                score = self.score_two_messages(update_message, risk_message)
                if score > best_score:
                    best_score = score
                    best_message = risk_message

            best_message = decode_message(best_message)
            updated_message = Message(best_message.uid,
                                      update_message.new_risk,
                                      best_message.day,
                                      best_message.unobs_id)
            self.update_record(best_cluster, best_cluster, best_message,
                               updated_message)
    return self