# NOTE(review): reconstructed fragment — this chunk begins inside an enclosing
# `for scheme in ...:` loop of a "schemes" upload branch (its header is outside
# this view) and is cut off mid-loop at the end; the indentation below is
# inferred from the statement order and should be confirmed against the full file.
        validate_code_scheme.verify_scheme(scheme)
        # `id` shadows the builtin; here it holds the scheme's document id.
        id = scheme["SchemeID"]
        # Skip schemes that already exist in the dataset so re-runs are idempotent.
        if id in existing_ids:
            skipped_existing += 1
            continue
        fcw.set_code_scheme(dataset_id, scheme)
        print(f"Written: {id}")
        added += 1
    print(f"Added: {added}, Skipped: {skipped_existing}")
elif content_type == "messages":
    # Upload messages: first fetch what is already stored so duplicates can be
    # skipped and the highest existing sequence number is known.
    added = 0
    skipped_existing = 0
    all_messages = fcw.get_all_messages(dataset_id)
    existing_ids = set()
    highest_seq_no = -1
    for message in all_messages:
        existing_ids.add(message["MessageID"])
        if message["SequenceNumber"] > highest_seq_no:
            highest_seq_no = message["SequenceNumber"]
    messages_to_write = []
    for message in json_data:
        validate_message_structure.verify_message(message)
        id = message["MessageID"]
        # Skip messages already present in the dataset; the rest of this loop
        # body continues past the end of this view.
        if id in existing_ids:
            skipped_existing += 1
            continue
# Export the dataset's messages as JSON, optionally resuming from a previous
# export so only messages updated since then are downloaded.
# NOTE(review): source arrived with collapsed line structure; the nesting below
# is reconstructed from statement order — confirm against the full file.
if content_type in ["all", "messages"]:
    if content_type == "all":
        print("Messages:")

    previous_export = []
    last_updated = None
    if previous_export_file_path is not None:
        with open(previous_export_file_path) as f:
            previous_export = json.load(f)

        # Find the newest 'LastUpdated' timestamp across the previous export;
        # only messages updated after that point need to be re-downloaded.
        for msg in previous_export:
            if "LastUpdated" in msg and (last_updated is None or isoparse(
                    msg["LastUpdated"]) > last_updated):
                last_updated = isoparse(msg["LastUpdated"])

        # NOTE(review): assumed to apply only when a previous export file was
        # actually given (the message references that file) — confirm nesting.
        if last_updated is None:
            warnings.warn(
                f"Previous export file {previous_export_file_path} does not contain a message with a "
                f"'LastUpdated' field; performing a full download of the entire dataset..."
            )

    # Merge old and new by MessageID; freshly-downloaded messages overwrite
    # their previous-export versions. last_updated_after=None means a full
    # download of the entire dataset.
    messages_dict = {msg["MessageID"]: msg for msg in previous_export}
    new_messages_dict = {
        msg["MessageID"]: msg
        for msg in fcw.get_all_messages(dataset_id, last_updated_after=last_updated)
    }
    messages_dict.update(new_messages_dict)

    messages = list(messages_dict.values())
    messages.sort(key=lambda msg: msg["SequenceNumber"])
    print(json.dumps(messages, indent=2))
def predict_labels_for_dataset(dataset_id):
    """Auto-label a dataset: for each code scheme, train a classifier on the
    manually-checked labels and write high-confidence predictions back.

    Progress is reported via ``fcw.set_dataset_autolabel_complete`` as a
    fraction from 0.0 to 1.0.

    NOTE(review): this source arrived with collapsed line structure; the
    statement nesting below is reconstructed and should be confirmed.

    :param dataset_id: Id of the dataset to predict labels for.
    """
    DATASET_ID = dataset_id
    fcw.set_dataset_autolabel_complete(DATASET_ID, 0.0)
    log(f"Predicting labels for: {DATASET_ID}")

    code_scheme_ids = fcw.get_code_scheme_ids(DATASET_ID)
    log(f"Code_Scheme_IDs for: {code_scheme_ids}")

    code_schemes = {}
    for code_scheme_id in code_scheme_ids:
        fb_map_scheme = fcw.get_code_scheme(DATASET_ID, code_scheme_id)
        code_schemes[code_scheme_id] = Scheme.from_firebase_map(fb_map_scheme)
    log(f"Code_schemes: {len(code_schemes)}")

    messages_fb = fcw.get_all_messages(DATASET_ID)
    messages = []
    # MessageID -> SequenceNumber; needed again when writing predictions back.
    seq_num_map = {}
    for message_fb in messages_fb:
        seq_num_map[message_fb["MessageID"]] = message_fb["SequenceNumber"]
        # Work around interpretation with firebase rewriting '1.0' to '1':
        # coerce confidences back to float before deserialising the message.
        for label_map in message_fb["Labels"]:
            if "Confidence" in label_map:
                label_map["Confidence"] = float(label_map["Confidence"])
        messages.append(Message.from_firebase_map(message_fb))
    log(f"Messages: {len(messages)}")

    for scheme_id in code_scheme_ids:
        log(f"Processing scheme: {scheme_id}")

        # Build the training set: one (text, code_id) pair per message, taken
        # from the first manually-checked, non-special label under this scheme.
        messages_for_model = []
        labels_for_model = []
        for message in messages:
            for label in message.labels:
                if label.scheme_id != scheme_id:
                    continue
                if label.code_id == "SPECIAL-MANUALLY_UNCODED":
                    continue
                if not label.checked:
                    continue
                messages_for_model.append(message.text)
                labels_for_model.append(label.code_id)
                break
        log(f"Messages for model: {len(labels_for_model)}")

        model, scores = model_utils.build_and_evaluate(messages_for_model,
                                                       labels_for_model)
        log(f"Model built")
        log(f"Scores: {str(scores)}")

        # Timestamp and origin shared by every label this scheme pass creates.
        dt_time = pytz.utc.localize(
            datetime.utcnow()).isoformat(timespec="microseconds")
        origin = Origin("label_predictor", "Label Predictor", "Automatic")

        messages_to_predict = []  # NOTE(review): appears unused — confirm before removing
        message_update_batch = []
        i = 0
        for message in messages:
            i = i + 1
            if i % 100 == 0:
                # Report fractional progress every 100 messages.
                fcw.set_dataset_autolabel_complete(DATASET_ID, i / len(messages))
                print(f"{i} messages / {len(messages)} processed")
            # Don't overwrite messages a human has already coded.
            if len(message.labels) != 0 and message.labels[0].checked:
                continue

            msg = message.text
            pred_label = model.predict([msg])[0]
            pred_distance = model.decision_function([msg])[0]  # NOTE(review): unused
            max_confidence = max(model.predict_proba([msg])[0])
            # Only write predictions the model is reasonably confident about.
            if (max_confidence > 0.8):
                label = Label(scheme_id, pred_label, dt_time, origin,
                              confidence=max_confidence)
                # NOTE(review): this replaces ALL existing labels with just this
                # scheme's prediction — confirm that discarding labels from
                # other schemes is intended.
                message.labels = [label]
                firebase_map = message.to_firebase_map()
                firebase_map["SequenceNumber"] = seq_num_map[
                    message.message_id]
                message_update_batch.append(firebase_map)
                # Flush in batches of ~100 updates to bound request size.
                if (len(message_update_batch) > 100):
                    fcw.set_messages_content_batch(DATASET_ID, message_update_batch)
                    log(f"Messages updated {len(message_update_batch)}")
                    message_update_batch.clear()

        # Flush the final partial batch for this scheme.
        # NOTE(review): placement relative to the scheme loop is reconstructed.
        fcw.set_messages_content_batch(DATASET_ID, message_update_batch)
        log(f"Messages updated {len(message_update_batch)}")

    fcw.set_dataset_autolabel_complete(DATASET_ID, 1.0)
# CLI dispatch: invoked with just a command, list the available dataset ids
# and stop; otherwise dump the requested content of one dataset as JSON.
if len(sys.argv) == 2:
    print("Datasets:")
    ids = fcw.get_dataset_ids()
    print(json.dumps(ids, indent=2))
    exit(0)

DATASET_ID = sys.argv[2]

if len(sys.argv) == 4:
    CONTENT_TYPE = sys.argv[3].lower()
    # "all" dumps every section, with a heading printed before each one.
    ALL = CONTENT_TYPE == "all"

    if CONTENT_TYPE in ("all", "users"):
        if ALL:
            print("Users:")
        print(json.dumps(fcw.get_user_ids(DATASET_ID), indent=2))

    if CONTENT_TYPE in ("all", "schemes"):
        if ALL:
            print("Schemes:")
        schemes = fcw.get_all_code_schemes(DATASET_ID)
        print(json.dumps(schemes, indent=2))

    if CONTENT_TYPE in ("all", "messages"):
        if ALL:
            print("Messages:")
        # Messages are emitted in sequence-number order.
        messages = fcw.get_all_messages(DATASET_ID)
        messages.sort(key=lambda msg: msg["SequenceNumber"])
        print(json.dumps(messages, indent=2))