Exemplo n.º 1
0
        # NOTE(review): this fragment starts mid-loop — the enclosing
        # `for scheme in json_data:` (presumably) is above this view.
        # Validate the scheme before touching Firebase.
        validate_code_scheme.verify_scheme(scheme)
        id = scheme["SchemeID"]

        # Never overwrite a scheme that already exists in the dataset.
        if id in existing_ids:
            skipped_existing += 1
            continue

        fcw.set_code_scheme(dataset_id, scheme)
        print(f"Written: {id}")
        added += 1

    print(f"Added: {added}, Skipped: {skipped_existing}")
elif content_type == "messages":
    # Upload messages from json_data, skipping any whose MessageID is
    # already present in the dataset.
    added = 0
    skipped_existing = 0
    all_messages = fcw.get_all_messages(dataset_id)

    # Index existing messages by ID and record the highest sequence number
    # seen so far — presumably used below (outside this view) to assign
    # sequence numbers to newly written messages; TODO confirm.
    existing_ids = set()
    highest_seq_no = -1
    for message in all_messages:
        existing_ids.add(message["MessageID"])
        if message["SequenceNumber"] > highest_seq_no:
            highest_seq_no = message["SequenceNumber"]

    messages_to_write = []
    for message in json_data:
        validate_message_structure.verify_message(message)
        id = message["MessageID"]
        # Skip messages that already exist in the dataset.
        if id in existing_ids:
            skipped_existing += 1
            continue
Exemplo n.º 2
0
if content_type in ["all", "messages"]:
    # Label this section of the output when dumping everything.
    if content_type == "all":
        print("Messages:")

    previous_export = []
    last_updated = None
    if previous_export_file_path is not None:
        # Load the prior export so that only messages updated since its most
        # recent 'LastUpdated' timestamp need to be re-fetched.
        with open(previous_export_file_path) as f:
            previous_export = json.load(f)

        update_times = [
            isoparse(msg["LastUpdated"])
            for msg in previous_export
            if "LastUpdated" in msg
        ]
        if update_times:
            last_updated = max(update_times)
        else:
            # No usable timestamp anywhere in the file -> fall back to a
            # full download (last_updated stays None).
            warnings.warn(
                f"Previous export file {previous_export_file_path} does not contain a message with a "
                f"'LastUpdated' field; performing a full download of the entire dataset..."
            )

    # Merge the previous export with the freshly fetched messages, keyed by
    # MessageID; fetched data overrides any stale local copy.
    merged = {msg["MessageID"]: msg for msg in previous_export}
    for msg in fcw.get_all_messages(dataset_id,
                                    last_updated_after=last_updated):
        merged[msg["MessageID"]] = msg

    messages = sorted(merged.values(), key=lambda msg: msg["SequenceNumber"])
    print(json.dumps(messages, indent=2))
Exemplo n.º 3
0
def predict_labels_for_dataset(dataset_id):
    """Auto-label the messages of a dataset, one code scheme at a time.

    For each code scheme, trains a model on the manually checked labels,
    then writes a predicted label back to every unchecked message whose
    prediction confidence exceeds 0.8. Progress is reported through
    fcw.set_dataset_autolabel_complete (0.0 .. 1.0).

    :param dataset_id: Firebase id of the dataset to process.
    """
    DATASET_ID = dataset_id
    fcw.set_dataset_autolabel_complete(DATASET_ID, 0.0)
    log(f"Predicting labels for: {DATASET_ID}")
    code_scheme_ids = fcw.get_code_scheme_ids(DATASET_ID)
    log(f"Code_Scheme_IDs for: {code_scheme_ids}")

    code_schemes = {}
    for code_scheme_id in code_scheme_ids:
        fb_map_scheme = fcw.get_code_scheme(DATASET_ID, code_scheme_id)
        code_schemes[code_scheme_id] = Scheme.from_firebase_map(fb_map_scheme)

    log(f"Code_schemes: {len(code_schemes)}")

    messages_fb = fcw.get_all_messages(DATASET_ID)
    messages = []
    # MessageID -> SequenceNumber, so the sequence number survives the
    # round-trip through the Message object (which doesn't carry it).
    seq_num_map = {}
    for message_fb in messages_fb:
        seq_num_map[message_fb["MessageID"]] = message_fb["SequenceNumber"]
        # Work around interpretation with firebase rewriting '1.0' to '1'
        for label_map in message_fb["Labels"]:
            if "Confidence" in label_map:
                label_map["Confidence"] = float(label_map["Confidence"])

        messages.append(Message.from_firebase_map(message_fb))

    total_messages = len(messages)
    log(f"Messages: {total_messages}")

    for scheme_id in code_scheme_ids:
        log(f"Processing scheme: {scheme_id}")

        # Training set: for each message, the first manually checked,
        # non-"uncoded" label that belongs to this scheme.
        messages_for_model = []
        labels_for_model = []
        for message in messages:
            for label in message.labels:
                if label.scheme_id != scheme_id:
                    continue
                if label.code_id == "SPECIAL-MANUALLY_UNCODED":
                    continue
                if not label.checked:
                    continue

                messages_for_model.append(message.text)
                labels_for_model.append(label.code_id)
                break  # At most one training label per message.

        log(f"Messages for model: {len(labels_for_model)}")

        model, scores = model_utils.build_and_evaluate(messages_for_model,
                                                       labels_for_model)

        log("Model built")
        log(f"Scores: {str(scores)}")

        dt_time = pytz.utc.localize(
            datetime.utcnow()).isoformat(timespec="microseconds")
        origin = Origin("label_predictor", "Label Predictor", "Automatic")

        message_update_batch = []
        for i, message in enumerate(messages, start=1):
            if i % 100 == 0:
                fcw.set_dataset_autolabel_complete(DATASET_ID,
                                                   i / total_messages)
                print(f"{i} messages / {total_messages} processed")

            # Never overwrite a label a human has already checked.
            if len(message.labels) != 0 and message.labels[0].checked:
                continue
            msg = message.text

            # Fixed: the original also called model.decision_function() here
            # and discarded the result — pure wasted work per message.
            pred_label = model.predict([msg])[0]
            max_confidence = max(model.predict_proba([msg])[0])

            # Only accept confident predictions.
            if max_confidence > 0.8:
                label = Label(scheme_id,
                              pred_label,
                              dt_time,
                              origin,
                              confidence=max_confidence)
                message.labels = [label]
                firebase_map = message.to_firebase_map()
                firebase_map["SequenceNumber"] = seq_num_map[
                    message.message_id]
                message_update_batch.append(firebase_map)

                # Flush in batches to bound request size.
                if len(message_update_batch) > 100:
                    fcw.set_messages_content_batch(DATASET_ID,
                                                   message_update_batch)
                    log(f"Messages updated {len(message_update_batch)}")
                    message_update_batch.clear()

        # Flush whatever is left for this scheme.
        fcw.set_messages_content_batch(DATASET_ID, message_update_batch)
        log(f"Messages updated {len(message_update_batch)}")

    # Fixed: the original marked the dataset 100% complete INSIDE the scheme
    # loop, i.e. after the first scheme even when more schemes remained.
    fcw.set_dataset_autolabel_complete(DATASET_ID, 1.0)
Exemplo n.º 4
0
# With no dataset argument, just list the available dataset ids and exit.
if len(sys.argv) == 2:
    print("Datasets:")
    ids = fcw.get_dataset_ids()
    print(json.dumps(ids, indent=2))
    exit(0)

DATASET_ID = sys.argv[2]

# Fixed: CONTENT_TYPE was only assigned when exactly 4 argv entries were
# present, so running with a dataset id but no content-type argument crashed
# with a NameError below. Default to dumping everything.
CONTENT_TYPE = sys.argv[3].lower() if len(sys.argv) >= 4 else "all"

ALL = CONTENT_TYPE == "all"

if CONTENT_TYPE in ["all", "users"]:
    if ALL:
        print("Users:")
    print(json.dumps(fcw.get_user_ids(DATASET_ID), indent=2))

if CONTENT_TYPE in ["all", "schemes"]:
    if ALL:
        print("Schemes:")
    schemes = fcw.get_all_code_schemes(DATASET_ID)
    print(json.dumps(schemes, indent=2))

if CONTENT_TYPE in ["all", "messages"]:
    if ALL:
        print("Messages:")
    messages = fcw.get_all_messages(DATASET_ID)
    # Present messages in dataset order.
    messages.sort(key=lambda msg: msg["SequenceNumber"])
    print(json.dumps(messages, indent=2))