예제 #1
0
    def score(self, data_path: str) -> float:
        """Score the majority-class "like" baseline on a dataset.

        Every assistant message is given the same prediction: "liked" when
        ``self._like_all`` is truthy, otherwise "not liked". A message counts
        as correct when its gold like label agrees with that prediction.

        Args:
            data_path: Path handed to ``CuriosityDialogReader().read``.

        Returns:
            Accuracy over assistant messages; 0.0 when there are none.
        """
        log.info(f"Scoring majority classifier with: {data_path}")
        dialogs = CuriosityDialogReader().read(data_path)
        log.info(f"N Dialogs: {len(dialogs)}")
        correct = 0
        total = 0
        n_messages = 0
        predict_liked = bool(self._like_all)
        for d in dialogs:
            for sender, liked in zip(d["senders"].array, d["likes"]):
                n_messages += 1
                # Only assistant messages are evaluated
                if sender != ASSISTANT_IDX:
                    continue
                # Correct only when the gold label matches the majority
                # prediction: liked/liked or not-liked/not-liked.
                if (liked.label == "liked") == predict_liked:
                    correct += 1
                total += 1

        log.info(f"N Correct Assistant Messages: {correct}")
        log.info(f"N Total Assistant Messages: {total}")
        log.info(f"N Total Messages: {n_messages}")
        # Avoid dividing by zero when there were no assistant messages
        total = max(1, total)
        return correct / total
예제 #2
0
 def train(self, data_path: str) -> None:
     """Fit the majority-class "like" baseline.

     Counts how many assistant messages in the dialogs read from
     ``data_path`` are labelled "liked" and stores in ``self._like_all``
     whether that fraction is strictly greater than one half.

     Args:
         data_path: Path handed to ``CuriosityDialogReader().read``.
     """
     log.info(f"Training majority classifier with: {data_path}")
     self._n_total_assistant_msgs = 0
     self._n_liked_assistant_msgs = 0
     n_messages = 0
     dialogs = CuriosityDialogReader().read(data_path)
     log.info(f"N Dialogs: {len(dialogs)}")
     for dialog in dialogs:
         turns = zip(dialog["senders"].array, dialog["likes"])
         for sender, like_field in turns:
             n_messages += 1
             # User messages only contribute to the overall message count
             if sender != ASSISTANT_IDX:
                 continue
             if like_field.label == "liked":
                 self._n_liked_assistant_msgs += 1
             self._n_total_assistant_msgs += 1
     # Clamp to at least 1 so the ratio below never divides by zero
     self._n_total_assistant_msgs = max(1, self._n_total_assistant_msgs)
     log.info(f"N Liked Assistant Messages: {self._n_liked_assistant_msgs}")
     log.info(f"N Total Assistant Messages: {self._n_total_assistant_msgs}")
     log.info(f"N Total Messages: {n_messages}")
     liked_fraction = (
         self._n_liked_assistant_msgs / self._n_total_assistant_msgs
     )
     self._like_all = liked_fraction > 0.5
     log.info(f"Majority Class Liked: {self._like_all}")
예제 #3
0
def fact_length_stats(data_path: str):
    """Log descriptive statistics of fact lengths for a dataset.

    Collects ``sequence_length()`` of every fact in every turn of every
    dialog read from ``data_path`` and logs a pandas ``describe`` summary
    with additional upper percentiles.

    Args:
        data_path: Path handed to ``CuriosityDialogReader().read``.
    """
    reader = CuriosityDialogReader()
    rows = []
    for dialog in reader.read(data_path):
        for turn_facts in dialog["facts"]:
            rows.extend(
                {"n_tokens": fact.sequence_length()} for fact in turn_facts
            )
    length_frame = pd.DataFrame(rows)
    quantiles = [0.25, 0.5, 0.75, 0.8, 0.9, 0.95, 0.99]
    summary = length_frame.describe(percentiles=quantiles)
    log.info(f"Summary\n{summary}")
예제 #4
0
    def score(self, data_path: str):
        """Compute mean reciprocal rank (MRR) of relevant facts.

        For each assistant message, the text of all earlier messages in the
        dialog is joined into a context string and every candidate fact is
        scored against it with ``self._similarity.score``. The reciprocal of
        the best (smallest) rank among the relevant facts is recorded.
        Assistant messages without any relevant fact are not included in the
        mean.

        Args:
            data_path: Path handed to ``CuriosityDialogReader().read``.

        Returns:
            The mean of the recorded reciprocal ranks, or 0.0 when no
            assistant message had a relevant fact (instead of NaN from
            ``np.mean`` on an empty list).
        """
        dialogs = CuriosityDialogReader().read(data_path)
        n_assistant_messages = 0
        all_rr = []
        for d in dialogs:
            msg_history = []
            turns = zip(
                d["messages"],
                d["senders"].array,
                d["facts"],
                d["fact_labels"],
            )
            for msg, sender, facts, fact_labels in turns:
                if sender == ASSISTANT_IDX:
                    context = " ".join(msg_history)
                    fact_texts = [tokens_to_str(tokens) for tokens in facts]
                    doc_scores = self._similarity.score(context, fact_texts)
                    # Fact indices ordered from highest to lowest score
                    ranking = np.argsort(-np.array(doc_scores))
                    best_rank = None
                    for rel_idx in fact_labels.array:
                        # -1 entries are skipped (presumably padding for
                        # "no relevant fact" -- confirm against the reader)
                        if rel_idx == -1:
                            continue
                        # 1-based position of the relevant fact in the ranking
                        rank = np.where(ranking == rel_idx)[0][0] + 1
                        # With several relevant facts only the best rank counts
                        if best_rank is None or rank < best_rank:
                            best_rank = rank

                    # Ignore this example if there is no relevant doc
                    if best_rank is not None:
                        all_rr.append(1 / best_rank)
                    n_assistant_messages += 1

                # Extend the history only after predicting, so the context
                # never contains the message being scored. Both user and
                # assistant messages are added.
                msg_history.append(tokens_to_str(msg.tokens))
        # np.mean([]) would emit a RuntimeWarning and yield NaN
        mean_rr = np.mean(all_rr) if all_rr else 0.0
        log.info(f"Msgs with Facts: {len(all_rr)}")
        log.info(f"Total Assistant Msgs: {n_assistant_messages}")
        log.info(f"MRR: {mean_rr}")
        return mean_rr
예제 #5
0
    def score(self, data_path: str) -> float:
        """Score the majority dialog-act baseline and return its F1.

        For each assistant message at turn index ``i`` the predicted act is
        ``self._majority_per_turn[i]`` when that turn index is known, and
        ``self._majority`` otherwise. Every gold act equal to the prediction
        counts as one correct act.

        Args:
            data_path: Path handed to ``CuriosityDialogReader().read``.

        Returns:
            The harmonic mean of precision and recall, or 0.0 when nothing
            was predicted correctly (previously a ZeroDivisionError).
        """
        log.info(f"Scoring majority classifier with: {data_path}")
        dialogs = CuriosityDialogReader().read(data_path)
        log.info(f"N Dialogs: {len(dialogs)}")
        correct = 0
        total = 0
        n_messages = 0
        for d in dialogs:
            dialog_senders = d["senders"].array
            dialog_acts_list = d["dialog_acts"]

            for i, sender in enumerate(dialog_senders):
                acts = dialog_acts_list[i].labels
                n_messages += 1
                if sender != ASSISTANT_IDX:
                    continue

                # Per-turn majority when available, global majority otherwise
                predicted = self._majority_per_turn.get(i, self._majority)
                correct += sum(1 for act in acts if act == predicted)
                total += len(acts)

        log.info(f"N Correct Acts: {correct}")
        log.info(f"N Total Acts: {total}")
        log.info(f"N Total Messages: {n_messages}")
        total = max(1, total)
        n_messages = max(1, n_messages)
        # NOTE(review): n_messages counts user AND assistant messages, while
        # predictions are only scored for assistant turns -- confirm this is
        # the intended precision denominator.
        p = correct / n_messages  # assumes 1 prediction per message
        r = correct / total
        if p + r == 0:
            # No correct predictions: F1 is defined as 0 here
            return 0.0
        return 2 * (p * r) / (p + r)
예제 #6
0
def test_text_to_instance():
    """Check like labels and fact labels produced by ``text_to_instance``.

    Builds a six-message dialog alternating between user and assistant,
    where each assistant message carries three facts, and verifies the
    resulting ``likes`` and ``fact_labels`` fields.
    """

    def make_facts(*used_flags):
        # One fact dict per flag; all facts share the same fid
        return [{"fid": 1, "used": flag} for flag in used_flags]

    facts_0 = make_facts(True, False, False)
    facts_1 = make_facts(False, False, False)
    facts_2 = make_facts(False, True, True)

    # (sender, text, liked, facts) -- facts is None for user messages
    turn_specs = [
        (USER, "first text", False, None),
        (ASSISTANT, "second text", True, facts_0),
        (USER, "third text", False, None),
        (ASSISTANT, "fourth text", True, facts_1),
        (USER, "fifth text", False, None),
        (ASSISTANT, "sixth text", False, facts_2),
    ]
    messages = []
    for sender, text, liked, facts in turn_specs:
        message = {"sender": sender, "message": text, "liked": liked}
        if facts is not None:
            message["facts"] = facts
        messages.append(message)

    dialog = {"messages": messages, "dialog_id": 0}
    reader = CuriosityDialogReader()
    instance = reader.text_to_instance(dialog, ignore_fact=True)

    like_labels = [like_field.label for like_field in instance["likes"]]
    expected_likes = [
        "not_liked",
        "liked",
        "not_liked",
        "liked",
        "not_liked",
        "not_liked",
    ]
    assert like_labels == expected_likes

    fact_labels = instance["fact_labels"]
    user_turns = (0, 2, 4)
    # Users have 1 dummy fact
    for turn in user_turns:
        assert len(fact_labels[turn].array) == 1

    for turn in user_turns:
        assert fact_labels[turn].array[0] == 0

    expected_assistant_labels = {
        1: [1, 0, 0],
        3: [0, 0, 0],
        5: [0, 1, 1],
    }
    for turn, expected in expected_assistant_labels.items():
        assert list(fact_labels[turn].array) == expected
예제 #7
0
    def train(self, data_path: str) -> None:
        """Fit the majority dialog-act baseline.

        Accumulates dialog-act counts for assistant messages, both overall
        (``self._count``) and per turn index (``self._count_per_turn``), then
        stores the most frequent act overall in ``self._majority`` and the
        most frequent act for each turn index in ``self._majority_per_turn``.
        Turn indices with no assistant acts fall back to the overall
        majority. Count ties are broken by the larger act value, since
        ``(count, act)`` tuples are sorted in descending order.

        Args:
            data_path: Path handed to ``CuriosityDialogReader().read``.

        Raises:
            IndexError: If no assistant act has been counted at all.
        """
        log.info(f"Training majority classifier with: {data_path}")
        self._n_total_assistant_msgs = 0
        n_messages = 0
        dialogs = CuriosityDialogReader().read(data_path)
        log.info(f"N Dialogs: {len(dialogs)}")
        for d in dialogs:
            dialog_senders = d["senders"].array
            dialog_acts_list = d["dialog_acts"]

            for i, sender in enumerate(dialog_senders):
                acts = dialog_acts_list[i].labels

                # Every turn index gets a histogram, even for user turns,
                # so it later receives a (fallback) majority act.
                if i not in self._count_per_turn:
                    self._count_per_turn[i] = {}
                turn_counts = self._count_per_turn[i]

                # Only care about assistant messages
                if sender == ASSISTANT_IDX:
                    for act in acts:
                        # Histogram stat per turn
                        turn_counts[act] = turn_counts.get(act, 0) + 1

                        # Histogram stat overall
                        self._count[act] = self._count.get(act, 0) + 1

                        # Total count
                        self._n_total_acts += 1

                    self._n_total_assistant_msgs += 1
                n_messages += 1
        self._n_total_assistant_msgs = max(1, self._n_total_assistant_msgs)
        # Each label is paired with its own counter (they were swapped before)
        log.info(f"N Total Assistant Messages: {self._n_total_assistant_msgs}")
        log.info(f"N Total Acts: {self._n_total_acts}")
        log.info(f"N Total Messages: {n_messages}")

        # Sort count overall
        lst = [(count, act) for act, count in self._count.items()]
        lst.sort(reverse=True)

        # Majority act over all turns
        self._majority = lst[0][1]

        for turn_idx, act_stat in self._count_per_turn.items():
            # Sort count_per_turn for each turn_idx
            lst = [(count, act) for act, count in act_stat.items()]
            lst.sort(reverse=True)

            # Turns without any assistant act use the overall majority
            majority_act = lst[0][1] if lst else self._majority

            # Majority act in this turn
            self._majority_per_turn[turn_idx] = majority_act
            print("Turn: %d, Majority Act: %s" % (turn_idx, majority_act))

        log.info(f"Majority Act: {self._majority}")
        log.info(f"Majority Map: {self._majority_per_turn}")
        log.info(f"Count Map Per Turn: {self._count_per_turn}")
        log.info(f"Count Map: {self._count}")