def validated_annotation(self, predicted_output_sequence):
     """Return the flat string of the tree built from a predicted sequence.

     ``predicted_output_sequence`` is passed to ``Annotation`` along with
     ``self.accept_flat_intents_slots``.  When building the annotation or
     reading its ``tree`` raises ``ValueError`` or ``IndexError``, the tree of
     ``Annotation(INVALID_TREE_STR)`` is used instead.

     Returns:
         Whatever ``flat_str()`` of the chosen tree returns.
     """
     try:
         annotation = Annotation(
             predicted_output_sequence,
             accept_flat_intents_slots=self.accept_flat_intents_slots,
         )
         parsed_tree = annotation.tree
     except (ValueError, IndexError):
         # Unparseable prediction: substitute the designated invalid tree.
         fallback = Annotation(INVALID_TREE_STR)
         parsed_tree = fallback.tree
     return parsed_tree.flat_str()
 def get_annotation_from_string(self, stringified_tree_str: str) -> Annotation:
     """Build a tree from the upper-cased form of ``stringified_tree_str``.

     The string is upper-cased and passed to ``Annotation`` together with
     ``self.accept_flat_intents_slots``.  If that raises ``ValueError`` or
     ``IndexError``, the tree of ``Annotation(INVALID_TREE_STR)`` is
     returned instead.

     NOTE(review): the signature is annotated ``-> Annotation`` but the value
     returned is the annotation's ``.tree`` attribute -- confirm which one
     callers expect.
     """
     try:
         annotation = Annotation(
             stringified_tree_str.upper(),
             accept_flat_intents_slots=self.accept_flat_intents_slots,
         )
         parsed_tree = annotation.tree
     except (ValueError, IndexError):
         # Fall back to the designated invalid tree.
         fallback = Annotation(INVALID_TREE_STR)
         parsed_tree = fallback.tree
     return parsed_tree
Пример #3
0
 def stringify_annotation_tree(self, tree_tokens, tree_vocab):
     """Stringify ``tree_tokens`` and build a tree from the upper-cased text.

     ``stringify`` is called with ``tree_tokens`` and ``tree_vocab._vocab``;
     its result is upper-cased and passed to ``Annotation`` together with
     ``self.accept_flat_intents_slots``.  If that raises ``ValueError`` or
     ``IndexError``, the tree of ``Annotation(INVALID_TREE_STR)`` is
     returned instead.
     """
     tree_text = stringify(tree_tokens, tree_vocab._vocab)
     try:
         annotation = Annotation(
             tree_text.upper(),
             accept_flat_intents_slots=self.accept_flat_intents_slots,
         )
         parsed_tree = annotation.tree
     except (ValueError, IndexError):
         # Fall back to the designated invalid tree.
         fallback = Annotation(INVALID_TREE_STR)
         parsed_tree = fallback.tree
     return parsed_tree
Пример #4
0
    def initialize(self, vocab_builder=None):
        """Build the action vocabulary from rows sent into this generator.

        This is a generator: it returns immediately if ``self.vocab`` is
        already truthy.  Otherwise each row received through ``yield`` has
        ``row[self.column]`` turned into an ``Annotation`` whose tree actions
        are added to the vocabulary builder.  When the generator is closed
        (``GeneratorExit``), the vocabulary is created and the shift/reduce
        indices and the filtered index lists are stored on ``self``.

        Args:
            vocab_builder: Optional builder to use; a new ``VocabBuilder`` is
                created when a falsy value is given.  ``use_unk`` and
                ``use_pad`` are set to ``False`` on it.
        """
        if self.vocab:
            return
        if not vocab_builder:
            vocab_builder = VocabBuilder()
        vocab_builder.use_unk = False
        vocab_builder.use_pad = False

        try:
            while True:
                row = yield
                row_actions = Annotation(row[self.column]).tree.to_actions()
                vocab_builder.add_all(row_actions)
        except GeneratorExit:
            self.vocab = vocab_builder.make_vocab()
            self.shift_idx = self.vocab.idx[SHIFT]
            self.reduce_idx = self.vocab.idx[REDUCE]

            def indices_where(predicate):
                # Collect the index of every vocab entry accepted by predicate.
                matches = []
                for symbol, index in self.vocab.idx.items():
                    if predicate(symbol):
                        matches.append(index)
                return matches

            self.ignore_subNTs_roots = indices_where(is_unsupported)
            self.valid_NT_idxs = indices_where(is_valid_nonterminal)
            self.valid_IN_idxs = indices_where(is_intent_nonterminal)
            self.valid_SL_idxs = indices_where(is_slot_nonterminal)
Пример #5
0
    def test_annotation_errors(self):
        """
        Check that malformed annotation strings make Annotation raise
        ValueError; any string that is accepted fails the test.
        """

        invalid_annotations = (
            # Extra brackets
            "[device/close_app please [] exit ]",
            # Missing closing bracket
            "[IN:CREATE_CALL call [SL:CONTACT mom ]",
            # Missing intent label
            "[IN:CREATE_REMINDER Remind me to [ [IN:CREATE_CALL [SL:METHOD_CALL call ] "
            "[SL:CONTACT John ] ] ] [SL:DATE_TIME at 6 pm tonight ] ]",
            # No brackets
            "hang on, it's marty's party, not mary's party",
        )
        for bad_annotation in invalid_annotations:
            try:
                Annotation(bad_annotation, accept_flat_intents_slots=True)
            except ValueError as err:
                # Expected outcome: show the message and move to the next case.
                print(err)
                continue
            # Reaching this point means no ValueError was raised.
            raise Exception("Annotation error not catched.")
Пример #6
0
    def gen_masked_source_target(self, tokens: List[int], vocab: Vocabulary):
        """Build the masked decoder source ids and the matching target ids.

        ``tokens`` is cleaned with ``self.clean_eos_bos``, mapped back to
        strings through ``vocab``, joined with spaces, upper-cased, and parsed
        as an ``Annotation``.  The single child of the annotation root is
        passed to ``self.gen_masked_tree``; the resulting string is split on
        single spaces and looked up in ``vocab.idx`` to form the source.  The
        target comes from ``self._prepare_dec_target``.  BOS/EOS positions are
        added when ``self.use_bos`` / ``self.use_eos`` are set.

        If constructing the annotation raises any ``Exception``, the error is
        printed and an all-mask source plus all-pad target (one entry per
        element of ``tokens``) is returned.

        Returns:
            A ``(dec_source, dec_target)`` pair of lists.
        """
        cleaned_tokens = self.clean_eos_bos(tokens)
        words = [vocab[idx] for idx in cleaned_tokens]
        original_target_string = " ".join(words).upper()
        try:
            annotation = Annotation(
                original_target_string,
                accept_flat_intents_slots=self.accept_flat_intents_slots,
            )
        except Exception as e:
            # This should never happen other than when testing
            print(e, original_target_string)
            all_masked = [vocab.idx[vocab.mask_token] for _ in tokens]
            all_padded = [vocab.idx[vocab.pad_token] for _ in tokens]
            return all_masked, all_padded

        root_children = annotation.root.children
        assert len(root_children) == 1
        masked_tree = self.gen_masked_tree(root_children[0], vocab.mask_token)

        # The masked string is split on " " rather than run through the
        # tensorizer's tokenize(): it contains the special __MASK__ token, and
        # tokenize() might lower-case it or split it in unexpected ways.  As a
        # temporary workaround every piece is lower-cased here unless it has
        # an entry in SPECIAL_TOKENS, in which case that entry is used.
        masked_pieces: List[str] = [
            SPECIAL_TOKENS.get(piece, piece.lower())
            for piece in masked_tree.split(" ")
        ]

        dec_source = [vocab.idx.get(piece) for piece in masked_pieces]
        dec_target = self._prepare_dec_target(dec_source, cleaned_tokens, vocab)

        if self.use_bos:
            # Either mask BOS in the source and predict it, or keep it and pad.
            if self.should_mask():
                bos_pair = (vocab.get_mask_index(), vocab.get_bos_index())
            else:
                bos_pair = (vocab.get_bos_index(), vocab.get_pad_index())
            dec_source.insert(0, bos_pair[0])
            dec_target.insert(0, bos_pair[1])

        if self.use_eos:
            # Same choice for EOS, appended at the end.
            if self.should_mask():
                eos_pair = (vocab.get_mask_index(), vocab.get_eos_index())
            else:
                eos_pair = (vocab.get_eos_index(), vocab.get_pad_index())
            dec_source.append(eos_pair[0])
            dec_target.append(eos_pair[1])
        return dec_source, dec_target
Пример #7
0
    def preprocess_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
        """Featurize one raw row and attach the actions of its annotation.

        The utterance and gazetteer features are read from ``row_data`` and
        passed through ``self.featurizer``.  When the row carries a
        ``DFColumn.SEQLOGICAL`` entry (training time), it is parsed as an
        ``Annotation`` and the tree's actions are used for both the action
        feature and (as a deep copy) the action label.

        Args:
            row_data: Mapping of column name to raw value for one row.

        Returns:
            A dict of dataset fields for the row, or an empty dict when the
            featurizer tokens and the annotation tokens disagree (a
            diagnostic is printed in that case).
        """
        utterance = row_data.get(DFColumn.UTTERANCE, "")
        features = self.featurizer.featurize(
            InputRecord(
                raw_text=utterance,
                raw_gazetteer_feats=row_data.get(DFColumn.DICT_FEAT, ""),
            )
        )
        actions = ""
        # training time
        if DFColumn.SEQLOGICAL in row_data:
            annotation = Annotation(row_data[DFColumn.SEQLOGICAL])
            actions = annotation.tree.to_actions()

            # The seqlogical format is needed to build the tree representation
            # of compositional utterances, and it depends on tokenization.
            # If the Featurizer tokens and the seqlogical tokens disagree, the
            # non-terminals and actions become inconsistent and the model's
            # forward method throws.  This should NOT happen, but the row is
            # dropped here so that training does not fail if it does.
            #
            # The comparison is an explicit check rather than ``assert``:
            # assert statements are removed under ``python -O``, which would
            # silently disable this guard.
            tokens_from_seqlogical = annotation.tree.list_tokens()
            tokens_match = len(features.tokens) == len(
                tokens_from_seqlogical
            ) and all(
                t1.lower() == t2.lower()
                for t1, t2 in zip(features.tokens, tokens_from_seqlogical)
            )
            if not tokens_match:
                print(
                    "\nTokens from Featurizer and Seqlogical format are not same "
                    + f'for the utterance "{utterance}"'
                )
                print(
                    f"{len(features.tokens)} tokens from Featurizer: {features.tokens}"
                )
                print(
                    f"{len(tokens_from_seqlogical)} tokens from Seqlogical format: "
                    + f"{tokens_from_seqlogical}"
                )
                return {}

        return {
            DatasetFieldName.TEXT_FIELD: features.tokens,
            DatasetFieldName.DICT_FIELD: (
                features.gazetteer_feats,
                features.gazetteer_feat_weights,
                features.gazetteer_feat_lengths,
            ),
            ACTION_FEATURE_FIELD: actions,
            # Deep copy so the label does not share the feature's object.
            ACTION_LABEL_FIELD: copy.deepcopy(actions),
            DatasetFieldName.TOKENS: features.tokens,
            DatasetFieldName.UTTERANCE_FIELD: utterance,
        }
 def test_tree_to_metric_node(self):
     """Compare tree_to_metric_node output with hand-built Node frames.

     Each case pairs an annotation string with the ``Node`` expected from
     ``CompositionalMetricReporter.tree_to_metric_node`` for its tree.
     """
     # Flat intent with three slots, two of which share a label.
     alarm_case = (
         "[IN:alarm/set_alarm  repeat the [SL:datetime 3 : 00 pm ] "
         "[SL:alarm/name alarm ]  [SL:datetime for Sunday august 12th ]  ] ",
         Node(
             label="IN:alarm/set_alarm",
             span=Span(start=0, end=49),
             children={
                 Node(label="SL:datetime", span=Span(start=11, end=20)),
                 Node(label="SL:alarm/name", span=Span(start=21, end=26)),
                 Node(label="SL:datetime", span=Span(start=27, end=49)),
             },
         ),
     )
     # Single slot in the middle of the utterance.
     call_case = (
         "[IN:calling/call_friend call [SL:person moms ] cellphone ]",
         Node(
             label="IN:calling/call_friend",
             span=Span(start=0, end=19),
             children={Node(label="SL:person", span=Span(start=5, end=9))},
         ),
     )
     # Intent nested inside a slot.
     directions_case = (
         "[IN:GET_DIRECTIONS I need [SL:ANCHOR directions] to [SL:DESTINATION "
         "[IN:GET_EVENT the jazz festival]]]",
         Node(
             label="IN:GET_DIRECTIONS",
             span=Span(start=0, end=38),
             children={
                 Node(label="SL:ANCHOR", span=Span(start=7, end=17)),
                 Node(
                     label="SL:DESTINATION",
                     span=Span(start=21, end=38),
                     children={
                         Node(label="IN:GET_EVENT", span=Span(start=21, end=38))
                     },
                 ),
             },
         ),
     )
     for annotation_string, expected_frame in (
         alarm_case,
         call_case,
         directions_case,
     ):
         parsed_tree = Annotation(annotation_string).tree
         actual_frame = CompositionalMetricReporter.tree_to_metric_node(parsed_tree)
         self.assertEqual(actual_frame, expected_frame)
Пример #9
0
 def test_annotation(self):
     """Check the stripped ``flat_str()`` of parsed annotation strings.

     Each case is ``(input, expected)``; an expected value of ``None`` means
     the flat string should equal the input itself.  Annotations are built
     with ``accept_flat_intents_slots=True``.
     """
     cases = [
         ("[device/close_app exit ]", None),
         ("[IN:CREATE_CALL call [SL:CONTACT mom ] ]", None),
         ("[meta/provideSlotValue [SL:CONTACT An Yu ] ]", None),
         (
             "[IN:CREATE_REMINDER Set a reminder to [SL:TODO pick up Sean ] "
             "at [SL:DATE_TIME 3:15 pm today ]. ]",
             "[IN:CREATE_REMINDER Set a reminder to [SL:TODO pick up Sean ] "
             "at [SL:DATE_TIME 3:15 pm today ] . ]",
         ),
         (
             "[IN:CREATE_REMINDER Remind me to [SL:TODO [IN:CREATE_CALL "
             "[SL:METHOD_CALL call ] [SL:CONTACT John ] ] ] [SL:DATE_TIME at 6 pm "
             "tonight ] ]",
             None,
         ),
         (  # The same example above with some whitespaces removed
             "[IN:CREATE_REMINDER Remind me to[SL:TODO[IN:CREATE_CALL"
             "[SL:METHOD_CALL call][SL:CONTACT John]]][SL:DATE_TIME at 6 pm "
             "tonight]]",
             "[IN:CREATE_REMINDER Remind me to [SL:TODO [IN:CREATE_CALL "
             "[SL:METHOD_CALL call ] [SL:CONTACT John ] ] ] [SL:DATE_TIME at 6 pm "
             "tonight ] ]",
         ),
         (  # Combination labels
             "[IN:GET_INFO_TRAFFIC [SL:OBSTRUCTION Traffic ] please? ] "
             "[IN:GET_ESTIMATED_DURATION How long is my [SL:METHOD_TRAVEL drive ] "
             "[SL:DESTINATION [IN:GET_LOCATION_HOME home ] ] ]",
             "[IN:COMBINE [SL:COMBINE [IN:GET_INFO_TRAFFIC [SL:OBSTRUCTION Traffic ]"
             " please? ] ] [SL:COMBINE [IN:GET_ESTIMATED_DURATION How long is my "
             "[SL:METHOD_TRAVEL drive ] [SL:DESTINATION [IN:GET_LOCATION_HOME home ]"
             " ] ] ] ]",
         ),
         (r'[cu:other \["Fine"\] ]', None),  # Annotation uses escape
     ]
     for source_text, expected_text in cases:
         # A falsy expectation means "same as the input".
         if not expected_text:
             expected_text = source_text
         parsed = Annotation(source_text, accept_flat_intents_slots=True)
         actual_text = parsed.tree.flat_str().strip()
         self.assertEqual(actual_text, expected_text)
Пример #10
0
 def numberize(self, row):
     """Look up the annotation's tree actions in ``self.vocab``.

     ``row[self.column]`` is parsed as an ``Annotation``; the result of
     ``self.vocab.lookup_all`` on the tree's actions is returned.
     """
     tree_actions = Annotation(row[self.column]).tree.to_actions()
     return self.vocab.lookup_all(tree_actions)
Пример #11
0
def get_frame(parse: str) -> Node:
    """Parse ``parse`` as an ``Annotation`` and convert its tree.

    Returns the result of
    ``CompositionalMetricReporter.tree_to_metric_node`` for that tree.
    """
    parsed_tree = Annotation(parse).tree
    return CompositionalMetricReporter.tree_to_metric_node(parsed_tree)