def validated_annotation(self, predicted_output_sequence):
    """Parse a predicted output sequence and return its flat string form.

    Falls back to the canonical invalid-tree placeholder when the
    prediction cannot be parsed into a valid annotation tree.
    """
    try:
        parsed_tree = Annotation(
            predicted_output_sequence,
            accept_flat_intents_slots=self.accept_flat_intents_slots,
        ).tree
    except (ValueError, IndexError):
        # Unparseable prediction: substitute the invalid-tree marker.
        parsed_tree = Annotation(INVALID_TREE_STR).tree
    return parsed_tree.flat_str()
def get_annotation_from_string(self, stringified_tree_str: str) -> Annotation:
    """Parse an upper-cased stringified tree into an annotation tree.

    Falls back to the invalid-tree placeholder on parse failure.
    NOTE(review): despite the declared return type, this returns the
    ``.tree`` attribute, not the ``Annotation`` itself — confirm callers.
    """
    try:
        result_tree = Annotation(
            stringified_tree_str.upper(),
            accept_flat_intents_slots=self.accept_flat_intents_slots,
        ).tree
    except (ValueError, IndexError):
        # Bad tree string: substitute the invalid-tree marker.
        result_tree = Annotation(INVALID_TREE_STR).tree
    return result_tree
def stringify_annotation_tree(self, tree_tokens, tree_vocab):
    """Map tree token ids back to a string via the vocab, then parse it.

    Delegates to ``get_annotation_from_string`` so the upper-casing and
    parse/fallback logic (invalid-tree placeholder on ValueError or
    IndexError) lives in exactly one place instead of being duplicated.

    Args:
        tree_tokens: sequence of token ids for the serialized tree.
        tree_vocab: vocabulary object; its ``_vocab`` is passed to
            ``stringify`` — presumably an id-to-token mapping (verify).

    Returns:
        The parsed annotation tree, or the invalid-tree placeholder.
    """
    stringified_tree_str = stringify(tree_tokens, tree_vocab._vocab)
    return self.get_annotation_from_string(stringified_tree_str)
def initialize(self, vocab_builder=None):
    """Build vocabulary based on training corpus.

    Generator-style initializer: rows are fed in via ``send``; when the
    generator is closed (GeneratorExit) the vocabulary and the cached
    action-index lists are materialized on ``self``.
    """
    if self.vocab:
        # Vocabulary already present — nothing to build.
        return
    builder = vocab_builder or VocabBuilder()
    builder.use_unk = False
    builder.use_pad = False
    try:
        while True:
            row = yield
            # Each row's annotation column yields a tree; its action
            # sequence feeds the vocabulary.
            actions = Annotation(row[self.column]).tree.to_actions()
            builder.add_all(actions)
    except GeneratorExit:
        self.vocab = builder.make_vocab()
        self.shift_idx = self.vocab.idx[SHIFT]
        self.reduce_idx = self.vocab.idx[REDUCE]

        def select_indices(predicate):
            # Vocab indices whose symbol satisfies `predicate`.
            return [index for symbol, index in self.vocab.idx.items() if predicate(symbol)]

        self.ignore_subNTs_roots = select_indices(is_unsupported)
        self.valid_NT_idxs = select_indices(is_valid_nonterminal)
        self.valid_IN_idxs = select_indices(is_intent_nonterminal)
        self.valid_SL_idxs = select_indices(is_slot_nonterminal)
def test_annotation_errors(self):
    """
    Test invalid annotation strings for which the Annotation class
    should raise ValueError.
    """
    TEST_EXAMPLES = (
        # Extra brackets
        "[device/close_app please [] exit ]",
        # Missing closing bracket
        "[IN:CREATE_CALL call [SL:CONTACT mom ]",
        # Missing intent label
        "[IN:CREATE_REMINDER Remind me to [ [IN:CREATE_CALL [SL:METHOD_CALL call ] "
        "[SL:CONTACT John ] ] ] [SL:DATE_TIME at 6 pm tonight ] ]",
        # No brackets
        "hang on, it's marty's party, not mary's party",
    )
    for annotation_str in TEST_EXAMPLES:
        try:
            Annotation(annotation_str, accept_flat_intents_slots=True)
        except ValueError as e:
            # Expected path: the invalid string was rejected.
            print(e)
        else:
            # Use the unittest idiom (was `raise Exception("... not catched.")`,
            # which was ungrammatical and hid which example slipped through).
            self.fail(f"ValueError not raised for {annotation_str!r}")
def gen_masked_source_target(self, tokens: List[int], vocab: Vocabulary):
    """Build (decoder source, decoder target) id sequences with masking.

    The token ids are de-tokenized to a seqlogical string, parsed as an
    Annotation, and a masked tree string is generated from its single
    root child. If parsing fails, the whole sequence is masked in the
    source and padded in the target.

    Args:
        tokens: token ids, possibly including BOS/EOS (stripped first).
        vocab: vocabulary used for id<->token lookups and special tokens.

    Returns:
        Tuple of (dec_source ids, dec_target ids), optionally wrapped
        with BOS/EOS entries depending on ``self.use_bos``/``self.use_eos``.
    """
    cleaned_tokens = self.clean_eos_bos(tokens)
    # Upper-case to match the seqlogical convention expected by Annotation.
    original_target_string = " ".join(
        [vocab[idx] for idx in cleaned_tokens]).upper()
    try:
        annotation = Annotation(
            original_target_string,
            accept_flat_intents_slots=self.accept_flat_intents_slots,
        )
    except Exception as e:
        # This should never happen other than when testing
        print(e, original_target_string)
        # Fallback: fully masked source, fully padded target, sized to
        # the ORIGINAL (uncleaned) token count.
        dec_source = [
            vocab.idx[vocab.mask_token] for _ in range(len(tokens))
        ]
        dec_target = [
            vocab.idx[vocab.pad_token] for _ in range(len(tokens))
        ]
        return dec_source, dec_target
    # A well-formed seqlogical parse has exactly one top-level intent.
    assert len(annotation.root.children) == 1
    mask_tree_str = self.gen_masked_tree(annotation.root.children[0],
                                         vocab.mask_token)

    # We are calling the .split() instead of the tokenize() of tensorizer
    # because the input str contains special MASK token __MASK__
    # It we call tokenize() on this input_str, it may lower __MASK__ or split
    # in unexpected ways causing issues.
    # Hence temporary workaround is that we call split(" ") and lower all tokens
    # other than MASK tokens
    # handle special tokens in vocab
    mask_tree_str: List[str] = list(
        map(
            lambda token: SPECIAL_TOKENS.get(token, token.lower()),
            mask_tree_str.split(" "),
        ))

    # NOTE(review): vocab.idx.get returns None for out-of-vocab tokens —
    # presumably _prepare_dec_target handles that; confirm.
    dec_source = [vocab.idx.get(t) for t in mask_tree_str]
    dec_target = self._prepare_dec_target(dec_source, cleaned_tokens, vocab)

    if self.use_bos:
        if self.should_mask():
            # BOS position may itself be masked.
            dec_source.insert(0, vocab.get_mask_index())
            dec_target.insert(0, vocab.get_bos_index())
        else:
            dec_source.insert(0, vocab.get_bos_index())
            dec_target.insert(0, vocab.get_pad_index())
    if self.use_eos:
        if self.should_mask():
            # EOS position may itself be masked.
            dec_source.append(vocab.get_mask_index())
            dec_target.append(vocab.get_eos_index())
        else:
            dec_source.append(vocab.get_eos_index())
            dec_target.append(vocab.get_pad_index())
    return dec_source, dec_target
def preprocess_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
    """Featurize one raw row into model input fields.

    At training time (when a seqlogical column is present) also derives
    the action sequence from the annotation tree, validating that the
    Featurizer's tokens agree with the seqlogical tokens.

    Returns:
        Dict of dataset fields, or an empty dict when the tokenizations
        are inconsistent (the row is dropped rather than crashing the
        model's forward pass later).
    """
    utterance = row_data.get(DFColumn.UTTERANCE, "")
    features = self.featurizer.featurize(
        InputRecord(
            raw_text=utterance,
            raw_gazetteer_feats=row_data.get(DFColumn.DICT_FEAT, ""),
        ))
    actions = ""
    # training time
    if DFColumn.SEQLOGICAL in row_data:
        annotation = Annotation(row_data[DFColumn.SEQLOGICAL])
        actions = annotation.tree.to_actions()

        # Seqlogical format is required for building the tree representation of
        # compositional utterances and, it depends on tokenization.
        # Here during preprocessing, if the tokens produced from Featurizer
        # and those from the seqlogical format are not consistent, then it leads
        # to inconsistent non terminals and actions which in turn leads to
        # the model's forward method throwing an exception.
        # This should NOT happen but the check below is to make sure the
        # model training doesn't fail just in case there's inconsistency.
        tokens_from_seqlogical = annotation.tree.list_tokens()
        # Explicit boolean check instead of `assert` in a try/except:
        # asserts are stripped under `python -O`, which would silently
        # disable this guard.
        tokens_consistent = len(features.tokens) == len(
            tokens_from_seqlogical) and all(
                t1.lower() == t2.lower()
                for t1, t2 in zip(features.tokens, tokens_from_seqlogical))
        if not tokens_consistent:
            print(
                "\nTokens from Featurizer and Seqlogical format are not same "
                + f'for the utterance "{utterance}"')
            print(
                f"{len(features.tokens)} tokens from Featurizer: {features.tokens}"
            )
            print(
                f"{len(tokens_from_seqlogical)} tokens from Seqlogical format: "
                + f"{tokens_from_seqlogical}")
            return {}

    return {
        DatasetFieldName.TEXT_FIELD: features.tokens,
        DatasetFieldName.DICT_FIELD: (
            features.gazetteer_feats,
            features.gazetteer_feat_weights,
            features.gazetteer_feat_lengths,
        ),
        ACTION_FEATURE_FIELD: actions,
        # Deep copy so label actions can be consumed/mutated independently
        # of the feature actions.
        ACTION_LABEL_FIELD: copy.deepcopy(actions),
        DatasetFieldName.TOKENS: features.tokens,
        DatasetFieldName.UTTERANCE_FIELD: utterance,
    }
def test_tree_to_metric_node(self):
    """Check that annotation trees convert to the expected metric frames.

    Each example pairs a seqlogical annotation string with the expected
    ``Node`` frame (label + character span + child frames).
    """
    TEXT_EXAMPLES = [
        (
            # Flat intent with three slots, including a repeated slot label.
            "[IN:alarm/set_alarm repeat the [SL:datetime 3 : 00 pm ] " +
            "[SL:alarm/name alarm ] [SL:datetime for Sunday august 12th ] ] ",
            Node(
                label="IN:alarm/set_alarm",
                span=Span(start=0, end=49),
                children={
                    Node(label="SL:datetime", span=Span(start=11, end=20)),
                    Node(label="SL:alarm/name", span=Span(start=21, end=26)),
                    Node(label="SL:datetime", span=Span(start=27, end=49)),
                },
            ),
        ),
        (
            # Intent with a single slot surrounded by plain tokens.
            "[IN:calling/call_friend call [SL:person moms ] cellphone ]",
            Node(
                label="IN:calling/call_friend",
                span=Span(start=0, end=19),
                children={Node(label="SL:person", span=Span(start=5, end=9))},
            ),
        ),
        (
            # Compositional: an intent nested inside a slot.
            "[IN:GET_DIRECTIONS I need [SL:ANCHOR directions] to [SL:DESTINATION "
            + "[IN:GET_EVENT the jazz festival]]]",
            Node(
                label="IN:GET_DIRECTIONS",
                span=Span(start=0, end=38),
                children={
                    Node(label="SL:ANCHOR", span=Span(start=7, end=17)),
                    Node(
                        label="SL:DESTINATION",
                        span=Span(start=21, end=38),
                        children={
                            Node(label="IN:GET_EVENT", span=Span(start=21, end=38))
                        },
                    ),
                },
            ),
        ),
    ]
    for annotation_string, expected_frame in TEXT_EXAMPLES:
        annotation = Annotation(annotation_string)
        frame = CompositionalMetricReporter.tree_to_metric_node(annotation.tree)
        self.assertEqual(frame, expected_frame)
def test_annotation(self):
    """Round-trip valid annotation strings through Annotation.

    Each example pairs an input string with the expected ``flat_str()``
    output; ``None`` means the output should equal the input.
    """
    TEST_EXAMPLES = [
        ("[device/close_app exit ]", None),
        ("[IN:CREATE_CALL call [SL:CONTACT mom ] ]", None),
        ("[meta/provideSlotValue [SL:CONTACT An Yu ] ]", None),
        (
            # Punctuation gets split off as its own token ("]. " -> "] . ").
            "[IN:CREATE_REMINDER Set a reminder to [SL:TODO pick up Sean ] "
            "at [SL:DATE_TIME 3:15 pm today ]. ]",
            "[IN:CREATE_REMINDER Set a reminder to [SL:TODO pick up Sean ] "
            "at [SL:DATE_TIME 3:15 pm today ] . ]",
        ),
        (
            # Nested intent inside a slot, already canonically spaced.
            "[IN:CREATE_REMINDER Remind me to [SL:TODO [IN:CREATE_CALL "
            "[SL:METHOD_CALL call ] [SL:CONTACT John ] ] ] [SL:DATE_TIME at 6 pm "
            "tonight ] ]",
            None,
        ),
        (
            # The same example above with some whitespaces removed
            "[IN:CREATE_REMINDER Remind me to[SL:TODO[IN:CREATE_CALL"
            "[SL:METHOD_CALL call][SL:CONTACT John]]][SL:DATE_TIME at 6 pm "
            "tonight]]",
            "[IN:CREATE_REMINDER Remind me to [SL:TODO [IN:CREATE_CALL "
            "[SL:METHOD_CALL call ] [SL:CONTACT John ] ] ] [SL:DATE_TIME at 6 pm "
            "tonight ] ]",
        ),
        (
            # Combination labels
            # Multiple top-level intents get wrapped in IN:COMBINE/SL:COMBINE.
            "[IN:GET_INFO_TRAFFIC [SL:OBSTRUCTION Traffic ] please? ] "
            "[IN:GET_ESTIMATED_DURATION How long is my [SL:METHOD_TRAVEL drive ] "
            "[SL:DESTINATION [IN:GET_LOCATION_HOME home ] ] ]",
            "[IN:COMBINE [SL:COMBINE [IN:GET_INFO_TRAFFIC [SL:OBSTRUCTION Traffic ]"
            " please? ] ] [SL:COMBINE [IN:GET_ESTIMATED_DURATION How long is my "
            "[SL:METHOD_TRAVEL drive ] [SL:DESTINATION [IN:GET_LOCATION_HOME home ]"
            " ] ] ] ]",
        ),
        (r'[cu:other \["Fine"\] ]', None),  # Annotation uses escape
    ]
    for annotation_str, expected_annotation_str in TEST_EXAMPLES:
        expected_annotation_str = expected_annotation_str or annotation_str
        annotation = Annotation(annotation_str, accept_flat_intents_slots=True)
        self.assertEqual(annotation.tree.flat_str().strip(),
                         expected_annotation_str)
def numberize(self, row):
    """Tokenize, look up in vocabulary."""
    actions = Annotation(row[self.column]).tree.to_actions()
    return self.vocab.lookup_all(actions)
def get_frame(parse: str) -> Node:
    """Parse a seqlogical string and convert its tree to a metric Node."""
    tree = Annotation(parse).tree
    return CompositionalMetricReporter.tree_to_metric_node(tree)