def test_align_slot_labels(self):
    """Check that slot spans are mapped onto token ranges as BIO labels.

    The slot string lists the later span (20:25) before the earlier one
    (5:14); the expected labels are still in token order, and the span
    covering two tokens yields a ``B-`` label followed by an ``I-`` label.
    """
    token_ranges = [[0, 4], [5, 8], [9, 14], [15, 19], [20, 25]]
    slots = "20:25:music/type,5:14:music/artistName"
    expected = "NoLabel B-music/artistName I-music/artistName NoLabel B-music/type"

    aligned = align_slot_labels(token_ranges, slots, True)

    self.assertEqual(aligned, expected)
def preprocess_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
    """Convert one raw data row into the per-field values of the dataset.

    The row is featurized once through ``self.featurize``; the resulting
    features fill the feature fields, while the weights, the utterance and
    the dense features are copied from ``row_data``. Label entries are only
    added for the label fields that are present in ``self.labels``.

    Args:
        row_data (Dict[str, Any]): one row of data, read through ``DFColumn``
            keys (and ``DatasetFieldName.DENSE_FIELD`` for dense features).
            Every column is read with ``.get``, so absent columns yield
            ``None`` (or the default weight) instead of raising.

    Returns:
        Dict[str, Any]: values keyed by ``DatasetFieldName`` constants.
    """
    feats = self.featurize(row_data)
    processed = {}

    # feature fields
    # TODO move the logic to text field
    processed[DatasetFieldName.TEXT_FIELD] = self._get_tokens(feats)
    processed[DatasetFieldName.DICT_FIELD] = (
        feats.gazetteer_feats,
        feats.gazetteer_feat_weights,
        feats.gazetteer_feat_lengths,
    )
    processed[DatasetFieldName.CHAR_FIELD] = feats.characters
    processed[
        DatasetFieldName.PRETRAINED_MODEL_EMBEDDING
    ] = feats.pretrained_token_embedding

    # extra data
    # TODO move the logic to FloatField
    # A missing or otherwise falsy weight falls back to 1.0.
    doc_weight = row_data.get(DFColumn.DOC_WEIGHT)
    processed[DatasetFieldName.DOC_WEIGHT_FIELD] = doc_weight or 1.0
    word_weight = row_data.get(DFColumn.WORD_WEIGHT)
    processed[DatasetFieldName.WORD_WEIGHT_FIELD] = word_weight or 1.0
    processed[DatasetFieldName.UTTERANCE_FIELD] = row_data.get(DFColumn.UTTERANCE)
    processed[DatasetFieldName.DENSE_FIELD] = row_data.get(
        DatasetFieldName.DENSE_FIELD
    )
    processed[DatasetFieldName.TOKEN_RANGE] = feats.token_ranges

    # labels, only for the label fields configured on this handler
    if DatasetFieldName.DOC_LABEL_FIELD in self.labels:
        processed[DatasetFieldName.DOC_LABEL_FIELD] = row_data.get(
            DFColumn.DOC_LABEL
        )

    if DatasetFieldName.WORD_LABEL_FIELD in self.labels:
        raw_slots = row_data.get(DFColumn.WORD_LABEL)
        word_label_field = self.labels[DatasetFieldName.WORD_LABEL_FIELD]
        # TODO move it into word label field
        processed[DatasetFieldName.WORD_LABEL_FIELD] = data_utils.align_slot_labels(
            feats.token_ranges,
            raw_slots,
            word_label_field.use_bio_labels,
        )
        # Keep the unaligned label alongside the aligned one.
        processed[DatasetFieldName.RAW_WORD_LABEL] = raw_slots

    return processed
def test_align_slot_labels_with_none_label(self):
    """Check that a ``None`` slot string labels every token as ``NoLabel``."""
    token_ranges = [[0, 4], [5, 8]]
    expected = "NoLabel NoLabel"

    aligned = align_slot_labels(token_ranges, None, True)

    self.assertEqual(aligned, expected)
def preprocess_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
    """Preprocess a single input row that holds a sequence of utterances.

    Steps:
        1. parse the text column into a sequence of utterances and featurize
           each of them;
        2. pass the raw dictionary features only to the last utterance;
        3. align the word labels with the token ranges of the last
           utterance (only when ``WordLabelConfig._name`` is in
           ``self.labels``).

    Args:
        row_data (Dict[str, Any]): one row of data with column names as
            keys. ``RawData.TEXT``, ``RawData.DOC_LABEL`` and
            ``RawData.WORD_LABEL`` are read by plain indexing and therefore
            must be present; ``ModelInput.DICT``, ``RawData.DOC_WEIGHT`` and
            ``RawData.WORD_WEIGHT`` are optional.

    Returns:
        Dict[str, Any]: preprocessed row with the following keys:
            ``ModelInput.SEQ``: tokens of every utterance in the sequence
            ``ModelInput.TEXT``: tokens of the last utterance
            ``ModelInput.DICT``: tuple of the last utterance's gazetteer
                features, their weights and their lengths
            ``ModelInput.CHAR``: characters of the last utterance
            ``ModelInput.PRETRAINED``: pretrained token embedding of the
                last utterance
            ``DocLabelConfig._name``: doc label taken from the row
            ``ExtraField.DOC_WEIGHT`` / ``ExtraField.WORD_WEIGHT``: weights
                from the row, 1.0 when missing or falsy
            ``ExtraField.RAW_WORD_LABEL``: unaligned word label from the row
            ``ExtraField.UTTERANCE``: the unparsed value of the text column
            ``ExtraField.TOKEN_RANGE``: token ranges of the last utterance
            ``WordLabelConfig._name`` (optional): word labels aligned with
                the last utterance
    """
    raw_text = row_data[RawData.TEXT]
    utterances = data_utils.parse_json_array(raw_text)

    # ignore dictionary feature for context sentences other than the last one
    featurized = []
    for context_utterance in utterances[:-1]:
        context_record = InputRecord(raw_text=context_utterance)
        featurized.append(self.featurizer.featurize(context_record))

    # adding dictionary feature for the last (current) message
    current_record = InputRecord(
        raw_text=utterances[-1],
        raw_gazetteer_feats=row_data.get(ModelInput.DICT, ""),
    )
    current = self.featurizer.featurize(current_record)
    featurized.append(current)

    processed = {}

    # features
    processed[ModelInput.SEQ] = [feats.tokens for feats in featurized]
    processed[ModelInput.TEXT] = current.tokens
    processed[ModelInput.DICT] = (
        current.gazetteer_feats,
        current.gazetteer_feat_weights,
        current.gazetteer_feat_lengths,
    )
    processed[ModelInput.CHAR] = current.characters
    processed[ModelInput.PRETRAINED] = current.pretrained_token_embedding

    # labels
    processed[DocLabelConfig._name] = row_data[RawData.DOC_LABEL]

    # extra data
    # TODO move the logic to FloatField
    # A missing or otherwise falsy weight falls back to 1.0.
    doc_weight = row_data.get(RawData.DOC_WEIGHT)
    processed[ExtraField.DOC_WEIGHT] = doc_weight or 1.0
    word_weight = row_data.get(RawData.WORD_WEIGHT)
    processed[ExtraField.WORD_WEIGHT] = word_weight or 1.0
    raw_word_label = row_data[RawData.WORD_LABEL]
    processed[ExtraField.RAW_WORD_LABEL] = raw_word_label
    processed[ExtraField.UTTERANCE] = raw_text
    processed[ExtraField.TOKEN_RANGE] = current.token_ranges

    if WordLabelConfig._name in self.labels:
        word_label_field = self.labels[WordLabelConfig._name]
        # TODO move it into word label field
        processed[WordLabelConfig._name] = data_utils.align_slot_labels(
            current.token_ranges,
            raw_word_label,
            word_label_field.use_bio_labels,
        )

    return processed