Example #1
    def __init__(
        self,
        model: RNNModel,
        output_layer: Seq2SeqOutputLayer,
        sequence_generator: ScriptedSequenceGenerator,
        src_vocab: Vocabulary,
        trg_vocab: Vocabulary,
        dictfeat_vocab: Vocabulary,
    ):
        BaseModel.__init__(self)
        self.model = model
        self.encoder = self.model.encoder
        self.decoder = self.model.decoder
        self.output_layer = output_layer
        self.sequence_generator = sequence_generator

        # Target vocab EOS index is useful for recognizing when to stop generating
        self.trg_eos_index = trg_vocab.get_eos_index()

        # Target vocab PAD index is useful for shifting source/target prior to decoding
        self.trg_pad_index = trg_vocab.get_pad_index()

        # Source, target and dictfeat vocab are needed for export so that we can handle
        # string input
        self.src_dict = src_vocab
        self.trg_dict = trg_vocab
        self.dictfeat_dict = dictfeat_vocab

        self.force_eval_predictions = False
Example #2
    def from_config(cls, config: Config, **kwargs):
        tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
        replacements = {
            config.unk_token: UNK,
            config.pad_token: PAD,
            config.bos_token: BOS,
            config.eos_token: EOS,
            config.mask_token: MASK,
        }
        if isinstance(tokenizer, WordPieceTokenizer):
            vocab = Vocabulary(
                [token for token, _ in tokenizer.vocab.items()],
                replacements=replacements,
            )
        else:
            dictionary = BertDictionary.load(config.vocab_file)
            vocab = Vocabulary(
                dictionary.symbols, dictionary.count, replacements=replacements
            )
        return cls(
            columns=config.columns,
            tokenizer=tokenizer,
            add_bos_token=config.add_bos_token,
            add_eos_token=config.add_eos_token,
            use_eos_token_for_bos=config.use_eos_token_for_bos,
            max_seq_len=config.max_seq_len,
            vocab=vocab,
            **kwargs,
        )
Example #3
    def __init__(self, vocab: Vocabulary):
        super().__init__()
        self.vocab = ScriptVocabulary(
            list(vocab),
            pad_idx=vocab.get_pad_index(-1),
            bos_idx=vocab.get_bos_index(-1),
            eos_idx=vocab.get_eos_index(-1),
            unk_idx=vocab.get_unk_index(-1),
        )
Example #4
    def __init__(self, poss_slots: List[str], tokenizer: nn.Module = None):
        super().__init__()
        self.NO_LABEL = Token("NoLabel")
        poss_slots = list(poss_slots)
        if self.NO_LABEL not in poss_slots:
            poss_slots.insert(0, self.NO_LABEL)
        if SpecialTokens.PAD not in poss_slots:
            poss_slots.insert(1, SpecialTokens.PAD)
        self.vocab = Vocabulary(poss_slots)
Example #5
    def test_torchscript_intent_slot_output_layer(self, num_doc_labels,
                                                  num_word_labels, seq_lens):
        batch_size = len(seq_lens)
        doc_vocab = Vocabulary([
            OutputLayerTest._generate_random_string()
            for _ in range(num_doc_labels)
        ])
        word_vocab = Vocabulary([
            OutputLayerTest._generate_random_string()
            for _ in range(num_word_labels)
        ])
        intent_slot_output_layer = IntentSlotOutputLayer.from_config(
            config=IntentSlotOutputLayer.Config(),
            doc_labels=doc_vocab,
            word_labels=word_vocab,
        )
        doc_logits = OutputLayerTest._generate_doc_classification_inputs(
            batch_size, num_doc_labels)
        word_logits, seq_lens_tensor = OutputLayerTest._generate_word_tagging_inputs(
            batch_size, num_word_labels, seq_lens)
        context = {"seq_lens": seq_lens_tensor}
        torchscript_output_layer = intent_slot_output_layer.torchscript_predictions()

        pt_output = intent_slot_output_layer.get_pred(
            (doc_logits, word_logits), None, context)[1]
        ts_output = torchscript_output_layer((doc_logits, word_logits),
                                             seq_lens_tensor)

        self._validate_doc_classification_result(pt_output[0], ts_output[0],
                                                 doc_vocab)
        self._validate_word_tagging_result(pt_output[1], ts_output[1],
                                           word_vocab)

        (
            word_bpe_logits,
            seq_lens_tensor,
            token_indices_tensor,
        ) = OutputLayerTest._generate_bpe_tagging_inputs(
            batch_size, num_word_labels, seq_lens)
        context = {
            "seq_lens": seq_lens_tensor,
            "token_indices": token_indices_tensor
        }
        pt_output = intent_slot_output_layer.get_pred(
            (doc_logits, word_bpe_logits), None, context)[1]
        ts_output = torchscript_output_layer((doc_logits, word_bpe_logits),
                                             seq_lens_tensor,
                                             token_indices_tensor)

        self._validate_doc_classification_result(pt_output[0], ts_output[0],
                                                 doc_vocab)
        self._validate_word_tagging_result(pt_output[1], ts_output[1],
                                           word_vocab)
Example #6
    def _prepare_dec_target(self, dec_source: List[int],
                            clean_input_tokens: List[int],
                            vocab: Vocabulary) -> List[int]:
        dec_target = [
            vocab.get_pad_index()
            if dec_source_token != vocab.get_mask_index() else
            dec_real_target_token
            for (dec_source_token,
                 dec_real_target_token) in zip(dec_source, clean_input_tokens)
        ]

        return dec_target
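
A minimal worked trace of the helper above, as a hedged sketch: the pad and mask indices are assumed values, not taken from a real Vocabulary. Positions that were masked in dec_source keep their real target token; every other position becomes PAD so the loss ignores it.

# Hedged sketch: pad_index/mask_index are illustrative, not real vocab values.
pad_index, mask_index = 0, 3
dec_source = [7, 3, 9, 3]            # positions 1 and 3 were masked
clean_input_tokens = [7, 8, 9, 10]   # the original, unmasked sequence
dec_target = [
    pad_index if src_token != mask_index else real_token
    for src_token, real_token in zip(dec_source, clean_input_tokens)
]
assert dec_target == [0, 8, 0, 10]   # targets exist only at masked slots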
Example #7
    def test_torchscript_intent_slot_output_layer(
        self, num_doc_labels, num_word_labels, seq_lens
    ):
        batch_size = len(seq_lens)
        doc_vocab = Vocabulary(
            [OutputLayerTest._generate_random_string() for _ in range(num_doc_labels)]
        )
        word_vocab = Vocabulary(
            [OutputLayerTest._generate_random_string() for _ in range(num_word_labels)]
        )
        intent_slot_output_layer = IntentSlotOutputLayer.from_config(
            config=IntentSlotOutputLayer.Config(),
            doc_labels=doc_vocab,
            word_labels=word_vocab,
        )
        doc_logits = OutputLayerTest._generate_doc_classification_inputs(
            batch_size, num_doc_labels
        )
        word_logits, seq_lens_tensor = OutputLayerTest._generate_word_tagging_inputs(
            batch_size, num_word_labels, seq_lens
        )
        context = {"seq_lens": seq_lens_tensor}
        torchscript_output_layer = intent_slot_output_layer.torchscript_predictions()

        pt_output = intent_slot_output_layer.get_pred(
            (doc_logits, word_logits), None, context
        )[1]
        with redirect_stdout() as redirected_stdout:
            ts_output = torchscript_output_layer((doc_logits, word_logits), context)
            buffer = redirected_stdout.getvalue()
            assert (
                "Implicit dimension choice for log_softmax has been deprecated"
                not in buffer
            )

        self._validate_doc_classification_result(pt_output[0], ts_output[0], doc_vocab)
        self._validate_word_tagging_result(pt_output[1], ts_output[1], word_vocab)

        (
            word_bpe_logits,
            seq_lens_tensor,
            token_indices_tensor,
        ) = OutputLayerTest._generate_bpe_tagging_inputs(
            batch_size, num_word_labels, seq_lens
        )
        context = {"seq_lens": seq_lens_tensor, "token_indices": token_indices_tensor}
        pt_output = intent_slot_output_layer.get_pred(
            (doc_logits, word_bpe_logits), None, context
        )[1]
        ts_output = torchscript_output_layer((doc_logits, word_bpe_logits), context)

        self._validate_doc_classification_result(pt_output[0], ts_output[0], doc_vocab)
        self._validate_word_tagging_result(pt_output[1], ts_output[1], word_vocab)
Example #8
    def __init__(self, tokenizer: Tokenizer, vocab: Vocabulary, max_seq_len: int):
        super().__init__()
        self.tokenizer = tokenizer
        self.vocab = ScriptVocabulary(
            list(vocab),
            pad_idx=vocab.get_pad_index(),
            bos_idx=vocab.get_bos_index(-1),
            eos_idx=vocab.get_eos_index(-1),
            unk_idx=vocab.get_unk_index(),
        )
        self.vocab_lookup = VocabLookup(self.vocab)
        self.max_seq_len = max_seq_len
Example #9
    def test_wordblstm_export_to_caffe2(self, export_num_words,
                                        num_word_classes, test_num_words,
                                        num_predictions):
        for WORD_CONFIG in WORD_CONFIGS:
            config = self._get_config(WordTaggingTask.Config, WORD_CONFIG)
            tensorizers, data = _NewTask._init_tensorizers(config)
            word_labels = [
                SpecialTokens.PAD, SpecialTokens.UNK, "NoLabel", "person"
            ]
            tensorizers["labels"].vocab = Vocabulary(word_labels)
            tensorizers["tokens"].vocab = Vocabulary(WORD_VOCAB)
            py_model = _NewTask._init_model(config.model, tensorizers)
            dummy_test_input = self._get_rand_input_intent_slot(
                BATCH_SIZE, W_VOCAB_SIZE, test_num_words)
            exporter = ModelExporter(
                ModelExporter.Config(),
                py_model.get_export_input_names(tensorizers),
                dummy_test_input,
                py_model.vocab_to_export(tensorizers),
                py_model.get_export_output_names(tensorizers),
            )
            with tempfile.NamedTemporaryFile(
                    delete=False, suffix=".predictor") as pred_file:
                exporter.export_to_caffe2(py_model, pred_file.name)
                workspace.ResetWorkspace()
            pred_net = pe.prepare_prediction_net(pred_file.name,
                                                 CAFFE2_DB_TYPE)
            for _i in range(num_predictions):
                test_inputs = self._get_rand_input_intent_slot(
                    BATCH_SIZE, W_VOCAB_SIZE, test_num_words)
                self._feed_c2_input(workspace, test_inputs,
                                    exporter.input_names, exporter.vocab_map)
                workspace.RunNetOnce(pred_net)
                word_output_names = [
                    "{}:{}".format("word_scores", class_name)
                    for class_name in word_labels
                ]
                py_model.eval()
                py_outs = py_model(*test_inputs)
                context = {"seq_lens": test_inputs[-1]}
                target = None
                pred, score = py_model.get_pred(py_outs, target, context)
                c2_word_out = []
                for o_name in word_output_names:
                    c2_word_out.extend(list(workspace.FetchBlob(o_name)))

                np.testing.assert_array_almost_equal(
                    torch.transpose(score, 1, 2)
                    .contiguous()
                    .view(-1)
                    .detach()
                    .numpy(),
                    np.array(c2_word_out).flatten(),
                )
Example #10
    def test_seq_nn_export_to_caffe2(
        self,
        export_num_words,
        num_doc_classes,
        test_num_words,
        num_predictions,
        test_num_seq,
    ):
        config = self._get_config(SeqNNTask.Config, SEQ_NN_CONFIG)
        tensorizers, data = _NewTask._init_tensorizers(config)
        doc_labels = [SpecialTokens.UNK, "cu:other", "cu:address_Person"]
        tensorizers["labels"].vocab = Vocabulary(doc_labels)
        tensorizers["tokens"].vocab = Vocabulary(WORD_VOCAB)
        py_model = _NewTask._init_model(config.model, tensorizers)
        dummy_test_input = self._get_seq_nn_rand_input(BATCH_SIZE,
                                                       W_VOCAB_SIZE,
                                                       test_num_words,
                                                       test_num_seq)
        exporter = ModelExporter(
            ModelExporter.Config(),
            py_model.get_export_input_names(tensorizers),
            dummy_test_input,
            py_model.vocab_to_export(tensorizers),
            py_model.get_export_output_names(tensorizers),
        )
        with tempfile.NamedTemporaryFile(
                delete=False, suffix=".predictor") as pred_file:
            output_names = exporter.export_to_caffe2(py_model, pred_file.name)
            workspace.ResetWorkspace()

        pred_net = pe.prepare_prediction_net(pred_file.name, CAFFE2_DB_TYPE)
        for _i in range(num_predictions):
            test_inputs = self._get_seq_nn_rand_input(BATCH_SIZE, W_VOCAB_SIZE,
                                                      test_num_words,
                                                      test_num_seq)
            self._feed_c2_input(workspace, test_inputs, exporter.input_names,
                                exporter.vocab_map)
            workspace.RunNetOnce(pred_net)
            c2_out = [
                list(workspace.FetchBlob(o_name)) for o_name in output_names
            ]

            py_model.eval()
            py_outs = py_model(*test_inputs)
            # Do log_softmax since we do that before exporting predictor nets
            py_outs = F.log_softmax(py_outs, 1)
            np.testing.assert_array_almost_equal(
                py_outs.view(-1).detach().numpy(),
                np.array(c2_out).flatten())
Example #11
    def __init__(
        self,
        pretrained_embeddings_path: str,
        vocab: Vocabulary,
        embedding_dim: int,
        mlp_layer_dims: Optional[Sequence[int]] = None,
        lowercase_tokens: bool = False,
        skip_header: bool = True,
        delimiter: str = " ",
    ) -> None:
        super().__init__()
        pretrained_embedding = PretrainedEmbedding(
            pretrained_embeddings_path,
            lowercase_tokens=lowercase_tokens,
            skip_header=skip_header,
            delimiter=delimiter,
        )
        embeddings_weight = pretrained_embedding.initialize_embeddings_weights(
            vocab.idx,
            vocab.unk_token,
            embedding_dim,
            EmbedInitStrategy.RANDOM,
        )
        num_embeddings = len(vocab.idx)

        self.embedding = nn.Embedding(
            num_embeddings,
            embedding_dim,
            _weight=embeddings_weight,
            padding_idx=vocab.get_pad_index(),
        )

        # Initialize unk embedding with zeros
        # to guard the model against randomized decisions based on unknown words
        unk_token_idx = vocab.get_unk_index()
        if unk_token_idx >= 0:
            self.embedding.weight.data[unk_token_idx].fill_(0.0)

        # Create MLP layers
        if mlp_layer_dims is None:
            mlp_layer_dims = []

        self.mlp = nn.Sequential(
            *(
                nn.Sequential(nn.Linear(m, n), nn.ReLU())
                for m, n in zip([embedding_dim] + list(mlp_layer_dims), mlp_layer_dims)
            )
        )
        self.output_dim = mlp_layer_dims[-1] if mlp_layer_dims else embedding_dim
Example #12
    def gen_masked_source_target(self, tokens: List[int], vocab: Vocabulary):
        cleaned_tokens = self.clean_eos_bos(tokens)
        original_target_string = " ".join(
            [vocab[idx] for idx in cleaned_tokens]).upper()
        try:
            annotation = Annotation(
                original_target_string,
                accept_flat_intents_slots=self.accept_flat_intents_slots,
            )
        except Exception as e:
            # This should never happen other than when testing
            print(e, original_target_string)
            dec_source = [
                vocab.idx[vocab.mask_token] for _ in range(len(tokens))
            ]
            dec_target = [
                vocab.idx[vocab.pad_token] for _ in range(len(tokens))
            ]
            return dec_source, dec_target
        assert len(annotation.root.children) == 1
        mask_tree_str = self.gen_masked_tree(annotation.root.children[0],
                                             vocab.mask_token)

        # We call .split() here instead of the tensorizer's tokenize() because
        # the input string contains the special MASK token __MASK__. If we
        # called tokenize() on it, the tokenizer might lowercase __MASK__ or
        # split it in unexpected ways, causing issues. As a temporary
        # workaround we call split(" ") and lowercase every token other than
        # the MASK tokens.

        # handle special tokens in vocab
        mask_tree_str: List[str] = list(
            map(
                lambda token: SPECIAL_TOKENS.get(token, token.lower()),
                mask_tree_str.split(" "),
            ))

        dec_source = [vocab.idx.get(t) for t in mask_tree_str]

        dec_target = self._prepare_dec_target(dec_source, cleaned_tokens,
                                              vocab)

        if self.use_bos:
            if self.should_mask():
                dec_source.insert(0, vocab.get_mask_index())
                dec_target.insert(0, vocab.get_bos_index())
            else:
                dec_source.insert(0, vocab.get_bos_index())
                dec_target.insert(0, vocab.get_pad_index())

        if self.use_eos:
            if self.should_mask():
                dec_source.append(vocab.get_mask_index())
                dec_target.append(vocab.get_eos_index())
            else:
                dec_source.append(vocab.get_eos_index())
                dec_target.append(vocab.get_pad_index())
        return dec_source, dec_target
Example #13
def build_fairseq_vocab(
    vocab_file: str,
    dictionary_class: Dictionary = Dictionary,
    special_token_replacements: Dict[str, SpecialToken] = None,
    max_vocab: int = -1,
    min_count: int = -1,
    tokens_to_add: Optional[List[str]] = None,
):
    """
    Function builds a PyText vocabulary for models pre-trained using Fairseq
    modules. The dictionary class can take any Fairseq Dictionary class
    and is used to load the vocab file.
    """
    if not special_token_replacements:
        special_token_replacements = {
            "<pad>": SpecialTokens.PAD,
            "<s>": SpecialTokens.BOS,
            "</s>": SpecialTokens.EOS,
            "<unk>": SpecialTokens.UNK,
            "<mask>": SpecialTokens.MASK,
        }
    with PathManager.open(vocab_file) as f:
        dictionary = dictionary_class.load(f)
        # finalize will sort the dict based on frequency so only do this if
        # a min_count or max_vocab size is specified
        if min_count > 0 or max_vocab > 0:
            dictionary.finalize(threshold=min_count, nwords=max_vocab, padding_factor=1)
        if tokens_to_add:
            for token in tokens_to_add:
                dictionary.add_symbol(token)
        return Vocabulary(
            dictionary.symbols,
            dictionary.count,
            replacements=special_token_replacements,
        )
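
A hedged usage sketch of the helper above: "my_fairseq_vocab.txt" is a hypothetical path to a Fairseq-format vocab file (one "token count" pair per line), and the default special-token replacements from the function body apply.

# Sketch only: the path, size limit, and extra token below are illustrative.
vocab = build_fairseq_vocab(
    vocab_file="my_fairseq_vocab.txt",
    max_vocab=50000,               # keep at most the 50k most frequent symbols
    tokens_to_add=["<special1>"],  # appended to the dictionary after pruning
)
pad_idx = vocab.get_pad_index()    # "<pad>" was replaced by SpecialTokens.PAD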
Example #14
    def from_config(cls, config: Config, tensorizers):
        has_answer_labels = ["False", "True"]
        tensorizers["has_answer"].vocab = Vocabulary(has_answer_labels)
        vocab = tensorizers["squad_input"].vocab

        encoder = create_module(
            config.encoder,
            output_encoded_layers=True,
            padding_idx=vocab.get_pad_index(),
            vocab_size=len(vocab),
        )

        pos_decoder = create_module(config.pos_decoder,
                                    in_dim=encoder.representation_dim,
                                    out_dim=2)
        has_ans_decoder = create_module(
            config.has_ans_decoder,
            in_dim=encoder.representation_dim,
            out_dim=len(has_answer_labels),
        )

        output_layer = create_module(config.output_layer,
                                     labels=has_answer_labels,
                                     is_kd=config.is_kd)

        return cls(encoder,
                   pos_decoder,
                   has_ans_decoder,
                   output_layer,
                   is_kd=config.is_kd)
Example #15
def build_fairseq_vocab(
    vocab_file: str,
    dictionary_class: Dictionary = Dictionary,
    special_token_replacements: Dict[str, Token] = None,
    max_vocab: int = -1,
    min_count: int = -1,
    tokens_to_add: Optional[List[str]] = None,
) -> Vocabulary:
    """
    Function builds a PyText vocabulary for models pre-trained using Fairseq
    modules. The dictionary class can take any Fairseq Dictionary class
    and is used to load the vocab file.
    """
    dictionary = dictionary_class.load(vocab_file)
    # finalize will sort the dict based on frequency so only do this if
    # a min_count or max_vocab size is specified
    if min_count > 0 or max_vocab > 0:
        dictionary.finalize(threshold=min_count,
                            nwords=max_vocab,
                            padding_factor=1)
    if tokens_to_add:
        for token in tokens_to_add:
            dictionary.add_symbol(token)
    return Vocabulary(dictionary.symbols,
                      dictionary.count,
                      replacements=special_token_replacements)
Example #16
    def test_doc_classification_output_layer(self):
        tensorizer = LabelTensorizer()
        tensorizer.vocab = Vocabulary([SpecialTokens.PAD, "foo", "bar"])
        layer = ClassificationOutputLayer.from_config(
            config=ClassificationOutputLayer.Config(loss=CrossEntropyLoss.Config()),
            labels=tensorizer.vocab,
        )
        self.assertEqual(layer.loss_fn.ignore_index, 0)

        # use default pad
        tensorizer.vocab = Vocabulary(["foo", "bar"])
        layer = ClassificationOutputLayer.from_config(
            config=ClassificationOutputLayer.Config(loss=CrossEntropyLoss.Config()),
            labels=tensorizer.vocab,
        )
        self.assertEqual(layer.loss_fn.ignore_index, -1)
Example #17
    def __init__(self, num_tags, labels: Vocabulary, *args) -> None:
        super().__init__(list(labels), *args)
        self.crf = CRF(
            num_tags=num_tags,
            ignore_index=labels.get_pad_index(Padding.DEFAULT_LABEL_PAD_IDX),
            default_label_pad_index=Padding.DEFAULT_LABEL_PAD_IDX,
        )
Example #18
    def test_lookup_tokens(self):
        text = "let's tokenize this"
        tokenizer = Tokenizer()
        vocab = Vocabulary(text.split() + [BOS, EOS])
        tokens, start_idx, end_idx = lookup_tokens(
            text,
            tokenizer=tokenizer,
            vocab=vocab,
            add_bos_token=False,
            add_eos_token=False,
        )
        self.assertEqual(tokens, [0, 1, 2])
        self.assertEqual(start_idx, (0, 6, 15))
        self.assertEqual(end_idx, (5, 14, 19))

        tokens, start_idx, end_idx = lookup_tokens(
            text,
            tokenizer=tokenizer,
            vocab=vocab,
            add_bos_token=True,
            add_eos_token=True,
        )
        self.assertEqual(tokens, [3, 0, 1, 2, 4])
        self.assertEqual(start_idx, (-1, 0, 6, 15, -1))
        self.assertEqual(end_idx, (-1, 5, 14, 19, -1))
Example #19
    def from_config(cls, config: Config, **kwargs):
        """
        from_config parses the config associated with the tensorizer and
        creates both the tokenizer and the Vocabulary object. The extra
        arguments passed as kwargs allow us to reuse this function with a
        variable number of arguments (e.g., for classes that derive from
        this class).
        """
        tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
        special_token_replacements = {
            "[UNK]": UNK,
            "[PAD]": PAD,
            "[CLS]": BOS,
            "[MASK]": MASK,
            "[SEP]": EOS,
        }
        if isinstance(tokenizer, WordPieceTokenizer):
            vocab = Vocabulary(
                [token for token, _ in tokenizer.vocab.items()],
                replacements=special_token_replacements,
            )
        else:
            with PathManager.open(config.vocab_file) as file_path:
                vocab = build_fairseq_vocab(
                    dictionary_class=BertDictionary,
                    vocab_file=file_path,
                    special_token_replacements=special_token_replacements,
                )
        return cls(
            columns=config.columns,
            vocab=vocab,
            tokenizer=tokenizer,
            max_seq_len=config.max_seq_len,
            **kwargs,
        )
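
The special_token_replacements mapping above keeps the WordPiece index layout while exposing BERT's special strings as PyText's canonical tokens. A toy sketch of the assumed Vocabulary semantics:

from pytext.data.utils import PAD, UNK, Vocabulary

# Hedged sketch: a toy WordPiece-style vocab. Indices are preserved; only the
# raw strings "[PAD]"/"[UNK]" are swapped for PyText's special tokens.
toy_vocab = Vocabulary(
    ["[PAD]", "[UNK]", "hello", "world"],
    replacements={"[PAD]": PAD, "[UNK]": UNK},
)
assert toy_vocab.get_pad_index() == 0
assert toy_vocab.get_unk_index() == 1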
Example #20
    def test_torchscript_word_tagging_output_layer(self, num_labels, seq_lens):
        batch_size = len(seq_lens)
        vocab = Vocabulary(
            [OutputLayerTest._generate_random_string() for _ in range(num_labels)]
        )

        word_layer = WordTaggingOutputLayer.from_config(
            config=WordTaggingOutputLayer.Config(), labels=vocab
        )
        crf_layer = CRFOutputLayer.from_config(
            config=CRFOutputLayer.Config(), labels=vocab
        )

        logits, seq_lens_tensor = OutputLayerTest._generate_word_tagging_inputs(
            batch_size, num_labels, seq_lens
        )
        context = {"seq_lens": seq_lens_tensor}

        torchscript_word_layer = word_layer.torchscript_predictions()
        torchscript_crf_layer = crf_layer.torchscript_predictions()

        self._validate_word_tagging_result(
            word_layer.get_pred(logits, None, context)[1],
            torchscript_word_layer(logits, context),
            vocab,
        )
        self._validate_word_tagging_result(
            crf_layer.get_pred(logits, None, context)[1],
            torchscript_crf_layer(logits, context),
            vocab,
        )
Example #21
class SlotLabelTransform(Transform):
    def __init__(self, poss_slots: List[str], tokenizer: nn.Module = None):
        super().__init__()
        self.NO_LABEL = Token("NoLabel")
        poss_slots = list(poss_slots)
        if self.NO_LABEL not in poss_slots:
            poss_slots.insert(0, self.NO_LABEL)
        if SpecialTokens.PAD not in poss_slots:
            poss_slots.insert(1, SpecialTokens.PAD)
        if SpecialTokens.UNK not in poss_slots:
            poss_slots.insert(2, SpecialTokens.UNK)
        self.vocab = Vocabulary(poss_slots)

    def process_slots(self, slots_list: str) -> List[Slot]:
        if "," in slots_list:
            slots_list = slots_list.split(",")
        elif slots_list != "":
            slots_list = [slots_list]
        else:
            return []
        slot_labels: List[Slot] = []
        for curr_slot in slots_list:
            first_delim = curr_slot.find(":")
            second_delim = curr_slot.find(":", first_delim + 1)
            start_ind = int(curr_slot[0:first_delim])
            end_ind = int(curr_slot[first_delim + 1:second_delim])
            slot_name = curr_slot[second_delim + 1:]
            slot_labels.append(Slot(slot_name, start_ind, end_ind))
        return slot_labels

    def forward(self, text_and_slots):
        """
        Turn slot labels and text into a list of token labels with the same
        length as the number of tokens in the text.
        """
        tokens, start, end = text_and_slots[0].values()
        slots = self.process_slots(text_and_slots[1])
        curr_slot_i = 0
        curr_token_i = 0
        slot_labels: List[str] = []
        while curr_token_i < len(tokens) and curr_slot_i < len(slots):
            curr_slot = slots[curr_slot_i]
            if int(start[curr_token_i]) > curr_slot.end:
                curr_slot_i += 1
            else:
                if int(end[curr_token_i]) > curr_slot.start:
                    slot_labels.append(curr_slot.label)
                else:
                    slot_labels.append(self.NO_LABEL)
                curr_token_i += 1
        slot_labels += [self.NO_LABEL] * (len(tokens) - curr_token_i)
        slot_label_idx = self.vocab.lookup_all(slot_labels)
        return {"slot_labels": torch.tensor(slot_label_idx)}

    @property
    def is_jitable(self) -> bool:
        return False
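
For reference, process_slots above expects a comma-separated list of start:end:label spans over character offsets. A hedged sketch of the parse with toy input, assuming Slot exposes label/start/end fields as forward uses them:

# Toy spans over the string "hello world"; offsets are illustrative.
transform = SlotLabelTransform(poss_slots=["person", "city"])
spans = transform.process_slots("0:5:person,6:11:city")
assert [(s.label, s.start, s.end) for s in spans] == [
    ("person", 0, 5),
    ("city", 6, 11),
]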
Example #22
    def setUp(self):
        self.input_iterator = [
            {"text": "hello world"},
            {"text": "feeling lucky today"},
            {"text": "hello"},
            {"text": "lucky world"},
            {"text": "today world"},
        ]
        self.vocab = Vocabulary(["hello", "world", "feeling", "lucky", "today"])
Example #23
    def __init__(
        self,
        model: RNNModel,
        output_layer: Seq2SeqOutputLayer,
        src_vocab: Vocabulary,
        trg_vocab: Vocabulary,
        dictfeat_vocab: Vocabulary,
        generator_config=None,
    ):
        BaseModel.__init__(self)
        self.model = model
        self.encoder = self.model.encoder
        self.decoder = self.model.decoder
        self.output_layer = output_layer

        # Sequence generation is expected to be used only for inference, and to
        # take the trained model(s) as input. Creating the sequence generator
        # may apply Torchscript JIT compilation and quantization, which modify
        # the input model. Therefore, we want to create the sequence generator
        # after training.
        if generator_config is not None:
            self.sequence_generator_builder = lambda models: create_module(
                generator_config, models, trg_vocab.get_eos_index())
        self.sequence_generator = None

        # Disable predictions until testing (see above comment about sequence
        # generator). If this functionality is needed, a new sequence generator
        # with a copy of the model should be used for each epoch during the
        # EVAL stage.
        self.force_eval_predictions = False

        # Target vocab EOS index is useful for recognizing when to stop generating
        self.trg_eos_index = trg_vocab.get_eos_index()

        # Target vocab PAD index is useful for shifting source/target prior to decoding
        self.trg_pad_index = trg_vocab.get_pad_index()

        # Source, target and dictfeat vocab are needed for export so that we can handle
        # string input
        self.src_dict = src_vocab
        self.trg_dict = trg_vocab
        self.dictfeat_dict = dictfeat_vocab

        log_class_usage(__class__)
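
Given the deferred construction above, one way to materialize the generator once training has finished, as a hedged sketch (materialize_generator and trained_model are hypothetical names, not part of the class):

def materialize_generator(model, trained_model):
    # Deferred on purpose: create_module may JIT-compile and quantize the
    # model it receives, which must not happen during training.
    if model.sequence_generator is None:
        model.sequence_generator = model.sequence_generator_builder([trained_model])
    return model.sequence_generator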
Example #24
    def __init__(self, bpe, dictionary: Dictionary):
        self.bpe = bpe
        self.vocab = Vocabulary(
            dictionary.symbols,
            pad_token=str(dictionary[dictionary.pad()]),
            bos_token=str(dictionary[dictionary.bos()]),
            eos_token=str(dictionary[dictionary.eos()]),
        )
        self.bos = self.vocab.bos_token
        self.eos = self.vocab.eos_token
Example #25
    def test_create_word_tagging_output_layer(self):
        tensorizer = LabelTensorizer()
        tensorizer.vocab = Vocabulary(["foo", "bar"])
        tensorizer.pad_idx = 0
        layer = WordTaggingOutputLayer.from_config(
            config=WordTaggingOutputLayer.Config(label_weights={"foo": 2.2}),
            labels=tensorizer.vocab,
        )
        np.testing.assert_array_almost_equal(
            np.array([2.2, 1]), layer.loss_fn.weight.detach().numpy()
        )
Example #26
class LabelTransform(Transform):
    def __init__(self, label_names: List[str]):
        super().__init__()
        self.vocab = Vocabulary(label_names)

    def forward(self, label: str) -> Dict[str, torch.Tensor]:
        label_id = self.vocab.lookup_all(label)
        return {"label_ids": torch.tensor(label_id, dtype=torch.long)}

    @property
    def is_jitable(self) -> bool:
        return False
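
A hedged usage sketch of the transform above, assuming Transform is an nn.Module (so the instance is callable) and that lookup_all returns a single index for a single label string:

# Sketch: the label names are illustrative.
label_transform = LabelTransform(["negative", "positive"])
out = label_transform("positive")  # dispatches to forward
assert out["label_ids"].item() == 1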
Example #27
def build_dumb_slot_labelling_model():
    return build_slot_labelling_model(
        None,
        5,
        100,
        [10 for i in range(100)],
        0.4,
        False,
        None,
        None,
        5,
        Vocabulary([SpecialTokens.UNK, SpecialTokens.PAD, "the", "cat"]),
    )
Example #28
    def _build_vocab(self, vocab_file: str, max_vocab: int,
                     min_count: int) -> Vocabulary:
        """
        Build Vocab for XLM by calling the vocab reader associated with the
        model source.
        """
        if self.is_fairseq:
            vocab_list, counts, replacements = read_fairseq_vocab(
                vocab_file, max_vocab, min_count)
        else:
            vocab_list, counts, replacements = read_vocab(
                vocab_file, max_vocab, min_count)
        return Vocabulary(vocab_list, counts, replacements=replacements)
Example #29
    def gen_masked_source_target(self, tokens: List[int], vocab: Vocabulary):
        num_masks = self.random.randint(self.minimum_masks, len(tokens))

        ind: Set[int] = set(
            self.random.choice(len(tokens), size=num_masks, replace=False))

        dec_source: List[int] = [
            vocab.get_mask_index() if idx in ind else token
            for idx, token in enumerate(tokens)
        ]

        dec_target = self._prepare_dec_target(dec_source, tokens, vocab)

        return dec_source, dec_target
Example #30
def build_legacy_pytext_vocab_pipeline(vocab_file):
    from pytext.data.utils import Vocabulary

    tokenizer = get_tokenizer("basic_english")
    with open(vocab_file, "r") as f:
        vocab_counter = Counter(
            token for line in f for token in line.rstrip().split()
        )
    sorted_by_freq_tuples = sorted(
        vocab_counter.items(), key=lambda x: x[1], reverse=True
    )
    vocab_list = [pair[0] for pair in sorted_by_freq_tuples]
    vocab_list.insert(0, "<unk>")

    pipeline = sequential_transforms(
        tokenizer_func(tokenizer),
        PyTextVocabTransform(Vocabulary(vocab_list, unk_token="<unk>")),
    )
    return pipeline, None, None
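
A hedged usage sketch, assuming the torchtext-benchmark convention that the composed pipeline is called directly on a raw string:

# Sketch: "vocab.txt" is a placeholder path. The pipeline tokenizes with
# basic_english, then maps tokens to indices via the PyText Vocabulary.
pipeline, _, _ = build_legacy_pytext_vocab_pipeline("vocab.txt")
token_ids = pipeline("hello world")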