Example #1
    def __init__(
        self,
        model: RNNModel,
        output_layer: Seq2SeqOutputLayer,
        sequence_generator: ScriptedSequenceGenerator,
        src_vocab: Vocabulary,
        trg_vocab: Vocabulary,
        dictfeat_vocab: Vocabulary,
    ):
        BaseModel.__init__(self)
        self.model = model
        self.encoder = self.model.encoder
        self.decoder = self.model.decoder
        self.output_layer = output_layer
        self.sequence_generator = sequence_generator

        # Target vocab EOS index is useful for recognizing when to stop generating
        self.trg_eos_index = trg_vocab.get_eos_index()

        # Target vocab PAD index is useful for shifting source/target prior to decoding
        self.trg_pad_index = trg_vocab.get_pad_index()

        # Source, target and dictfeat vocab are needed for export so that we can handle
        # string input
        self.src_dict = src_vocab
        self.trg_dict = trg_vocab
        self.dictfeat_dict = dictfeat_vocab

        self.force_eval_predictions = False
Example #2
    def from_config(cls, config: Config, **kwargs):
        tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
        replacements = {
            config.unk_token: UNK,
            config.pad_token: PAD,
            config.bos_token: BOS,
            config.eos_token: EOS,
            config.mask_token: MASK,
        }
        if isinstance(tokenizer, WordPieceTokenizer):
            vocab = Vocabulary(
                [token for token, _ in tokenizer.vocab.items()],
                replacements=replacements,
            )
        else:
            dictionary = BertDictionary.load(config.vocab_file)
            vocab = Vocabulary(
                dictionary.symbols, dictionary.count, replacements=replacements
            )
        return cls(
            columns=config.columns,
            tokenizer=tokenizer,
            add_bos_token=config.add_bos_token,
            add_eos_token=config.add_eos_token,
            use_eos_token_for_bos=config.use_eos_token_for_bos,
            max_seq_len=config.max_seq_len,
            vocab=vocab,
            **kwargs,
        )
Example #3
    def __init__(self, vocab: Vocabulary):
        super().__init__()
        self.vocab = ScriptVocabulary(
            list(vocab),
            pad_idx=vocab.get_pad_index(-1),
            bos_idx=vocab.get_bos_index(-1),
            eos_idx=vocab.get_eos_index(-1),
            unk_idx=vocab.get_unk_index(-1),
        )
Example #4
    def __init__(self, poss_slots: List[str], tokenizer: nn.Module = None):
        super().__init__()
        self.NO_LABEL = Token("NoLabel")
        poss_slots = list(poss_slots)
        if self.NO_LABEL not in poss_slots:
            poss_slots.insert(0, self.NO_LABEL)
        if SpecialTokens.PAD not in poss_slots:
            poss_slots.insert(1, SpecialTokens.PAD)
        self.vocab = Vocabulary(poss_slots)
Example #5
    def test_torchscript_intent_slot_output_layer(self, num_doc_labels,
                                                  num_word_labels, seq_lens):
        batch_size = len(seq_lens)
        doc_vocab = Vocabulary([
            OutputLayerTest._generate_random_string()
            for _ in range(num_doc_labels)
        ])
        word_vocab = Vocabulary([
            OutputLayerTest._generate_random_string()
            for _ in range(num_word_labels)
        ])
        intent_slot_output_layer = IntentSlotOutputLayer.from_config(
            config=IntentSlotOutputLayer.Config(),
            doc_labels=doc_vocab,
            word_labels=word_vocab,
        )
        doc_logits = OutputLayerTest._generate_doc_classification_inputs(
            batch_size, num_doc_labels)
        word_logits, seq_lens_tensor = OutputLayerTest._generate_word_tagging_inputs(
            batch_size, num_word_labels, seq_lens)
        context = {"seq_lens": seq_lens_tensor}
        torchscript_output_layer = intent_slot_output_layer.torchscript_predictions()

        pt_output = intent_slot_output_layer.get_pred(
            (doc_logits, word_logits), None, context)[1]
        ts_output = torchscript_output_layer((doc_logits, word_logits),
                                             seq_lens_tensor)

        self._validate_doc_classification_result(pt_output[0], ts_output[0],
                                                 doc_vocab)
        self._validate_word_tagging_result(pt_output[1], ts_output[1],
                                           word_vocab)

        (
            word_bpe_logits,
            seq_lens_tensor,
            token_indices_tensor,
        ) = OutputLayerTest._generate_bpe_tagging_inputs(
            batch_size, num_word_labels, seq_lens)
        context = {
            "seq_lens": seq_lens_tensor,
            "token_indices": token_indices_tensor
        }
        pt_output = intent_slot_output_layer.get_pred(
            (doc_logits, word_bpe_logits), None, context)[1]
        ts_output = torchscript_output_layer((doc_logits, word_bpe_logits),
                                             seq_lens_tensor,
                                             token_indices_tensor)

        self._validate_doc_classification_result(pt_output[0], ts_output[0],
                                                 doc_vocab)
        self._validate_word_tagging_result(pt_output[1], ts_output[1],
                                           word_vocab)
Example #6
    def _prepare_dec_target(self, dec_source: List[int],
                            clean_input_tokens: List[int],
                            vocab: Vocabulary) -> List[int]:
        dec_target = [
            vocab.get_pad_index()
            if dec_source_token != vocab.get_mask_index() else
            dec_real_target_token
            for (dec_source_token,
                 dec_real_target_token) in zip(dec_source, clean_input_tokens)
        ]

        return dec_target
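
A minimal worked trace of the helper above, as a hedged sketch: the pad and mask indices are assumed values, not taken from a real Vocabulary. Positions that were masked in dec_source keep their real target token; every other position becomes PAD so the loss ignores it.

# Hedged sketch: pad_index/mask_index are illustrative, not real vocab values.
pad_index, mask_index = 0, 3
dec_source = [7, 3, 9, 3]            # positions 1 and 3 were masked
clean_input_tokens = [7, 8, 9, 10]   # the original, unmasked sequence
dec_target = [
    pad_index if src_token != mask_index else real_token
    for src_token, real_token in zip(dec_source, clean_input_tokens)
]
assert dec_target == [0, 8, 0, 10]   # targets exist only at masked slots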
Example #7
    def test_torchscript_intent_slot_output_layer(
        self, num_doc_labels, num_word_labels, seq_lens
    ):
        batch_size = len(seq_lens)
        doc_vocab = Vocabulary(
            [OutputLayerTest._generate_random_string() for _ in range(num_doc_labels)]
        )
        word_vocab = Vocabulary(
            [OutputLayerTest._generate_random_string() for _ in range(num_word_labels)]
        )
        intent_slot_output_layer = IntentSlotOutputLayer.from_config(
            config=IntentSlotOutputLayer.Config(),
            doc_labels=doc_vocab,
            word_labels=word_vocab,
        )
        doc_logits = OutputLayerTest._generate_doc_classification_inputs(
            batch_size, num_doc_labels
        )
        word_logits, seq_lens_tensor = OutputLayerTest._generate_word_tagging_inputs(
            batch_size, num_word_labels, seq_lens
        )
        context = {"seq_lens": seq_lens_tensor}
        torchscript_output_layer = intent_slot_output_layer.torchscript_predictions()

        pt_output = intent_slot_output_layer.get_pred(
            (doc_logits, word_logits), None, context
        )[1]
        with redirect_stdout() as redirected_stdout:
            ts_output = torchscript_output_layer((doc_logits, word_logits), context)
            buffer = redirected_stdout.getvalue()
            assert (
                "Implicit dimension choice for log_softmax has been deprecated"
                not in buffer
            )

        self._validate_doc_classification_result(pt_output[0], ts_output[0], doc_vocab)
        self._validate_word_tagging_result(pt_output[1], ts_output[1], word_vocab)

        (
            word_bpe_logits,
            seq_lens_tensor,
            token_indices_tensor,
        ) = OutputLayerTest._generate_bpe_tagging_inputs(
            batch_size, num_word_labels, seq_lens
        )
        context = {"seq_lens": seq_lens_tensor, "token_indices": token_indices_tensor}
        pt_output = intent_slot_output_layer.get_pred(
            (doc_logits, word_bpe_logits), None, context
        )[1]
        ts_output = torchscript_output_layer((doc_logits, word_bpe_logits), context)

        self._validate_doc_classification_result(pt_output[0], ts_output[0], doc_vocab)
        self._validate_word_tagging_result(pt_output[1], ts_output[1], word_vocab)
Example #8
    def __init__(self, tokenizer: Tokenizer, vocab: Vocabulary, max_seq_len: int):
        super().__init__()
        self.tokenizer = tokenizer
        self.vocab = ScriptVocabulary(
            list(vocab),
            pad_idx=vocab.get_pad_index(),
            bos_idx=vocab.get_bos_index(-1),
            eos_idx=vocab.get_eos_index(-1),
            unk_idx=vocab.get_unk_index(),
        )
        self.vocab_lookup = VocabLookup(self.vocab)
        self.max_seq_len = max_seq_len
Example #9
    def test_wordblstm_export_to_caffe2(self, export_num_words,
                                        num_word_classes, test_num_words,
                                        num_predictions):
        for WORD_CONFIG in WORD_CONFIGS:
            config = self._get_config(WordTaggingTask.Config, WORD_CONFIG)
            tensorizers, data = _NewTask._init_tensorizers(config)
            word_labels = [
                SpecialTokens.PAD, SpecialTokens.UNK, "NoLabel", "person"
            ]
            tensorizers["labels"].vocab = Vocabulary(word_labels)
            tensorizers["tokens"].vocab = Vocabulary(WORD_VOCAB)
            py_model = _NewTask._init_model(config.model, tensorizers)
            dummy_test_input = self._get_rand_input_intent_slot(
                BATCH_SIZE, W_VOCAB_SIZE, test_num_words)
            exporter = ModelExporter(
                ModelExporter.Config(),
                py_model.get_export_input_names(tensorizers),
                dummy_test_input,
                py_model.vocab_to_export(tensorizers),
                py_model.get_export_output_names(tensorizers),
            )
            with tempfile.NamedTemporaryFile(
                    delete=False, suffix=".predictor") as pred_file:
                exporter.export_to_caffe2(py_model, pred_file.name)
                workspace.ResetWorkspace()
            pred_net = pe.prepare_prediction_net(pred_file.name,
                                                 CAFFE2_DB_TYPE)
            for _i in range(num_predictions):
                test_inputs = self._get_rand_input_intent_slot(
                    BATCH_SIZE, W_VOCAB_SIZE, test_num_words)
                self._feed_c2_input(workspace, test_inputs,
                                    exporter.input_names, exporter.vocab_map)
                workspace.RunNetOnce(pred_net)
                word_output_names = [
                    "{}:{}".format("word_scores", class_name)
                    for class_name in word_labels
                ]
                py_model.eval()
                py_outs = py_model(*test_inputs)
                context = {"seq_lens": test_inputs[-1]}
                target = None
                pred, score = py_model.get_pred(py_outs, target, context)
                c2_word_out = []
                for o_name in word_output_names:
                    c2_word_out.extend(list(workspace.FetchBlob(o_name)))

                np.testing.assert_array_almost_equal(
                    torch.transpose(score, 1, 2)
                    .contiguous()
                    .view(-1)
                    .detach()
                    .numpy(),
                    np.array(c2_word_out).flatten(),
                )
Example #10
    def test_seq_nn_export_to_caffe2(
        self,
        export_num_words,
        num_doc_classes,
        test_num_words,
        num_predictions,
        test_num_seq,
    ):
        config = self._get_config(SeqNNTask.Config, SEQ_NN_CONFIG)
        tensorizers, data = _NewTask._init_tensorizers(config)
        doc_labels = [SpecialTokens.UNK, "cu:other", "cu:address_Person"]
        tensorizers["labels"].vocab = Vocabulary(doc_labels)
        tensorizers["tokens"].vocab = Vocabulary(WORD_VOCAB)
        py_model = _NewTask._init_model(config.model, tensorizers)
        dummy_test_input = self._get_seq_nn_rand_input(BATCH_SIZE,
                                                       W_VOCAB_SIZE,
                                                       test_num_words,
                                                       test_num_seq)
        exporter = ModelExporter(
            ModelExporter.Config(),
            py_model.get_export_input_names(tensorizers),
            dummy_test_input,
            py_model.vocab_to_export(tensorizers),
            py_model.get_export_output_names(tensorizers),
        )
        with tempfile.NamedTemporaryFile(
                delete=False, suffix=".predictor") as pred_file:
            output_names = exporter.export_to_caffe2(py_model, pred_file.name)
            workspace.ResetWorkspace()

        pred_net = pe.prepare_prediction_net(pred_file.name, CAFFE2_DB_TYPE)
        for _i in range(num_predictions):
            test_inputs = self._get_seq_nn_rand_input(BATCH_SIZE, W_VOCAB_SIZE,
                                                      test_num_words,
                                                      test_num_seq)
            self._feed_c2_input(workspace, test_inputs, exporter.input_names,
                                exporter.vocab_map)
            workspace.RunNetOnce(pred_net)
            c2_out = [
                list(workspace.FetchBlob(o_name)) for o_name in output_names
            ]

            py_model.eval()
            py_outs = py_model(*test_inputs)
            # Do log_softmax since we do that before exporting predictor nets
            py_outs = F.log_softmax(py_outs, 1)
            np.testing.assert_array_almost_equal(
                py_outs.view(-1).detach().numpy(),
                np.array(c2_out).flatten())
Example #11
    def __init__(
        self,
        pretrained_embeddings_path: str,
        vocab: Vocabulary,
        embedding_dim: int,
        mlp_layer_dims: Optional[Sequence[int]] = None,
        lowercase_tokens: bool = False,
        skip_header: bool = True,
        delimiter: str = " ",
    ) -> None:
        super().__init__()
        pretrained_embedding = PretrainedEmbedding(
            pretrained_embeddings_path,
            lowercase_tokens=lowercase_tokens,
            skip_header=skip_header,
            delimiter=delimiter,
        )
        embeddings_weight = pretrained_embedding.initialize_embeddings_weights(
            vocab.idx,
            vocab.unk_token,
            embedding_dim,
            EmbedInitStrategy.RANDOM,
        )
        num_embeddings = len(vocab.idx)

        self.embedding = nn.Embedding(
            num_embeddings,
            embedding_dim,
            _weight=embeddings_weight,
            padding_idx=vocab.get_pad_index(),
        )

        # Initialize unk embedding with zeros
        # to guard the model against randomized decisions based on unknown words
        unk_token_idx = vocab.get_unk_index()
        if unk_token_idx >= 0:
            self.embedding.weight.data[unk_token_idx].fill_(0.0)

        # Create MLP layers
        if mlp_layer_dims is None:
            mlp_layer_dims = []

        self.mlp = nn.Sequential(
            *(
                nn.Sequential(nn.Linear(m, n), nn.ReLU())
                for m, n in zip([embedding_dim] + list(mlp_layer_dims), mlp_layer_dims)
            )
        )
        self.output_dim = mlp_layer_dims[-1] if mlp_layer_dims else embedding_dim
Example #12
    def gen_masked_source_target(self, tokens: List[int], vocab: Vocabulary):
        cleaned_tokens = self.clean_eos_bos(tokens)
        original_target_string = " ".join(
            [vocab[idx] for idx in cleaned_tokens]).upper()
        try:
            annotation = Annotation(
                original_target_string,
                accept_flat_intents_slots=self.accept_flat_intents_slots,
            )
        except Exception as e:
            # This should never happen other than when testing
            print(e, original_target_string)
            dec_source = [
                vocab.idx[vocab.mask_token] for _ in range(len(tokens))
            ]
            dec_target = [
                vocab.idx[vocab.pad_token] for _ in range(len(tokens))
            ]
            return dec_source, dec_target
        assert len(annotation.root.children) == 1
        mask_tree_str = self.gen_masked_tree(annotation.root.children[0],
                                             vocab.mask_token)

        # We call .split() here instead of the tensorizer's tokenize() because
        # the input string contains the special MASK token __MASK__. If we
        # called tokenize() on it, the tokenizer might lowercase __MASK__ or
        # split it in unexpected ways, causing issues. As a temporary
        # workaround we call split(" ") and lowercase every token other than
        # the MASK tokens.

        # handle special tokens in vocab
        mask_tree_str: List[str] = list(
            map(
                lambda token: SPECIAL_TOKENS.get(token, token.lower()),
                mask_tree_str.split(" "),
            ))

        dec_source = [vocab.idx.get(t) for t in mask_tree_str]

        dec_target = self._prepare_dec_target(dec_source, cleaned_tokens,
                                              vocab)

        if self.use_bos:
            if self.should_mask():
                dec_source.insert(0, vocab.get_mask_index())
                dec_target.insert(0, vocab.get_bos_index())
            else:
                dec_source.insert(0, vocab.get_bos_index())
                dec_target.insert(0, vocab.get_pad_index())

        if self.use_eos:
            if self.should_mask():
                dec_source.append(vocab.get_mask_index())
                dec_target.append(vocab.get_eos_index())
            else:
                dec_source.append(vocab.get_eos_index())
                dec_target.append(vocab.get_pad_index())
        return dec_source, dec_target
Example #13
def build_fairseq_vocab(
    vocab_file: str,
    dictionary_class: Dictionary = Dictionary,
    special_token_replacements: Dict[str, SpecialToken] = None,
    max_vocab: int = -1,
    min_count: int = -1,
    tokens_to_add: Optional[List[str]] = None,
):
    """
    Function builds a PyText vocabulary for models pre-trained using Fairseq
    modules. The dictionary class can take any Fairseq Dictionary class
    and is used to load the vocab file.
    """
    if not special_token_replacements:
        special_token_replacements = {
            "<pad>": SpecialTokens.PAD,
            "<s>": SpecialTokens.BOS,
            "</s>": SpecialTokens.EOS,
            "<unk>": SpecialTokens.UNK,
            "<mask>": SpecialTokens.MASK,
        }
    with PathManager.open(vocab_file) as f:
        dictionary = dictionary_class.load(f)
        # finalize will sort the dict based on frequency so only do this if
        # a min_count or max_vocab size is specified
        if min_count > 0 or max_vocab > 0:
            dictionary.finalize(threshold=min_count, nwords=max_vocab, padding_factor=1)
        if tokens_to_add:
            for token in tokens_to_add:
                dictionary.add_symbol(token)
        return Vocabulary(
            dictionary.symbols,
            dictionary.count,
            replacements=special_token_replacements,
        )
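
A hedged usage sketch of the helper above: "my_fairseq_vocab.txt" is a hypothetical path to a Fairseq-format vocab file (one "token count" pair per line), and the default special-token replacements from the function body apply.

# Sketch only: the path, size limit, and extra token below are illustrative.
vocab = build_fairseq_vocab(
    vocab_file="my_fairseq_vocab.txt",
    max_vocab=50000,               # keep at most the 50k most frequent symbols
    tokens_to_add=["<special1>"],  # appended to the dictionary after pruning
)
pad_idx = vocab.get_pad_index()    # "<pad>" was replaced by SpecialTokens.PAD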
Example #14
    def from_config(cls, config: Config, tensorizers):
        has_answer_labels = ["False", "True"]
        tensorizers["has_answer"].vocab = Vocabulary(has_answer_labels)
        vocab = tensorizers["squad_input"].vocab

        encoder = create_module(
            config.encoder,
            output_encoded_layers=True,
            padding_idx=vocab.get_pad_index(),
            vocab_size=len(vocab),
        )

        pos_decoder = create_module(config.pos_decoder,
                                    in_dim=encoder.representation_dim,
                                    out_dim=2)
        has_ans_decoder = create_module(
            config.has_ans_decoder,
            in_dim=encoder.representation_dim,
            out_dim=len(has_answer_labels),
        )

        output_layer = create_module(config.output_layer,
                                     labels=has_answer_labels,
                                     is_kd=config.is_kd)

        return cls(encoder,
                   pos_decoder,
                   has_ans_decoder,
                   output_layer,
                   is_kd=config.is_kd)
Example #15
def build_fairseq_vocab(
    vocab_file: str,
    dictionary_class: Dictionary = Dictionary,
    special_token_replacements: Dict[str, Token] = None,
    max_vocab: int = -1,
    min_count: int = -1,
    tokens_to_add: Optional[List[str]] = None,
) -> Vocabulary:
    """
    Function builds a PyText vocabulary for models pre-trained using Fairseq
    modules. The dictionary class can take any Fairseq Dictionary class
    and is used to load the vocab file.
    """
    dictionary = dictionary_class.load(vocab_file)
    # finalize will sort the dict based on frequency so only do this if
    # a min_count or max_vocab size is specified
    if min_count > 0 or max_vocab > 0:
        dictionary.finalize(threshold=min_count,
                            nwords=max_vocab,
                            padding_factor=1)
    if tokens_to_add:
        for token in tokens_to_add:
            dictionary.add_symbol(token)
    return Vocabulary(dictionary.symbols,
                      dictionary.count,
                      replacements=special_token_replacements)
Example #16
    def test_doc_classification_output_layer(self):
        tensorizer = LabelTensorizer()
        tensorizer.vocab = Vocabulary([SpecialTokens.PAD, "foo", "bar"])
        layer = ClassificationOutputLayer.from_config(
            config=ClassificationOutputLayer.Config(loss=CrossEntropyLoss.Config()),
            labels=tensorizer.vocab,
        )
        self.assertEqual(layer.loss_fn.ignore_index, 0)

        # use default pad
        tensorizer.vocab = Vocabulary(["foo", "bar"])
        layer = ClassificationOutputLayer.from_config(
            config=ClassificationOutputLayer.Config(loss=CrossEntropyLoss.Config()),
            labels=tensorizer.vocab,
        )
        self.assertEqual(layer.loss_fn.ignore_index, -1)
Example #17
    def __init__(self, num_tags, labels: Vocabulary, *args) -> None:
        super().__init__(list(labels), *args)
        self.crf = CRF(
            num_tags=num_tags,
            ignore_index=labels.get_pad_index(Padding.DEFAULT_LABEL_PAD_IDX),
            default_label_pad_index=Padding.DEFAULT_LABEL_PAD_IDX,
        )
Example #18
    def test_lookup_tokens(self):
        text = "let's tokenize this"
        tokenizer = Tokenizer()
        vocab = Vocabulary(text.split() + [BOS, EOS])
        tokens, start_idx, end_idx = lookup_tokens(
            text,
            tokenizer=tokenizer,
            vocab=vocab,
            add_bos_token=False,
            add_eos_token=False,
        )
        self.assertEqual(tokens, [0, 1, 2])
        self.assertEqual(start_idx, (0, 6, 15))
        self.assertEqual(end_idx, (5, 14, 19))

        tokens, start_idx, end_idx = lookup_tokens(
            text,
            tokenizer=tokenizer,
            vocab=vocab,
            add_bos_token=True,
            add_eos_token=True,
        )
        self.assertEqual(tokens, [3, 0, 1, 2, 4])
        self.assertEqual(start_idx, (-1, 0, 6, 15, -1))
        self.assertEqual(end_idx, (-1, 5, 14, 19, -1))
Example #19
    def from_config(cls, config: Config, **kwargs):
        """
        from_config parses the config associated with the tensorizer and
        creates both the tokenizer and the Vocabulary object. The extra
        arguments passed as kwargs allow us to reuse this function with a
        variable number of arguments (e.g., for classes that derive from
        this class).
        """
        tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
        special_token_replacements = {
            "[UNK]": UNK,
            "[PAD]": PAD,
            "[CLS]": BOS,
            "[MASK]": MASK,
            "[SEP]": EOS,
        }
        if isinstance(tokenizer, WordPieceTokenizer):
            vocab = Vocabulary(
                [token for token, _ in tokenizer.vocab.items()],
                replacements=special_token_replacements,
            )
        else:
            with PathManager.open(config.vocab_file) as file_path:
                vocab = build_fairseq_vocab(
                    dictionary_class=BertDictionary,
                    vocab_file=file_path,
                    special_token_replacements=special_token_replacements,
                )
        return cls(
            columns=config.columns,
            vocab=vocab,
            tokenizer=tokenizer,
            max_seq_len=config.max_seq_len,
            **kwargs,
        )
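
The special_token_replacements mapping above keeps the WordPiece index layout while exposing BERT's special strings as PyText's canonical tokens. A toy sketch of the assumed Vocabulary semantics:

from pytext.data.utils import PAD, UNK, Vocabulary

# Hedged sketch: a toy WordPiece-style vocab. Indices are preserved; only the
# raw strings "[PAD]"/"[UNK]" are swapped for PyText's special tokens.
toy_vocab = Vocabulary(
    ["[PAD]", "[UNK]", "hello", "world"],
    replacements={"[PAD]": PAD, "[UNK]": UNK},
)
assert toy_vocab.get_pad_index() == 0
assert toy_vocab.get_unk_index() == 1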
Example #20
    def test_torchscript_word_tagging_output_layer(self, num_labels, seq_lens):
        batch_size = len(seq_lens)
        vocab = Vocabulary(
            [OutputLayerTest._generate_random_string() for _ in range(num_labels)]
        )

        word_layer = WordTaggingOutputLayer.from_config(
            config=WordTaggingOutputLayer.Config(), labels=vocab
        )
        crf_layer = CRFOutputLayer.from_config(
            config=CRFOutputLayer.Config(), labels=vocab
        )

        logits, seq_lens_tensor = OutputLayerTest._generate_word_tagging_inputs(
            batch_size, num_labels, seq_lens
        )
        context = {"seq_lens": seq_lens_tensor}

        torchscript_word_layer = word_layer.torchscript_predictions()
        torchscript_crf_layer = crf_layer.torchscript_predictions()

        self._validate_word_tagging_result(
            word_layer.get_pred(logits, None, context)[1],
            torchscript_word_layer(logits, context),
            vocab,
        )
        self._validate_word_tagging_result(
            crf_layer.get_pred(logits, None, context)[1],
            torchscript_crf_layer(logits, context),
            vocab,
        )
Example #21
class SlotLabelTransform(Transform):
    def __init__(self, poss_slots: List[str], tokenizer: nn.Module = None):
        super().__init__()
        self.NO_LABEL = Token("NoLabel")
        poss_slots = list(poss_slots)
        if self.NO_LABEL not in poss_slots:
            poss_slots.insert(0, self.NO_LABEL)
        if SpecialTokens.PAD not in poss_slots:
            poss_slots.insert(1, SpecialTokens.PAD)
        if SpecialTokens.UNK not in poss_slots:
            poss_slots.insert(2, SpecialTokens.UNK)
        self.vocab = Vocabulary(poss_slots)

    def process_slots(self, slots_list: str) -> List[Slot]:
        if "," in slots_list:
            slots_list = slots_list.split(",")
        elif slots_list != "":
            slots_list = [slots_list]
        else:
            return []
        slot_labels: List[Slot] = []
        for curr_slot in slots_list:
            first_delim = curr_slot.find(":")
            second_delim = curr_slot.find(":", first_delim + 1)
            start_ind = int(curr_slot[0:first_delim])
            end_ind = int(curr_slot[first_delim + 1:second_delim])
            slot_name = curr_slot[second_delim + 1:]
            slot_labels.append(Slot(slot_name, start_ind, end_ind))
        return slot_labels

    def forward(self, text_and_slots):
        """
        Turn slot labels and text into a list of token labels with the same
        length as the number of tokens in the text.
        """
        tokens, start, end = text_and_slots[0].values()
        slots = self.process_slots(text_and_slots[1])
        curr_slot_i = 0
        curr_token_i = 0
        slot_labels: List[str] = []
        while curr_token_i < len(tokens) and curr_slot_i < len(slots):
            curr_slot = slots[curr_slot_i]
            if int(start[curr_token_i]) > curr_slot.end:
                curr_slot_i += 1
            else:
                if int(end[curr_token_i]) > curr_slot.start:
                    slot_labels.append(curr_slot.label)
                else:
                    slot_labels.append(self.NO_LABEL)
                curr_token_i += 1
        slot_labels += [self.NO_LABEL] * (len(tokens) - curr_token_i)
        slot_label_idx = self.vocab.lookup_all(slot_labels)
        return {"slot_labels": torch.tensor(slot_label_idx)}

    @property
    def is_jitable(self) -> bool:
        return False
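
For reference, process_slots above expects a comma-separated list of start:end:label spans over character offsets. A hedged sketch of the parse with toy input, assuming Slot exposes label/start/end fields as forward uses them:

# Toy spans over the string "hello world"; offsets are illustrative.
transform = SlotLabelTransform(poss_slots=["person", "city"])
spans = transform.process_slots("0:5:person,6:11:city")
assert [(s.label, s.start, s.end) for s in spans] == [
    ("person", 0, 5),
    ("city", 6, 11),
]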
Example #22
    def setUp(self):
        self.input_iterator = [
            {"text": "hello world"},
            {"text": "feeling lucky today"},
            {"text": "hello"},
            {"text": "lucky world"},
            {"text": "today world"},
        ]
        self.vocab = Vocabulary(["hello", "world", "feeling", "lucky", "today"])
Example #23
    def __init__(
        self,
        model: RNNModel,
        output_layer: Seq2SeqOutputLayer,
        src_vocab: Vocabulary,
        trg_vocab: Vocabulary,
        dictfeat_vocab: Vocabulary,
        generator_config=None,
    ):
        BaseModel.__init__(self)
        self.model = model
        self.encoder = self.model.encoder
        self.decoder = self.model.decoder
        self.output_layer = output_layer

        # Sequence generation is expected to be used only for inference, and to
        # take the trained model(s) as input. Creating the sequence generator
        # may apply Torchscript JIT compilation and quantization, which modify
        # the input model. Therefore, we want to create the sequence generator
        # after training.
        if generator_config is not None:
            self.sequence_generator_builder = lambda models: create_module(
                generator_config, models, trg_vocab.get_eos_index())
        self.sequence_generator = None

        # Disable predictions until testing (see above comment about sequence
        # generator). If this functionality is needed, a new sequence generator
        # with a copy of the model should be used for each epoch during the
        # EVAL stage.
        self.force_eval_predictions = False

        # Target vocab EOS index is useful for recognizing when to stop generating
        self.trg_eos_index = trg_vocab.get_eos_index()

        # Target vocab PAD index is useful for shifting source/target prior to decoding
        self.trg_pad_index = trg_vocab.get_pad_index()

        # Source, target and dictfeat vocab are needed for export so that we can handle
        # string input
        self.src_dict = src_vocab
        self.trg_dict = trg_vocab
        self.dictfeat_dict = dictfeat_vocab

        log_class_usage(__class__)
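
Given the deferred construction above, one way to materialize the generator once training has finished, as a hedged sketch (materialize_generator and trained_model are hypothetical names, not part of the class):

def materialize_generator(model, trained_model):
    # Deferred on purpose: create_module may JIT-compile and quantize the
    # model it receives, which must not happen during training.
    if model.sequence_generator is None:
        model.sequence_generator = model.sequence_generator_builder([trained_model])
    return model.sequence_generator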
Example #24
    def __init__(self, bpe, dictionary: Dictionary):
        self.bpe = bpe
        self.vocab = Vocabulary(
            dictionary.symbols,
            pad_token=str(dictionary[dictionary.pad()]),
            bos_token=str(dictionary[dictionary.bos()]),
            eos_token=str(dictionary[dictionary.eos()]),
        )
        self.bos = self.vocab.bos_token
        self.eos = self.vocab.eos_token
Example #25
    def test_create_word_tagging_output_layer(self):
        tensorizer = LabelTensorizer()
        tensorizer.vocab = Vocabulary(["foo", "bar"])
        tensorizer.pad_idx = 0
        layer = WordTaggingOutputLayer.from_config(
            config=WordTaggingOutputLayer.Config(label_weights={"foo": 2.2}),
            labels=tensorizer.vocab,
        )
        np.testing.assert_array_almost_equal(
            np.array([2.2, 1]), layer.loss_fn.weight.detach().numpy()
        )
Example #26
class LabelTransform(Transform):
    def __init__(self, label_names: List[str]):
        super().__init__()
        self.vocab = Vocabulary(label_names)

    def forward(self, label: str) -> Dict[str, torch.Tensor]:
        label_id = self.vocab.lookup_all(label)
        return {"label_ids": torch.tensor(label_id, dtype=torch.long)}

    @property
    def is_jitable(self) -> bool:
        return False
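
A hedged usage sketch of the transform above, assuming Transform is an nn.Module (so the instance is callable) and that lookup_all returns a single index for a single label string:

# Sketch: the label names are illustrative.
label_transform = LabelTransform(["negative", "positive"])
out = label_transform("positive")  # dispatches to forward
assert out["label_ids"].item() == 1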
Example #27
def build_dumb_slot_labelling_model():
    return build_slot_labelling_model(
        None,
        5,
        100,
        [10 for i in range(100)],
        0.4,
        False,
        None,
        None,
        5,
        Vocabulary([SpecialTokens.UNK, SpecialTokens.PAD, "the", "cat"]),
    )
Example #28
    def _build_vocab(self, vocab_file: str, max_vocab: int,
                     min_count: int) -> Vocabulary:
        """
        Build Vocab for XLM by calling the vocab reader associated with the
        model source.
        """
        if self.is_fairseq:
            vocab_list, counts, replacements = read_fairseq_vocab(
                vocab_file, max_vocab, min_count)
        else:
            vocab_list, counts, replacements = read_vocab(
                vocab_file, max_vocab, min_count)
        return Vocabulary(vocab_list, counts, replacements=replacements)
Example #29
    def gen_masked_source_target(self, tokens: List[int], vocab: Vocabulary):
        num_masks = self.random.randint(self.minimum_masks, len(tokens))

        ind: Set[int] = set(
            self.random.choice(len(tokens), size=num_masks, replace=False))

        dec_source: List[int] = [
            vocab.get_mask_index() if idx in ind else token
            for idx, token in enumerate(tokens)
        ]

        dec_target = self._prepare_dec_target(dec_source, tokens, vocab)

        return dec_source, dec_target
Example #30
def build_legacy_pytext_vocab_pipeline(vocab_file):
    from pytext.data.utils import Vocabulary

    tokenizer = get_tokenizer("basic_english")
    with open(vocab_file, "r") as f:
        vocab_counter = Counter(
            token for line in f for token in line.rstrip().split()
        )
    sorted_by_freq_tuples = sorted(
        vocab_counter.items(), key=lambda x: x[1], reverse=True
    )
    vocab_list = [pair[0] for pair in sorted_by_freq_tuples]
    vocab_list.insert(0, "<unk>")

    pipeline = sequential_transforms(
        tokenizer_func(tokenizer),
        PyTextVocabTransform(Vocabulary(vocab_list, unk_token="<unk>")),
    )
    return pipeline, None, None
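
A hedged usage sketch, assuming the torchtext-benchmark convention that the composed pipeline is called directly on a raw string:

# Sketch: "vocab.txt" is a placeholder path. The pipeline tokenizes with
# basic_english, then maps tokens to indices via the PyText Vocabulary.
pipeline, _, _ = build_legacy_pytext_vocab_pipeline("vocab.txt")
token_ids = pipeline("hello world")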