def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        elmo_text_field_embedder: TextFieldEmbedder,
        quote_response_encoder: Seq2SeqEncoder,
        quote_response_encoder_aux: Seq2VecEncoder,
        classifier_feedforward: FeedForward,
        classifier_feedforward_2: FeedForward,
        initializer: InitializerApplicator = InitializerApplicator(),
        regularizer: Optional[RegularizerApplicator] = None,
        report_auxiliary_metrics: bool = False,
        # predict_mode: bool = False,
    ) -> None:

        super(SarcasmClassifier, self).__init__(vocab, regularizer)

        self.text_field_embedder = text_field_embedder
        self.elmo_text_field_embedder = elmo_text_field_embedder
        self.num_classes = self.vocab.get_vocab_size("labels")
        self.num_classes_emotions = self.vocab.get_vocab_size("emotion_labels")
        self.quote_response_encoder = quote_response_encoder
        self.quote_response_encoder_aux = quote_response_encoder_aux
        self.classifier_feedforward = classifier_feedforward
        self.classifier_feedforward_2 = classifier_feedforward_2
        self.attention_seq2seq = Attention(
            quote_response_encoder.get_output_dim())

        self.label_acc_metrics = {"accuracy": CategoricalAccuracy()}
        self.label_f1_metrics = {}
        self.label_f1_metrics_emotions = {}
        # for i in range(self.num_classes):
        #     self.label_f1_metrics[vocab.get_token_from_index(index=i, namespace="label")] =\
        #         F1Measure(positive_label=i)

        for i in range(self.num_classes):
            self.label_f1_metrics[vocab.get_token_from_index(index=i, namespace="labels")] =\
                F1Measure(positive_label=i)
        for i in range(self.num_classes_emotions):
            self.label_f1_metrics_emotions[vocab.get_token_from_index(index=i, namespace="emotion_labels")] =\
                F1Measure(positive_label=i)

        self.loss = torch.nn.CrossEntropyLoss()

        # self.attention_seq2seq = Attention(quote_response_encoder.get_output_dim())

        self.report_auxiliary_metrics = report_auxiliary_metrics
        # self.predict_mode = predict_mode

        initializer(self)
Example #2
    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 node_embedder: TokenEmbedder,
                 verbose_metrics: bool,
                 classifier_feedforward: FeedForward,
                 use_node_vector: bool = True,
                 use_abstract: bool = True,
                 dropout: float = 0.2,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(AclClassifier, self).__init__(vocab, regularizer)

        self.node_embedder = node_embedder
        self.text_field_embedder = text_field_embedder
        self.use_node_vector = use_node_vector
        self.use_abstract = use_abstract
        self.dropout = torch.nn.Dropout(dropout)
        self.num_classes = self.vocab.get_vocab_size("labels")

        self.classifier_feedforward = classifier_feedforward

        self.label_accuracy = CategoricalAccuracy()
        self.label_f1_metrics = {}

        self.verbose_metrics = verbose_metrics

        for i in range(self.num_classes):
            label_name = vocab.get_token_from_index(index=i, namespace="labels")
            self.label_f1_metrics[label_name] = F1Measure(positive_label=i)

        self.loss = torch.nn.CrossEntropyLoss()

        initializer(self)
Example #3
    def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        verbose_metrics: bool,
        dropout: float = 0.2,
        initializer: InitializerApplicator = InitializerApplicator(),
        regularizer: Optional[RegularizerApplicator] = None,
    ) -> None:
        super(TextClassifier, self).__init__(vocab, regularizer)

        self.text_field_embedder = text_field_embedder
        self.dropout = torch.nn.Dropout(dropout)
        self.num_classes = self.vocab.get_vocab_size("labels")
        self.classifier_feedforward = torch.nn.Linear(
            self.text_field_embedder.get_output_dim(), self.num_classes)

        self.label_accuracy = CategoricalAccuracy()
        self.label_f1_metrics = {}

        self.verbose_metrics = verbose_metrics

        for i in range(self.num_classes):
            self.label_f1_metrics[vocab.get_token_from_index(
                index=i, namespace="labels")] = F1Measure(positive_label=i)
        self.loss = torch.nn.CrossEntropyLoss()

        initializer(self)
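
The per-label F1 bookkeeping above keys each F1Measure by the label string recovered from the vocabulary. A minimal, self-contained sketch of that pattern, using an invented label set (only the AllenNLP Vocabulary and F1Measure APIs are assumed):

from allennlp.data import Vocabulary
from allennlp.training.metrics import F1Measure

# Hypothetical label inventory, purely for illustration.
vocab = Vocabulary()
for label in ["background", "method", "result"]:
    vocab.add_token_to_namespace(label, namespace="labels")

label_f1_metrics = {}
for i in range(vocab.get_vocab_size("labels")):
    label_name = vocab.get_token_from_index(index=i, namespace="labels")
    label_f1_metrics[label_name] = F1Measure(positive_label=i)

print(sorted(label_f1_metrics))  # ['background', 'method', 'result']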
Example #4
    def __init__(self,
                 vocab: Vocabulary,
                 quote_response_encoder: Seq2VecEncoder,
                 bert_model_name: str = None,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:

        super(SarcasmClassifier, self).__init__(vocab, regularizer)

        self.quote_response_encoder = quote_response_encoder
        self.text_field_embedder = BertModel.from_pretrained(bert_model_name)

        self.num_classes_emotions = self.vocab.get_vocab_size("labels")
        self.linear = nn.Linear(200, self.num_classes_emotions)

        self.label_acc_metrics = {"accuracy": CategoricalAccuracy()}
        self.label_f1_metrics_emotions = {}
        # for i in range(self.num_classes):
        #     self.label_f1_metrics[vocab.get_token_from_index(index=i, namespace="label")] =\
        #         F1Measure(positive_label=i)

        for i in range(self.num_classes_emotions):
            self.label_f1_metrics_emotions[vocab.get_token_from_index(index=i, namespace="labels")] =\
                F1Measure(positive_label=i)

        self.loss = torch.nn.CrossEntropyLoss()

        # self.attention_seq2seq = Attention(quote_response_encoder.get_output_dim())

        # self.predict_mode = predict_mode

        initializer(self)
Example #5
def compute_background_log_frequency(vocab: Vocabulary,
                                     vocab_namespace: str,
                                     precomputed_bg_file=None):
    """
    Load in the word counts from the JSON file and compute the
    background log term frequency w.r.t this vocabulary.
    """
    # precomputed_word_counts = json.load(open(precomputed_word_counts, "r"))
    log_term_frequency = torch.FloatTensor(
        vocab.get_vocab_size(vocab_namespace))
    if precomputed_bg_file is not None:
        with open(precomputed_bg_file, "r") as file_:
            precomputed_bg = json.load(file_)
    else:
        precomputed_bg = vocab._retained_counter.get(vocab_namespace)  # pylint: disable=protected-access
        if precomputed_bg is None:
            return log_term_frequency
    for i in range(vocab.get_vocab_size(vocab_namespace)):
        token = vocab.get_token_from_index(i, vocab_namespace)
        if token in ("@@UNKNOWN@@", "@@PADDING@@", '@@START@@',
                     '@@END@@') or token not in precomputed_bg:
            log_term_frequency[i] = 1e-12
        elif token in precomputed_bg:
            if precomputed_bg[token] == 0:
                log_term_frequency[i] = 1e-12
            else:
                log_term_frequency[i] = precomputed_bg[token]
    log_term_frequency = torch.log(log_term_frequency)
    return log_term_frequency
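
A usage sketch for the helper above, assuming compute_background_log_frequency (and its json/torch/Vocabulary imports) is in scope; the tokens, counts, and temporary file are invented for illustration:

import json
import tempfile

from allennlp.data import Vocabulary

vocab = Vocabulary()
for token in ["the", "cat", "sat"]:
    vocab.add_token_to_namespace(token, namespace="tokens")

# Hypothetical background counts written to a temporary JSON file.
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump({"the": 100, "cat": 10, "sat": 1}, f)
    bg_file = f.name

log_bg = compute_background_log_frequency(vocab, "tokens", precomputed_bg_file=bg_file)
# one entry per vocab index; padding/unknown entries get log(1e-12)
print(log_bg.shape)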
Example #6
    def embed(self, vocab: Vocabulary, tokens: torch.Tensor) -> torch.Tensor:
        """
        Idea: reconstruct string tokens from token ids -> feed to spacy -> return tensors
        :param vocab:
        :param tokens:
        :return:
        """
        with SwitchDefaultTensor():
            embedded_sentences = []
            tokens_cpu = tokens.cpu()
            batch_size, seq_len = tokens.shape
            for sentence in tokens_cpu:
                str_tokens: List[str] = [
                    vocab.get_token_from_index(int(token))
                    for token in sentence if token != 0
                ]  # skip padding
                doc = Doc(self.nlp.vocab, words=str_tokens)
                self.nlp.pipeline[1][1](doc)  # word pieces
                self.nlp.pipeline[2][1](doc)  # run transformer on wordpieces
                # add padding back in
                # embedded = torch.from_numpy(cupy.asnumpy(doc.tensor)).to(device)  # shape (str_tokens, output dim)
                embedded = from_dlpack(
                    doc.tensor.toDlpack())  # shape (str_tokens, output dim)
                assert embedded.shape == (len(str_tokens),
                                          self.get_output_dim())
                if seq_len - len(str_tokens) > 0:
                    padded = torch.zeros(seq_len - len(str_tokens),
                                         self.get_output_dim())
                    embedded = torch.cat([embedded, padded], dim=0)
                embedded_sentences.append(embedded)
            return torch.stack(embedded_sentences, dim=0)
Example #7
    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 text_encoder: Seq2SeqEncoder,
                 classifier_feedforward: FeedForward,
                 verbose_metrics: bool,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None,
                 ) -> None:
        super(TextClassifier, self).__init__(vocab, regularizer)

        self.text_field_embedder = text_field_embedder
        self.num_classes = self.vocab.get_vocab_size("labels")
        self.text_encoder = text_encoder
        self.classifier_feedforward = classifier_feedforward

        self.label_accuracy = CategoricalAccuracy()
        self.label_f1_metrics = {}

        self.verbose_metrics = verbose_metrics

        for i in range(self.num_classes):
            self.label_f1_metrics[vocab.get_token_from_index(index=i, namespace="labels")] = F1Measure(positive_label=i)
        self.loss = torch.nn.CrossEntropyLoss()

        self.pool = lambda text, mask: util.get_final_encoder_states(text, mask, bidirectional=True)

        initializer(self)
Example #8
def compute_background_log_frequency(vocab: Vocabulary,
                                     vocab_namespace: str,
                                     precomputed_bg_file=None):
    """
    Load in the word counts from the JSON file and compute the
    background log term frequency w.r.t this vocabulary.
    """
    # precomputed_word_counts = json.load(open(precomputed_word_counts, "r"))
    # bp()
    # sample a probability tensor from a symmetric dirichlet
    log_term_frequency = torch.distributions.dirichlet.Dirichlet(
        torch.ones(vocab.get_vocab_size(vocab_namespace))).sample()
    if precomputed_bg_file is not None:
        with open(precomputed_bg_file, "r") as file_:
            precomputed_bg = json.load(file_)
    else:
        precomputed_bg = vocab._retained_counter.get(vocab_namespace)  # pylint: disable=protected-access
        if precomputed_bg is None:
            return log_term_frequency
    # bp()
    for i in range(vocab.get_vocab_size(vocab_namespace)):
        token = vocab.get_token_from_index(i, vocab_namespace)

        if token in precomputed_bg:
            log_term_frequency[i] = precomputed_bg[token]
        elif token in ("@@UNKNOWN@@", "@@PADDING@@", '@@START@@',
                       '@@END@@') or token not in precomputed_bg:
            log_term_frequency[i] = 1e-12
    # bp()
    assert log_term_frequency.sum().allclose(torch.ones(1))
    log_term_frequency = torch.log(log_term_frequency)

    # return torch.zeros(vocab.get_vocab_size(vocab_namespace))
    return log_term_frequency
Example #9
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 classifier_feedforward: FeedForward,
                 elmo: Elmo = None,
                 use_input_elmo: bool = False):
        super().__init__(vocab)
        self.elmo = elmo
        self.use_elmo = use_input_elmo
        self.text_field_embedder = text_field_embedder
        self.num_classes = self.vocab.get_vocab_size("labels")
        self.encoder = encoder
        self.classifier_feed_forward = classifier_feedforward
        self.label_accuracy = CategoricalAccuracy()

        self.label_f1_metrics = {}

        # create F1 Measures for each class
        for i in range(self.num_classes):
            self.label_f1_metrics[vocab.get_token_from_index(index=i, namespace="labels")] = \
                F1Measure(positive_label=i)

        self.loss = torch.nn.CrossEntropyLoss()

        self.attention = Attention(encoder.get_output_dim())
Example #10
    def embed(self, vocab: Vocabulary, tokens: torch.Tensor) -> torch.Tensor:
        """
        Idea: reconstruct string tokens from token ids -> feed to spacy -> return tensors
        :param vocab:
        :param tokens:
        :return:
        """
        with SwitchDefaultTensor():
            with torch.autograd.no_grad():
                embedded_sentences = []
                tokens_cpu = tokens.cpu()
                batch_size, seq_len = tokens.shape
                sents = []
                for sentence in tokens_cpu:
                    str_tokens: List[str] = [
                        vocab.get_token_from_index(int(token))
                        for token in sentence if token != 0
                    ]  #skip padding
                    sents.append(str_tokens)
                doc = make_doc(self.nlp.vocab, sents)
                self.nlp.pipeline[1][1](doc)  #word pieces
                self.nlp.pipeline[2][1](doc)  #run transformer on wordpieces

                #Now iterate over sentences in correct order and cut out the correct tensor + pad it
                for sent, str_tokens in zip(doc.sents, sents):
                    #add padding back in
                    embedded = from_dlpack(sent.tensor.toDlpack()
                                           )  # shape (str_tokens, output dim)
                    if seq_len - len(str_tokens) > 0:
                        padded = torch.zeros(seq_len - len(str_tokens),
                                             self.get_output_dim())
                        embedded = torch.cat([embedded, padded], dim=0)
                    embedded_sentences.append(embedded)
                return torch.stack(embedded_sentences, dim=0)
Example #11
    def __init__(self,
                 vocab: Vocabulary,
                 input_embedder: TextFieldEmbedder,
                 encoder: Encoder = None,
                 dropout: float = None,
                 initializer: InitializerApplicator = InitializerApplicator()
                ) -> None:
        """
        Parameters
        ----------
        vocab: `Vocabulary`
            vocab to use
        input_embedder: `TextFieldEmbedder`
            generic embedder of tokens
        encoder: `Encoder`, optional (default = None)
            Seq2Vec or Seq2Seq Encoder wrapper. If no encoder is provided,
            assume that the input is a bag of word counts, for linear classification.
        dropout: `float`, optional (default = None)
            if set, will apply dropout to output of encoder.
        initializer: `InitializerApplicator`
            generic initializer
        """
        super().__init__(vocab)
        self._input_embedder = input_embedder
        if dropout:
            self._dropout = torch.nn.Dropout(dropout)
        else:
            self._dropout = None
        self._encoder = encoder
        self._num_labels = vocab.get_vocab_size(namespace="labels")
        if self._encoder:
            self._clf_input_dim = self._encoder.get_output_dim()
        else:
            self._clf_input_dim = self._input_embedder.get_output_dim()
        self._classification_layer = torch.nn.Linear(self._clf_input_dim,
                                                     self._num_labels)

        self.attn = torch.nn.Parameter(torch.randn(5, self._num_labels))

        self._accuracy = CategoricalAccuracy()
        self.label_f1_metrics = {}
        self.label_order = []
        for i in range(self._num_labels):
            self.label_f1_metrics[vocab.get_token_from_index(index=i, namespace="labels")] = F1Measure(positive_label=i)
            self.label_order.append(vocab.get_token_from_index(index=i, namespace="labels"))
        self._loss = torch.nn.CrossEntropyLoss()
        initializer(self)
Example #12
    def _get_vocab_index_mapping(self, archived_vocab: Vocabulary) -> List[Tuple[int, int]]:
        vocab_index_mapping: List[Tuple[int, int]] = []
        for index in range(self.vocab.get_vocab_size(namespace='tokens')):
            token = self.vocab.get_token_from_index(index=index, namespace='tokens')
            archived_token_index = archived_vocab.get_token_index(token, namespace='tokens')
            # Checking if we got the UNK token index, because we don't want all new token
            # representations initialized to UNK token's representation. We do that by checking if
            # the two tokens are the same. They will not be if the token at the archived index is
            # UNK.
            if archived_vocab.get_token_from_index(archived_token_index, namespace="tokens") == token:
                vocab_index_mapping.append((index, archived_token_index))
        return vocab_index_mapping
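
A typical use for such a mapping is to copy embedding rows from an archived model into a freshly initialized embedding matrix so that shared tokens keep their trained vectors. A minimal sketch with plain torch tensors standing in for the two weight matrices (shapes and mapping are invented):

import torch

new_weight = torch.randn(6, 4)        # embedding table for the current vocab
archived_weight = torch.randn(10, 4)  # embedding table from the archived model

# e.g. the (new_index, archived_index) pairs returned by _get_vocab_index_mapping
vocab_index_mapping = [(0, 0), (2, 7), (5, 3)]

with torch.no_grad():
    for new_index, archived_index in vocab_index_mapping:
        new_weight[new_index] = archived_weight[archived_index]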
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 user_utterance_encoder: Seq2VecEncoder,
                 prev_user_utterance_encoder: Seq2VecEncoder,
                 prev_sys_utterance_encoder: Seq2VecEncoder,
                 classifier_feedforward: FeedForward,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(IntentClassifier, self).__init__(vocab, regularizer)

        self.text_field_embedder = text_field_embedder
        self.num_classes = self.vocab.get_vocab_size("labels")
        self.user_utterance_encoder = user_utterance_encoder
        self.prev_user_utterance_encoder = prev_user_utterance_encoder
        self.prev_sys_utterance_encoder = prev_sys_utterance_encoder
        self.classifier_feedforward = classifier_feedforward

        if text_field_embedder.get_output_dim(
        ) != user_utterance_encoder.get_input_dim():
            raise ConfigurationError(
                "The output dimension of the text_field_embedder must match the "
                "input dimension of the user_utterance_encoder. Found {} and {}, "
                "respectively.".format(text_field_embedder.get_output_dim(),
                                       user_utterance_encoder.get_input_dim()))
        if text_field_embedder.get_output_dim(
        ) != prev_user_utterance_encoder.get_input_dim():
            raise ConfigurationError(
                "The output dimension of the text_field_embedder must match the "
                "input dimension of the prev_user_utterance_encoder. Found {} and {}, "
                "respectively.".format(
                    text_field_embedder.get_output_dim(),
                    prev_user_utterance_encoder.get_input_dim()))
        if text_field_embedder.get_output_dim(
        ) != prev_sys_utterance_encoder.get_input_dim():
            raise ConfigurationError(
                "The output dimension of the text_field_embedder must match the "
                "input dimension of the prev_sys_utterance_encoder. Found {} and {}, "
                "respectively.".format(
                    text_field_embedder.get_output_dim(),
                    prev_sys_utterance_encoder.get_input_dim()))

        self.label_accuracy = CategoricalAccuracy()

        self.label_f1_metrics = {}
        for i in range(self.num_classes):
            self.label_f1_metrics[vocab.get_token_from_index(
                index=i, namespace="labels")] = F1Measure(positive_label=i)

        self.loss = torch.nn.CrossEntropyLoss()

        initializer(self)
Example #15
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 label_encoder: Seq2VecEncoder,
                 calculate_span_f1: bool = None,
                 tag_encoding: Optional[str] = None,
                 tag_namespace: str = "tags",
                 verbose_metrics: bool = False,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(ParamTaggerPipeline, self).__init__(vocab, regularizer)

        self.label_encoder = label_encoder
        self.tag_namespace = tag_namespace
        self.text_field_embedder = text_field_embedder
        self.num_classes = self.vocab.get_vocab_size(tag_namespace)
        self.encoder = encoder
        self._verbose_metrics = verbose_metrics
        self.tag_projection_layer = TimeDistributed(
            Linear(self.encoder.get_output_dim(), self.num_classes))

        check_dimensions_match(text_field_embedder.get_output_dim(),
                               encoder.get_input_dim(),
                               "text field embedding dim", "encoder input dim")

        # We keep calculate_span_f1 as a constructor argument for API consistency with
        # the CrfTagger, even though it is redundant in this class
        # (tag_encoding serves the same purpose).
        if calculate_span_f1 and not tag_encoding:
            raise ConfigurationError("calculate_span_f1 is True, but "
                                     "no tag_encoding was specified.")

        self.accuracy = CategoricalAccuracy()

        if calculate_span_f1 or tag_encoding:
            self._f1_metric = SpanBasedF1Measure(vocab,
                                                 tag_namespace=tag_namespace,
                                                 tag_encoding=tag_encoding)
        else:
            self._f1_metric = None

        self.f1 = SpanBasedF1Measure(vocab, tag_namespace=tag_namespace)

        self.tag_f1_metrics = {}
        for i in range(self.num_classes):
            self.tag_f1_metrics[vocab.get_token_from_index(
                index=i,
                namespace=tag_namespace)] = F1Measure(positive_label=i)

        initializer(self)
Example #16
    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 text_encoder: Seq2SeqEncoder,
                 classifier_feedforward: FeedForward,
                 verbose_metrics: bool,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None,
                 loss: Optional[dict] = None,
                 ) -> None:
        super(TextClassifier, self).__init__(vocab, regularizer)

        self.text_field_embedder = text_field_embedder
        self.num_classes = self.vocab.get_vocab_size("labels")
        self.text_encoder = text_encoder
        self.classifier_feedforward = classifier_feedforward
        self.prediction_layer = torch.nn.Linear(self.classifier_feedforward.get_output_dim(), self.num_classes)
        self.pool = lambda text, mask: util.get_final_encoder_states(text, mask, bidirectional=True)

        self.label_accuracy = CategoricalAccuracy()
        self.label_f1_metrics = {}
        for i in range(self.num_classes):
            self.label_f1_metrics[vocab.get_token_from_index(index=i, namespace="labels")] = F1Measure(positive_label=i)
        self.verbose_metrics = verbose_metrics

        if loss is None:
            self.loss = torch.nn.CrossEntropyLoss()
        else:
            alpha = loss.get('alpha')
            gamma = loss.get('gamma')
            weight = loss.get('weight')
            if alpha is not None:
                alpha = float(alpha)
            if gamma is not None:
                gamma = float(gamma)
            if weight is not None:
                weight = torch.tensor([1.0, float(weight)])
            if loss.get('type') == 'CrossEntropyLoss':
                self.loss = torch.nn.CrossEntropyLoss(weight=weight)
            elif loss.get('type') == 'BinaryFocalLoss':
                self.loss = BinaryFocalLoss(alpha=alpha, gamma=gamma)
            elif loss.get('type') == 'FocalLoss':
                self.loss = FocalLoss(alpha=alpha, gamma=gamma)
            elif loss.get('type') == 'MultiLabelMarginLoss':
                self.loss = torch.nn.MultiLabelMarginLoss()
            elif loss.get('type') == 'MultiLabelSoftMarginLoss':
                self.loss = torch.nn.MultiLabelSoftMarginLoss(weight)
            else:
                raise ValueError(f'Unexpected loss "{loss}"')

        initializer(self)
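
The loss argument above is a plain config dict; for example, a class-weighted cross-entropy would come out of that branch roughly as follows (the weight value is illustrative):

import torch

# e.g. a config-style dict as consumed by the constructor above
loss_config = {"type": "CrossEntropyLoss", "weight": 2.0}

# mirrors the branch above: weight 1.0 for the negative class, `weight` for the positive one
class_weights = torch.tensor([1.0, float(loss_config["weight"])])
criterion = torch.nn.CrossEntropyLoss(weight=class_weights)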
Example #17
def label_for_index(vocab: Vocabulary, idx: int) -> str:
    """Gets label string for a label `int` id

    Parameters
    ----------
    vocab: `allennlp.data.Vocabulary`
    idx: `int
        the token index

    Returns
    -------
    label: `str`
        The string for a label id
    """
    return vocab.get_token_from_index(idx, namespace=LABELS_NAMESPACE)
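
A round-trip sketch of the id-to-string mapping this helper wraps; LABELS_NAMESPACE is whatever module-level constant the surrounding code defines, assumed here to be "labels":

from allennlp.data import Vocabulary

LABELS_NAMESPACE = "labels"  # assumption: stands in for the module-level constant

vocab = Vocabulary()
vocab.add_token_to_namespace("positive", namespace=LABELS_NAMESPACE)
vocab.add_token_to_namespace("negative", namespace=LABELS_NAMESPACE)

idx = vocab.get_token_index("negative", namespace=LABELS_NAMESPACE)
assert vocab.get_token_from_index(idx, namespace=LABELS_NAMESPACE) == "negative"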
Example #18
    def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        seq2vec_encoder: Seq2VecEncoder,
        feedforward_layer: FeedForward,
        seq2seq_encoder: Seq2SeqEncoder = None,
        dropout: float = None,
        num_labels: int = None,
        label_namespace: str = "labels",
        initializer: InitializerApplicator = InitializerApplicator(),
        regularizer: Optional[RegularizerApplicator] = None,
    ) -> None:

        super().__init__(vocab, regularizer)
        self._text_field_embedder = text_field_embedder

        if seq2seq_encoder:
            self._seq2seq_encoder = seq2seq_encoder
        else:
            self._seq2seq_encoder = None

        self._seq2vec_encoder = seq2vec_encoder
        self._classifier_input_dim = self._seq2vec_encoder.get_output_dim()

        if dropout:
            self._dropout = torch.nn.Dropout(dropout)
        else:
            self._dropout = None

        self._label_namespace = label_namespace

        if num_labels:
            self._num_labels = num_labels
        else:
            self._num_labels = vocab.get_vocab_size(
                namespace=self._label_namespace)
        self._feedforward_layer = feedforward_layer
        self._classification_layer = torch.nn.Linear(
            self._classifier_input_dim, self._num_labels)
        self._accuracy = CategoricalAccuracy()
        self._label_f1_metrics: Dict[str, F1Measure] = {}
        for i in range(self._num_labels):
            self._label_f1_metrics[vocab.get_token_from_index(
                index=i, namespace="labels")] = F1Measure(positive_label=i)
        self._loss = torch.nn.CrossEntropyLoss()
        initializer(self)
Example #19
    def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        verbose_metrics: bool,
        dropout: float = 0.2,
        initializer: InitializerApplicator = InitializerApplicator(),
        regularizer: Optional[RegularizerApplicator] = None,
        loss: Optional[dict] = None,
    ) -> None:
        super(TextClassifier, self).__init__(vocab, regularizer)

        self.text_field_embedder = text_field_embedder
        self.dropout = torch.nn.Dropout(dropout)
        self.num_classes = self.vocab.get_vocab_size("labels")
        self.classifier_feedforward = torch.nn.Linear(
            self.text_field_embedder.get_output_dim(), self.num_classes)

        self.label_accuracy = CategoricalAccuracy()
        self.label_f1_metrics = {}

        self.verbose_metrics = verbose_metrics

        for i in range(self.num_classes):
            self.label_f1_metrics[vocab.get_token_from_index(
                index=i, namespace="labels")] = F1Measure(positive_label=i)

        if loss is None or loss.get('type') == 'CrossEntropyLoss':
            self.loss = torch.nn.CrossEntropyLoss()
        elif loss.get('type') == 'BinaryFocalLoss':
            self.loss = BinaryFocalLoss(alpha=loss.get('alpha'),
                                        gamma=loss.get('gamma'))
        elif loss.get('type') == 'FocalLoss':
            self.loss = FocalLoss(alpha=loss.get('alpha'),
                                  gamma=loss.get('gamma'))
        elif loss.get('type') == 'MultiLabelMarginLoss':
            self.loss = torch.nn.MultiLabelMarginLoss()
        elif loss.get('type') == 'MultiLabelSoftMarginLoss':
            self.loss = torch.nn.MultiLabelSoftMarginLoss(
                weight=torch.tensor(loss.get('weight')) if 'weight' in
                loss else None)
        else:
            raise ValueError(f'Unexpected loss "{loss}"')

        initializer(self)
Example #20
def prediction_to_rows(*, fold: str, guesser_name: str, vocab: Vocabulary,
                       question: Instance, prediction) -> List[Dict[str, Any]]:
    top_scores = prediction["top_k_scores"]
    top_indices = prediction["top_k_indices"]
    meta = question["metadata"]
    rows = []
    for score, guess_idx in zip(top_scores, top_indices):
        guess = vocab.get_token_from_index(guess_idx, namespace="page_labels")
        rows.append({
            "qanta_id": meta["qanta_id"],
            "proto_id": meta["proto_id"],
            "char_index": meta["char_idx"],
            "guess": guess,
            "score": score,
            "fold": fold,
            "guesser": guesser_name,
        })
    return rows
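
The prediction argument is expected to carry parallel top_k_scores / top_k_indices lists; such a dict could be produced from a guesser's logits roughly like this (the vocabulary size and k are illustrative):

import torch

logits = torch.randn(1000)  # hypothetical scores over the "page_labels" namespace
scores, indices = torch.topk(logits, k=10)
prediction = {
    "top_k_scores": scores.tolist(),
    "top_k_indices": indices.tolist(),
}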
Example #21
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 node_embedder: TokenEmbedder,
                 null_text_embedder: TokenEmbedder,
                 verbose_metrics: bool,
                 classifier_feedforward: FeedForward,
                 use_node_vector: bool = True,
                 use_text: bool = True,
                 dropout: float = 0.2,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(AclEdgeClassifier, self).__init__(vocab, regularizer)

        self.node_embedder = node_embedder
        self.text_field_embedder = text_field_embedder
        # Instead of setting this, omit embedding path in config
        # to get randomly initialized embeddings.
        #self.use_node_vector = use_node_vector
        self.use_text = use_text
        self.null_text_embedder = null_text_embedder
        self.dropout = torch.nn.Dropout(dropout)
        self.num_classes = self.vocab.get_vocab_size("labels")
        self.sep_index = self.vocab.get_token_index("[SEP]")

        self.classifier_feedforward = classifier_feedforward

        self.label_accuracy = CategoricalAccuracy()
        self.label_f1_metrics = {}

        self.verbose_metrics = verbose_metrics

        for i in range(self.num_classes):
            self.label_f1_metrics[vocab.get_token_from_index(
                index=i, namespace="labels")] = F1Measure(positive_label=i)

        self.confusion_matrix = ConfusionMatrix(self.num_classes)

        self.loss = torch.nn.CrossEntropyLoss()

        initializer(self)
Example #22
def convert_indices_to_string(hyps: List[List[Hypothesis]],
                              metadata: List[Dict[str, Any]],
                              vocab: Vocabulary,
                              end_token: str = "[SEP]",
                              return_all: bool = False,
                              index_name: str = "tokens"):
    """Convert the token ids in hyps to result rewrite string."""
    vocab_size = vocab.get_vocab_size(namespace=index_name)

    rewrite_tokens = []
    rewrite_strings = []
    origin_rewrite_strings = []
    origin_query_strings = []
    other_rewrite_strings = []
    # for each instance
    for hyp, mdata in zip(hyps, metadata):
        oovs = mdata['oovs']
        if 'rewrite' in mdata:
            origin_query_words = mdata['query_words']
            origin_rw_words = mdata['rewrite']
        other_rw_string = []
        for i, h in enumerate(hyp):
            word_ids = h.tokens
            words = []
            for wid in word_ids:
                try:
                    w = vocab.get_token_from_index(wid, namespace=index_name)
                except Exception:
                    assert oovs is not None, "Error: No oov words in the dialogue!"
                    dialogue_oov_idx = wid - vocab_size
                    try:
                        w = oovs[dialogue_oov_idx]
                    except Exception:
                        raise ValueError(
                            "Error: model produce word ID %i corresponds to dialogue OOV %i "
                            "but this example only has %i OOV words." %
                            (wid, dialogue_oov_idx, len(oovs)))
                words.append(w)
            if i == 0:
                if 'rewrite' in mdata:
                    origin_query_strings.append(origin_query_words)
                    origin_rewrite_strings.append(origin_rw_words)

                # find the end position
                try:
                    stop_idx = words.index(end_token)
                    words = words[:stop_idx]
                except ValueError:
                    pass
                rewrite_tokens.append(words)
                rewrite_strings.append("".join(words))
                if not return_all:
                    break
            else:
                other_rw_string.append("".join(words))
        other_rewrite_strings.append(other_rw_string)

    # return result string, rewrite_token, gold_target and origin_query
    output_dict = {}
    output_dict['rewrite_string'] = rewrite_strings
    output_dict['rewrite_token'] = rewrite_tokens
    output_dict['gold_target'] = origin_rewrite_strings
    output_dict['origin_query'] = origin_query_strings
    # if return_all return other rewrite results (not only the best)
    if return_all:
        output_dict['other_rewrites'] = other_rewrite_strings
    return output_dict
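
The wid-to-word logic above follows the usual copy-mechanism convention: ids below the vocabulary size are ordinary tokens, ids at or above it index into the per-dialogue OOV list. A stripped-down, standalone sketch of just that mapping (toy vocabulary and OOV list):

def id_to_word(wid, vocab_words, oovs):
    """vocab_words: in-vocabulary tokens; oovs: per-example out-of-vocabulary tokens."""
    if wid < len(vocab_words):
        return vocab_words[wid]
    return oovs[wid - len(vocab_words)]

vocab_words = ["[PAD]", "[UNK]", "hello", "world"]
oovs = ["Zurich"]
print([id_to_word(w, vocab_words, oovs) for w in (2, 3, 4)])  # ['hello', 'world', 'Zurich']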
Example #23
File: mlm.py  Project: lgessler/embur
def prepare_instance(s):
    tokens = [Token(t) for t in s.split(" ")]
    indexed = idx.tokens_to_indices(tokens, vocab)
    print([vocab.get_token_from_index(i) for i in indexed['token_ids']])
    return Instance({"tokens": TextField(tokens, {"tokens": idx})})

instances = [prepare_instance("ϩⲙⲡⲣⲁⲛ ⲙⲡⲛⲟⲩⲧⲉ ⲛϣⲟⲣⲡ ⲁⲛⲟⲕ"), prepare_instance("ϩⲙⲡⲣⲁⲛ ⲙⲡⲛⲟⲩⲧⲉ ⲛϣⲟⲣⲡ ⲁⲛⲟⲕ")]
for i in instances:
    i["tokens"].index(vocab)

tensors = [i.as_tensor_dict() for i in instances]

collator = DataCollatorForWholeWordMask(tokenizer=tokenizer)
ids = torch.cat([tensors[0]['tokens']['tokens']['token_ids'].unsqueeze(0),
                 tensors[1]['tokens']['tokens']['token_ids'].unsqueeze(0)], dim=0)
ids.shape
wwm = collator._whole_word_mask([[vocab.get_token_from_index(i.item()) for i in wp_ids] for wp_ids in ids])

wwms = []
for i in range(ids.shape[0]):
    tokens = [vocab.get_token_from_index(i.item()) for i in ids[i]]
    wwm = torch.tensor(collator._whole_word_mask(tokens)).unsqueeze(0)
    wwms.append(wwm)
wwms = torch.cat(wwms, dim=0)

wwm = torch.tensor(wwm).unsqueeze(0)
wwm
masked_ids, labels = collator.mask_tokens(ids, wwm)
masked_ids
labels
print([vocab.get_token_from_index(i.item()) for i in out[0][0]])
Example #24
    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 user_utterance_encoder: Seq2VecEncoder,
                 prev_user_utterance_encoder: Seq2VecEncoder,
                 prev_sys_utterance_encoder: Seq2VecEncoder,
                 classifier_feedforward: FeedForward,
                 encoder: Seq2SeqEncoder,
                 calculate_span_f1: bool = None,
                 tag_encoding: Optional[str] = None,
                 tag_namespace: str = "tags",
                 verbose_metrics: bool = False,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(IntentParamClassifier, self).__init__(vocab, regularizer)

        # Intent task
        self.text_field_embedder = text_field_embedder
        self.label_num_classes = self.vocab.get_vocab_size("labels")
        self.user_utterance_encoder = user_utterance_encoder
        self.prev_user_utterance_encoder = prev_user_utterance_encoder
        self.prev_sys_utterance_encoder = prev_sys_utterance_encoder
        self.classifier_feedforward = classifier_feedforward

        if text_field_embedder.get_output_dim() != user_utterance_encoder.get_input_dim():
            raise ConfigurationError("The output dimension of the text_field_embedder must match the "
                                     "input dimension of the user_utterance_encoder. Found {} and {}, "
                                     "respectively.".format(text_field_embedder.get_output_dim(),
                                                            user_utterance_encoder.get_input_dim()))
        if text_field_embedder.get_output_dim() != prev_user_utterance_encoder.get_input_dim():
            raise ConfigurationError("The output dimension of the text_field_embedder must match the "
                                     "input dimension of the prev_user_utterance_encoder. Found {} and {}, "
                                     "respectively.".format(text_field_embedder.get_output_dim(),
                                                            prev_user_utterance_encoder.get_input_dim()))
        if text_field_embedder.get_output_dim() != prev_sys_utterance_encoder.get_input_dim():
            raise ConfigurationError("The output dimension of the text_field_embedder must match the "
                                     "input dimension of the prev_sys_utterance_encoder. Found {} and {}, "
                                     "respectively.".format(text_field_embedder.get_output_dim(),
                                                            prev_sys_utterance_encoder.get_input_dim()))

        self.label_accuracy = CategoricalAccuracy()

        self.label_f1_metrics = {}
        for i in range(self.label_num_classes):
            self.label_f1_metrics[vocab.get_token_from_index(index=i, namespace="labels")] = F1Measure(positive_label=i)

        self.loss = torch.nn.CrossEntropyLoss()


        # Param task
        self.tag_namespace = tag_namespace
        self.tag_num_classes = self.vocab.get_vocab_size(tag_namespace)
        self.encoder = encoder
        self._verbose_metrics = verbose_metrics
        self.tag_projection_layer = TimeDistributed(Linear(self.encoder.get_output_dim(),
                                                           self.tag_num_classes))

        check_dimensions_match(text_field_embedder.get_output_dim(), encoder.get_input_dim(),
                               "text field embedding dim", "encoder input dim")

        # We keep calculate_span_f1 as a constructor argument for API consistency with
        # the CrfTagger, even though it is redundant in this class
        # (tag_encoding serves the same purpose).
        if calculate_span_f1 and not tag_encoding:
            raise ConfigurationError("calculate_span_f1 is True, but "
                                     "no tag_encoding was specified.")

        self.tag_accuracy = CategoricalAccuracy()

        if calculate_span_f1 or tag_encoding:
            self._f1_metric = SpanBasedF1Measure(vocab,
                                                 tag_namespace=tag_namespace,
                                                 tag_encoding=tag_encoding)
        else:
            self._f1_metric = None

        self.f1 = SpanBasedF1Measure(vocab, tag_namespace=tag_namespace)

        self.tag_f1_metrics = {}
        for k in range(self.tag_num_classes):
            self.tag_f1_metrics[vocab.get_token_from_index(index=k, namespace=tag_namespace)] = F1Measure(
                positive_label=k)

        initializer(self)
Example #25
def construct_trees(vocab: Vocabulary,
                    namespace: str,
                    predictions: torch.FloatTensor,
                    all_spans: torch.LongTensor,
                    num_spans: torch.LongTensor,
                    sentences: List[List[str]],
                    pos_tags: List[List[str]] = None) -> List[Tree]:
    """
    Construct ``nltk.Tree``'s for each batch element by greedily nesting spans.
    The trees use exclusive end indices, which contrasts with how spans are
    represented in the rest of the model.
    Parameters
    ----------
    predictions : ``torch.FloatTensor``, required.
        A tensor of shape ``(batch_size, num_spans, span_label_vocab_size)``
        representing a distribution over the label classes per span.
    all_spans : ``torch.LongTensor``, required.
        A tensor of shape (batch_size, num_spans, 2), representing the span
        indices we scored.
    num_spans : ``torch.LongTensor``, required.
        A tensor of shape (batch_size), representing the lengths of non-padded spans
        in ``enumerated_spans``.
    sentences : ``List[List[str]]``, required.
        A list of tokens in the sentence for each element in the batch.
    pos_tags : ``List[List[str]]``, optional (default = None).
        A list of POS tags for each word in the sentence for each element
        in the batch.
    Returns
    -------
    A ``List[Tree]`` containing the decoded trees for each element in the batch.
    """
    # Switch to using exclusive end spans.
    exclusive_end_spans = all_spans.clone()
    exclusive_end_spans[:, :, -1] += 1
    no_label_id = vocab.get_token_index(BratDoc.NEG_SPAN_LABEL, namespace)

    trees: List[Tree] = []
    for batch_index, (scored_spans, spans, sentence) in enumerate(zip(predictions,
                                                                      exclusive_end_spans,
                                                                      sentences)):
        selected_spans = []
        for prediction, span in zip(scored_spans[:num_spans[batch_index]],
                                    spans[:num_spans[batch_index]]):
            start, end = span
            no_label_prob = prediction[no_label_id]
            label_prob, label_index = torch.max(prediction, -1)

            # Does the span have a label != NO-LABEL or is it the root node?
            # If so, include it in the spans that we consider.
            if int(label_index) != no_label_id or (start == 0 and end == len(sentence)):
                # TODO(Mark): Remove this once pylint sorts out named tuples.
                # https://github.com/PyCQA/pylint/issues/1418
                selected_spans.append(SpanInformation(start=int(start), # pylint: disable=no-value-for-parameter
                                                      end=int(end),
                                                      label_prob=float(label_prob),
                                                      no_label_prob=float(no_label_prob),
                                                      label_index=int(label_index)))

        # The spans we've selected might overlap, which causes problems when we try
        # to construct the tree as they won't nest properly.
        consistent_spans = SpanConstituencyParser.resolve_overlap_conflicts_greedily(selected_spans)

        spans_to_labels = {(span.start, span.end): vocab.get_token_from_index(span.label_index, namespace)
                           for span in consistent_spans}
        sentence_pos = pos_tags[batch_index] if pos_tags is not None else None
        trees.append(SpanConstituencyParser.construct_tree_from_spans(spans_to_labels, sentence, sentence_pos))

    return trees
Example #26
    def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        citation_text_encoder: Seq2SeqEncoder,
        classifier_feedforward: FeedForward,
        classifier_feedforward_2: FeedForward,
        classifier_feedforward_3: FeedForward,
        initializer: InitializerApplicator = InitializerApplicator(),
        regularizer: Optional[RegularizerApplicator] = None,
        report_auxiliary_metrics: bool = False,
        predict_mode: bool = False,
    ) -> None:
        """
        Additional Args:
            lexicon_embedder_params: parameters for the lexicon attention model
            use_sparse_lexicon_features: whether to use sparse (onehot) lexicon features
            multilabel: whether the classification is multi-label
            data_format: s2 or jurgens
            report_auxiliary_metrics: report metrics for aux tasks
            predict_mode: predict unlabeled examples
        """
        super(ScaffoldBilstmAttentionClassifier,
              self).__init__(vocab, regularizer)

        self.text_field_embedder = text_field_embedder
        self.num_classes = self.vocab.get_vocab_size("labels")
        self.num_classes_sections = self.vocab.get_vocab_size("section_labels")
        self.num_classes_cite_worthiness = self.vocab.get_vocab_size(
            "cite_worthiness_labels")
        self.citation_text_encoder = citation_text_encoder
        self.classifier_feedforward = classifier_feedforward
        self.classifier_feedforward_2 = classifier_feedforward_2
        self.classifier_feedforward_3 = classifier_feedforward_3

        self.label_accuracy = CategoricalAccuracy()
        self.label_f1_metrics = {}
        self.label_f1_metrics_sections = {}
        self.label_f1_metrics_cite_worthiness = {}
        # for i in range(self.num_classes):
        #     self.label_f1_metrics[vocab.get_token_from_index(index=i, namespace="labels")] =\
        #         F1Measure(positive_label=i)

        for i in range(self.num_classes):
            self.label_f1_metrics[vocab.get_token_from_index(index=i, namespace="labels")] =\
                F1Measure(positive_label=i)
        for i in range(self.num_classes_sections):
            self.label_f1_metrics_sections[vocab.get_token_from_index(index=i, namespace="section_labels")] =\
                F1Measure(positive_label=i)
        for i in range(self.num_classes_cite_worthiness):
            self.label_f1_metrics_cite_worthiness[vocab.get_token_from_index(index=i, namespace="cite_worthiness_labels")] =\
                F1Measure(positive_label=i)
        self.loss = torch.nn.CrossEntropyLoss()

        self.attention_seq2seq = Attention(
            citation_text_encoder.get_output_dim())

        self.report_auxiliary_metrics = report_auxiliary_metrics
        self.predict_mode = predict_mode

        initializer(self)
Example #27
def _read_pretrained_embedding_file(
        embeddings_filename: str,
        embedding_dim: int,
        vocab: Vocabulary,
        namespace: str = "tokens") -> torch.FloatTensor:
    """
    Reads a pre-trained embedding file and builds a weight matrix whose rows are
    initialized to the pre-trained embeddings.

    We use the ``Vocabulary`` to map from the word strings in the embeddings file to the indices
    that we need, and to know which words from the embeddings file we can safely ignore.

    Parameters
    ----------
    embeddings_filename : str, required.
        The path to a file containing pretrained embeddings. The embeddings
        file is assumed to be gzipped and space delimited, e.g. [word] [dim 1] [dim 2] ...
    embedding_dim : int, required.
        The dimensionality of the embeddings to read; extra trailing columns are ignored.
    vocab : Vocabulary, required.
        A Vocabulary object.
    namespace : str, (optional, default=tokens)
        The namespace of the vocabulary to find pretrained embeddings for.

    Returns
    -------
    A weight matrix with embeddings initialized from the read file.  The matrix has shape
    ``(vocab.get_vocab_size(namespace), embedding_dim)``, where the indices of words appearing in
    the pretrained embedding file are initialized to the pretrained embedding value.
    """
    words_to_keep = set(
        vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading embeddings from file")
    with gzip.open(cached_path(embeddings_filename), 'rb') as embeddings_file:
        expected_length = embedding_dim
        for line in embeddings_file:
            fields = line.decode('utf-8').strip().split(' ')
            if len(fields) - 1 != embedding_dim and len(
                    fields) - 1 != expected_length:
                # Sometimes there are funny unicode parsing problems that lead to different
                # fields lengths (e.g., a word with a unicode space character that splits
                # into more than one column).  We skip those lines.  Note that if you have
                # some kind of long header, this could result in all of your lines getting
                # skipped.  It's hard to check for that here; you just have to look in the
                # embedding_misses_file and at the model summary to make sure things look
                # like they are supposed to.
                logger.warning(
                    "Found line with wrong number of dimensions "
                    "(expected %d or %d, was %d): %s", embedding_dim,
                    expected_length,
                    len(fields) - 1, ' '.join(fields[:10]) + '[...]')
                try:
                    n1 = float(
                        fields[1])  # test that the second field is a number
                    assert len(
                        fields
                    ) - 1 > embedding_dim  # test that we could take a subset of the line
                    # if these tests pass, print a warning but use the vector and allow
                    # future vectors with the same length.
                    # NOTE TK TODO REMOVE: in future replace this by allowing user to specify
                    # both the 'actual' and 'desired' input embedding dimension.
                    logger.warning(
                        "Will change expected_length to %s and allow this and "
                        "similar vectors",
                        len(fields) - 1)
                    expected_length = len(fields) - 1
                except Exception:
                    logger.warning("Skipping...")
                    continue
            word = fields[0]
            if word in words_to_keep:
                vector = numpy.asarray(fields[1:embedding_dim + 1],
                                       dtype='float32')
                embeddings[word] = vector

    if not embeddings:
        raise ConfigurationError(
            "No embeddings of correct dimension found; you probably "
            "misspecified your embedding_dim parameter, or didn't "
            "pre-populate your Vocabulary")

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(
        embeddings_mean, embeddings_std)

    for i in range(0, vocab_size):
        word = vocab.get_token_from_index(i, namespace)

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if word in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[word])
        else:
            logger.debug(
                "Word %s was not found in the embedding file. Initialising randomly.",
                word)

    # The weight matrix is initialized, so we construct and return the actual Embedding.
    return embedding_matrix
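
The matrix returned above is just a weight tensor; to use it in a model one would typically wrap it in a torch embedding layer, e.g. (padding index 0 matches AllenNLP's default padding token, and the shape here is a placeholder):

import torch

# stands in for the result of _read_pretrained_embedding_file(...)
embedding_matrix = torch.randn(100, 300)

embedding_layer = torch.nn.Embedding.from_pretrained(
    embedding_matrix, freeze=False, padding_idx=0)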
Example #28
def get_pretrained_embedding_layer(embeddings_filename: str,
                                   vocab: Vocabulary,
                                   namespace: str = "tokens",
                                   trainable: bool = True):
    """
    Reads a pre-trained embedding file and generates an Embedding layer that has weights
    initialized to the pre-trained embeddings.  The Embedding layer can either be trainable or
    not.

    We use the ``Vocabulary`` to map from the word strings in the embeddings file to the indices
    that we need, and to know which words from the embeddings file we can safely ignore.

    Parameters
    ----------

    embeddings_filename : str, required.
        The path to a file containing pretrained embeddings. The embeddings
        file is assumed to be gzipped and space delimited, e.g. [word] [dim 1] [dim 2] ...
    vocab : Vocabulary, required.
        A Vocabulary object.
    namespace : str, (optional, default=tokens)
        The namespace of the vocabulary to find pretrained embeddings for.
    trainable : bool, (optional, default=True)
        Whether or not the embedding parameters should be optimized.

    Returns
    -------

    An Embedding Module initialised with a weight matrix of shape
    (vocab.get_vocab_size(namespace), pretrained_embedding_dim),
    where the indices of words appearing in the pretrained embedding file
    are initialized to the pretrained embedding value.

    """
    words_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}
    embedding_dim = None

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading embeddings from file")
    with gzip.open(embeddings_filename, 'rb') as embeddings_file:
        for line in embeddings_file:
            fields = line.decode('utf-8').strip().split(' ')
            if embedding_dim is None:
                embedding_dim = len(fields) - 1
                assert embedding_dim > 1, "Found embedding size of 1; do you have a header?"
            else:
                if len(fields) - 1 != embedding_dim:
                    # Sometimes there are funny unicode parsing problems that lead to different
                    # fields lengths (e.g., a word with a unicode space character that splits
                    # into more than one column).  We skip those lines.  Note that if you have
                    # some kind of long header, this could result in all of your lines getting
                    # skipped.  It's hard to check for that here; you just have to look in the
                    # embedding_misses_file and at the model summary to make sure things look
                    # like they are supposed to.
                    continue
            word = fields[0]
            if word in words_to_keep:
                vector = numpy.asarray(fields[1:], dtype='float32')
                embeddings[word] = vector

    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(0, 1)

    for i in range(0, vocab_size):
        word = vocab.get_token_from_index(i, namespace)

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if word in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[word])
        else:
            logger.debug("Word %s was not found in the embedding file. Initialising randomly.", word)

    # The weight matrix is initialized, so we construct and return the actual Embedding.
    return Embedding(num_embeddings=vocab_size,
                     embedding_dim=embedding_dim,
                     padding_index=0,
                     weight=embedding_matrix,
                     trainable=trainable)
Example #29
def _read_pretrained_word2vec_format_embedding_file(embeddings_filename: str, # pylint: disable=invalid-name
                                                    embedding_dim: int,
                                                    vocab: Vocabulary,
                                                    namespace: str = "tokens") -> torch.FloatTensor:
    """
    Read from a gzipped-word2vec format file.  The embeddings file is assumed to be gzipped and
    space delimited, e.g. [word] [dim 1] [dim 2] ...

    The remainder of the docstring is identical to ``_read_pretrained_embedding_file``.
    """
    words_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading embeddings from file")
    with gzip.open(cached_path(embeddings_filename), 'rb') as embeddings_file:
        for line in embeddings_file:
            fields = line.decode('utf-8').strip().split(' ')
            if len(fields) - 1 != embedding_dim:
                # Sometimes there are funny unicode parsing problems that lead to different
                # fields lengths (e.g., a word with a unicode space character that splits
                # into more than one column).  We skip those lines.  Note that if you have
                # some kind of long header, this could result in all of your lines getting
                # skipped.  It's hard to check for that here; you just have to look in the
                # embedding_misses_file and at the model summary to make sure things look
                # like they are supposed to.
                logger.warning("Found line with wrong number of dimensions (expected %d, was %d): %s",
                               embedding_dim, len(fields) - 1, line)
                continue
            word = fields[0]
            if word in words_to_keep:
                vector = numpy.asarray(fields[1:], dtype='float32')
                embeddings[word] = vector

    if not embeddings:
        raise ConfigurationError("No embeddings of correct dimension found; you probably "
                                 "misspecified your embedding_dim parameter, or didn't "
                                 "pre-populate your Vocabulary")

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(embeddings_mean,
                                                                            embeddings_std)

    for i in range(0, vocab_size):
        word = vocab.get_token_from_index(i, namespace)

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if word in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[word])
        else:
            logger.debug("Word %s was not found in the embedding file. Initialising randomly.", word)

    # The weight matrix is initialized, so we construct and return the actual Embedding.
    return embedding_matrix
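Unlike the earlier variant, this function returns a bare weight matrix rather than an Embedding module. A hedged sketch of one way downstream code might wrap it (this wrapper is an assumption for illustration, not code from the original project):

import torch

# Stand-in for the tensor returned by _read_pretrained_word2vec_format_embedding_file.
embedding_matrix = torch.randn(5, 3)
# from_pretrained copies the matrix into an nn.Embedding; freeze=False keeps it trainable.
embedding_layer = torch.nn.Embedding.from_pretrained(
    embedding_matrix, freeze=False, padding_idx=0)
token_ids = torch.tensor([1, 2, 4])
vectors = embedding_layer(token_ids)  # shape: (3, 3)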
Example #30
def _read_pretrained_word2vec_format_embedding_file(
        embeddings_filename: str,  # pylint: disable=invalid-name
        embedding_dim: int,
        vocab: Vocabulary,
        namespace: str = "tokens") -> torch.FloatTensor:
    """
    Read from a gzipped-word2vec format file.  The embeddings file is assumed to be gzipped and
    space delimited, e.g. [word] [dim 1] [dim 2] ...

    The remainder of the docstring is identical to ``_read_pretrained_embedding_file``.
    """
    words_to_keep = set(
        vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading embeddings from file")
    with gzip.open(cached_path(embeddings_filename), 'rb') as embeddings_file:
        for line in embeddings_file:
            fields = line.decode('utf-8').rstrip().split(' ')
            if len(fields) - 1 != embedding_dim:
                # Sometimes there are funny unicode parsing problems that lead to different
                # fields lengths (e.g., a word with a unicode space character that splits
                # into more than one column).  We skip those lines.  Note that if you have
                # some kind of long header, this could result in all of your lines getting
                # skipped.  It's hard to check for that here; you just have to look in the
                # embedding_misses_file and at the model summary to make sure things look
                # like they are supposed to.
                logger.warning(
                    "Found line with wrong number of dimensions (expected %d, was %d): %s",
                    embedding_dim,
                    len(fields) - 1, line)
                continue
            word = fields[0]
            if word in words_to_keep:
                vector = numpy.asarray(fields[1:], dtype='float32')
                embeddings[word] = vector

    if not embeddings:
        raise ConfigurationError(
            "No embeddings of correct dimension found; you probably "
            "misspecified your embedding_dim parameter, or didn't "
            "pre-populate your Vocabulary")

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(
        embeddings_mean, embeddings_std)

    for i in range(0, vocab_size):
        word = vocab.get_token_from_index(i, namespace)

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if word in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[word])
        else:
            logger.debug(
                "Word %s was not found in the embedding file. Initialising randomly.",
                word)

    # The weight matrix is initialized, so we construct and return the actual Embedding.
    return embedding_matrix
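A standalone illustration (an assumption added for explanation, not code from the original) of the dimension check that skips malformed lines, for example when the token itself contains a space and therefore splits into extra columns:

embedding_dim = 3
line = "New York 0.1 0.2 0.3"            # the token itself contains a space
fields = line.strip().split(' ')
assert len(fields) - 1 != embedding_dim  # 4 != 3, so the reader would skip this line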
Example #31
def _read_pretrained_embedding_file(
        embeddings_filename: str,
        embedding_dim: int,
        vocab: Vocabulary,
        namespace: str = "tokens") -> torch.FloatTensor:
    """
    Reads a pre-trained embedding file and generates a weight matrix initialized to the
    pre-trained embeddings, which can then be used to construct an Embedding layer.

    We use the ``Vocabulary`` to map from the word strings in the embeddings file to the indices
    that we need, and to know which words from the embeddings file we can safely ignore.

    Parameters
    ----------
    embeddings_filename : str, required.
        The path to a file containing pretrained embeddings. The embeddings
        file is assumed to be gzipped and space delimited, e.g. [word] [dim 1] [dim 2] ...
    embedding_dim : int, required.
        The expected dimensionality of the embeddings; lines with a different number of
        values are skipped.
    vocab : Vocabulary, required.
        A Vocabulary object.
    namespace : str, (optional, default=tokens)
        The namespace of the vocabulary to find pretrained embeddings for.

    Returns
    -------
    A weight matrix with embeddings initialized from the read file.  The matrix has shape
    ``(vocab.get_vocab_size(namespace), embedding_dim)``, where the indices of words appearing in
    the pretrained embedding file are initialized to the pretrained embedding value.
    """
    words_to_keep = set(
        vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    words_found = set()
    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading embeddings from file; {}".format(len(words_to_keep)))
    with gzip.open(embeddings_filename, 'rb') as embeddings_file:
        for line in embeddings_file:
            fields = line.decode('utf-8').strip().split(' ')
            if len(fields) - 1 != embedding_dim:
                # Sometimes there are funny unicode parsing problems that lead to different
                # fields lengths (e.g., a word with a unicode space character that splits
                # into more than one column).  We skip those lines.  Note that if you have
                # some kind of long header, this could result in all of your lines getting
                # skipped.  It's hard to check for that here; you just have to look in the
                # embedding_misses_file and at the model summary to make sure things look
                # like they are supposed to.
                logger.warning(
                    "Found line with wrong number of dimensions (expected %d, was %d): %s",
                    embedding_dim,
                    len(fields) - 1, line)
                continue
            word = fields[0]
            if word in words_to_keep:
                words_found.add(word)
                vector = numpy.asarray(fields[1:], dtype='float32')
                embeddings[word] = vector
    notfound = words_to_keep.difference(words_found)
    logger.info("Loaded embeddings for %d words; %d vocabulary words were not found in the file",
                len(words_found), len(notfound))
    #"""
    with open("/home/kz918/bpe/eval/bidaf/not_found.txt",
              'w',
              encoding='utf-8') as f:
        for word in notfound:
            f.write(word)
            f.write('\n')
    #"""
    #assert len(notfound) < 10
    if not embeddings:
        raise ConfigurationError(
            "No embeddings of correct dimension found; you probably "
            "misspecified your embedding_dim parameter, or didn't "
            "pre-populate your Vocabulary")

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(
        embeddings_mean, embeddings_std)

    for i in range(0, vocab_size):
        word = vocab.get_token_from_index(i, namespace)

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if word in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[word])
        else:
            logger.debug(
                "Word %s was not found in the embedding file. Initialising randomly.",
                word)

    # The weight matrix is initialized, so we construct and return the actual Embedding.
    return embedding_matrix
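A hedged usage sketch of this final variant. The file path and vocabulary setup below are hypothetical; in practice the vocabulary would be built from the dataset so the "tokens" namespace is populated before loading:

from allennlp.data import Vocabulary

# Hypothetical pre-built vocabulary directory and embedding file.
vocab = Vocabulary.from_files("vocabulary/")
weight_matrix = _read_pretrained_embedding_file(
    "glove.6B.100d.txt.gz",  # gzipped, space-delimited: [word] [dim 1] [dim 2] ...
    embedding_dim=100,
    vocab=vocab,
    namespace="tokens")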