def __init__(
    self,
    vocab: Vocabulary,
    text_field_embedder: TextFieldEmbedder,
    elmo_text_field_embedder: TextFieldEmbedder,
    quote_response_encoder: Seq2SeqEncoder,
    quote_response_encoder_aux: Seq2VecEncoder,
    classifier_feedforward: FeedForward,
    classifier_feedforward_2: FeedForward,
    initializer: InitializerApplicator = InitializerApplicator(),
    regularizer: Optional[RegularizerApplicator] = None,
    report_auxiliary_metrics: bool = False,
) -> None:
    super(SarcasmClassifier, self).__init__(vocab, regularizer)
    self.text_field_embedder = text_field_embedder
    self.elmo_text_field_embedder = elmo_text_field_embedder
    self.num_classes = self.vocab.get_vocab_size("labels")
    self.num_classes_emotions = self.vocab.get_vocab_size("emotion_labels")
    self.quote_response_encoder = quote_response_encoder
    self.quote_response_encoder_aux = quote_response_encoder_aux
    self.classifier_feedforward = classifier_feedforward
    self.classifier_feedforward_2 = classifier_feedforward_2
    self.attention_seq2seq = Attention(quote_response_encoder.get_output_dim())

    self.label_acc_metrics = {"accuracy": CategoricalAccuracy()}
    self.label_f1_metrics = {}
    self.label_f1_metrics_emotions = {}
    # Per-class F1 for the main (sarcasm) task and the auxiliary emotion task.
    for i in range(self.num_classes):
        self.label_f1_metrics[vocab.get_token_from_index(index=i, namespace="labels")] = \
            F1Measure(positive_label=i)
    for i in range(self.num_classes_emotions):
        self.label_f1_metrics_emotions[vocab.get_token_from_index(index=i, namespace="emotion_labels")] = \
            F1Measure(positive_label=i)

    self.loss = torch.nn.CrossEntropyLoss()
    self.report_auxiliary_metrics = report_auxiliary_metrics
    initializer(self)
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             node_embedder: TokenEmbedder,
             verbose_metrics: bool,
             classifier_feedforward: FeedForward,
             use_node_vector: bool = True,
             use_abstract: bool = True,
             dropout: float = 0.2,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super(AclClassifier, self).__init__(vocab, regularizer)
    self.node_embedder = node_embedder
    self.text_field_embedder = text_field_embedder
    self.use_node_vector = use_node_vector
    self.use_abstract = use_abstract
    self.dropout = torch.nn.Dropout(dropout)
    self.num_classes = self.vocab.get_vocab_size("labels")
    self.classifier_feedforward = classifier_feedforward

    self.label_accuracy = CategoricalAccuracy()
    self.label_f1_metrics = {}
    self.verbose_metrics = verbose_metrics
    for i in range(self.num_classes):
        label_name = vocab.get_token_from_index(index=i, namespace="labels")
        self.label_f1_metrics[label_name] = F1Measure(positive_label=i)
    self.loss = torch.nn.CrossEntropyLoss()
    initializer(self)
def __init__(
    self,
    vocab: Vocabulary,
    text_field_embedder: TextFieldEmbedder,
    verbose_metrics: bool,
    dropout: float = 0.2,
    initializer: InitializerApplicator = InitializerApplicator(),
    regularizer: Optional[RegularizerApplicator] = None,
) -> None:
    super(TextClassifier, self).__init__(vocab, regularizer)
    self.text_field_embedder = text_field_embedder
    self.dropout = torch.nn.Dropout(dropout)
    self.num_classes = self.vocab.get_vocab_size("labels")
    self.classifier_feedforward = torch.nn.Linear(
        self.text_field_embedder.get_output_dim(), self.num_classes)

    self.label_accuracy = CategoricalAccuracy()
    self.label_f1_metrics = {}
    self.verbose_metrics = verbose_metrics
    for i in range(self.num_classes):
        self.label_f1_metrics[vocab.get_token_from_index(
            index=i, namespace="labels")] = F1Measure(positive_label=i)
    self.loss = torch.nn.CrossEntropyLoss()
    initializer(self)
def __init__(self,
             vocab: Vocabulary,
             quote_response_encoder: Seq2VecEncoder,
             bert_model_name: str = None,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super(SarcasmClassifier, self).__init__(vocab, regularizer)
    self.quote_response_encoder = quote_response_encoder
    self.text_field_embedder = BertModel.from_pretrained(bert_model_name)
    self.num_classes_emotions = self.vocab.get_vocab_size("labels")
    self.linear = nn.Linear(200, self.num_classes_emotions)

    self.label_acc_metrics = {"accuracy": CategoricalAccuracy()}
    self.label_f1_metrics_emotions = {}
    for i in range(self.num_classes_emotions):
        self.label_f1_metrics_emotions[vocab.get_token_from_index(index=i, namespace="labels")] = \
            F1Measure(positive_label=i)
    self.loss = torch.nn.CrossEntropyLoss()
    initializer(self)
def compute_background_log_frequency(vocab: Vocabulary,
                                     vocab_namespace: str,
                                     precomputed_bg_file=None):
    """
    Load in the word counts from the JSON file and compute the background
    log term frequency w.r.t. this vocabulary.
    """
    log_term_frequency = torch.FloatTensor(vocab.get_vocab_size(vocab_namespace))
    if precomputed_bg_file is not None:
        with open(precomputed_bg_file, "r") as file_:
            precomputed_bg = json.load(file_)
    else:
        precomputed_bg = vocab._retained_counter.get(vocab_namespace)  # pylint: disable=protected-access
        if precomputed_bg is None:
            return log_term_frequency
    for i in range(vocab.get_vocab_size(vocab_namespace)):
        token = vocab.get_token_from_index(i, vocab_namespace)
        if (token in ("@@UNKNOWN@@", "@@PADDING@@", "@@START@@", "@@END@@")
                or token not in precomputed_bg
                or precomputed_bg[token] == 0):
            # Special tokens and unseen/zero-count tokens get a tiny floor
            # frequency so the log below stays finite.
            log_term_frequency[i] = 1e-12
        else:
            log_term_frequency[i] = precomputed_bg[token]
    log_term_frequency = torch.log(log_term_frequency)
    return log_term_frequency
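# --- Usage sketch (added for illustration, not part of the original snippet).
# It assumes AllenNLP's Vocabulary API; the counts and file below are made up.
# The resulting vector is typically used as the fixed background term
# distribution of a neural topic model.
import json
import tempfile
from collections import Counter
from allennlp.data import Vocabulary

vocab = Vocabulary(counter={"tokens": Counter({"the": 100, "cat": 7})})
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump({"the": 100, "cat": 7}, f)
bg = compute_background_log_frequency(vocab, "tokens", f.name)
# One entry per vocabulary index; padding/unknown entries fall back to
# log(1e-12).
assert bg.shape[0] == vocab.get_vocab_size("tokens")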
def embed(self, vocab: Vocabulary, tokens: torch.Tensor) -> torch.Tensor:
    """
    Idea: reconstruct string tokens from token ids -> feed to spacy -> return tensors
    :param vocab:
    :param tokens:
    :return:
    """
    with SwitchDefaultTensor():
        embedded_sentences = []
        tokens_cpu = tokens.cpu()
        batch_size, seq_len = tokens.shape
        for sentence in tokens_cpu:
            str_tokens: List[str] = [
                vocab.get_token_from_index(int(token))
                for token in sentence if token != 0
            ]  # skip padding
            doc = Doc(self.nlp.vocab, words=str_tokens)
            self.nlp.pipeline[1][1](doc)  # word pieces
            self.nlp.pipeline[2][1](doc)  # run transformer on wordpieces
            # add padding back in
            # embedded = torch.from_numpy(cupy.asnumpy(doc.tensor)).to(device)  # shape (str_tokens, output dim)
            embedded = from_dlpack(doc.tensor.toDlpack())  # shape (str_tokens, output dim)
            assert embedded.shape == (len(str_tokens), self.get_output_dim())
            if seq_len - len(str_tokens) > 0:
                padded = torch.zeros(seq_len - len(str_tokens),
                                     self.get_output_dim())
                embedded = torch.cat([embedded, padded], dim=0)
            embedded_sentences.append(embedded)
        return torch.stack(embedded_sentences, dim=0)
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             text_encoder: Seq2SeqEncoder,
             classifier_feedforward: FeedForward,
             verbose_metrics: bool,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None,
             ) -> None:
    super(TextClassifier, self).__init__(vocab, regularizer)
    self.text_field_embedder = text_field_embedder
    self.num_classes = self.vocab.get_vocab_size("labels")
    self.text_encoder = text_encoder
    self.classifier_feedforward = classifier_feedforward

    self.label_accuracy = CategoricalAccuracy()
    self.label_f1_metrics = {}
    self.verbose_metrics = verbose_metrics
    for i in range(self.num_classes):
        self.label_f1_metrics[vocab.get_token_from_index(
            index=i, namespace="labels")] = F1Measure(positive_label=i)
    self.loss = torch.nn.CrossEntropyLoss()
    # Pool by taking the final states of the (assumed bidirectional) encoder.
    self.pool = lambda text, mask: util.get_final_encoder_states(
        text, mask, bidirectional=True)
    initializer(self)
def compute_background_log_frequency(vocab: Vocabulary,
                                     vocab_namespace: str,
                                     precomputed_bg_file=None):
    """
    Load in the word counts from the JSON file and compute the background
    log term frequency w.r.t. this vocabulary.
    """
    # Sample a probability vector from a symmetric Dirichlet as the default
    # background, so tokens without counts still get non-degenerate mass.
    log_term_frequency = torch.distributions.dirichlet.Dirichlet(
        torch.ones(vocab.get_vocab_size(vocab_namespace))).sample()
    if precomputed_bg_file is not None:
        with open(precomputed_bg_file, "r") as file_:
            precomputed_bg = json.load(file_)
    else:
        precomputed_bg = vocab._retained_counter.get(vocab_namespace)  # pylint: disable=protected-access
        if precomputed_bg is None:
            return log_term_frequency
    for i in range(vocab.get_vocab_size(vocab_namespace)):
        token = vocab.get_token_from_index(i, vocab_namespace)
        if token in precomputed_bg:
            log_term_frequency[i] = precomputed_bg[token]
        else:
            # Special tokens ("@@UNKNOWN@@", "@@PADDING@@", "@@START@@",
            # "@@END@@") and tokens missing from the background counts.
            log_term_frequency[i] = 1e-12
    assert log_term_frequency.sum().allclose(torch.ones(1))
    log_term_frequency = torch.log(log_term_frequency)
    return log_term_frequency
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             classifier_feedforward: FeedForward,
             elmo: Elmo = None,
             use_input_elmo: bool = False):
    super().__init__(vocab)
    self.elmo = elmo
    self.use_elmo = use_input_elmo
    self.text_field_embedder = text_field_embedder
    self.num_classes = self.vocab.get_vocab_size("labels")
    self.encoder = encoder
    self.classifier_feed_forward = classifier_feedforward

    self.label_accuracy = CategoricalAccuracy()
    self.label_f1_metrics = {}
    # create F1 Measures for each class
    for i in range(self.num_classes):
        self.label_f1_metrics[vocab.get_token_from_index(index=i, namespace="labels")] = \
            F1Measure(positive_label=i)
    self.loss = torch.nn.CrossEntropyLoss()
    self.attention = Attention(encoder.get_output_dim())
def embed(self, vocab: Vocabulary, tokens: torch.Tensor) -> torch.Tensor:
    """
    Idea: reconstruct string tokens from token ids -> feed to spacy -> return tensors
    :param vocab:
    :param tokens:
    :return:
    """
    with SwitchDefaultTensor():
        with torch.autograd.no_grad():
            embedded_sentences = []
            tokens_cpu = tokens.cpu()
            batch_size, seq_len = tokens.shape
            sents = []
            for sentence in tokens_cpu:
                str_tokens: List[str] = [
                    vocab.get_token_from_index(int(token))
                    for token in sentence if token != 0
                ]  # skip padding
                sents.append(str_tokens)
            doc = make_doc(self.nlp.vocab, sents)
            self.nlp.pipeline[1][1](doc)  # word pieces
            self.nlp.pipeline[2][1](doc)  # run transformer on wordpieces
            # Now iterate over sentences in correct order and cut out the
            # correct tensor + pad it.
            for sent, str_tokens in zip(doc.sents, sents):
                # add padding back in
                embedded = from_dlpack(sent.tensor.toDlpack())  # shape (str_tokens, output dim)
                if seq_len - len(str_tokens) > 0:
                    padded = torch.zeros(seq_len - len(str_tokens),
                                         self.get_output_dim())
                    embedded = torch.cat([embedded, padded], dim=0)
                embedded_sentences.append(embedded)
            return torch.stack(embedded_sentences, dim=0)
def __init__(self,
             vocab: Vocabulary,
             input_embedder: TextFieldEmbedder,
             encoder: Encoder = None,
             dropout: float = None,
             initializer: InitializerApplicator = InitializerApplicator()) -> None:
    """
    Parameters
    ----------
    vocab: `Vocabulary`
        vocab to use
    input_embedder: `TextFieldEmbedder`
        generic embedder of tokens
    encoder: `Encoder`, optional (default = None)
        Seq2Vec or Seq2Seq Encoder wrapper. If no encoder is provided,
        assume that the input is a bag of word counts, for linear classification.
    dropout: `float`, optional (default = None)
        if set, will apply dropout to output of encoder.
    initializer: `InitializerApplicator`
        generic initializer
    """
    super().__init__(vocab)
    self._input_embedder = input_embedder
    if dropout:
        self._dropout = torch.nn.Dropout(dropout)
    else:
        self._dropout = None
    self._encoder = encoder
    self._num_labels = vocab.get_vocab_size(namespace="labels")
    if self._encoder:
        self._clf_input_dim = self._encoder.get_output_dim()
    else:
        self._clf_input_dim = self._input_embedder.get_output_dim()
    self._classification_layer = torch.nn.Linear(self._clf_input_dim,
                                                 self._num_labels)
    self.attn = torch.nn.Parameter(torch.randn(5, self._num_labels))
    self._accuracy = CategoricalAccuracy()
    self.label_f1_metrics = {}
    self.label_order = []
    for i in range(self._num_labels):
        label = vocab.get_token_from_index(index=i, namespace="labels")
        self.label_f1_metrics[label] = F1Measure(positive_label=i)
        self.label_order.append(label)
    self._loss = torch.nn.CrossEntropyLoss()
    initializer(self)
def _get_vocab_index_mapping(self, archived_vocab: Vocabulary) -> List[Tuple[int, int]]:
    vocab_index_mapping: List[Tuple[int, int]] = []
    for index in range(self.vocab.get_vocab_size(namespace='tokens')):
        token = self.vocab.get_token_from_index(index=index, namespace='tokens')
        archived_token_index = archived_vocab.get_token_index(token, namespace='tokens')
        # Checking if we got the UNK token index, because we don't want all new token
        # representations initialized to UNK token's representation. We do that by checking if
        # the two tokens are the same. They will not be if the token at the archived index is
        # UNK.
        if archived_vocab.get_token_from_index(archived_token_index, namespace="tokens") == token:
            vocab_index_mapping.append((index, archived_token_index))
    return vocab_index_mapping
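# --- Illustrative usage sketch (added, not in the original source): given the
# (new_index, old_index) pairs produced above, transplant pretrained embedding
# rows from an archived model into a freshly initialized embedding matrix.
# The two weight tensors are hypothetical stand-ins.
import torch

def transfer_embedding_rows(mapping, old_weight: torch.Tensor,
                            new_weight: torch.Tensor) -> None:
    # Rows absent from the mapping keep their random initialization,
    # which avoids initializing every new token to the UNK vector.
    with torch.no_grad():
        for new_index, old_index in mapping:
            new_weight[new_index] = old_weight[old_index]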
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             user_utterance_encoder: Seq2VecEncoder,
             prev_user_utterance_encoder: Seq2VecEncoder,
             prev_sys_utterance_encoder: Seq2VecEncoder,
             classifier_feedforward: FeedForward,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super(IntentClassifier, self).__init__(vocab, regularizer)
    self.text_field_embedder = text_field_embedder
    self.num_classes = self.vocab.get_vocab_size("labels")
    self.user_utterance_encoder = user_utterance_encoder
    self.prev_user_utterance_encoder = prev_user_utterance_encoder
    self.prev_sys_utterance_encoder = prev_sys_utterance_encoder
    self.classifier_feedforward = classifier_feedforward

    # Each utterance encoder must consume exactly what the embedder produces.
    for name, utterance_encoder in [
            ("user_utterance_encoder", user_utterance_encoder),
            ("prev_user_utterance_encoder", prev_user_utterance_encoder),
            ("prev_sys_utterance_encoder", prev_sys_utterance_encoder)]:
        if text_field_embedder.get_output_dim() != utterance_encoder.get_input_dim():
            raise ConfigurationError(
                "The output dimension of the text_field_embedder must match the "
                "input dimension of the {}. Found {} and {}, respectively.".format(
                    name, text_field_embedder.get_output_dim(),
                    utterance_encoder.get_input_dim()))

    self.label_accuracy = CategoricalAccuracy()
    self.label_f1_metrics = {}
    for i in range(self.num_classes):
        self.label_f1_metrics[vocab.get_token_from_index(
            index=i, namespace="labels")] = F1Measure(positive_label=i)
    self.loss = torch.nn.CrossEntropyLoss()
    initializer(self)
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             label_encoder: Seq2VecEncoder,
             calculate_span_f1: bool = None,
             tag_encoding: Optional[str] = None,
             tag_namespace: str = "tags",
             verbose_metrics: bool = False,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super(ParamTaggerPipeline, self).__init__(vocab, regularizer)
    self.label_encoder = label_encoder
    self.tag_namespace = tag_namespace
    self.text_field_embedder = text_field_embedder
    self.num_classes = self.vocab.get_vocab_size(tag_namespace)
    self.encoder = encoder
    self._verbose_metrics = verbose_metrics
    self.tag_projection_layer = TimeDistributed(
        Linear(self.encoder.get_output_dim(), self.num_classes))

    check_dimensions_match(text_field_embedder.get_output_dim(),
                           encoder.get_input_dim(),
                           "text field embedding dim", "encoder input dim")

    # We keep calculate_span_f1 as a constructor argument for API consistency
    # with the CrfTagger, even though it is redundant in this class
    # (tag_encoding serves the same purpose).
    if calculate_span_f1 and not tag_encoding:
        raise ConfigurationError("calculate_span_f1 is True, but "
                                 "no tag_encoding was specified.")

    self.accuracy = CategoricalAccuracy()
    if calculate_span_f1 or tag_encoding:
        self._f1_metric = SpanBasedF1Measure(vocab,
                                             tag_namespace=tag_namespace,
                                             tag_encoding=tag_encoding)
    else:
        self._f1_metric = None
    self.f1 = SpanBasedF1Measure(vocab, tag_namespace=tag_namespace)
    self.tag_f1_metrics = {}
    for i in range(self.num_classes):
        self.tag_f1_metrics[vocab.get_token_from_index(
            index=i, namespace=tag_namespace)] = F1Measure(positive_label=i)
    initializer(self)
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             text_encoder: Seq2SeqEncoder,
             classifier_feedforward: FeedForward,
             verbose_metrics: bool,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None,
             loss: Optional[dict] = None,
             ) -> None:
    super(TextClassifier, self).__init__(vocab, regularizer)
    self.text_field_embedder = text_field_embedder
    self.num_classes = self.vocab.get_vocab_size("labels")
    self.text_encoder = text_encoder
    self.classifier_feedforward = classifier_feedforward
    self.prediction_layer = torch.nn.Linear(
        self.classifier_feedforward.get_output_dim(), self.num_classes)
    self.pool = lambda text, mask: util.get_final_encoder_states(
        text, mask, bidirectional=True)

    self.label_accuracy = CategoricalAccuracy()
    self.label_f1_metrics = {}
    for i in range(self.num_classes):
        self.label_f1_metrics[vocab.get_token_from_index(
            index=i, namespace="labels")] = F1Measure(positive_label=i)
    self.verbose_metrics = verbose_metrics

    if loss is None:
        self.loss = torch.nn.CrossEntropyLoss()
    else:
        alpha = loss.get('alpha')
        gamma = loss.get('gamma')
        weight = loss.get('weight')
        if alpha is not None:
            alpha = float(alpha)
        if gamma is not None:
            gamma = float(gamma)
        if weight is not None:
            # Binary class weighting: [negative, positive].
            weight = torch.tensor([1.0, float(weight)])
        loss_type = loss.get('type')
        if loss_type == 'CrossEntropyLoss':
            self.loss = torch.nn.CrossEntropyLoss(weight=weight)
        elif loss_type == 'BinaryFocalLoss':
            self.loss = BinaryFocalLoss(alpha=alpha, gamma=gamma)
        elif loss_type == 'FocalLoss':
            self.loss = FocalLoss(alpha=alpha, gamma=gamma)
        elif loss_type == 'MultiLabelMarginLoss':
            self.loss = torch.nn.MultiLabelMarginLoss()
        elif loss_type == 'MultiLabelSoftMarginLoss':
            self.loss = torch.nn.MultiLabelSoftMarginLoss(weight)
        else:
            raise ValueError(f'Unexpected loss "{loss}"')
    initializer(self)
def label_for_index(vocab: Vocabulary, idx: int) -> str:
    """Gets the label string for a label ``int`` id.

    Parameters
    ----------
    vocab: `allennlp.data.Vocabulary`
    idx: `int`
        the token index

    Returns
    -------
    label: `str`
        The string for a label id
    """
    return vocab.get_token_from_index(idx, namespace=LABELS_NAMESPACE)
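# --- Illustrative usage (added; assumes LABELS_NAMESPACE is the module's
# label namespace constant, e.g. "labels", and a vocab with that namespace
# populated):
from typing import List
import torch

def decode_predictions(vocab: Vocabulary, logits: torch.Tensor) -> List[str]:
    # logits: (batch_size, num_labels) -> one label string per row.
    return [label_for_index(vocab, int(i)) for i in logits.argmax(dim=-1)]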
def __init__(
    self,
    vocab: Vocabulary,
    text_field_embedder: TextFieldEmbedder,
    seq2vec_encoder: Seq2VecEncoder,
    feedforward_layer: FeedForward,
    seq2seq_encoder: Seq2SeqEncoder = None,
    dropout: float = None,
    num_labels: int = None,
    label_namespace: str = "labels",
    initializer: InitializerApplicator = InitializerApplicator(),
    regularizer: Optional[RegularizerApplicator] = None,
) -> None:
    super().__init__(vocab, regularizer)
    self._text_field_embedder = text_field_embedder
    if seq2seq_encoder:
        self._seq2seq_encoder = seq2seq_encoder
    else:
        self._seq2seq_encoder = None
    self._seq2vec_encoder = seq2vec_encoder
    self._classifier_input_dim = self._seq2vec_encoder.get_output_dim()
    if dropout:
        self._dropout = torch.nn.Dropout(dropout)
    else:
        self._dropout = None
    self._label_namespace = label_namespace
    if num_labels:
        self._num_labels = num_labels
    else:
        self._num_labels = vocab.get_vocab_size(namespace=self._label_namespace)
    self._feedforward_layer = feedforward_layer
    self._classification_layer = torch.nn.Linear(self._classifier_input_dim,
                                                 self._num_labels)
    self._accuracy = CategoricalAccuracy()
    self._label_f1_metrics: Dict[str, F1Measure] = {}
    # Use the configured label namespace rather than a hardcoded "labels".
    for i in range(self._num_labels):
        self._label_f1_metrics[vocab.get_token_from_index(
            index=i, namespace=self._label_namespace)] = F1Measure(positive_label=i)
    self._loss = torch.nn.CrossEntropyLoss()
    initializer(self)
def __init__(
    self,
    vocab: Vocabulary,
    text_field_embedder: TextFieldEmbedder,
    verbose_metrics: bool,
    dropout: float = 0.2,
    initializer: InitializerApplicator = InitializerApplicator(),
    regularizer: Optional[RegularizerApplicator] = None,
    loss: Optional[dict] = None,
) -> None:
    super(TextClassifier, self).__init__(vocab, regularizer)
    self.text_field_embedder = text_field_embedder
    self.dropout = torch.nn.Dropout(dropout)
    self.num_classes = self.vocab.get_vocab_size("labels")
    self.classifier_feedforward = torch.nn.Linear(
        self.text_field_embedder.get_output_dim(), self.num_classes)

    self.label_accuracy = CategoricalAccuracy()
    self.label_f1_metrics = {}
    self.verbose_metrics = verbose_metrics
    for i in range(self.num_classes):
        self.label_f1_metrics[vocab.get_token_from_index(
            index=i, namespace="labels")] = F1Measure(positive_label=i)

    if loss is None or loss.get('type') == 'CrossEntropyLoss':
        self.loss = torch.nn.CrossEntropyLoss()
    elif loss.get('type') == 'BinaryFocalLoss':
        self.loss = BinaryFocalLoss(alpha=loss.get('alpha'),
                                    gamma=loss.get('gamma'))
    elif loss.get('type') == 'FocalLoss':
        self.loss = FocalLoss(alpha=loss.get('alpha'),
                              gamma=loss.get('gamma'))
    elif loss.get('type') == 'MultiLabelMarginLoss':
        self.loss = torch.nn.MultiLabelMarginLoss()
    elif loss.get('type') == 'MultiLabelSoftMarginLoss':
        self.loss = torch.nn.MultiLabelSoftMarginLoss(
            weight=torch.tensor(loss.get('weight')) if 'weight' in loss else None)
    else:
        raise ValueError(f'Unexpected loss "{loss}"')
    initializer(self)
def prediction_to_rows(*, fold: str, guesser_name: str, vocab: Vocabulary,
                       question: Instance, prediction) -> List[Dict[str, Any]]:
    top_scores = prediction["top_k_scores"]
    top_indices = prediction["top_k_indices"]
    meta = question["metadata"]
    rows = []
    for score, guess_idx in zip(top_scores, top_indices):
        guess = vocab.get_token_from_index(guess_idx, namespace="page_labels")
        rows.append({
            "qanta_id": meta["qanta_id"],
            "proto_id": meta["proto_id"],
            "char_index": meta["char_idx"],
            "guess": guess,
            "score": score,
            "fold": fold,
            "guesser": guesser_name,
        })
    return rows
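# --- Illustrative usage (added): dump the top-k guesses for a batch of
# questions to a JSON-lines file. `questions`, `predictions`, and `vocab` are
# hypothetical stand-ins for the instances, model outputs, and vocabulary.
import json

with open("guesses.jsonl", "w") as out:
    for question, prediction in zip(questions, predictions):
        for row in prediction_to_rows(fold="dev", guesser_name="rnn",
                                      vocab=vocab, question=question,
                                      prediction=prediction):
            out.write(json.dumps(row) + "\n")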
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             node_embedder: TokenEmbedder,
             null_text_embedder: TokenEmbedder,
             verbose_metrics: bool,
             classifier_feedforward: FeedForward,
             use_node_vector: bool = True,
             use_text: bool = True,
             dropout: float = 0.2,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super(AclEdgeClassifier, self).__init__(vocab, regularizer)
    self.node_embedder = node_embedder
    self.text_field_embedder = text_field_embedder
    # Instead of setting use_node_vector here, omit the embedding path in the
    # config to get randomly initialized embeddings.
    # self.use_node_vector = use_node_vector
    self.use_text = use_text
    self.null_text_embedder = null_text_embedder
    self.dropout = torch.nn.Dropout(dropout)
    self.num_classes = self.vocab.get_vocab_size("labels")
    self.sep_index = self.vocab.get_token_index("[SEP]")
    self.classifier_feedforward = classifier_feedforward

    self.label_accuracy = CategoricalAccuracy()
    self.label_f1_metrics = {}
    self.verbose_metrics = verbose_metrics
    for i in range(self.num_classes):
        self.label_f1_metrics[vocab.get_token_from_index(
            index=i, namespace="labels")] = F1Measure(positive_label=i)
    self.confusion_matrix = ConfusionMatrix(self.num_classes)
    self.loss = torch.nn.CrossEntropyLoss()
    initializer(self)
def convert_indices_to_string(hyps: List[List[Hypothesis]],
                              metadata: List[Dict[str, Any]],
                              vocab: Vocabulary,
                              end_token: str = "[SEP]",
                              return_all: bool = False,
                              index_name: str = "tokens"):
    """Convert the token ids in ``hyps`` to the resulting rewrite strings."""
    vocab_size = vocab.get_vocab_size(namespace=index_name)
    rewrite_tokens = []
    rewrite_strings = []
    origin_rewrite_strings = []
    origin_query_strings = []
    other_rewrite_strings = []
    # for each instance
    for hyp, mdata in zip(hyps, metadata):
        oovs = mdata['oovs']
        if 'rewrite' in mdata:
            origin_query_words = mdata['query_words']
            origin_rw_words = mdata['rewrite']
        other_rw_string = []
        for i, h in enumerate(hyp):
            word_ids = h.tokens
            words = []
            for wid in word_ids:
                try:
                    w = vocab.get_token_from_index(wid, namespace=index_name)
                except Exception:
                    # Ids past the vocabulary size refer to this dialogue's
                    # copied OOV words.
                    assert oovs is not None, "Error: No oov words in the dialogue!"
                    dialogue_oov_idx = wid - vocab_size
                    try:
                        w = oovs[dialogue_oov_idx]
                    except Exception:
                        raise ValueError(
                            "Error: model produced word ID %i corresponding to dialogue OOV %i "
                            "but this example only has %i OOV words."
                            % (wid, dialogue_oov_idx, len(oovs)))
                words.append(w)
            if i == 0:
                if 'rewrite' in mdata:
                    origin_query_strings.append(origin_query_words)
                    origin_rewrite_strings.append(origin_rw_words)
                # find the end position
                try:
                    stop_idx = words.index(end_token)
                    words = words[:stop_idx]
                except ValueError:
                    pass
                rewrite_tokens.append(words)
                rewrite_strings.append("".join(words))
                if not return_all:
                    break
            else:
                other_rw_string.append("".join(words))
        other_rewrite_strings.append(other_rw_string)
    # return result string, rewrite_token, gold_target and origin_query
    output_dict = {}
    output_dict['rewrite_string'] = rewrite_strings
    output_dict['rewrite_token'] = rewrite_tokens
    output_dict['gold_target'] = origin_rewrite_strings
    output_dict['origin_query'] = origin_query_strings
    # if return_all, return the other rewrite results (not only the best)
    if return_all:
        output_dict['other_rewrites'] = other_rewrite_strings
    return output_dict
# Notebook-style fragment; `idx`, `vocab`, and `tokenizer` come from earlier
# cells. The original began mid-function, so the def line is reconstructed.
def prepare_instance(s):
    tokens = [Token(t) for t in s.split(" ")]
    indexed = idx.tokens_to_indices(tokens, vocab)
    print([vocab.get_token_from_index(i) for i in indexed['token_ids']])
    return Instance({"tokens": TextField(tokens, {"tokens": idx})})

instances = [prepare_instance("ϩⲙⲡⲣⲁⲛ ⲙⲡⲛⲟⲩⲧⲉ ⲛϣⲟⲣⲡ ⲁⲛⲟⲕ"),
             prepare_instance("ϩⲙⲡⲣⲁⲛ ⲙⲡⲛⲟⲩⲧⲉ ⲛϣⲟⲣⲡ ⲁⲛⲟⲕ")]
for instance in instances:
    instance["tokens"].index(vocab)
tensors = [instance.as_tensor_dict() for instance in instances]

collator = DataCollatorForWholeWordMask(tokenizer=tokenizer)
ids = torch.cat([tensors[0]['tokens']['tokens']['token_ids'].unsqueeze(0),
                 tensors[1]['tokens']['tokens']['token_ids'].unsqueeze(0)], dim=0)

# Build a whole-word mask per row, then mask the whole batch at once.
wwms = []
for row in range(ids.shape[0]):
    row_tokens = [vocab.get_token_from_index(t.item()) for t in ids[row]]
    wwms.append(torch.tensor(collator._whole_word_mask(row_tokens)).unsqueeze(0))
wwms = torch.cat(wwms, dim=0)

masked_ids, labels = collator.mask_tokens(ids, wwms)
# `out` below is the model output from an earlier cell not shown here.
print([vocab.get_token_from_index(i.item()) for i in out[0][0]])
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             user_utterance_encoder: Seq2VecEncoder,
             prev_user_utterance_encoder: Seq2VecEncoder,
             prev_sys_utterance_encoder: Seq2VecEncoder,
             classifier_feedforward: FeedForward,
             encoder: Seq2SeqEncoder,
             calculate_span_f1: bool = None,
             tag_encoding: Optional[str] = None,
             tag_namespace: str = "tags",
             verbose_metrics: bool = False,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super(IntentParamClassifier, self).__init__(vocab, regularizer)

    # Intent task
    self.text_field_embedder = text_field_embedder
    self.label_num_classes = self.vocab.get_vocab_size("labels")
    self.user_utterance_encoder = user_utterance_encoder
    self.prev_user_utterance_encoder = prev_user_utterance_encoder
    self.prev_sys_utterance_encoder = prev_sys_utterance_encoder
    self.classifier_feedforward = classifier_feedforward
    # Each utterance encoder must consume exactly what the embedder produces.
    for name, utterance_encoder in [
            ("user_utterance_encoder", user_utterance_encoder),
            ("prev_user_utterance_encoder", prev_user_utterance_encoder),
            ("prev_sys_utterance_encoder", prev_sys_utterance_encoder)]:
        if text_field_embedder.get_output_dim() != utterance_encoder.get_input_dim():
            raise ConfigurationError(
                "The output dimension of the text_field_embedder must match the "
                "input dimension of the {}. Found {} and {}, respectively.".format(
                    name, text_field_embedder.get_output_dim(),
                    utterance_encoder.get_input_dim()))
    self.label_accuracy = CategoricalAccuracy()
    self.label_f1_metrics = {}
    for i in range(self.label_num_classes):
        self.label_f1_metrics[vocab.get_token_from_index(
            index=i, namespace="labels")] = F1Measure(positive_label=i)
    self.loss = torch.nn.CrossEntropyLoss()

    # Param task
    self.tag_namespace = tag_namespace
    self.tag_num_classes = self.vocab.get_vocab_size(tag_namespace)
    self.encoder = encoder
    self._verbose_metrics = verbose_metrics
    self.tag_projection_layer = TimeDistributed(
        Linear(self.encoder.get_output_dim(), self.tag_num_classes))
    check_dimensions_match(text_field_embedder.get_output_dim(),
                           encoder.get_input_dim(),
                           "text field embedding dim", "encoder input dim")
    # We keep calculate_span_f1 as a constructor argument for API consistency
    # with the CrfTagger, even though it is redundant in this class
    # (tag_encoding serves the same purpose).
    if calculate_span_f1 and not tag_encoding:
        raise ConfigurationError("calculate_span_f1 is True, but "
                                 "no tag_encoding was specified.")
    self.tag_accuracy = CategoricalAccuracy()
    if calculate_span_f1 or tag_encoding:
        self._f1_metric = SpanBasedF1Measure(vocab,
                                             tag_namespace=tag_namespace,
                                             tag_encoding=tag_encoding)
    else:
        self._f1_metric = None
    self.f1 = SpanBasedF1Measure(vocab, tag_namespace=tag_namespace)
    self.tag_f1_metrics = {}
    for k in range(self.tag_num_classes):
        self.tag_f1_metrics[vocab.get_token_from_index(
            index=k, namespace=tag_namespace)] = F1Measure(positive_label=k)
    initializer(self)
def construct_trees(vocab: Vocabulary,
                    namespace: str,
                    predictions: torch.FloatTensor,
                    all_spans: torch.LongTensor,
                    num_spans: torch.LongTensor,
                    sentences: List[List[str]],
                    pos_tags: List[List[str]] = None) -> List[Tree]:
    """
    Construct ``nltk.Tree``'s for each batch element by greedily nesting spans.
    The trees use exclusive end indices, which contrasts with how spans are
    represented in the rest of the model.

    Parameters
    ----------
    predictions : ``torch.FloatTensor``, required.
        A tensor of shape ``(batch_size, num_spans, span_label_vocab_size)``
        representing a distribution over the label classes per span.
    all_spans : ``torch.LongTensor``, required.
        A tensor of shape (batch_size, num_spans, 2), representing the span
        indices we scored.
    num_spans : ``torch.LongTensor``, required.
        A tensor of shape (batch_size), representing the lengths of non-padded spans
        in ``enumerated_spans``.
    sentences : ``List[List[str]]``, required.
        A list of tokens in the sentence for each element in the batch.
    pos_tags : ``List[List[str]]``, optional (default = None).
        A list of POS tags for each word in the sentence for each element in the batch.

    Returns
    -------
    A ``List[Tree]`` containing the decoded trees for each element in the batch.
    """
    # Switch to using exclusive end spans.
    exclusive_end_spans = all_spans.clone()
    exclusive_end_spans[:, :, -1] += 1
    no_label_id = vocab.get_token_index(BratDoc.NEG_SPAN_LABEL, namespace)

    trees: List[Tree] = []
    for batch_index, (scored_spans, spans, sentence) in enumerate(
            zip(predictions, exclusive_end_spans, sentences)):
        selected_spans = []
        for prediction, span in zip(scored_spans[:num_spans[batch_index]],
                                    spans[:num_spans[batch_index]]):
            start, end = span
            no_label_prob = prediction[no_label_id]
            label_prob, label_index = torch.max(prediction, -1)
            # Does the span have a label != NO-LABEL or is it the root node?
            # If so, include it in the spans that we consider.
            if int(label_index) != no_label_id or (start == 0 and end == len(sentence)):
                # TODO(Mark): Remove this once pylint sorts out named tuples.
                # https://github.com/PyCQA/pylint/issues/1418
                selected_spans.append(SpanInformation(start=int(start),  # pylint: disable=no-value-for-parameter
                                                      end=int(end),
                                                      label_prob=float(label_prob),
                                                      no_label_prob=float(no_label_prob),
                                                      label_index=int(label_index)))
        # The spans we've selected might overlap, which causes problems when we try
        # to construct the tree as they won't nest properly.
        consistent_spans = SpanConstituencyParser.resolve_overlap_conflicts_greedily(selected_spans)
        spans_to_labels = {(span.start, span.end): vocab.get_token_from_index(span.label_index, namespace)
                           for span in consistent_spans}
        sentence_pos = pos_tags[batch_index] if pos_tags is not None else None
        trees.append(SpanConstituencyParser.construct_tree_from_spans(spans_to_labels, sentence, sentence_pos))
    return trees
def __init__(
    self,
    vocab: Vocabulary,
    text_field_embedder: TextFieldEmbedder,
    citation_text_encoder: Seq2SeqEncoder,
    classifier_feedforward: FeedForward,
    classifier_feedforward_2: FeedForward,
    classifier_feedforward_3: FeedForward,
    initializer: InitializerApplicator = InitializerApplicator(),
    regularizer: Optional[RegularizerApplicator] = None,
    report_auxiliary_metrics: bool = False,
    predict_mode: bool = False,
) -> None:
    """
    Additional Args:
        lexicon_embedder_params: parameters for the lexicon attention model
        use_sparse_lexicon_features: whether to use sparse (onehot) lexicon features
        multilabel: whether the classification is multi-label
        data_format: s2 or jurgens
        report_auxiliary_metrics: report metrics for aux tasks
        predict_mode: predict unlabeled examples
    """
    super(ScaffoldBilstmAttentionClassifier, self).__init__(vocab, regularizer)
    self.text_field_embedder = text_field_embedder
    self.num_classes = self.vocab.get_vocab_size("labels")
    self.num_classes_sections = self.vocab.get_vocab_size("section_labels")
    self.num_classes_cite_worthiness = self.vocab.get_vocab_size("cite_worthiness_labels")
    self.citation_text_encoder = citation_text_encoder
    self.classifier_feedforward = classifier_feedforward
    self.classifier_feedforward_2 = classifier_feedforward_2
    self.classifier_feedforward_3 = classifier_feedforward_3

    self.label_accuracy = CategoricalAccuracy()
    self.label_f1_metrics = {}
    self.label_f1_metrics_sections = {}
    self.label_f1_metrics_cite_worthiness = {}
    # Per-class F1 for the main task and both scaffold tasks.
    for i in range(self.num_classes):
        self.label_f1_metrics[vocab.get_token_from_index(index=i, namespace="labels")] = \
            F1Measure(positive_label=i)
    for i in range(self.num_classes_sections):
        self.label_f1_metrics_sections[vocab.get_token_from_index(index=i, namespace="section_labels")] = \
            F1Measure(positive_label=i)
    for i in range(self.num_classes_cite_worthiness):
        self.label_f1_metrics_cite_worthiness[vocab.get_token_from_index(index=i, namespace="cite_worthiness_labels")] = \
            F1Measure(positive_label=i)
    self.loss = torch.nn.CrossEntropyLoss()

    self.attention_seq2seq = Attention(citation_text_encoder.get_output_dim())
    self.report_auxiliary_metrics = report_auxiliary_metrics
    self.predict_mode = predict_mode
    initializer(self)
def _read_pretrained_embedding_file(embeddings_filename: str,
                                    embedding_dim: int,
                                    vocab: Vocabulary,
                                    namespace: str = "tokens") -> torch.FloatTensor:
    """
    Reads a pre-trained embedding file and generates an Embedding layer that has weights
    initialized to the pre-trained embeddings. The Embedding layer can either be trainable or
    not.

    We use the ``Vocabulary`` to map from the word strings in the embeddings file to the indices
    that we need, and to know which words from the embeddings file we can safely ignore.

    Parameters
    ----------
    embeddings_filename : str, required.
        The path to a file containing pretrained embeddings. The embeddings file is assumed to
        be gzipped and space delimited, e.g. [word] [dim 1] [dim 2] ...
    vocab : Vocabulary, required.
        A Vocabulary object.
    namespace : str, (optional, default=tokens)
        The namespace of the vocabulary to find pretrained embeddings for.
    trainable : bool, (optional, default=True)
        Whether or not the embedding parameters should be optimized.

    Returns
    -------
    A weight matrix with embeddings initialized from the read file. The matrix has shape
    ``(vocab.get_vocab_size(namespace), embedding_dim)``, where the indices of words appearing
    in the pretrained embedding file are initialized to the pretrained embedding value.
    """
    words_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading embeddings from file")
    with gzip.open(cached_path(embeddings_filename), 'rb') as embeddings_file:
        expected_length = embedding_dim
        for line in embeddings_file:
            fields = line.decode('utf-8').strip().split(' ')
            if len(fields) - 1 != embedding_dim and len(fields) - 1 != expected_length:
                # Sometimes there are funny unicode parsing problems that lead to different
                # fields lengths (e.g., a word with a unicode space character that splits
                # into more than one column). We skip those lines. Note that if you have
                # some kind of long header, this could result in all of your lines getting
                # skipped. It's hard to check for that here; you just have to look in the
                # embedding_misses_file and at the model summary to make sure things look
                # like they are supposed to.
                logger.warning("Found line with wrong number of dimensions "
                               "(expected %d or %d, was %d): %s",
                               embedding_dim, expected_length, len(fields) - 1,
                               ' '.join(fields[:10]) + '[...]')
                try:
                    float(fields[1])  # test that the second field is a number
                    # Test that we could take a subset of the line.
                    assert len(fields) - 1 > embedding_dim
                    # If these tests pass, print a warning but use the vector and allow
                    # future vectors with the same length.
                    # NOTE TK TODO REMOVE: in future replace this by allowing user to specify
                    # both the 'actual' and 'desired' input embedding dimension.
                    logger.warning("Will change expected_length to %s and allow this and "
                                   "similar vectors", len(fields) - 1)
                    expected_length = len(fields) - 1
                except (ValueError, IndexError, AssertionError):
                    logger.warning("Skipping...")
                    continue
            word = fields[0]
            if word in words_to_keep:
                vector = numpy.asarray(fields[1:embedding_dim + 1], dtype='float32')
                embeddings[word] = vector

    if not embeddings:
        raise ConfigurationError("No embeddings of correct dimension found; you probably "
                                 "misspecified your embedding_dim parameter, or didn't "
                                 "pre-populate your Vocabulary")

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(embeddings_mean,
                                                                            embeddings_std)
    for i in range(0, vocab_size):
        word = vocab.get_token_from_index(i, namespace)
        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if word in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[word])
        else:
            logger.debug("Word %s was not found in the embedding file. Initialising randomly.",
                         word)
    # The weight matrix is initialized, so we construct and return the actual Embedding.
    return embedding_matrix
def get_pretrained_embedding_layer(embeddings_filename: str,
                                   vocab: Vocabulary,
                                   namespace: str = "tokens",
                                   trainable: bool = True):
    """
    Reads a pre-trained embedding file and generates an Embedding layer that has weights
    initialized to the pre-trained embeddings. The Embedding layer can either be trainable or
    not.

    We use the ``Vocabulary`` to map from the word strings in the embeddings file to the indices
    that we need, and to know which words from the embeddings file we can safely ignore.

    Parameters
    ----------
    embeddings_filename : str, required.
        The path to a file containing pretrained embeddings. The embeddings file is assumed to
        be gzipped and space delimited, e.g. [word] [dim 1] [dim 2] ...
    vocab : Vocabulary, required.
        A Vocabulary object.
    namespace : str, (optional, default=tokens)
        The namespace of the vocabulary to find pretrained embeddings for.
    trainable : bool, (optional, default=True)
        Whether or not the embedding parameters should be optimized.

    Returns
    -------
    An Embedding Module initialised with a weight matrix of shape
    (vocab.get_vocab_size(namespace), pretrained_embedding_dim), where the indices of words
    appearing in the pretrained embedding file are initialized to the pretrained embedding value.
    """
    words_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}
    embedding_dim = None

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading embeddings from file")
    with gzip.open(embeddings_filename, 'rb') as embeddings_file:
        for line in embeddings_file:
            fields = line.decode('utf-8').strip().split(' ')
            if embedding_dim is None:
                embedding_dim = len(fields) - 1
                assert embedding_dim > 1, "Found embedding size of 1; do you have a header?"
            else:
                if len(fields) - 1 != embedding_dim:
                    # Sometimes there are funny unicode parsing problems that lead to different
                    # fields lengths (e.g., a word with a unicode space character that splits
                    # into more than one column). We skip those lines. Note that if you have
                    # some kind of long header, this could result in all of your lines getting
                    # skipped. It's hard to check for that here; you just have to look in the
                    # embedding_misses_file and at the model summary to make sure things look
                    # like they are supposed to.
                    continue
            word = fields[0]
            if word in words_to_keep:
                vector = numpy.asarray(fields[1:], dtype='float32')
                embeddings[word] = vector

    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(0, 1)
    for i in range(0, vocab_size):
        word = vocab.get_token_from_index(i, namespace)
        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if word in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[word])
        else:
            logger.debug("Word %s was not found in the embedding file. Initialising randomly.",
                         word)
    # The weight matrix is initialized, so we construct and return the actual Embedding.
    return Embedding(num_embeddings=vocab_size,
                     embedding_dim=embedding_dim,
                     padding_index=0,
                     weight=embedding_matrix,
                     trainable=trainable)
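# --- Illustrative usage sketch (added, not part of the original source):
# build a tiny gzipped embedding file and load it through the helper. Assumes
# an AllenNLP Vocabulary whose "tokens" namespace contains the words below.
import gzip

with gzip.open("tiny_vectors.txt.gz", "wt", encoding="utf-8") as f:
    f.write("cat 0.1 0.2 0.3\n")
    f.write("dog 0.4 0.5 0.6\n")
embedding = get_pretrained_embedding_layer("tiny_vectors.txt.gz", vocab,
                                           namespace="tokens", trainable=False)
# Rows for "cat" and "dog" now hold the file's vectors; all other rows keep
# their random N(0, 1) initialization.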
def _read_pretrained_word2vec_format_embedding_file(embeddings_filename: str,  # pylint: disable=invalid-name
                                                    embedding_dim: int,
                                                    vocab: Vocabulary,
                                                    namespace: str = "tokens") -> torch.FloatTensor:
    """
    Read from a gzipped-word2vec format file. The embeddings file is assumed to be gzipped and
    space delimited, e.g. [word] [dim 1] [dim 2] ...

    The remainder of the docstring is identical to ``_read_pretrained_embedding_file``.
    """
    words_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading embeddings from file")
    with gzip.open(cached_path(embeddings_filename), 'rb') as embeddings_file:
        for line in embeddings_file:
            fields = line.decode('utf-8').strip().split(' ')
            if len(fields) - 1 != embedding_dim:
                # Sometimes there are funny unicode parsing problems that lead to different
                # fields lengths (e.g., a word with a unicode space character that splits
                # into more than one column). We skip those lines. Note that if you have
                # some kind of long header, this could result in all of your lines getting
                # skipped. It's hard to check for that here; you just have to look in the
                # embedding_misses_file and at the model summary to make sure things look
                # like they are supposed to.
                logger.warning("Found line with wrong number of dimensions (expected %d, was %d): %s",
                               embedding_dim, len(fields) - 1, line)
                continue
            word = fields[0]
            if word in words_to_keep:
                vector = numpy.asarray(fields[1:], dtype='float32')
                embeddings[word] = vector

    if not embeddings:
        raise ConfigurationError("No embeddings of correct dimension found; you probably "
                                 "misspecified your embedding_dim parameter, or didn't "
                                 "pre-populate your Vocabulary")

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(embeddings_mean,
                                                                            embeddings_std)
    for i in range(0, vocab_size):
        word = vocab.get_token_from_index(i, namespace)
        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if word in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[word])
        else:
            logger.debug("Word %s was not found in the embedding file. Initialising randomly.",
                         word)
    # The weight matrix is initialized, so we construct and return the actual Embedding.
    return embedding_matrix
def _read_pretrained_word2vec_format_embedding_file(embeddings_filename: str,  # pylint: disable=invalid-name
                                                    embedding_dim: int,
                                                    vocab: Vocabulary,
                                                    namespace: str = "tokens") -> torch.FloatTensor:
    """
    Read from a gzipped-word2vec format file. The embeddings file is assumed to be gzipped and
    space delimited, e.g. [word] [dim 1] [dim 2] ...

    The remainder of the docstring is identical to ``_read_pretrained_embedding_file``.
    """
    words_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading embeddings from file")
    with gzip.open(cached_path(embeddings_filename), 'rb') as embeddings_file:
        for line in embeddings_file:
            fields = line.decode('utf-8').rstrip().split(' ')
            if len(fields) - 1 != embedding_dim:
                # Sometimes there are funny unicode parsing problems that lead to different
                # fields lengths (e.g., a word with a unicode space character that splits
                # into more than one column). We skip those lines. Note that if you have
                # some kind of long header, this could result in all of your lines getting
                # skipped. It's hard to check for that here; you just have to look in the
                # embedding_misses_file and at the model summary to make sure things look
                # like they are supposed to.
                logger.warning("Found line with wrong number of dimensions (expected %d, was %d): %s",
                               embedding_dim, len(fields) - 1, line)
                continue
            word = fields[0]
            if word in words_to_keep:
                vector = numpy.asarray(fields[1:], dtype='float32')
                embeddings[word] = vector

    if not embeddings:
        raise ConfigurationError("No embeddings of correct dimension found; you probably "
                                 "misspecified your embedding_dim parameter, or didn't "
                                 "pre-populate your Vocabulary")

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(embeddings_mean,
                                                                            embeddings_std)
    for i in range(0, vocab_size):
        word = vocab.get_token_from_index(i, namespace)
        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if word in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[word])
        else:
            logger.debug("Word %s was not found in the embedding file. Initialising randomly.",
                         word)
    # The weight matrix is initialized, so we construct and return the actual Embedding.
    return embedding_matrix
def _read_pretrained_embedding_file(embeddings_filename: str,
                                    embedding_dim: int,
                                    vocab: Vocabulary,
                                    namespace: str = "tokens") -> torch.FloatTensor:
    """
    Reads a pre-trained embedding file and generates an Embedding layer that has weights
    initialized to the pre-trained embeddings. The Embedding layer can either be trainable or
    not.

    We use the ``Vocabulary`` to map from the word strings in the embeddings file to the indices
    that we need, and to know which words from the embeddings file we can safely ignore.

    Parameters
    ----------
    embeddings_filename : str, required.
        The path to a file containing pretrained embeddings. The embeddings file is assumed to
        be gzipped and space delimited, e.g. [word] [dim 1] [dim 2] ...
    vocab : Vocabulary, required.
        A Vocabulary object.
    namespace : str, (optional, default=tokens)
        The namespace of the vocabulary to find pretrained embeddings for.
    trainable : bool, (optional, default=True)
        Whether or not the embedding parameters should be optimized.

    Returns
    -------
    A weight matrix with embeddings initialized from the read file. The matrix has shape
    ``(vocab.get_vocab_size(namespace), embedding_dim)``, where the indices of words appearing
    in the pretrained embedding file are initialized to the pretrained embedding value.
    """
    words_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}
    words_found = set()

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading embeddings from file; {}".format(len(words_to_keep)))
    with gzip.open(embeddings_filename, 'rb') as embeddings_file:
        for line in embeddings_file:
            fields = line.decode('utf-8').strip().split(' ')
            if len(fields) - 1 != embedding_dim:
                # Sometimes there are funny unicode parsing problems that lead to different
                # fields lengths (e.g., a word with a unicode space character that splits
                # into more than one column). We skip those lines. Note that if you have
                # some kind of long header, this could result in all of your lines getting
                # skipped. It's hard to check for that here; you just have to look in the
                # embedding_misses_file and at the model summary to make sure things look
                # like they are supposed to.
                logger.warning("Found line with wrong number of dimensions (expected %d, was %d): %s",
                               embedding_dim, len(fields) - 1, line)
                continue
            word = fields[0]
            if word in words_to_keep:
                words_found.add(word)
                vector = numpy.asarray(fields[1:], dtype='float32')
                embeddings[word] = vector

    notfound = words_to_keep.difference(words_found)
    logger.info("Emb load count: {}; Emb not found count: {}".format(
        len(words_found), len(notfound)))
    # Dump the words that had no pretrained vector for later inspection.
    with open("/home/kz918/bpe/eval/bidaf/not_found.txt", 'w', encoding='utf-8') as f:
        for word in notfound:
            f.write(word)
            f.write('\n')
    # assert len(notfound) < 10

    if not embeddings:
        raise ConfigurationError("No embeddings of correct dimension found; you probably "
                                 "misspecified your embedding_dim parameter, or didn't "
                                 "pre-populate your Vocabulary")

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(embeddings_mean,
                                                                            embeddings_std)
    for i in range(0, vocab_size):
        word = vocab.get_token_from_index(i, namespace)
        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if word in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[word])
        else:
            logger.debug("Word %s was not found in the embedding file. Initialising randomly.",
                         word)
    # The weight matrix is initialized, so we construct and return the actual Embedding.
    return embedding_matrix