def _create_weight_matrix(multilang_embeddings,
                          embedding_dim: int,
                          vocab: Vocabulary,
                          namespace: str = "tokens") -> torch.FloatTensor:
    """
    Build an embedding weight matrix from already-loaded pre-trained vectors.

    Every row is first drawn from a normal distribution whose mean and standard
    deviation are computed over all values stored in ``multilang_embeddings``.
    Rows whose token has a pre-trained vector are then overwritten with it.

    Parameters
    ----------
    multilang_embeddings : mapping of token -> vector, required.
        Used through ``.values()``, ``in`` and ``[token]``.
    embedding_dim : int, required.
        Number of columns of the returned matrix.
    vocab : Vocabulary, required.
        Supplies the vocabulary size and the index -> token mapping.
    namespace : str, (optional, default=tokens)
        The vocabulary namespace to build the matrix for.

    Returns
    -------
    A tensor of shape ``(vocab.get_vocab_size(namespace), embedding_dim)``.
    """
    pretrained = multilang_embeddings
    num_rows = vocab.get_vocab_size(namespace)

    # Statistics of the known vectors drive the random initialisation.
    stacked = numpy.asarray(list(pretrained.values()))
    mean = float(numpy.mean(stacked))
    std = float(numpy.std(stacked))

    logger.info("Initializing pre-trained embedding layer")
    weights = torch.FloatTensor(num_rows, embedding_dim).normal_(mean, std)

    id_to_token = vocab.get_index_to_token_vocabulary(namespace)
    hits = 0
    for row in range(num_rows):
        token = id_to_token[row]
        if token not in pretrained:
            # No pre-trained vector: the row keeps its random initialisation.
            logger.debug("Token %s was not found in the embedding file. Initialising randomly.", token)
            continue
        weights[row] = torch.FloatTensor(pretrained[token])
        hits += 1

    logger.info("Pretrained embeddings were found for %d out of %d tokens", hits, num_rows)
    return weights
def __init__(self, embeddings_result_file, vocab: Vocabulary, config: Params):
    """
    Gets a sentence embedding by averaging w2v word representations and an image
    embedding from a pretrained convnet, combines them by a dot-product, then
    applies logistic regression.

    Parameters
    ----------
    embeddings_result_file : path to a pickle file whose content is wrapped in
        ``SavedEmbeddings`` and used to fill the word embedding table.
    vocab : Vocabulary, the ``"tokens"`` namespace defines the embedding rows.
    config : Params, must provide ``emb_size``, ``hidden_size``,
        ``image_emb_size``, ``n_classes`` and ``dropout_rate``.
    """
    super().__init__()

    # Hyper-parameters (popped in this order from the config).
    self.emb_size = config.pop("emb_size")
    self.vocab_size = vocab.get_vocab_size("tokens")
    self.hidden_size = config.pop("hidden_size")
    self.image_emb_size = config.pop("image_emb_size")
    self.n_classes = config.pop("n_classes")

    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use embedding files from a trusted source.
    with open(embeddings_result_file, "rb") as pickled:
        saved_embs = SavedEmbeddings(pickle.load(pickled))

    self.embs = Embedding(self.vocab_size,
                          embedding_dim=self.emb_size,
                          padding_idx=0)

    # Row 0 (the padding index) stays all-zero; every other row is looked up.
    pretrained = numpy.zeros((self.vocab_size, self.emb_size),
                             dtype=numpy.float32)
    saved_embs.return_zero_for_oov = False
    index_to_word = vocab.get_index_to_token_vocabulary("tokens")
    for idx, word in tqdm(index_to_word.items()):
        if idx == 0:
            continue
        pretrained[idx] = saved_embs.get(word)
    self.embs.weight.data = torch.tensor(pretrained)

    self.question_to_hidden = Linear(self.emb_size, self.hidden_size)
    self.image_to_hidden = Linear(self.image_emb_size, self.hidden_size)
    self.hidden_to_hidden = Linear(self.hidden_size, self.hidden_size)
    self.scores_layer = Linear(self.hidden_size, self.n_classes)
    self.lrelu = LeakyReLU()
    self.dropout = Dropout(p=config.pop("dropout_rate"))
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             attend_feedforward: FeedForward,
             similarity_function: SimilarityFunction,
             compare_feedforward: FeedForward,
             aggregate_feedforward: FeedForward,
             premise_encoder: Optional[Seq2SeqEncoder] = None,
             hypothesis_encoder: Optional[Seq2SeqEncoder] = None,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    """
    Wire up the embedder, the attend / compare / aggregate feed-forward
    networks and the optional sentence encoders.

    ``attend_feedforward`` and ``compare_feedforward`` are wrapped in
    ``TimeDistributed``. When no ``hypothesis_encoder`` is given the
    ``premise_encoder`` is reused for the hypothesis. The constructor checks
    that the embedder output matches the attend network input and that the
    aggregate network output matches the size of the ``"labels"`` namespace.
    """
    super(BiaowenMingxiClassifier, self).__init__(vocab, regularizer)

    # Sub-modules.
    self._text_field_embedder = text_field_embedder
    self._attend_feedforward = TimeDistributed(attend_feedforward)
    self._matrix_attention = LegacyMatrixAttention(similarity_function)
    self._compare_feedforward = TimeDistributed(compare_feedforward)
    self._aggregate_feedforward = aggregate_feedforward

    # Fall back to sharing the premise encoder when none is given.
    self._premise_encoder = premise_encoder
    self._hypothesis_encoder = hypothesis_encoder or premise_encoder

    self._num_labels = vocab.get_vocab_size(namespace="labels")
    label_vocabulary = vocab.get_index_to_token_vocabulary(namespace="labels")
    print(label_vocabulary)

    # Fail early on mis-configured dimensions.
    check_dimensions_match(text_field_embedder.get_output_dim(),
                           attend_feedforward.get_input_dim(),
                           "text field embedding dim",
                           "attend feedforward input dim")
    check_dimensions_match(aggregate_feedforward.get_output_dim(),
                           self._num_labels,
                           "final output dimension",
                           "number of labels")

    self._accuracy = CategoricalAccuracy()
    self._loss = torch.nn.CrossEntropyLoss()

    initializer(self)
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             hidden_size: int = 768,
             size_embedding: int = 25,
             dropout: float = 0.1,
             rel_filter_threshold: float = 0.5,
             max_pairs: int = 1000) -> None:
    """
    Set up the entity and relation classifiers on top of the transformer that
    lives inside ``text_field_embedder``.

    Parameters
    ----------
    vocab : Vocabulary with ``"rel_labels"`` and ``"ner_labels"`` namespaces.
    text_field_embedder : embedder whose ``token_embedder_tokens`` holds a
        ``_matched_embedder`` exposing ``transformer_model``.
    hidden_size : int, width used to size the classifier inputs.
    size_embedding : int, dimension of the size embedding table.
    dropout : float, dropout probability.
    rel_filter_threshold : float, threshold passed to the relation F1 metric.
    max_pairs : int, stored as ``self._max_pairs``.
    """
    super(Spart, self).__init__(vocab)
    self._text_field_embedder = text_field_embedder
    self._rel_filter_threshold = rel_filter_threshold
    self._relation_types = vocab.get_vocab_size("rel_labels")
    self._entity_types = vocab.get_vocab_size("ner_labels")
    # TODO: only valid for
    # https://huggingface.co/german-nlp-group/electra-base-german-uncased/blob/main/vocab.txt
    self._cls_token = 2
    self._max_pairs = max_pairs

    # Direct handle on the transformer wrapped by the embedder.
    self._bert = self._text_field_embedder.token_embedder_tokens._modules[
        "_matched_embedder"].transformer_model

    self.rel_classifier = nn.Linear(hidden_size * 3 + size_embedding * 2,
                                    self._relation_types)
    self.entity_classifier = nn.Linear(hidden_size * 2 + size_embedding,
                                       self._entity_types)
    self.size_embeddings = nn.Embedding(100, size_embedding)
    self.dropout = nn.Dropout(dropout)

    self._rel_loss = nn.BCEWithLogitsLoss(reduction='none')
    self._ents_loss = nn.CrossEntropyLoss(
        reduction='none')  # TODO BCEWithLogitsLoss

    # Label *indices* of the NER namespace, with index 0 left out of the
    # entity F1 computation.
    ner_labels = list(vocab.get_index_to_token_vocabulary("ner_labels"))
    ner_labels.remove(0)

    self._f1_relation = FBetaMultiLabelMeasure(
        average="micro", threshold=self._rel_filter_threshold)
    self._f1_entities = FBetaMeasure(average="micro", labels=ner_labels)
def get_index_to_labels_dictionary(vocab: Vocabulary) -> Dict[int, str]:
    """Returns the mapping from label `int` ids to label strings

    Parameters
    ----------
    vocab: `allennlp.data.Vocabulary`

    Returns
    -------
    labels: `Dict[int, str]`
        The index-to-token dictionary of the labels namespace
    """
    index_to_label = vocab.get_index_to_token_vocabulary(LABELS_NAMESPACE)
    return index_to_label
def get_slices_if_not_provided(vocab: allen_data.Vocabulary):
    """Return ``vocab.slices``, computing and caching it on first use.

    If the vocabulary has a ``"feats_labels"`` namespace, an ``<name>=None``
    token is added for every feature other than ``"_"`` and ``"__PAD__"``.
    The indices of that namespace are then grouped by feature name (the part
    before ``=``). The grouping is stored on ``vocab.slices`` and returned.
    """
    if hasattr(vocab, "slices"):
        return vocab.slices

    if "feats_labels" in vocab.get_namespaces():
        # Iterate over a snapshot: tokens are added to the namespace below.
        snapshot = dict(vocab.get_index_to_token_vocabulary("feats_labels"))
        for feature in snapshot.values():
            if feature in ["_", "__PAD__"]:
                continue
            feature_name = feature.split("=")[0]
            vocab.add_token_to_namespace(feature_name + "=None", "feats_labels")

        # There are 2 types of features: with (Case=Acc) or without assignment
        # (None). Group their indices by name (before the assignment sign).
        grouped = {}
        index_to_feature = vocab.get_index_to_token_vocabulary("feats_labels")
        for index, feature in index_to_feature.items():
            grouped.setdefault(feature.split("=")[0], []).append(index)

        vocab.slices = grouped
        return vocab.slices
def _read_embeddings_from_jsonl(embeddings_filename: str,
                                embedding_dim: int,
                                vocab: Vocabulary,
                                namespace: str = 'tokens') -> torch.FloatTensor:
    """
    Read vectors from a JSON-lines file and build an embedding weight matrix.

    Each record must provide ``paper_id`` (used as the token) and
    ``graph_vector`` (the vector). Records whose vector length differs from
    ``embedding_dim`` are logged and skipped.

    Parameters
    ----------
    embeddings_filename : str, path of the JSON-lines file.
    embedding_dim : int, expected vector length / number of matrix columns.
    vocab : Vocabulary, supplies size and index -> token mapping.
    namespace : str, (optional, default=tokens) vocabulary namespace to use.

    Returns
    -------
    A tensor of shape ``(vocab.get_vocab_size(namespace), embedding_dim)``.
    Rows are sampled from a normal distribution with the mean / std of all
    vectors read, then overwritten where the token has a vector in the file.
    """
    vocab_size = vocab.get_vocab_size(namespace)

    embeddings = {}
    with jsonlines.open(embeddings_filename) as reader:
        for instance in reader:
            token = instance['paper_id']
            vector = numpy.asarray(instance['graph_vector'], dtype='float32')
            if len(vector) != embedding_dim:
                logger.warning("Found instance with wrong number of dimensions (expected: %d; actual %d): %s",
                               embedding_dim, len(vector), token)
                continue
            embeddings[token] = vector

    # NOTE(review): if no record has the right dimension, mean/std below are
    # NaN and the whole matrix becomes NaN; callers should ensure a non-empty file.
    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))

    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(embeddings_mean,
                                                                            embeddings_std)

    # Look the mapping up once instead of once per row.
    index_to_token = vocab.get_index_to_token_vocabulary(namespace)
    num_tokens_found = 0
    for i in range(vocab_size):
        token = index_to_token[i]
        if token in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[token])
            num_tokens_found += 1
        else:
            logger.debug("Token %s was not found in the embedding file. Initializing randomly.", token)

    logger.info("pretrained embeddings were found for %d out of %d tokens", num_tokens_found, vocab_size)
    return embedding_matrix
def _read_embeddings_from_bin_file(
        file_uri: str,
        embedding_dim: int,
        vocab: Vocabulary,
        namespace: str = "tokens") -> torch.FloatTensor:
    """
    Reads from a bin formatted (binary word2vec) file using gensim and builds
    an embedding weight matrix for ``namespace``.

    Parameters
    ----------
    file_uri : str, location handed to ``KeyedVectors.load_word2vec_format``.
    embedding_dim : int, number of columns of the returned matrix.
    vocab : Vocabulary, supplies size and index -> token mapping.
    namespace : str, (optional, default=tokens) vocabulary namespace to use.

    Returns
    -------
    A tensor of shape ``(vocab.get_vocab_size(namespace), embedding_dim)``.
    Rows are sampled from a normal distribution with the mean / std of all
    vectors in the file, then overwritten where the token is in the model.
    """
    # The size must come from the same namespace as the index -> token
    # mapping below; a hard-coded "tokens" namespace would break any other one.
    vocab_size = vocab.get_vocab_size(namespace)

    import gensim.models
    model = gensim.models.KeyedVectors.load_word2vec_format(
        file_uri, binary=True, unicode_errors="ignore")

    # NOTE(review): ``model.vocab`` looks like the pre-4.0 gensim API - confirm
    # the pinned gensim version before upgrading.
    words = sorted(model.vocab, key=lambda w: model.vocab[w].index)
    # Set for O(1) membership tests in the per-token loop below.
    known_words = set(words)

    all_embeddings = numpy.asarray([model[w] for w in words])
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))

    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(
        embeddings_mean, embeddings_std)

    num_tokens_found = 0
    index_to_token = vocab.get_index_to_token_vocabulary(namespace)
    for i in range(vocab_size):
        token = index_to_token[i]
        # If we don't have a pre-trained vector for this word, we'll just leave
        # this row alone, so the word has a random initialization.
        if token in known_words:
            embedding_matrix[i] = torch.FloatTensor(model[token])
            num_tokens_found += 1
        else:
            logger.debug(
                "Token %s was not found in the embedding file. Initialising randomly.",
                token)

    logger.info("Pretrained embeddings were found for %d out of %d tokens",
                num_tokens_found, vocab_size)
    return embedding_matrix
def _read_embeddings_from_text_file(file_uri: str,
                                    embedding_dim: int,
                                    vocab: Vocabulary,
                                    namespace: str = "tokens") -> dict:
    """
    Read pre-trained word vectors from a text file opened through
    ``EmbeddingsTextFile``.

    Each line is expected to hold space-separated fields::

        [word] [dim 1] [dim 2] ...

    Only words present in ``namespace`` of ``vocab`` are kept. A line for such
    a word whose number of values differs from ``embedding_dim`` (more *or*
    fewer) is logged with a warning and skipped.

    Parameters
    ----------
    file_uri : str, location handed to ``EmbeddingsTextFile``.
    embedding_dim : int, required number of values per line.
    vocab : Vocabulary, supplies the set of tokens worth keeping.
    namespace : str, (optional, default=tokens) vocabulary namespace to use.

    Returns
    -------
    A dictionary mapping each kept token to its ``float32`` numpy vector.
    Note that this is the raw mapping, not a weight matrix.

    Raises
    ------
    ConfigurationError
        If no vector of the right dimension was found for any vocabulary token.
    """
    tokens_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading pretrained embeddings from file")

    with EmbeddingsTextFile(file_uri) as embeddings_file:
        for line in Tqdm.tqdm(embeddings_file):
            # Cheap check on the first field before splitting the whole line.
            token = line.split(' ', 1)[0]
            if token in tokens_to_keep:
                fields = line.rstrip().split(' ')
                if len(fields) - 1 != embedding_dim:
                    # Sometimes there are funny unicode parsing problems that lead to different
                    # fields lengths (e.g., a word with a unicode space character that splits
                    # into more than one column).  We skip those lines.  Note that if you have
                    # some kind of long header, this could result in all of your lines getting
                    # skipped.  It's hard to check for that here; you just have to look in the
                    # embedding_misses_file and at the model summary to make sure things look
                    # like they are supposed to.
                    logger.warning("Found line with wrong number of dimensions (expected: %d; actual: %d): %s",
                                   embedding_dim, len(fields) - 1, line)
                    continue

                vector = numpy.asarray(fields[1:], dtype='float32')
                embeddings[token] = vector

    if not embeddings:
        raise ConfigurationError("No embeddings of correct dimension found; you probably "
                                 "misspecified your embedding_dim parameter, or didn't "
                                 "pre-populate your Vocabulary")

    return embeddings
def __init__(self,
             vocab: Vocabulary,
             simple_classifier: SimpleClassifier,
             alpha: float = 0.05,
             target: str = 'label',
             freeze_topic: bool = False):
    """
    Wrap ``simple_classifier`` and add a linear topic classifier on top of its
    encoder output.

    Parameters
    ----------
    vocab : Vocabulary with ``"topic_labels"`` and ``"labels"`` namespaces.
    simple_classifier : the wrapped classifier; its ``encoder`` output
        dimension sizes the topic head and its own vocabulary provides the
        label -> index mapping stored in ``label_to_dif_index``.
    alpha : float, stored as ``self.alpha``.
    target : str, stored as ``self.target``.
    freeze_topic : bool, stored as ``self.freeze_topic``.
    """
    super().__init__(vocab)
    self.simple_classifier = simple_classifier

    # Topic head: encoder output -> one logit per topic label.
    topic_count = vocab.get_vocab_size("topic_labels")
    encoder_dim = self.simple_classifier.encoder.get_output_dim()
    self.topic_classifier = torch.nn.Linear(encoder_dim, topic_count)

    self.alpha = alpha

    # This model's index -> label, and the wrapped model's label -> index.
    self.index_to_label = vocab.get_index_to_token_vocabulary('labels')
    wrapped_vocab = simple_classifier.vocab
    self.label_to_dif_index = wrapped_vocab.get_token_to_index_vocabulary(
        'labels')

    self.target = target
    self.freeze_topic = freeze_topic
    self.topic_accuracy = CategoricalAccuracy()
def set_labels(vocab: Vocabulary, new_labels: List[str]):
    """Replaces the labels in the vocabulary with a given list of label strings

    Both lookup tables of the labels namespace (token -> index and
    index -> token) are emptied in place before the new labels are added
    through `extend_labels`.

    Parameters
    ----------
    vocab: `allennlp.data.Vocabulary`
    new_labels: `List[str]`
        The label strings to add to the vocabulary
    """
    label_lookups = (
        vocab.get_token_to_index_vocabulary(LABELS_NAMESPACE),
        vocab.get_index_to_token_vocabulary(LABELS_NAMESPACE),
    )
    for lookup in label_lookups:
        # Snapshot the keys first: the mapping is mutated while we walk it.
        for key in list(lookup.keys()):
            del lookup[key]

    extend_labels(vocab, new_labels)
def from_params(cls, vocab: Vocabulary, params: Params) -> 'ConstrainedConditionalModule':
    """
    Build a ``ConstrainedConditionalModule`` from ``params``.

    Recognised keys: ``hard_constraints`` (list of tags), ``soft_constraints``
    (tag -> penalty), ``label_namespace``, ``sentence_penalty_map`` (a single
    tag -> penalty entry), ``constrain_crf_decoding`` and ``label_encoding``.
    Each constraint tag is resolved to the indices of all labels in
    ``label_namespace`` matching the regex ``^.*-<tag>``.

    Raises
    ------
    ConfigurationError
        If ``constrain_crf_decoding`` is set without a ``label_encoding``.
    """
    hard_constraints = params.pop("hard_constraints", [])
    soft_constraints = params.pop("soft_constraints", {})
    label_namespace = params.pop("label_namespace", "labels")
    sentence_penalty_map_dict = params.pop("sentence_penalty_map", None)
    constrain_crf_decoding = params.pop("constrain_crf_decoding", False)
    label_encoding = params.pop("label_encoding", None)

    def label_indices_for(tag) -> List[int]:
        # Indices of every label of the namespace ending in "-<tag>...".
        pattern = rf"^.*-{tag}"
        label_to_index = vocab.get_token_to_index_vocabulary(label_namespace)
        return [index for label, index in label_to_index.items()
                if re.match(pattern, label)]

    sentence_penalty_map = None
    if sentence_penalty_map_dict:
        assert len(sentence_penalty_map_dict) == 1, "multiple sentence constraints not supported"
        sentence_tag, sentence_penalty = next(iter(sentence_penalty_map_dict.items()))
        sentence_penalty_map = (vocab.get_token_index(sentence_tag, label_namespace),
                                sentence_penalty)

    hard_constraints_to_indices: Dict[str, List[int]] = {}
    for hard_tag in hard_constraints:
        hard_constraints_to_indices[hard_tag] = label_indices_for(hard_tag)

    soft_constraints = soft_constraints or {}
    soft_constraints_to_indices: Dict[str, Tuple[List[int], float]] = {}
    for soft_tag, soft_penalty in soft_constraints.items():
        soft_constraints_to_indices[soft_tag] = (label_indices_for(soft_tag), soft_penalty)

    num_tags = vocab.get_vocab_size(label_namespace)

    constraints = None
    if constrain_crf_decoding:
        if not label_encoding:
            raise ConfigurationError("constrain_crf_decoding is True, but "
                                     "no label_encoding was specified.")
        index_to_label = vocab.get_index_to_token_vocabulary(label_namespace)
        constraints = allowed_transitions(label_encoding, index_to_label)

    params.assert_empty(cls.__name__)
    return ConstrainedConditionalModule(num_tags, constraints,
                                        hard_constraints_to_indices,
                                        soft_constraints_to_indices,
                                        sentence_penalty_map)
def __init__(self,
             vocab: Vocabulary,
             verbose_metrics: bool,
             embedding_dim: int = 128,
             dropout: float = 0.2,
             neg_samples: int = 10,
             cuda_device: int = 7,
             pretrained_file: Optional[str] = None,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    """
    Create the embedding table and pre-compute negative-sampling probabilities
    for the ``"source_token"`` namespace.

    Parameters
    ----------
    vocab : Vocabulary, must have retained token counts for ``"source_token"``
        (read from ``vocab._retained_counter``).
    verbose_metrics : bool, stored as ``self.verbose_metrics``.
    embedding_dim : int, dimension of the embedding table.
    dropout : float, dropout probability.
    neg_samples : int, stored as ``self.neg_samples``.
    cuda_device : int, stored as ``self.cuda_device``.
    pretrained_file : str, optional, forwarded to ``Embedding``.
    initializer : InitializerApplicator applied to the finished model.
    regularizer : RegularizerApplicator, optional, forwarded to the base class.
    """
    super(NegativeSamplingModel, self).__init__(vocab, regularizer)

    self.embedder = Embedding(
        num_embeddings=vocab.get_vocab_size('source_token'),
        embedding_dim=embedding_dim,
        pretrained_file=pretrained_file)
    self.neg_samples = neg_samples
    self.cuda_device = cuda_device
    self.dropout = torch.nn.Dropout(dropout)
    self.verbose_metrics = verbose_metrics

    # Compute negative sampling probabilities
    # Based on https://github.com/mhagiwara/realworldnlp
    # Each token's relative frequency is raised to the power 0.75 and the
    # results are re-normalised by their sum.
    token_probs = {}
    token_counts = vocab._retained_counter['source_token']
    total_counts = float(sum(token_counts.values()))
    total_probs = 0.
    for token, counts in token_counts.items():
        adjusted_freq = math.pow(counts / total_counts, 0.75)
        token_probs[token] = adjusted_freq
        total_probs += adjusted_freq

    # Zero-initialised so no entry can ever hold uninitialised memory; tokens
    # without a retained count get probability 0.
    self.neg_sample_probs = np.zeros(
        (vocab.get_vocab_size('source_token'), ))
    for idx, token in vocab.get_index_to_token_vocabulary(
            'source_token').items():
        self.neg_sample_probs[idx] = token_probs.get(token, 0) / total_probs

    initializer(self)
def __init__(
    self,
    vocab: Vocabulary,
    transformer_model: str = "roberta-large",
    num_labels: Optional[int] = None,
    label_namespace: str = "labels",
    override_weights_file: Optional[str] = None,
    **kwargs,
) -> None:
    """
    Assemble a transformer-based classifier: pretrained embeddings,
    transformer stack and pooler, followed by dropout and a linear layer.

    Parameters
    ----------
    vocab : Vocabulary, its ``label_namespace`` provides the label tokens.
    transformer_model : str, model name handed to ``from_pretrained_module``.
    num_labels : int, optional; defaults to the size of ``label_namespace``.
    label_namespace : str, namespace holding the labels.
    override_weights_file : str, optional, passed as ``weights_path``.
    **kwargs : forwarded to the base class constructor.
    """
    super().__init__(vocab, **kwargs)

    # The same keyword arguments load all three pretrained pieces.
    pretrained_args = {
        "model_name": transformer_model,
        "weights_path": override_weights_file,
    }
    self.embeddings = TransformerEmbeddings.from_pretrained_module(
        **pretrained_args)
    self.transformer_stack = TransformerStack.from_pretrained_module(
        **pretrained_args)
    self.pooler = TransformerPooler.from_pretrained_module(
        **pretrained_args)
    self.pooler_dropout = Dropout(p=0.1)

    self.label_tokens = vocab.get_index_to_token_vocabulary(
        label_namespace)
    output_size = len(self.label_tokens) if num_labels is None else num_labels

    # Classification head: normal(0, 0.02) weights and a zero bias.
    self.linear_layer = torch.nn.Linear(self.pooler.get_output_dim(),
                                        output_size)
    self.linear_layer.weight.data.normal_(mean=0.0, std=0.02)
    self.linear_layer.bias.data.zero_()

    from allennlp.training.metrics import CategoricalAccuracy, FBetaMeasure

    self.loss = torch.nn.CrossEntropyLoss()
    self.acc = CategoricalAccuracy()
    self.f1 = FBetaMeasure()
def tensorize(self, vocab: Vocabulary):
    """
    Convert the alias lookups into lists of tensors, indexed by vocabulary id.

    This needs the finished vocabulary, so it has to run **after** vocabulary
    creation; in the current approach it is called from the model's forward
    pass. The work is expensive, so every call after the first returns
    immediately.

    Three lookups are filled:

    * ``_global_id_lookup``: per raw entity id, a zero-padded
      ``(num_aliases, max_alias_length)`` int64 tensor of ``'tokens'`` indices.
    * ``_local_id_lookup``: per raw entity id, a tensor built from
      ``_id_array_lookup``.
    * ``_token_id_to_entity_id_lookup``: per token id, an int64 tensor of
      ``'entity_ids'`` indices of the entities the token may refer to.

    Positions without an entry hold ``None``.
    """
    # This operation is expensive, only do it once.
    if self.is_tensorized:
        return

    logger.debug('Tensorizing AliasDatabase')

    raw_entity_vocab = vocab.get_index_to_token_vocabulary('raw_entity_ids')
    for entity_idx in range(len(raw_entity_vocab)):  # pylint: disable=C0200
        entity = raw_entity_vocab[entity_idx]
        try:
            aliases = self._token_lookup[entity]
        except KeyError:
            # Non-entity tokens (e.g. padding and null) only get a blank
            # placeholder - these should not be encountered during training.
            self._global_id_lookup.append(None)
            self._local_id_lookup.append(None)
            continue

        # Alias token indices from the global vocabulary, zero-padded to the
        # longest alias of this entity.
        longest_alias = max(len(alias) for alias in aliases)
        global_ids = torch.zeros(len(aliases),
                                 longest_alias,
                                 dtype=torch.int64,
                                 requires_grad=False)
        for row, alias in enumerate(aliases):
            for col, token in enumerate(alias):
                # WARNING: Extremely janky cast to string
                global_ids[row, col] = vocab.get_token_index(str(token), 'tokens')
        self._global_id_lookup.append(global_ids)

        # Local alias token indices, converted from array to tensor.
        local_ids = torch.tensor(self._id_array_lookup[entity],
                                 requires_grad=False)  # pylint: disable=not-callable
        self._local_id_lookup.append(local_ids)

    # Build the tensorized token -> potential entities lookup.
    # NOTE: Initial approach will be to store just the necessary info to build one-hot vectors
    # on the fly since storing them will probably be way too expensive.
    token_vocab = vocab.get_index_to_token_vocabulary('tokens')
    for token_idx in range(len(token_vocab)):
        token = token_vocab[token_idx]
        try:
            candidates = self._token_to_entity_lookup[token]
        except KeyError:
            self._token_id_to_entity_id_lookup.append(None)
            continue
        candidate_ids = torch.tensor(
            [vocab.get_token_index(str(candidate), 'entity_ids')
             for candidate in candidates],
            dtype=torch.int64,
            requires_grad=False)
        self._token_id_to_entity_id_lookup.append(candidate_ids)

    # Needed to get one-hot vector length
    self._num_entities = vocab.get_vocab_size('entity_ids')

    self.is_tensorized = True
    logger.debug('Done tensorizing AliasDatabase')
def _read_pretrained_embedding_file(
        embeddings_filename: str,
        embedding_dim: int,
        vocab: Vocabulary,
        namespace: str = "tokens") -> torch.FloatTensor:
    """
    Reads a gzipped pre-trained embedding file and returns a weight matrix
    initialized from it.

    We use the ``Vocabulary`` to map from the word strings in the embeddings
    file to the indices that we need, and to know which words from the
    embeddings file we can safely ignore.

    Lines may carry *more* values than ``embedding_dim``: the first time such a
    line is met (and its second field parses as a number) its length becomes
    an additional accepted length, and only the first ``embedding_dim`` values
    of such lines are used. Any other line of unexpected length is skipped.

    Parameters
    ----------
    embeddings_filename : str, required.
        The path to a file containing pretrained embeddings. The embeddings
        file is assumed to be gzipped and space delimited, e.g.
        [word] [dim 1] [dim 2] ...
    embedding_dim : int, required.
        Number of values kept per word / number of matrix columns.
    vocab : Vocabulary, required.
        A Vocabulary object.
    namespace : str, (optional, default=tokens)
        The namespace of the vocabulary to find pretrained embeddings for.

    Returns
    -------
    A weight matrix with embeddings initialized from the read file.  The matrix
    has shape ``(vocab.get_vocab_size(namespace), embedding_dim)``, where the
    indices of words appearing in the pretrained embedding file are initialized
    to the pretrained embedding value.

    Raises
    ------
    ConfigurationError
        If no usable vector was found for any vocabulary word.
    """
    words_to_keep = set(
        vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading embeddings from file")
    with gzip.open(cached_path(embeddings_filename), 'rb') as embeddings_file:
        expected_length = embedding_dim
        for line in embeddings_file:
            fields = line.decode('utf-8').strip().split(' ')
            num_values = len(fields) - 1
            if num_values != embedding_dim and num_values != expected_length:
                # Sometimes there are funny unicode parsing problems that lead to different
                # fields lengths (e.g., a word with a unicode space character that splits
                # into more than one column).  We skip those lines.  Note that if you have
                # some kind of long header, this could result in all of your lines getting
                # skipped.  It's hard to check for that here; you just have to look in the
                # embedding_misses_file and at the model summary to make sure things look
                # like they are supposed to.
                logger.warning(
                    "Found line with wrong number of dimensions "
                    "(expected %d or %d, was %d): %s", embedding_dim,
                    expected_length, num_values,
                    ' '.join(fields[:10]) + '[...]')

                # The line is only usable if its second field is a number and
                # it is long enough to take a subset of ``embedding_dim``
                # values. These are explicit checks (not ``assert``) so they
                # still run under ``python -O``.
                try:
                    float(fields[1])
                    second_field_is_number = True
                except (ValueError, IndexError):
                    second_field_is_number = False

                if not second_field_is_number or num_values <= embedding_dim:
                    logger.warning("Skipping...")
                    continue

                # Use the vector and allow future vectors of the same length.
                # NOTE TK TODO REMOVE: in future replace this by allowing user to specify
                # both the 'actual' and 'desired' input embedding dimension.
                logger.warning(
                    "Will change expected_length to %s and allow this and "
                    "similar vectors", num_values)
                expected_length = num_values

            word = fields[0]
            if word in words_to_keep:
                # Longer lines are truncated to the first embedding_dim values.
                vector = numpy.asarray(fields[1:embedding_dim + 1],
                                       dtype='float32')
                embeddings[word] = vector

    if not embeddings:
        raise ConfigurationError(
            "No embeddings of correct dimension found; you probably "
            "misspecified your embedding_dim parameter, or didn't "
            "pre-populate your Vocabulary")

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(
        embeddings_mean, embeddings_std)

    for i in range(0, vocab_size):
        word = vocab.get_token_from_index(i, namespace)

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if word in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[word])
        else:
            logger.debug(
                "Word %s was not found in the embedding file. Initialising randomly.",
                word)

    # The weight matrix is initialized, so we return it.
    return embedding_matrix
def __init__(
    self,
    vocabulary: Vocabulary,
    image_feature_size: Tuple[int, int, int] = (1024, 14, 14),
    module_channels: int = 128,
    class_projection_channels: int = 1024,
    classifier_linear_size: int = 1024,
):
    """
    Build the stem, the answer classifier and one neural module per program
    token of the ``"programs"`` namespace.

    Parameters
    ----------
    vocabulary : Vocabulary with ``"answers"`` and ``"programs"`` namespaces.
    image_feature_size : (channels, height, width) of the input image features.
    module_channels : channel width used inside the stem and the modules.
    class_projection_channels : channels of the classifier's 1x1 projection.
    classifier_linear_size : width of the classifier's hidden linear layer.
    """
    super().__init__()
    self.vocabulary = vocabulary

    in_channels, height, width = image_feature_size

    # One answer fewer than the namespace holds: "@@UNKNOWN@@" is never
    # produced by a regular forward pass; it is only set as the answer when
    # sampled programs are invalid. (28 answers for all practical purposes.)
    answer_vocab = vocabulary.get_index_to_token_vocabulary(namespace="answers")
    num_answers = len(answer_vocab) - 1

    # The stem takes features from ResNet (or another feature extractor) and
    # projects them down to a lower-dimensional space for the module network.
    self.stem = nn.Sequential(
        nn.Conv2d(in_channels, module_channels, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.Conv2d(module_channels, module_channels, kernel_size=3, padding=1),
        nn.ReLU(),
    )

    # The classifier takes the output of the last module (a Query or Equal
    # module) and produces un-normalised scores over answers.
    self.classifier = nn.Sequential(
        nn.Conv2d(module_channels, class_projection_channels, kernel_size=1),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2),
        Flatten(),
        nn.Linear(class_projection_channels * height * width // 4,
                  classifier_linear_size),
        nn.ReLU(),
        nn.Linear(classifier_linear_size, num_answers),  # note no softmax here
    )

    def build_module(token):
        # Order matters: the substring tests below overlap, so the first
        # matching rule wins.
        if token == "scene":
            # "scene" is just a flag marking the start of a new line of
            # reasoning; it has no module but is still needed in forward().
            return None
        if token == "intersect":
            return AndModule()
        if token == "union":
            return OrModule()
        if "equal" in token or token in {"less_than", "greater_than"}:
            return ComparisonModule(module_channels)
        if "query" in token or token in {"exist", "count"}:
            return QueryModule(module_channels)
        if "relate" in token:
            return RelateModule(module_channels)
        if "same" in token:
            return SameModule(module_channels)
        return AttentionModule(module_channels)

    # Instantiate a module for each program token in our vocabulary.
    placeholders = ["@@PADDING@@", "@@UNKNOWN@@", "@start@", "@end@", "unique"]
    self._function_modules: Dict[str, Type[nn.Module]] = {}
    for program_token in vocabulary.get_token_to_index_vocabulary("programs"):
        # We don't need modules for the placeholders.
        if program_token in placeholders:
            continue
        module = build_module(program_token)
        # Add the module to our dictionary and register its parameters so it can learn
        self._function_modules[program_token] = module  # type: ignore
        self.add_module(program_token, module)

    # Cross Entropy Loss for answer classification.
    self._loss = nn.CrossEntropyLoss(reduction="none")

    # Record accuracy while training and validation.
    self._answer_accuracy = BooleanAccuracy()

    # Record average number of invalid programs per batch.
    self._average_invalid_programs = Average()
def _read_pretrained_word2vec_format_embedding_file(embeddings_filename: str,  # pylint: disable=invalid-name
                                                    embedding_dim: int,
                                                    vocab: Vocabulary,
                                                    namespace: str = "tokens") -> torch.FloatTensor:
    """
    Read from a gzipped-word2vec format file and build an embedding weight matrix.

    The embeddings file is assumed to be gzipped and space delimited, e.g.
    [word] [dim 1] [dim 2] ...

    Lines whose number of values differs from ``embedding_dim`` are logged and
    skipped. Only words of ``namespace`` are kept. The returned matrix has shape
    ``(vocab.get_vocab_size(namespace), embedding_dim)``; rows are sampled from a
    normal distribution with the mean / std of the kept vectors and overwritten
    where a pre-trained vector exists.

    Raises ``ConfigurationError`` when no vector could be kept.
    """
    words_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    # Pass 1: collect the vectors of the words we actually need.
    logger.info("Reading embeddings from file")
    with gzip.open(cached_path(embeddings_filename), 'rb') as embeddings_file:
        for raw_line in embeddings_file:
            fields = raw_line.decode('utf-8').strip().split(' ')
            found_dim = len(fields) - 1
            if found_dim != embedding_dim:
                # Unicode oddities (e.g. a word containing a unicode space) can
                # split a line into the wrong number of columns; such lines are
                # skipped. A long header could make *every* line get skipped,
                # which is hard to detect here - check embedding_misses_file and
                # the model summary.
                logger.warning("Found line with wrong number of dimensions (expected %d, was %d): %s",
                               embedding_dim, found_dim, raw_line)
                continue

            word = fields[0]
            if word in words_to_keep:
                embeddings[word] = numpy.asarray(fields[1:], dtype='float32')

    if not embeddings:
        raise ConfigurationError("No embeddings of correct dimension found; you probably "
                                 "misspecified your embedding_dim parameter, or didn't "
                                 "pre-populate your Vocabulary")

    stacked = numpy.asarray(list(embeddings.values()))
    mean = float(numpy.mean(stacked))
    std = float(numpy.std(stacked))

    # Pass 2: random matrix first, then copy in the vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(mean, std)

    for index in range(vocab_size):
        word = vocab.get_token_from_index(index, namespace)
        if word not in embeddings:
            # No pre-trained vector: the row keeps its random initialization.
            logger.debug("Word %s was not found in the embedding file. Initialising randomly.", word)
            continue
        embedding_matrix[index] = torch.FloatTensor(embeddings[word])

    return embedding_matrix
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             encoder_0: Seq2SeqEncoder,
             encoder_1: Seq2SeqEncoder,
             encoder_2: Seq2SeqEncoder,
             tag_representation_dim: int,
             arc_representation_dim: int,
             tag_feedforward: FeedForward = None,
             arc_feedforward: FeedForward = None,
             pos_tag_embedding: Embedding = None,
             use_mst_decoding_for_validation: bool = True,
             use_layer_normalization: bool = True,
             dropout: float = 0.0,
             input_dropout: float = 0.0,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    """
    Set up a biaffine dependency parser with three encoders.

    Parameters
    ----------
    vocab : Vocabulary with ``"head_tags"`` and ``"pos"`` namespaces.
    text_field_embedder : embeds the input text.
    encoder_0, encoder_1, encoder_2 : the three sequence encoders; the output
        dimension of ``encoder_2`` sizes the arc / tag projections and the
        head sentinel.
    tag_representation_dim : int, dimension of the head / child tag vectors.
    arc_representation_dim : int, dimension of the head / child arc vectors.
    tag_feedforward, arc_feedforward : optional projections; a one-layer
        ``elu`` ``FeedForward`` is created when omitted. The child projection
        is always a deep copy of the head projection.
    pos_tag_embedding : optional embedding for POS tags.
    use_mst_decoding_for_validation : bool, stored for later use.
    use_layer_normalization : bool, when true ``LayerNorm`` modules are created
        for the input and output dimension of ``encoder_0``.
    dropout : float, probability for ``InputVariationalDropout``.
    input_dropout : float, probability for the plain input ``Dropout``.
    initializer : InitializerApplicator applied to the finished model.
    regularizer : RegularizerApplicator, optional, forwarded to the base class.

    Raises
    ------
    ConfigurationError
        (via ``check_dimensions_match``) if a feed-forward output dimension
        does not match the requested representation dimension.
    """
    super(BiaffineDependencyParser, self).__init__(vocab, regularizer)

    self.text_field_embedder = text_field_embedder
    self.encoder_0 = encoder_0
    self.encoder_1 = encoder_1
    self.encoder_2 = encoder_2

    encoder_dim = self.encoder_2.get_output_dim()

    # Arc scoring: head / child projections plus bilinear attention.
    self.head_arc_feedforward = arc_feedforward or \
        FeedForward(encoder_dim, 1,
                    arc_representation_dim,
                    Activation.by_name("elu")())
    self.child_arc_feedforward = copy.deepcopy(self.head_arc_feedforward)

    self.arc_attention = BilinearMatrixAttention(arc_representation_dim,
                                                 arc_representation_dim,
                                                 use_input_biases=True)

    # Tag scoring: head / child projections plus a bilinear layer.
    num_labels = self.vocab.get_vocab_size("head_tags")

    self.head_tag_feedforward = tag_feedforward or \
        FeedForward(encoder_dim, 1,
                    tag_representation_dim,
                    Activation.by_name("elu")())
    self.child_tag_feedforward = copy.deepcopy(self.head_tag_feedforward)

    self.tag_bilinear = torch.nn.modules.Bilinear(tag_representation_dim,
                                                  tag_representation_dim,
                                                  num_labels)

    self._pos_tag_embedding = pos_tag_embedding or None
    self._dropout = InputVariationalDropout(dropout)
    self._input_dropout = Dropout(input_dropout)
    self._head_sentinel = torch.nn.Parameter(
        torch.randn([1, 1, self.encoder_2.get_output_dim()]))

    self.use_layer_normalization = use_layer_normalization
    if use_layer_normalization:
        self.norm_input = torch.nn.LayerNorm(
            self.encoder_0.get_input_dim())
        self.norm_hidden = torch.nn.LayerNorm(
            self.encoder_0.get_output_dim())

    # NOTE: the embedding dimension is not checked against any encoder input
    # dimension here; only the projection dimensions are validated.
    check_dimensions_match(tag_representation_dim,
                           self.head_tag_feedforward.get_output_dim(),
                           "tag representation dim",
                           "tag feedforward output dim")
    check_dimensions_match(arc_representation_dim,
                           self.head_arc_feedforward.get_output_dim(),
                           "arc representation dim",
                           "arc feedforward output dim")

    self.use_mst_decoding_for_validation = use_mst_decoding_for_validation

    # Indices of the POS tags listed in POS_TO_IGNORE.
    tags = self.vocab.get_token_to_index_vocabulary("pos")
    punctuation_tag_indices = {
        tag: index
        for tag, index in tags.items() if tag in POS_TO_IGNORE
    }
    self._pos_to_ignore = set(punctuation_tag_indices.values())
    logger.info(
        f"Found POS tags corresponding to the following punctuation : {punctuation_tag_indices}. "
        "Ignoring words with these POS tags for evaluation.")

    self._attachment_scores = AttachmentScores()
    initializer(self)
def _read_pretrained_embedding_file(embeddings_filename: str,
                                    embedding_dim: int,
                                    vocab: Vocabulary,
                                    namespace: str = "tokens",
                                    not_found_filename: str = None) -> torch.FloatTensor:
    """
    Reads a gzipped pre-trained embedding file and builds a weight matrix for an
    embedding layer whose rows follow the indices of ``vocab``.

    We use the ``Vocabulary`` to map from the word strings in the embeddings file to the
    indices that we need, and to know which words from the embeddings file we can safely
    ignore.

    Parameters
    ----------
    embeddings_filename : str, required.
        The path to a file containing pretrained embeddings. The embeddings
        file is assumed to be gzipped, utf-8 encoded and space delimited, e.g.
        [word] [dim 1] [dim 2] ...
    embedding_dim : int, required.
        The expected number of values per line. Lines with a different number
        of values are skipped with a warning.
    vocab : Vocabulary, required.
        A Vocabulary object.
    namespace : str, (optional, default=tokens)
        The namespace of the vocabulary to find pretrained embeddings for.
    not_found_filename : str, (optional, default=None)
        If given, the vocabulary words that have no vector in the file are
        written to this path, one per line. Nothing is written when ``None``.

    Returns
    -------
    A weight matrix with embeddings initialized from the read file.  The matrix has shape
    ``(vocab.get_vocab_size(namespace), embedding_dim)``, where the indices of words appearing
    in the pretrained embedding file are initialized to the pretrained embedding value.
    Rows of words without a pretrained vector are drawn from a normal distribution with
    the mean and standard deviation of the vectors that were read.

    Raises
    ------
    ConfigurationError
        If no vector of the requested dimension was found for any vocabulary word.
    """
    words_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}
    words_found = set()

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading embeddings from file; %d", len(words_to_keep))
    with gzip.open(embeddings_filename, 'rb') as embeddings_file:
        for line in embeddings_file:
            fields = line.decode('utf-8').strip().split(' ')
            if len(fields) - 1 != embedding_dim:
                # Sometimes there are funny unicode parsing problems that lead to different
                # fields lengths (e.g., a word with a unicode space character that splits
                # into more than one column).  We skip those lines.  Note that if you have
                # some kind of long header, this could result in all of your lines getting
                # skipped.  It's hard to check for that here; you just have to look at the
                # logged counts below to make sure things look like they are supposed to.
                logger.warning(
                    "Found line with wrong number of dimensions (expected %d, was %d): %s",
                    embedding_dim, len(fields) - 1, line)
                continue
            word = fields[0]
            if word in words_to_keep:
                words_found.add(word)
                embeddings[word] = numpy.asarray(fields[1:], dtype='float32')

    notfound = words_to_keep.difference(words_found)
    logger.info("Emb load count: %d; Emb not found count: %d",
                len(words_found), len(notfound))

    # Dumping the misses is opt-in: the destination used to be a fixed path on
    # one machine, which made the function fail everywhere else.
    if not_found_filename is not None:
        with open(not_found_filename, 'w', encoding='utf-8') as misses_file:
            misses_file.writelines(word + '\n' for word in sorted(notfound))

    if not embeddings:
        raise ConfigurationError(
            "No embeddings of correct dimension found; you probably "
            "misspecified your embedding_dim parameter, or didn't "
            "pre-populate your Vocabulary")

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))

    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(
        embeddings_mean, embeddings_std)

    for i in range(0, vocab_size):
        word = vocab.get_token_from_index(i, namespace)

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if word in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[word])
        else:
            logger.debug(
                "Word %s was not found in the embedding file. Initialising randomly.",
                word)

    return embedding_matrix
def _read_embeddings_from_text_file(file_uri: str,
                                    embedding_dim: int,
                                    vocab: Vocabulary,
                                    namespace: str = "tokens") -> torch.FloatTensor:
    """
    Build an embedding weight matrix for ``vocab`` from a text file of word vectors.

    The file is opened through ``EmbeddingsTextFile`` and read line by line;
    each line is expected to hold space-separated fields:
    [word] [dim 1] [dim 2] ...

    Only lines whose first field is a token of ``namespace`` are parsed. Such a
    line is skipped with a warning when its number of values differs from
    ``embedding_dim``.

    Returns a ``(vocab.get_vocab_size(namespace), embedding_dim)`` tensor. Rows
    of tokens found in the file hold the pretrained vector; all other rows keep
    a random value drawn from a normal distribution with the mean and standard
    deviation of the vectors that were read.

    Raises ``ConfigurationError`` when no usable vector was read.
    """
    wanted_tokens = set(
        vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    # Pass 1: collect the vectors of the tokens we care about.
    logger.info("Reading pretrained embeddings from file")
    with EmbeddingsTextFile(file_uri) as embeddings_file:
        for raw_line in Tqdm.tqdm(embeddings_file):
            # Cheap test on the first field before splitting the whole line.
            token = raw_line.partition(' ')[0]
            if token not in wanted_tokens:
                continue

            columns = raw_line.rstrip().split(' ')
            actual_dim = len(columns) - 1
            if actual_dim != embedding_dim:
                # A token containing a unicode space can split into extra
                # columns, and a long header can make every line look wrong;
                # such lines are reported and ignored.
                logger.warning(
                    "Found line with wrong number of dimensions (expected: %d; actual: %d): %s",
                    embedding_dim, actual_dim, raw_line)
                continue

            embeddings[token] = numpy.asarray(columns[1:], dtype='float32')

    if not embeddings:
        raise ConfigurationError(
            "No embeddings of correct dimension found; you probably "
            "misspecified your embedding_dim parameter, or didn't "
            "pre-populate your Vocabulary")

    stacked = numpy.asarray(list(embeddings.values()))
    mean = float(numpy.mean(stacked))
    std = float(numpy.std(stacked))

    # Pass 2: start from random rows, then overwrite the rows we have vectors for.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(mean, std)

    found = 0
    index_to_token = vocab.get_index_to_token_vocabulary(namespace)
    for row in range(vocab_size):
        token = index_to_token[row]
        vector = embeddings.get(token)
        if vector is None:
            # No pretrained vector: the row keeps its random initialization.
            logger.debug(
                "Token %s was not found in the embedding file. Initialising randomly.",
                token)
            continue
        embedding_matrix[row] = torch.FloatTensor(vector)
        found += 1

    logger.info("Pretrained embeddings were found for %d out of %d tokens",
                found, vocab_size)

    return embedding_matrix
def __init__(self,
             vocab: Vocabulary,
             embedder: TextFieldEmbedder,
             encoder: Seq2VecEncoder,
             classifier: str = "linear",
             alpha: float = 1.0,
             learn_alpha: bool = False,
             l2_to_sim: str = "negative",
             squared_l2: bool = False,
             truncate: bool = False,
             embeds_per_label: int = 1,
             label_namespace: str = "labels",
             ):
    """
    Build a classifier that scores an encoded input against per-label embeddings.

    Parameters
    ----------
    vocab : Vocabulary
        Provides the index-to-label mapping of ``label_namespace``.
    embedder : TextFieldEmbedder
        Stored as ``self.embedder``.
    encoder : Seq2VecEncoder
        Stored as ``self.encoder``; its output dim is the classifier input size.
    classifier : str
        One of ``"linear"``, ``"l2"`` or ``"cos"``; selects the layer stored in
        ``self.classifier`` (``nn.Linear``, ``L2Linear`` or ``CosLinear``).
    alpha : float
        Initial value of the ``self.alpha`` parameter.
    learn_alpha : bool
        When false, ``self.alpha`` does not require gradients.
    l2_to_sim : str
        One of ``"negative"`` or ``"inverse"``; stored, and used to pick the
        initial threshold when ``classifier == "l2"`` and ``truncate`` is set.
    squared_l2 : bool
        Passed to ``L2Linear`` as ``square``.
    truncate : bool
        When true, a learnable ``self.threshold`` and the truncation metrics
        are created.
    embeds_per_label : int
        The classifier has ``num_labels * embeds_per_label`` outputs.
    label_namespace : str
        Vocabulary namespace holding the labels.

    Raises
    ------
    AssertionError
        If ``classifier`` or ``l2_to_sim`` is not one of the accepted values.
    """
    super().__init__(vocab)

    # str arg validation
    valid_classifiers = ["linear", "l2", "cos"]
    assert classifier in valid_classifiers, f"classifier must be in {valid_classifiers}"
    valid_l2_to_sim = ["negative", "inverse"]
    assert l2_to_sim in valid_l2_to_sim, f"l2_to_sim must be in {valid_l2_to_sim}"
    # The name is kept under its own attribute so that ``self.classifier`` only
    # ever holds the layer. Re-assigning the string to ``self.classifier``
    # after the layer is stored would replace the layer, and torch.nn.Module
    # rejects such an assignment with a TypeError.
    self.classifier_type = classifier
    self.l2_to_sim = l2_to_sim

    # encoder and embedder layers
    self.embedder = embedder
    self.encoder = encoder
    self.labels = vocab.get_index_to_token_vocabulary(namespace=label_namespace)
    self.num_labels = len(self.labels)
    self.embeds_per_label = embeds_per_label
    self.classifier_out = self.num_labels * embeds_per_label
    self.embed_dim = encoder.get_output_dim()

    # similarity/distance layer
    if classifier == "linear":
        self.classifier = nn.Linear(self.embed_dim, self.classifier_out)
    elif classifier == "l2":
        self.classifier = L2Linear(self.embed_dim, self.classifier_out, square=squared_l2)
    elif classifier == "cos":
        self.classifier = CosLinear(self.embed_dim, self.classifier_out)
    else:
        raise ValueError(f"Invalid classifier value: {classifier}")

    # truncate logits
    self.truncate = truncate
    if truncate:
        if classifier in ("linear", "cos"):
            self.threshold = nn.Parameter(torch.Tensor([0.1]))
        elif classifier == "l2":
            if l2_to_sim == "negative":
                self.threshold = nn.Parameter(torch.Tensor([float(self.embed_dim)]))
            elif l2_to_sim == "inverse":
                self.threshold = nn.Parameter(torch.Tensor([-1.0]))
            else:
                raise ValueError(f"Invalid l2_to_sim value: {l2_to_sim}")
        else:
            raise ValueError(f"Invalid classifier value: {classifier}")

    # scale logits by alpha
    self.alpha = nn.Parameter(torch.Tensor([alpha]))
    if not learn_alpha:
        self.alpha.requires_grad = False

    # metrics
    self.accuracy = CategoricalAccuracy()
    self.prf_metrics = {label: F1Measure(index) for index, label in self.labels.items()}
    self.avg_alpha = Average()
    if self.truncate:
        self.trunc_avg_num = Average()
        self.trunc_avg_untrunc_num = Average()
        self.trunc_avg_threshold = Average()
        self.trunc_avg_sim = Average()
def get_pretrained_embedding_layer(embeddings_filename: str,
                                   vocab: Vocabulary,
                                   namespace: str = "tokens",
                                   trainable: bool = True):
    """
    Create an ``Embedding`` whose weights come from a gzipped pre-trained embedding file.

    The ``Vocabulary`` decides which words of the file are kept and which row
    of the weight matrix each of them fills.

    Parameters
    ----------
    embeddings_filename : str, required.
        Path to a gzipped, utf-8 encoded, space delimited file of the form
        [word] [dim 1] [dim 2] ...
        The embedding size is taken from the first line; later lines with a
        different number of values are skipped silently.
    vocab : Vocabulary, required.
        A Vocabulary object.
    namespace : str, (optional, default=tokens)
        The namespace of the vocabulary to find pretrained embeddings for.
    trainable : bool, (optional, default=True)
        Passed through to the ``Embedding`` constructor.

    Returns
    -------
    An ``Embedding`` built with a weight matrix of shape
    (vocab.get_vocab_size(namespace), pretrained_embedding_dim). Rows of words
    present in the file hold the pretrained vector; all other rows are drawn
    from a standard normal distribution.
    """
    vocab_size = vocab.get_vocab_size(namespace)
    words_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    embeddings = {}
    embedding_dim = None

    # Step 1: read the file, remembering only the vectors of vocabulary words.
    logger.info("Reading embeddings from file")
    with gzip.open(embeddings_filename, 'rb') as embeddings_file:
        for raw_line in embeddings_file:
            columns = raw_line.decode('utf-8').strip().split(' ')
            width = len(columns) - 1
            if embedding_dim is None:
                # The first line fixes the embedding size for the whole file.
                embedding_dim = width
                assert embedding_dim > 1, "Found embedding size of 1; do you have a header?"
            elif width != embedding_dim:
                # Words containing a unicode space can split into extra
                # columns; such lines are dropped. A long header could cause
                # every line to be dropped, which is not detected here.
                continue

            word = columns[0]
            if word not in words_to_keep:
                continue
            embeddings[word] = numpy.asarray(columns[1:], dtype='float32')

    # Step 2: random matrix first, then overwrite the rows we have vectors for.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(0, 1)

    for index in range(0, vocab_size):
        word = vocab.get_token_from_index(index, namespace)
        if word not in embeddings:
            # No pretrained vector: the row keeps its random initialization.
            logger.debug("Word %s was not found in the embedding file. Initialising randomly.", word)
            continue
        embedding_matrix[index] = torch.FloatTensor(embeddings[word])

    # Step 3: wrap the finished matrix in the actual Embedding module.
    return Embedding(num_embeddings=vocab_size,
                     embedding_dim=embedding_dim,
                     padding_index=0,
                     weight=embedding_matrix,
                     trainable=trainable)
def _read_embeddings_from_text_file(file_uri: str,
                                    embedding_dim: int,
                                    vocab: Vocabulary,
                                    namespace: str = "tokens") -> torch.FloatTensor:
    """
    Load word vectors for the tokens of ``vocab`` from a text file and return
    them as a weight matrix.

    ``file_uri`` is opened with ``EmbeddingsTextFile``. Every line is expected
    to look like ``[word] [dim 1] [dim 2] ...`` with single spaces between the
    fields. Lines for tokens outside ``namespace`` are ignored; lines for
    known tokens whose number of values is not ``embedding_dim`` are logged as
    a warning and ignored.

    The result has shape ``(vocab.get_vocab_size(namespace), embedding_dim)``.
    It is first filled with samples from a normal distribution matching the
    mean and standard deviation of the loaded vectors, then the rows of the
    tokens that were found are overwritten with their vectors.

    A ``ConfigurationError`` is raised if not a single vector could be loaded.
    """
    tokens_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    logger.info("Reading pretrained embeddings from file")
    with EmbeddingsTextFile(file_uri) as embeddings_file:
        for text in Tqdm.tqdm(embeddings_file):
            first_field = text.split(' ', 1)[0]
            if first_field not in tokens_to_keep:
                # Not a vocabulary token: no need to parse the vector.
                continue

            parts = text.rstrip().split(' ')
            n_values = len(parts) - 1
            if n_values == embedding_dim:
                embeddings[first_field] = numpy.asarray(parts[1:], dtype='float32')
            else:
                # Typical causes are a token with an embedded unicode space or
                # a header line. If every line ends up here, the error below
                # is raised.
                logger.warning("Found line with wrong number of dimensions (expected: %d; actual: %d): %s",
                               embedding_dim, n_values, text)

    if not embeddings:
        raise ConfigurationError("No embeddings of correct dimension found; you probably "
                                 "misspecified your embedding_dim parameter, or didn't "
                                 "pre-populate your Vocabulary")

    all_vectors = numpy.asarray(list(embeddings.values()))
    vectors_mean = float(numpy.mean(all_vectors))
    vectors_std = float(numpy.std(all_vectors))

    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim)
    # Random start for every row; known tokens are overwritten below.
    embedding_matrix.normal_(vectors_mean, vectors_std)

    hits = 0
    index_to_token = vocab.get_index_to_token_vocabulary(namespace)
    for idx in range(vocab_size):
        current = index_to_token[idx]
        try:
            pretrained = embeddings[current]
        except KeyError:
            logger.debug("Token %s was not found in the embedding file. Initialising randomly.", current)
        else:
            embedding_matrix[idx] = torch.FloatTensor(pretrained)
            hits += 1

    logger.info("Pretrained embeddings were found for %d out of %d tokens",
                hits, vocab_size)

    return embedding_matrix
def __init__(
    self,
    vocab: Vocabulary,
    embedder: str,
    encoder: Seq2VecEncoder,
    classifier: str = "linear",
    alpha: float = 1.0,
    learn_alpha: bool = False,
    l2_to_sim: str = "negative",
    squared_l2: bool = False,
    truncate: bool = False,
    embeds_per_label: int = 1,
    label_namespace: str = "labels",
    attention_layer: str = "first",
    finetune_bert: bool = True,
    trunc_ratio: float = 0.1,
):
    """
    Build a label-embedding classifier on top of a pretrained embedder.

    Parameters
    ----------
    vocab : Vocabulary
        Provides the index-to-label mapping of ``label_namespace``.
    embedder : str
        Key into ``EMBEDDERS``; the selected class is loaded with
        ``from_pretrained(embedder, output_attentions=True)``.
    encoder : Seq2VecEncoder
        Stored as ``self.encoder``; its output dim is the classifier input size.
    classifier : str
        One of ``"linear"``, ``"l2"`` or ``"cos"``; kept in
        ``self.classifier_type`` and used to pick the layer in ``self.classifier``.
    alpha : float
        Initial value of the ``self.alpha`` parameter.
    learn_alpha : bool
        When false, ``self.alpha`` does not require gradients.
    l2_to_sim : str
        One of ``"negative"`` or ``"inverse"``; stored for use by the rest of the class.
    squared_l2 : bool
        Stored, and passed to ``L2Linear`` as ``square``.
    truncate : bool
        When true, the learnable ``self.trunc_embed`` and the truncation
        metrics are created.
    embeds_per_label : int
        The classifier has ``num_labels * embeds_per_label`` outputs.
    label_namespace : str
        Vocabulary namespace holding the labels.
    attention_layer, finetune_bert, trunc_ratio
        Stored unchanged for use by the rest of the class.

    Raises
    ------
    AssertionError
        If ``embedder``, ``classifier`` or ``l2_to_sim`` is not an accepted value.
    """
    super().__init__(vocab)

    # str arg validation
    assert embedder in EMBEDDERS, f"embedder must be in {list(EMBEDDERS)}"
    valid_classifiers = ["linear", "l2", "cos"]
    assert classifier in valid_classifiers, f"classifier must be in {valid_classifiers}"
    self.classifier_type = classifier
    valid_l2_to_sim = ["negative", "inverse"]
    assert l2_to_sim in valid_l2_to_sim, f"l2_to_sim must be in {valid_l2_to_sim}"
    self.l2_to_sim = l2_to_sim
    self.finetune_bert = finetune_bert
    self.squared_l2 = squared_l2
    self.trunc_ratio = trunc_ratio

    # encoder and embedder layers
    self.embedder = EMBEDDERS[embedder].from_pretrained(
        embedder, output_attentions=True)
    self.encoder = encoder
    self.labels = vocab.get_index_to_token_vocabulary(
        namespace=label_namespace)
    self.num_labels = len(self.labels)
    self.embeds_per_label = embeds_per_label
    self.classifier_out = self.num_labels * embeds_per_label
    self.embed_dim = encoder.get_output_dim()
    self.attention_layer = attention_layer

    # similarity/distance layer
    if self.classifier_type == "linear":
        self.classifier = nn.Linear(self.embed_dim, self.classifier_out)
    elif self.classifier_type == "l2":
        self.classifier = L2Linear(self.embed_dim,
                                   self.classifier_out,
                                   square=squared_l2)
    elif self.classifier_type == "cos":
        self.classifier = CosLinear(self.embed_dim, self.classifier_out)
    else:
        raise ValueError(f"Invalid classifier value: {classifier}")

    # truncate logits
    self.truncate = truncate
    if truncate:
        # compute threshold values from a dummy learnable embedding
        # for stable and sufficient gradient updates
        self.trunc_embed = nn.Parameter(torch.ones(self.embed_dim) * 0.5)

    # scale logits by alpha
    self.alpha = nn.Parameter(torch.Tensor([alpha]))
    if not learn_alpha:
        self.alpha.requires_grad = False

    # metrics
    self.accuracy = CategoricalAccuracy()
    self.prf_metrics = {label: F1Measure(index) for index, label in self.labels.items()}
    self.avg_alpha = Average()
    if self.truncate:
        self.trunc_avg_total_num = Average()
        self.trunc_avg_trunc_num = Average()
        self.trunc_avg_untrunc_num = Average()
        self.trunc_avg_threshold = Average()
        self.trunc_avg_sim = Average()
        self.trunc_pre_avg_sim = Average()
        self.trunc_avg_sim_std = Average()
        self.trunc_pre_avg_sim_std = Average()
def _read_pretrained_word2vec_format_embedding_file(embeddings_filename: str,  # pylint: disable=invalid-name
                                                    embedding_dim: int,
                                                    vocab: Vocabulary,
                                                    namespace: str = "tokens") -> torch.FloatTensor:
    """
    Build an embedding weight matrix for ``vocab`` from a gzipped word2vec-format file.

    The file (resolved through ``cached_path``) is assumed to be gzipped,
    utf-8 encoded and space delimited, e.g. [word] [dim 1] [dim 2] ...
    Lines whose number of values is not ``embedding_dim`` are skipped with a
    warning; of the remaining lines only those for words in ``namespace`` are kept.

    Returns a ``(vocab.get_vocab_size(namespace), embedding_dim)`` tensor. Rows
    of words found in the file hold the pretrained vector; the other rows are
    drawn from a normal distribution with the mean and standard deviation of
    the vectors that were read.

    Raises ``ConfigurationError`` when no usable vector was read.
    """
    vocab_size = vocab.get_vocab_size(namespace)
    words_to_keep = set(
        vocab.get_index_to_token_vocabulary(namespace).values())
    embeddings = {}

    # Step 1: read the file, remembering only the vectors of vocabulary words.
    logger.info("Reading embeddings from file")
    with gzip.open(cached_path(embeddings_filename), 'rb') as embeddings_file:
        for raw_line in embeddings_file:
            columns = raw_line.decode('utf-8').rstrip().split(' ')
            n_values = len(columns) - 1
            if n_values != embedding_dim:
                # A word containing a unicode space can split into extra
                # columns, and a long header can make every line look wrong;
                # such lines are reported and ignored.
                logger.warning(
                    "Found line with wrong number of dimensions (expected %d, was %d): %s",
                    embedding_dim, n_values, raw_line)
                continue

            word = columns[0]
            if word not in words_to_keep:
                continue
            embeddings[word] = numpy.asarray(columns[1:], dtype='float32')

    if not embeddings:
        raise ConfigurationError(
            "No embeddings of correct dimension found; you probably "
            "misspecified your embedding_dim parameter, or didn't "
            "pre-populate your Vocabulary")

    stacked = numpy.asarray(list(embeddings.values()))
    mean = float(numpy.mean(stacked))
    std = float(numpy.std(stacked))

    # Step 2: random matrix first, then overwrite the rows we have vectors for.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(mean, std)

    for index in range(0, vocab_size):
        word = vocab.get_token_from_index(index, namespace)
        if word not in embeddings:
            # No pretrained vector: the row keeps its random initialization.
            logger.debug(
                "Word %s was not found in the embedding file. Initialising randomly.",
                word)
            continue
        embedding_matrix[index] = torch.FloatTensor(embeddings[word])

    return embedding_matrix
def get_glove_embedder(num_embeddings: int,
                       embedding_dim: int,
                       vocab: Vocabulary,
                       namespace: str = "tokens",
                       embeddings_path: str = "../embeddings/glove/glove840B300d.txt") -> Embedding:
    """
    Create an ``Embedding`` initialised from a GloVe text file.

    Parameters
    ----------
    num_embeddings : int
        Passed to ``Embedding`` as its number of embeddings.
        NOTE(review): the weight matrix built here has
        ``vocab.get_vocab_size(namespace)`` rows, so callers presumably pass
        that same value; confirm.
    embedding_dim : int
        Expected number of values per line. Lines for vocabulary tokens with a
        different number of values are skipped with a warning.
    vocab : Vocabulary
        Decides which tokens are kept and which row each of them fills.
    namespace : str, (optional, default=tokens)
        The vocabulary namespace to find pretrained embeddings for.
    embeddings_path : str, optional
        Path to the utf-8 encoded, space delimited embedding file
        ([word] [dim 1] [dim 2] ...). Defaults to the location that used to be
        hard-coded.

    Returns
    -------
    An ``Embedding`` whose weight holds the pretrained vector for every token
    found in the file and, for all other tokens, a sample from a normal
    distribution with the mean and standard deviation of the vectors read.

    Raises
    ------
    ConfigurationError
        If no vector of the requested dimension was found for any vocabulary token.
    """
    tokens_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading pre-trained embeddings from file")
    with open(embeddings_path, encoding="utf8") as embeddings_file:
        for line in Tqdm.tqdm(embeddings_file):
            token = line.split(' ', 1)[0]
            if token in tokens_to_keep:
                fields = line.rstrip().split(' ')
                if len(fields) - 1 != embedding_dim:
                    # Sometimes there are funny unicode parsing problems that lead to different
                    # fields lengths (e.g., a word with a unicode space character that splits
                    # into more than one column).  We skip those lines.  Note that if you have
                    # some kind of long header, this could result in all of your lines getting
                    # skipped.
                    logger.warning("Found line with wrong number of dimensions (expected: %d; actual: %d): %s",
                                   embedding_dim, len(fields) - 1, line)
                    continue

                embeddings[token] = np.asarray(fields[1:], dtype='float32')

    if not embeddings:
        raise ConfigurationError("No embeddings of correct dimension found; you probably "
                                 "misspecified your embedding_dim parameter, or didn't "
                                 "pre-populate your Vocabulary")

    all_embeddings = np.asarray(list(embeddings.values()))
    embeddings_mean = float(np.mean(all_embeddings))
    embeddings_std = float(np.std(all_embeddings))
    # Reported through the logger like every other message of this function.
    logger.info("Embedding mean:%s", embeddings_mean)
    logger.info("Embedding std:%s", embeddings_std)

    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(embeddings_mean,
                                                                            embeddings_std)
    num_tokens_found = 0
    index_to_token = vocab.get_index_to_token_vocabulary(namespace)
    for i in range(vocab_size):
        token = index_to_token[i]

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if token in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[token])
            num_tokens_found += 1
        else:
            logger.debug("Token %s was not found in the embedding file. Initialising randomly.", token)

    logger.info("Pretrained embeddings were found for %d out of %d tokens",
                num_tokens_found, vocab_size)

    # initialize glove embedding on precalculated weight
    glove_embedder = Embedding(num_embeddings, embedding_dim, weight=embedding_matrix, padding_index=0)
    return glove_embedder