def _create_weight_matrix(multilang_embeddings,
                            embedding_dim: int,
                            vocab: Vocabulary,
                            namespace: str = "tokens") -> torch.FloatTensor:

    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = multilang_embeddings

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(embeddings_mean,
                                                                            embeddings_std)
    num_tokens_found = 0
    index_to_token = vocab.get_index_to_token_vocabulary(namespace)
    for i in range(vocab_size):
        token = index_to_token[i]

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if token in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[token])
            num_tokens_found += 1
        else:
            logger.debug("Token %s was not found in the embedding file. Initialising randomly.", token)

    logger.info("Pretrained embeddings were found for %d out of %d tokens",
                num_tokens_found, vocab_size)

    return embedding_matrix
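
As a quick sanity check, a minimal sketch of how a helper like this might be exercised (the toy tokens and 4-dimensional vectors are invented for illustration; `Vocabulary` is AllenNLP's):

import numpy
from allennlp.data import Vocabulary

vocab = Vocabulary()
for token in ["the", "cat", "sat"]:
    vocab.add_token_to_namespace(token, namespace="tokens")

# Toy pre-trained vectors; "sat" is deliberately missing, so its row keeps the random init.
toy_embeddings = {
    "the": numpy.random.rand(4).astype("float32"),
    "cat": numpy.random.rand(4).astype("float32"),
}

weight = _create_weight_matrix(toy_embeddings, embedding_dim=4, vocab=vocab)
assert weight.shape == (vocab.get_vocab_size("tokens"), 4)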
Example #2
File: model.py Project: festeh/VisualQA
    def __init__(self, embeddings_result_file, vocab: Vocabulary,
                 config: Params):
        """
        Gets sentence embedding by averaging w2v word representations and image embedding from a
        pretrained convnet, combines them by a dot-product, then applies logistic regression.
        """
        super().__init__()
        self.emb_size = config.pop("emb_size")
        self.vocab_size = vocab.get_vocab_size("tokens")
        self.hidden_size = config.pop("hidden_size")
        self.image_emb_size = config.pop("image_emb_size")
        self.n_classes = config.pop("n_classes")

        with open(embeddings_result_file, "rb") as f:
            saved_embs = SavedEmbeddings(pickle.load(f))

        self.embs = Embedding(self.vocab_size,
                              embedding_dim=self.emb_size,
                              padding_idx=0)
        emb_weights = numpy.zeros((self.vocab_size, self.emb_size),
                                  dtype=numpy.float32)
        saved_embs.return_zero_for_oov = False
        for idx, word in tqdm(
                vocab.get_index_to_token_vocabulary("tokens").items()):
            if idx != 0:
                emb_weights[idx] = saved_embs.get(word)
        self.embs.weight.data = torch.tensor(emb_weights)
        self.question_to_hidden = Linear(self.emb_size, self.hidden_size)
        self.image_to_hidden = Linear(self.image_emb_size, self.hidden_size)

        self.hidden_to_hidden = Linear(self.hidden_size, self.hidden_size)

        self.scores_layer = Linear(self.hidden_size, self.n_classes)
        self.lrelu = LeakyReLU()
        self.dropout = Dropout(p=config.pop("dropout_rate"))
Example #3
    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 attend_feedforward: FeedForward,
                 similarity_function: SimilarityFunction,
                 compare_feedforward: FeedForward,
                 aggregate_feedforward: FeedForward,
                 premise_encoder: Optional[Seq2SeqEncoder] = None,
                 hypothesis_encoder: Optional[Seq2SeqEncoder] = None,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(BiaowenMingxiClassifier, self).__init__(vocab, regularizer)

        self._text_field_embedder = text_field_embedder
        self._attend_feedforward = TimeDistributed(attend_feedforward)
        self._matrix_attention = LegacyMatrixAttention(similarity_function)
        self._compare_feedforward = TimeDistributed(compare_feedforward)
        self._aggregate_feedforward = aggregate_feedforward
        self._premise_encoder = premise_encoder
        self._hypothesis_encoder = hypothesis_encoder or premise_encoder

        self._num_labels = vocab.get_vocab_size(namespace="labels")
        print(vocab.get_index_to_token_vocabulary(namespace="labels"))
        check_dimensions_match(text_field_embedder.get_output_dim(), attend_feedforward.get_input_dim(),
                               "text field embedding dim", "attend feedforward input dim")
        check_dimensions_match(aggregate_feedforward.get_output_dim(), self._num_labels,
                               "final output dimension", "number of labels")

        self._accuracy = CategoricalAccuracy()
        self._loss = torch.nn.CrossEntropyLoss()

        initializer(self)
Example #4
File: spart.py Project: MSLars/mare
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 hidden_size: int = 768,
                 size_embedding: int = 25,
                 dropout: float = 0.1,
                 rel_filter_threshold: float = 0.5,
                 max_pairs: int = 1000) -> None:
        super(Spart, self).__init__(vocab)

        self._text_field_embedder = text_field_embedder

        self._rel_filter_threshold = rel_filter_threshold
        self._relation_types = vocab.get_vocab_size("rel_labels")
        self._entity_types = vocab.get_vocab_size("ner_labels")
        self._cls_token = 2  # TODO only for https://huggingface.co/german-nlp-group/electra-base-german-uncased/blob/main/vocab.txt
        self._max_pairs = max_pairs
        self._bert = self._text_field_embedder.token_embedder_tokens._modules[
            "_matched_embedder"].transformer_model

        self.rel_classifier = nn.Linear(hidden_size * 3 + size_embedding * 2,
                                        self._relation_types)
        self.entity_classifier = nn.Linear(hidden_size * 2 + size_embedding,
                                           self._entity_types)
        self.size_embeddings = nn.Embedding(100, size_embedding)
        self.dropout = nn.Dropout(dropout)

        self._rel_loss = nn.BCEWithLogitsLoss(reduction='none')
        self._ents_loss = nn.CrossEntropyLoss(
            reduction='none')  # TODO BCEWithLogitsLoss

        ner_labels = list(vocab.get_index_to_token_vocabulary("ner_labels"))
        ner_labels.remove(0)

        rel_labels = list(vocab.get_index_to_token_vocabulary("rel_labels"))
        rel_labels.remove(0)
        self._f1_relation = FBetaMultiLabelMeasure(
            average="micro", threshold=self._rel_filter_threshold)
        self._f1_entities = FBetaMeasure(average="micro", labels=ner_labels)
Example #5
def get_index_to_labels_dictionary(vocab: Vocabulary) -> Dict[int, str]:
    """Gets a dictionary for turning label `int` ids into label strings

    Parameters
    ----------
    vocab: `allennlp.data.Vocabulary`

    Returns
    -------
    labels: `Dict[int, str]`
        A dictionary for fetching label strings from ids
    """
    return vocab.get_index_to_token_vocabulary(LABELS_NAMESPACE)
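
A minimal usage sketch, assuming `LABELS_NAMESPACE` is the usual `"labels"` namespace:

from allennlp.data import Vocabulary

vocab = Vocabulary()
for label in ["negative", "positive"]:
    vocab.add_token_to_namespace(label, namespace="labels")

index_to_label = get_index_to_labels_dictionary(vocab)
# e.g. {0: "negative", 1: "positive"} -- "labels" is a non-padded namespace, so ids start at 0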
Example #6
def get_slices_if_not_provided(vocab: allen_data.Vocabulary):
    if hasattr(vocab, "slices"):
        return vocab.slices

    if "feats_labels" in vocab.get_namespaces():
        idx2token = vocab.get_index_to_token_vocabulary("feats_labels")
        for _, v in dict(idx2token).items():
            if v not in ["_", "__PAD__"]:
                empty_value = v.split("=")[0] + "=None"
                vocab.add_token_to_namespace(empty_value, "feats_labels")

        slices = {}
        for idx, name in vocab.get_index_to_token_vocabulary(
                "feats_labels").items():
            # There are 2 types of features: with a value (Case=Acc) or without one (None).
            # Here we group their indices by name (the part before the assignment sign).
            name = name.split("=")[0]
            if name in slices:
                slices[name].append(idx)
            else:
                slices[name] = [idx]
        vocab.slices = slices
        return vocab.slices
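
For intuition, the grouping step above reduces to this with a plain dict (toy feature names, no AllenNLP needed):

idx2token = {0: "Case=Acc", 1: "Case=Nom", 2: "Number=Sing", 3: "Number=None"}

slices = {}
for idx, name in idx2token.items():
    feature = name.split("=")[0]          # group by the feature name before "="
    slices.setdefault(feature, []).append(idx)

print(slices)  # {'Case': [0, 1], 'Number': [2, 3]}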
Example #7
def _read_embeddings_from_jsonl(embeddings_filename: str,
                                embedding_dim: int,
                                vocab: Vocabulary,
                                namespace: str = 'tokens') -> torch.FloatTensor: 

    tokens_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)

    embeddings = {}

    with jsonlines.open(embeddings_filename) as reader:
        for instance in reader:
            token = instance['paper_id']
            vector = numpy.asarray(instance['graph_vector'], dtype='float32')
            if len(vector) != embedding_dim:
                logger.warning("Found instance with wrong number of dimensions (expected: %d; actual: %d): %s", embedding_dim, len(vector), token)
            else:
                embeddings[token] = vector
    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(embeddings_mean, embeddings_std)
    num_tokens_found = 0
    for i in range(vocab_size):
        token = vocab.get_index_to_token_vocabulary(namespace)[i]
        if token in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[token])
            num_tokens_found += 1
        else:
            logger.debug("Token %s was not found in the embedding file. Initializing randomly.", token)

    logger.info("pretrained embeddings were found for %d out of %d tokens", num_tokens_found, vocab_size)

    return embedding_matrix
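
The reader above expects one JSON object per line with a `paper_id` and a `graph_vector` of length `embedding_dim`. A hedged sketch of producing such a file with `jsonlines` (the ids and 3-dimensional vectors are made up):

import jsonlines

records = [
    {"paper_id": "P001", "graph_vector": [0.1, 0.2, 0.3]},
    {"paper_id": "P002", "graph_vector": [0.4, 0.5, 0.6]},
]
with jsonlines.open("toy_graph_vectors.jsonl", mode="w") as writer:
    for record in records:
        writer.write(record)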
def _read_embeddings_from_bin_file(
        file_uri: str,
        embedding_dim: int,
        vocab: Vocabulary,
        namespace: str = "tokens") -> torch.FloatTensor:
    """
    Reads from a binary word2vec-format file using gensim.
    """
    vocab_size = vocab.get_vocab_size(namespace)

    import gensim.models

    model = gensim.models.KeyedVectors.load_word2vec_format(
        file_uri, binary=True, unicode_errors="ignore")

    words = sorted([w for w in model.vocab],
                   key=lambda w: model.vocab[w].index)
    vecs = [model[w] for w in words]

    all_embeddings = numpy.asarray(vecs)
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(
        embeddings_mean, embeddings_std)

    num_tokens_found = 0
    index_to_token = vocab.get_index_to_token_vocabulary(namespace)
    for i in range(vocab_size):
        token = index_to_token[i]

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if token in words:
            embedding_matrix[i] = torch.FloatTensor(model[token])
            num_tokens_found += 1
        else:
            logger.debug(
                "Token %s was not found in the embedding file. Initialising randomly.",
                token)

    logger.info("Pretrained embeddings were found for %d out of %d tokens",
                num_tokens_found, vocab_size)

    return embedding_matrix
def _read_embeddings_from_text_file(file_uri: str,
                                    embedding_dim: int,
                                    vocab: Vocabulary,
                                    namespace: str = "tokens") -> torch.FloatTensor:
    """
    Read pre-trained word vectors from an eventually compressed text file, possibly contained
    inside an archive with multiple files. The text file is assumed to be utf-8 encoded with
    space-separated fields: [word] [dim 1] [dim 2] ...

    Lines that contain more numerical tokens than ``embedding_dim`` raise a warning and are skipped.

    The remainder of the docstring is identical to ``_read_pretrained_embeddings_file``.
    """
    tokens_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading pretrained embeddings from file")

    with EmbeddingsTextFile(file_uri) as embeddings_file:
        for line in Tqdm.tqdm(embeddings_file):
            token = line.split(' ', 1)[0]
            if token in tokens_to_keep:
                fields = line.rstrip().split(' ')
                if len(fields) - 1 != embedding_dim:
                    # Sometimes there are funny unicode parsing problems that lead to different
                    # fields lengths (e.g., a word with a unicode space character that splits
                    # into more than one column).  We skip those lines.  Note that if you have
                    # some kind of long header, this could result in all of your lines getting
                    # skipped.  It's hard to check for that here; you just have to look in the
                    # embedding_misses_file and at the model summary to make sure things look
                    # like they are supposed to.
                    logger.warning("Found line with wrong number of dimensions (expected: %d; actual: %d): %s",
                                   embedding_dim, len(fields) - 1, line)
                    continue

                vector = numpy.asarray(fields[1:], dtype='float32')
                embeddings[token] = vector

    if not embeddings:
        raise ConfigurationError("No embeddings of correct dimension found; you probably "
                                 "misspecified your embedding_dim parameter, or didn't "
                                 "pre-populate your Vocabulary")

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(embeddings_mean,
                                                                            embeddings_std)
    num_tokens_found = 0
    index_to_token = vocab.get_index_to_token_vocabulary(namespace)
    for i in range(vocab_size):
        token = index_to_token[i]

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if token in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[token])
            num_tokens_found += 1
        else:
            logger.debug("Token %s was not found in the embedding file. Initializing randomly.", token)

    logger.info("Pretrained embeddings were found for %d out of %d tokens",
                num_tokens_found, vocab_size)

    return embedding_matrix
Example #10
    def __init__(self,
                 vocab: Vocabulary,
                 simple_classifier: SimpleClassifier,
                 alpha: float = 0.05,
                 target: str = 'label',
                 freeze_topic: bool = False):
        super().__init__(vocab)
        self.simple_classifier = simple_classifier
        num_topics = vocab.get_vocab_size("topic_labels")
        self.topic_classifier = torch.nn.Linear(
            self.simple_classifier.encoder.get_output_dim(), num_topics)
        self.alpha = alpha
        self.index_to_label = vocab.get_index_to_token_vocabulary('labels')
        self.label_to_dif_index = simple_classifier.vocab.get_token_to_index_vocabulary(
            'labels')
        self.target = target
        self.freeze_topic = freeze_topic
        self.topic_accuracy = CategoricalAccuracy()
Example #11
def set_labels(vocab: Vocabulary, new_labels: List[str]):
    """Resets the labels in the vocabulary with a given labels string list

    Parameters
    ----------
    vocab: `allennlp.data.Vocabulary`
    new_labels: `List[str]`
        The label strings to add to the vocabulary
    """
    for namespace_vocab in [
            vocab.get_token_to_index_vocabulary(LABELS_NAMESPACE),
            vocab.get_index_to_token_vocabulary(LABELS_NAMESPACE),
    ]:
        tokens = list(namespace_vocab.keys())
        for token in tokens:
            del namespace_vocab[token]

    extend_labels(vocab, new_labels)
    def from_params(cls, vocab: Vocabulary, params: Params) -> 'ConstrainedConditionalModule':
        hard_constraints = params.pop("hard_constraints", [])
        soft_constraints = params.pop("soft_constraints", {})
        label_namespace = params.pop("label_namespace", "labels")
        sentence_penalty_map_dict = params.pop("sentence_penalty_map", None)
        constrain_crf_decoding = params.pop("constrain_crf_decoding", False)
        label_encoding = params.pop("label_encoding", None)

        sentence_penalty_map = None
        if sentence_penalty_map_dict:
            assert len(sentence_penalty_map_dict) == 1, "multiple sentence constraints not supported"
            tag, penalty = list(sentence_penalty_map_dict.items())[0]
            tag_index = vocab.get_token_index(tag, label_namespace)
            sentence_penalty_map = (tag_index, penalty)

        hard_constraints_to_indices: Dict[str, List[int]] = {}
        for tag in hard_constraints:
            hard_constraints_to_indices[tag] = []
            for label, index in vocab.get_token_to_index_vocabulary(label_namespace).items():
                if re.match(rf"^.*-{tag}", label):
                    hard_constraints_to_indices[tag].append(index)
        soft_constraints = soft_constraints or {}
        soft_constraints_to_indices: Dict[str, Tuple[List[int], float]] = {}
        for tag, penalty in soft_constraints.items():
            indices = []
            for label, index in vocab.get_token_to_index_vocabulary(label_namespace).items():
                if re.match(rf"^.*-{tag}", label):
                    indices.append(index)
            soft_constraints_to_indices[tag] = (indices, penalty)
        num_tags = vocab.get_vocab_size(label_namespace)
        if constrain_crf_decoding:
            if not label_encoding:
                raise ConfigurationError("constrain_crf_decoding is True, but "
                                         "no label_encoding was specified.")
            labels = vocab.get_index_to_token_vocabulary(label_namespace)
            constraints = allowed_transitions(label_encoding, labels)
        else:
            constraints = None
        params.assert_empty(cls.__name__)
        return ConstrainedConditionalModule(num_tags, constraints,
                                            hard_constraints_to_indices,
                                            soft_constraints_to_indices,
                                            sentence_penalty_map)
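
The hard/soft constraint handling above collects every label index whose name ends in a given tag (so `B-PER` and `I-PER` both map to `PER`). A standalone sketch of that lookup against a toy BIO label namespace:

import re
from allennlp.data import Vocabulary

vocab = Vocabulary()
for label in ["O", "B-PER", "I-PER", "B-LOC", "I-LOC"]:
    vocab.add_token_to_namespace(label, namespace="labels")

tag = "PER"
indices = [index
           for label, index in vocab.get_token_to_index_vocabulary("labels").items()
           if re.match(rf"^.*-{tag}", label)]
print(indices)  # the ids of "B-PER" and "I-PER"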
Example #13
    def __init__(self,
                 vocab: Vocabulary,
                 verbose_metrics: bool = False,
                 embedding_dim: int = 128,
                 dropout: float = 0.2,
                 neg_samples: int = 10,
                 cuda_device: int = 7,
                 pretrained_file: str = None,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(NegativeSamplingModel, self).__init__(vocab, regularizer)

        self.embedder = Embedding(
            num_embeddings=vocab.get_vocab_size('source_token'),
            embedding_dim=embedding_dim,
            pretrained_file=pretrained_file)
        self.neg_samples = neg_samples
        self.cuda_device = cuda_device
        self.dropout = torch.nn.Dropout(dropout)
        self.verbose_metrics = verbose_metrics

        # Compute negative sampling probabilities
        # Based on https://github.com/mhagiwara/realworldnlp
        token_probs = {}
        token_counts = vocab._retained_counter['source_token']
        total_counts = float(sum(token_counts.values()))
        total_probs = 0.
        for token, counts in token_counts.items():
            adjusted_freq = math.pow(counts / total_counts, 0.75)
            token_probs[token] = adjusted_freq
            total_probs += adjusted_freq

        self.neg_sample_probs = np.ndarray(
            (vocab.get_vocab_size('source_token'), ))
        for idx, token in vocab.get_index_to_token_vocabulary(
                'source_token').items():
            self.neg_sample_probs[idx] = token_probs.get(token,
                                                         0) / total_probs

        initializer(self)
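
The 0.75-power frequency adjustment used above can be checked in isolation with toy counts (invented here):

import math

token_counts = {"the": 100, "cat": 10, "sat": 1}
total_counts = float(sum(token_counts.values()))

adjusted = {t: math.pow(c / total_counts, 0.75) for t, c in token_counts.items()}
total = sum(adjusted.values())
neg_sample_probs = {t: p / total for t, p in adjusted.items()}

# Frequent tokens are down-weighted relative to their raw frequency,
# rare tokens are up-weighted; the probabilities still sum to 1.
assert abs(sum(neg_sample_probs.values()) - 1.0) < 1e-9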
    def __init__(
        self,
        vocab: Vocabulary,
        transformer_model: str = "roberta-large",
        num_labels: Optional[int] = None,
        label_namespace: str = "labels",
        override_weights_file: Optional[str] = None,
        **kwargs,
    ) -> None:
        super().__init__(vocab, **kwargs)
        transformer_kwargs = {
            "model_name": transformer_model,
            "weights_path": override_weights_file,
        }
        self.embeddings = TransformerEmbeddings.from_pretrained_module(
            **transformer_kwargs)
        self.transformer_stack = TransformerStack.from_pretrained_module(
            **transformer_kwargs)
        self.pooler = TransformerPooler.from_pretrained_module(
            **transformer_kwargs)
        self.pooler_dropout = Dropout(p=0.1)

        self.label_tokens = vocab.get_index_to_token_vocabulary(
            label_namespace)
        if num_labels is None:
            num_labels = len(self.label_tokens)
        self.linear_layer = torch.nn.Linear(self.pooler.get_output_dim(),
                                            num_labels)
        self.linear_layer.weight.data.normal_(mean=0.0, std=0.02)
        self.linear_layer.bias.data.zero_()

        from allennlp.training.metrics import CategoricalAccuracy, FBetaMeasure

        self.loss = torch.nn.CrossEntropyLoss()
        self.acc = CategoricalAccuracy()
        self.f1 = FBetaMeasure()
Example #15
    def tensorize(self, vocab: Vocabulary):
        """
        Creates a list of tensors from the alias lookup.

        After dataset creation, we'll mainly want to work with alias lists as lists of padded
        tensors and their associated masks. This needs to be done **after** the vocabulary has
        been created. Accordingly, in our current approach, this method must be called in the
        forward pass of the model (since the operation is rather expensive we'll make sure that
        it doesn't do anything after the first time it is called).
        """
        # This operation is expensive, only do it once.
        if self.is_tensorized:
            return

        logger.debug('Tensorizing AliasDatabase')

        entity_idx_to_token = vocab.get_index_to_token_vocabulary(
            'raw_entity_ids')
        for i in range(len(entity_idx_to_token)):  # pylint: disable=C0200
            entity = entity_idx_to_token[i]
            try:
                tokenized_aliases = self._token_lookup[entity]
            except KeyError:
                # If we encounter non-entity tokens (e.g. padding and null) then just add
                # a blank placeholder - these should not be encountered during training.
                self._global_id_lookup.append(None)
                self._local_id_lookup.append(None)
                continue

            # Construct tensor of alias token indices from the global vocabulary.
            num_aliases = len(tokenized_aliases)
            max_alias_length = max(
                len(tokenized_alias) for tokenized_alias in tokenized_aliases)
            global_id_tensor = torch.zeros(num_aliases,
                                           max_alias_length,
                                           dtype=torch.int64,
                                           requires_grad=False)
            for j, tokenized_alias in enumerate(tokenized_aliases):
                for k, token in enumerate(tokenized_alias):
                    # WARNING: Extremely janky cast to string
                    global_id_tensor[j, k] = vocab.get_token_index(
                        str(token), 'tokens')
            self._global_id_lookup.append(global_id_tensor)

            # Convert array of local alias token indices into a tensor
            local_id_tensor = torch.tensor(self._id_array_lookup[entity],
                                           requires_grad=False)  # pylint: disable=not-callable
            self._local_id_lookup.append(local_id_tensor)

        # Build the tensorized token -> potential entities lookup.
        # NOTE: Initial approach will be to store just the necessary info to build one-hot vectors
        # on the fly since storing them will probably be way too expensive.
        token_idx_to_token = vocab.get_index_to_token_vocabulary('tokens')
        for i in range(len(token_idx_to_token)):
            token = token_idx_to_token[i]
            try:
                potential_entities = self._token_to_entity_lookup[token]
            except KeyError:
                self._token_id_to_entity_id_lookup.append(None)
            else:
                potential_entity_ids = torch.tensor([
                    vocab.get_token_index(str(x), 'entity_ids')
                    for x in potential_entities
                ],
                                                    dtype=torch.int64,
                                                    requires_grad=False)
                self._token_id_to_entity_id_lookup.append(potential_entity_ids)
        self._num_entities = vocab.get_vocab_size(
            'entity_ids')  # Needed to get one-hot vector length

        self.is_tensorized = True

        logger.debug('Done tensorizing AliasDatabase')
Example #16
def _read_pretrained_embedding_file(
        embeddings_filename: str,
        embedding_dim: int,
        vocab: Vocabulary,
        namespace: str = "tokens") -> torch.FloatTensor:
    """
    Reads a pre-trained embedding file and generates an Embedding layer that has weights
    initialized to the pre-trained embeddings.  The Embedding layer can either be trainable or
    not.

    We use the ``Vocabulary`` to map from the word strings in the embeddings file to the indices
    that we need, and to know which words from the embeddings file we can safely ignore.

    Parameters
    ----------
    embeddings_filename : str, required.
        The path to a file containing pretrained embeddings. The embeddings
        file is assumed to be gzipped and space delimited, e.g. [word] [dim 1] [dim 2] ...
    vocab : Vocabulary, required.
        A Vocabulary object.
    namespace : str, (optional, default=tokens)
        The namespace of the vocabulary to find pretrained embeddings for.
    trainable : bool, (optional, default=True)
        Whether or not the embedding parameters should be optimized.

    Returns
    -------
    A weight matrix with embeddings initialized from the read file.  The matrix has shape
    ``(vocab.get_vocab_size(namespace), embedding_dim)``, where the indices of words appearing in
    the pretrained embedding file are initialized to the pretrained embedding value.
    """
    words_to_keep = set(
        vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading embeddings from file")
    with gzip.open(cached_path(embeddings_filename), 'rb') as embeddings_file:
        expected_length = embedding_dim
        for line in embeddings_file:
            fields = line.decode('utf-8').strip().split(' ')
            if len(fields) - 1 != embedding_dim and len(
                    fields) - 1 != expected_length:
                # Sometimes there are funny unicode parsing problems that lead to different
                # fields lengths (e.g., a word with a unicode space character that splits
                # into more than one column).  We skip those lines.  Note that if you have
                # some kind of long header, this could result in all of your lines getting
                # skipped.  It's hard to check for that here; you just have to look in the
                # embedding_misses_file and at the model summary to make sure things look
                # like they are supposed to.
                logger.warning(
                    "Found line with wrong number of dimensions "
                    "(expected %d or %d, was %d): %s", embedding_dim,
                    expected_length,
                    len(fields) - 1, ' '.join(fields[:10]) + '[...]')
                try:
                    n1 = float(
                        fields[1])  # test that the second field is a number
                    assert len(
                        fields
                    ) - 1 > embedding_dim  # test that we could take a subset of the line
                    # if these tests pass, print a warning but use the vector and allow
                    # future vectors with the same length.
                    # NOTE TK TODO REMOVE: in future replace this by allowing user to specify
                    # both the 'actual' and 'desired' input embedding dimension.
                    logger.warning(
                        "Will change expected_length to %s and allow this and "
                        "similar vectors",
                        len(fields) - 1)
                    expected_length = len(fields) - 1
                except Exception:
                    logger.warning("Skipping...")
                    continue
            word = fields[0]
            if word in words_to_keep:
                vector = numpy.asarray(fields[1:embedding_dim + 1],
                                       dtype='float32')
                embeddings[word] = vector

    if not embeddings:
        raise ConfigurationError(
            "No embeddings of correct dimension found; you probably "
            "misspecified your embedding_dim parameter, or didn't "
            "pre-populate your Vocabulary")

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(
        embeddings_mean, embeddings_std)

    for i in range(0, vocab_size):
        word = vocab.get_token_from_index(i, namespace)

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if word in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[word])
        else:
            logger.debug(
                "Word %s was not found in the embedding file. Initialising randomly.",
                word)

    # The weight matrix is initialized, so we construct and return the actual Embedding.
    return embedding_matrix
Example #17
    def __init__(
        self,
        vocabulary: Vocabulary,
        image_feature_size: Tuple[int, int, int] = (1024, 14, 14),
        module_channels: int = 128,
        class_projection_channels: int = 1024,
        classifier_linear_size: int = 1024,
    ):
        super().__init__()
        self.vocabulary = vocabulary

        # Short-hand notations for convenience.
        __channels, __height, __width = image_feature_size

        # Exclude "@@UNKNOWN@@" answer token, our network will never generate this output through
        # regular forward pass. We set answer output as "@@UNKNOWN@@" when sampled programs are
        # invalid. __num_answers will be 28 for all practical purposes.
        __num_answers = len(
            vocabulary.get_index_to_token_vocabulary(namespace="answers")) - 1

        # The stem takes features from ResNet (or another feature extractor) and projects down to
        # a lower-dimensional space for sending through the Neural Module Network.
        self.stem = nn.Sequential(
            nn.Conv2d(image_feature_size[0],
                      module_channels,
                      kernel_size=3,
                      padding=1),
            nn.ReLU(),
            nn.Conv2d(module_channels,
                      module_channels,
                      kernel_size=3,
                      padding=1),
            nn.ReLU(),
        )
        # The classifier takes output of the last module (which will be a Query or Equal module)
        # and produces a distribution over answers.
        self.classifier = nn.Sequential(
            nn.Conv2d(module_channels,
                      class_projection_channels,
                      kernel_size=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            Flatten(),
            nn.Linear(class_projection_channels * __height * __width // 4,
                      classifier_linear_size),
            nn.ReLU(),
            nn.Linear(classifier_linear_size,
                      __num_answers),  # note no softmax here
        )

        # Instantiate a module for each program token in our vocabulary.
        self._function_modules: Dict[str, Type[nn.Module]] = {}
        for program_token in vocabulary.get_token_to_index_vocabulary(
                "programs"):

            # We don"t need modules for the placeholders.
            if program_token in [
                    "@@PADDING@@", "@@UNKNOWN@@", "@start@", "@end@", "unique"
            ]:
                continue

            # Figure out which module we want to use.
            if program_token == "scene":
                # "scene" is just a flag that indicates the start of a new line of reasoning
                # we set `module` to `None` because we still need the flag "scene" in forward()
                module = None
            elif program_token == "intersect":
                module = AndModule()
            elif program_token == "union":
                module = OrModule()
            elif "equal" in program_token or program_token in {
                    "less_than", "greater_than"
            }:
                module = ComparisonModule(module_channels)
            elif "query" in program_token or program_token in {
                    "exist", "count"
            }:
                module = QueryModule(module_channels)
            elif "relate" in program_token:
                module = RelateModule(module_channels)
            elif "same" in program_token:
                module = SameModule(module_channels)
            else:
                module = AttentionModule(module_channels)

            # Add the module to our dictionary and register its parameters so it can learn
            self._function_modules[program_token] = module  # type: ignore
            self.add_module(program_token, module)

        # Cross Entropy Loss for answer classification.
        self._loss = nn.CrossEntropyLoss(reduction="none")

        # Record accuracy while training and validation.
        self._answer_accuracy = BooleanAccuracy()

        # Record average number of invalid programs per batch.
        self._average_invalid_programs = Average()
Example #18
def _read_pretrained_word2vec_format_embedding_file(embeddings_filename: str, # pylint: disable=invalid-name
                                                    embedding_dim: int,
                                                    vocab: Vocabulary,
                                                    namespace: str = "tokens") -> torch.FloatTensor:
    """
    Read from a gzipped-word2vec format file.  The embeddings file is assumed to be gzipped and
    space delimited, e.g. [word] [dim 1] [dim 2] ...

    The remainder of the docstring is identical to ``_read_pretrained_embedding_file``.
    """
    words_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading embeddings from file")
    with gzip.open(cached_path(embeddings_filename), 'rb') as embeddings_file:
        for line in embeddings_file:
            fields = line.decode('utf-8').strip().split(' ')
            if len(fields) - 1 != embedding_dim:
                # Sometimes there are funny unicode parsing problems that lead to different
                # fields lengths (e.g., a word with a unicode space character that splits
                # into more than one column).  We skip those lines.  Note that if you have
                # some kind of long header, this could result in all of your lines getting
                # skipped.  It's hard to check for that here; you just have to look in the
                # embedding_misses_file and at the model summary to make sure things look
                # like they are supposed to.
                logger.warning("Found line with wrong number of dimensions (expected %d, was %d): %s",
                               embedding_dim, len(fields) - 1, line)
                continue
            word = fields[0]
            if word in words_to_keep:
                vector = numpy.asarray(fields[1:], dtype='float32')
                embeddings[word] = vector

    if not embeddings:
        raise ConfigurationError("No embeddings of correct dimension found; you probably "
                                 "misspecified your embedding_dim parameter, or didn't "
                                 "pre-populate your Vocabulary")

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(embeddings_mean,
                                                                            embeddings_std)

    for i in range(0, vocab_size):
        word = vocab.get_token_from_index(i, namespace)

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if word in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[word])
        else:
            logger.debug("Word %s was not found in the embedding file. Initialising randomly.", word)

    # The weight matrix is initialized, so we construct and return the actual Embedding.
    return embedding_matrix
Example #19
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder_0: Seq2SeqEncoder,
                 encoder_1: Seq2SeqEncoder,
                 encoder_2: Seq2SeqEncoder,
                 tag_representation_dim: int,
                 arc_representation_dim: int,
                 tag_feedforward: FeedForward = None,
                 arc_feedforward: FeedForward = None,
                 pos_tag_embedding: Embedding = None,
                 use_mst_decoding_for_validation: bool = True,
                 use_layer_normalization: bool = True,
                 dropout: float = 0.0,
                 input_dropout: float = 0.0,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(BiaffineDependencyParser, self).__init__(vocab, regularizer)

        a = vocab.get_index_to_token_vocabulary(namespace='tokens')
        # glyph_config['idx2word'] = {k: v for k, v in a.items()}

        # self.glyph = GlyphEmbedding(glyph_config)

        self.text_field_embedder = text_field_embedder

        self.encoder_0 = encoder_0
        self.encoder_1 = encoder_1
        self.encoder_2 = encoder_2

        encoder_dim = self.encoder_2.get_output_dim()

        self.head_arc_feedforward = arc_feedforward or \
                                        FeedForward(encoder_dim, 1,
                                                    arc_representation_dim,
                                                    Activation.by_name("elu")())
        self.child_arc_feedforward = copy.deepcopy(self.head_arc_feedforward)

        self.arc_attention = BilinearMatrixAttention(arc_representation_dim,
                                                     arc_representation_dim,
                                                     use_input_biases=True)

        num_labels = self.vocab.get_vocab_size("head_tags")

        self.head_tag_feedforward = tag_feedforward or \
                                        FeedForward(encoder_dim, 1,
                                                    tag_representation_dim,
                                                    Activation.by_name("elu")())
        self.child_tag_feedforward = copy.deepcopy(self.head_tag_feedforward)

        self.tag_bilinear = torch.nn.modules.Bilinear(tag_representation_dim,
                                                      tag_representation_dim,
                                                      num_labels)

        self._pos_tag_embedding = pos_tag_embedding or None
        self._dropout = InputVariationalDropout(dropout)
        # self._dropout = Dropout(dropout)
        self._input_dropout = Dropout(input_dropout)
        self._head_sentinel = torch.nn.Parameter(
            torch.randn([1, 1, self.encoder_2.get_output_dim()]))

        self.use_layer_normalization = use_layer_normalization

        if use_layer_normalization:
            self.norm_input = torch.nn.LayerNorm(
                self.encoder_0.get_input_dim())
            self.norm_hidden = torch.nn.LayerNorm(
                self.encoder_0.get_output_dim())

        representation_dim = text_field_embedder.get_output_dim()
        if pos_tag_embedding is not None:
            representation_dim += pos_tag_embedding.get_output_dim()

        # check_dimensions_match(representation_dim, encoder.get_input_dim(),
        #                        "text field embedding dim", "encoder input dim")

        check_dimensions_match(tag_representation_dim,
                               self.head_tag_feedforward.get_output_dim(),
                               "tag representation dim",
                               "tag feedforward output dim")
        check_dimensions_match(arc_representation_dim,
                               self.head_arc_feedforward.get_output_dim(),
                               "arc representation dim",
                               "arc feedforward output dim")

        self.use_mst_decoding_for_validation = use_mst_decoding_for_validation

        tags = self.vocab.get_token_to_index_vocabulary("pos")
        punctuation_tag_indices = {
            tag: index
            for tag, index in tags.items() if tag in POS_TO_IGNORE
        }
        self._pos_to_ignore = set(punctuation_tag_indices.values())
        logger.info(
            f"Found POS tags corresponding to the following punctuation : {punctuation_tag_indices}. "
            "Ignoring words with these POS tags for evaluation.")

        self._attachment_scores = AttachmentScores()
        initializer(self)
Example #20
def _read_pretrained_embedding_file(
        embeddings_filename: str,
        embedding_dim: int,
        vocab: Vocabulary,
        namespace: str = "tokens") -> torch.FloatTensor:
    """
    Reads a pre-trained embedding file and generates an Embedding layer that has weights
    initialized to the pre-trained embeddings.  The Embedding layer can either be trainable or
    not.

    We use the ``Vocabulary`` to map from the word strings in the embeddings file to the indices
    that we need, and to know which words from the embeddings file we can safely ignore.

    Parameters
    ----------
    embeddings_filename : str, required.
        The path to a file containing pretrained embeddings. The embeddings
        file is assumed to be gzipped and space delimited, e.g. [word] [dim 1] [dim 2] ...
    vocab : Vocabulary, required.
        A Vocabulary object.
    namespace : str, (optional, default=tokens)
        The namespace of the vocabulary to find pretrained embeddings for.
    trainable : bool, (optional, default=True)
        Whether or not the embedding parameters should be optimized.

    Returns
    -------
    A weight matrix with embeddings initialized from the read file.  The matrix has shape
    ``(vocab.get_vocab_size(namespace), embedding_dim)``, where the indices of words appearing in
    the pretrained embedding file are initialized to the pretrained embedding value.
    """
    words_to_keep = set(
        vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    words_found = set()
    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading embeddings from file; {}".format(len(words_to_keep)))
    with gzip.open(embeddings_filename, 'rb') as embeddings_file:
        for line in embeddings_file:
            fields = line.decode('utf-8').strip().split(' ')
            if len(fields) - 1 != embedding_dim:
                # Sometimes there are funny unicode parsing problems that lead to different
                # fields lengths (e.g., a word with a unicode space character that splits
                # into more than one column).  We skip those lines.  Note that if you have
                # some kind of long header, this could result in all of your lines getting
                # skipped.  It's hard to check for that here; you just have to look in the
                # embedding_misses_file and at the model summary to make sure things look
                # like they are supposed to.
                logger.warning(
                    "Found line with wrong number of dimensions (expected %d, was %d): %s",
                    embedding_dim,
                    len(fields) - 1, line)
                continue
            word = fields[0]
            if word in words_to_keep:
                words_found.add(word)
                vector = numpy.asarray(fields[1:], dtype='float32')
                embeddings[word] = vector
    notfound = words_to_keep.difference(words_found)
    logger.info("Emb load count: {}; Emb not found count: {}".format(
        len(words_found), len(notfound)))
    #"""
    with open("/home/kz918/bpe/eval/bidaf/not_found.txt",
              'w',
              encoding='utf-8') as f:
        for word in notfound:
            f.write(word)
            f.write('\n')
    #"""
    #assert len(notfound) < 10
    if not embeddings:
        raise ConfigurationError(
            "No embeddings of correct dimension found; you probably "
            "misspecified your embedding_dim parameter, or didn't "
            "pre-populate your Vocabulary")

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(
        embeddings_mean, embeddings_std)

    for i in range(0, vocab_size):
        word = vocab.get_token_from_index(i, namespace)

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if word in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[word])
        else:
            logger.debug(
                "Word %s was not found in the embedding file. Initialising randomly.",
                word)

    # The weight matrix is initialized, so we construct and return the actual Embedding.
    return embedding_matrix
Example #21
def _read_embeddings_from_text_file(
        file_uri: str,
        embedding_dim: int,
        vocab: Vocabulary,
        namespace: str = "tokens") -> torch.FloatTensor:
    """
    Read pre-trained word vectors from an eventually compressed text file, possibly contained
    inside an archive with multiple files. The text file is assumed to be utf-8 encoded with
    space-separated fields: [word] [dim 1] [dim 2] ...

    Lines that contain more numerical tokens than ``embedding_dim`` raise a warning and are skipped.

    The remainder of the docstring is identical to ``_read_pretrained_embeddings_file``.
    """
    tokens_to_keep = set(
        vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading pretrained embeddings from file")

    with EmbeddingsTextFile(file_uri) as embeddings_file:
        for line in Tqdm.tqdm(embeddings_file):
            token = line.split(' ', 1)[0]
            if token in tokens_to_keep:
                fields = line.rstrip().split(' ')
                if len(fields) - 1 != embedding_dim:
                    # Sometimes there are funny unicode parsing problems that lead to different
                    # fields lengths (e.g., a word with a unicode space character that splits
                    # into more than one column).  We skip those lines.  Note that if you have
                    # some kind of long header, this could result in all of your lines getting
                    # skipped.  It's hard to check for that here; you just have to look in the
                    # embedding_misses_file and at the model summary to make sure things look
                    # like they are supposed to.
                    logger.warning(
                        "Found line with wrong number of dimensions (expected: %d; actual: %d): %s",
                        embedding_dim,
                        len(fields) - 1, line)
                    continue

                vector = numpy.asarray(fields[1:], dtype='float32')
                embeddings[token] = vector

    if not embeddings:
        raise ConfigurationError(
            "No embeddings of correct dimension found; you probably "
            "misspecified your embedding_dim parameter, or didn't "
            "pre-populate your Vocabulary")

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(
        embeddings_mean, embeddings_std)
    num_tokens_found = 0
    index_to_token = vocab.get_index_to_token_vocabulary(namespace)
    for i in range(vocab_size):
        token = index_to_token[i]

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if token in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[token])
            num_tokens_found += 1
        else:
            logger.debug(
                "Token %s was not found in the embedding file. Initialising randomly.",
                token)

    logger.info("Pretrained embeddings were found for %d out of %d tokens",
                num_tokens_found, vocab_size)

    return embedding_matrix
    def __init__(self,
                 vocab: Vocabulary,
                 embedder: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 classifier: str = "linear",
                 alpha: float = 1.0,
                 learn_alpha: bool = False,
                 l2_to_sim: str = "negative",
                 squared_l2: bool = False,
                 truncate: bool = False,
                 embeds_per_label: int = 1,
                 label_namespace: str = "labels",
                 ):
        super().__init__(vocab)

        # str arg validation
        valid_classifiers = ["linear", "l2", "cos"]
        assert classifier in valid_classifiers, f"classifier must be in {valid_classifiers}"
        self.classifier = classifier
        valid_l2_to_sim = ["negative", "inverse"]
        assert l2_to_sim in valid_l2_to_sim, f"l2_to_sim must be in {valid_l2_to_sim}"
        self.l2_to_sim = l2_to_sim

        # encoder and embedder layers
        self.embedder = embedder
        self.encoder = encoder
        self.labels = vocab.get_index_to_token_vocabulary(namespace=label_namespace)
        self.num_labels = len(self.labels)
        self.embeds_per_label = embeds_per_label
        self.classifier_out = self.num_labels * embeds_per_label
        self.embed_dim = encoder.get_output_dim()

        # similarity/distance layer
        if classifier == "linear":
            self.classifier = nn.Linear(self.embed_dim, self.classifier_out)
        elif classifier == "l2":
            self.classifier = L2Linear(self.embed_dim, self.classifier_out, square=squared_l2)
        elif classifier == "cos":
            self.classifier = CosLinear(self.embed_dim, self.classifier_out)
        else:
            raise ValueError(f"Invalid classifier value: {classifier}")

        # truncate logits
        self.truncate = truncate
        if truncate:
            if classifier == "linear":
                self.threshold = nn.Parameter(torch.Tensor([0.1]))
            elif classifier == "cos":
                self.threshold = nn.Parameter(torch.Tensor([0.1]))
            elif classifier == "l2":
                if l2_to_sim == "negative":
                    self.threshold = nn.Parameter(torch.Tensor([float(self.embed_dim)]))
                elif l2_to_sim == "inverse":
                    self.threshold = nn.Parameter(torch.Tensor([-1.0]))
                else:
                    raise ValueError(f"Invalid l2_to_sim value: {l2_to_sim}")
            else:
                raise ValueError(f"Invalid classifier value: {classifier}")

        # scale logits by alpha
        self.alpha = nn.Parameter(torch.Tensor([alpha]))
        if not learn_alpha:
            self.alpha.requires_grad = False

        # metrics
        self.accuracy = CategoricalAccuracy()
        self.prf_metrics = {l: F1Measure(i) for i, l in self.labels.items()}
        self.avg_alpha = Average()
        if self.truncate:
            self.trunc_avg_num = Average()
            self.trunc_avg_untrunc_num = Average()
            self.trunc_avg_threshold = Average()
            self.trunc_avg_sim = Average()
Example #23
def get_pretrained_embedding_layer(embeddings_filename: str,
                                   vocab: Vocabulary,
                                   namespace: str = "tokens",
                                   trainable: bool = True):
    """
    Reads a pre-trained embedding file and generates an Embedding layer that has weights
    initialized to the pre-trained embeddings.  The Embedding layer can either be trainable or
    not.

    We use the ``Vocabulary`` to map from the word strings in the embeddings file to the indices
    that we need, and to know which words from the embeddings file we can safely ignore.

    Parameters
    ----------

    embeddings_filename : str, required.
        The path to a file containing pretrained embeddings. The embeddings
        file is assumed to be gzipped and space delimited, e.g. [word] [dim 1] [dim 2] ...
    vocab : Vocabulary, required.
        A Vocabulary object.
    namespace : str, (optional, default=tokens)
        The namespace of the vocabulary to find pretrained embeddings for.
    trainable : bool, (optional, default=True)
        Whether or not the embedding parameters should be optimized.

    Returns
    -------

    An Embedding Module initialised with a weight matrix of shape
    (vocab.get_vocab_size(namespace), pretrained_embedding_dim),
    where the indices of words appearing in the pretrained embedding file
    are initialized to the pretrained embedding value.

    """
    words_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}
    embedding_dim = None

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading embeddings from file")
    with gzip.open(embeddings_filename, 'rb') as embeddings_file:
        for line in embeddings_file:
            fields = line.decode('utf-8').strip().split(' ')
            if embedding_dim is None:
                embedding_dim = len(fields) - 1
                assert embedding_dim > 1, "Found embedding size of 1; do you have a header?"
            else:
                if len(fields) - 1 != embedding_dim:
                    # Sometimes there are funny unicode parsing problems that lead to different
                    # fields lengths (e.g., a word with a unicode space character that splits
                    # into more than one column).  We skip those lines.  Note that if you have
                    # some kind of long header, this could result in all of your lines getting
                    # skipped.  It's hard to check for that here; you just have to look in the
                    # embedding_misses_file and at the model summary to make sure things look
                    # like they are supposed to.
                    continue
            word = fields[0]
            if word in words_to_keep:
                vector = numpy.asarray(fields[1:], dtype='float32')
                embeddings[word] = vector

    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(0, 1)

    for i in range(0, vocab_size):
        word = vocab.get_token_from_index(i, namespace)

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if word in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[word])
        else:
            logger.debug("Word %s was not found in the embedding file. Initialising randomly.", word)

    # The weight matrix is initialized, so we construct and return the actual Embedding.
    return Embedding(num_embeddings=vocab_size,
                     embedding_dim=embedding_dim,
                     padding_index=0,
                     weight=embedding_matrix,
                     trainable=trainable)
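
# Usage sketch for ``get_pretrained_embedding_layer``. The toy vocabulary and the file path
# below are assumptions for illustration only; the embeddings file must be gzipped,
# space-delimited word2vec text as described in the docstring.
toy_vocab = Vocabulary(counter={"tokens": {"the": 3, "cat": 2, "sat": 1}})
embedding_layer = get_pretrained_embedding_layer("embeddings/glove.6B.100d.txt.gz",
                                                 toy_vocab,
                                                 namespace="tokens",
                                                 trainable=False)
token_ids = torch.LongTensor([[toy_vocab.get_token_index("the", "tokens"),
                               toy_vocab.get_token_index("cat", "tokens")]])
embedded = embedding_layer(token_ids)  # shape: (1, 2, embedding_dim)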
Example #24
def _read_embeddings_from_text_file(file_uri: str,
                                    embedding_dim: int,
                                    vocab: Vocabulary,
                                    namespace: str = "tokens") -> torch.FloatTensor:
    """
    Read pre-trained word vectors from a (possibly compressed) text file, which may itself be
    contained inside an archive with multiple files. The text file is assumed to be utf-8 encoded
    with space-separated fields: [word] [dim 1] [dim 2] ...

    Lines that contain a different number of numerical fields than ``embedding_dim`` trigger a
    warning and are skipped.

    The remainder of the docstring is identical to ``_read_pretrained_embeddings_file``.
    """
    tokens_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading pretrained embeddings from file")

    with EmbeddingsTextFile(file_uri) as embeddings_file:
        for line in Tqdm.tqdm(embeddings_file):
            token = line.split(' ', 1)[0]
            if token in tokens_to_keep:
                fields = line.rstrip().split(' ')
                if len(fields) - 1 != embedding_dim:
                    # Sometimes there are funny unicode parsing problems that lead to different
                    # fields lengths (e.g., a word with a unicode space character that splits
                    # into more than one column).  We skip those lines.  Note that if you have
                    # some kind of long header, this could result in all of your lines getting
                    # skipped.  It's hard to check for that here; you just have to look in the
                    # embedding_misses_file and at the model summary to make sure things look
                    # like they are supposed to.
                    logger.warning("Found line with wrong number of dimensions (expected: %d; actual: %d): %s",
                                   embedding_dim, len(fields) - 1, line)
                    continue

                vector = numpy.asarray(fields[1:], dtype='float32')
                embeddings[token] = vector

    if not embeddings:
        raise ConfigurationError("No embeddings of correct dimension found; you probably "
                                 "misspecified your embedding_dim parameter, or didn't "
                                 "pre-populate your Vocabulary")

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(embeddings_mean,
                                                                            embeddings_std)
    num_tokens_found = 0
    index_to_token = vocab.get_index_to_token_vocabulary(namespace)
    for i in range(vocab_size):
        token = index_to_token[i]

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if token in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[token])
            num_tokens_found += 1
        else:
            logger.debug("Token %s was not found in the embedding file. Initialising randomly.", token)

    logger.info("Pretrained embeddings were found for %d out of %d tokens",
                num_tokens_found, vocab_size)

    return embedding_matrix
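
# The function above returns only the weight matrix; a caller would typically wrap it in an
# embedding module. A minimal sketch, assuming a toy vocabulary and a hypothetical
# "glove.6B.50d.txt" file; only the call to ``_read_embeddings_from_text_file`` itself comes
# from the code above.
toy_vocab = Vocabulary(counter={"tokens": {"the": 2, "cat": 1}})
weights = _read_embeddings_from_text_file("glove.6B.50d.txt", embedding_dim=50, vocab=toy_vocab)
embedding = torch.nn.Embedding.from_pretrained(weights, freeze=False)  # rows follow vocab indices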
Example #25
    def __init__(
        self,
        vocab: Vocabulary,
        embedder: str,
        encoder: Seq2VecEncoder,
        classifier: str = "linear",
        alpha: float = 1.0,
        learn_alpha: bool = False,
        l2_to_sim: str = "negative",
        squared_l2: bool = False,
        truncate: bool = False,
        embeds_per_label: int = 1,
        label_namespace: str = "labels",
        attention_layer: str = "first",
        finetune_bert: bool = True,
        trunc_ratio: float = 0.1,
    ):
        super().__init__(vocab)

        # str arg validation
        assert embedder in EMBEDDERS.keys(), \
            f"embedder must be in {list(EMBEDDERS.keys())}"
        valid_classifiers = ["linear", "l2", "cos"]
        assert classifier in valid_classifiers, f"classifier must be in {valid_classifiers}"
        self.classifier_type = classifier
        valid_l2_to_sim = ["negative", "inverse"]
        assert l2_to_sim in valid_l2_to_sim, f"l2_to_sim must be in {valid_l2_to_sim}"
        self.l2_to_sim = l2_to_sim
        self.finetune_bert = finetune_bert
        self.squared_l2 = squared_l2
        self.trunc_ratio = trunc_ratio

        # encoder and embedder layers
        self.embedder = EMBEDDERS[embedder].from_pretrained(
            embedder, output_attentions=True)
        self.encoder = encoder
        self.labels = vocab.get_index_to_token_vocabulary(
            namespace=label_namespace)
        self.num_labels = len(self.labels)
        self.embeds_per_label = embeds_per_label
        self.classifier_out = self.num_labels * embeds_per_label
        self.embed_dim = encoder.get_output_dim()
        self.attention_layer = attention_layer

        # similarity/distance layer
        if self.classifier_type == "linear":
            self.classifier = nn.Linear(self.embed_dim, self.classifier_out)
        elif self.classifier_type == "l2":
            self.classifier = L2Linear(self.embed_dim,
                                       self.classifier_out,
                                       square=squared_l2)
        elif self.classifier_type == "cos":
            self.classifier = CosLinear(self.embed_dim, self.classifier_out)
        else:
            raise ValueError(f"Invalid classifier value: {classifier}")

        # truncate logits
        self.truncate = truncate
        if truncate:
            # compute threshold values from a dummy learnable embedding
            # for stable and sufficient gradient updates
            self.trunc_embed = nn.Parameter(torch.ones(self.embed_dim) * 0.5)

        # scale logits by alpha
        self.alpha = nn.Parameter(torch.Tensor([alpha]))
        if not learn_alpha:
            self.alpha.requires_grad = False

        # metrics
        self.accuracy = CategoricalAccuracy()
        self.prf_metrics = {l: F1Measure(i) for i, l in self.labels.items()}
        self.avg_alpha = Average()
        if self.truncate:
            self.trunc_avg_total_num = Average()
            self.trunc_avg_trunc_num = Average()
            self.trunc_avg_untrunc_num = Average()
            self.trunc_avg_threshold = Average()
            self.trunc_avg_sim = Average()
            self.trunc_pre_avg_sim = Average()
            self.trunc_avg_sim_std = Average()
            self.trunc_pre_avg_sim_std = Average()
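
# ``L2Linear`` and ``CosLinear`` are defined elsewhere in this project and are not shown here.
# As a rough illustration of the idea only (not the project's actual implementation), a
# cosine-similarity "linear" layer can be written as a weight matrix whose rows act as learned
# class prototypes:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CosLinearSketch(nn.Module):
    """Hypothetical sketch: logits are cosine similarities to per-class weight vectors."""

    def __init__(self, in_features: int, out_features: int):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(out_features, in_features) * 0.02)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Normalise both the inputs and the class prototypes, then take dot products,
        # so every logit lies in [-1, 1].
        return F.normalize(x, dim=-1) @ F.normalize(self.weight, dim=-1).t()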
Example #26
def _read_pretrained_word2vec_format_embedding_file(
        embeddings_filename: str,  # pylint: disable=invalid-name
        embedding_dim: int,
        vocab: Vocabulary,
        namespace: str = "tokens") -> torch.FloatTensor:
    """
    Read from a gzipped-word2vec format file.  The embeddings file is assumed to be gzipped and
    space delimited, e.g. [word] [dim 1] [dim 2] ...

    The remainder of the docstring is identical to ``_read_pretrained_embedding_file``.
    """
    words_to_keep = set(
        vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading embeddings from file")
    with gzip.open(cached_path(embeddings_filename), 'rb') as embeddings_file:
        for line in embeddings_file:
            fields = line.decode('utf-8').rstrip().split(' ')
            if len(fields) - 1 != embedding_dim:
                # Sometimes there are funny unicode parsing problems that lead to different
                # fields lengths (e.g., a word with a unicode space character that splits
                # into more than one column).  We skip those lines.  Note that if you have
                # some kind of long header, this could result in all of your lines getting
                # skipped.  It's hard to check for that here; you just have to look in the
                # embedding_misses_file and at the model summary to make sure things look
                # like they are supposed to.
                logger.warning(
                    "Found line with wrong number of dimensions (expected %d, was %d): %s",
                    embedding_dim,
                    len(fields) - 1, line)
                continue
            word = fields[0]
            if word in words_to_keep:
                vector = numpy.asarray(fields[1:], dtype='float32')
                embeddings[word] = vector

    if not embeddings:
        raise ConfigurationError(
            "No embeddings of correct dimension found; you probably "
            "misspecified your embedding_dim parameter, or didn't "
            "pre-populate your Vocabulary")

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(
        embeddings_mean, embeddings_std)

    for i in range(0, vocab_size):
        word = vocab.get_token_from_index(i, namespace)

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if word in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[word])
        else:
            logger.debug(
                "Word %s was not found in the embedding file. Initialising randomly.",
                word)

    # The weight matrix is initialized, so we return it; the caller can wrap it in an Embedding.
    return embedding_matrix
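
# The expected input is plain word2vec text, gzipped: one token followed by ``embedding_dim``
# floats per line. A tiny self-contained example that writes such a file (the file name and the
# 3-d vectors are made up for illustration; the commented call assumes a ``vocab`` built over
# those tokens):
import gzip

def write_toy_embeddings(path: str = "toy_vectors.txt.gz") -> None:
    lines = ["the 0.1 0.2 0.3", "cat -0.4 0.5 0.6"]
    with gzip.open(path, "wt", encoding="utf-8") as f:
        f.write("\n".join(lines) + "\n")

# matrix = _read_pretrained_word2vec_format_embedding_file("toy_vectors.txt.gz",
#                                                          embedding_dim=3,
#                                                          vocab=vocab)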
def get_glove_embedder(num_embeddings: int, embedding_dim: int, vocab: Vocabulary, namespace: str = "tokens") -> Embedding:
    """
    Reads GloVe vectors from a hard-coded text file and returns an ``Embedding`` module whose
    weight matrix is initialised from them, with random rows for out-of-vocabulary tokens.
    """
    tokens_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading pre-trained embeddings from file")

    with open("../embeddings/glove/glove840B300d.txt", encoding="utf8") as embeddings_file:
        for line in Tqdm.tqdm(embeddings_file):
            token = line.split(' ', 1)[0]
            if token in tokens_to_keep:
                fields = line.rstrip().split(' ')
                if len(fields) - 1 != embedding_dim:
                    # Sometimes there are funny unicode parsing problems that lead to different
                    # fields lengths (e.g., a word with a unicode space character that splits
                    # into more than one column).  We skip those lines.  Note that if you have
                    # some kind of long header, this could result in all of your lines getting
                    # skipped.  It's hard to check for that here; you just have to look in the
                    # embedding_misses_file and at the model summary to make sure things look
                    # like they are supposed to.
                    logger.warning("Found line with wrong number of dimensions (expected: %d; actual: %d): %s",
                                   embedding_dim, len(fields) - 1, line)
                    continue
                
                vector = np.asarray(fields[1:], dtype='float32')
                embeddings[token] = vector
    
    if not embeddings:
        raise ConfigurationError("No embeddings of correct dimension found; you probably "
                                "misspecified your embedding_dim parameter, or didn't "
                                "pre-populate your Vocabulary")

    all_embeddings = np.asarray(list(embeddings.values()))
    embeddings_mean = float(np.mean(all_embeddings))
    embeddings_std = float(np.std(all_embeddings))

    logger.info("Embedding mean: %f, std: %f", embeddings_mean, embeddings_std)

    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(embeddings_mean, embeddings_std)

    num_tokens_found = 0
    index_to_token = vocab.get_index_to_token_vocabulary(namespace)

    for i in range(vocab_size):
        token = index_to_token[i]
        
        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if token in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[token])
            num_tokens_found += 1
        else:
            logger.debug("Token %s was not found in the embedding file. Initialising randomly.", token)
    
    logger.info("Pretrained embeddings were found for %d out of %d tokens",
                num_tokens_found, vocab_size)

    # Initialise a GloVe Embedding module with the precomputed weight matrix
    # (``num_embeddings`` is expected to equal ``vocab_size`` so the shapes match).
    glove_embedder = Embedding(num_embeddings, embedding_dim, weight=embedding_matrix, padding_index=0)

    return glove_embedder
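
# Usage sketch: the returned Embedding is usually wrapped in a text-field embedder so it can be
# applied to the output of a TextField. ``BasicTextFieldEmbedder`` is AllenNLP's standard
# wrapper; the vocabulary and the 300-d size below are assumptions for illustration.
#
# from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
#
# glove = get_glove_embedder(num_embeddings=vocab.get_vocab_size("tokens"),
#                            embedding_dim=300,
#                            vocab=vocab)
# text_field_embedder = BasicTextFieldEmbedder({"tokens": glove})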