Example #1
import logging

import tensorflow as tf

# Project helpers such as load_pretrained_embeddings and get_initializer, and
# the TYPE constant, are assumed to be provided elsewhere in the codebase.

logger = logging.getLogger(__name__)


def embedding_matrix(
    vocab,
    embedding_size,
    representation='dense',
    embeddings_trainable=True,
    pretrained_embeddings=None,
    force_embedding_size=False,
    embedding_initializer=None,
):
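    """Builds an embedding weight table (tf.Variable) for `vocab` and returns it
    together with the (possibly adjusted) embedding size."""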
    vocab_size = len(vocab)
    if representation == 'dense':
        if pretrained_embeddings is not None and pretrained_embeddings is not False:
            embeddings_matrix = load_pretrained_embeddings(
                pretrained_embeddings, vocab)
            if embeddings_matrix.shape[-1] != embedding_size:
                raise ValueError(
                    'The size of the pretrained embeddings is {}, '
                    'but the specified embedding_size is {}. '
                    'Please change the embedding_size accordingly.'.format(
                        embeddings_matrix.shape[-1], embedding_size))
            embedding_initializer_obj = tf.constant(embeddings_matrix,
                                                    dtype=tf.float32)

        else:
            if vocab_size < embedding_size and not force_embedding_size:
                logger.info(
                    '  embedding_size ({}) is greater than vocab_size ({}). '
                    'Setting embedding size to be equal to vocab_size.'.format(
                        embedding_size, vocab_size))
                embedding_size = vocab_size

            if embedding_initializer is not None:
                embedding_initializer_obj_ref = get_initializer(
                    embedding_initializer)
            else:
                embedding_initializer_obj_ref = get_initializer({
                    TYPE: 'uniform',
                    'minval': -1.0,
                    'maxval': 1.0
                })
            embedding_initializer_obj = embedding_initializer_obj_ref(
                [vocab_size, embedding_size])

        embeddings = tf.Variable(embedding_initializer_obj,
                                 trainable=embeddings_trainable,
                                 name='embeddings')

    elif representation == 'sparse':
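        # Sparse representation: a fixed one-hot (identity) embedding table
        # that is never trained; embedding_size is forced to vocab_size.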
        embedding_size = vocab_size
        embeddings = tf.Variable(
            get_initializer('identity')([vocab_size, embedding_size]),
            trainable=False,
            name='embeddings')

    else:
        raise Exception('Embedding representation {} not supported.'.format(
            representation))

    return embeddings, embedding_size
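
A minimal usage sketch, assuming TensorFlow 2.x is installed and that the project's get_initializer helper returns a tensor of the requested shape; the vocabulary below is purely illustrative:

# Hypothetical call: the requested embedding_size (8) exceeds vocab_size (5),
# so the function clamps the embedding size to 5.
vocab = ['<PAD>', '<UNK>', 'cat', 'dog', 'fish']
embeddings, size = embedding_matrix(vocab, embedding_size=8)
print(embeddings.shape, size)  # (5, 5) 5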
Example #2
import logging
from typing import Dict, List, Optional, Tuple, Union

import torch
import torch.nn as nn

# Project helpers such as load_pretrained_embeddings and get_initializer, and
# the TYPE constant, are assumed to be provided elsewhere in the codebase.

logger = logging.getLogger(__name__)


def embedding_matrix(
    vocab: List[str],
    embedding_size: int,
    representation: str = "dense",
    embeddings_trainable: bool = True,
    pretrained_embeddings: Optional[str] = None,
    force_embedding_size: bool = False,
    embedding_initializer: Optional[Union[str, Dict]] = None,
) -> Tuple[nn.Module, int]:
    """Returns initialized torch.nn.Embedding module and embedding size."""

    vocab_size = len(vocab)
    if representation == "dense":
        if pretrained_embeddings:
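            # Pretrained embeddings: load vectors for the vocabulary and, unless
            # force_embedding_size is set, adopt their dimensionality.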
            embeddings_matrix = load_pretrained_embeddings(pretrained_embeddings, vocab)
            if embeddings_matrix.shape[-1] != embedding_size:
                if not force_embedding_size:
                    embedding_size = embeddings_matrix.shape[-1]
                    logger.info(f"Setting embedding size to be equal to {embeddings_matrix.shape[-1]}.")
                else:
                    raise ValueError(
                        f"The size of the pretrained embeddings is "
                        f"{embeddings_matrix.shape[-1]}, but the specified "
                        f"embedding_size is {embedding_size}. Please change "
                        f"the embedding_size accordingly."
                    )
            embedding_initializer_obj = torch.tensor(embeddings_matrix, dtype=torch.float32)

        else:
            if vocab_size < embedding_size and not force_embedding_size:
                logger.info(
                    f"  embedding_size ({embedding_size}) is greater than "
                    f"vocab_size ({vocab_size}). Setting embedding size to be "
                    f"equal to vocab_size."
                )
                embedding_size = vocab_size

            if embedding_initializer is not None:
                embedding_initializer_obj_ref = get_initializer(embedding_initializer)
            else:
                embedding_initializer_obj_ref = get_initializer({TYPE: "uniform", "a": -1.0, "b": 1.0})
            embedding_initializer_obj = embedding_initializer_obj_ref([vocab_size, embedding_size])

        embeddings = embedding_initializer_obj

    elif representation == "sparse":
        embedding_size = vocab_size
        embeddings = get_initializer("identity")([vocab_size, embedding_size])
        embeddings.requires_grad = False
    else:
        raise Exception(f"Embedding representation {representation} not supported.")

    embeddings = nn.Embedding.from_pretrained(embeddings, freeze=not embeddings_trainable)
    return embeddings, embedding_size
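
A minimal usage sketch for the PyTorch variant, assuming the project's get_initializer helper returns a torch.Tensor of the requested shape; the vocabulary is illustrative:

# Hypothetical call: build a frozen one-hot ("sparse") embedding table for a
# 4-word vocabulary and look up two indices.
vocab = ["a", "b", "c", "d"]
embeddings, size = embedding_matrix(vocab, embedding_size=2, representation="sparse",
                                     embeddings_trainable=False)
output = embeddings(torch.tensor([1, 3]))
print(output.shape, size)  # torch.Size([2, 4]) 4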