Exemplo n.º 1
0
def test_document_bidirectional_lstm_embeddings():
    """Embed one sentence with a bidirectional document RNN over stacked
    GloVe + char-LM embeddings; check the reported vector length and that
    clearing embeddings leaves an empty vector."""
    sentence, glove, charlm = init_document_embeddings()

    document_embeddings = DocumentRNNEmbeddings(
        [glove, charlm],
        hidden_size=128,
        bidirectional=True,
    )
    document_embeddings.embed(sentence)

    # Expected length for this configuration is 512, and it must agree with
    # what the embedder itself reports.
    assert len(sentence.get_embedding()) == 512
    assert len(sentence.get_embedding()) == document_embeddings.embedding_length

    sentence.clear_embeddings()
    assert len(sentence.get_embedding()) == 0
Exemplo n.º 2
0
def test_keep_batch_order():
    """Embedding the same two sentences in two different batch orders must
    neither reorder the input batches nor change the per-sentence vectors."""
    (sentence, glove, charlm) = init_document_embeddings()
    embeddings = DocumentRNNEmbeddings([glove])
    sentences_1 = [Sentence('First sentence'),
                   Sentence('This is second sentence')]
    sentences_2 = [Sentence('This is second sentence'),
                   Sentence('First sentence')]
    embeddings.embed(sentences_1)
    embeddings.embed(sentences_2)
    # embed() must not reorder the sentences it was given.
    assert (sentences_1[0].to_original_text() == 'First sentence')
    assert (sentences_1[1].to_original_text() == 'This is second sentence')
    # Matching texts embedded at different batch positions must get identical
    # vectors. BUG FIX: the second assert previously repeated the first
    # comparison ([0] vs [1]) instead of checking the other matching pair.
    assert (torch.norm(
        (sentences_1[0].embedding - sentences_2[1].embedding)) == 0.0)
    assert (torch.norm(
        (sentences_1[1].embedding - sentences_2[0].embedding)) == 0.0)
Exemplo n.º 3
0
def test_keep_batch_order():
    """Embedding two batches with the same sentences in swapped order must
    preserve batch order and produce identical vectors for identical texts.

    NOTE(review): relies on a module-level `glove` defined outside this
    snippet.
    """
    embeddings = DocumentRNNEmbeddings([glove])
    sentences_1 = [Sentence("First sentence"), Sentence("This is second sentence")]
    sentences_2 = [Sentence("This is second sentence"), Sentence("First sentence")]

    embeddings.embed(sentences_1)
    embeddings.embed(sentences_2)

    # embed() must not reorder its input batch.
    assert sentences_1[0].to_original_text() == "First sentence"
    assert sentences_1[1].to_original_text() == "This is second sentence"

    # Same text at different batch positions -> identical vectors.
    # BUG FIX: the second check previously duplicated the first comparison
    # ([0] vs [1]) instead of comparing the other matching pair.
    assert torch.norm(sentences_1[0].embedding - sentences_2[1].embedding) == 0.0
    assert torch.norm(sentences_1[1].embedding - sentences_2[0].embedding) == 0.0
    del embeddings
Exemplo n.º 4
0
def test_fine_tunable_flair_embedding():
    """Wrap a fine-tunable forward char LM in both RNN and LM document
    embeddings and verify the embedding lengths each produces."""
    language_model_forward = LanguageModel(
        Dictionary.load('chars'), is_forward_lm=True, hidden_size=32, nlayers=1)

    # Document RNN on top of the fine-tunable Flair embedding.
    rnn_embeddings = DocumentRNNEmbeddings(
        [FlairEmbeddings(language_model_forward, fine_tune=True)],
        hidden_size=128,
        bidirectional=False,
    )
    sentence = Sentence('I love Berlin.')
    rnn_embeddings.embed(sentence)
    assert len(sentence.get_embedding()) == 128
    assert len(sentence.get_embedding()) == rnn_embeddings.embedding_length
    sentence.clear_embeddings()
    assert len(sentence.get_embedding()) == 0

    # Plain document-LM embedding over the same language model.
    lm_embeddings = DocumentLMEmbeddings(
        [FlairEmbeddings(language_model_forward, fine_tune=True)])
    sentence = Sentence('I love Berlin.')
    lm_embeddings.embed(sentence)
    assert len(sentence.get_embedding()) == 32
    assert len(sentence.get_embedding()) == lm_embeddings.embedding_length
    sentence.clear_embeddings()
    assert len(sentence.get_embedding()) == 0
Exemplo n.º 5
0
import numpy as np
from pandas import read_csv
import pickle

from flair.embeddings import WordEmbeddings, DocumentRNNEmbeddings, Sentence

# Read the ABC-news headline corpus; error_bad_lines=False silently drops
# malformed CSV rows. NOTE(review): this flag is deprecated in pandas >= 1.3
# in favour of on_bad_lines="skip" — confirm the pinned pandas version.
data = read_csv('data/abcnews-date-text.csv', error_bad_lines=False)
# Flatten the single 'headline_text' column into a plain list of strings.
documents = data[['headline_text']].values.reshape(-1).tolist()
# documents = list(pickle.load(open( "./corpus/df_proyectosFECYT.pkl", "rb" ) )['LEMAS_UC3M'])

# Document embeddings: GloVe word vectors fed through a document RNN.
glove_embedding = WordEmbeddings('glove')
document_embeddings = DocumentRNNEmbeddings([glove_embedding], hidden_size=512)
embeddings = []  # one embedding (as a plain Python list) per document

count = 0  # progress counter

try:
    for document in documents:
        count += 1
        sentence = Sentence(document)

        document_embeddings.embed(sentence)

        embeddings.append(sentence.get_embedding().tolist())

        # Progress heartbeat every 1000 documents.
        if (count % 1000 == 0): print(count)

finally:  # In case an error occurs before finish, we store previous results
    embedings_array = np.array(embeddings)
    np.save("embeds_abcnews_512_2.npy", embedings_array)
Exemplo n.º 6
0
class EasyDocumentEmbeddings:
    """ Document Embeddings generated by pool and rnn methods applied to the word embeddings of text

    Usage:

    ```python
    >>> embeddings = adaptnlp.EasyDocumentEmbeddings("bert-base-cased", "xlnet-base-cased", methods["rnn"])
    ```

    **Parameters:**

    * ***embeddings** - Non-keyword variable number of strings referring to model names or paths
    * **methods** - A list of strings to specify which document embeddings to use i.e. ["rnn", "pool"] (avoids unnecessary loading of models if only using one)
    * **configs** - A dictionary of configurations for flair's rnn and pool document embeddings
    ```python
    >>>example_configs = {"pool_configs": {"fine_tune_mode": "linear", "pooling": "mean", },
    ...                   "rnn_configs": {"hidden_size": 512,
    ...                                   "rnn_layers": 1,
    ...                                   "reproject_words": True,
    ...                                   "reproject_words_dimension": 256,
    ...                                   "bidirectional": False,
    ...                                   "dropout": 0.5,
    ...                                   "word_dropout": 0.0,
    ...                                   "locked_dropout": 0.0,
    ...                                   "rnn_type": "GRU",
    ...                                   "fine_tune": True, },
    ...                  }
    ```
    """

    __allowed_methods = ["rnn", "pool"]
    __allowed_configs = ("pool_configs", "rnn_configs")

    # Class-level default configs. __init__ takes None sentinels and copies
    # from here, avoiding the mutable-default-argument pitfall (list/dict
    # defaults are created once and shared by every call and instance).
    _default_configs = {
        "pool_configs": {
            "fine_tune_mode": "linear",
            "pooling": "mean"
        },
        "rnn_configs": {
            "hidden_size": 512,
            "rnn_layers": 1,
            "reproject_words": True,
            "reproject_words_dimension": 256,
            "bidirectional": False,
            "dropout": 0.5,
            "word_dropout": 0.0,
            "locked_dropout": 0.0,
            "rnn_type": "GRU",
            "fine_tune": True,
        },
    }

    def __init__(
        self,
        *embeddings: str,
        methods: List[str] = None,  # None -> ["rnn", "pool"]
        configs: Dict = None,  # None -> copy of _default_configs
    ):
        print("May need a couple moments to instantiate...")
        # FIX: replace mutable list/dict defaults with None sentinels so the
        # defaults are not shared (and mutable) across instances.
        if methods is None:
            methods = list(self.__class__.__allowed_methods)
        if configs is None:
            configs = {k: dict(v) for k, v in self._default_configs.items()}

        self.embedding_stack = []

        # Check methods
        for m in methods:
            assert m in self.__class__.__allowed_methods

        # Set configs for pooling and rnn parameters
        for k, v in configs.items():
            assert k in self.__class__.__allowed_configs
            setattr(self, k, v)

        # Load the correct Embeddings module for each requested model
        for model_name_or_path in embeddings:
            self.embedding_stack.append(
                self._load_embeddings(model_name_or_path))

        assert len(self.embedding_stack) != 0
        if "pool" in methods:
            self.pool_embeddings = DocumentPoolEmbeddings(
                self.embedding_stack, **self.pool_configs)
            print("Pooled embedding loaded")
        if "rnn" in methods:
            self.rnn_embeddings = DocumentRNNEmbeddings(
                self.embedding_stack, **self.rnn_configs)
            print("RNN embeddings loaded")

    @staticmethod
    def _load_embeddings(model_name_or_path: str):
        """Resolve a model name or path to the matching Flair embeddings class.

        Raises ValueError when no embeddings can be found for the key.
        """
        if "bert" in model_name_or_path and "roberta" not in model_name_or_path:
            return BertEmbeddings(model_name_or_path)
        elif "roberta" in model_name_or_path:
            return RoBERTaEmbeddings(model_name_or_path)
        elif "gpt2" in model_name_or_path:
            return OpenAIGPT2Embeddings(model_name_or_path)
        elif "xlnet" in model_name_or_path:
            return XLNetEmbeddings(model_name_or_path)
        elif "xlm" in model_name_or_path:
            return XLMEmbeddings(model_name_or_path)
        elif ("flair" in model_name_or_path
              or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
            return FlairEmbeddings(model_name_or_path)
        try:
            return WordEmbeddings(model_name_or_path)
        except ValueError:
            raise ValueError(
                f"Embeddings not found for the model key: {model_name_or_path}, check documentation or custom model path to verify specified model"
            )

    @staticmethod
    def _to_sentences(
        text: Union[List[Sentence], Sentence, List[str], str],
    ) -> List[Sentence]:
        """Normalize any accepted text input into a list of Flair `Sentence`s.

        Shared by embed_pool/embed_rnn (previously duplicated in both).
        """
        if isinstance(text, str):
            return [Sentence(text)]
        if isinstance(text, list) and all(isinstance(t, str) for t in text):
            return [Sentence(t) for t in text]
        if isinstance(text, Sentence):
            return [text]
        return text

    def embed_pool(
        self,
        text: Union[List[Sentence], Sentence, List[str], str],
    ) -> List[Sentence]:
        """ Stacked embeddings via `DocumentPoolEmbeddings`

        * **text** - Text input, it can be a string or any of Flair's `Sentence` input formats
        **return** - A list of Flair's `Sentence`s
        """
        sentences = self._to_sentences(text)
        self.pool_embeddings.embed(sentences)
        return sentences

    def embed_rnn(
        self,
        text: Union[List[Sentence], Sentence, List[str], str],
    ) -> List[Sentence]:
        """ Stacked embeddings via `DocumentRNNEmbeddings`

        * **text** - Text input, it can be a string or any of Flair's `Sentence` input formats
        **return** - A list of Flair's `Sentence`s
        """
        sentences = self._to_sentences(text)
        self.rnn_embeddings.embed(sentences)
        return sentences
Exemplo n.º 7
0
class EasyDocumentEmbeddings:
    """Document Embeddings generated by pool and rnn methods applied to the word embeddings of text

    Usage:

    ```python
    >>> embeddings = adaptnlp.EasyDocumentEmbeddings("bert-base-cased", "xlnet-base-cased", methods["rnn"])
    ```

    **Parameters:**

    * `*embeddings` - Non-keyword variable number of strings referring to model names or paths
    * `methods` - A list of strings to specify which document embeddings to use i.e. ["rnn", "pool"] (avoids unnecessary loading of models if only using one)
    * `configs` - A dictionary of configurations for flair's rnn and pool document embeddings
    ```python
    >>> example_configs = {"pool_configs": {"fine_tune_mode": "linear", "pooling": "mean", },
    ...                   "rnn_configs": {"hidden_size": 512,
    ...                                   "rnn_layers": 1,
    ...                                   "reproject_words": True,
    ...                                   "reproject_words_dimension": 256,
    ...                                   "bidirectional": False,
    ...                                   "dropout": 0.5,
    ...                                   "word_dropout": 0.0,
    ...                                   "locked_dropout": 0.0,
    ...                                   "rnn_type": "GRU",
    ...                                   "fine_tune": True, },
    ...                  }
    ```
    """

    __allowed_methods = ["rnn", "pool"]
    __allowed_configs = ("pool_configs", "rnn_configs")

    # Class-level default configs. __init__ takes None sentinels and copies
    # from here, avoiding the mutable-default-argument pitfall (list/dict
    # defaults are created once and shared by every call and instance).
    _default_configs = {
        "pool_configs": {
            "fine_tune_mode": "linear",
            "pooling": "mean"
        },
        "rnn_configs": {
            "hidden_size": 512,
            "rnn_layers": 1,
            "reproject_words": True,
            "reproject_words_dimension": 256,
            "bidirectional": False,
            "dropout": 0.5,
            "word_dropout": 0.0,
            "locked_dropout": 0.0,
            "rnn_type": "GRU",
            "fine_tune": True,
        },
    }

    def __init__(
        self,
        *embeddings: str,
        methods: List[str] = None,  # None -> ["rnn", "pool"]
        configs: Dict = None,  # None -> copy of _default_configs
    ):
        print("May need a couple moments to instantiate...")
        # FIX: replace mutable list/dict defaults with None sentinels so the
        # defaults are not shared (and mutable) across instances.
        if methods is None:
            methods = list(self.__class__.__allowed_methods)
        if configs is None:
            configs = {k: dict(v) for k, v in self._default_configs.items()}

        self.embedding_stack = []

        # Check methods
        for m in methods:
            assert m in self.__class__.__allowed_methods

        # Set configs for pooling and rnn parameters
        for k, v in configs.items():
            assert k in self.__class__.__allowed_configs
            setattr(self, k, v)

        # Load correct Embeddings module
        for model_name_or_path in embeddings:
            self.embedding_stack.append(
                _get_embedding_model(model_name_or_path))

        assert len(self.embedding_stack) != 0
        if "pool" in methods:
            self.pool_embeddings = DocumentPoolEmbeddings(
                self.embedding_stack, **self.pool_configs)
            print("Pooled embedding loaded")
        if "rnn" in methods:
            self.rnn_embeddings = DocumentRNNEmbeddings(
                self.embedding_stack, **self.rnn_configs)
            print("RNN embeddings loaded")

    def embed_pool(
        self,
        text: Union[List[Sentence], Sentence, List[str], str],
    ) -> List[Sentence]:
        """Generate stacked embeddings with `DocumentPoolEmbeddings`

        **Parameters**:
        * `text` - Text input, it can be a string or any of Flair's `Sentence` input formats

        **Return**:
        * A list of Flair's `Sentence`s
        """
        sentences = _make_sentences(text, as_list=True)
        self.pool_embeddings.embed(sentences)
        return sentences

    def embed_rnn(
        self,
        text: Union[List[Sentence], Sentence, List[str], str],
    ) -> List[Sentence]:
        """Generate stacked embeddings with `DocumentRNNEmbeddings`

        **Parameters**:
        * `text` - Text input, it can be a string or any of Flair's `Sentence` input formats

        **Return**:
        * A list of Flair's `Sentence`s
        """
        sentences = _make_sentences(text, as_list=True)
        self.rnn_embeddings.embed(sentences)
        return sentences