def test_read_from_file(self, lazy):
        reader = LanguageModelingReader(tokens_per_instance=3, lazy=lazy)

        instances = ensure_list(
            reader.read('tests/fixtures/data/language_modeling.txt'))
        # The last potential instance is left out, which is ok, because we don't have an end token
        # in here, anyway.
        assert len(instances) == 5

        assert [t.text for t in instances[0].fields["input_tokens"].tokens
                ] == ["This", "is", "a"]
        assert [t.text for t in instances[0].fields["output_tokens"].tokens
                ] == ["is", "a", "sentence"]

        assert [t.text for t in instances[1].fields["input_tokens"].tokens
                ] == ["sentence", "for", "language"]
        assert [t.text for t in instances[1].fields["output_tokens"].tokens
                ] == ["for", "language", "modelling"]

        assert [t.text for t in instances[2].fields["input_tokens"].tokens
                ] == ["modelling", ".", "Here"]
        assert [t.text for t in instances[2].fields["output_tokens"].tokens
                ] == [".", "Here", "'s"]

        assert [t.text for t in instances[3].fields["input_tokens"].tokens
                ] == ["'s", "another", "one"]
        assert [t.text for t in instances[3].fields["output_tokens"].tokens
                ] == ["another", "one", "for"]

        assert [t.text for t in instances[4].fields["input_tokens"].tokens
                ] == ["for", "extra", "language"]
        assert [t.text for t in instances[4].fields["output_tokens"].tokens
                ] == ["extra", "language", "modelling"]
Example #2
    def test_read_from_file(self, lazy):
        reader = LanguageModelingReader(tokens_per_instance=3, lazy=lazy)

        instances = ensure_list(
            reader.read(AllenNlpTestCase.FIXTURES_ROOT / u'data' /
                        u'language_modeling.txt'))
        # The last potential instance is left out, which is ok, because we don't have an end token
        # in here, anyway.
        assert len(instances) == 5

        assert [t.text for t in instances[0].fields[u"input_tokens"].tokens
                ] == [u"This", u"is", u"a"]
        assert [t.text for t in instances[0].fields[u"output_tokens"].tokens
                ] == [u"is", u"a", u"sentence"]

        assert [t.text for t in instances[1].fields[u"input_tokens"].tokens
                ] == [u"sentence", u"for", u"language"]
        assert [t.text for t in instances[1].fields[u"output_tokens"].tokens
                ] == [u"for", u"language", u"modelling"]

        assert [t.text for t in instances[2].fields[u"input_tokens"].tokens
                ] == [u"modelling", u".", u"Here"]
        assert [t.text for t in instances[2].fields[u"output_tokens"].tokens
                ] == [u".", u"Here", u"'s"]

        assert [t.text for t in instances[3].fields[u"input_tokens"].tokens
                ] == [u"'s", u"another", u"one"]
        assert [t.text for t in instances[3].fields[u"output_tokens"].tokens
                ] == [u"another", u"one", u"for"]

        assert [t.text for t in instances[4].fields[u"input_tokens"].tokens
                ] == [u"for", u"extra", u"language"]
        assert [t.text for t in instances[4].fields[u"output_tokens"].tokens
                ] == [u"extra", u"language", u"modelling"]
Example #3
    def test_read_from_file(self):
        reader = LanguageModelingReader(self.TRAIN_FILE,
                                        tokens_per_instance=4)

        dataset = reader.read()
        instances = dataset.instances
        assert instances[0].fields()["input_tokens"].tokens() == ["<S>", "this", "is", "a", "sentence"]
        assert instances[1].fields()["input_tokens"].tokens() == ["<S>", "for", "language", "modelling", "."]
        assert instances[2].fields()["input_tokens"].tokens() == ["<S>", "here", "'s", "another", "one"]
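This older-API variant hands the path to the reader's constructor and takes it from self.TRAIN_FILE, which is set up outside the snippet. A sketch of the kind of fixture setup it presumes (the class name and path here are assumptions, not the original code):

class TestLanguageModelingReader(AllenNlpTestCase):
    def setUp(self):
        super(TestLanguageModelingReader, self).setUp()
        # Hypothetical fixture path; the original defines TRAIN_FILE elsewhere.
        self.TRAIN_FILE = 'tests/fixtures/data/language_modeling.txt'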
Example #4
    def test_read_from_file(self):
        reader = LanguageModelingReader(tokens_per_instance=4)

        dataset = reader.read('tests/fixtures/data/language_modeling.txt')
        instances = dataset.instances
        assert instances[0].fields()["input_tokens"].tokens() == [
            "<S>", "This", "is", "a", "sentence"
        ]
        assert instances[1].fields()["input_tokens"].tokens() == [
            "<S>", "for", "language", "modelling", "."
        ]
        assert instances[2].fields()["input_tokens"].tokens() == [
            "<S>", "Here", "'s", "another", "one"
        ]
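These assertions show the reader prepending a start symbol: each instance is "<S>" followed by the next tokens_per_instance raw tokens, so consecutive instances tile the text with stride 4. A standalone sketch reproducing the input_tokens pattern (hypothetical helper, not the library code):

def chunk_with_start_token(tokens, tokens_per_instance, start_token="<S>"):
    # Prepend the start symbol to each stride-sized window of raw tokens,
    # so every input sequence carries tokens_per_instance + 1 elements.
    for start in range(0, len(tokens) - tokens_per_instance + 1, tokens_per_instance):
        yield [start_token] + tokens[start:start + tokens_per_instance]

tokens = "This is a sentence for language modelling . Here 's another one".split()
chunks = list(chunk_with_start_token(tokens, 4))
assert chunks[0] == ["<S>", "This", "is", "a", "sentence"]
assert chunks[1] == ["<S>", "for", "language", "modelling", "."]
assert chunks[2] == ["<S>", "Here", "'s", "another", "one"]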
Example #5
    def test_read_from_file(self, lazy):
        # The LanguageModelingReader is deprecated, but we don't want this test to fail
        # before we remove it.
        warnings.simplefilter(action="ignore", category=DeprecationWarning)
        reader = LanguageModelingReader(tokens_per_instance=3, lazy=lazy)

        instances = ensure_list(
            reader.read(AllenNlpTestCase.FIXTURES_ROOT / "data" /
                        "language_modeling.txt"))
        # The last potential instance is left out, which is ok, because we don't have an end token
        # in here, anyway.
        assert len(instances) == 5

        assert [t.text for t in instances[0].fields["input_tokens"].tokens
                ] == ["This", "is", "a"]
        assert [t.text for t in instances[0].fields["output_tokens"].tokens
                ] == ["is", "a", "sentence"]

        assert [t.text for t in instances[1].fields["input_tokens"].tokens
                ] == ["sentence", "for", "language"]
        assert [t.text for t in instances[1].fields["output_tokens"].tokens
                ] == ["for", "language", "modelling"]

        assert [t.text for t in instances[2].fields["input_tokens"].tokens
                ] == ["modelling", ".", "Here"]
        assert [t.text for t in instances[2].fields["output_tokens"].tokens
                ] == [".", "Here", "'s"]

        assert [t.text for t in instances[3].fields["input_tokens"].tokens
                ] == ["'s", "another", "one"]
        assert [t.text for t in instances[3].fields["output_tokens"].tokens
                ] == ["another", "one", "for"]

        assert [t.text for t in instances[4].fields["input_tokens"].tokens
                ] == ["for", "extra", "language"]
        assert [t.text for t in instances[4].fields["output_tokens"].tokens
                ] == ["extra", "language", "modelling"]
# Imports below assume a pre-1.0 AllenNLP, where LanguageModelingReader,
# BasicIterator, and Trainer live at these paths.
from pathlib import Path

import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split

from allennlp.data.dataset_readers import LanguageModelingReader
from allennlp.data.iterators import BasicIterator
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import CharacterTokenizer
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.nn.util import get_text_field_mask
from allennlp.training.trainer import Trainer
from model.custom_lstm import NaiveLSTM, OptimizedLSTM

DATA_ROOT = Path("../data/brown")

N_EPOCHS = 1

char_tokenizer = CharacterTokenizer(lowercase_characters=True)

reader = LanguageModelingReader(
    tokens_per_instance=500,
    tokenizer=char_tokenizer,
    token_indexers={"tokens": SingleIdTokenIndexer()},
)

train_ds = reader.read(DATA_ROOT / "brown.txt")
train_ds, val_ds = train_test_split(train_ds, random_state=0, test_size=0.1)

vocab = Vocabulary.from_instances(train_ds)

iterator = BasicIterator(batch_size=32)
iterator.index_with(vocab)
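# Sanity-check sketch (assumes the pre-1.0 AllenNLP iterator API): calling the
# iterator on the dataset yields dicts of index tensors, keyed by field name
# and then by the "tokens" indexer configured on the reader above, e.g.
#   batch = next(iterator(train_ds, num_epochs=1))
#   batch["input_tokens"]["tokens"].shape  # (batch_size, tokens_per_instance)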


def train(model: nn.Module, epochs: int = 10):
    trainer = Trainer(
        model=model.cuda() if torch.cuda.is_available() else model,