def test_read_from_file(self, lazy):
    reader = LanguageModelingReader(tokens_per_instance=3, lazy=lazy)
    instances = ensure_list(reader.read('tests/fixtures/data/language_modeling.txt'))
    # The last potential instance is left out, which is ok, because we don't have an end token
    # in here, anyway.
    assert len(instances) == 5
    assert [t.text for t in instances[0].fields["input_tokens"].tokens] == ["This", "is", "a"]
    assert [t.text for t in instances[0].fields["output_tokens"].tokens] == ["is", "a", "sentence"]
    assert [t.text for t in instances[1].fields["input_tokens"].tokens] == ["sentence", "for", "language"]
    assert [t.text for t in instances[1].fields["output_tokens"].tokens] == ["for", "language", "modelling"]
    assert [t.text for t in instances[2].fields["input_tokens"].tokens] == ["modelling", ".", "Here"]
    assert [t.text for t in instances[2].fields["output_tokens"].tokens] == [".", "Here", "'s"]
    assert [t.text for t in instances[3].fields["input_tokens"].tokens] == ["'s", "another", "one"]
    assert [t.text for t in instances[3].fields["output_tokens"].tokens] == ["another", "one", "for"]
    assert [t.text for t in instances[4].fields["input_tokens"].tokens] == ["for", "extra", "language"]
    assert [t.text for t in instances[4].fields["output_tokens"].tokens] == ["extra", "language", "modelling"]
def test_read_from_file(self, lazy):
    reader = LanguageModelingReader(tokens_per_instance=3, lazy=lazy)
    instances = ensure_list(reader.read(AllenNlpTestCase.FIXTURES_ROOT / u'data' / u'language_modeling.txt'))
    # The last potential instance is left out, which is ok, because we don't have an end token
    # in here, anyway.
    assert len(instances) == 5
    assert [t.text for t in instances[0].fields[u"input_tokens"].tokens] == [u"This", u"is", u"a"]
    assert [t.text for t in instances[0].fields[u"output_tokens"].tokens] == [u"is", u"a", u"sentence"]
    assert [t.text for t in instances[1].fields[u"input_tokens"].tokens] == [u"sentence", u"for", u"language"]
    assert [t.text for t in instances[1].fields[u"output_tokens"].tokens] == [u"for", u"language", u"modelling"]
    assert [t.text for t in instances[2].fields[u"input_tokens"].tokens] == [u"modelling", u".", u"Here"]
    assert [t.text for t in instances[2].fields[u"output_tokens"].tokens] == [u".", u"Here", u"'s"]
    assert [t.text for t in instances[3].fields[u"input_tokens"].tokens] == [u"'s", u"another", u"one"]
    assert [t.text for t in instances[3].fields[u"output_tokens"].tokens] == [u"another", u"one", u"for"]
    assert [t.text for t in instances[4].fields[u"input_tokens"].tokens] == [u"for", u"extra", u"language"]
    assert [t.text for t in instances[4].fields[u"output_tokens"].tokens] == [u"extra", u"language", u"modelling"]
def test_read_from_file(self):
    reader = LanguageModelingReader(self.TRAIN_FILE, tokens_per_instance=4)
    dataset = reader.read()
    instances = dataset.instances
    assert instances[0].fields()["input_tokens"].tokens() == ["<S>", "this", "is", "a", "sentence"]
    assert instances[1].fields()["input_tokens"].tokens() == ["<S>", "for", "language", "modelling", "."]
    assert instances[2].fields()["input_tokens"].tokens() == ["<S>", "here", "'s", "another", "one"]
def test_read_from_file(self):
    reader = LanguageModelingReader(tokens_per_instance=4)
    dataset = reader.read('tests/fixtures/data/language_modeling.txt')
    instances = dataset.instances
    assert instances[0].fields()["input_tokens"].tokens() == ["<S>", "This", "is", "a", "sentence"]
    assert instances[1].fields()["input_tokens"].tokens() == ["<S>", "for", "language", "modelling", "."]
    assert instances[2].fields()["input_tokens"].tokens() == ["<S>", "Here", "'s", "another", "one"]
def test_read_from_file(self, lazy):
    # The LanguageModelingReader is deprecated, but we don't want this test to fail
    # before we remove it.
    warnings.simplefilter(action="ignore", category=DeprecationWarning)
    reader = LanguageModelingReader(tokens_per_instance=3, lazy=lazy)
    instances = ensure_list(reader.read(AllenNlpTestCase.FIXTURES_ROOT / "data" / "language_modeling.txt"))
    # The last potential instance is left out, which is ok, because we don't have an end token
    # in here, anyway.
    assert len(instances) == 5
    assert [t.text for t in instances[0].fields["input_tokens"].tokens] == ["This", "is", "a"]
    assert [t.text for t in instances[0].fields["output_tokens"].tokens] == ["is", "a", "sentence"]
    assert [t.text for t in instances[1].fields["input_tokens"].tokens] == ["sentence", "for", "language"]
    assert [t.text for t in instances[1].fields["output_tokens"].tokens] == ["for", "language", "modelling"]
    assert [t.text for t in instances[2].fields["input_tokens"].tokens] == ["modelling", ".", "Here"]
    assert [t.text for t in instances[2].fields["output_tokens"].tokens] == [".", "Here", "'s"]
    assert [t.text for t in instances[3].fields["input_tokens"].tokens] == ["'s", "another", "one"]
    assert [t.text for t in instances[3].fields["output_tokens"].tokens] == ["another", "one", "for"]
    assert [t.text for t in instances[4].fields["input_tokens"].tokens] == ["for", "extra", "language"]
    assert [t.text for t in instances[4].fields["output_tokens"].tokens] == ["extra", "language", "modelling"]
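# For reference: the assertions in the tests above imply that the fixture file
# tests/fixtures/data/language_modeling.txt contains a single line roughly like
# the following. This is a reconstruction from the expected tokens (16 word
# tokens, so with tokens_per_instance=3 the trailing partial instance is
# dropped), not the verbatim file contents:
#
#     This is a sentence for language modelling. Here's another one for extra language modelling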
from pathlib import Path

import torch
from torch import nn
from sklearn.model_selection import train_test_split

from allennlp.data.dataset_readers import LanguageModelingReader
from allennlp.data.iterators import BasicIterator
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import CharacterTokenizer
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.nn.util import get_text_field_mask
from allennlp.training.trainer import Trainer
from model.custom_lstm import NaiveLSTM, OptimizedLSTM

DATA_ROOT = Path("../data/brown")
N_EPOCHS = 1

# Read the Brown corpus character by character, 500 characters per instance.
char_tokenizer = CharacterTokenizer(lowercase_characters=True)
reader = LanguageModelingReader(
    tokens_per_instance=500,
    tokenizer=char_tokenizer,
    token_indexers={"tokens": SingleIdTokenIndexer()},
)
train_ds = reader.read(DATA_ROOT / "brown.txt")
train_ds, val_ds = train_test_split(train_ds, random_state=0, test_size=0.1)

vocab = Vocabulary.from_instances(train_ds)
iterator = BasicIterator(batch_size=32)
iterator.index_with(vocab)

def train(model: nn.Module, epochs: int = 10):
    # The original snippet was cut off inside this call; the arguments after
    # `model` are a plausible completion based on the standard allennlp 0.x
    # Trainer signature, not the author's original settings.
    trainer = Trainer(
        model=model.cuda() if torch.cuda.is_available() else model,
        optimizer=torch.optim.Adam(model.parameters()),
        iterator=iterator,
        train_dataset=train_ds,
        validation_dataset=val_ds,
        num_epochs=epochs,
        cuda_device=0 if torch.cuda.is_available() else -1,
    )
    trainer.train()
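# A minimal sketch of wiring up a model and invoking train() with the pieces
# above. `CharLM` is a hypothetical class written for illustration only, and it
# assumes OptimizedLSTM exposes a Seq2SeqEncoder-style interface
# (forward(embedded, mask) plus get_output_dim()) and an
# (input_size, hidden_size) constructor -- none of which is confirmed by the
# original script.
from allennlp.nn.util import sequence_cross_entropy_with_logits

class CharLM(Model):
    def __init__(self, vocab: Vocabulary, embedder: BasicTextFieldEmbedder, encoder) -> None:
        super().__init__(vocab)
        self.embedder = embedder
        self.encoder = encoder
        # Project encoder states onto the character vocabulary.
        self.projection = nn.Linear(encoder.get_output_dim(), vocab.get_vocab_size("tokens"))

    def forward(self, input_tokens, output_tokens):
        # Field names match what LanguageModelingReader produces.
        mask = get_text_field_mask(input_tokens)
        encoded = self.encoder(self.embedder(input_tokens), mask)
        logits = self.projection(encoded)
        # Next-character prediction loss against the shifted targets.
        loss = sequence_cross_entropy_with_logits(logits, output_tokens["tokens"], mask)
        return {"loss": loss}

token_embedding = Embedding(num_embeddings=vocab.get_vocab_size("tokens"), embedding_dim=64)
embedder = BasicTextFieldEmbedder({"tokens": token_embedding})
model = CharLM(vocab, embedder, OptimizedLSTM(input_size=64, hidden_size=128))
train(model, epochs=N_EPOCHS)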