# Assumed imports for the snippets below (texcla / text-classification-keras;
# the module paths follow that project's README and may differ):
from texcla import experiment
from texcla.data import Dataset
from texcla.models import (TokenModelFactory, YoonKimCNN, AlexCNN,
                           AttentionRNN, StackedRNN, BasicRNN)
from texcla.preprocessing import SpacyTokenizer, SimpleTokenizer
from texcla.corpus import imdb


def train():
    # `path` and `max_len` are assumed module-level settings (see the driver below)
    ds = Dataset.load(path)
    X_train, _, y_train, _ = ds.train_val_split()

    # sanity check: decode a few training examples back to text
    print(ds.tokenizer.decode_texts(X_train[:10]))
    print(y_train[:10])

    # RNN models can use `max_tokens=None` to indicate variable length words per mini-batch.
    factory = TokenModelFactory(
        2, ds.tokenizer.token_index, max_tokens=max_len, embedding_type='glove.6B.300d')
    # alternative embedding:
    # 2, ds.tokenizer.token_index, max_tokens=max_len, embedding_type='fasttext.simple')

    # choose a token encoder; BasicRNN is the active choice here
    # word_encoder_model = YoonKimCNN()
    # word_encoder_model = AlexCNN(dropout_rate=[0, 0])
    # word_encoder_model = AttentionRNN()
    # word_encoder_model = StackedRNN()
    word_encoder_model = BasicRNN()

    model = factory.build_model(
        token_encoder_model=word_encoder_model, trainable_embeddings=False)
    model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()
    model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.1)
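
# Hypothetical driver for train() above. `path` and `max_len` are assumed
# module-level settings of the original script; the values here are only
# placeholders for illustration.
path = 'data.bin'   # assumption: dataset previously produced by experiment.setup_data
max_len = 400       # assumption: fixed padded sequence length used at preprocessing time

if __name__ == '__main__':
    train()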


def test_train():
    X, y, _, _ = imdb(10)

    # use the special tokenizer used for constructing the embeddings
    tokenizer = SpacyTokenizer()

    # preprocess data (once)
    experiment.setup_data(X, y, tokenizer, 'data.bin', max_len=100)

    # load data
    ds = Dataset.load('data.bin')

    # construct base
    factory = TokenModelFactory(
        ds.num_classes, ds.tokenizer.token_index, max_tokens=100,
        embedding_type='glove.6B.50d', embedding_dims=50)

    # choose a model
    word_encoder_model = YoonKimCNN()

    # build a model
    model = factory.build_model(
        token_encoder_model=word_encoder_model, trainable_embeddings=False)

    # use experiment.train as wrapper for Keras.fit()
    experiment.train(x=ds.X, y=ds.y, validation_split=0.1, model=model,
                     word_encoder_model=word_encoder_model, epochs=1, batch_size=32)
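
# Rough, hedged sketch of what experiment.setup_data(X, y, tokenizer, 'data.bin',
# max_len=100) presumably amounts to, written only with the tokenizer/Dataset
# calls exercised in test_token_preprocessing below; the library's real
# implementation may differ (e.g. in how labels are encoded).
def setup_data_sketch(X, y, tokenizer, path, max_len=100):
    tokenizer.build_vocab(X)                                    # fit vocabulary on raw texts
    X_enc = tokenizer.encode_texts(X)                           # texts -> token id sequences
    X_pad = tokenizer.pad_sequences(X_enc, fixed_token_seq_length=max_len)
    Dataset(X_pad, y, tokenizer=tokenizer).save(path)           # persist for Dataset.load()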


def train(word_encoder_model, lr, batch_size, results_base_dir):
    # `proc_path` and `max_len` are assumed module-level settings (see below)
    ds = Dataset.load(proc_path)

    factory = TokenModelFactory(
        ds.num_classes, ds.tokenizer.token_index, max_tokens=max_len,
        embedding_type="fasttext.wiki.de", embedding_dims=300)

    model = factory.build_model(
        token_encoder_model=word_encoder_model, trainable_embeddings=False)

    # NOTE: lr and results_base_dir are accepted but not used in this snippet;
    # batch_size is forwarded to experiment.train (as in the other snippets).
    experiment.train(x=ds.X, y=ds.y, validation_split=0.1, model=model,
                     word_encoder_model=word_encoder_model, epochs=5,
                     batch_size=batch_size)
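
# Hypothetical invocation of the train() variant above; `proc_path` and
# `max_len` are assumed module-level settings, so placeholder values are
# defined here purely for illustration.
proc_path = 'data_de.bin'   # assumption: preprocessed dataset location
max_len = 400               # assumption: fixed padded sequence length

if __name__ == '__main__':
    train(word_encoder_model=YoonKimCNN(), lr=0.001, batch_size=32,
          results_base_dir='results/')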


def test_token_preprocessing(tmpdir):
    tokenizer = SpacyTokenizer()

    X = ['hello', 'world', 'welcome', 'earth']
    y = [0, 1, 0, 1]

    tokenizer.build_vocab(X)
    assert len(tokenizer.token_index) - len(tokenizer.special_token) == 4

    X_enc = tokenizer.encode_texts(X)
    X_fin = tokenizer.pad_sequences(X_enc, fixed_token_seq_length=50)

    ds = Dataset(X_fin, y, tokenizer=tokenizer)
    path = str(tmpdir.mkdir("data").join("test"))
    ds.save(path)

    ds_new = Dataset.load(path)

    # only first word
    assert all(a == b for a, b in zip(ds_new.X[0], X_fin[0]))
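
# Hedged round-trip example for the tokenizer API used above; decode_texts is
# the call already used in the first train() snippet, so this only illustrates
# the assumed symmetry between encode_texts and decode_texts.
def tokenizer_roundtrip_example():
    tokenizer = SpacyTokenizer()
    texts = ['hello world', 'welcome earth']
    tokenizer.build_vocab(texts)
    encoded = tokenizer.encode_texts(texts)
    print(tokenizer.decode_texts(encoded))  # expected to print the tokenized texts back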


def test_train_multi_label():
    X = ['what is up', 'yes yes', 'no no no']
    y = [["foo", "bar"], ["foo"], ["bar", "haha"]]

    # use the special tokenizer used for constructing the embeddings
    tokenizer = SimpleTokenizer()

    # preprocess data (once)
    experiment.setup_data(X, y, tokenizer, 'data.bin', max_len=100)

    # load data
    ds = Dataset.load('data.bin')

    # construct base
    factory = TokenModelFactory(
        ds.num_classes, ds.tokenizer.token_index, max_tokens=100,
        embedding_type='glove.6B.50d', embedding_dims=50)

    # choose a model
    word_encoder_model = YoonKimCNN()

    # build a model (sigmoid output for multi-label classification)
    model = factory.build_model(
        token_encoder_model=word_encoder_model, trainable_embeddings=False,
        output_activation="sigmoid")

    # use experiment.train as wrapper for Keras.fit()
    experiment.train(x=ds.X, y=ds.y, validation_split=0.1, model=model,
                     word_encoder_model=word_encoder_model, epochs=1, batch_size=32)
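
# Hedged example of reading off multi-label predictions after training: the
# model built by TokenModelFactory is a regular Keras model, so its sigmoid
# outputs can be thresholded per class (0.5 is an assumed, illustrative cutoff).
def predict_multi_label_example(model, ds, threshold=0.5):
    probs = model.predict(ds.X[:2])           # shape: (samples, num_classes)
    return (probs >= threshold).astype(int)   # multi-hot label indicators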