Example #1
def train():
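    # `path` and `max_len` are assumed to be module-level settings defined elsewhere in the original script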
    ds = Dataset.load(path)
    X_train, _, y_train, _ = ds.train_val_split()

    print(ds.tokenizer.decode_texts(X_train[:10]))

    print(y_train[:10])

    # RNN models can use `max_tokens=None` to allow variable-length sequences per mini-batch.
    factory = TokenModelFactory(
        2, ds.tokenizer.token_index, max_tokens=max_len, embedding_type='glove.6B.300d')
    # 2, ds.tokenizer.token_index, max_tokens=max_len, embedding_type='fasttext.simple')
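    # Sketch of the variable-length option noted above (an assumption, not from the original):
    # factory = TokenModelFactory(
    #     2, ds.tokenizer.token_index, max_tokens=None, embedding_type='glove.6B.300d')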

    # word_encoder_model = YoonKimCNN()
    # word_encoder_model = AlexCNN(dropout_rate=[0, 0])
    # word_encoder_model = AttentionRNN()
    # word_encoder_model = StackedRNN()
    word_encoder_model = BasicRNN()
    model = factory.build_model(
        token_encoder_model=word_encoder_model, trainable_embeddings=False)

    model.compile(optimizer='sgd',
                  loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()

    model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.1)
Example #2
def test_train():
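    # imdb() is assumed to be a corpus helper bundled with the library's test utilities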
    X, y, _, _ = imdb(10)

    # use the special tokenizer used for constructing the embeddings
    tokenizer = SpacyTokenizer()

    # preprocess data (once)
    experiment.setup_data(X, y, tokenizer, 'data.bin', max_len=100)

    # load data
    ds = Dataset.load('data.bin')

    # construct base
    factory = TokenModelFactory(ds.num_classes,
                                ds.tokenizer.token_index,
                                max_tokens=100,
                                embedding_type='glove.6B.50d',
                                embedding_dims=50)

    # choose a model
    word_encoder_model = YoonKimCNN()

    # build a model
    model = factory.build_model(token_encoder_model=word_encoder_model,
                                trainable_embeddings=False)

    # use experiment.train as a wrapper around Keras fit()
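    # (assumed to be roughly model.fit(ds.X, ds.y, validation_split=0.1, epochs=1, batch_size=32) plus the library's bookkeeping)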
    experiment.train(x=ds.X,
                     y=ds.y,
                     validation_split=0.1,
                     model=model,
                     word_encoder_model=word_encoder_model,
                     epochs=1,
                     batch_size=32)
Example #3
File: ml.py  Project: jfilter/fds-util
def train(word_encoder_model, lr, batch_size, results_base_dir):
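    # proc_path and max_len are assumed to be module-level settings in the original ml.py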
    ds = Dataset.load(proc_path)

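    # same pattern as Example #2, but with pre-trained German fastText (wiki) embeddings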
    factory = TokenModelFactory(ds.num_classes,
                                ds.tokenizer.token_index,
                                max_tokens=max_len,
                                embedding_type="fasttext.wiki.de",
                                embedding_dims=300)

    model = factory.build_model(token_encoder_model=word_encoder_model,
                                trainable_embeddings=False)

    experiment.train(x=ds.X,
                     y=ds.y,
                     validation_split=0.1,
                     model=model,
                     word_encoder_model=word_encoder_model,
                     epochs=5)
Example #4
def test_token_preprocessing(tmpdir):
    tokenizer = SpacyTokenizer()

    X = ['hello', 'world', 'welcome', 'earth']
    y = [0, 1, 0, 1]

    tokenizer.build_vocab(X)

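    # the vocabulary should hold the four distinct words plus the tokenizer's special tokens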
    assert(len(tokenizer.token_index) - len(tokenizer.special_token) == 4)

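    # encode the texts to token-id sequences and pad each one to a fixed length of 50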
    X_enc = tokenizer.encode_texts(X)
    X_fin = tokenizer.pad_sequences(X_enc, fixed_token_seq_length=50)

    ds = Dataset(X_fin, y, tokenizer=tokenizer)

    path = str(tmpdir.mkdir("data").join("test"))

    ds.save(path)

    ds_new = Dataset.load(path)

    # the first encoded sequence should survive the save/load round-trip unchanged
    assert(all([a == b for a, b in zip(ds_new.X[0], X_fin[0])]))
Example #5
def test_train_multi_label():
    X = ['what is up', 'yes yes', 'no no  no']
    y = [["foo", "bar"], ["foo"], ["bar", "haha"]]

    # use the special tokenizer used for constructing the embeddings
    tokenizer = SimpleTokenizer()

    # preprocess data (once)
    experiment.setup_data(X, y, tokenizer, 'data.bin', max_len=100)

    # load data
    ds = Dataset.load('data.bin')

    # construct base
    factory = TokenModelFactory(ds.num_classes,
                                ds.tokenizer.token_index,
                                max_tokens=100,
                                embedding_type='glove.6B.50d',
                                embedding_dims=50)

    # choose a model
    word_encoder_model = YoonKimCNN()

    # build a model
    model = factory.build_model(token_encoder_model=word_encoder_model,
                                trainable_embeddings=False,
                                output_activation="sigmoid")

    # use experiment.train as a wrapper around Keras fit()
    experiment.train(x=ds.X,
                     y=ds.y,
                     validation_split=0.1,
                     model=model,
                     word_encoder_model=word_encoder_model,
                     epochs=1,
                     batch_size=32)
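Why output_activation="sigmoid" above: with multi-label targets each example can carry several classes at once, so the head needs one independent probability per label (sigmoid with binary cross-entropy) rather than a softmax over mutually exclusive classes. A minimal, generic Keras sketch of such a head, for illustration only (the layer sizes, dummy features and multi-hot targets below are made up and are not the library's internals):

import numpy as np
from keras import layers, models

num_labels = 3  # e.g. "foo", "bar", "haha" from the example above

# toy multi-hot targets: each row marks every label that applies to that sample
y_multi_hot = np.array([[1, 1, 0],   # foo, bar
                        [1, 0, 0],   # foo
                        [0, 1, 1]])  # bar, haha

head = models.Sequential([
    layers.Dense(64, activation='relu', input_shape=(128,)),
    layers.Dense(num_labels, activation='sigmoid'),  # one independent probability per label
])
head.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy'])

# fit on random dummy features just to show the shapes line up
X_dummy = np.random.rand(3, 128)
head.fit(X_dummy, y_multi_hot, epochs=1, verbose=0)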