Example #1
def synthetic_data():
    print("synthetic_data task")
    # Train the simple copy task.
    V = 11
    criterion = model_help.LabelSmoothing(size=V, padding_idx=0, smoothing=0.1)
    criterion.cuda()
    model = model_help.make_model(V, V, N=2)
    model.cuda()
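    # NoamOpt (assumed to follow the Annotated Transformer) wraps Adam and sets the rate each
    # step to factor * d_model**-0.5 * min(step**-0.5, step * warmup**-1.5); lr=0 is a placeholder.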
    model_opt = model_help.NoamOpt(
        model.src_embed[0].d_model, 1, 400,
        torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98),
                         eps=1e-9))
    for epoch in range(10):
        model.train()
        run_epoch(
            data_help.data_gen(V, 30, 20), model,
            model_help.SimpleLossCompute(model.generator, criterion,
                                         args.device, model_opt))
        model.eval()
        eval_loss = run_epoch(
            data_help.data_gen(V, 30, 5), model,
            model_help.SimpleLossCompute(model.generator, criterion,
                                         args.device, None))
        print("eval loss: %f" % eval_loss.numpy())

    model.eval()
    src = Variable(torch.LongTensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]]))
    src_mask = Variable(torch.ones(1, 1, 10))
    print(
        greedy_decode(model,
                      src.to(args.device),
                      src_mask.to(args.device),
                      max_len=10,
                      start_symbol=1))
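For reference, data_help.data_gen(V, 30, 20) above is assumed to be the standard copy-task generator from the Annotated Transformer; a minimal sketch under that assumption (the Batch wrapper and its pad argument are likewise assumed):

def data_gen(V, batch_size, nbatches):
    # Yield random sequences whose target is an exact copy of the source; token 1 is the
    # start symbol and 0 is reserved for padding (matching padding_idx=0 above).
    for _ in range(nbatches):
        data = torch.randint(1, V, (batch_size, 10))
        data[:, 0] = 1
        yield data_help.Batch(data, data, 0)  # Batch is assumed to build the src/tgt masks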
Example #2
def main():
    os.makedirs('checkpoint', exist_ok=True)

    V = 11
    num_epochs = 10
    batch_size = 30
    train_batches = 20
    test_batches = 5

    criterion = LabelSmoothing(size=V, padding_idx=0, smoothing=0.0)
    model = make_model(V, V, N=2).to(device)
    model_opt = NoamOpt(
        model.src_embed[0].d_model, 1, 400,
        torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98),
                         eps=1e-9))

    for epoch in range(num_epochs):
        # training
        model.train()
        train_loss = run_epoch(
            data_gen(V, batch_size=batch_size, nbatches=train_batches), model,
            SimpleLossCompute(model.generator, criterion, model_opt))
        experiment.log_metric('train_loss', train_loss, step=epoch)

        # validation
        model.eval()
        valid_loss = run_epoch(
            data_gen(V, batch_size=batch_size, nbatches=test_batches), model,
            SimpleLossCompute(model.generator, criterion, None))
        experiment.log_metric('valid_loss', valid_loss, step=epoch)
        print('valid_loss:', valid_loss)

    torch.save(model.state_dict(), 'checkpoint/model.pt')
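To reuse the checkpoint written above, rebuild the model with the same hyperparameters and load the weights back (a usage sketch; make_model, V and device as defined in this example):

model = make_model(V, V, N=2).to(device)
model.load_state_dict(torch.load('checkpoint/model.pt', map_location=device))
model.eval()  # disable dropout for inference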
Example #3
def gridsearch(params):
    max_sequence_length = reader.max_sentence_length
    random_init = True
    if not (params.wordvec_initialization == 'random'):
        random_init = False

    train_test_val = reader.create_batch(embedding_params=embedding_params,
                                         batch_size=-1)

    training_data = train_test_val['train']
    test_data = train_test_val['test']
    validation_data = train_test_val['dev']


    # for x, y in batch_gen(training_data, max_sequence_length):
    #     model.train_on_batch(x,y)

    train_x, train_y = data_gen(training_data, max_sequence_length)
    test_x, test_y = data_gen(test_data, max_sequence_length)
    val_x, val_y = data_gen(validation_data, max_sequence_length)


    train_y = to_categorical(train_y)
    test_y = to_categorical(test_y)
    val_y = to_categorical(val_y)

    # Full sweep would be [0.0, 0.1, ..., 0.9]; narrowed to two values to keep the grid small.
    dropout_rate = [0.5, 0.9]
    param_grid = dict(dropout_rate=dropout_rate)

#    ,validation_data= (test_x, test_y)
    model = KerasClassifier(build_fn=createModel, epochs=1, batch_size=params.batch_size, verbose=1)
    grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=2)    # n_jobs=-1
    grid_result = grid.fit(train_x, train_y)  
    # summarize results  
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))  
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    # iterate cv_results_ (grid_scores_ was removed from scikit-learn); this also avoids
    # shadowing the `params` argument, which is still needed below
    for mean, std, param in zip(means, stds, grid_result.cv_results_['params']):
        print("%f (%f) with: %r" % (mean, std, param))

    experiment_results_path = 'eval/experiment_result.xlsx'
    xls_file = pd.ExcelFile(experiment_results_path)

    df1 = xls_file.parse('Sheet1')
    l = {'complex_mixture':0,'complex_superposition':1,'real':2}
    df1.loc[l[params.network_type], params.dataset_name] = grid_result.best_score_
    df1.to_excel(experiment_results_path)
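Inside gridsearch the grid could also be widened beyond dropout; batch size and epoch count are tunable through the same KerasClassifier wrapper (illustrative values):

param_grid = dict(dropout_rate=[0.3, 0.5],
                  batch_size=[16, 32],
                  epochs=[1, 2])
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=2, cv=3)
grid_result = grid.fit(train_x, train_y)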
Example #4
def main():
    path_to_vec = '../glove/glove.6B.100d.txt'
    dir_name = '../'
    reader = SSTDataReader(dir_name, nclasses=2)
    embedding_params = reader.get_word_embedding(path_to_vec,
                                                 orthonormalized=False)
    lookup_table = get_lookup_table(embedding_params)
    max_sequence_length = 60

    sequence_input = Input(shape=(max_sequence_length, ), dtype='int32')
    phase_embedding = phase_embedding_layer(max_sequence_length,
                                            lookup_table.shape[0])

    amplitude_embedding = amplitude_embedding_layer(np.transpose(lookup_table),
                                                    max_sequence_length)

    # [embed_seq_real, embed_seq_imag] = ComplexMultiply()([phase_embedding, amplitude_embedding])
    output = phase_embedding(sequence_input)
    model = Model(sequence_input, output)
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    model.summary()

    train_test_val = reader.create_batch(embedding_params=embedding_params,
                                         batch_size=-1)

    training_data = train_test_val['train']
    test_data = train_test_val['test']
    validation_data = train_test_val['dev']

    # for x, y in batch_gen(training_data, max_sequence_length):
    #     model.train_on_batch(x,y)

    train_x, train_y = data_gen(training_data, max_sequence_length)
    test_x, test_y = data_gen(test_data, max_sequence_length)
    val_x, val_y = data_gen(validation_data, max_sequence_length)
    # sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    # path_to_vec = '../glove/glove.6B.100d.txt'
    # embedded_sequences = amplitude_embedding_layer(path_to_vec, 10)

    # output = embedded_sequences(sequence_input)
    # model = Model(sequence_input, output)
    # model.compile(loss='categorical_crossentropy',
    #           optimizer='rmsprop',
    #           metrics=['acc'])

    # model.summary()

    x = train_x

    y = model.predict(x)
    print(y)
    print(y.shape)
Example #5
def init(attr=2,
         train_size=0.7,
         test_size=0.1,
         batch_size=25,
         trainable_embed=False,
         filename=None):
    #data generator
    #load first mini-batch
    """
    train_set_x (45, 312, 153, 300)
    """

    revs, W, W2, word_idx_map, vocab, mairesse, charged_words = load_data(attr)
    datasets = w2idx(revs,
                     word_idx_map,
                     mairesse,
                     charged_words,
                     attr,
                     max_l=149,
                     max_s=312,
                     filter_h=3)
    _D = len(datasets[0])
    _S = len(datasets[0][0])
    _W = len(datasets[0][0][0])
    _E = W.shape[1]
    dataset_idx = data_idx(len(datasets[0]), batch_size)
    #print(len(datasets[0]))
    # 2467
    #exit()

    #split train val
    n_train_items = int(np.round(train_size * _D))
    n_test_items = int(test_size * _D)
    test_idx = dataset_idx[n_train_items:n_train_items + n_test_items]
    val_idx = dataset_idx[n_train_items + n_test_items:]
    test_generator = data_gen(attr,
                              test_idx,
                              datasets,
                              W,
                              batch_size=25,
                              test=True)

    if filename is None:
        exit()
    else:
        model = load_model(filename, custom_objects={'nll1': nll1})

    return model, test_generator
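A usage sketch for the pair returned above (the checkpoint path is hypothetical, and the saved model is assumed to have been compiled with an accuracy metric):

model, test_generator = init(attr=2, filename='checkpoint/model.h5')
loss, acc = model.evaluate_generator(test_generator, steps=10)  # steps value is illustrative
print('test acc:', acc)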
Example #6
def init(attr=2,
         train_size=0.7,
         test_size=0.1,
         batch_size=25,
         trainable_embed=False,
         filename=None):
    #data generator
    #load first mini-batch
    """
    train_set_x (45, 312, 153, 300)
    """

    revs, W, W2, word_idx_map, vocab, mairesse, charged_words = load_data(
        attr, data_aug=True)
    datasets = w2idx(revs,
                     word_idx_map,
                     mairesse,
                     charged_words,
                     attr,
                     max_l=149,
                     max_s=312,
                     filter_h=3)
    _D = len(datasets[0])
    _S = len(datasets[0][0])
    _W = len(datasets[0][0][0])
    _E = W.shape[1]
    dataset_idx = data_idx(len(datasets[0]), batch_size)
    #print(len(datasets[0]))
    # 2467
    #exit()

    #split train val
    n_train_items = int(np.round(train_size * _D))
    train_idx = dataset_idx[:n_train_items]
    n_test_items = int(test_size * _D)
    test_idx = dataset_idx[n_train_items:n_train_items + n_test_items]
    val_idx = dataset_idx[n_train_items + n_test_items:]
    train_generator = data_gen(attr, train_idx, datasets, W, batch_size=25)
    val_generator = data_gen(attr, val_idx, datasets, W, batch_size=25)
    test_generator = data_gen(attr, test_idx, datasets, W, batch_size=25)

    input_shape = (_S * _W, _E, 1)
    docs_size = _S
    hidden_units = [200, 200, 2]
    filter_hs = [1, 2, 3]
    filter_shapes = []
    pool_sizes = []
    reshape = (_S, _W)
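    # Each convolution filter spans filter_h word rows and the full embedding width _E;
    # pooling over _S * (_W - filter_h + 1) positions then collapses each feature map to one value.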
    for filter_h in filter_hs:
        filter_shapes.append((filter_h, _E))
        pool_sizes.append((_S * (_W - filter_h + 1), 1))
    if filename is None:
        model = BigFiveCnnModel(W,
                                filter_shapes,
                                pool_sizes,
                                reshape,
                                filter_hs=filter_hs,
                                hidden_units=hidden_units,
                                docs_size=docs_size,
                                trainable_embed=trainable_embed)
        #model.summary()
        opt = Adadelta(lr=1.0, rho=0.95, epsilon=None, decay=0.0)
        model.compile(loss=nll1, optimizer=opt, metrics=['accuracy'])
    else:
        model = load_model(filename, custom_objects={'nll1': nll1})
    steps = int(train_idx.shape[0] // batch_size)
    v_steps = int(val_idx.shape[0] // batch_size)
    return model, train_generator, val_generator, test_generator, steps, v_steps
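A training sketch using what init returns (epoch count is illustrative; fit_generator is the Keras 2.x API matching the load_model/Adadelta calls above):

model, train_gen, val_gen, test_gen, steps, v_steps = init(attr=2)
model.fit_generator(train_gen,
                    steps_per_epoch=steps,
                    validation_data=val_gen,
                    validation_steps=v_steps,
                    epochs=10)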
Example #7
# print(embedding_params['word2id'])
lookup_table = get_lookup_table(embedding_params)

max_sequence_length = reader.max_sentence_length
random_init = True
if not (params.wordvec_initialization == 'random'):
    random_init = False

train_test_val = reader.create_batch(embedding_params=embedding_params,
                                     batch_size=-1)

training_data = train_test_val['train']
test_data = train_test_val['test']
validation_data = train_test_val['dev']

train_x, train_y = data_gen(training_data, max_sequence_length)
test_x, test_y = data_gen(test_data, max_sequence_length)
val_x, val_y = data_gen(validation_data, max_sequence_length)

train_y = to_categorical(train_y)
test_y = to_categorical(test_y)
val_y = to_categorical(val_y)


def run_task(zipped_args):
    i, (dropout_rate, optimizer, learning_rate, init_mode, projection,
        batch_size, activation) = zipped_args

    arg_str = (" ".join([
        str(ii) for ii in (dropout_rate, optimizer, learning_rate, init_mode,
                           projection, batch_size, activation)
Example #8
def complex_embedding(params):
    # datasets_dir, dataset_name, wordvec_initialization ='random', wordvec_path = None, loss = 'binary_crossentropy', optimizer = 'rmsprop', batch_size = 16, epochs= 4

    reader = data_reader_initialize(params.dataset_name, params.datasets_dir)

    if (params.wordvec_initialization == 'orthogonalize'):
        embedding_params = reader.get_word_embedding(params.wordvec_path,
                                                     orthonormalized=True)

    elif params.wordvec_initialization in ('random', 'word2vec'):
        embedding_params = reader.get_word_embedding(params.wordvec_path,
                                                     orthonormalized=False)
    else:
        raise ValueError('The input word initialization approach is invalid!')

    # print(embedding_params['word2id'])
    lookup_table = get_lookup_table(embedding_params)

    max_sequence_length = reader.max_sentence_length
    random_init = True
    if not (params.wordvec_initialization == 'random'):
        random_init = False

    if params.network_type == 'complex_superposition':
        model = run_complex_embedding_network_superposition(
            lookup_table,
            max_sequence_length,
            reader.nb_classes,
            random_init=random_init)
    elif params.network_type == 'complex_mixture':
        model = run_complex_embedding_network_mixture(lookup_table,
                                                      max_sequence_length,
                                                      reader.nb_classes,
                                                      random_init=random_init)
    else:
        model = run_real_embedding_network(lookup_table,
                                           max_sequence_length,
                                           reader.nb_classes,
                                           random_init=random_init)

    model.compile(loss=params.loss,
                  optimizer=params.optimizer,
                  metrics=['accuracy'])

    model.summary()
    weights = model.get_weights()

    train_test_val = reader.create_batch(embedding_params=embedding_params,
                                         batch_size=-1)

    training_data = train_test_val['train']
    test_data = train_test_val['test']
    validation_data = train_test_val['dev']

    # for x, y in batch_gen(training_data, max_sequence_length):
    #     model.train_on_batch(x,y)

    train_x, train_y = data_gen(training_data, max_sequence_length)
    test_x, test_y = data_gen(test_data, max_sequence_length)
    val_x, val_y = data_gen(validation_data, max_sequence_length)
    print(len(train_x))
    print(len(test_x))
    print(len(val_x))
    # assert len(train_x) == 67349
    # assert len(test_x) == 1821
    # assert len(val_x) == 872

    train_y = to_categorical(train_y)
    test_y = to_categorical(test_y)
    val_y = to_categorical(val_y)

    history = model.fit(x=train_x,
                        y=train_y,
                        batch_size=params.batch_size,
                        epochs=params.epochs,
                        validation_data=(test_x, test_y))

    val_acc = history.history['val_acc']
    train_acc = history.history['acc']

    if not (os.path.exists(params.eval_dir)):
        os.mkdir(params.eval_dir)

    learning_curve_path = os.path.join(params.eval_dir, 'learning_curve')
    epoch_indexes = [x + 1 for x in range(len(val_acc))]
    line_1, = plt.plot(epoch_indexes, val_acc)
    line_2, = plt.plot(epoch_indexes, train_acc)
    # plt.axis([0, 6, 0, 20])

    plt.legend([line_1, line_2], ['test_acc', 'train_acc'])
    fig = plt.gcf()
    fig.savefig(learning_curve_path, dpi=fig.dpi)

    evaluation = model.evaluate(x=test_x, y=test_y)

    eval_file_path = os.path.join(params.eval_dir, 'eval.txt')

    with open(eval_file_path, 'w') as eval_file:
        eval_file.write('acc: {}, loss: {}'.format(evaluation[1],
                                                   evaluation[0]))

    embedding_dir = os.path.join(params.eval_dir, 'embedding')
    if not (os.path.exists(embedding_dir)):
        os.mkdir(embedding_dir)
    np.save(os.path.join(embedding_dir, 'phase_embedding'),
            model.get_weights()[0])
    np.save(os.path.join(embedding_dir, 'amplitude_embedding'),
            model.get_weights()[1])
    np.save(os.path.join(embedding_dir, 'word2id'),
            embedding_params['word2id'])
    save_model(model, os.path.join(params.eval_dir, 'model'))

    experiment_results_path = 'eval/experiment_result.xlsx'
    xls_file = pd.ExcelFile(experiment_results_path)

    df1 = xls_file.parse('Sheet1')
    l = {'complex_mixture': 0, 'complex_superposition': 1, 'real': 2}
    df1.loc[l[params.network_type], params.dataset_name] = max(val_acc)
    df1.to_excel(experiment_results_path)
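The arrays saved above can be read back with NumPy (a sketch assuming eval_dir is 'eval'; np.save appends the .npy suffix, and the word2id dict round-trips as a pickled object array):

phase = np.load('eval/embedding/phase_embedding.npy')
amplitude = np.load('eval/embedding/amplitude_embedding.npy')
word2id = np.load('eval/embedding/word2id.npy', allow_pickle=True).item()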
Example #9
train_size = 0.9
attr = 2
batch_size = 25
revs, W, W2, word_idx_map, vocab, mairesse, charged_words = load_data(attr)
datasets = w2idx(revs,
                 word_idx_map,
                 mairesse,
                 charged_words,
                 attr,
                 max_l=149,
                 max_s=312,
                 filter_h=3)
_D = len(datasets[0])
_S = len(datasets[0][0])
_W = len(datasets[0][0][0])
_E = W.shape[1]
print(_D)
print(_S)
print(_W)
print(_E)

dataset_idx = data_idx(attr, len(datasets[0]), batch_size)

#split train val
n_train_items = int(np.round(train_size * _D))
train_idx = dataset_idx[:n_train_items]
train_generator = data_gen(attr, train_idx, datasets, W, batch_size=25)
[train_set_x, train_set_m], train_set_y = next(train_generator)

print(train_set_x.shape)