예제 #1
0
# Obtain the train/dev/test iterators and the vocabulary, using the file
# 'dataset.pickle' in the current working directory as an on-disk cache.
if not os.path.exists('dataset.pickle'):
    # No cache yet: build everything from the files under opt.data.
    root_dir = opt.data
    segments = ['train', 'dev', 'test']
    # One '<side>.toks' path per (side, split) pair; the side ('a'/'b') is
    # the outer loop, so all 'a' files come before all 'b' files.
    token_files = [
        os.path.join(root_dir, split, '%s.toks' % side)
        for side in ['a', 'b']
        for split in segments
    ]

    vocab = Vocab(filepaths=token_files, embedpath=opt.word_embed)

    # One iterator per split, unpacked in the same order as `segments`.
    train_iter, dev_iter, test_iter = (
        SICKDataIter(os.path.join(root_dir, split), vocab, num_classes)
        for split in segments
    )

    # Store the freshly built objects so that later runs can skip the build.
    with open('dataset.pickle', 'wb') as cache_out:
        pickle.dump([train_iter, dev_iter, test_iter, vocab], cache_out)
else:
    # Cache present: restore the four objects exactly as they were dumped.
    # NOTE(review): unpickling executes code embedded in the file, and the
    # cache is not keyed on opt.data / opt.word_embed -- delete the file
    # when those change or when its origin is not trusted.
    with open('dataset.pickle', 'rb') as cache_in:
        train_iter, dev_iter, test_iter, vocab = pickle.load(cache_in)

# Report the vocabulary size and the number of items in each split.
# The values are handed to logging as arguments rather than %-formatted up
# front, so the message text is only built when the record is emitted.
logging.info('==> SICK vocabulary size : %d ', vocab.size)
logging.info('==> Size of train data   : %d ', len(train_iter))
logging.info('==> Size of dev data     : %d ', len(dev_iter))
logging.info('==> Size of test data    : %d ', len(test_iter))

# Instantiate the SimilarityTreeLSTM network.  The fourth argument is the
# second dimension of vocab.embed (presumably the embedding width -- confirm
# against the Vocab implementation).
net = SimilarityTreeLSTM(
    sim_hidden_size,
    rnn_hidden_size,
    vocab.size,
    vocab.embed.shape[1],
    num_classes,
)

# use pearson correlation and mean-square error for evaluation
예제 #2
0
# Batch size is read from the `opt` settings object.
batch_size = opt.batch_size

# Obtain the train/dev/test iterators and the vocabulary, using the file
# 'dataset.pickle' in the current working directory as an on-disk cache.
if not os.path.exists('dataset.pickle'):
    # No cache yet: build everything from the files under opt.data.
    root_dir = opt.data
    segments = ['train', 'dev', 'test']
    # One '<side>.toks' path per (side, split) pair; the side ('a'/'b') is
    # the outer loop, so all 'a' files come before all 'b' files.
    token_files = [
        os.path.join(root_dir, split, '%s.toks' % side)
        for side in ['a', 'b']
        for split in segments
    ]

    vocab = Vocab(filepaths=token_files, embedpath=opt.word_embed)

    # One iterator per split, unpacked in the same order as `segments`.
    train_iter, dev_iter, test_iter = (
        SICKDataIter(os.path.join(root_dir, split), vocab, num_classes)
        for split in segments
    )

    # Store the freshly built objects so that later runs can skip the build.
    with open('dataset.pickle', 'wb') as cache_out:
        pickle.dump([train_iter, dev_iter, test_iter, vocab], cache_out)
else:
    # Cache present: restore the four objects exactly as they were dumped.
    # NOTE(review): unpickling executes code embedded in the file, and the
    # cache is not keyed on opt.data / opt.word_embed -- delete the file
    # when those change or when its origin is not trusted.
    with open('dataset.pickle', 'rb') as cache_in:
        train_iter, dev_iter, test_iter, vocab = pickle.load(cache_in)

# Report the vocabulary size and the number of items in each split.
# The values are handed to logging as arguments rather than %-formatted up
# front, so the message text is only built when the record is emitted.
logging.info('==> SICK vocabulary size : %d ', vocab.size)
logging.info('==> Size of train data   : %d ', len(train_iter))
logging.info('==> Size of dev data     : %d ', len(dev_iter))
logging.info('==> Size of test data    : %d ', len(test_iter))

# Instantiate the SimilarityTreeLSTM network.  The fourth argument is the
# second dimension of vocab.embed (presumably the embedding width -- confirm
# against the Vocab implementation).
net = SimilarityTreeLSTM(
    sim_hidden_size, rnn_hidden_size,
    vocab.size, vocab.embed.shape[1],
    num_classes,
)

# Evaluation metric, created by name from 'pearsonr' (Pearson correlation)
# and 'mse' (mean-squared error).
metric = mx.metric.create(['pearsonr', 'mse'])