Example No. 1
import pickle

import numpy as np
from chainer import Variable

from nlp.utils import make_vocab


def main():
    # Load the training corpus and rebuild the word -> id vocabulary.
    with open('./dataset/train.pickle', 'rb') as f:
        x = pickle.load(f)

    vocab = make_vocab(x)

    # Build the inverse mapping (id -> word) for decoding the model's output.
    tmp_vocab = {}
    for c, i in vocab.items():
        tmp_vocab[i] = c

    # Load the trained RNN language model.
    with open("./rnnlm_50.model", mode='rb') as f:
        model = pickle.load(f)

    # Seed generation with the EOS token, falling back to UNK if it is absent.
    word = 'EOS'
    in_x = Variable(np.array([vocab.get(word, vocab['UNK'])], dtype='int32'))

    # Generate up to 1000 token ids, printing a newline at each EOS.
    for index in model.predict(in_x, max_length=1000):
        if index == vocab['EOS']:
            print()
        else:
            print(tmp_vocab[index], end='')
    print()
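All of the snippets on this page depend on make_vocab and mecab_wakati from the project's nlp.utils module, whose implementations are not shown. The sketch below is a hypothetical reconstruction, assuming MeCab word segmentation via the mecab-python3 package and a vocabulary that reserves the 'UNK' and 'EOS' entries that Example No. 1 looks up. The real helpers may differ: Examples No. 3 and 4 call .split() on the result of mecab_wakati, while Examples No. 2 and 5 iterate over it directly, so it may return a list rather than a string.

# Hypothetical stand-ins for nlp.utils.make_vocab / nlp.utils.mecab_wakati.
# The real implementations are not shown in these snippets.
import MeCab

_tagger = MeCab.Tagger('-Owakati')


def mecab_wakati(text):
    # Segment a Japanese sentence into space-separated surface forms.
    return _tagger.parse(text).strip()


def make_vocab(sentences):
    # Assign an integer id to every token, reserving UNK and EOS.
    vocab = {'UNK': 0, 'EOS': 1}
    for sentence in sentences:
        for word in mecab_wakati(sentence).split():
            if word not in vocab:
                vocab[word] = len(vocab)
    return vocab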
Example No. 2
# y = "オッケー蓮子!!"
# input_sentence = ["メリー", "!", "ボブスレー", "しよ", "う", "!", "!"] + ["<eos>"]
# output_sentence = ["オッケー", "蓮子", "!", "!"] + ["<eos>"]
# x = list(x)
# y = list(y)
import pickle

import numpy as np

from nlp.utils import make_vocab, mecab_wakati

tmp_vocab = {}
train_x = []
train_y = []

with open('./json/speaker.pickle', 'rb') as f:
    x = pickle.load(f)
with open('./json/response.pickle', 'rb') as f:
    y = pickle.load(f)

# train_set, vocab = make_vocab(x+y)
vocab = make_vocab(x+y)

# Reverse the encoder-side token sequence (a common seq2seq trick);
# the decoder-side target keeps its original order.
for speaker, utterance in zip(x, y):
    train_x.append(np.array([vocab[word] for word in reversed(mecab_wakati(speaker))], dtype='int32'))
    train_y.append(np.array([vocab[word] for word in mecab_wakati(utterance)], dtype='int32'))

train_x = np.array(train_x)
train_y = np.array(train_y)

print("train_x: {}, train_y: {}, vocab: {}".format(len(train_x), len(train_y), len(vocab)))

loss = 0
average_loss = []
accuracy_list = []
epochs = 50
batch_size = 128
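Example No. 2 stops after initialising loss, average_loss, accuracy_list, epochs and batch_size; the training loop itself is not part of the snippet. Below is a minimal sketch of the kind of shuffled-minibatch loop those variables suggest, using toy data and a placeholder where a hypothetical seq2seq model's forward/backward pass would go.

import numpy as np

# Toy stand-ins for the pickled speaker/response corpora loaded above.
train_x = [np.array([3, 2, 1], dtype='int32'), np.array([5, 4], dtype='int32')]
train_y = [np.array([6, 7], dtype='int32'), np.array([8, 9, 10], dtype='int32')]

epochs = 2        # the snippet above uses 50
batch_size = 1    # the snippet above uses 128
average_loss = []

for epoch in range(epochs):
    perm = np.random.permutation(len(train_x))
    epoch_loss = 0.0
    for i in range(0, len(train_x), batch_size):
        idx = perm[i:i + batch_size]
        batch_x = [train_x[j] for j in idx]
        batch_y = [train_y[j] for j in idx]
        # A real implementation would compute the model's loss on
        # (batch_x, batch_y) and backpropagate here; this placeholder
        # only keeps the bookkeeping structure.
        epoch_loss += float(len(batch_x))
    average_loss.append(epoch_loss / len(train_x))

print(average_loss)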
Example No. 3
import pickle

import numpy as np

from nlp.rnnlm.rnnlm import RNNLM
from nlp.utils import make_vocab, mecab_wakati, plot_loss

# x = "メリー!ボブスレーしよう!!"
# y = "オッケー蓮子!!"
# input_sentence = ["メリー", "!", "ボブスレー", "しよ", "う", "!", "!"] + ["<eos>"]
# output_sentence = ["オッケー", "蓮子", "!", "!"] + ["<eos>"]
# x = list(x)
# y = list(y)
tmp_vocab = {}
train_x = []

with open('./dataset/train.pickle', 'rb') as f:
    x = pickle.load(f)

vocab = make_vocab(x)

for row in x:
    train_x.append(
        np.array([vocab[word] for word in mecab_wakati(row).split()],
                 dtype='int32'))

train_x = np.array(train_x)

print("train_x: {}, vocab: {}".format(len(train_x), len(vocab)))

loss = 0
average_loss = []
epochs = 20
batch_size = 50
num_data = len(train_x)
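Examples No. 3 and 4 import plot_loss from nlp.utils but the snippets end before it is called. Its real signature is unknown; a minimal matplotlib-based stand-in, assuming it simply plots the per-epoch average loss collected in average_loss, could look like this.

# Hypothetical stand-in for nlp.utils.plot_loss (real signature unknown).
import matplotlib.pyplot as plt


def plot_loss(losses, filename='loss.png'):
    # Plot per-epoch average loss and save the figure to disk.
    plt.figure()
    plt.plot(range(1, len(losses) + 1), losses)
    plt.xlabel('epoch')
    plt.ylabel('average loss')
    plt.savefig(filename)
    plt.close()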
Example No. 4
import pickle
import time

import numpy as np

from nlp.utils import make_vocab, mecab_wakati, plot_loss


# x = "メリー!ボブスレーしよう!!"
# y = "オッケー蓮子!!"
# input_sentence = ["メリー", "!", "ボブスレー", "しよ", "う", "!", "!"] + ["<eos>"]
# output_sentence = ["オッケー", "蓮子", "!", "!"] + ["<eos>"]
# x = list(x)
# y = list(y)
tmp_vocab = {}
train_x = []

with open('./dataset/train.pickle', 'rb') as f:
    x = pickle.load(f)

vocab = make_vocab(x)

for row in x:
    train_x.append(np.array([vocab[word] for word in mecab_wakati(row).split()], dtype='int32'))

train_x = np.array(train_x)

print("train_x: {}, vocab: {}".format(len(train_x), len(vocab)))

loss = 0
average_loss = []
epochs = 20
batch_size = 50
num_data = len(train_x)
start_at = time.time()
cur_at = start_at
Example No. 5
import argparse
import collections
import pickle

import numpy as np
import six

import chainer
from chainer import cuda
import chainer.links as L
import chainer.optimizers as O
from chainer import training
from chainer.training import extensions

from nlp.utils import make_vocab, mecab_wakati

# SkipGram, ContinuousBoW, SoftmaxCrossEntropyLoss, WindowIterator and
# convert are defined elsewhere in the original script and are not shown
# in this snippet.


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', '-g', default=-1, type=int,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--unit', '-u', default=100, type=int,
                        help='number of units')
    parser.add_argument('--window', '-w', default=5, type=int,
                        help='window size')
    parser.add_argument('--batchsize', '-b', type=int, default=1000,
                        help='learning minibatch size')
    parser.add_argument('--epoch', '-e', default=20, type=int,
                        help='number of epochs to learn')
    parser.add_argument('--model', '-m', choices=['skipgram', 'cbow'],
                        default='skipgram',
                        help='model type ("skipgram", "cbow")')
    parser.add_argument('--negative-size', default=5, type=int,
                        help='number of negative samples')
    parser.add_argument('--out-type', '-o', choices=['hsm', 'ns', 'original'],
                        default='ns',
                        help='output model type ("hsm": hierarchical softmax, '
                             '"ns": negative sampling, "original": '
                             'no approximation)')
    parser.add_argument('--out', default='result',
                        help='Directory to output the result')
    parser.add_argument('--test', dest='test', action='store_true')
    parser.set_defaults(test=False)
    args = parser.parse_args()

    if args.gpu >= 0:
        chainer.backends.cuda.get_device_from_id(args.gpu).use()
        cuda.check_cuda_available()

    print('GPU: {}'.format(args.gpu))
    print('# unit: {}'.format(args.unit))
    print('Window: {}'.format(args.window))
    print('Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('Training model: {}'.format(args.model))
    print('Output type: {}'.format(args.out_type))
    print('')

    # Load the training dataset
    with open('./corpus/train.pickle', 'rb') as f:
        x = pickle.load(f)
    vocab = make_vocab(x)
    train_x = []
    for row in x:
        train_x += [vocab[word] for word in mecab_wakati(row)]
    train = np.array(train_x)
    counts = collections.Counter(train)
    n_vocab = max(train) + 1

    index2word = {wid: word for word, wid in six.iteritems(vocab)}

    print('n_vocab: %d' % n_vocab)
    print('data length: %d' % len(train))

    if args.out_type == 'hsm':
        HSM = L.BinaryHierarchicalSoftmax
        tree = HSM.create_huffman_tree(counts)
        loss_func = HSM(args.unit, tree)
        loss_func.W.data[...] = 0
    elif args.out_type == 'ns':
        cs = [counts[w] for w in range(len(counts))]
        loss_func = L.NegativeSampling(args.unit, cs, args.negative_size)
        loss_func.W.data[...] = 0
    elif args.out_type == 'original':
        loss_func = SoftmaxCrossEntropyLoss(args.unit, n_vocab)
    else:
        raise Exception('Unknown output type: {}'.format(args.out_type))

    # Choose the model
    if args.model == 'skipgram':
        model = SkipGram(n_vocab, args.unit, loss_func)
    elif args.model == 'cbow':
        model = ContinuousBoW(n_vocab, args.unit, loss_func)
    else:
        raise Exception('Unknown model type: {}'.format(args.model))

    if args.gpu >= 0:
        model.to_gpu()

    # Set up an optimizer
    optimizer = O.Adam()
    optimizer.setup(model)

    # Set up an iterator
    train_iter = WindowIterator(train, args.window, args.batchsize)
    # val_iter = WindowIterator(val, args.window, args.batchsize, repeat=False)

    # Set up an updater
    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, converter=convert, device=args.gpu)

    # Set up a trainer
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # trainer.extend(extensions.Evaluator(
    #     val_iter, model, converter=convert, device=args.gpu))
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.PrintReport(
        ['epoch', 'main/loss', 'validation/main/loss']))
    trainer.extend(extensions.ProgressBar())
    trainer.run()

    # Save the word2vec model
    with open('word2vec.model', 'w') as f:
        f.write('%d %d\n' % (len(index2word), args.unit))
        w = cuda.to_cpu(model.embed.W.data)
        for i, wi in enumerate(w):
            v = ' '.join(map(str, wi))
            f.write('%s %s\n' % (index2word[i], v))
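The file written at the end of Example No. 5 is plain text: a header line with the vocabulary size and the embedding dimension, then one word and its vector per line. A minimal sketch for reading it back and querying nearest neighbours by cosine similarity follows; the file name and format come from the save code above, while the most_similar helper is purely illustrative.

import numpy as np

# Read back the plain-text embeddings written by Example No. 5.
with open('word2vec.model') as f:
    n_vocab, n_units = map(int, f.readline().split())
    words = []
    vectors = np.empty((n_vocab, n_units), dtype=np.float32)
    for i, line in enumerate(f):
        parts = line.rstrip('\n').split(' ')
        words.append(parts[0])
        vectors[i] = np.array(parts[1:], dtype=np.float32)

word2id = {w: i for i, w in enumerate(words)}

# Normalise once so that dot products become cosine similarities.
norms = np.linalg.norm(vectors, axis=1, keepdims=True)
unit_vectors = vectors / np.maximum(norms, 1e-8)


def most_similar(query, topn=5):
    # Return the topn words closest to `query` by cosine similarity.
    v = unit_vectors[word2id[query]]
    scores = unit_vectors @ v
    ranked = np.argsort(-scores)
    hits = [(words[i], float(scores[i])) for i in ranked if words[i] != query]
    return hits[:topn]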