Example #1
def train(labels, features, w):
	n_class = len(set(labels))
	print(f"data : {len(features)}")
	print(f"class: {n_class}")

	pairs = [
		(vec, np.array([cls], np.int32))
		for vec, cls in zip(features, labels)
	]

	train_iter = chainer.iterators.SerialIterator(pairs, batch_size=16)

	model = nets.TextClassifier(Encoder(w), n_class)

	optimizer = chainer.optimizers.Adam()
	optimizer.setup(model)
	optimizer.add_hook(chainer.optimizer.WeightDecay(1e-4))

	updater = training.updaters.StandardUpdater(
		train_iter,
		optimizer,
		converter=convert_seq
	)

	trainer = training.Trainer(updater, (8, "epoch"), out="./result/dl/")

	trainer.extend(extensions.LogReport())
	trainer.extend(
		extensions.PrintReport(["epoch", "main\loss", "main/accuracy", "elapsed_time"])
	)

	trainer.run()

	return model
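A minimal usage sketch for the train function above, assuming the surrounding module provides Encoder and convert_seq as in the Chainer text-classification example; the toy vocabulary, embedding matrix w, and word-ID sequences below are illustrative assumptions, not part of the original.

# Illustrative call to train(); Encoder/convert_seq come from the surrounding
# module (assumed), and the toy data below is made up for demonstration.
import numpy as np

vocab = {'<unk>': 0, 'good': 1, 'bad': 2}                 # assumed toy vocabulary
w = np.random.uniform(-0.1, 0.1, (len(vocab), 100)).astype(np.float32)  # embedding matrix

features = [np.array([1, 2, 1], np.int32),                # word-ID sequences
            np.array([2, 2], np.int32)]
labels = [1, 0]                                           # one class id per sequence

trained_model = train(labels, features, w)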
Example #2
def setup_model(device, model_setup):
    setup = json.load(open(model_setup))
    sys.stderr.write(json.dumps(setup, indent=2) + '\n')

    vocab = json.load(open(setup['vocab_path']))
    n_class = setup['n_class']

    # Setup a model
    if setup['model'] == 'rnn':
        Encoder = nets.RNNEncoder
    elif setup['model'] == 'cnn':
        Encoder = nets.CNNEncoder
    elif setup['model'] == 'bow':
        Encoder = nets.BOWMLPEncoder
    elif setup['model'] == 'gru':
        Encoder = nets.GRUEncoder
    encoder = Encoder(n_layers=setup['layer'],
                      n_vocab=len(vocab),
                      n_units=setup['unit'],
                      dropout=setup['dropout'])
    model = nets.TextClassifier(encoder, n_class)
    chainer.serializers.load_npz(setup['model_path'], model)
    model.to_device(device)  # Copy the model to the device

    return model, vocab, setup
Example #3
def setup_model(args):
    sys.stderr.write(json.dumps(args.__dict__, indent=2) + '\n')
    setup = json.load(open(args.model_setup))
    sys.stderr.write(json.dumps(setup, indent=2) + '\n')

    vocab = json.load(open(setup['vocab_path']))
    n_class = setup['n_class']

    # Setup a model
    if setup['model'] == 'rnn':
        Encoder = nets.RNNEncoder
    elif setup['model'] == 'cnn':
        Encoder = nets.CNNEncoder
    elif setup['model'] == 'bow':
        Encoder = nets.BOWMLPEncoder
    encoder = Encoder(n_layers=setup['layer'],
                      n_vocab=len(vocab),
                      n_units=setup['unit'],
                      dropout=setup['dropout'])
    model = nets.TextClassifier(encoder, n_class)
    chainer.serializers.load_npz(setup['model_path'], model)
    if args.gpu >= 0:
        # Make a specified GPU current
        chainer.backends.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()  # Copy the model to the GPU

    return model, vocab, setup
Example #4
def train(labels, features, w):
    n_class = len(set(labels))
    print('# data: {0}'.format(len(features)))
    print('# class: {0}'.format(n_class))

    pairs = [(vec, numpy.array([cls], numpy.int32))
             for vec, cls in zip(features, labels)]
    train_iter = chainer.iterators.SerialIterator(pairs, batch_size=16)

    model = nets.TextClassifier(Encoder(w), n_class)

    # Setup an optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(1e-4))

    # Set up a trainer
    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                converter=convert_seq)
    trainer = training.Trainer(updater, (8, 'epoch'), out='./result/dl')

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.PrintReport(
            ['epoch', 'main/loss', 'main/accuracy', 'elapsed_time']))

    trainer.run()
    return model
Example #5
def setup_model(args):
    sys.stderr.write(json.dumps(args.__dict__, indent=2) + '\n')
    setup = json.load(open(args.model_setup))
    sys.stderr.write(json.dumps(setup, indent=2) + '\n')

    # Load a dataset
    dataset = setup['dataset']
    if dataset == 'dbpedia':
        train, test, vocab = text_datasets.get_dbpedia(
            char_based=setup['char_based'])
    elif dataset == 'snli':
        train, test, vocab = text_datasets.get_snli(
            char_based=setup['char_based'])
    elif dataset.startswith('imdb.'):
        train, test, vocab = text_datasets.get_imdb(
            fine_grained=dataset.endswith('.fine'),
            char_based=setup['char_based'])
    elif dataset in [
            'TREC', 'stsa.binary', 'stsa.fine', 'custrev', 'mpqa',
            'rt-polarity', 'subj'
    ]:
        train, test, vocab = text_datasets.get_other_text_dataset(
            dataset, char_based=setup['char_based'])

    vocab = json.load(open(setup['vocab_path']))
    n_class = setup['n_class']
    print('# train data: {}'.format(len(train)))
    print('# test  data: {}'.format(len(test)))
    print('# vocab: {}'.format(len(vocab)))
    print('# class: {}'.format(n_class))

    # Setup a model
    if setup['model'] == 'rnn':
        Encoder = nets.RNNEncoder
    elif setup['model'] == 'bilstm':
        Encoder = nets.BiLSTMEncoder
    elif setup['model'] == 'cnn':
        Encoder = nets.CNNEncoder
    elif setup['model'] == 'bow':
        Encoder = nets.BOWMLPEncoder
    encoder = Encoder(n_layers=setup['layer'],
                      n_vocab=len(vocab),
                      n_units=setup['unit'],
                      dropout=setup['dropout'])
    if dataset == 'snli':
        model = nets.SNLIClassifier(encoder)
    else:
        model = nets.TextClassifier(encoder, n_class)
    chainer.serializers.load_npz(setup['model_path'], model)
    if args.gpu >= 0:
        # Make a specified GPU current
        chainer.backends.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()  # Copy the model to the GPU

    return model, train, test, vocab, setup
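A hedged sketch of how this setup_model might be called, assuming an args.json written by a companion training script (as in Example #8); the argparse.Namespace stands in for parsed command-line arguments, and the path is illustrative.

# Hypothetical usage; the args.json path is an assumption based on the save
# layout used by the training script in Example #8.
import argparse

args = argparse.Namespace(model_setup='result/stsa.binary_cnn/args.json', gpu=-1)
model, train_data, test_data, vocab, setup = setup_model(args)
print(len(train_data), len(test_data), setup['model'])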
Example #6
def model_fn(model_dir):
    """
    This function is called by the Chainer container during hosting on SageMaker,
    with `model_dir` populated by the hosting environment.
    
    This function loads models written during training into `model_dir`.

    Args:
        model_dir (str): path to the directory containing the saved model artifacts

    Returns:
        a loaded Chainer model

    For more on `model_fn`, please visit the sagemaker-python-sdk repository:
    https://github.com/aws/sagemaker-python-sdk

    For more on the Chainer container, please visit the sagemaker-chainer-containers repository:
    https://github.com/aws/sagemaker-chainer-containers
    """
    model_path = os.path.join(model_dir, 'my_model.npz')

    vocab_path = os.path.join(model_dir, 'vocab.json')
    model_setup_path = os.path.join(model_dir, 'args.json')
    with open(vocab_path, 'r') as f:
        vocab = json.load(f)
    with open(model_setup_path, 'r') as f:
        model_setup = json.load(f)

    model_type = model_setup['model_type']
    if model_type == 'rnn':
        Encoder = nets.RNNEncoder
    elif model_type == 'cnn':
        Encoder = nets.CNNEncoder
    elif model_type == 'bow':
        Encoder = nets.BOWMLPEncoder
    num_layers = model_setup['num_layers']
    num_units = model_setup['num_units']
    dropout = model_setup['dropout']
    num_classes = model_setup['num_classes']
    encoder = Encoder(n_layers=num_layers,
                      n_vocab=len(vocab),
                      n_units=num_units,
                      dropout=dropout)
    model = nets.TextClassifier(encoder, num_classes)

    serializers.load_npz(model_path, model)

    return model, vocab
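A hedged local check of model_fn, assuming the saved artifacts (my_model.npz, vocab.json, args.json) sit in ./model and that nets.TextClassifier exposes a predict method as in the Chainer text-classification example; the directory name and the whitespace tokenization are illustrative.

# Illustrative only: exercise model_fn outside of SageMaker. The ./model
# directory and TextClassifier.predict(..., softmax=True) are assumptions.
import numpy as np
import chainer

model, vocab = model_fn('./model')
tokens = 'this movie was great'.split()
xs = [np.array([vocab.get(t, vocab.get('<unk>', 0)) for t in tokens], np.int32)]
with chainer.using_config('train', False), chainer.no_backprop_mode():
    probs = model.predict(xs, softmax=True)
print(probs)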
Example #7
def train(labels, features, w):
    # Convert to a set to get the number of unique classes
    n_class = len(set(labels))
    print(f'# data: {len(features)}')
    print(f'# class: {n_class}')

    # Put the training data into the form of a chainer iterator
    pairs = [(vec, numpy.array([cls], numpy.int32))
             for vec, cls in zip(features, labels)]
    train_iter = chainer.iterators.SerialIterator(pairs, batch_size=16)

    # Configure the model to train with the TextClassifier class from the chainer sample program
    # Since this is binary classification, the number of classes is 2; the model uses the LSTM in the Encoder class
    model = nets.TextClassifier(Encoder(w), n_class)

    # Adam is chosen for optimization
    # This specifies how the neural network is trained; SGD is the simplest option.
    optimizer = chainer.optimizers.Adam()
    # Set the Chain holding the parameters to be trained on the optimizer.
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(1e-4))

    # Update the parameters with the updater, using the optimizer
    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                converter=convert_seq)
    # Prepare the Trainer; it runs off the updater. Specify the number of epochs; out is where results are saved.
    trainer = training.Trainer(updater, (8, 'epoch'), out='./result/dl')

    # The following extensions track the training progress
    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.PrintReport(
            ['epoch', 'main/loss', 'main/accuracy', 'elapsed_time']))

    # Start training
    trainer.run()
    return model
Example #8
def main():
    parser = create_parser()
    args = parser.parse_args()
    current_datetime = '{}'.format(datetime.datetime.today())

    # Load a dataset
    if args.dataset == 'dbpedia':
        train, test, vocab = text_datasets.get_dbpedia(
            char_based=args.char_based)
    elif args.dataset == 'snli':
        train, test, vocab = text_datasets.get_snli(char_based=args.char_based)
    elif args.dataset.startswith('imdb.'):
        train, test, vocab = text_datasets.get_imdb(
            fine_grained=args.dataset.endswith('.fine'),
            char_based=args.char_based)
    elif args.dataset in [
            'TREC', 'stsa.binary', 'stsa.fine', 'custrev', 'mpqa',
            'rt-polarity', 'subj'
    ]:
        train, test, vocab = text_datasets.get_other_text_dataset(
            args.dataset, char_based=args.char_based)

    train_idx = list(range(len(train)))

    # calibration data is taken out of training for calibrated dknn / temperature scaling
    calibration_idx = sorted(random.sample(train_idx, 1000))
    calibration = [train[i] for i in calibration_idx]
    train = [x for i, x in enumerate(train) if i not in calibration_idx]

    print('# train data: {}'.format(len(train)))
    print('# test  data: {}'.format(len(test)))
    print('# vocab: {}'.format(len(vocab)))
    if args.dataset == 'snli':
        n_class = 3
    else:
        n_class = len(set([int(d[1]) for d in train]))
    print('# class: {}'.format(n_class))

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    # Save vocabulary and model's setting
    current = os.path.dirname(os.path.abspath(__file__))
    save_path = os.path.join(current, args.out,
                             '{}_{}'.format(args.dataset, args.model))
    if not os.path.isdir(args.out):
        os.mkdir(args.out)
    if not os.path.isdir(save_path):
        os.mkdir(save_path)
    args.save_path = save_path

    vocab_path = os.path.join(save_path, 'vocab.json')
    model_path = os.path.join(save_path, 'best_model.npz')
    setup_path = os.path.join(save_path, 'args.json')
    calib_path = os.path.join(save_path, 'calib.json')

    with open(calib_path, 'w') as f:
        json.dump(calibration_idx, f)

    with open(vocab_path, 'w') as f:
        json.dump(vocab, f)

    model_setup = args.__dict__
    model_setup['vocab_path'] = vocab_path
    model_setup['model_path'] = model_path
    model_setup['n_class'] = n_class
    model_setup['datetime'] = current_datetime
    with open(setup_path, 'w') as f:
        json.dump(model_setup, f)
    print(json.dumps(model_setup, indent=2))

    # Setup a model
    if args.model == 'rnn':
        Encoder = nets.RNNEncoder
    elif args.model == 'bilstm':
        Encoder = nets.BiLSTMEncoder
    elif args.model == 'cnn':
        Encoder = nets.CNNEncoder
    elif args.model == 'bow':
        Encoder = nets.BOWMLPEncoder
    encoder = Encoder(n_layers=args.layer,
                      n_vocab=len(vocab),
                      n_units=args.unit,
                      dropout=args.dropout)
    if args.dataset == 'snli':
        model = nets.SNLIClassifier(encoder)
    else:
        model = nets.TextClassifier(encoder, n_class)

    # load word vectors
    if args.word_vectors:
        print("loading word vectors")
        with open(args.word_vectors, "r") as fi:
            for line in fi:
                line_list = line.strip().split(" ")
                word = line_list[0]
                if word in vocab:
                    vec = model.xp.array(line_list[1::], dtype=np.float32)
                    model.encoder.embed.W.data[vocab[word]] = vec
    else:
        print("WARNING: NO Word Vectors")

    if args.gpu >= 0:
        # Make a specified GPU current
        chainer.backends.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()  # Copy the model to the GPU

    # Setup an optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)
    # optimizer.add_hook(chainer.optimizer.WeightDecay(1e-4))

    # Set up a trainer
    if args.dataset == 'snli':
        converter = convert_snli_seq
    else:
        converter = convert_seq

    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                converter=converter,
                                                device=args.gpu)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'),
                               out=os.path.join(
                                   args.out,
                                   '{}_{}'.format(args.dataset, args.model)))

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(
        extensions.Evaluator(test_iter,
                             model,
                             converter=converter,
                             device=args.gpu))

    # Take a best snapshot
    record_trigger = training.triggers.MaxValueTrigger(
        'validation/main/accuracy', (1, 'epoch'))
    trainer.extend(extensions.snapshot_object(model, 'best_model.npz'),
                   trigger=record_trigger)

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ]))

    # Print a progress bar to stdout
    trainer.extend(extensions.ProgressBar())

    idx2word = {}  # build reverse dict
    for word, idx in vocab.items():
        idx2word[idx] = word

    # Run the training
    trainer.run()
Example #9
def main():
    current_datetime = '{}'.format(datetime.datetime.today())
    parser = argparse.ArgumentParser(
        description='Chainer example: Text Classification')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=64,
                        help='Number of examples in each mini-batch')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=30,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Directory to output the result')
    parser.add_argument('--unit',
                        '-u',
                        type=int,
                        default=300,
                        help='Number of units')
    parser.add_argument('--layer',
                        '-l',
                        type=int,
                        default=1,
                        help='Number of layers of RNN or MLP following CNN')
    parser.add_argument('--dropout',
                        '-d',
                        type=float,
                        default=0.4,
                        help='Dropout rate')
    parser.add_argument('--dataset',
                        '-data',
                        default='imdb.binary',
                        choices=[
                            'dbpedia', 'imdb.binary', 'imdb.fine', 'TREC',
                            'stsa.binary', 'stsa.fine', 'custrev', 'mpqa',
                            'rt-polarity', 'subj'
                        ],
                        help='Name of dataset.')
    parser.add_argument('--model',
                        '-model',
                        default='cnn',
                        choices=['cnn', 'rnn', 'bow'],
                        help='Name of encoder model type.')
    parser.add_argument('--char-based', action='store_true')

    args = parser.parse_args()
    print(json.dumps(args.__dict__, indent=2))

    # Load a dataset
    if args.dataset == 'dbpedia':
        train, test, vocab = text_datasets.get_dbpedia(
            char_based=args.char_based)
    elif args.dataset.startswith('imdb.'):
        train, test, vocab = text_datasets.get_imdb(
            fine_grained=args.dataset.endswith('.fine'),
            char_based=args.char_based)
    elif args.dataset in [
            'TREC', 'stsa.binary', 'stsa.fine', 'custrev', 'mpqa',
            'rt-polarity', 'subj'
    ]:
        train, test, vocab = text_datasets.get_other_text_dataset(
            args.dataset, char_based=args.char_based)

    print('# train data: {}'.format(len(train)))
    print('# test  data: {}'.format(len(test)))
    print('# vocab: {}'.format(len(vocab)))
    n_class = len(set([int(d[1]) for d in train]))
    print('# class: {}'.format(n_class))

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    # Setup a model
    if args.model == 'rnn':
        Encoder = nets.RNNEncoder
    elif args.model == 'cnn':
        Encoder = nets.CNNEncoder
    elif args.model == 'bow':
        Encoder = nets.BOWMLPEncoder
    encoder = Encoder(n_layers=args.layer,
                      n_vocab=len(vocab),
                      n_units=args.unit,
                      dropout=args.dropout)
    model = nets.TextClassifier(encoder, n_class)
    if args.gpu >= 0:
        # Make a specified GPU current
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()  # Copy the model to the GPU

    # Setup an optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(1e-4))

    # Set up a trainer
    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       converter=convert_seq,
                                       device=args.gpu)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(
        extensions.Evaluator(test_iter,
                             model,
                             converter=convert_seq,
                             device=args.gpu))

    # Take a best snapshot
    record_trigger = training.triggers.MaxValueTrigger(
        'validation/main/accuracy', (1, 'epoch'))
    trainer.extend(extensions.snapshot_object(model, 'best_model.npz'),
                   trigger=record_trigger)

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ]))

    # Print a progress bar to stdout
    trainer.extend(extensions.ProgressBar())

    # Save vocabulary and model's setting
    if not os.path.isdir(args.out):
        os.mkdir(args.out)
    current = os.path.dirname(os.path.abspath(__file__))
    vocab_path = os.path.join(current, args.out, 'vocab.json')
    with open(vocab_path, 'w') as f:
        json.dump(vocab, f)
    model_path = os.path.join(current, args.out, 'best_model.npz')
    model_setup = args.__dict__
    model_setup['vocab_path'] = vocab_path
    model_setup['model_path'] = model_path
    model_setup['n_class'] = n_class
    model_setup['datetime'] = current_datetime
    with open(os.path.join(args.out, 'args.json'), 'w') as f:
        json.dump(args.__dict__, f)

    # Run the training
    trainer.run()
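For reference, a hedged example of launching this script from the shell; the filename is an assumption, while the flags match the argparse definitions above.

# Hypothetical invocation (script name assumed; flags come from the parser above):
#   python train_text_classifier.py --dataset stsa.binary --model cnn --batchsize 64 --epoch 30 --gpu -1 --out result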
Example #10
    # Setup a model
    if args.model_type == 'rnn':
        Encoder = nets.RNNEncoder
    elif args.model_type == 'cnn':
        Encoder = nets.CNNEncoder
    elif args.model_type == 'bow':
        Encoder = nets.BOWMLPEncoder
    else:
        raise ValueError('model_type must be "rnn", "cnn", or "bow"')

    encoder = Encoder(n_layers=args.num_layers,
                      n_vocab=len(vocab),
                      n_units=args.num_units,
                      dropout=args.dropout)
    model = nets.TextClassifier(encoder, num_classes)

    # Setup an optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(1e-4))

    # Set up a trainer
    device = 0 if num_gpus > 0 else -1  # -1 indicates CPU, 0 indicates first GPU device.
    if num_gpus > 1:
        devices = range(num_gpus)
        train_iters = [chainer.iterators.SerialIterator(i, args.batch_size) \
                    for i in chainer.datasets.split_dataset_n_random(train, len(devices))]
        test_iter = chainer.iterators.SerialIterator(test,
                                                     args.batch_size,
                                                     repeat=False,
Example #11
def main():

    args = {
        'gpu': -1,
        'dataset': 'imdb.binary',
        'model': 'rnn',
        'batchsize': 64,
        'epoch': 3,
        'out': 'result',
        'unit': 100,
        'layer': 1,
        'dropout': 0.4,
        'char_based': False
    }

    # Load a dataset
    if args['dataset'] == 'dbpedia':
        train, test, vocab = text_datasets.get_dbpedia(
            char_based=args['char_based'])
    elif args['dataset'].startswith('imdb.'):
        print("IMDB datasets")
        train, test, vocab = text_datasets.get_imdb(
            fine_grained=args['dataset'].endswith('.fine'),
            char_based=args['char_based'])
    elif args['dataset'] in [
            'TREC', 'stsa.binary', 'stsa.fine', 'custrev', 'mpqa',
            'rt-polarity', 'subj'
    ]:
        train, test, vocab = text_datasets.get_other_text_dataset(
            args['dataset'], char_based=args['char_based'])

    print('# train data: {}'.format(len(train)))
    print('# test  data: {}'.format(len(test)))
    print('# vocab: {}'.format(len(vocab)))
    n_class = len(set([int(d[1]) for d in train]))
    print('# class: {}'.format(n_class))

    train_iter = chainer.iterators.SerialIterator(train[:1000],
                                                  args['batchsize'])
    test_iter = chainer.iterators.SerialIterator(test[:1000],
                                                 args['batchsize'],
                                                 repeat=False,
                                                 shuffle=False)

    # return train_iter, test_iter
    # Setup a model
    if args['model'] == 'rnn':
        Encoder = nets.RNNEncoder
        print(type(Encoder))
    elif args['model'] == 'cnn':
        Encoder = nets.CNNEncoder
    elif args['model'] == 'bow':
        Encoder = nets.BOWMLPEncoder

    encoder = Encoder(n_layers=args['layer'],
                      n_vocab=len(vocab),
                      n_units=args['unit'],
                      dropout=args['dropout'])
    model = nets.TextClassifier(encoder, n_class)
    if args['gpu'] >= 0:
        # Make a specified GPU current
        chainer.backends.cuda.get_device_from_id(args['gpu']).use()
        model.to_gpu()  # Copy the model to the GPU

    # Setup an optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(1e-4))

    # Set up a trainer
    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                converter=convert_seq,
                                                device=args['gpu'])
    trainer = training.Trainer(updater, (args['epoch'], 'epoch'),
                               out=args['out'])

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(
        extensions.Evaluator(test_iter,
                             model,
                             converter=convert_seq,
                             device=args['gpu']))

    # Take a best snapshot
    record_trigger = training.triggers.MaxValueTrigger(
        'validation/main/accuracy', (1, 'epoch'))
    trainer.extend(extensions.snapshot_object(model, 'best_model.npz'),
                   trigger=record_trigger)

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ]))

    # Print a progress bar to stdout
    trainer.extend(extensions.ProgressBar())

    print("STRAT Training!")
    # Run the training
    trainer.run()
    print("Finished!")
Example #12
import ruleclassifier
import solrindexer as indexer
import sqlitedatastore as datastore
from annoutil import find_xs_in_y

# Setup for rule-based text classification
rule = ruleclassifier.get_rule()

# Setup for text classification with supervised machine learning
model_ml = joblib.load('result/model.pkl')
vocab_ml = joblib.load('result/vocab.pkl')

# Setup for text classification with deep learning
w = numpy.load('result/w_dl.npy')
encoder = dlclassifier.Encoder(w)
model_dl = nets.TextClassifier(encoder, n_class=2)
chainer.serializers.load_npz('result/model_dl.npz', model_dl)
with open('result/vocab_dl.json') as f:
    vocab_dl = json.load(f)


@bottle.route('/')
def index_html():
    return bottle.static_file('sample_10_10.html', root='./src/static')


@bottle.route('/file/<filename:path>')
def static(filename):
    return bottle.static_file(filename, root='./src/static')

def main():
    start = time.time()
    current_datetime = '{}'.format(datetime.datetime.today())
    parser = argparse.ArgumentParser(description='Chainer Text Classification')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=64,
                        help='Number of examples in each mini-batch')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=30,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Directory to output the result')
    parser.add_argument('--unit',
                        '-u',
                        type=int,
                        default=200,
                        help='Number of units')
    parser.add_argument('--vocab',
                        '-v',
                        type=int,
                        default=100000,
                        help='Number of max vocabulary')
    parser.add_argument('--layer',
                        '-l',
                        type=int,
                        default=1,
                        help='Number of layers of RNN or MLP following CNN')
    parser.add_argument('--dropout',
                        '-d',
                        type=float,
                        default=0.4,
                        help='Dropout rate')
    parser.add_argument('--dataset',
                        '-dataset',
                        required=True,
                        help='train dataset')
    parser.add_argument('--size',
                        '-size',
                        type=int,
                        default=-1,
                        help='train dataset size (default split: train 3/4, test 1/4)')
    parser.add_argument('--model',
                        '-model',
                        default='cnn',
                        choices=['cnn', 'lstm', 'bow', 'gru'],
                        help='Name of encoder model type.')
    parser.add_argument('--early-stop',
                        action='store_true',
                        help='use early stopping method')
    parser.add_argument('--same-network',
                        action='store_true',
                        help='use same network between i1 and i2')
    parser.add_argument('--save-init',
                        action='store_true',
                        help='save init model')
    parser.add_argument('--char-based', action='store_true')

    args = parser.parse_args()
    print(json.dumps(args.__dict__, indent=2))

    train, test, vocab = get_input_dataset(args.dataset,
                                           vocab=None,
                                           max_vocab_size=args.vocab)

    print('# train data: {}'.format(len(train)))
    print('# dev  data: {}'.format(len(test)))
    print('# vocab: {}'.format(len(vocab)))
    n_class = len(set([int(d[-1]) for d in train]))
    print('# class: {}'.format(n_class))

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    # Setup a model
    if args.model == 'lstm':
        Encoder = nets.LSTMEncoder
    elif args.model == 'cnn':
        Encoder = nets.CNNEncoder
    elif args.model == 'bow':
        Encoder = nets.BOWMLPEncoder
    elif args.model == 'gru':
        Encoder = nets.GRUEncoder

    encoder = Encoder(n_layers=args.layer,
                      n_vocab=len(vocab),
                      n_units=args.unit,
                      dropout=args.dropout,
                      same_network=args.same_network)
    model = nets.TextClassifier(encoder, n_class)

    if args.gpu >= 0:
        # Make a specified GPU current
        chainer.backends.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()  # Copy the model to the GPU

    # Setup an optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(1e-4))

    # Set up a trainer
    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                converter=convert_seq2,
                                                device=args.gpu)

    # early Stopping
    if args.early_stop:
        stop_trigger = triggers.EarlyStoppingTrigger(
            monitor='validation/main/loss', max_trigger=(args.epoch, 'epoch'))
    else:
        stop_trigger = (args.epoch, 'epoch')

    trainer = training.Trainer(updater, stop_trigger, out=args.out)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(
        extensions.Evaluator(test_iter,
                             model,
                             converter=convert_seq2,
                             device=args.gpu))

    # Take a best snapshot
    record_trigger = training.triggers.MaxValueTrigger(
        'validation/main/accuracy', (1, 'epoch'))
    trainer.extend(extensions.snapshot_object(model, 'best_model.npz'),
                   trigger=record_trigger)

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ]))

    # Print a progress bar to stdout
    # trainer.extend(extensions.ProgressBar())

    # Save vocabulary and model's setting
    if not os.path.isdir(args.out):
        os.mkdir(args.out)
    vocab_path = os.path.join(args.out, 'vocab.json')
    with open(vocab_path, 'w') as f:
        json.dump(vocab, f)
    model_path = os.path.join(args.out, 'best_model.npz')
    model_setup = args.__dict__
    model_setup['vocab_path'] = vocab_path
    model_setup['model_path'] = model_path
    model_setup['n_class'] = n_class
    model_setup['datetime'] = current_datetime
    with open(os.path.join(args.out, 'args.json'), 'w') as f:
        json.dump(args.__dict__, f)

    if args.save_init:
        chainer.serializers.save_npz(os.path.join(args.out, 'init_model.npz'),
                                     model)
        exit()

    # Run the training
    print('Start trainer.run: {}'.format(current_datetime))
    trainer.run()
    print('Elapsed_time: {}'.format(
        datetime.timedelta(seconds=time.time() - start)))