Example #1
0
 def write_grids(self, grid, l0, s1, l2, final):
     batch_size, num_sample_sets, context_len, num_alt_utts = l0.shape
     # Integer strides into the flat grid: one block of alternative utterances per
     # batch item, subdivided into one block per sample set.
     stride = [
         len(grid) // batch_size,
         len(grid) // (batch_size * num_sample_sets)
     ]
     with gzip.open(config.get_file_path('grids.0.jsons.gz'),
                    'a') as outfile:
         for i in range(batch_size):
             final_dist = final[i, :].tolist()
             sample_sets = []
             for ss in range(num_sample_sets):
                 loc = i * stride[0] + ss * stride[1]
                 utts = [
                     inst.input for inst in grid[loc:loc + num_alt_utts]
                 ]
                 l0_grid = l0[i, ss, :, :].tolist()
                 s1_grid = s1[i, ss, :, :].tolist()
                 l2_grid = l2[i, ss, :, :].tolist()
                 sample_sets.append({
                     'utts': utts,
                     'L0': l0_grid,
                     'S1': s1_grid,
                     'L2': l2_grid
                 })
             json.dump({'final': final_dist, 'sets': sample_sets}, outfile)
             outfile.write('\n')
Example #2
0
    def predict_and_score(self, eval_instances, random=False, verbosity=0):
        predictions = []
        scores = []

        batches = iterators.gen_batches(eval_instances,
                                        batch_size=self.options.eval_batch_size)

        # Truncate the distribution dump so rows appended during this run start from an empty file.
        with gzip.open(config.get_file_path('dists.b64.gz'), 'w'):
            pass

        if self.options.verbosity + verbosity >= 1:
            progress.start_task('Eval batch', len(batches))

        for i, batch in enumerate(batches):
            if self.options.verbosity + verbosity >= 1:
                progress.progress(i)

            batch = list(batch)

            # Vectorize labels once and reuse them for both the feed dict and scoring.
            feed_dict = self.vectorize_inputs(batch)
            labels = self.vectorize_labels(batch)
            feed_dict.update(labels)
            output = self.run_predict(feed_dict)
            predictions_batch = self.output_to_preds(output, batch, sample=random)
            predictions.extend(predictions_batch)
            scores_batch = self.output_to_scores(output, labels)
            scores.extend(scores_batch)

        if self.options.verbosity + verbosity >= 1:
            progress.end_task()

        return predictions, scores
Example #3
0
    def output_to_preds(self, output, batch, sample='ignored'):
        card_names = [rank + suit for rank in cards_env.RANKS for suit in cards_env.SUITS]
        num_cards = len(card_names)
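        # The model output unpacks into per-card location scores with shape
        # (num_cards, NUM_LOCS + 2) and player 2 location scores with shape
        # (NUM_LOCS,), as the asserts below check.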
        card_loc_rows, p2_loc = output

        assert card_loc_rows.shape[1:] == (num_cards, NUM_LOCS + 2), card_loc_rows.shape
        assert p2_loc.shape[1:] == (NUM_LOCS,), p2_loc.shape

        with gzip.open(config.get_file_path('dists.b64.gz'), 'a') as outfile:
            for row in summarize_output(card_loc_rows, p2_loc):
                outfile.write(row)
                outfile.write('\n')

        # Greedy decoding: take the highest-scoring location index for each card
        # and for player 2.
        card_loc_indices = card_loc_rows.argmax(axis=2)
        p2_loc_indices = p2_loc.argmax(axis=1)

        preds = []
        for i, inst in enumerate(batch):
            cards_to_loc_pred = {
                name: loc_index_to_coord(idx, card=True)
                for name, idx in zip(card_names, card_loc_indices[i])
            }
            p2_loc_pred = loc_index_to_coord(p2_loc_indices[i])
            state = world.build_world(inst.input['walls'], cards_to_loc_pred, p2_loc=p2_loc_pred)
            preds.append(state.__dict__)
        return preds
Example #4
0
    def train(self, training_instances, validation_instances=None, metrics=None):
        id_tag = (self.id + ': ') if self.id else ''
        if self.options.verbosity >= 2:
            print(id_tag + 'Training priors')
        self.train_priors(training_instances, listener_data=self.options.listener)

        self.dataset = training_instances
        xs, ys = self._data_to_arrays(training_instances, init_vectorizer=True)
        self._build_model()

        if self.options.verbosity >= 2:
            print(id_tag + 'Training conditional model')
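        # Write per-iteration loss summaries to an events file only if the
        # configuration yields a path for one; otherwise skip summary logging.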
        summary_path = config.get_file_path('losses.tfevents')
        if summary_path:
            writer = summary.SummaryWriter(summary_path)
        else:
            writer = None
        progress.start_task('Iteration', self.options.train_iters)
        for iteration in range(self.options.train_iters):
            progress.progress(iteration)
            self.model.fit(xs, ys, batch_size=self.options.batch_size,
                           num_epochs=self.options.train_epochs,
                           summary_writer=writer, step=iteration * self.options.train_epochs)
            validation_results = self.validate(validation_instances, metrics, iteration=iteration)
            if writer is not None:
                step = (iteration + 1) * self.options.train_epochs
                self.on_iter_end(step, writer)
                for key, value in validation_results.iteritems():
                    tag = 'val/' + key.split('.', 1)[1].replace('.', '/')
                    writer.log_scalar(step, tag, value)
        if writer is not None:
            writer.flush()
        progress.end_task()
Example #5
0
def main():
    options = config.options()

    progress.set_resolution(datetime.timedelta(seconds=options.progress_tick))

    train_data = color_instances.SOURCES[options.data_source].train_data(
        listener=options.listener)[:options.train_size]
    if options.validation_size:
        assert options.validation_size < len(train_data), \
            ('No training data after validation split! (%d <= %d)' %
             (len(train_data), options.validation_size))
        validation_data = train_data[-options.validation_size:]
        train_data = train_data[:-options.validation_size]
    else:
        validation_data = None
    test_data = color_instances.SOURCES[options.data_source].test_data(
        options.listener)[:options.test_size]

    learner = learners.new(options.learner)

    m = [
        metrics.log_likelihood, metrics.log_likelihood_bits,
        metrics.perplexity, metrics.aic
    ]
    if options.listener and not isinstance(test_data[0].output,
                                           numbers.Integral):
        m.append(metrics.squared_error)
    elif isinstance(test_data[0].output, (tuple, list)):
        m.append(metrics.prec1)
        if test_data[0].output and isinstance(test_data[0].output, basestring):
            m.append(metrics.bleu)
    else:
        m.append(metrics.accuracy)
        if test_data[0].output and isinstance(test_data[0].output, basestring):
            m.append(metrics.bleu)

    if options.load:
        with open(options.load, 'rb') as infile:
            learner.load(infile)
    else:
        learner.train(train_data, validation_data, metrics=m)
        with open(config.get_file_path('model.p'), 'wb') as outfile:
            learner.dump(outfile)

        train_results = evaluate.evaluate(learner,
                                          train_data,
                                          metrics=m,
                                          split_id='train',
                                          write_data=options.output_train_data)
        output.output_results(train_results, 'train')

    test_results = evaluate.evaluate(learner,
                                     test_data,
                                     metrics=m,
                                     split_id='dev',
                                     write_data=options.output_test_data)
    output.output_results(test_results, 'dev')
Example #6
0
    def train(self, training_instances, validation_instances=None, metrics=None,
              keep_params=False):
        id_tag = (self.id + ': ') if self.id else ''
        if self.options.verbosity >= 2:
            print(id_tag + 'Training priors')
        self.train_priors(training_instances, listener_data=self.options.listener)

        self.dataset = training_instances
        xs, ys = self._data_to_arrays(training_instances,
                                      init_vectorizer=not hasattr(self, 'model'))
        if not hasattr(self, 'model') or not keep_params:
            if self.options.verbosity >= 2:
                print(id_tag + 'Building model')
            if keep_params:
                warnings.warn("keep_params was passed, but the model hasn't been built; "
                              "initializing all parameters.")
            self._build_model()
        else:
            if not hasattr(self.options, 'reset_optimizer_vars') or \
                    self.options.reset_optimizer_vars:
                if self.options.verbosity >= 2:
                    print(id_tag + 'Resetting optimizer')
                self.model.reset_optimizer()

        if self.options.verbosity >= 2:
            print(id_tag + 'Training conditional model')
        if hasattr(self, 'writer'):
            writer = self.writer
        else:
            summary_path = config.get_file_path('losses.tfevents')
            if summary_path:
                writer = summary.SummaryWriter(summary_path)
            else:
                writer = None
            self.writer = writer

        # step_base carries the global summary step across successive train() calls,
        # so logged steps keep increasing; it is advanced after the loop below.
        if not hasattr(self, 'step_base'):
            self.step_base = 0

        progress.start_task('Iteration', self.options.train_iters)
        for iteration in range(self.options.train_iters):
            progress.progress(iteration)
            self.model.fit(xs, ys, batch_size=self.options.batch_size,
                           num_epochs=self.options.train_epochs,
                           summary_writer=writer,
                           step=self.step_base + iteration * self.options.train_epochs)
            validation_results = self.validate(validation_instances, metrics, iteration=iteration)
            if writer is not None:
                step = self.step_base + (iteration + 1) * self.options.train_epochs
                self.on_iter_end(step, writer)
                for key, value in validation_results.iteritems():
                    tag = 'val/' + key.split('.', 1)[1].replace('.', '/')
                    writer.log_scalar(step, tag, value)

        self.step_base += self.options.train_iters * self.options.train_epochs
        if writer is not None:
            writer.flush()
        progress.end_task()
Example #7
0
    def output_to_preds(self, output, batch, sample='ignored'):
        assert output.shape[1:] == (NUM_LOCS,), output.shape

        with gzip.open(config.get_file_path('dists.b64.gz'), 'a') as outfile:
            for row in summarize_output(np.zeros((output.shape[0],
                                                  52, NUM_LOCS + 2)),
                                        output):
                outfile.write(row)
                outfile.write('\n')

        p2_loc_indices = output.argmax(axis=1)
        return [loc_index_to_coord(p2_loc_indices[i]) for i, inst in enumerate(batch)]
Example #8
0
def output_html_dists():
    options = config.options(read=True)
    with gzip.open(config.get_file_path('dists.b64.gz'), 'r') as infile:
        rows = list(infile)
    with config.open('dists.js', 'w') as outfile:
        write_json_dists(rows, outfile)
        write_json_ents(rows, outfile)

    with config.open('data.eval.jsons', 'r') as infile:
        insts = list(infile)
    with config.open('predictions.eval.jsons', 'r') as infile:
        preds = list(infile)
    try:
        with config.open('samples.eval.jsons', 'r') as infile:
            samples = list(infile)
    except IOError:
        samples = None
    with config.open('insts.js', 'w') as outfile:
        write_json_insts(insts, preds, samples, outfile, listener=options.listener)

    shutil.copy('dists.html', config.get_file_path('dists.html'))
Example #9
0
def main():
    options = config.options()

    progress.set_resolution(datetime.timedelta(seconds=options.progress_tick))

    train_data = color_instances.SOURCES[options.data_source].train_data(
        listener=options.listener
    )[:options.train_size]
    if options.validation_size:
        assert options.validation_size < len(train_data), \
            ('No training data after validation split! (%d <= %d)' %
             (len(train_data), options.validation_size))
        validation_data = train_data[-options.validation_size:]
        train_data = train_data[:-options.validation_size]
    else:
        validation_data = None
    test_data = color_instances.SOURCES[options.data_source].test_data(
        options.listener
    )[:options.test_size]

    learner = learners.new(options.learner)

    m = [metrics.log_likelihood,
         metrics.log_likelihood_bits,
         metrics.perplexity,
         metrics.aic]
    if options.listener and not isinstance(test_data[0].output, numbers.Integral):
        m.append(metrics.squared_error)
    elif isinstance(test_data[0].output, (tuple, list)):
        m.append(metrics.prec1)
        if test_data[0].output and isinstance(test_data[0].output, basestring):
            m.append(metrics.bleu)
    else:
        m.append(metrics.accuracy)
        if test_data[0].output and isinstance(test_data[0].output, basestring):
            m.append(metrics.bleu)

    if options.load:
        with open(options.load, 'rb') as infile:
            learner.load(infile)
    else:
        learner.train(train_data, validation_data, metrics=m)
        with open(config.get_file_path('model.p'), 'wb') as outfile:
            learner.dump(outfile)

        train_results = evaluate.evaluate(learner, train_data, metrics=m, split_id='train',
                                          write_data=options.output_train_data)
        output.output_results(train_results, 'train')

    test_results = evaluate.evaluate(learner, test_data, metrics=m, split_id='dev',
                                     write_data=options.output_test_data)
    output.output_results(test_results, 'dev')
Example #10
0
 def __init__(self, module, loss, optimizer, optimizer_params, vectorizer):
     self.get_options()
     self.module = cu(module)
     self.loss = cu(loss)
     self.optimizer_class = optimizer
     self.optimizer_params = optimizer_params
     self.build_optimizer()
     self.vectorizer = vectorizer
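     # Monitoring summaries get their own events file when the configuration
     # provides a path; otherwise no summary writer is created.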
     summary_path = config.get_file_path('monitoring.tfevents')
     if summary_path:
         self.summary_writer = summary.SummaryWriter(summary_path)
     else:
         self.summary_writer = None
     self.step = 0
     self.last_timestamp = datetime.datetime.now()
Example #11
0
    def output_to_preds(self, output, batch, sample=False):
        _, predictions, samples = output
        indices = samples if sample else predictions

        # Build a surrogate score grid for player 2's location: every cell gets a
        # large negative score (wall cells, assuming a 0/1 walls mask, even lower),
        # and the location given in the input is set to 0 so it dominates.
        p2_loc_arrays = -11.0 * (np.array([inst.input['walls'] for inst in batch]) + 2.0)
        for i, inst in enumerate(batch):
            p2_loc_arrays[i][inst.input['loc']] = 0.0
        p2_loc_linear = p2_loc_arrays.reshape([p2_loc_arrays.shape[0], NUM_LOCS])
        with gzip.open(config.get_file_path('dists.b64.gz'), 'a') as outfile:
            for row in summarize_output(np.zeros((p2_loc_linear.shape[0],
                                                  52, NUM_LOCS + 2)),
                                        p2_loc_linear):
                outfile.write(row)
                outfile.write('\n')

        return sanitize_preds(self.seq_vec.unvectorize_all(indices))
Example #12
0
 def write_grids(self, grid, l0, s1, l2, final):
     batch_size, num_sample_sets, context_len, num_alt_utts = l0.shape
     stride = [len(grid) // batch_size, len(grid) // (batch_size * num_sample_sets)]
     with gzip.open(config.get_file_path('grids.0.jsons.gz'), 'a') as outfile:
         for i in range(batch_size):
             final_dist = final[i, :].tolist()
             sample_sets = []
             for ss in range(num_sample_sets):
                 loc = i * stride[0] + ss * stride[1]
                 utts = [inst.input for inst in grid[loc:loc + num_alt_utts]]
                 l0_grid = l0[i, ss, :, :].tolist()
                 s1_grid = s1[i, ss, :, :].tolist()
                 l2_grid = l2[i, ss, :, :].tolist()
                 sample_sets.append({'utts': utts, 'L0': l0_grid, 'S1': s1_grid, 'L2': l2_grid})
             json.dump({'final': final_dist, 'sets': sample_sets}, outfile)
             outfile.write('\n')
Example #13
0
def main():
    options = config.options()

    progress.set_resolution(datetime.timedelta(seconds=options.progress_tick))

    train_size = options.train_size if options.train_size >= 0 else None
    test_size = options.test_size if options.test_size >= 0 else None
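    # (A negative size option means 'use everything': slicing with None keeps the full list.)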

    train_data = datasets.SOURCES[options.data_source].train_data()[:train_size]
    if options.validation_size:
        assert options.validation_size < len(train_data), \
            ('No training data after validation split! (%d <= %d)' %
             (len(train_data), options.validation_size))
        validation_data = train_data[-options.validation_size:]
        train_data = train_data[:-options.validation_size]
    else:
        validation_data = None
    test_data = datasets.SOURCES[options.data_source].test_data()[:test_size]

    learner = learners.new(options.learner)

    m = [metrics.METRICS[m] for m in options.metrics]

    if options.load:
        learner.load(options.load)
    else:
        learner.train(train_data, validation_data, metrics=m)
        model_path = config.get_file_path('model')
        if model_path:
            learner.dump(model_path)

        train_results = evaluate.evaluate(learner, train_data, metrics=m, split_id='train',
                                          write_data=options.output_train_data)
        output.output_results(train_results, 'train')

        if options.output_train_samples:
            samples = learner.predict(train_data, random=True)
            config.dump(samples, 'samples.train.jsons', lines=True)

    test_results = evaluate.evaluate(learner, test_data, metrics=m, split_id='eval',
                                     write_data=options.output_test_data)
    output.output_results(test_results, 'eval')

    if options.output_test_samples:
        samples = learner.predict(test_data, random=True)
        config.dump(samples, 'samples.eval.jsons', lines=True)
Example #14
0
def main():
    options = config.options()

    progress.set_resolution(datetime.timedelta(seconds=options.progress_tick))

    train_data = datasets.SOURCES[
        options.data_source].train_data()[:options.train_size]
    if options.validation_size:
        assert options.validation_size < len(train_data), \
            ('No training data after validation split! (%d <= %d)' %
             (len(train_data), options.validation_size))
        validation_data = train_data[-options.validation_size:]
        train_data = train_data[:-options.validation_size]
    else:
        validation_data = None
    test_data = datasets.SOURCES[
        options.data_source].test_data()[:options.test_size]

    learner = learners.new(options.learner)

    m = [metrics.METRICS[m] for m in options.metrics]

    if options.load:
        with open(options.load, 'rb') as infile:
            learner.load(infile)
    else:
        learner.train(train_data, validation_data, metrics=m)
        model_path = config.get_file_path('model.pkl')
        if model_path:
            with open(model_path, 'wb') as outfile:
                learner.dump(outfile)

        train_results = evaluate.evaluate(learner,
                                          train_data,
                                          metrics=m,
                                          split_id='train',
                                          write_data=options.output_train_data)
        output.output_results(train_results, 'train')

    test_results = evaluate.evaluate(learner,
                                     test_data,
                                     metrics=m,
                                     split_id='eval',
                                     write_data=options.output_test_data)
    output.output_results(test_results, 'eval')
Example #15
0
    def train(self, training_instances, validation_instances='ignored', metrics='ignored'):
        self.build_graph()
        env = gym.make(cards_env.register())

        self.init_params()

        if self.options.verbosity >= 1:
            progress.start_task('Epoch', self.options.pg_train_epochs)

        for epoch in range(self.options.pg_train_epochs):
            if self.options.verbosity >= 1:
                progress.progress(epoch)

            batches = iterators.iter_batches(training_instances,
                                             self.options.pg_batch_size)
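            # Ceiling division: enough batches to cover every training instance.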
            num_batches = (len(training_instances) - 1) // self.options.pg_batch_size + 1

            if self.options.verbosity >= 1:
                progress.start_task('Batch', num_batches)

            try:
                for batch_num, batch in enumerate(batches):
                    if self.options.verbosity >= 1:
                        progress.progress(batch_num)
                    step = epoch * num_batches + batch_num
                    self.train_one_batch(list(batch), env, t=step)
                    if step % 10 == 0:
                        check_prefix = config.get_file_path('checkpoint')
                        self.saver.save(self.session, check_prefix, global_step=step)
            except KeyboardInterrupt:
                self.summary_writer.flush()
                raise

            if self.options.verbosity >= 1:
                progress.end_task()

        if self.options.verbosity >= 1:
            progress.end_task()
Example #16
0
def main():
    options = config.options()

    progress.set_resolution(datetime.timedelta(seconds=options.progress_tick))

    train_data = datasets.SOURCES[options.data_source].train_data()[:options.train_size]
    if options.validation_size:
        assert options.validation_size < len(train_data), \
            ('No training data after validation split! (%d <= %d)' %
             (len(train_data), options.validation_size))
        validation_data = train_data[-options.validation_size:]
        train_data = train_data[:-options.validation_size]
    else:
        validation_data = None
    test_data = datasets.SOURCES[options.data_source].test_data()[:options.test_size]

    learner = learners.new(options.learner)

    m = [metrics.METRICS[m] for m in options.metrics]

    if options.load:
        with open(options.load, 'rb') as infile:
            learner.load(infile)
    else:
        learner.train(train_data, validation_data, metrics=m)
        model_path = config.get_file_path('model.pkl')
        if model_path:
            with open(model_path, 'wb') as outfile:
                learner.dump(outfile)

        train_results = evaluate.evaluate(learner, train_data, metrics=m, split_id='train',
                                          write_data=options.output_train_data)
        output.output_results(train_results, 'train')

    test_results = evaluate.evaluate(learner, test_data, metrics=m, split_id='eval',
                                     write_data=options.output_test_data)
    output.output_results(test_results, 'eval')
Example #17
0
def patch(model):
    def __quickpickle_setstate__(self, state):
        self.__dict__ = state

    def __quickpickle_getstate__(self):
        state = dict(self.__dict__)
        del state['__getstate__']
        del state['__setstate__']
        state['quickpickle'] = True
        return state

    def __quickpickle_numparams__(self):
        return self.quickpickle_numparams

    # Rebind the pickle hooks on the instance and record the parameter count;
    # __getstate__ drops the bound hooks so the pickled state stays a plain dict.
    model.__getstate__ = types.MethodType(__quickpickle_getstate__, model)
    model.__setstate__ = types.MethodType(__quickpickle_setstate__, model)
    model.quickpickle_numparams = model.num_params


if __name__ == '__main__':
    sys.setrecursionlimit(50000)
    options = config.options(read=True)
    if options.load:
        modelfile = options.load
    else:
        modelfile = config.get_file_path('model.p')
    with open(modelfile, 'rb') as infile, config.open('quickpickle.p', 'wb') as outfile:
        model = pickle.load(infile)
        patch(model)
        pickle.dump(model, outfile)
Example #18
0
def main():
    options = config.options()

    progress.set_resolution(datetime.timedelta(seconds=options.progress_tick))

    train_datasets = []
    validation_datasets = []
    test_datasets = []

    if len(options.train_size) == 1:
        options.train_size = options.train_size * len(options.data_source)
    else:
        assert len(options.train_size) == len(options.data_source)
    if len(options.validation_size) == 1:
        options.validation_size = options.validation_size * len(
            options.data_source)
    else:
        assert len(options.validation_size) == len(options.data_source)
    if len(options.test_size) == 1:
        options.test_size = options.test_size * len(options.data_source)
    else:
        assert len(options.test_size) == len(options.data_source)

    for source, train_size, validation_size, test_size in zip(
            options.data_source, options.train_size, options.validation_size,
            options.test_size):
        train_insts = color_instances.SOURCES[source].train_data(
            listener=options.listener)[:train_size]
        if validation_size:
            assert validation_size < len(train_insts), \
                ('No training data after validation split! (%d <= %d)' %
                 (len(train_insts), validation_size))
            validation_insts = train_insts[-validation_size:]
            validation_datasets.append(validation_insts)
            train_insts = train_insts[:-validation_size]
        else:
            validation_datasets.append(None)
        train_datasets.append(train_insts)
        test_insts = color_instances.SOURCES[source].test_data(
            options.listener)[:test_size]
        test_datasets.append(test_insts)

    learner = learners.new(options.learner)

    m = [
        metrics.log_likelihood, metrics.log_likelihood_bits,
        metrics.perplexity, metrics.aic
    ]
    example_inst = get_example_inst(test_datasets, train_datasets)
    if options.listener and not isinstance(example_inst.output,
                                           numbers.Integral):
        m.append(metrics.squared_error)
    elif isinstance(example_inst.output, (tuple, list)):
        m.append(metrics.prec1)
        if example_inst.output and isinstance(example_inst.output, basestring):
            m.extend([
                metrics.bleu, metrics.wer, metrics.token_perplexity_macro,
                metrics.token_perplexity_micro
            ])
    else:
        m.append(metrics.accuracy)
        if example_inst.output and isinstance(example_inst.output, basestring):
            m.extend([
                metrics.bleu, metrics.wer, metrics.token_perplexity_macro,
                metrics.token_perplexity_micro
            ])

    multi_train = (len(options.data_source) > 1)
    if options.load:
        with open(options.load, 'rb') as infile:
            learner.load(infile)

        train_results = None
    else:
        if hasattr(learner, '_data_to_arrays'):
            # XXX: is there a better way to ensure that the vocabulary is defined
            # before training starts?
            for train_insts in train_datasets[1:]:
                learner._data_to_arrays(train_insts, init_vectorizer=True)

        for i, (source, train_insts, validation_insts) in enumerate(
                zip(options.data_source, train_datasets, validation_datasets)):
            if not train_insts:
                continue

            if i > 0:
                learner.train(train_insts,
                              validation_insts,
                              metrics=m,
                              keep_params=True)
            else:
                learner.train(train_insts, validation_insts, metrics=m)
            with open(config.get_file_path('model.p'), 'wb') as outfile:
                learner.dump(outfile)

            if multi_train:
                split_id = 'train_' + source
            else:
                split_id = 'train'
            train_results = evaluate.evaluate(
                learner,
                train_insts,
                metrics=m,
                split_id=split_id,
                write_data=options.output_train_data)
            if options.verbosity != 0:
                output.output_results(train_results, split_id)

    for i, (source,
            test_insts) in enumerate(zip(options.data_source, test_datasets)):
        if not test_insts:
            continue
        if multi_train:
            split_id = 'eval_' + source
        else:
            split_id = 'eval'
        test_results = evaluate.evaluate(learner,
                                         test_insts,
                                         metrics=m,
                                         split_id=split_id,
                                         write_data=options.output_test_data)
        if options.verbosity != 0:
            output.output_results(test_results, split_id)

    return train_results, test_results
Example #19
0
def main():
    options = config.options()

    progress.set_resolution(datetime.timedelta(seconds=options.progress_tick))

    train_datasets = []
    validation_datasets = []
    test_datasets = []

    if len(options.train_size) == 1:
        options.train_size = options.train_size * len(options.data_source)
    else:
        assert len(options.train_size) == len(options.data_source)
    if len(options.validation_size) == 1:
        options.validation_size = options.validation_size * len(options.data_source)
    else:
        assert len(options.validation_size) == len(options.data_source)
    if len(options.test_size) == 1:
        options.test_size = options.test_size * len(options.data_source)
    else:
        assert len(options.test_size) == len(options.data_source)

    for source, train_size, validation_size, test_size in zip(options.data_source,
                                                              options.train_size,
                                                              options.validation_size,
                                                              options.test_size):
        train_insts = color_instances.SOURCES[source].train_data(
            listener=options.listener
        )[:train_size]
        if validation_size:
            assert validation_size < len(train_insts), \
                ('No training data after validation split! (%d <= %d)' %
                 (len(train_insts), validation_size))
            validation_insts = train_insts[-validation_size:]
            validation_datasets.append(validation_insts)
            train_insts = train_insts[:-validation_size]
        else:
            validation_datasets.append(None)
        train_datasets.append(train_insts)
        test_insts = color_instances.SOURCES[source].test_data(
            options.listener
        )[:test_size]
        test_datasets.append(test_insts)

    learner = learners.new(options.learner)

    m = [metrics.log_likelihood,
         metrics.log_likelihood_bits,
         metrics.perplexity,
         metrics.aic]
    example_inst = get_example_inst(test_datasets, train_datasets)
    if options.listener and not isinstance(example_inst.output, numbers.Integral):
        m.append(metrics.squared_error)
    elif isinstance(example_inst.output, (tuple, list)):
        m.append(metrics.prec1)
        if example_inst.output and isinstance(example_inst.output, basestring):
            m.extend([metrics.bleu, metrics.wer,
                      metrics.token_perplexity_macro, metrics.token_perplexity_micro])
    else:
        m.append(metrics.accuracy)
        if example_inst.output and isinstance(example_inst.output, basestring):
            m.extend([metrics.bleu, metrics.wer,
                      metrics.token_perplexity_macro, metrics.token_perplexity_micro])

    multi_train = (len(options.data_source) > 1)
    if options.load:
        with open(options.load, 'rb') as infile:
            learner.load(infile)

        train_results = None
    else:
        if hasattr(learner, '_data_to_arrays'):
            # XXX: is there a better way to ensure that the vocabulary is defined
            # before training starts?
            for train_insts in train_datasets[1:]:
                learner._data_to_arrays(train_insts, init_vectorizer=True)

        for i, (source, train_insts, validation_insts) in enumerate(zip(options.data_source,
                                                                        train_datasets,
                                                                        validation_datasets)):
            if not train_insts:
                continue

            if i > 0:
                learner.train(train_insts, validation_insts, metrics=m, keep_params=True)
            else:
                learner.train(train_insts, validation_insts, metrics=m)
            with open(config.get_file_path('model.p'), 'wb') as outfile:
                learner.dump(outfile)

            if multi_train:
                split_id = 'train_' + source
            else:
                split_id = 'train'
            train_results = evaluate.evaluate(learner, train_insts, metrics=m, split_id=split_id,
                                              write_data=options.output_train_data)
            if options.verbosity != 0:
                output.output_results(train_results, split_id)

    for i, (source, test_insts) in enumerate(zip(options.data_source,
                                                 test_datasets)):
        if not test_insts:
            continue
        if multi_train:
            split_id = 'eval_' + source
        else:
            split_id = 'eval'
        test_results = evaluate.evaluate(learner, test_insts, metrics=m, split_id=split_id,
                                         write_data=options.output_test_data)
        if options.verbosity != 0:
            output.output_results(test_results, split_id)

    return train_results, test_results
Example #20
0
def main():
    options = config.options()

    with thutils.device_context(options.device):
        progress.set_resolution(
            datetime.timedelta(seconds=options.progress_tick))

        SG = iterators.SizedGenerator
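        # SizedGenerator wraps a generator factory; with length=None its size is
        # presumably computed up front (hence 'Pre-calculating dataset sizes' below).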

        if not hasattr(options, 'verbosity') or options.verbosity >= 2:
            print('Pre-calculating dataset sizes')
        train_data = SG(lambda: islice(dataset(options.train_file), 0,
                                       nin(options.train_size)),
                        length=None)
        if not hasattr(options, 'verbosity') or options.verbosity >= 4:
            print('Training set size: {}'.format(len(train_data)))

        validation_data = None
        if options.validation_file:
            validation_data = SG(
                lambda: islice(dataset(options.validation_file), 0,
                               nin(options.validation_size)),
                length=None)
            if not hasattr(options, 'verbosity') or options.verbosity >= 4:
                print('Validation set size: {}'.format(len(validation_data)))

        eval_data = SG(lambda: islice(dataset(options.eval_file), 0,
                                      nin(options.eval_size)),
                       length=None)
        if not hasattr(options, 'verbosity') or options.verbosity >= 4:
            print('Eval set size: {}'.format(len(eval_data)))

        learner = learners.new(options.learner)

        m = [metrics.METRICS[m] for m in options.metrics]

        if options.load:
            with open(options.load, 'rb') as infile:
                learner.load(infile)
        else:
            learner.train(train_data, validation_data, metrics=m)
            model_path = config.get_file_path('model.pkl')
            if model_path:
                with open(model_path, 'wb') as outfile:
                    learner.dump(outfile)

            train_results = evaluate.evaluate(
                learner,
                train_data,
                metrics=m,
                split_id='train',
                write_data=options.output_train_data,
                pass_split=True)
            output.output_results(train_results, 'train')

        eval_results = evaluate.evaluate(learner,
                                         eval_data,
                                         metrics=m,
                                         split_id='eval',
                                         write_data=options.output_eval_data,
                                         pass_split=True)
        output.output_results(eval_results, 'eval')
Example #21
0
def patch(model):
    def __quickpickle_setstate__(self, state):
        self.__dict__ = state

    def __quickpickle_getstate__(self):
        state = dict(self.__dict__)
        del state['__getstate__']
        del state['__setstate__']
        state['quickpickle'] = True
        return state

    def __quickpickle_numparams__(self):
        return self.quickpickle_numparams

    model.__getstate__ = types.MethodType(__quickpickle_getstate__, model)
    model.__setstate__ = types.MethodType(__quickpickle_setstate__, model)
    model.quickpickle_numparams = model.num_params


if __name__ == '__main__':
    sys.setrecursionlimit(50000)
    options = config.options(read=True)
    if options.load:
        modelfile = options.load
    else:
        modelfile = config.get_file_path('model.p')
    with open(modelfile, 'rb') as infile, config.open('quickpickle.p',
                                                      'wb') as outfile:
        model = pickle.load(infile)
        patch(model)
        pickle.dump(model, outfile)