Example #1
def load_embeddings(data_name: str, embedding_name: str, seed: int = None):
    """ Load a indexed word embedding object.
    
    This method will first check if the given setup have previously been
    serialized, restore the object from file. Otherwise, construct a new object.
    """
    # Build the serialization path following the naming convention.
    if seed:
        pkl_path = os.path.join(
            build.BUILD_DIR, 'data',
            '{}.{}.seed{}.pkl'.format(data_name, embedding_name, seed))
    else:
        pkl_path = os.path.join(build.BUILD_DIR, 'data',
                                '{}.{}.pkl'.format(data_name, embedding_name))
    if os.path.exists(pkl_path):
        log.info(
            'Restore corpus-specific indexed word embedding from file: %s' %
            pkl_path)
        with open(pkl_path, 'rb') as pkl_file:
            embeds = pickle.load(pkl_file)
    else:
        if seed:
            log.debug('Set numpy random seed to %d' % seed)
            np.random.seed(seed)
        log.info('Build corpus-specific indexed word embedding.')
        embeds = globals()[data_name].init_indexed_word_embedding(
            embed.init(embedding_name))
        # Serialize for reuse.
        log.info('Save corpus-specific indexed word embedding to file: %s.' %
                 pkl_path)
        os.makedirs(os.path.normpath(os.path.join(pkl_path, os.pardir)),
                    exist_ok=True)
        with open(pkl_path, 'wb') as pkl_file:
            pickle.dump(embeds, pkl_file, protocol=4)
    return embeds
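
The function above follows a plain cache-or-build pattern: look for a pickle on disk, otherwise build the object and serialize it for next time. A minimal standalone sketch of that pattern, assuming nothing beyond the standard library (`cached_build` and `build_fn` are illustrative names, not part of the original module):

import os
import pickle


def cached_build(pkl_path, build_fn):
    """Return the object cached at pkl_path; otherwise build, cache, return it."""
    if os.path.exists(pkl_path):
        with open(pkl_path, 'rb') as pkl_file:
            return pickle.load(pkl_file)
    obj = build_fn()
    os.makedirs(os.path.dirname(pkl_path) or '.', exist_ok=True)
    with open(pkl_path, 'wb') as pkl_file:
        pickle.dump(obj, pkl_file, protocol=4)
    return obj

# e.g. embeds = cached_build('build/data/SNLI.GloVe.pkl', build_embedding_fn)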
Example #2
def load_dataset(data_name: str,
                 data_mode: str,
                 embedding_name: str,
                 seed: int = None) -> Dataset:
    if seed:
        pkl_path = os.path.join(
            build.BUILD_DIR, 'data',
            '{}-{}.{}.seed{}.pkl'.format(data_name, data_mode, embedding_name,
                                         seed))
    else:
        pkl_path = os.path.join(
            build.BUILD_DIR, 'data',
            '{}-{}.{}.pkl'.format(data_name, data_mode, embedding_name))
    # Load preprocessed data object from pkl if applicable.
    if os.path.exists(pkl_path):
        log.info('Restore %s %s dataset from file: %s' %
                 (data_name, data_mode, pkl_path))
        with open(pkl_path, 'rb') as pkl_file:
            dataset = pickle.load(pkl_file)
    else:
        log.info('Build %s %s dataset' % (data_name, data_mode))
        embedding = load_embeddings(data_name, embedding_name, seed)
        if seed:
            log.debug('Set numpy random seed to %d' % seed)
            np.random.seed(seed)
        dataset = globals()[data_name](data_mode,
                                       indexed_word_embedding=embedding)
        os.makedirs(os.path.normpath(os.path.join(pkl_path, os.pardir)),
                    exist_ok=True)
        log.info('Serialize %s %s dataset to file %s.' %
                 (data_mode, data_name, pkl_path))
        with open(pkl_path, 'wb') as pkl_file:
            pickle.dump(dataset, pkl_file, protocol=4)
    return dataset
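
For reference, the cache-file names the two loaders above produce, assuming `build.BUILD_DIR` resolves to a directory named `build` (the real value comes from the project's build module):

import os

BUILD_DIR = 'build'  # stand-in for build.BUILD_DIR
print(os.path.join(BUILD_DIR, 'data',
                   '{}-{}.{}.pkl'.format('SNLI', 'train', 'GloVe')))
# build/data/SNLI-train.GloVe.pkl (on POSIX)
print(os.path.join(BUILD_DIR, 'data',
                   '{}-{}.{}.seed{}.pkl'.format('SNLI', 'train', 'GloVe', 42)))
# build/data/SNLI-train.GloVe.seed42.pkl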
Example #3
def _init_optimizer(optim_type: str,
                    learning_rate: float = None,
                    global_step: int = None,
                    decay_steps: int = None,
                    decay_rate: float = None,
                    **kwargs):
    if decay_steps and decay_rate:
        learning_rate = tf.train.exponential_decay(learning_rate, global_step,
                                                   decay_steps, decay_rate)
    init_kwargs = {'name': 'optimizer', 'learning_rate': learning_rate}
    log.debug('Build %s%s' % (optim_type, '' if isinstance(
        learning_rate, float) else ' with exponential decay'))
    return getattr(tf.train, optim_type)(**init_kwargs, **kwargs)
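
For intuition, `tf.train.exponential_decay` with `staircase=False` evaluates to `learning_rate * decay_rate ** (global_step / decay_steps)`. A plain-Python sketch of that schedule:

def exponential_decay(learning_rate, global_step, decay_steps, decay_rate):
    # Same formula as tf.train.exponential_decay with staircase=False.
    return learning_rate * decay_rate ** (global_step / decay_steps)

print(exponential_decay(0.001, 10000, 5000, 0.96))  # ~0.000922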
Example #4
    def update(self, step_acc, global_step) -> None:
        if step_acc < self._max_acc + self.min_delta:
            self._no_update_cnt += 1
            log.debug('No update for %d consecutive times.' %
                      self._no_update_cnt)
            if self._no_update_cnt >= self.patience:
                self._lr_val[self._idx] /= 2
                log.info('Halve the current learning rate to: %f' %
                         self._lr_val[self._idx])
                self._no_update_cnt = 0
        else:
            self._no_update_cnt = 0
        self._max_acc = max(self._max_acc, step_acc)
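
The method above implements patience-based halving: if accuracy fails to improve by at least `min_delta` for `patience` consecutive checks, the active learning rate is cut in half. A standalone sketch of the same logic (the class and attribute names here are illustrative, not the original class):

class HalveOnPlateau:
    def __init__(self, lr, patience=3, min_delta=1e-4):
        self.lr = lr
        self.patience = patience
        self.min_delta = min_delta
        self._max_acc = 0.0
        self._no_update_cnt = 0

    def update(self, step_acc):
        if step_acc < self._max_acc + self.min_delta:
            self._no_update_cnt += 1
            if self._no_update_cnt >= self.patience:
                self.lr /= 2  # halve the learning rate
                self._no_update_cnt = 0
        else:
            self._no_update_cnt = 0
        self._max_acc = max(self._max_acc, step_acc)
        return self.lr

sched = HalveOnPlateau(0.1)
print([sched.update(acc) for acc in (0.50, 0.50, 0.50, 0.50)])
# [0.1, 0.1, 0.1, 0.05] -- halved after `patience` stagnant checks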
Example #5
def test(
    name: str,
    model_type: str,
    step: int = None,
    mode: str = 'test',
    data_seed: int = None,
    data_name: str = 'SNLI',
    data_embedding: str = 'GloVe',
    data_pad: bool = True,
    batch_size: int = 10,
    print_errors: bool = False,
    print_errors_limit: int = 10,
    **kwargs,
) -> None:
    model_path = build.get_model_path(name)

    model = getattr(nn, model_type)(embeddings=data.load_embeddings(
        data_name, data_embedding, data_seed),
                                    **kwargs)
    log.info(str(model))
    log.debug('Model parameters:\n\n\t' +
              '\n\t'.join(graph.print_trainable_variables().split('\n')))

    with tf.Session(config=_make_config()) as sess:
        dataset = data.load_dataset(data_name, mode, data_embedding, data_seed)

        data_iter, data_hd = _make_dataset_iterator(
            type_name='initializable_iterator',
            handle_name='data_handle',
            dataset=dataset,
            batch_size=batch_size,
            shuffle=False,
            pad=data_pad,
            session=sess)

        _restore_model(sess, model_path, step)

        y_preds, y_trues = [], []  # type: ignore
        sess.run(data_iter.initializer)
        while True:
            try:
                true, pred = sess.run(
                    [model.y, model.prediction],
                    feed_dict={
                        model.handle: data_hd,
                        model.keep_prob: 1.0,
                        model.is_training: False
                    })
                y_preds.extend(np.squeeze(pred).tolist())
                y_trues.extend(np.squeeze(true).tolist())
            except tf.errors.OutOfRangeError:
                break

    # Print accuracy
    print('Acc: %.4f' % sklearn.metrics.accuracy_score(y_trues, y_preds))

    # Print confusion matrix
    labels = list(
        sorted(data.SNLI.LABELS.keys(), key=lambda x: data.SNLI.LABELS[x]))
    cm = sklearn.metrics.confusion_matrix(y_trues,
                                          y_preds,
                                          labels=range(len(labels)))
    tmpl = '%15s ' * (len(labels) + 2)
    print(tmpl % tuple([''] + labels + ['']))
    corr = 0
    for i in range(len(labels)):
        stats = cm[i]
        prob = stats[i] / sum(stats)
        corr += stats[i]
        print(tmpl %
              tuple([labels[i]] + list(map(str, cm[i])) + ['%.4f' % prob]))
    print(tmpl % tuple(['%d / %d' % (corr, len(y_trues))] +
                       [''] * len(labels) + ['%.4f' % (corr / len(y_trues))]))

    # Print errors
    if print_errors:
        tmpl = '\n%4d. Pred: %-20s  True: %s\n      %s\n      %s'
        for i, (y_pred, y_true) in enumerate(zip(y_preds, y_trues)):
            if y_pred != y_true and print_errors_limit != 0:
                s1 = ' '.join(dataset.x1_words[i])
                s2 = ' '.join(dataset.x2_words[i])
                l_pred = labels[y_pred]
                l_true = labels[y_true]
                print(tmpl % (i, l_pred, l_true, s1, s2))
                print_errors_limit -= 1
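
The reporting section of `test` reduces to sklearn's accuracy and confusion-matrix utilities. A toy illustration with made-up predictions (the SNLI label order shown is only an assumption for the example):

import sklearn.metrics

labels = ['entailment', 'neutral', 'contradiction']  # assumed order
y_trues = [0, 1, 2, 2, 1, 0]                         # toy data
y_preds = [0, 1, 1, 2, 1, 0]
print('Acc: %.4f' % sklearn.metrics.accuracy_score(y_trues, y_preds))  # 0.8333
print(sklearn.metrics.confusion_matrix(y_trues, y_preds,
                                       labels=range(len(labels))))
# [[2 0 0]
#  [0 2 0]
#  [0 1 1]]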
Example #6
def _make_dataset(dataset: data.Dataset,
                  batch_size: int,
                  argument: bool = False,
                  bucket_boundaries: t.List[int] = [],
                  cache: bool = False,
                  shuffle: bool = True,
                  pad: bool = True,
                  shuffle_buffer_size: int = 40960,
                  prefetch_buffer_size: int = -1,
                  repeat_num: int = 1,
                  seed: int = None) -> tf.data.Dataset:
    """ Prepare a `tf.data.Dataset` for evaluation.

    Args:
        dataset: A dataset.
        batch_size: The batch size.
        shuffle_buffer_size: The buffer size for random shuffling. Disable
            random shuffling when `shuffle_buffer_size` is smaller than or equal
            to 1.
        prefetch_buffer_size: The buffer size for prefetching. This parameter is
            used, when shuffling is permitted. When given a non-positive value,
            `prefetch_buffer_size` will adapt to `batch_size`.
        repeat_time: The number of times the records in the dataset are
            repeated.
    """
    output_shapes = (
        [None],
        [None],
        [],  # x1, x2, y
        [],
        [],  # len1, len2 
        [None, nn.WORD_SEQ_LEN],  # char1
        [None, nn.WORD_SEQ_LEN],  # char2
        [None, 4],
        [None, 4],  # temp1, temp2
        [None],
        [None])  # type: tuple  # tag1, tag2
    if argument:
        log.debug('Apply data augmentation')

    def gen():
        def to_ord_list(word):
            """ Convert the first `nn.WORD_SEQ_LEN` (16) characters of the
            given word to their ordinal values. If the word has fewer
            characters, pad the list with zeros to that length. """
            out = list(map(ord, word))
            while len(out) < nn.WORD_SEQ_LEN:
                out.append(0)
            return out[:nn.WORD_SEQ_LEN]

        for x1, x2, y, w1, w2, (temp1, tag1), (temp2, tag2) in zip(
                dataset.x1_ids, dataset.x2_ids, dataset.labels,
                dataset.x1_words, dataset.x2_words, dataset.x1_feats,
                dataset.x2_feats):
            yield (x1, x2, y, len(x1), len(x2), list(map(to_ord_list, w1)),
                   list(map(to_ord_list, w2)), temp1, temp2, tag1, tag2)
            if argument:
                yield (x2, x1, y, len(x2), len(x1), list(map(to_ord_list, w2)),
                       list(map(to_ord_list, w1)), temp2, temp1, tag2, tag1)

    dset = tf.data.Dataset.from_generator(gen,
                                          output_types=(tf.int32, ) * 11,
                                          output_shapes=output_shapes)
    if cache:
        log.debug('Cache dataset during computation')
        dset = dset.cache()
    else:
        log.debug('Do not cache dataset during computation')

    if shuffle and shuffle_buffer_size > 1:
        if tf.__version__ >= '1.6':
            dset = dset.apply(
                tf.contrib.data.shuffle_and_repeat(
                    buffer_size=shuffle_buffer_size,
                    count=repeat_num,
                    seed=seed))
        else:
            dset = (dset.shuffle(shuffle_buffer_size,
                                 seed=seed).repeat(repeat_num))
    else:
        dset = dset.repeat(repeat_num)

    # Pack records with similar lengths into batches.
    if pad:
        if bucket_boundaries:
            log.debug('Generate batches using '
                      'tf.contrib.data.bucket_by_sequence_length')
            dset = dset.apply(
                tf.contrib.data.bucket_by_sequence_length(
                    (lambda x1, x2, y, l1, l2, c1, c2, tmp1, tmp2, tag1, tag2:
                     tf.maximum(l1, l2)), bucket_boundaries,
                    [batch_size] * (len(bucket_boundaries) + 1)))
        else:
            log.debug('Generate padded batches without bucketing')
            dset = dset.padded_batch(batch_size, padded_shapes=output_shapes)
    else:
        log.debug('Generate batches without padding input sequences')
        dset = dset.batch(batch_size)

    if prefetch_buffer_size <= 0:
        prefetch_buffer_size = 64 * batch_size
    return dset.prefetch(buffer_size=prefetch_buffer_size)
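
The character features used by the generator are just ordinal values padded or truncated to a fixed width. A standalone sketch, with `WORD_SEQ_LEN` hard-coded to 16 to mirror what the inner docstring says about `nn.WORD_SEQ_LEN`:

WORD_SEQ_LEN = 16  # stand-in for nn.WORD_SEQ_LEN


def to_ord_list(word):
    out = [ord(c) for c in word]            # character ordinals
    out += [0] * (WORD_SEQ_LEN - len(out))  # zero-pad short words
    return out[:WORD_SEQ_LEN]               # truncate long words

print(to_ord_list('premise'))
# [112, 114, 101, 109, 105, 115, 101, 0, 0, 0, 0, 0, 0, 0, 0, 0]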
Example #7
def train(name: str,
          model_type: str,
          batch_size: int = 256,
          epoch_num: int = 200,
          keep_prob: float = 0.8,
          train_regex_list: t.Union[t.List[str], str] = None,
          optim_manager_type: str = 'NotChange',
          data_name: str = 'SNLI',
          data_embedding: str = 'GloVe',
          data_argument: bool = False,
          data_pad: bool = True,
          data_cache: bool = False,
          data_seed: int = None,
          record_every: int = 64000,
          validate_every: int = 640000,
          save_every: int = 6400000,
          restore_from: str = None,
          restore_step: int = None,
          profiling: bool = False,
          clip_norm: int = None,
          seed: int = None,
          debug: bool = False,
          **kwargs) -> None:

    # Data preparation
    model_path = build.get_model_path(name)
    shutil.rmtree(model_path, ignore_errors=True)  # remove previously trained model

    # Network setup
    model = getattr(nn, model_type)(embeddings=data.load_embeddings(
        data_name, data_embedding, data_seed),
                                    **_select_kwargs_regex(kwargs,
                                                           r'^optim[0-9]*_',
                                                           invert=True))
    log.info(str(model))
    log.debug('Model parameters:\n\n\t' +
              '\n\t'.join(graph.print_trainable_variables().split('\n')))

    # Control randomization
    if seed:
        log.info(
            'Set random seed for data shuffling and graph computation: %d' %
            seed)
        tf.set_random_seed(seed)

    train_summary = _make_model_summary(model)

    with tf.Session(config=_make_config()) as sess:
        if debug:
            from tensorflow.python import debug as tf_debug
            sess = tf_debug.LocalCLIDebugWrapperSession(sess)

        dataset_opts = {
            'pad': data_pad,
            'batch_size': batch_size,
            'session': sess,
        }
        train_iter, train_hd = _make_dataset_iterator(
            type_name='one_shot_iterator',
            handle_name='train_handle',
            dataset=data.load_dataset(data_name, 'train', data_embedding,
                                      data_seed),
            argument=data_argument,
            bucket_boundaries=[20, 50],
            repeat_num=epoch_num,
            cache=data_cache,
            seed=seed,
            **dataset_opts)
        valid_iter, valid_hd = _make_dataset_iterator(
            type_name='initializable_iterator',
            handle_name='valid_handle',
            dataset=data.load_dataset(data_name, 'validation', data_embedding,
                                      data_seed),
            shuffle=False,
            cache=True,
            **dataset_opts)
        test_iter, test_hd = _make_dataset_iterator(
            type_name='initializable_iterator',
            handle_name='test_handle',
            dataset=data.load_dataset(data_name, 'test', data_embedding,
                                      data_seed),
            shuffle=False,
            cache=True,
            **dataset_opts)

        om = _make_optim_manager(optim_manager_type, model.loss, clip_norm,
                                 train_regex_list, kwargs)

        test_wtr = tf.summary.FileWriter(os.path.join(model_path, 'test'))
        train_wtr = tf.summary.FileWriter(os.path.join(model_path, 'train'),
                                          sess.graph)
        # Build a validation summary writer for each optimizer
        valid_wtr = {}
        for optim in om.optims:
            valid_wtr[optim.get_name()] = tf.summary.FileWriter(
                os.path.join(model_path, 'valid-%s' % optim.get_name()))

        if restore_from:
            _copy_checkpoint(restore_from, model_path, restore_step)
            _restore_model(sess, model_path, restore_step)
            # Evaluate the pretrained model
            step = restore_step
            _iterate_dataset(sess, model, valid_iter, valid_hd,
                             valid_wtr[om.optim.get_name()], step)
            _iterate_dataset(sess, model, test_iter, test_hd, test_wtr, step)
        else:
            sess.run(tf.global_variables_initializer())
            step = 0

        if profiling:
            _profile_and_exit(sess, model, om.optim_op, train_hd)

        pbar = tqdm.tqdm(total=save_every, desc='Train', unit=' inst')
        try:
            while True:
                feed_dict = {
                    model.handle: train_hd,
                    model.keep_prob: keep_prob,
                    model.is_training: True
                }
                if om.feed_lr:
                    feed_dict[om.lr_op] = om.lr_val
                if step % record_every == 0:
                    summary, _, loss = sess.run(
                        [train_summary, om.optim_op, model.loss],
                        feed_dict=feed_dict)
                    pbar.set_postfix(loss='{:.3f}'.format(loss))
                    train_wtr.add_summary(summary, step)
                else:
                    sess.run([om.optim_op], feed_dict=feed_dict)

                if step and step % validate_every == 0:
                    pbar.set_description('Valid')
                    valid_acc = _iterate_dataset(
                        sess, model, valid_iter, valid_hd,
                        valid_wtr[om.optim.get_name()], step)
                    # Update based on the validation performance
                    om.update(valid_acc, step)
                    pbar.set_description('Test')
                    _iterate_dataset(sess, model, test_iter, test_hd, test_wtr,
                                     step)
                    pbar.set_description('Train')

                if step and step % save_every == 0:
                    save_path = _save_model(sess, model_path, step)
                    pbar.set_description(save_path)
                    pbar.update(batch_size)
                    pbar.close()
                    pbar = tqdm.tqdm(total=save_every,
                                     desc='Train',
                                     unit=' inst')
                else:
                    pbar.update(batch_size)

                step += batch_size

        except tf.errors.OutOfRangeError:
            save_path = _save_model(sess, model_path, step)
            pbar.set_description(save_path)
            log.info('Training finished!')
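
`_select_kwargs_regex` is called above but its definition is not among these examples. Judging from how it is used (splitting `optimN_*` options from model options), a plausible reconstruction could look like the sketch below; this is a guess, not the original helper:

import re
import typing as t


def _select_kwargs_regex(kwargs: t.Dict[str, t.Any], regex: str,
                         invert: bool = False) -> t.Dict[str, t.Any]:
    # Keep the kwargs whose keys match `regex`; with invert=True, keep the rest.
    return {k: v for k, v in kwargs.items()
            if bool(re.match(regex, k)) != invert}

opts = {'optim1_learning_rate': 0.1, 'hidden_size': 300}
print(_select_kwargs_regex(opts, r'^optim[0-9]*_'))               # optimizer options
print(_select_kwargs_regex(opts, r'^optim[0-9]*_', invert=True))  # model options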
Example #8
if __name__ == "__main__":
    # Suppress TensorFlow's INFO and WARNING log messages.
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    #os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

    # Arguments are read from both a YAML file and the command line.
    # Command-line arguments override the ones parsed from the file.
    mode = sys.argv[1]
    kwargs = parse.parse_args(sys.argv[1:])
    if 'file' in kwargs:
        # Use the file name as the default model name.
        fname = os.path.basename(kwargs['file'])  # type: ignore
        if 'name' not in kwargs:
            kwargs['name'] = fname[:fname.rfind('.')]  # type: ignore
        kwargs = {**parse.parse_yaml(kwargs['file'], mode=mode), **kwargs}
        del kwargs['file']

    log.debug('Input arguments:\n\n\t%s\n' %
              '\n\t'.join('%-25s %s' % (k + ':', v)
                          for k, v in kwargs.items()))

    locals()[mode](**kwargs)  # type: ignore
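
The `{**parse.parse_yaml(...), **kwargs}` merge above relies on the right-hand dict winning on duplicate keys, which is what lets command-line arguments override values from the YAML file:

file_args = {'batch_size': 256, 'name': 'from-file'}  # as if parsed from YAML
cli_args = {'name': 'from-cli'}                       # as if parsed from argv
print({**file_args, **cli_args})  # {'batch_size': 256, 'name': 'from-cli'}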