def load_embeddings(data_name: str, embedding_name: str, seed: int = None):
    """ Load an indexed word embedding object.

    This method first checks whether the given setup has previously been
    serialized and, if so, restores the object from file. Otherwise, a new
    object is constructed.
    """
    # The naming convention of the serialization path.
    if seed:
        pkl_path = os.path.join(
            build.BUILD_DIR, 'data',
            '{}.{}.seed{}.pkl'.format(data_name, embedding_name, seed))
    else:
        pkl_path = os.path.join(
            build.BUILD_DIR, 'data',
            '{}.{}.pkl'.format(data_name, embedding_name))
    if os.path.exists(pkl_path):
        log.info('Restore corpus-specific indexed word embedding from file: %s'
                 % pkl_path)
        with open(pkl_path, 'rb') as pkl_file:
            embeds = pickle.load(pkl_file)
    else:
        if seed:
            log.debug('Set numpy random seed to %d' % seed)
            np.random.seed(seed)
        log.info('Build corpus-specific indexed word embedding.')
        embeds = globals()[data_name].init_indexed_word_embedding(
            embed.init(embedding_name))
        # Serialize for reuse.
        log.info('Save corpus-specific indexed word embedding to file: %s.'
                 % pkl_path)
        os.makedirs(os.path.normpath(os.path.join(pkl_path, os.pardir)),
                    exist_ok=True)
        with open(pkl_path, 'wb') as pkl_file:
            pickle.dump(embeds, pkl_file, protocol=4)
    return embeds
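
# The loader above follows a compute-once / pickle / restore-later caching
# pattern. Below is a minimal, self-contained sketch of that pattern;
# `_build_expensive_object` and the cache path are hypothetical stand-ins used
# only for illustration, not part of this module's API.
def _cached_build_example(cache_path: str = '/tmp/example.cache.pkl'):
    import os
    import pickle

    def _build_expensive_object():
        # Placeholder for an expensive construction step.
        return {'vocab': ['<pad>', 'the', 'a'], 'dim': 300}

    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            obj = pickle.load(f)            # restore the serialized copy
    else:
        obj = _build_expensive_object()     # build once ...
        os.makedirs(os.path.dirname(cache_path), exist_ok=True)
        with open(cache_path, 'wb') as f:
            pickle.dump(obj, f, protocol=4)  # ... and serialize for next time
    return obj
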
def load_dataset(data_name: str, data_mode: str, embedding_name: str,
                 seed: int = None) -> Dataset:
    if seed:
        pkl_path = os.path.join(
            build.BUILD_DIR, 'data',
            '{}-{}.{}.seed{}.pkl'.format(data_name, data_mode, embedding_name,
                                         seed))
    else:
        pkl_path = os.path.join(
            build.BUILD_DIR, 'data',
            '{}-{}.{}.pkl'.format(data_name, data_mode, embedding_name))
    # Load the preprocessed data object from the pickle file if applicable.
    if os.path.exists(pkl_path):
        log.info('Restore %s %s dataset from file: %s' %
                 (data_name, data_mode, pkl_path))
        with open(pkl_path, 'rb') as pkl_file:
            dataset = pickle.load(pkl_file)
    else:
        log.info('Build %s %s dataset' % (data_name, data_mode))
        embedding = load_embeddings(data_name, embedding_name, seed)
        if seed:
            log.debug('Set numpy random seed to %d' % seed)
            np.random.seed(seed)
        dataset = globals()[data_name](data_mode,
                                       indexed_word_embedding=embedding)
        os.makedirs(os.path.normpath(os.path.join(pkl_path, os.pardir)),
                    exist_ok=True)
        log.info('Serialize %s %s dataset to file %s.' %
                 (data_mode, data_name, pkl_path))
        with open(pkl_path, 'wb') as pkl_file:
            pickle.dump(dataset, pkl_file, protocol=4)
    return dataset
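
# A hedged usage sketch of the two loaders above, using the dataset and
# embedding names that the entry points in this project default to ('SNLI',
# 'GloVe'); the seed value is arbitrary and for illustration only.
def _loader_usage_example():
    embeds = load_embeddings('SNLI', 'GloVe', seed=42)
    trainset = load_dataset('SNLI', 'train', 'GloVe', seed=42)
    return embeds, trainset
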
def _init_optimizer(optim_type: str,
                    learning_rate: float = None,
                    global_step: int = None,
                    decay_steps: int = None,
                    decay_rate: float = None,
                    **kwargs):
    use_decay = bool(decay_steps and decay_rate)
    if use_decay:
        learning_rate = tf.train.exponential_decay(learning_rate, global_step,
                                                   decay_steps, decay_rate)
    log.debug('Build %s%s' %
              (optim_type, ' with exponential decay' if use_decay else ''))
    return getattr(tf.train, optim_type)(learning_rate=learning_rate, **kwargs)
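
# `tf.train.exponential_decay` computes
#     decayed_lr = learning_rate * decay_rate ** (global_step / decay_steps)
# (continuous form; with `staircase=True` the exponent is floored). A small
# pure-Python sketch of that schedule with assumed, illustrative values:
def _exponential_decay_example():
    learning_rate, decay_rate, decay_steps = 0.1, 0.96, 1000
    for global_step in (0, 1000, 5000):
        decayed = learning_rate * decay_rate ** (global_step / decay_steps)
        print('step %5d -> lr %.6f' % (global_step, decayed))
    # step     0 -> lr 0.100000
    # step  1000 -> lr 0.096000
    # step  5000 -> lr 0.081537
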
def update(self, step_acc, global_step) -> None:
    if step_acc < self._max_acc + self.min_delta:
        self._no_update_cnt += 1
        log.debug('No update for %d consecutive times.' % self._no_update_cnt)
        if self._no_update_cnt >= self.patience:
            self._lr_val[self._idx] /= 2
            log.info('Halve the current learning rate to: %f' %
                     self._lr_val[self._idx])
            self._no_update_cnt = 0
    else:
        self._no_update_cnt = 0
    self._max_acc = max(self._max_acc, step_acc)
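
# The `update` rule above halves the learning rate once validation accuracy
# fails to improve by at least `min_delta` for `patience` consecutive checks.
# A self-contained simulation of the same rule with assumed values:
def _lr_halving_example():
    lr, max_acc, no_update_cnt = 0.5, 0.0, 0
    patience, min_delta = 3, 0.001
    for acc in (0.60, 0.70, 0.70, 0.70, 0.70, 0.71):
        if acc < max_acc + min_delta:
            no_update_cnt += 1
            if no_update_cnt >= patience:
                lr /= 2          # halve after `patience` stagnant checks
                no_update_cnt = 0
        else:
            no_update_cnt = 0
        max_acc = max(max_acc, acc)
    return lr  # 0.25: the rate was halved exactly once in this trace
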
def test(
        name: str,
        model_type: str,
        step: int = None,
        mode: str = 'test',
        data_seed: int = None,
        data_name: str = 'SNLI',
        data_embedding: str = 'GloVe',
        data_pad: bool = True,
        batch_size: int = 10,
        print_errors: bool = False,
        print_errors_limit: int = 10,
        **kwargs,
) -> None:
    model_path = build.get_model_path(name)
    model = getattr(nn, model_type)(embeddings=data.load_embeddings(
        data_name, data_embedding, data_seed), **kwargs)
    log.info(str(model))
    log.debug('Model parameters:\n\n\t' +
              '\n\t'.join(graph.print_trainable_variables().split('\n')))

    with tf.Session(config=_make_config()) as sess:
        dataset = data.load_dataset(data_name, mode, data_embedding, data_seed)
        data_iter, data_hd = _make_dataset_iterator(
            type_name='initializable_iterator',
            handle_name='data_handle',
            dataset=dataset,
            batch_size=batch_size,
            shuffle=False,
            pad=data_pad,
            session=sess)
        _restore_model(sess, model_path, step)

        y_preds, y_trues = [], []  # type: ignore
        sess.run(data_iter.initializer)
        while True:
            try:
                true, pred = sess.run(
                    [model.y, model.prediction],
                    feed_dict={
                        model.handle: data_hd,
                        model.keep_prob: 1.0,
                        model.is_training: False
                    })
                y_preds.extend(np.squeeze(pred).tolist())
                y_trues.extend(np.squeeze(true).tolist())
            except tf.errors.OutOfRangeError:
                break

        # Print the overall accuracy.
        print('Acc: %.4f' % sklearn.metrics.accuracy_score(y_trues, y_preds))

        # Print the confusion matrix.
        labels = list(
            sorted(data.SNLI.LABELS.keys(), key=lambda x: data.SNLI.LABELS[x]))
        cm = sklearn.metrics.confusion_matrix(y_trues,
                                              y_preds,
                                              labels=range(len(labels)))
        tmpl = '%15s ' * (len(labels) + 2)
        print(tmpl % tuple([''] + labels + ['']))
        corr = 0
        for i in range(len(labels)):
            stats = cm[i]
            prob = stats[i] / sum(stats)
            corr += stats[i]
            print(tmpl % tuple([labels[i]] + list(map(str, cm[i])) +
                               ['%.4f' % prob]))
        print(tmpl % tuple(['%d / %d' % (corr, len(y_trues))] +
                           [''] * len(labels) +
                           ['%.4f' % (corr / len(y_trues))]))

        # Print misclassified examples, up to `print_errors_limit` of them.
        if print_errors:
            tmpl = '\n%4d. Pred: %-20s True: %s\n      %s\n      %s'
            for i, (y_pred, y_true) in enumerate(zip(y_preds, y_trues)):
                if y_pred != y_true and print_errors_limit != 0:
                    s1 = ' '.join(dataset.x1_words[i])
                    s2 = ' '.join(dataset.x2_words[i])
                    l_pred = labels[y_pred]
                    l_true = labels[y_true]
                    print(tmpl % (i, l_pred, l_true, s1, s2))
                    print_errors_limit -= 1
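
# The table printed above is a confusion matrix whose rows are normalized by
# the true-class counts (i.e. per-class recall). A small self-contained sketch
# with toy labels, using the same sklearn call:
def _confusion_matrix_example():
    import sklearn.metrics
    y_true = [0, 0, 1, 1, 2, 2]
    y_pred = [0, 1, 1, 1, 2, 0]
    cm = sklearn.metrics.confusion_matrix(y_true, y_pred, labels=[0, 1, 2])
    # cm[i][j] counts samples of true class i predicted as class j:
    # [[1 1 0]
    #  [0 2 0]
    #  [1 0 1]]
    recalls = [row[i] / row.sum() for i, row in enumerate(cm)]
    return cm, recalls  # recalls == [0.5, 1.0, 0.5]
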
def _make_dataset(dataset: data.Dataset,
                  batch_size: int,
                  argument: bool = False,
                  bucket_boundaries: t.List[int] = [],
                  cache: bool = False,
                  shuffle: bool = True,
                  pad: bool = True,
                  shuffle_buffer_size: int = 40960,
                  prefetch_buffer_size: int = -1,
                  repeat_num: int = 1,
                  seed: int = None) -> tf.data.Dataset:
    """ Prepare a `tf.data.Dataset` for training or evaluation.

    Args:
        dataset: A dataset.
        batch_size: The batch size.
        shuffle_buffer_size: The buffer size for random shuffling. Random
            shuffling is disabled when `shuffle_buffer_size` is smaller than
            or equal to 1.
        prefetch_buffer_size: The buffer size for prefetching. When given a
            non-positive value, `prefetch_buffer_size` adapts to `batch_size`.
        repeat_num: The number of times the records in the dataset are
            repeated.
    """
    # Each record: x1, x2, y, len1, len2, char1, char2, temp1, temp2, tag1, tag2
    output_shapes = (
        [None], [None], [],       # x1, x2, y
        [], [],                   # len1, len2
        [None, nn.WORD_SEQ_LEN],  # char1
        [None, nn.WORD_SEQ_LEN],  # char2
        [None, 4], [None, 4],     # temp1, temp2
        [None], [None])           # type: tuple

    if argument:
        log.debug('Apply data augmentation')

    def gen():

        def to_ord_list(word):
            """ Convert the first `nn.WORD_SEQ_LEN` characters of the given
            word to their ordinal values. If the word has fewer characters,
            pad the list to that length with zeros. """
            out = list(map(ord, word))
            while len(out) < nn.WORD_SEQ_LEN:
                out.append(0)
            return out[:nn.WORD_SEQ_LEN]

        for x1, x2, y, w1, w2, (temp1, tag1), (temp2, tag2) in zip(
                dataset.x1_ids, dataset.x2_ids, dataset.labels,
                dataset.x1_words, dataset.x2_words, dataset.x1_feats,
                dataset.x2_feats):
            yield (x1, x2, y, len(x1), len(x2), list(map(to_ord_list, w1)),
                   list(map(to_ord_list, w2)), temp1, temp2, tag1, tag2)
            if argument:
                yield (x2, x1, y, len(x2), len(x1), list(map(to_ord_list, w2)),
                       list(map(to_ord_list, w1)), temp2, temp1, tag2, tag1)

    dset = tf.data.Dataset.from_generator(gen,
                                          output_types=(tf.int32,) * 11,
                                          output_shapes=output_shapes)

    if cache:
        log.debug('Cache dataset during computation')
        dset = dset.cache()
    else:
        log.debug('Do not cache dataset during computation')

    if shuffle and shuffle_buffer_size > 1:
        if tf.__version__ >= '1.6':
            dset = dset.apply(
                tf.contrib.data.shuffle_and_repeat(
                    buffer_size=shuffle_buffer_size,
                    count=repeat_num,
                    seed=seed))
        else:
            dset = (dset.shuffle(shuffle_buffer_size, seed=seed)
                        .repeat(repeat_num))
    else:
        dset = dset.repeat(repeat_num)

    # Pack records with similar lengths into the same batch.
    if pad:
        if bucket_boundaries:
            log.debug('Generate batches using '
                      'tf.contrib.data.bucket_by_sequence_length')
            dset = dset.apply(
                tf.contrib.data.bucket_by_sequence_length(
                    (lambda x1, x2, y, l1, l2, c1, c2, tmp1, tmp2, tag1, tag2:
                     tf.maximum(l1, l2)),
                    bucket_boundaries,
                    [batch_size] * (len(bucket_boundaries) + 1)))
        else:
            log.debug('Generate padded batches without bucketing')
            dset = dset.padded_batch(batch_size, padded_shapes=output_shapes)
    else:
        log.debug('Generate batches without padding input sequences')
        dset = dset.batch(batch_size)

    if prefetch_buffer_size <= 0:
        prefetch_buffer_size = 64 * batch_size
    return dset.prefetch(buffer_size=prefetch_buffer_size)
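
# Each word is mapped to a fixed-length list of character ordinals, truncated
# or zero-padded to `nn.WORD_SEQ_LEN`. A self-contained sketch of that
# encoding, assuming a word length of 16 purely for illustration:
def _char_encoding_example(word: str = 'cat', word_seq_len: int = 16):
    out = [ord(ch) for ch in word]
    out += [0] * (word_seq_len - len(out))  # zero-pad short words
    return out[:word_seq_len]               # truncate long words
    # _char_encoding_example('cat') ->
    # [99, 97, 116, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
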
def train(name: str,
          model_type: str,
          batch_size: int = 256,
          epoch_num: int = 200,
          keep_prob: float = 0.8,
          train_regex_list: t.Union[t.List[str], str] = None,
          optim_manager_type: str = 'NotChange',
          data_name: str = 'SNLI',
          data_embedding: str = 'GloVe',
          data_argument: bool = False,
          data_pad: bool = True,
          data_cache: bool = False,
          data_seed: int = None,
          record_every: int = 64000,
          validate_every: int = 640000,
          save_every: int = 6400000,
          restore_from: str = None,
          restore_step: int = None,
          profiling: bool = False,
          clip_norm: int = None,
          seed: int = None,
          debug: bool = False,
          **kwargs) -> None:
    # Data preparation
    model_path = build.get_model_path(name)
    shutil.rmtree(model_path, ignore_errors=True)  # remove previously trained model files

    # Network setup
    model = getattr(nn, model_type)(
        embeddings=data.load_embeddings(data_name, data_embedding, data_seed),
        **_select_kwargs_regex(kwargs, r'^optim[0-9]*_', invert=True))
    log.info(str(model))
    log.debug('Model parameters:\n\n\t' +
              '\n\t'.join(graph.print_trainable_variables().split('\n')))

    # Control randomization
    if seed:
        log.info('Set random seed for data shuffling and graph computation: %d'
                 % seed)
        tf.set_random_seed(seed)

    train_summary = _make_model_summary(model)

    with tf.Session(config=_make_config()) as sess:
        if debug:
            from tensorflow.python import debug as tf_debug
            sess = tf_debug.LocalCLIDebugWrapperSession(sess)

        dataset_opts = {
            'pad': data_pad,
            'batch_size': batch_size,
            'session': sess,
        }
        train_iter, train_hd = _make_dataset_iterator(
            type_name='one_shot_iterator',
            handle_name='train_handle',
            dataset=data.load_dataset(data_name, 'train', data_embedding,
                                      data_seed),
            argument=data_argument,
            bucket_boundaries=[20, 50],
            repeat_num=epoch_num,
            cache=data_cache,
            seed=seed,
            **dataset_opts)
        valid_iter, valid_hd = _make_dataset_iterator(
            type_name='initializable_iterator',
            handle_name='valid_handle',
            dataset=data.load_dataset(data_name, 'validation', data_embedding,
                                      data_seed),
            shuffle=False,
            cache=True,
            **dataset_opts)
        test_iter, test_hd = _make_dataset_iterator(
            type_name='initializable_iterator',
            handle_name='test_handle',
            dataset=data.load_dataset(data_name, 'test', data_embedding,
                                      data_seed),
            shuffle=False,
            cache=True,
            **dataset_opts)

        om = _make_optim_manager(optim_manager_type, model.loss, clip_norm,
                                 train_regex_list, kwargs)

        test_wtr = tf.summary.FileWriter(os.path.join(model_path, 'test'))
        train_wtr = tf.summary.FileWriter(os.path.join(model_path, 'train'),
                                          sess.graph)
        # Build a validation summary writer for each optimizer.
        valid_wtr = {}
        for optim in om.optims:
            valid_wtr[optim.get_name()] = tf.summary.FileWriter(
                os.path.join(model_path, 'valid-%s' % optim.get_name()))

        if restore_from:
            _copy_checkpoint(restore_from, model_path, restore_step)
            _restore_model(sess, model_path, restore_step)
            # Evaluate the pretrained model.
            step = restore_step
            _iterate_dataset(sess, model, valid_iter, valid_hd,
                             valid_wtr[om.optim.get_name()], step)
            _iterate_dataset(sess, model, test_iter, test_hd, test_wtr, step)
        else:
            sess.run(tf.global_variables_initializer())
            step = 0

        if profiling:
            _profile_and_exit(sess, model, om.optim_op, train_hd)

        pbar = tqdm.tqdm(total=save_every, desc='Train', unit=' inst')
        try:
            while True:
                feed_dict = {
                    model.handle: train_hd,
                    model.keep_prob: keep_prob,
                    model.is_training: True
                }
                if om.feed_lr:
                    feed_dict[om.lr_op] = om.lr_val

                if step % record_every == 0:
                    summary, _, loss = sess.run(
                        [train_summary, om.optim_op, model.loss],
                        feed_dict=feed_dict)
                    pbar.set_postfix(loss='{:.3f}'.format(loss))
                    train_wtr.add_summary(summary, step)
                else:
                    sess.run([om.optim_op], feed_dict=feed_dict)

                if step and step % validate_every == 0:
                    pbar.set_description('Valid')
                    valid_acc = _iterate_dataset(
                        sess, model, valid_iter, valid_hd,
                        valid_wtr[om.optim.get_name()], step)
                    # Update upon the validation performance.
                    om.update(valid_acc, step)
                    pbar.set_description('Test')
                    _iterate_dataset(sess, model, test_iter, test_hd, test_wtr,
                                     step)
                    pbar.set_description('Train')

                if step and step % save_every == 0:
                    save_path = _save_model(sess, model_path, step)
                    pbar.set_description(save_path)
                    pbar.update(batch_size)
                    pbar.close()
                    pbar = tqdm.tqdm(total=save_every, desc='Train',
                                     unit=' inst')
                else:
                    pbar.update(batch_size)

                step += batch_size
        except tf.errors.OutOfRangeError:
            save_path = _save_model(sess, model_path, step)
            pbar.set_description(save_path)
            log.info('Training finished!')
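
# `clip_norm` is handed to `_make_optim_manager`, whose implementation is not
# shown here. The sketch below illustrates the standard TF1 global-norm
# clipping pattern such a manager would typically apply; it is an assumption
# for illustration, not this project's actual optimizer code.
def _clipped_minimize_example(loss, learning_rate=0.001, clip_norm=5.0):
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    grads_and_vars = [(g, v) for g, v in optimizer.compute_gradients(loss)
                      if g is not None]
    grads, tvars = zip(*grads_and_vars)
    clipped, _ = tf.clip_by_global_norm(grads, clip_norm)  # rescale gradients
    return optimizer.apply_gradients(zip(clipped, tvars))  # training op
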
if __name__ == "__main__":
    # Disable the debugging INFO and WARNING information.
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    #os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

    # Testing reads arguments from both a file and the command line. Command
    # line arguments override the ones parsed from the file.
    mode = sys.argv[1]
    kwargs = parse.parse_args(sys.argv[1:])
    if 'file' in kwargs:
        # Use the file name as the default model name.
        fname = os.path.basename(kwargs['file'])  # type: ignore
        if 'name' not in kwargs:
            kwargs['name'] = fname[:fname.rfind('.')]  # type: ignore
        kwargs = {**parse.parse_yaml(kwargs['file'], mode=mode), **kwargs}
        del kwargs['file']

    log.debug('Input arguments:\n\n\t%s\n' %
              '\n\t'.join('%-25s %s' % (k + ':', v)
                          for k, v in kwargs.items()))

    locals()[mode](**kwargs)  # type: ignore
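
# Configuration above is merged with `{**file_args, **cli_args}`, where keys
# from the right-hand dict win. A minimal self-contained sketch of that
# precedence rule; the argument names are made up for illustration:
def _arg_merge_example():
    file_args = {'batch_size': 256, 'keep_prob': 0.8}
    cli_args = {'batch_size': 32}       # given on the command line
    merged = {**file_args, **cli_args}  # CLI value overrides the file value
    return merged                       # {'batch_size': 32, 'keep_prob': 0.8}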