import os
import json
import shutil
import logging

import tensorflow as tf
from google.protobuf import text_format
from tensorflow.python.platform import gfile

# Project-local helpers referenced below (unzip_model, Seq2SeqModel,
# RNNTaggerModel, get_model_file, get_metric_cmp, create_trainer, listify)
# are assumed to be importable from the surrounding package.

logger = logging.getLogger(__name__)


def store_model(checkpoint_base, config_sha1, checkpoint_store, print_fn=print):
    checkpoint_base = unzip_model(checkpoint_base)
    mdir, mbase = os.path.split(checkpoint_base)
    mdir = mdir if mdir else "."
    if not os.path.exists(mdir):
        print_fn("no directory found for the model location: [{}], aborting command".format(mdir))
        return None

    mfiles = ["{}/{}".format(mdir, x) for x in os.listdir(mdir)
              if x.startswith(mbase + "-") or x.startswith(mbase + ".")]
    if not mfiles:
        print_fn("no model files found with base [{}] at location [{}], aborting command".format(mbase, mdir))
        return None

    model_loc_base = "{}/{}".format(checkpoint_store, config_sha1)
    if not os.path.exists(model_loc_base):
        os.makedirs(model_loc_base)

    # Existing versions are stored as numbered zips; the next version number
    # is one past the current maximum.
    dirs = [int(x[:-4]) for x in os.listdir(model_loc_base) if x.endswith(".zip") and x[:-4].isdigit()]
    new_dir = "1" if not dirs else str(max(dirs) + 1)
    model_loc = "{}/{}".format(model_loc_base, new_dir)
    os.makedirs(model_loc)

    for mfile in mfiles:
        shutil.copy(mfile, model_loc)
        print_fn("writing model file: [{}] to store: [{}]".format(mfile, model_loc))

    print_fn("zipping model files")
    shutil.make_archive(base_name=model_loc, format='zip', root_dir=model_loc_base, base_dir=new_dir)
    shutil.rmtree(model_loc)
    print_fn("model files zipped and written")
    return model_loc + ".zip"
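
def _store_model_example():
    # Hypothetical usage sketch for store_model; the checkpoint path, config
    # hash, and store directory below are illustrative assumptions, not values
    # taken from this codebase.
    stored = store_model(checkpoint_base='./tagger-model-tf-12345',
                         config_sha1='0' * 40,
                         checkpoint_store='/data/model-store')
    if stored is not None:
        # e.g. /data/model-store/<config_sha1>/1.zip on the first call
        print('stored checkpoint at {}'.format(stored))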

def load(basename, **kwargs):
    basename = unzip_model(basename)
    with open(basename + '.state', 'r') as f:
        state = json.load(f)
        # FIXME: Need a single name for this. This is a total hack
        state["layers"] = state["nlayers"]

    with open(basename + '-1.vocab', 'r') as f:
        src_vocab_embed = json.load(f)

    with open(basename + '-2.vocab', 'r') as f:
        dst_vocab_embed = json.load(f)

    if 'predict' in kwargs:
        state['predict'] = kwargs['predict']

    if 'sampling' in kwargs:
        state['sampling'] = kwargs['sampling']

    if 'sampling_temp' in kwargs:
        state['sampling_temp'] = kwargs['sampling_temp']

    if 'beam' in kwargs:
        state['beam'] = kwargs['beam']

    state['sess'] = kwargs.get('sess', tf.Session())

    if 'model_type' in kwargs:
        state['model_type'] = kwargs['model_type']
    elif state['attn']:
        state['model_type'] = 'attn' if state['attn'] is True else 'default'

    model = Seq2SeqModel.create(src_vocab_embed, dst_vocab_embed, **state)
    do_init = kwargs.get('init', True)
    if do_init:
        init = tf.global_variables_initializer()
        model.sess.run(init)

    model.saver = tf.train.Saver()
    model.saver.restore(model.sess, basename + '.model')
    return model
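
def _load_seq2seq_example():
    # Hypothetical usage sketch: restore a saved seq2seq model for inference
    # with beam search. The basename is illustrative, and in the full package
    # each model type defines its own load() in its own module (the tagger
    # load() below would otherwise shadow this one).
    model = load('./seq2seq-model-tf-12345', predict=True, beam=5)
    return model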

def load(basename, **kwargs):
    basename = unzip_model(basename)
    model = RNNTaggerModel()
    model.sess = kwargs.get('sess', tf.Session())
    # Fall back to the basename if no (or an empty) checkpoint name was given
    checkpoint_name = kwargs.get('checkpoint_name') or basename

    with open(basename + '.state') as f:
        state = json.load(f)
        model.mxlen = state.get('mxlen', 100)
        model.maxw = state.get('maxw', 100)
        model.crf = bool(state.get('crf', False))
        model.crf_mask = bool(state.get('crf_mask', False))
        model.span_type = state.get('span_type')
        model.proj = bool(state.get('proj', False))

    with open(basename + '.saver') as fsv:
        saver_def = tf.train.SaverDef()
        text_format.Merge(fsv.read(), saver_def)

    with gfile.FastGFile(basename + '.graph', 'rb') as f:
        gd = tf.GraphDef()
        gd.ParseFromString(f.read())
        # as_default() returns a context manager and does nothing when merely
        # called, so enter it to import into the session's graph
        with model.sess.graph.as_default():
            tf.import_graph_def(gd, name='')

    model.sess.run(saver_def.restore_op_name,
                   {saver_def.filename_tensor_name: checkpoint_name})

    graph = model.sess.graph
    model.x = graph.get_tensor_by_name('x:0')
    model.xch = graph.get_tensor_by_name('xch:0')
    model.y = graph.get_tensor_by_name('y:0')
    model.lengths = graph.get_tensor_by_name('lengths:0')
    model.pkeep = graph.get_tensor_by_name('pkeep:0')
    model.best = graph.get_tensor_by_name('output/ArgMax:0')
    model.probs = graph.get_tensor_by_name('output/Reshape_1:0')  # TODO: rename

    # get_tensor_by_name raises KeyError when the tensor is absent
    try:
        model.A = graph.get_tensor_by_name('Loss/transitions:0')
        if not model.crf:
            print('Warning: meta-data says no CRF but model contains a transition matrix!')
            model.crf = True
    except KeyError:
        if model.crf is True:
            print('Warning: meta-data says there is a CRF but no transition matrix found!')
        model.A = None
        model.crf = False

    with open(basename + '.labels', 'r') as f:
        model.labels = json.load(f)

    model.word_vocab = {}
    if os.path.exists(basename + '-word.vocab'):
        with open(basename + '-word.vocab', 'r') as f:
            model.word_vocab = json.load(f)

    with open(basename + '-char.vocab', 'r') as f:
        model.char_vocab = json.load(f)

    model.saver = tf.train.Saver(saver_def=saver_def)
    return model
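
def _load_tagger_example():
    # Hypothetical usage sketch: restore a saved RNN tagger, optionally into a
    # caller-supplied session. The basename is illustrative.
    sess = tf.Session()
    model = load('./tagger-model-tf-12345', sess=sess)
    # model.crf reflects whether a transition matrix was found in the graph
    print(model.labels, model.crf)
    return model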

def fit(model, ts, vs, es=None, **kwargs):
    """
    Train a classifier using TensorFlow

    :param model: The model to train
    :param ts: A training data set
    :param vs: A validation data set
    :param es: A test data set, can be None
    :param kwargs: See below

    :Keyword Arguments:
        * *do_early_stopping* (``bool``) -- Stop after evaluation data is no longer improving. Defaults to True
        * *epochs* (``int``) -- How many epochs to train. Defaults to 20
        * *outfile* -- Model output file, defaults to classifier-model.pyth
        * *patience* -- How many epochs evaluation can fail to improve before we give up
        * *reporting* -- Callbacks which may be used on reporting updates
        * Additional arguments are supported, see :func:`baseline.tf.optimize` for a full list

    :return: The test metrics
    """
    do_early_stopping = bool(kwargs.get('do_early_stopping', True))
    verbose = kwargs.get('verbose', {'console': kwargs.get('verbose_console', False),
                                     'file': kwargs.get('verbose_file', None)})
    epochs = int(kwargs.get('epochs', 20))
    model_file = get_model_file('classify', 'tf', kwargs.get('basedir'))
    ema = kwargs.get('ema_decay') is not None
    output = kwargs.get('output')
    txts = kwargs.get('txts')

    best_metric = 0
    if do_early_stopping:
        early_stopping_metric = kwargs.get('early_stopping_metric', 'acc')
        early_stopping_cmp, best_metric = get_metric_cmp(early_stopping_metric, kwargs.get('early_stopping_cmp'))
        patience = kwargs.get('patience', epochs)
        logger.info('Doing early stopping on [%s] with patience [%d]', early_stopping_metric, patience)

    reporting_fns = listify(kwargs.get('reporting', []))
    logger.info('reporting %s', reporting_fns)

    trainer = create_trainer(model, **kwargs)
    tables = tf.tables_initializer()
    model.sess.run(tables)

    # With multi-GPU replicas, the embeddings feed comes from the first replica
    m = model.replicas[0] if hasattr(model, 'replicas') else model
    feed_dict = {k: v for e in m.embeddings.values() for k, v in e.get_feed_dict().items()}
    model.sess.run(tf.global_variables_initializer(), feed_dict)
    model.set_saver(tf.train.Saver())

    checkpoint = kwargs.get('checkpoint')
    if checkpoint is not None:
        checkpoint = unzip_model(checkpoint)
        model.saver.restore(model.sess, checkpoint)

    last_improved = 0
    for epoch in range(epochs):
        trainer.train(ts, reporting_fns)
        test_metrics = trainer.test(vs, reporting_fns, phase='Valid')

        if not do_early_stopping:
            trainer.checkpoint()
            trainer.model.save(model_file)
        elif early_stopping_cmp(test_metrics[early_stopping_metric], best_metric):
            last_improved = epoch
            best_metric = test_metrics[early_stopping_metric]
            logger.info('New best %.3f', best_metric)
            trainer.checkpoint()
            trainer.model.save(model_file)
        elif (epoch - last_improved) > patience:
            logger.info('Stopping due to persistent failures to improve')
            break

    if do_early_stopping:
        logger.info('Best performance on %s: %.3f at epoch %d', early_stopping_metric, best_metric, last_improved)

    if es is not None:
        logger.info('Reloading best checkpoint')
        trainer.recover_last_checkpoint()
        test_metrics = trainer.test(es, reporting_fns, phase='Test', verbose=verbose, output=output, txts=txts)
    return test_metrics
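
def _fit_example(model, ts, vs, es):
    # Hypothetical usage sketch: train with early stopping on F1 and give up
    # after 5 epochs without improvement. All argument values here are
    # illustrative assumptions.
    return fit(model, ts, vs, es,
               epochs=40,
               do_early_stopping=True,
               early_stopping_metric='f1',
               patience=5)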