def _load_finetuned_model(self):
    """ Restore previously fine-tuned model weights from a saved checkpoint """
    var_list = find_trainable_variables('model', exclude='model/target')
    if self.target_dim is not None:
        var_list.extend(find_trainable_variables('model/target'))
    saver = tf.train.Saver(var_list=var_list)
    saver.restore(self.sess, os.path.join(self._load_from_file, SAVE_PREFIX))
    self._load_from_file = False
    self.is_trained = True

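# The loading/saving helpers in this file rely on `find_trainable_variables`
# to collect variables by scope. The library's actual implementation is not
# part of this excerpt; the following is only a minimal sketch under the
# assumption that it filters `tf.trainable_variables()` by name prefix, with
# an optional excluded sub-scope. The name `find_trainable_variables_sketch`
# is mine, not the library's.
def find_trainable_variables_sketch(scope, exclude=None):
    """Return trainable variables under `scope`, skipping those under `exclude`."""
    variables = [v for v in tf.trainable_variables() if v.name.startswith(scope)]
    if exclude is not None:
        variables = [v for v in variables if not v.name.startswith(exclude)]
    return variables
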
def _init_from_pretrained(self, init_params):
    """ Load pre-trained weights into the tensors """
    pretrained_params = find_trainable_variables("model", exclude="model/target")
    self.sess.run(tf.global_variables_initializer())
    self.sess.run(
        [p.assign(ip) for p, ip in zip(pretrained_params, init_params)])

def _load_base_model(self):
    """ Load serialized base model parameters into tf Tensors """
    pretrained_params = find_trainable_variables('model', exclude='model/clf')
    self._initialize_session()
    self.sess.run(tf.global_variables_initializer())

    with open(SHAPES_PATH) as shapes_file:
        shapes = json.load(shapes_file)
    offsets = np.cumsum([np.prod(shape) for shape in shapes])
    init_params = [np.load(PARAM_PATH.format(n)) for n in range(10)]
    init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
    init_params = [
        param.reshape(shape) for param, shape in zip(init_params, shapes)
    ]
    init_params[0] = init_params[0][:self.config.max_length]
    special_embed = (
        np.random.randn(len(self.encoder.special_tokens), self.config.n_embed)
        * self.config.weight_stddev
    ).astype(np.float32)
    init_params[0] = np.concatenate(
        [init_params[1], special_embed, init_params[0]], 0)
    del init_params[1]
    self.sess.run([
        p.assign(ip) for p, ip in zip(pretrained_params, init_params)
    ])

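# The shard-loading logic above concatenates the ten serialized .npy shards
# into one flat buffer and recovers the individual parameter tensors by
# splitting at cumulative shape sizes. Below is a small, self-contained numpy
# example of the same split-and-reshape pattern, using made-up shapes rather
# than the real checkpoint contents.
def _demo_split_and_reshape():
    # Hypothetical parameter shapes, standing in for the contents of SHAPES_PATH.
    shapes = [[3, 4], [5], [2, 2, 2]]
    # Flat buffer standing in for the concatenated parameter shards.
    flat = np.arange(sum(int(np.prod(s)) for s in shapes), dtype=np.float32)
    # Cumulative sizes give the split points; the trailing empty chunk is dropped.
    offsets = np.cumsum([np.prod(shape) for shape in shapes])
    params = np.split(flat, offsets)[:-1]
    params = [param.reshape(shape) for param, shape in zip(params, shapes)]
    assert [p.shape for p in params] == [(3, 4), (5,), (2, 2, 2)]
    return params
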
def _save_fallback(self):
    """ Dump base model weights to a {variable name: array} dict via joblib """
    with open(SHAPES_PATH) as shapes_file:
        shapes = json.load(shapes_file)
    offsets = np.cumsum([np.prod(shape) for shape in shapes])
    init_params = [np.load(PARAM_PATH.format(n)) for n in range(10)]
    init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
    init_params = [
        param.reshape(shape) for param, shape in zip(init_params, shapes)
    ]
    init_params[0] = np.load(
        os.path.join(os.path.dirname(__file__), "model", "embeddings.npy"))
    del init_params[1]
    var_dict = dict(
        zip((var.name for var in find_trainable_variables(
            "model", exclude="model/target")), init_params))
    joblib.dump(var_dict, self.fallback_filename)

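# `_save_fallback` writes a {variable name: array} dict with joblib. The
# library's loader for that file is not shown in this excerpt; a hypothetical
# inverse (the name `_load_fallback_sketch` is my assumption) could restore
# the weights by matching variable names, for example:
def _load_fallback_sketch(self):
    var_dict = joblib.load(self.fallback_filename)
    variables = find_trainable_variables("model", exclude="model/target")
    self.sess.run(tf.global_variables_initializer())
    self.sess.run([
        var.assign(var_dict[var.name])
        for var in variables
        if var.name in var_dict
    ])
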
def _construct_graph(self, n_updates_total, target_dim=None, train=True):
    gpu_grads = []
    self.summaries = []

    # store whether or not graph was previously compiled with dropout
    self.train = train
    self._define_placeholders(target_dim=target_dim)

    aggregator = defaultdict(list)
    train_loss_tower = 0
    gpus = self.config.visible_gpus
    n_splits = max(len(gpus), 1)

    # multi-GPU setup, using CPU as param server is most efficient unless system has direct GPU connections
    # single GPU, no need to use a different GPU as a parameter server
    params_device = 'cpu' if len(gpus) != 1 else gpus[0]

    # decide on setting for language model loss coefficient
    # if the language model loss does not contribute to overall loss,
    # remove the language model computation from the graph
    lm_loss_coef = self.config.lm_loss_coef
    if target_dim is None:
        lm_loss_coef = 1.0
    compile_lm = (train and lm_loss_coef > 0) or self.require_lm

    for i, (X, M, Y) in enumerate(soft_split(self.X, self.M, self.Y, n_splits=n_splits)):
        do_reuse = True if i > 0 else tf.AUTO_REUSE

        if gpus:
            device = tf.device(assign_to_gpu(gpus[i], params_device=params_device))
        else:
            device = tf.device('cpu')

        scope = tf.variable_scope(tf.get_variable_scope(), reuse=do_reuse)

        with device, scope:
            featurizer_state = featurizer(
                X,
                config=self.config,
                encoder=self.encoder,
                dropout_placeholder=self.do_dropout,
                train=train,
                reuse=do_reuse
            )

            if compile_lm:
                language_model_state = language_model(
                    X=X,
                    M=M,
                    config=self.config,
                    embed_weights=featurizer_state['embed_weights'],
                    hidden=featurizer_state['sequence_features'],
                    reuse=do_reuse
                )
                train_loss = lm_loss_coef * tf.reduce_mean(language_model_state['losses'])
                aggregator['lm_losses'].append(language_model_state['losses'])
                lm_logits = language_model_state["logits"]
                aggregator["lm_model"].append(sample_with_temperature(lm_logits, self.config.lm_temp))
            else:
                train_loss = 0

            aggregator['features'].append(featurizer_state['features'])

            if target_dim is not None:
                with tf.variable_scope('model/target'):
                    target_model_state = self._target_model(
                        featurizer_state=featurizer_state,
                        targets=Y,
                        n_outputs=target_dim,
                        train=train,
                        reuse=do_reuse,
                        max_length=self.config.max_length
                    )
                train_loss += (1 - lm_loss_coef) * tf.reduce_mean(target_model_state['losses'])
                train_loss_tower += train_loss

                aggregator['logits'].append(target_model_state['logits'])
                aggregator['target_losses'].append(target_model_state['losses'])

            params = find_trainable_variables("model")
            grads = tf.gradients(train_loss, params)
            grads = list(zip(grads, params))
            gpu_grads.append(grads)

    with tf.device(params_device):
        self.features = tf.concat(aggregator['features'], axis=0)

        if compile_lm:
            self.lm_predict_op = tf.concat(aggregator["lm_model"], 0)
            self.lm_losses = tf.concat(aggregator['lm_losses'], axis=0)
            self.lm_loss = tf.reduce_mean(self.lm_losses)
            self.summaries.append(tf.summary.scalar('LanguageModelLoss', self.lm_loss))

        if train:
            self._compile_train_op(
                params=params,
                grads=gpu_grads,
                n_updates_total=n_updates_total
            )

        if target_dim is not None:
            self.logits = tf.concat(aggregator['logits'], axis=0)
            self.target_losses = concat_or_stack(aggregator['target_losses'])
            self.predict_op = self._predict_op(
                self.logits, **target_model_state.get("predict_params", {})
            )
            self.predict_proba_op = self._predict_proba_op(
                self.logits, **target_model_state.get("predict_params", {})
            )
            self.target_loss = tf.reduce_mean(self.target_losses)
            self.summaries.append(tf.summary.scalar('TargetModelLoss', self.target_loss))

        self.summaries.append(tf.summary.scalar('TotalLoss', train_loss_tower / n_splits))
        self.summaries = tf.summary.merge(self.summaries) if self.summaries else self.noop

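# The language-model head above samples tokens via `sample_with_temperature`.
# Its real implementation is not included in this excerpt; the sketch below
# assumes it simply divides the logits by the temperature and draws from the
# resulting categorical distribution (TF 1.x API). The suffix `_sketch` marks
# it as an illustration, not the library's function.
def sample_with_temperature_sketch(logits, temperature):
    """Draw one token id per row from softmax(logits / temperature)."""
    # Flatten any leading dimensions so tf.multinomial sees a 2-D tensor.
    flat_logits = tf.reshape(logits, [-1, tf.shape(logits)[-1]])
    samples = tf.multinomial(flat_logits / temperature, num_samples=1)
    # Restore the original leading dimensions, dropping the vocab axis.
    return tf.reshape(samples, tf.shape(logits)[:-1])
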
def _construct_graph(self, n_updates_total, target_dim=None, train=True, pre_trained_weights=None):
    gpu_grads = []
    self.summaries = []

    # store whether or not graph was previously compiled with dropout
    self.train = train
    self._define_placeholders(target_dim=target_dim)

    aggregator = defaultdict(list)
    train_loss_tower = 0
    gpus = self.config.visible_gpus
    n_splits = max(len(gpus), 1)

    # multi-GPU setup, using CPU as param server is most efficient unless system has direct GPU connections
    # single GPU, no need to use a different GPU as a parameter server
    params_device = 'cpu' if len(gpus) != 1 else gpus[0]

    for i, (X, M, Y) in enumerate(soft_split(self.X, self.M, self.Y, n_splits=n_splits)):
        do_reuse = True if i > 0 else tf.AUTO_REUSE

        if gpus:
            device = tf.device(assign_to_gpu(gpus[i], params_device=params_device))
        else:
            device = tf.device('cpu')

        scope = tf.variable_scope(tf.get_variable_scope(), reuse=do_reuse)

        with device, scope:
            featurizer_state = featurizer(
                X,
                config=self.config,
                encoder=self.encoder,
                dropout_placeholder=self.do_dropout,
                train=train,
                reuse=do_reuse
            )
            language_model_state = language_model(
                X=X,
                M=M,
                config=self.config,
                embed_weights=featurizer_state['embed_weights'],
                hidden=featurizer_state['sequence_features'],
                reuse=do_reuse
            )

            lm_loss_coef = self.config.lm_loss_coef
            if target_dim is None:
                lm_loss_coef = 1.0

            train_loss = lm_loss_coef * tf.reduce_mean(language_model_state['losses'])

            aggregator['features'].append(featurizer_state['features'])
            aggregator['lm_losses'].append(language_model_state['losses'])

            lm_logits = language_model_state["logits"]
            aggregator["lm_model"].append(sample_with_temperature(lm_logits, self.config.lm_temp))

            if target_dim is not None:
                with tf.variable_scope('model/target'):
                    target_model_state = self._target_model(
                        featurizer_state=featurizer_state,
                        targets=Y,
                        n_outputs=target_dim,
                        train=train,
                        reuse=do_reuse,
                        max_length=self.config.max_length
                    )
                train_loss += (1 - lm_loss_coef) * tf.reduce_mean(target_model_state['losses'])
                train_loss_tower += train_loss

                aggregator['logits'].append(target_model_state['logits'])
                aggregator['target_losses'].append(target_model_state['losses'])

            params = find_trainable_variables("model")
            grads = tf.gradients(train_loss, params)
            grads = list(zip(grads, params))
            gpu_grads.append(grads)

    with tf.device(params_device):
        self.lm_predict_op = tf.concat(aggregator["lm_model"], 0)
        self.features = tf.concat(aggregator['features'], axis=0)
        self.lm_losses = tf.concat(aggregator['lm_losses'], axis=0)

        if train:
            self._compile_train_op(
                params=params,
                grads=gpu_grads,
                n_updates_total=n_updates_total,
                initial_params=pre_trained_weights
            )

        if target_dim is not None:
            self.logits = tf.concat(aggregator['logits'], axis=0)
            self.target_losses = concat_or_stack(aggregator['target_losses'])
            self.predict_op = self._predict_op(
                self.logits, **target_model_state.get("predict_params", {})
            )
            self.predict_proba_op = self._predict_proba_op(
                self.logits, **target_model_state.get("predict_params", {})
            )
            self.target_loss = tf.reduce_mean(self.target_losses)
            self.lm_loss = tf.reduce_mean(self.lm_losses)
            self.summaries.append(tf.summary.scalar('TargetModelLoss', self.target_loss))
            self.summaries.append(tf.summary.scalar('LanguageModelLoss', self.lm_loss))
            self.summaries.append(tf.summary.scalar('TotalLoss', train_loss_tower / n_splits))

        self.summaries = tf.summary.merge(self.summaries) if self.summaries else self.noop

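# Per-tower target losses are merged above with `concat_or_stack`. The helper
# itself is not part of this excerpt; below is only a minimal sketch, assuming
# it concatenates tensors that have a batch dimension and stacks per-tower
# scalars into a vector (the real helper may behave differently).
def concat_or_stack_sketch(tensors, axis=0):
    if tensors[0].shape.ndims == 0:
        return tf.stack(tensors, axis=axis)
    return tf.concat(tensors, axis=axis)
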
def _construct_graph(self, n_updates_total, target_dim=None, train=True):
    gpu_grads = []
    self.summaries = []

    # store whether or not graph was previously compiled with dropout
    self.train = train
    self.target_dim = target_dim
    self._define_placeholders()

    aggregator = defaultdict(list)
    train_loss_tower = 0
    gpus = get_available_gpus(self.config)
    n_splits = max(len(gpus), 1)

    for i, (X, M, Y) in enumerate(
            soft_split(self.X, self.M, self.Y, n_splits=n_splits)):
        do_reuse = True if i > 0 else tf.AUTO_REUSE

        if gpus:
            device = tf.device(
                assign_to_gpu(gpus[i], params_device=gpus[0]))
        else:
            device = tf.device('cpu')

        scope = tf.variable_scope(tf.get_variable_scope(), reuse=do_reuse)

        with device, scope:
            featurizer_state = featurizer(
                X,
                config=self.config,
                encoder=self.encoder,
                dropout_placeholder=self.do_dropout,
                train=train,
                reuse=do_reuse)
            language_model_state = language_model(
                X=X,
                M=M,
                config=self.config,
                embed_weights=featurizer_state['embed_weights'],
                hidden=featurizer_state['sequence_features'],
                reuse=do_reuse)

            lm_loss_coef = self.config.lm_loss_coef
            if target_dim is None:
                lm_loss_coef = 1.0

            train_loss = lm_loss_coef * tf.reduce_mean(
                language_model_state['losses'])

            aggregator['features'].append(featurizer_state['features'])
            aggregator['lm_losses'].append(language_model_state['losses'])

            lm_logits = language_model_state["logits"]
            aggregator["lm_model"].append(
                sample_with_temperature(lm_logits, self.config.lm_temp))

            if target_dim is not None:
                target_model_state = self._target_model(
                    featurizer_state=featurizer_state,
                    targets=Y,
                    n_outputs=target_dim,
                    train=train,
                    reuse=do_reuse,
                    max_length=self.config.max_length)
                train_loss += (1 - lm_loss_coef) * tf.reduce_mean(
                    target_model_state['losses'])
                train_loss_tower += train_loss

                params = find_trainable_variables("model")
                grads = tf.gradients(train_loss, params)
                grads = list(zip(grads, params))
                gpu_grads.append(grads)

                aggregator['logits'].append(target_model_state['logits'])
                aggregator['clf_losses'].append(
                    target_model_state['losses'])

    self.lm_predict_op = tf.concat(aggregator["lm_model"], 0)
    self.features = tf.concat(aggregator['features'], axis=0)
    self.lm_losses = tf.concat(aggregator['lm_losses'], axis=0)

    if target_dim is not None:
        self.logits = tf.concat(aggregator['logits'], axis=0)
        self.clf_losses = concat_or_stack(aggregator['clf_losses'])
        self.predict_op, self.predict_proba_op = self._predict_ops(
            self.logits,
            **target_model_state.get("predict_params", {}))
        self._compile_train_op(
            params=params,
            grads=gpu_grads,
            n_updates_total=n_updates_total)
        self.clf_loss = tf.reduce_mean(self.clf_losses)
        self.lm_loss = tf.reduce_mean(self.lm_losses)
        self.summaries.append(
            tf.summary.scalar('TargetModelLoss', self.clf_loss))
        self.summaries.append(
            tf.summary.scalar('LanguageModelLoss', self.lm_loss))
        self.summaries.append(
            tf.summary.scalar('TotalLoss', train_loss_tower / n_splits))
        self.summaries = tf.summary.merge(self.summaries)

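# Every variant of `_construct_graph` shards the batch across devices with
# `soft_split`. Its implementation is not shown in this excerpt; the sketch
# below assumes it simply slices the batch dimension of each tensor into
# `n_splits` roughly equal contiguous pieces (the real helper may pad or
# balance differently). The name `soft_split_sketch` is mine.
def soft_split_sketch(*tensors, n_splits=1):
    """Yield n_splits tuples, each holding the matching slice of every input tensor."""
    batch_size = tf.shape(tensors[0])[0]
    split_size = tf.cast(tf.ceil(batch_size / n_splits), tf.int32)
    for i in range(n_splits):
        start = i * split_size
        end = tf.minimum(start + split_size, batch_size)
        yield tuple(t[start:end] for t in tensors)
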