Example #1
 def _load_finetuned_model(self):
     var_list = find_trainable_variables('model', exclude='model/target')
     if self.target_dim is not None:
         var_list.extend(find_trainable_variables('model/target'))
     saver = tf.train.Saver(var_list=var_list)
     saver.restore(self.sess, os.path.join(self._load_from_file, SAVE_PREFIX))
     self._load_from_file = False
     self.is_trained = True
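The helper find_trainable_variables used throughout these examples is not shown on this page. A minimal sketch, assuming it simply collects the graph's trainable variables under a scope prefix and drops anything whose name begins with the exclude prefix, could look like this:

    import tensorflow as tf

    def find_trainable_variables(key, exclude=None):
        # All trainable variables whose name starts with the given scope prefix.
        variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=key)
        if exclude is not None:
            # Drop variables that live under the excluded sub-scope.
            variables = [v for v in variables if not v.name.startswith(exclude)]
        return variables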
Example #2
 def _init_from_pretrained(self, init_params):
     """ Load pre-trained weights into the tensors """
     pretrained_params = find_trainable_variables("model",
                                                  exclude="model/target")
     self.sess.run(tf.global_variables_initializer())
     self.sess.run(
         [p.assign(ip) for p, ip in zip(pretrained_params, init_params)])
Example #3
    def _load_base_model(self):
        """
        Load serialized base model parameters into tf Tensors
        """
        pretrained_params = find_trainable_variables('model',
                                                     exclude='model/clf')
        self._initialize_session()
        self.sess.run(tf.global_variables_initializer())

        with open(SHAPES_PATH) as shapes_file:
            shapes = json.load(shapes_file)
            # Cumulative element counts mark where each tensor ends in the
            # flat, concatenated parameter blob.
            offsets = np.cumsum([np.prod(shape) for shape in shapes])
            init_params = [np.load(PARAM_PATH.format(n)) for n in range(10)]
            init_params = np.split(np.concatenate(init_params, 0),
                                   offsets)[:-1]
            init_params = [
                param.reshape(shape)
                for param, shape in zip(init_params, shapes)
            ]
            # Truncate the position embeddings to the configured max length.
            init_params[0] = init_params[0][:self.config.max_length]
            # Randomly initialized embeddings for the encoder's special tokens.
            special_embed = (np.random.randn(len(self.encoder.special_tokens),
                                             self.config.n_embed) *
                             self.config.weight_stddev).astype(np.float32)
            # Assemble the full embedding matrix:
            # [token embeddings, special-token embeddings, position embeddings].
            init_params[0] = np.concatenate(
                [init_params[1], special_embed, init_params[0]], 0)
            del init_params[1]

            self.sess.run([
                p.assign(ip) for p, ip in zip(pretrained_params, init_params)
            ])
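The offset arithmetic above is easier to see in isolation. The following self-contained NumPy illustration uses hypothetical shapes (not the real shapes.json contents) to show how cumulative element counts split one flat parameter blob back into per-tensor arrays:

    import numpy as np

    # Hypothetical tensor shapes standing in for the contents of shapes.json.
    shapes = [[3, 4], [5], [2, 2, 2]]
    flat = np.arange(sum(np.prod(s) for s in shapes), dtype=np.float32)

    # Cumulative element counts mark where each tensor ends in the flat array.
    offsets = np.cumsum([np.prod(shape) for shape in shapes])
    # np.split yields one trailing empty chunk after the last offset, hence [:-1].
    params = np.split(flat, offsets)[:-1]
    params = [param.reshape(shape) for param, shape in zip(params, shapes)]

    assert [p.shape for p in params] == [(3, 4), (5,), (2, 2, 2)]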
Example #4
 def _save_fallback(self):
     with open(SHAPES_PATH) as shapes_file:
         shapes = json.load(shapes_file)
         offsets = np.cumsum([np.prod(shape) for shape in shapes])
         init_params = [np.load(PARAM_PATH.format(n)) for n in range(10)]
         init_params = np.split(np.concatenate(init_params, 0),
                                offsets)[:-1]
         init_params = [
             param.reshape(shape)
             for param, shape in zip(init_params, shapes)
         ]
         init_params[0] = np.load(
             os.path.join(os.path.dirname(__file__), "model",
                          "embeddings.npy"))
         del init_params[1]
     var_dict = dict(
         zip((var.name for var in find_trainable_variables(
             "model", exclude="model/target")), init_params))
     joblib.dump(var_dict, self.fallback_filename)
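For symmetry with _save_fallback, here is a hedged sketch of how such a fallback file might be read back and pushed into the graph, reusing the assign pattern from Example #2 (the method name _load_fallback is an assumption, not part of the library):

    import joblib
    import tensorflow as tf

    def _load_fallback(self):
        # Mapping of variable name -> NumPy array, as written by _save_fallback.
        var_dict = joblib.load(self.fallback_filename)
        variables = find_trainable_variables("model", exclude="model/target")
        self.sess.run([
            var.assign(var_dict[var.name])
            for var in variables
            if var.name in var_dict
        ])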
Example #5
                lm_logits = language_model_state["logits"]
                aggregator["lm_model"].append(sample_with_temperature(lm_logits, self.config.lm_temp))

                if target_dim is not None:
                    target_model_state = self._target_model(
                        featurizer_state=featurizer_state,
                        targets=Y,
                        n_outputs=target_dim,
                        train=train,
                        reuse=do_reuse,
                        max_length=self.config.max_length
                    )
                    train_loss += (1 - lm_loss_coef) * tf.reduce_mean(target_model_state['losses'])
                    train_loss_tower += train_loss

                    params = find_trainable_variables("model")
                    grads = tf.gradients(train_loss, params)
                    grads = list(zip(grads, params))
                    gpu_grads.append(grads)
                    aggregator['logits'].append(target_model_state['logits'])
                    aggregator['clf_losses'].append(target_model_state['losses'])

        self.lm_predict_op = tf.concat(aggregator["lm_model"], 0)
        self.features = tf.concat(aggregator['features'], axis=0)
        self.lm_losses = tf.concat(aggregator['lm_losses'], axis=0)

        if target_dim is not None:
            self.logits = tf.concat(aggregator['logits'], axis=0)
            self.clf_losses = concat_or_stack(aggregator['clf_losses'])

            self.predict_op, self.predict_proba_op = self._predict_ops(
                self.logits, **target_model_state.get("predict_params", {}))
Example #6
    def _construct_graph(self, n_updates_total, target_dim=None, train=True):
        gpu_grads = []
        self.summaries = []

        # store whether or not graph was previously compiled with dropout
        self.train = train
        self._define_placeholders(target_dim=target_dim)

        aggregator = defaultdict(list)
        train_loss_tower = 0
        gpus = self.config.visible_gpus
        n_splits = max(len(gpus), 1)

        # Multi-GPU setup: using the CPU as the parameter server is most
        # efficient unless the system has direct GPU-to-GPU connections;
        # with a single GPU there is no need for a separate parameter server.
        params_device = 'cpu' if len(gpus) != 1 else gpus[0]

        # decide on setting for language model loss coefficient
        # if the language model loss does not contribute to overall loss,
        # remove the language model computation from the graph
        lm_loss_coef = self.config.lm_loss_coef
        if target_dim is None:
            lm_loss_coef = 1.0
        compile_lm = (train and lm_loss_coef > 0) or self.require_lm

        for i, (X, M, Y) in enumerate(soft_split(self.X, self.M, self.Y, n_splits=n_splits)):
            do_reuse = True if i > 0 else tf.AUTO_REUSE

            if gpus:
                device = tf.device(assign_to_gpu(gpus[i], params_device=params_device))
            else:
                device = tf.device('cpu')

            scope = tf.variable_scope(tf.get_variable_scope(), reuse=do_reuse)

            with device, scope:
                featurizer_state = featurizer(
                    X,
                    config=self.config,
                    encoder=self.encoder,
                    dropout_placeholder=self.do_dropout,
                    train=train,
                    reuse=do_reuse
                )

                if compile_lm:
                    language_model_state = language_model(
                        X=X,
                        M=M,
                        config=self.config,
                        embed_weights=featurizer_state['embed_weights'],
                        hidden=featurizer_state['sequence_features'],
                        reuse=do_reuse
                    )

                    train_loss = lm_loss_coef * tf.reduce_mean(language_model_state['losses'])
                    aggregator['lm_losses'].append(language_model_state['losses'])
                    lm_logits = language_model_state["logits"]
                    aggregator["lm_model"].append(sample_with_temperature(lm_logits, self.config.lm_temp))
                else:
                    train_loss = 0

                aggregator['features'].append(featurizer_state['features'])

                if target_dim is not None:
                    with tf.variable_scope('model/target'):
                        target_model_state = self._target_model(
                            featurizer_state=featurizer_state,
                            targets=Y,
                            n_outputs=target_dim,
                            train=train,
                            reuse=do_reuse,
                            max_length=self.config.max_length
                        )
                    train_loss += (1 - lm_loss_coef) * tf.reduce_mean(target_model_state['losses'])
                    train_loss_tower += train_loss

                    aggregator['logits'].append(target_model_state['logits'])
                    aggregator['target_losses'].append(target_model_state['losses'])

                params = find_trainable_variables("model")
                grads = tf.gradients(train_loss, params)
                grads = list(zip(grads, params))
                gpu_grads.append(grads)

        with tf.device(params_device):
            self.features = tf.concat(aggregator['features'], axis=0)

            if compile_lm:
                self.lm_predict_op = tf.concat(aggregator["lm_model"], 0)
                self.lm_losses = tf.concat(aggregator['lm_losses'], axis=0)
                self.lm_loss = tf.reduce_mean(self.lm_losses)
                self.summaries.append(tf.summary.scalar('LanguageModelLoss', self.lm_loss))

            if train:
                self._compile_train_op(
                    params=params,
                    grads=gpu_grads,
                    n_updates_total=n_updates_total
                )

            if target_dim is not None:
                self.logits = tf.concat(aggregator['logits'], axis=0)
                self.target_losses = concat_or_stack(aggregator['target_losses'])

                self.predict_op = self._predict_op(
                    self.logits, **target_model_state.get("predict_params", {})
                )
                self.predict_proba_op = self._predict_proba_op(
                    self.logits, **target_model_state.get("predict_params", {})
                )
                self.target_loss = tf.reduce_mean(self.target_losses)

                self.summaries.append(tf.summary.scalar('TargetModelLoss', self.target_loss))
                self.summaries.append(tf.summary.scalar('TotalLoss', train_loss_tower / n_splits))

            self.summaries = tf.summary.merge(self.summaries) if self.summaries else self.noop
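sample_with_temperature, which builds the language-model sampling op above, is another helper not shown here. Assuming the usual recipe of dividing the logits by the temperature and drawing from the resulting categorical distribution, a minimal TF1 sketch could be:

    import tensorflow as tf

    def sample_with_temperature(logits, temperature):
        """Draw one token id per position from temperature-scaled logits."""
        shape = tf.shape(logits)
        # Flatten leading dimensions so tf.multinomial sees [batch, vocab].
        flat = tf.reshape(logits, [-1, shape[-1]]) / temperature
        samples = tf.multinomial(flat, num_samples=1)
        return tf.reshape(samples, shape[:-1])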
Example #7
    def _construct_graph(self, n_updates_total, target_dim=None, train=True, pre_trained_weights=None):
        gpu_grads = []
        self.summaries = []

        # store whether or not graph was previously compiled with dropout
        self.train = train
        self._define_placeholders(target_dim=target_dim)

        aggregator = defaultdict(list)
        train_loss_tower = 0
        gpus = self.config.visible_gpus
        n_splits = max(len(gpus), 1)

        # Multi-GPU setup: using the CPU as the parameter server is most
        # efficient unless the system has direct GPU-to-GPU connections;
        # with a single GPU there is no need for a separate parameter server.
        params_device = 'cpu' if len(gpus) != 1 else gpus[0]
        for i, (X, M, Y) in enumerate(soft_split(self.X, self.M, self.Y, n_splits=n_splits)):
            do_reuse = True if i > 0 else tf.AUTO_REUSE

            if gpus:
                device = tf.device(assign_to_gpu(gpus[i], params_device=params_device))
            else:
                device = tf.device('cpu')

            scope = tf.variable_scope(tf.get_variable_scope(), reuse=do_reuse)

            with device, scope:
                featurizer_state = featurizer(
                    X,
                    config=self.config,
                    encoder=self.encoder,
                    dropout_placeholder=self.do_dropout,
                    train=train,
                    reuse=do_reuse
                )
                language_model_state = language_model(
                    X=X,
                    M=M,
                    config=self.config,
                    embed_weights=featurizer_state['embed_weights'],
                    hidden=featurizer_state['sequence_features'],
                    reuse=do_reuse
                )

                lm_loss_coef = self.config.lm_loss_coef
                if target_dim is None:
                    lm_loss_coef = 1.0

                train_loss = lm_loss_coef * tf.reduce_mean(language_model_state['losses'])

                aggregator['features'].append(featurizer_state['features'])
                aggregator['lm_losses'].append(language_model_state['losses'])

                lm_logits = language_model_state["logits"]
                aggregator["lm_model"].append(sample_with_temperature(lm_logits, self.config.lm_temp))

                if target_dim is not None:
                    with tf.variable_scope('model/target'):
                        target_model_state = self._target_model(
                            featurizer_state=featurizer_state,
                            targets=Y,
                            n_outputs=target_dim,
                            train=train,
                            reuse=do_reuse,
                            max_length=self.config.max_length
                        )
                    train_loss += (1 - lm_loss_coef) * tf.reduce_mean(target_model_state['losses'])
                    train_loss_tower += train_loss

                    aggregator['logits'].append(target_model_state['logits'])
                    aggregator['target_losses'].append(target_model_state['losses'])

                params = find_trainable_variables("model")
                grads = tf.gradients(train_loss, params)
                grads = list(zip(grads, params))
                gpu_grads.append(grads)

        with tf.device(params_device):
            self.lm_predict_op = tf.concat(aggregator["lm_model"], 0)
            self.features = tf.concat(aggregator['features'], axis=0)
            self.lm_losses = tf.concat(aggregator['lm_losses'], axis=0)

            if train:
                self._compile_train_op(
                    params=params,
                    grads=gpu_grads,
                    n_updates_total=n_updates_total,
                    initial_params=pre_trained_weights
                )

            if target_dim is not None:
                self.logits = tf.concat(aggregator['logits'], axis=0)
                self.target_losses = concat_or_stack(aggregator['target_losses'])

                self.predict_op = self._predict_op(
                    self.logits, **target_model_state.get("predict_params", {})
                )
                self.predict_proba_op = self._predict_proba_op(
                    self.logits, **target_model_state.get("predict_params", {})
                )
                self.target_loss = tf.reduce_mean(self.target_losses)
                self.lm_loss = tf.reduce_mean(self.lm_losses)
                self.summaries.append(tf.summary.scalar('TargetModelLoss', self.target_loss))
                self.summaries.append(tf.summary.scalar('LanguageModelLoss', self.lm_loss))
                self.summaries.append(tf.summary.scalar('TotalLoss', train_loss_tower / n_splits))
            
            self.summaries = tf.summary.merge(self.summaries) if self.summaries else self.noop
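gpu_grads collects one list of (gradient, variable) pairs per tower; _compile_train_op (not shown) presumably averages these before applying an update. A minimal sketch of that averaging step, with tf.train.AdamOptimizer standing in for the library's own schedule-aware optimizer, might look like:

    import tensorflow as tf

    def average_tower_grads(gpu_grads):
        """Average each variable's gradients across the per-GPU (grad, var) lists."""
        averaged = []
        for grads_and_var in zip(*gpu_grads):
            grads = [g for g, _ in grads_and_var if g is not None]
            var = grads_and_var[0][1]
            if grads:
                averaged.append((tf.reduce_mean(tf.stack(grads), axis=0), var))
        return averaged

    # Hypothetical usage inside a _compile_train_op-style helper:
    # train_op = tf.train.AdamOptimizer(1e-4).apply_gradients(average_tower_grads(gpu_grads))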
Example #8
    def _construct_graph(self, n_updates_total, target_dim=None, train=True):
        gpu_grads = []
        self.summaries = []

        # store whether or not graph was previously compiled with dropout
        self.train = train
        self.target_dim = target_dim
        self._define_placeholders()

        aggregator = defaultdict(list)
        train_loss_tower = 0
        gpus = get_available_gpus(self.config)
        n_splits = max(len(gpus), 1)
        for i, (X, M, Y) in enumerate(
                soft_split(self.X, self.M, self.Y, n_splits=n_splits)):
            do_reuse = True if i > 0 else tf.AUTO_REUSE

            if gpus:
                device = tf.device(
                    assign_to_gpu(gpus[i], params_device=gpus[0]))
            else:
                device = tf.device('cpu')

            scope = tf.variable_scope(tf.get_variable_scope(), reuse=do_reuse)

            with device, scope:
                featurizer_state = featurizer(
                    X,
                    config=self.config,
                    encoder=self.encoder,
                    dropout_placeholder=self.do_dropout,
                    train=train,
                    reuse=do_reuse)
                language_model_state = language_model(
                    X=X,
                    M=M,
                    config=self.config,
                    embed_weights=featurizer_state['embed_weights'],
                    hidden=featurizer_state['sequence_features'],
                    reuse=do_reuse)

                lm_loss_coef = self.config.lm_loss_coef
                if target_dim is None:
                    lm_loss_coef = 1.0

                train_loss = lm_loss_coef * tf.reduce_mean(
                    language_model_state['losses'])

                aggregator['features'].append(featurizer_state['features'])
                aggregator['lm_losses'].append(language_model_state['losses'])

                lm_logits = language_model_state["logits"]
                aggregator["lm_model"].append(
                    sample_with_temperature(lm_logits, self.config.lm_temp))

                if target_dim is not None:
                    target_model_state = self._target_model(
                        featurizer_state=featurizer_state,
                        targets=Y,
                        n_outputs=target_dim,
                        train=train,
                        reuse=do_reuse,
                        max_length=self.config.max_length)
                    train_loss += (1 - lm_loss_coef) * tf.reduce_mean(
                        target_model_state['losses'])
                    train_loss_tower += train_loss

                    params = find_trainable_variables("model")
                    grads = tf.gradients(train_loss, params)
                    grads = list(zip(grads, params))
                    gpu_grads.append(grads)
                    aggregator['logits'].append(target_model_state['logits'])
                    aggregator['clf_losses'].append(
                        target_model_state['losses'])

        self.lm_predict_op = tf.concat(aggregator["lm_model"], 0)
        self.features = tf.concat(aggregator['features'], axis=0)
        self.lm_losses = tf.concat(aggregator['lm_losses'], axis=0)

        if target_dim is not None:
            self.logits = tf.concat(aggregator['logits'], axis=0)
            self.clf_losses = concat_or_stack(aggregator['clf_losses'])

            self.predict_op, self.predict_proba_op = self._predict_ops(
                self.logits, **target_model_state.get("predict_params", {}))
            self._compile_train_op(params=params,
                                   grads=gpu_grads,
                                   n_updates_total=n_updates_total)
            self.clf_loss = tf.reduce_mean(self.clf_losses)
            self.lm_loss = tf.reduce_mean(self.lm_losses)
            self.summaries.append(
                tf.summary.scalar('TargetModelLoss', self.clf_loss))
            self.summaries.append(
                tf.summary.scalar('LanguageModelLoss', self.lm_loss))
            self.summaries.append(
                tf.summary.scalar('TotalLoss', train_loss_tower / n_splits))
            self.summaries = tf.summary.merge(self.summaries)
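Finally, concat_or_stack, used to combine the per-tower loss tensors, is assumed here to concatenate tensors that already have a batch dimension and to stack scalar losses that do not; a sketch under that assumption:

    import tensorflow as tf

    def concat_or_stack(tensors, axis=0):
        # Rank-0 tensors cannot be concatenated, so stack them instead.
        if tensors[0].shape.ndims == 0:
            return tf.stack(tensors, axis=axis)
        return tf.concat(tensors, axis=axis)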