Example #1
File: trainer.py Project: rzcwade/nabu
    def _validate(self):
        '''
        get the validation loss

        returns:
            - the validation loss
            - an op to update the validation loss
            - the number of validation batches
        '''

        #create the evaluator
        evaltype = self.evaluatorconf.get('evaluator', 'evaluator')
        if evaltype == 'None':
            #no evaluator was configured, so there is nothing to validate
            return None

        evaluator = evaluator_factory.factory(evaltype)(
            conf=self.evaluatorconf,
            dataconf=self.dataconf,
            model=self.model)

        return evaluator.evaluate()
Example #2
File: trainer.py Project: mangataz/nabu
    def _validate(self):
        '''
        get the validation loss

        returns:
            - the validation loss for a batch
            - the number of validation batches
        '''

        #create the evaluator
        evaltype = self.evaluatorconf.get('evaluator', 'evaluator')
        if evaltype == 'None':
            #no evaluator was configured, so there is nothing to validate
            return None, 0

        evaluator = evaluator_factory.factory(evaltype)(
            conf=self.evaluatorconf,
            dataconf=self.dataconf,
            model=self.model)

        #compute the loss
        val_batch_loss, valbatches = evaluator.evaluate()

        return val_batch_loss, valbatches
Example #3
    def __init__(self, conf, tasksconf, dataconf, modelconf, evaluatorconf,
                 expdir, init_filename, server, task_index):
        '''
        NnetTrainer constructor, creates the training graph

        Args:
            conf: the trainer config
            tasksconf: the config file for each task
            dataconf: the data configuration as a ConfigParser
            modelconf: the neural net model configuration
            evaluatorconf: the evaluator configuration used for validation;
                if None, no evaluation will be done
            expdir: directory where the summaries will be written
            init_filename: filename of the network that should be used to
                initialize the model. Set to None if no network is
                available/wanted.
            server: optional server to be used for distributed training
            task_index: optional index of the worker task in the cluster
        '''

        self.expdir = expdir
        self.server = server
        self.conf = conf
        self.tasksconf = tasksconf
        self.task_index = task_index
        self.init_filename = init_filename

        self.batch_size = int(conf['batch_size'])

        cluster = tf.train.ClusterSpec(server.server_def.cluster)

        #create the graph
        self.graph = tf.Graph()

        #3 model types for multi-task training: single one to one; single one to many; multiple one to one
        #single one to one: the whole model is shared for all tasks, only the loss function can differ
        #single one to many: each task has a separate output so only part of the network is shared, e.g. everything but the output layer
        #multiple one to one: each task has its own network. Possibly the outputs are combined in a loss function

        #create the model
        modelfile = os.path.join(expdir, 'model', 'model.pkl')
        with open(modelfile, 'wb') as fid:
            self.model = model_factory.factory(
                modelconf.get('model', 'architecture'))(conf=modelconf)
            pickle.dump(self.model, fid)

        evaltype = evaluatorconf.get('evaluator', 'evaluator')

        #get the database configurations
        input_dataconfs = dict()
        target_dataconfs = dict()
        loss_computers = dict()
        nr_input_sections = dict()
        if evaltype != 'None':
            evaluators = dict()

        for task in self.conf['tasks'].split(' '):
            taskconf = self.tasksconf[task]

            #get the database configurations
            input_names = modelconf.get('io', 'inputs').split(' ')
            if input_names == ['']:
                input_names = []
            input_sections = [taskconf[i].split(' ') for i in input_names]
            nr_input_sections[task] = len(input_sections)
            task_input_dataconfs = []
            for sectionset in input_sections:
                task_input_dataconfs.append([])
                for section in sectionset:
                    task_input_dataconfs[-1].append(
                        dict(dataconf.items(section)))
            input_dataconfs[task] = task_input_dataconfs

            output_names = taskconf['targets'].split(' ')
            if output_names == ['']:
                output_names = []
            target_sections = [taskconf[o].split(' ') for o in output_names]
            task_target_dataconfs = []
            for sectionset in target_sections:
                task_target_dataconfs.append([])
                for section in sectionset:
                    task_target_dataconfs[-1].append(
                        dict(dataconf.items(section)))
            target_dataconfs[task] = task_target_dataconfs

            #create the loss computer
            loss_computer = loss_computer_factory.factory(
                taskconf['loss_type'])(self.batch_size)

            loss_computers[task] = loss_computer

            if evaltype != 'None':
                evaluator = evaluator_factory.factory(evaltype)(
                    conf=evaluatorconf,
                    dataconf=dataconf,
                    model=self.model,
                    task=task)

                evaluators[task] = evaluator

        if 'local' in cluster.as_dict():
            num_replicas = 1
            device = tf.DeviceSpec(job='local')
        else:
            #distributed training
            num_replicas = len(cluster.as_dict()['worker'])
            num_servers = len(cluster.as_dict()['ps'])
            ps_strategy = tf.contrib.training.GreedyLoadBalancingStrategy(
                num_tasks=num_servers,
                load_fn=tf.contrib.training.byte_size_load_fn)
            device = tf.train.replica_device_setter(ps_tasks=num_servers,
                                                    ps_strategy=ps_strategy)
            chief_ps = tf.DeviceSpec(job='ps', task=0)

        self.is_chief = task_index == 0

        #define the placeholders in the graph
        with self.graph.as_default():

            #create a local num_steps variable
            self.num_steps = tf.get_variable(
                name='num_steps',
                shape=[],
                dtype=tf.int32,
                initializer=tf.constant_initializer(0),
                trainable=False)

            #a variable to hold the number of steps already taken
            self.global_step = tf.get_variable(
                name='global_step',
                shape=[],
                dtype=tf.int32,
                initializer=tf.constant_initializer(0),
                trainable=False)

            should_terminate = tf.get_variable(
                name='should_terminate',
                shape=[],
                dtype=tf.bool,
                initializer=tf.constant_initializer(False),
                trainable=False)

            self.terminate = should_terminate.assign(True).op

            #create a check whether training should stop
            self.should_stop = tf.logical_or(
                tf.greater_equal(self.global_step, self.num_steps),
                should_terminate)

            with tf.device(device):
                data_queues = dict()
                num_steps = []
                done_ops = []
                for task in self.conf['tasks'].split(' '):

                    #check if running in distributed mode
                    if 'local' in cluster.as_dict():

                        #get the filenames
                        data_queue_elements, _ = input_pipeline.get_filenames(
                            input_dataconfs[task] + target_dataconfs[task])

                        #create the data queue and queue runners (the inputs were already shuffled beforehand, so shuffling is disabled here)
                        data_queue = tf.train.string_input_producer(
                            string_tensor=data_queue_elements,
                            shuffle=False,
                            seed=None,
                            capacity=self.batch_size * 2,
                            shared_name='data_queue_' + task)

                        data_queues[task] = data_queue

                        #compute the number of steps
                        if int(conf['numbatches_to_aggregate']) == 0:
                            task_num_steps = (int(conf['num_epochs']) *
                                              len(data_queue_elements) /
                                              self.batch_size)
                        else:
                            task_num_steps = (
                                int(conf['num_epochs']) *
                                len(data_queue_elements) /
                                (self.batch_size *
                                 int(conf['numbatches_to_aggregate'])))
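                        #e.g. with (hypothetical) num_epochs=10, 1000 filenames,
                        #batch_size=32 and numbatches_to_aggregate=4 this gives
                        #10*1000 // (32*4) = 78 steps (Python 2 integer division)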

                        #set the number of steps
                        num_steps.append(task_num_steps)
                        done_ops.append(tf.no_op())

                    else:
                        with tf.device(chief_ps):

                            #get the data queue
                            data_queue = tf.FIFOQueue(
                                capacity=self.batch_size * (num_replicas + 1),
                                shared_name='data_queue_' + task,
                                name='data_queue_' + task,
                                dtypes=[tf.string],
                                shapes=[[]])

                            data_queues[task] = data_queue

                            #get the number of steps from the parameter server
                            num_steps_queue = tf.FIFOQueue(
                                capacity=num_replicas,
                                dtypes=[tf.int32],
                                shared_name='num_steps_queue',
                                name='num_steps_queue',
                                shapes=[[]])

                            #set the number of steps
                            task_num_steps = num_steps_queue.dequeue()

                        #get the done queues
                        for i in range(num_servers):
                            with tf.device('job:ps/task:%d' % i):
                                done_queue = tf.FIFOQueue(
                                    capacity=num_replicas,
                                    dtypes=[tf.bool],
                                    shapes=[[]],
                                    shared_name='done_queue%d' % i,
                                    name='done_queue%d' % i)

                                done_ops.append(done_queue.enqueue(True))

                #use the smallest per-task step count as the overall number of steps
                self.set_num_steps = self.num_steps.assign(min(num_steps)).op
                #group the done ops so a single op signals that this worker has finished
                self.done = tf.group(*done_ops)

                #training part
                with tf.variable_scope('train'):

                    #a variable to scale the learning rate (used to reduce the
                    #learning rate in case validation performance drops)
                    learning_rate_fact = tf.get_variable(
                        name='learning_rate_fact',
                        shape=[],
                        initializer=tf.constant_initializer(1.0),
                        trainable=False)

                    #compute the learning rate with exponential decay and scale
                    #with the learning rate factor
                    self.learning_rate = (tf.train.exponential_decay(
                        learning_rate=float(conf['initial_learning_rate']),
                        global_step=self.global_step,
                        decay_steps=self.num_steps,
                        decay_rate=float(conf['learning_rate_decay'])) *
                                          learning_rate_fact)
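                    #i.e. learning_rate = initial_learning_rate *
                    #learning_rate_decay**(global_step / num_steps) * learning_rate_fact
                    #(tf.train.exponential_decay with the default staircase=False)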

                    #create the optimizer
                    optimizer = tf.train.AdamOptimizer(self.learning_rate)

                    self.total_loss = tf.get_variable(
                        name='total_loss',
                        shape=[],
                        dtype=tf.float32,
                        initializer=tf.constant_initializer(0),
                        trainable=False)

                    self.reset_loss = self.total_loss.assign(0.0)

                    loss = []

                    for task in self.conf['tasks'].split(' '):

                        with tf.variable_scope(task):

                            #create the input pipeline
                            data, seq_length = input_pipeline.input_pipeline(
                                data_queue=data_queues[task],
                                batch_size=self.batch_size,
                                numbuckets=int(conf['numbuckets']),
                                dataconfs=input_dataconfs[task] +
                                target_dataconfs[task])

                            inputs = {
                                input_names[i]: d
                                for i, d in enumerate(
                                    data[:nr_input_sections[task]])
                            }
                            seq_length = {
                                input_names[i]: d
                                for i, d in enumerate(
                                    seq_length[:nr_input_sections[task]])
                            }
                            targets = {
                                output_names[i]: d
                                for i, d in enumerate(
                                    data[nr_input_sections[task]:])
                            }
                            #target_seq_length = {
                            #output_names[i]: d
                            #for i, d in enumerate(seq_length[nr_input_sections[task]:])}

                            #compute the training outputs of the model
                            logits = self.model(inputs=inputs,
                                                input_seq_length=seq_length,
                                                is_training=True)

                            #TODO: The proper way to exploit data parallelism is via the
                            #SyncReplicasOptimizer defined below. However, for some reason it hangs
                            #and I have not yet found a solution for it. For the moment the gradients
                            #are accumulated in a way that does not allow data parallelism and there
                            #is no advantage to having multiple workers. (We also accumulate the loss)

                            #create an optimizer that aggregates gradients
                            #if int(conf['numbatches_to_aggregate']) > 0:
                            #optimizer = tf.train.SyncReplicasOptimizer(
                            #opt=optimizer,
                            #replicas_to_aggregate=int(
                            #conf['numbatches_to_aggregate'])#,
                            ##total_num_replicas=num_replicas
                            #)

                            #compute the loss
                            task_loss = loss_computers[task](targets, logits,
                                                             seq_length)

                            #append the task loss to the global loss
                            loss.append(task_loss)

                    #accumulate losses from tasks
                    with tf.variable_scope('accumulate_loss_from_tasks'):
                        loss = tf.reduce_mean(loss)

                    #accumulate losses from batches
                    self.acc_loss = self.total_loss.assign_add(loss)

                    ##compute the gradients
                    #grads_and_vars = optimizer.compute_gradients(self.loss)

                    #with tf.variable_scope('clip'):
                    #clip_value = float(conf['clip_grad_value'])
                    ##clip the gradients
                    #grads_and_vars = [(tf.clip_by_value(grad, -clip_value, clip_value), var)
                    #for grad, var in grads_and_vars]

                    self.params = tf.trainable_variables()

                    grads = [
                        tf.get_variable(param.op.name,
                                        param.get_shape().as_list(),
                                        initializer=tf.constant_initializer(0),
                                        trainable=False)
                        for param in self.params
                    ]

                    self.reset_grad = tf.variables_initializer(grads)

                    #compute the gradients
                    minibatch_grads_and_vars = optimizer.compute_gradients(
                        loss)

                    with tf.variable_scope('clip'):
                        clip_value = float(conf['clip_grad_value'])
                        #clip the gradients
                        minibatch_grads_and_vars = [
                            (tf.clip_by_value(grad, -clip_value,
                                              clip_value), var)
                            for grad, var in minibatch_grads_and_vars
                        ]

                    (minibatchgrads,
                     minibatchvars) = zip(*minibatch_grads_and_vars)

                    #update gradients by accumulating them
                    self.update_gradients = [
                        grad.assign_add(batchgrad)
                        for batchgrad, grad in zip(minibatchgrads, grads)
                    ]

                    #operation to apply the gradients
                    grads_and_vars = list(zip(grads, minibatchvars))
                    apply_gradients_op = optimizer.apply_gradients(
                        grads_and_vars=grads_and_vars,
                        global_step=self.global_step,
                        name='apply_gradients')

                    #all remaining operations in the UPDATE_OPS collection
                    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

                    #create an operation that applies the accumulated gradients
                    #and runs all other update ops
                    self.update_op = tf.group(*([apply_gradients_op] +
                                                update_ops),
                                              name='update')

                if evaltype != 'None':

                    #validation part
                    with tf.variable_scope('validate'):

                        #create a variable to hold the validation loss
                        self.validation_loss = tf.get_variable(
                            name='validation_loss',
                            shape=[],
                            dtype=tf.float32,
                            initializer=tf.constant_initializer(0),
                            trainable=False)

                        #create a variable to save the last step where the model
                        #was validated
                        validated_step = tf.get_variable(
                            name='validated_step',
                            shape=[],
                            dtype=tf.int32,
                            initializer=tf.constant_initializer(
                                -int(conf['valid_frequency'])),
                            trainable=False)

                        #a check if validation is due
                        self.should_validate = tf.greater_equal(
                            self.global_step - validated_step,
                            int(conf['valid_frequency']))
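                        #e.g. with (hypothetical) valid_frequency=1000 this becomes True once
                        #1000 or more steps have passed since the last validation; since
                        #validated_step is initialised to -valid_frequency, the first
                        #validation is already due at step 0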

                        val_batch_loss = []
                        valbatches = []

                        for task in self.conf['tasks'].split(' '):

                            with tf.variable_scope(task):

                                task_val_batch_loss, task_valbatches, _, _ = evaluators[
                                    task].evaluate()
                                val_batch_loss.append(task_val_batch_loss)
                                valbatches.append(task_valbatches)

                        val_batch_loss = tf.reduce_mean(val_batch_loss)
                        self.valbatches = min(valbatches)

                        self.update_loss = self.validation_loss.assign(
                            self.validation_loss +
                            val_batch_loss  #/self.valbatches
                        ).op

                        #update the learning rate factor
                        self.half_lr = learning_rate_fact.assign(
                            learning_rate_fact / 2).op

                        #create an operation to update the validated step
                        self.update_validated_step = validated_step.assign(
                            self.global_step).op

                        #variable to hold the best validation loss so far
                        self.best_validation = tf.get_variable(
                            name='best_validation',
                            shape=[],
                            dtype=tf.float32,
                            initializer=tf.constant_initializer(1.79e+308),
                            trainable=False)

                        #op to update the best validation loss
                        self.update_best = self.best_validation.assign(
                            self.validation_loss).op

                        #a variable that holds the number of workers waiting at the
                        #validation point
                        waiting_workers = tf.get_variable(
                            name='waiting_workers',
                            shape=[],
                            dtype=tf.int32,
                            initializer=tf.constant_initializer(0),
                            trainable=False)

                        #an operation to signal a waiting worker
                        self.waiting = waiting_workers.assign_add(1).op

                        #an operation to set the waiting workers to zero
                        self.reset_waiting = waiting_workers.initializer

                        #an operation to check if all workers are waiting
                        self.all_waiting = tf.equal(waiting_workers,
                                                    num_replicas - 1)

                        tf.summary.scalar('validation loss',
                                          self.validation_loss)

                else:
                    self.update_loss = None

                tf.summary.scalar('learning rate', self.learning_rate)

                #create a histogram for all trainable parameters
                for param in tf.trainable_variables():
                    tf.summary.histogram(param.name, param)

                #create the scaffold
                self.scaffold = tf.train.Scaffold()
Example #4
def test(expdir, test_model_checkpoint, task):
    """does everything for testing"""
    # read the database config file
    database_cfg = configparser.ConfigParser()
    database_cfg.read(os.path.join(expdir, 'database.cfg'))

    # read the model config file
    model_cfg = configparser.ConfigParser()
    model_cfg.read(os.path.join(expdir, 'model.cfg'))

    # read the evaluator config file
    evaluator_cfg = configparser.ConfigParser()
    evaluator_cfg.read(os.path.join(expdir, 'evaluator.cfg'))

    losses_cfg_file = os.path.join(expdir, 'loss.cfg')
    if not os.path.isfile(losses_cfg_file):
        warnings.warn(
            'In future versions it will be required to provide a loss config file',
            Warning)
        loss_cfg = None
    else:
        loss_cfg = configparser.ConfigParser()
        loss_cfg.read(losses_cfg_file)

    if evaluator_cfg.has_option(task, 'output_handling_type'):
        output_handling_type = evaluator_cfg.get(task, 'output_handling_type')
    else:
        output_handling_type = 'reconstructor'

    if output_handling_type == 'reconstructor':
        # read the reconstructor config file
        output_handler_cfg = configparser.ConfigParser()
        output_handler_cfg.read(os.path.join(expdir, 'reconstructor.cfg'))

        rec_dir = os.path.join(expdir, 'reconstructions', task)

        # read the scorer config file
        scorer_cfg = configparser.ConfigParser()
        scorer_cfg.read(os.path.join(expdir, 'scorer.cfg'))
    elif output_handling_type == 'speaker_verification':
        # read the speaker verification output handler config file
        output_handler_cfg = configparser.ConfigParser()
        output_handler_cfg.read(
            os.path.join(expdir, 'speaker_verification_handler.cfg'))

        store_dir = os.path.join(expdir, 'speaker_verification_data', task)

        # read the scorer config file
        scorer_cfg = configparser.ConfigParser()
        scorer_cfg.read(os.path.join(expdir,
                                     'speaker_verification_scorer.cfg'))

    else:
        raise Exception('Unknown output handling type: %s' %
                        output_handling_type)

    # read the postprocessor config file, if it exists
    try:
        postprocessor_cfg = configparser.ConfigParser()
        postprocessor_cfg.read(os.path.join(expdir, 'postprocessor.cfg'))
        if not postprocessor_cfg.sections():
            postprocessor_cfg = None
    except:
        postprocessor_cfg = None

    # load the model
    with open(os.path.join(expdir, 'model', 'model.pkl'), 'rb') as fid:
        models = pickle.load(fid)

    if \
     '/esat/spchtemp/scratch/jzegers/Nabu-SS2.0/Default17_MERL_DANet_Drude2018_sum_task_losses_sweep' in expdir or \
     '/esat/spchtemp/scratch/jzegers/Nabu-SS2.0/Default17_MERL_DANet_Drude2018_acc_step_norm_weights_sweep' in expdir:
        models['speaker_embeddings_model'].conf['no_bias'] = 'True'
        models['outlayer'].conf['no_bias'] = 'True'
        models['id_outlayer'].conf['no_bias'] = 'True'
        with open(os.path.join(expdir, 'model', 'model.pkl'), 'wb') as fid2:
            pickle.dump(models, fid2)
    elif \
     '/esat/spchtemp/scratch/jzegers/Nabu-SS2.0/Default17_SREMix_101trspks_DANet_hamming_scipy_Drude2018' in expdir:
        models['speaker_embeddings_model'].conf['no_bias'] = 'True'
        models['outlayer'].conf['no_bias'] = 'True'
        models['id_outlayer'].conf['no_bias'] = 'False'
        with open(os.path.join(expdir, 'model', 'model.pkl'), 'wb') as fid2:
            pickle.dump(models, fid2)

    if os.path.isfile(os.path.join(expdir, 'loss_%s' % task)):
        print('Already reconstructed all signals for task %s, going straight to scoring' % task)
        if evaluator_cfg.has_option(task, 'requested_utts'):
            requested_utts = int(evaluator_cfg.get(task, 'requested_utts'))
        else:
            requested_utts = int(
                evaluator_cfg.get('evaluator', 'requested_utts'))
        if evaluator_cfg.has_option(task, 'batch_size'):
            batch_size = int(evaluator_cfg.get(task, 'batch_size'))
        else:
            batch_size = int(evaluator_cfg.get('evaluator', 'batch_size'))
        numbatches = int(float(requested_utts) / float(batch_size))
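        # e.g. with (hypothetical) requested_utts=3000 and batch_size=25 this gives 120 batches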

    else:

        print('Evaluating task %s' % task)

        # create the evaluator
        if loss_cfg:
            loss_cfg = dict(
                loss_cfg.items(evaluator_cfg.get(task, 'loss_type')))
        evaltype = evaluator_cfg.get(task, 'evaluator')
        evaluator = evaluator_factory.factory(evaltype)(conf=evaluator_cfg,
                                                        lossconf=loss_cfg,
                                                        dataconf=database_cfg,
                                                        models=models,
                                                        task=task)

        checkpoint_dir = os.path.join(expdir, 'logdir_%s' % task)

        # create the output handler
        if output_handling_type == 'reconstructor':
            # create the reconstructor

            task_output_handler_cfg = dict(output_handler_cfg.items(task))
            reconstruct_type = task_output_handler_cfg['reconstruct_type']

            # whether the targets should be used to determine the optimal speaker permutation on frame level. Should
            # only be used for analysis and not for reporting results.
            if 'optimal_frame_permutation' in task_output_handler_cfg and \
             task_output_handler_cfg['optimal_frame_permutation'] == 'True':
                optimal_frame_permutation = True
            else:
                optimal_frame_permutation = False

            output_handler = reconstructor_factory.factory(reconstruct_type)(
                conf=task_output_handler_cfg,
                evalconf=evaluator_cfg,
                dataconf=database_cfg,
                rec_dir=rec_dir,
                task=task,
                optimal_frame_permutation=optimal_frame_permutation)

            if optimal_frame_permutation:
                opt_frame_perm_op = getattr(
                    output_handler, "reconstruct_signals_opt_frame_perm", None)
                if not callable(opt_frame_perm_op):
                    raise NotImplementedError(
                        'The "optimal_frame_permutation" flag was set while the function '
                        '"reconstruct_signals_opt_frame_perm" is not implemented in the reconstructor'
                    )

        elif output_handling_type == 'speaker_verification':
            task_output_handler_cfg = dict(output_handler_cfg.items(task))
            speaker_verification_handler_type = task_output_handler_cfg[
                'speaker_verification_handler_type']

            output_handler = speaker_verification_handler_factory.factory(
                speaker_verification_handler_type)(
                    conf=task_output_handler_cfg,
                    evalconf=evaluator_cfg,
                    dataconf=database_cfg,
                    store_dir=store_dir,
                    exp_dir=expdir,
                    task=task)

        else:
            raise Exception('Unknown output handling type: %s' %
                            output_handling_type)

        # create the graph
        with tf.Graph().as_default():

            # create a hook that will load the model
            load_hook = LoadAtBegin(test_model_checkpoint, models)

            # create a hook for summary writing
            # summary_hook = SummaryHook(os.path.join(expdir, 'logdir'))

            # create a hook that periodically saves a checkpoint so a long evaluation can be resumed
            saver_hook = tf.train.CheckpointSaverHook(
                checkpoint_dir=checkpoint_dir,
                save_steps=np.ceil(1000.0 / float(evaluator.batch_size)))

            config = tf.ConfigProto(intra_op_parallelism_threads=6,
                                    inter_op_parallelism_threads=2,
                                    device_count={
                                        'CPU': 8,
                                        'GPU': 0
                                    })

            options = tf.RunOptions()
            options.report_tensor_allocations_upon_oom = True

            # a counter for the current batch index; it is stored in the checkpoint so that
            # evaluation can resume where it left off
            current_batch_ind_tf = tf.get_variable(
                name='global_step',
                shape=[],
                dtype=tf.int32,
                initializer=tf.constant_initializer(0),
                trainable=False)
            current_batch_ind_inc_op = current_batch_ind_tf.assign_add(1)
            reset_current_batch_ind_op = current_batch_ind_tf.assign(0)

            # get the current batch_ind
            with tf.train.SingularMonitoredSession(
                    config=config, checkpoint_dir=checkpoint_dir) as sess:
                start_batch_ind = sess.run(current_batch_ind_tf)
                start_utt_ind = start_batch_ind * evaluator.batch_size
                output_handler.pos = start_utt_ind

            output_handler.open_scp_files(from_start=start_utt_ind == 0)

            # compute the loss
            batch_loss, batch_norm, numbatches, batch_outputs, batch_targets, batch_seq_length = evaluator.evaluate(
                start_utt_ind=start_utt_ind)

            # only keep the outputs requested by the reconstructor (usually the output of the output layer)
            batch_outputs = {
                out_name: out
                for out_name, out in batch_outputs.iteritems()
                if out_name in output_handler.requested_output_names
            }
            batch_seq_length = {
                seq_name: seq
                for seq_name, seq in batch_seq_length.iteritems()
                if seq_name in output_handler.requested_output_names
            }

            hooks = [load_hook]
            # hooks = [load_hook, summary_hook]
            if numbatches > 100:
                hooks.append(saver_hook)

            # start the session
            with tf.train.SingularMonitoredSession(
                    hooks=hooks, config=config,
                    checkpoint_dir=checkpoint_dir) as sess:

                loss = 0.0
                loss_norm = 0.0

                for batch_ind in range(start_batch_ind, numbatches):
                    print('evaluating batch number %d' % batch_ind)

                    last_time = time.time()
                    [
                        batch_loss_eval, batch_norm_eval, batch_outputs_eval,
                        batch_targets_eval, batch_seq_length_eval
                    ] = sess.run(fetches=[
                        batch_loss, batch_norm, batch_outputs, batch_targets,
                        batch_seq_length
                    ],
                                 options=options)

                    loss += batch_loss_eval
                    loss_norm += batch_norm_eval
                    print('%f' % (time.time() - last_time))
                    last_time = time.time()

                    if output_handling_type != 'reconstructor' or not optimal_frame_permutation:
                        output_handler(batch_outputs_eval,
                                       batch_seq_length_eval)
                    else:
                        output_handler.opt_frame_perm(batch_outputs_eval,
                                                      batch_targets_eval,
                                                      batch_seq_length_eval)

                    sess.run(current_batch_ind_inc_op)

                    print('%f' % (time.time() - last_time))

                loss = loss / loss_norm

        print('task %s: loss = %0.6g' % (task, loss))

        # write the loss to disk
        with open(os.path.join(expdir, 'loss_%s' % task), 'w') as fid:
            fid.write(str(loss))

        if hasattr(output_handler, 'scp_file'):
            output_handler.scp_fid.close()
        if hasattr(output_handler, 'masks_pointer_file'):
            output_handler.masks_pointer_fid.close()

        if os.path.isdir(checkpoint_dir):
            try:
                os.rmdir(checkpoint_dir)
            except:
                pass

    # from here on there is no need for a GPU anymore ==> score script to be run separately on
    # different machine?
    if evaluator_cfg.has_option(task, 'scorers_names'):
        scorers_names = evaluator_cfg.get(task, 'scorers_names').split(' ')
    else:
        scorers_names = [task]

    for scorer_name in scorers_names:
        task_scorer_cfg = dict(scorer_cfg.items(scorer_name))
        score_types = task_scorer_cfg['score_type'].split(' ')

        for score_type in score_types:
            if os.path.isfile(
                    os.path.join(
                        expdir, 'results_%s_%s_complete.json' %
                        (scorer_name, score_type))):
                print(
                    'Already found a score for scorer %s for score type %s, skipping it.'
                    % (scorer_name, score_type))
            else:
                print('Scoring task %s for score type %s' %
                      (scorer_name, score_type))
                checkpoint_file = os.path.join(
                    expdir,
                    'checkpoint_results_%s_%s' % (scorer_name, score_type))
                if output_handling_type == 'reconstructor':
                    # create the scorer
                    scorer = scorer_factory.factory(score_type)(
                        conf=task_scorer_cfg,
                        evalconf=evaluator_cfg,
                        dataconf=database_cfg,
                        rec_dir=rec_dir,
                        numbatches=numbatches,
                        task=task,
                        scorer_name=scorer_name,
                        checkpoint_file=checkpoint_file)
                elif output_handling_type == 'speaker_verification':
                    # create the scorer
                    scorer = speaker_verification_scorer_factory.factory(
                        score_type)(conf=task_scorer_cfg,
                                    evalconf=evaluator_cfg,
                                    dataconf=database_cfg,
                                    store_dir=store_dir,
                                    numbatches=numbatches,
                                    task=task,
                                    scorer_name=scorer_name,
                                    checkpoint_file=checkpoint_file)

                # run the scorer
                scorer()

                result_summary = scorer.summarize()

                with open(
                        os.path.join(
                            expdir, 'results_%s_%s_summary.json' %
                            (scorer_name, score_type)), 'w') as fid:
                    json.dump(result_summary, fid)

                with open(
                        os.path.join(
                            expdir, 'results_%s_%s_complete.json' %
                            (scorer_name, score_type)), 'w') as fid:
                    json.dump(scorer.storable_result(), fid)

                if os.path.isfile(checkpoint_file):
                    try:
                        os.remove(checkpoint_file)
                    except:
                        pass

    # legacy code to be removed
    if postprocessor_cfg is not None:  # and postprocessing is not done yet for this task
        from nabu.postprocessing.postprocessors import postprocessor_factory

        if evaluator_cfg.has_option(task, 'postprocessors_names'):
            postprocessors_names = evaluator_cfg.get(
                task, 'postprocessors_names').split(' ')
        else:
            postprocessors_names = [task]

        for postprocessors_name in postprocessors_names:
            task_postprocessor_cfg = dict(
                postprocessor_cfg.items(postprocessors_name))
            postprocess_types = task_postprocessor_cfg[
                'postprocess_type'].split(' ')

            for postprocess_type in postprocess_types:
                print('Postprocessing task %s for postprocessor type %s' %
                      (postprocessors_name, postprocess_type))

                # create the postprocessor
                postprocessor = postprocessor_factory.factory(
                    postprocess_type)(conf=task_postprocessor_cfg,
                                      evalconf=evaluator_cfg,
                                      expdir=expdir,
                                      rec_dir=rec_dir,
                                      postprocessors_name=postprocessors_name)

                # run the postprocessor
                postprocessor()

                postprocessor.matlab_eng.quit()
Example #5
    def __init__(self, task_name, trainerconf, taskconf, models, modelconf,
                 dataconf, evaluatorconf, lossconf, batch_size):
        """
		TaskTrainer constructor, gathers the dataconfigs and sets the loss_computer and
		evaluator for this task.

		Args:
			task_name: a name for the training task
			trainerconf: the trainer config
			taskconf: the config file for each task
			models: the neural net models
			modelconf: the neural net models configuration
			dataconf: the data configuration as a ConfigParser
			evaluatorconf: the evaluator configuration for evaluating;
				if None, no evaluation will be done
			lossconf: the configuration of the loss function
			batch_size: the size of the batch.
		"""

        self.task_name = task_name
        self.trainerconf = trainerconf
        self.taskconf = taskconf
        self.models = models
        self.modelconf = modelconf
        self.evaluatorconf = evaluatorconf
        self.batch_size = batch_size

        # get the database configurations for all inputs, outputs, intermediate model nodes and models.
        self.output_names = taskconf['outputs'].split(' ')
        self.input_names = taskconf['inputs'].split(' ')
        self.target_names = taskconf['targets'].split(' ')
        if self.target_names == ['']:
            self.target_names = []
        self.model_nodes = taskconf['nodes'].split(' ')

        if 'linkedsets' in taskconf:
            set_names = taskconf['linkedsets'].split(' ')
            self.linkedsets = dict()
            for set_name in set_names:
                set_input_names = [
                    '%s_%s' % (set_name, in_name)
                    for in_name in self.input_names
                ]
                set_target_names = [
                    '%s_%s' % (set_name, tar_name)
                    for tar_name in self.target_names
                ]
                self.linkedsets[set_name] = {
                    'inputs': set_input_names,
                    'targets': set_target_names
                }

            if 'linkedset_weighting' in taskconf:
                linkedset_weighting = np.array(
                    map(float, taskconf['linkedset_weighting'].split(' ')))
                # the first set has the reference weight
                linkedset_weighting /= linkedset_weighting[0]
            else:
                linkedset_weighting = np.array([1.0] * len(self.linkedsets))
            self.linkedset_weighting = {
                set_name: weight
                for set_name, weight in zip(set_names, linkedset_weighting)
            }
        else:
            self.linkedsets = {
                'set0': {
                    'inputs': self.input_names,
                    'targets': self.target_names
                }
            }
            self.linkedset_weighting = {'set0': 1.0}
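        # Illustration (hypothetical config): with linkedsets 'main aux', inputs 'features' and
        # targets 'multi_targets', self.linkedsets becomes
        # {'main': {'inputs': ['main_features'], 'targets': ['main_multi_targets']},
        #  'aux': {'inputs': ['aux_features'], 'targets': ['aux_multi_targets']}},
        # and without 'linkedset_weighting' every set gets weight 1.0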

        self.input_dataconfs = dict()
        self.target_dataconfs = dict()
        for linkedset in self.linkedsets:
            self.input_dataconfs[linkedset] = []
            for input_name in self.linkedsets[linkedset]['inputs']:
                # input config
                dataconfs_for_input = []
                sections = taskconf[input_name].split(' ')
                for section in sections:
                    dataconfs_for_input.append(dict(dataconf.items(section)))
                self.input_dataconfs[linkedset].append(dataconfs_for_input)

            self.target_dataconfs[linkedset] = []
            for target_name in self.linkedsets[linkedset]['targets']:
                # target config
                dataconfs_for_target = []
                sections = taskconf[target_name].split(' ')
                for section in sections:
                    dataconfs_for_target.append(dict(dataconf.items(section)))
                self.target_dataconfs[linkedset].append(dataconfs_for_target)

        self.model_links = dict()
        self.inputs_links = dict()
        self.nodes_output_names = dict()
        for node in self.model_nodes:
            self.model_links[node] = taskconf['%s_model' % node]
            self.inputs_links[node] = taskconf['%s_inputs' % node].split(' ')
            if '%s_output_names' % node in taskconf:
                self.nodes_output_names[node] = taskconf['%s_output_names' %
                                                         node].split(' ')
            else:
                self.nodes_output_names[node] = node
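        # Illustration (hypothetical config): a node 'node0' is described by the keys
        # 'node0_model' (the model to apply), 'node0_inputs' (space-separated input names) and
        # optionally 'node0_output_names'; if the latter is missing, the node name itself is used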

        # create the loss computer
        if lossconf:
            loss_type = lossconf['loss_type']
        else:
            loss_type = taskconf['loss_type']

        self.loss_computer = loss_computer_factory.factory(loss_type)(
            lossconf, self.batch_size)

        # create validation evaluator
        evaltype = evaluatorconf.get('evaluator', 'evaluator')
        if evaltype != 'None':
            self.evaluator = evaluator_factory.factory(evaltype)(
                conf=evaluatorconf,
                dataconf=dataconf,
                lossconf=lossconf,
                models=self.models,
                task=task_name)
Example #6
def test(expdir):
    '''does everything for testing'''

    #read the database config file
    database_cfg = configparser.ConfigParser()
    database_cfg.read(os.path.join(expdir, 'database.cfg'))

    #read the model config file
    model_cfg = configparser.ConfigParser()
    model_cfg.read(os.path.join(expdir, 'model.cfg'))

    #read the evaluator config file
    evaluator_cfg = configparser.ConfigParser()
    evaluator_cfg.read(os.path.join(expdir, 'evaluator.cfg'))
    #quick fix
    #evaluator_cfg.set('evaluator','batch_size','5')

    #read the reconstructor config file
    reconstructor_cfg = configparser.ConfigParser()
    reconstructor_cfg.read(os.path.join(expdir, 'reconstructor.cfg'))

    #read the scorer config file
    scorer_cfg = configparser.ConfigParser()
    scorer_cfg.read(os.path.join(expdir, 'scorer.cfg'))

    #read the postprocessor config file, if it exists
    try:
        postprocessor_cfg = configparser.ConfigParser()
        postprocessor_cfg.read(os.path.join(expdir, 'postprocessor.cfg'))
        if not postprocessor_cfg.sections():
            postprocessor_cfg = None
    except:
        postprocessor_cfg = None
    postprocessor_cfg = None

    if evaluator_cfg.get('evaluator', 'evaluator') == 'multi_task':
        tasks = evaluator_cfg.get('evaluator', 'tasks').split(' ')

    else:
        raise Exception('unknown type of evaluation %s' % evaluator_cfg.get(
            'evaluator', 'evaluator'))

    #evaluate each task separately
    for task in tasks:

        rec_dir = os.path.join(expdir, 'reconstructions', task)

        #load the model
        with open(os.path.join(expdir, 'model', 'model.pkl'), 'rb') as fid:
            models = pickle.load(fid)

        if os.path.isfile(os.path.join(expdir, 'loss_%s' % task)):
            print 'already reconstructed all signals for task %s, going straight to scoring' % task
            if evaluator_cfg.has_option(task, 'requested_utts'):
                requested_utts = int(evaluator_cfg.get(task, 'requested_utts'))
            else:
                requested_utts = int(
                    evaluator_cfg.get('evaluator', 'requested_utts'))
            if evaluator_cfg.has_option(task, 'batch_size'):
                batch_size = int(evaluator_cfg.get(task, 'batch_size'))
            else:
                batch_size = int(evaluator_cfg.get('evaluator', 'batch_size'))
            numbatches = int(float(requested_utts) / float(batch_size))

        else:

            print 'Evaluating task %s' % task

            #create the evaluator
            evaltype = evaluator_cfg.get(task, 'evaluator')
            evaluator = evaluator_factory.factory(evaltype)(
                conf=evaluator_cfg,
                dataconf=database_cfg,
                models=models,
                task=task)

            #create the reconstructor

            task_reconstructor_cfg = dict(reconstructor_cfg.items(task))
            reconstruct_type = task_reconstructor_cfg['reconstruct_type']
            reconstructor = reconstructor_factory.factory(reconstruct_type)(
                conf=task_reconstructor_cfg,
                evalconf=evaluator_cfg,
                dataconf=database_cfg,
                rec_dir=rec_dir,
                task=task)

            #create the graph
            graph = tf.Graph()

            with graph.as_default():
                #compute the loss
                (batch_loss, batch_norm, numbatches, batch_outputs,
                 batch_seq_length) = evaluator.evaluate()

                #create a hook that will load the model
                load_hook = LoadAtBegin(
                    os.path.join(expdir, 'model', 'network.ckpt'), models)

                #create a hook for summary writing
                summary_hook = SummaryHook(os.path.join(expdir, 'logdir'))

                config = tf.ConfigProto(device_count={'CPU': 1, 'GPU': 0})

                options = tf.RunOptions()
                options.report_tensor_allocations_upon_oom = True

                #start the session
                with tf.train.SingularMonitoredSession(
                        hooks=[load_hook,
                               summary_hook], config=config) as sess:

                    loss = 0.0
                    loss_norm = 0.0

                    for batch_ind in range(0, numbatches):
                        print 'evaluating batch number %d' % batch_ind
                        last_time = time.time()
                        [
                            batch_loss_eval, batch_norm_eval,
                            batch_outputs_eval, batch_seq_length_eval
                        ] = sess.run(fetches=[
                            batch_loss, batch_norm, batch_outputs,
                            batch_seq_length
                        ],
                                     options=options)

                        loss += batch_loss_eval
                        loss_norm += batch_norm_eval
                        print '%f' % (time.time() - last_time)
                        last_time = time.time()
                        #choosing the first seq_length
                        reconstructor(batch_outputs_eval,
                                      batch_seq_length_eval)
                        print '%f' % (time.time() - last_time)

                    loss = loss / loss_norm

            print 'task %s: loss = %0.6g' % (task, loss)

            #write the loss to disk
            with open(os.path.join(expdir, 'loss_%s' % task), 'w') as fid:
                fid.write(str(loss))

        #from here on there is no need for a GPU anymore ==> score script to be run separately on
        #different machine?

        task_scorer_cfg = dict(scorer_cfg.items(task))
        score_types = task_scorer_cfg['score_type'].split(' ')

        for score_type in score_types:
            if os.path.isfile(
                    os.path.join(
                        expdir,
                        'results_%s_%s_complete.json' % (task, score_type))):
                print 'Already found a score for task %s for score type %s, skipping it.' % (
                    task, score_type)
            else:

                print 'Scoring task %s for score type %s' % (task, score_type)

                #create the scorer
                scorer = scorer_factory.factory(score_type)(
                    conf=task_scorer_cfg,
                    evalconf=evaluator_cfg,
                    dataconf=database_cfg,
                    rec_dir=rec_dir,
                    numbatches=numbatches,
                    task=task)

                #run the scorer
                scorer()

                with open(
                        os.path.join(
                            expdir, 'results_%s_%s_complete.json' %
                            (task, score_type)), 'w') as fid:
                    json.dump(scorer.results, fid)

                result_summary = scorer.summarize()
                with open(
                        os.path.join(
                            expdir,
                            'results_%s_%s_summary.json' % (task, score_type)),
                        'w') as fid:
                    json.dump(result_summary, fid)

        if postprocessor_cfg is not None:  # and postprocessing is not done yet for this task
            task_postprocessor_cfg = dict(postprocessor_cfg.items(task))
            task_processor_cfg = dict(
                postprocessor_cfg.items('processor_' + task))
            postprocess_types = task_postprocessor_cfg[
                'postprocess_type'].split(' ')

            for postprocess_type in postprocess_types:
                #create the postprocessor
                postprocessor = postprocessor_factory.factory(
                    postprocess_type)(conf=task_postprocessor_cfg,
                                      proc_conf=task_processor_cfg,
                                      evalconf=evaluator_cfg,
                                      expdir=expdir,
                                      rec_dir=rec_dir,
                                      task=task)

                #run the postprocessor
                postprocessor()

                postprocessor.matlab_eng.quit()
Example #7
    def __init__(self, task_name, trainerconf, taskconf, models, modelconf,
                 dataconf, evaluatorconf, batch_size):
        """
		TaskTrainer constructor, gathers the dataconfigs and sets the loss_computer and
		evaluator for this task.

		Args:
			task_name: a name for the training task
			trainerconf: the trainer config
			taskconf: the config file for each task
			models: the neural net models
			modelconf: the neural net models configuration
			dataconf: the data configuration as a ConfigParser
			evaluatorconf: the evaluator configuration for evaluating;
				if None, no evaluation will be done
			batch_size: the size of the batch.
		"""

        self.task_name = task_name
        self.trainerconf = trainerconf
        self.taskconf = taskconf
        self.models = models
        self.modelconf = modelconf
        self.evaluatorconf = evaluatorconf
        self.batch_size = batch_size

        # get the database configurations for all inputs, outputs, intermediate model nodes and models.
        self.output_names = taskconf['outputs'].split(' ')
        self.input_names = taskconf['inputs'].split(' ')
        self.target_names = taskconf['targets'].split(' ')
        if self.target_names == ['']:
            self.target_names = []
        self.model_nodes = taskconf['nodes'].split(' ')

        if 'linkedsets' in taskconf:
            set_names = taskconf['linkedsets'].split(' ')
            self.linkedsets = dict()
            for set_name in set_names:
                inp_indices = map(int,
                                  taskconf['%s_inputs' % set_name].split(' '))
                tar_indices = map(int,
                                  taskconf['%s_targets' % set_name].split(' '))
                set_inputs = [
                    inp for ind, inp in enumerate(self.input_names)
                    if ind in inp_indices
                ]
                set_targets = [
                    tar for ind, tar in enumerate(self.target_names)
                    if ind in tar_indices
                ]
                self.linkedsets[set_name] = {
                    'inputs': set_inputs,
                    'targets': set_targets
                }
        else:
            self.linkedsets = {
                'set0': {
                    'inputs': self.input_names,
                    'targets': self.target_names
                }
            }
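        # Illustration (hypothetical config): with inputs 'features noisy_features' and a line
        # 'set1_inputs = 1', linked set 'set1' only uses 'noisy_features', since the indices
        # refer to positions in the 'inputs' (and 'targets') lists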

        self.input_dataconfs = dict()
        self.target_dataconfs = dict()
        for linkedset in self.linkedsets:
            self.input_dataconfs[linkedset] = []
            for input_name in self.linkedsets[linkedset]['inputs']:
                # input config
                dataconfs_for_input = []
                sections = taskconf[input_name].split(' ')
                for section in sections:
                    dataconfs_for_input.append(dict(dataconf.items(section)))
                self.input_dataconfs[linkedset].append(dataconfs_for_input)

            self.target_dataconfs[linkedset] = []
            for target_name in self.linkedsets[linkedset]['targets']:
                # target config
                dataconfs_for_target = []
                sections = taskconf[target_name].split(' ')
                for section in sections:
                    dataconfs_for_target.append(dict(dataconf.items(section)))
                self.target_dataconfs[linkedset].append(dataconfs_for_target)

        self.model_links = dict()
        self.inputs_links = dict()
        for node in self.model_nodes:
            self.model_links[node] = taskconf['%s_model' % node]
            self.inputs_links[node] = taskconf['%s_inputs' % node].split(' ')

        # create the loss computer
        self.loss_computer = loss_computer_factory.factory(
            taskconf['loss_type'])(self.batch_size)

        # create validation evaluator
        evaltype = evaluatorconf.get('evaluator', 'evaluator')
        if evaltype != 'None':
            self.evaluator = evaluator_factory.factory(evaltype)(
                conf=evaluatorconf,
                dataconf=dataconf,
                models=self.models,
                task=task_name)
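
A small standalone sketch of the linkedsets mapping built above, with invented task, input and target names (none of the values below come from the original configs):

#invented example values, for illustration only
input_names = ['mix', 'noise']
target_names = ['clean', 'vad']
taskconf = {'linkedsets': 'set0 set1',
            'set0_inputs': '0', 'set0_targets': '0',
            'set1_inputs': '1', 'set1_targets': '1'}

linkedsets = dict()
for set_name in taskconf['linkedsets'].split(' '):
    inp_indices = list(map(int, taskconf['%s_inputs' % set_name].split(' ')))
    tar_indices = list(map(int, taskconf['%s_targets' % set_name].split(' ')))
    linkedsets[set_name] = {
        'inputs': [inp for ind, inp in enumerate(input_names)
                   if ind in inp_indices],
        'targets': [tar for ind, tar in enumerate(target_names)
                    if ind in tar_indices]}

#linkedsets == {'set0': {'inputs': ['mix'], 'targets': ['clean']},
#               'set1': {'inputs': ['noise'], 'targets': ['vad']}}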
Example #8
def test(expdir, testing=False):
    '''does everything for testing

    args:
        expdir: the experiments directory
        testing: if true only the graph will be created for debugging purposes
    '''

    #read the database config file
    database_cfg = configparser.ConfigParser()
    database_cfg.read(os.path.join(expdir, 'database.cfg'))

    if testing:
        model_cfg = configparser.ConfigParser()
        model_cfg.read(os.path.join(expdir, 'model.cfg'))
        trainer_cfg = configparser.ConfigParser()
        trainer_cfg.read(os.path.join(expdir, 'trainer.cfg'))
        model = Model(conf=model_cfg,
                      trainlabels=int(trainer_cfg.get('trainer',
                                                      'trainlabels')),
                      constraint=None)
    else:
        #load the model
        with open(os.path.join(expdir, 'model', 'model.pkl'), 'rb') as fid:
            model = pickle.load(fid)

    #read the evaluator config file
    evaluator_cfg = configparser.ConfigParser()
    evaluator_cfg.read(os.path.join(expdir, 'test_evaluator.cfg'))

    #create the evaluator
    evaltype = evaluator_cfg.get('evaluator', 'evaluator')
    evaluator = evaluator_factory.factory(evaltype)(conf=evaluator_cfg,
                                                    dataconf=database_cfg,
                                                    model=model)

    #create the graph
    graph = tf.Graph()

    with graph.as_default():

        #compute the loss
        loss, update_loss, numbatches = evaluator.evaluate()
        if testing:
            return

        #create a histogram for all trainable parameters
        for param in tf.trainable_variables():
            tf.summary.histogram(param.name,
                                 param,
                                 collections=['variable_summaries'])

        eval_summary = tf.summary.merge_all('eval_summaries')
        variable_summary = tf.summary.merge_all('variable_summaries')

        #create a hook that will load the model
        load_hook = LoadAtBegin(os.path.join(expdir, 'model', 'network.ckpt'),
                                model.variables)

        #start the session
        with tf.train.SingularMonitoredSession(hooks=[load_hook]) as sess:

            summary_writer = tf.summary.FileWriter(
                os.path.join(expdir, 'logdir'))

            summary = variable_summary.eval(session=sess)
            summary_writer.add_summary(summary)

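            #ad-hoc debugging: look up some intermediate tensors of the
            #evaluation graph by name and print their values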
            print 'TENSORFLOW ITEMS'
            print '---Errors----'
            op = sess.graph.get_operations()
            print 'errors:'
            test = tf.get_default_graph().get_tensor_by_name(
                "evaluate/evaluate_decoder/Sum_1:0")
            print test.eval(session=sess)
            print 'new_num_targets'
            test = tf.get_default_graph().get_tensor_by_name(
                "evaluate/evaluate_decoder/add:0")
            print test.eval(session=sess)
            print 'batch_targets'
            test = tf.get_default_graph().get_tensor_by_name(
                "evaluate/evaluate_decoder/Sum_3:0")
            print test.eval(session=sess)

            print '--CTC DECODER ---'
            print 'loss:'
            test = tf.get_default_graph().get_tensor_by_name(
                "validation_loss:0")
            print test.eval(session=sess)
            print 'outputs:'
            print 'references:'
            test = tf.get_default_graph().get_tensor_by_name(
                "evaluate/input_pipeline/batch:2")
            print test.eval(session=sess)
            print 'references_seq_length'
            test = tf.get_default_graph().get_tensor_by_name(
                "evaluate/input_pipeline/batch:3")
            print test.eval(session=sess)

            for i in range(numbatches):
                if eval_summary is not None:
                    _, summary = sess.run([update_loss, eval_summary])
                    summary_writer.add_summary(summary, i)
                else:
                    update_loss.run(session=sess)
                    print 'loss: '
                    temploss = loss.eval(session=sess)
                    print temploss

            loss = loss.eval(session=sess)

    print 'loss = %f' % loss

    #write the result to disk
    with open(os.path.join(expdir, 'result'), 'w') as fid:
        fid.write(str(loss))
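
The block of ad-hoc prints above could be folded into a small helper; the tensor names below are taken from the example, but the helper itself is only a sketch and not part of the project:

import tensorflow as tf

def print_named_tensors(sess, tensor_names):
    #evaluate and print a list of graph tensors identified by name
    graph = tf.get_default_graph()
    for name in tensor_names:
        tensor = graph.get_tensor_by_name(name)
        print '%s:' % name
        print tensor.eval(session=sess)

#usage inside the monitored session of the example above:
#print_named_tensors(sess, ['evaluate/evaluate_decoder/Sum_1:0',
#                           'validation_loss:0'])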
Example #9
File: test.py Project: Kaatje95/Nabu-MSSS
def test(expdir):
    '''does everything for testing'''
    
    #read the database config file
    database_cfg = configparser.ConfigParser()
    database_cfg.read(os.path.join(expdir, 'database.cfg'))

    #load the model
    with open(os.path.join(expdir, 'model', 'model.pkl'), 'rb') as fid:
        model = pickle.load(fid)

    #read the evaluator config file
    evaluator_cfg = configparser.ConfigParser()
    evaluator_cfg.read(os.path.join(expdir, 'evaluator.cfg'))

    #create the evaluator
    evaltype = evaluator_cfg.get('evaluator', 'evaluator')
    evaluator = evaluator_factory.factory(evaltype)(
        conf=evaluator_cfg,
        dataconf=database_cfg,
        model=model)
    
    #create the reconstructor
    reconstruct_type = evaluator_cfg.get('reconstructor', 'reconstruct_type')
    reconstructor = reconstructor_factory.factory(reconstruct_type)(
        conf=evaluator_cfg,
        dataconf=database_cfg,
        expdir=expdir)
	     
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    #create the graph
    graph = tf.Graph()

    with graph.as_default():
        #compute the loss
        batch_loss, numbatches, batch_outputs, batch_seq_length = evaluator.evaluate()

        #create a hook that will load the model
        load_hook = LoadAtBegin(
            os.path.join(expdir, 'model', 'network.ckpt'),
            model)

        #create a hook for summary writing
        summary_hook = SummaryHook(os.path.join(expdir, 'logdir'))

        #start the session
        with tf.train.SingularMonitoredSession(
            hooks=[load_hook, summary_hook]) as sess:

            loss = 0.0

            for batch_ind in range(0, numbatches):
                print 'evaluating batch number %d' % batch_ind

                batch_loss_eval, batch_outputs_eval, batch_seq_length_eval = sess.run(
                    fetches=[batch_loss, batch_outputs, batch_seq_length])

                loss += batch_loss_eval

                reconstructor(batch_outputs_eval['outputs'],
                              batch_seq_length_eval['features'])

            loss = loss  # /numbatches

    print 'loss = %0.6g' % loss
    
    #write the loss to disk
    with open(os.path.join(expdir, 'loss'), 'w') as fid:
        fid.write(str(loss))
        
    #from here on no GPU is needed anymore, so the scoring could be run as a
    #separate script on a different machine; reconstructor.rec_dir has to be
    #known then, e.g. by storing it in evaluator_cfg (see the standalone
    #sketch after this example)
    
    score_type = evaluator_cfg.get('scorer', 'score_type')
    
    for i in range(10):
        # the scorer sometimes fails for an unknown reason, so retry up to 10 times
        try:
            #create the scorer
            scorer = scorer_factory.factory(score_type)(
                conf=evaluator_cfg,
                dataconf=database_cfg,
                rec_dir=reconstructor.rec_dir,
                numbatches=numbatches)

            #run the scorer
            scorer()
        except Exception:
            if i == 9:
                raise
            else:
                continue
        break
    
    with open(os.path.join(expdir, 'results_complete.json'), 'w') as fid:
        json.dump(scorer.results, fid)

    result_summary = scorer.summarize()
    with open(os.path.join(expdir, 'results_summary.json'), 'w') as fid:
        json.dump(result_summary, fid)
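
A standalone scoring sketch along the lines of the comment in the example above. It assumes the test script stored the reconstruction directory and the number of batches under hypothetical 'rec_dir' and 'numbatches' keys in the [scorer] section of evaluator.cfg; those keys, and the helper itself, are assumptions and not part of the original project. scorer_factory is the same factory used above; its import is omitted here, as in the example.

import os
import json

try:
    import configparser
except ImportError:
    import ConfigParser as configparser


def score(expdir):
    #read the configs written for the test run
    evaluator_cfg = configparser.ConfigParser()
    evaluator_cfg.read(os.path.join(expdir, 'evaluator.cfg'))
    database_cfg = configparser.ConfigParser()
    database_cfg.read(os.path.join(expdir, 'database.cfg'))

    #hypothetical keys: the test script would have to store these
    rec_dir = evaluator_cfg.get('scorer', 'rec_dir')
    numbatches = int(evaluator_cfg.get('scorer', 'numbatches'))

    #create and run the scorer (scorer_factory as in the example above)
    score_type = evaluator_cfg.get('scorer', 'score_type')
    scorer = scorer_factory.factory(score_type)(
        conf=evaluator_cfg,
        dataconf=database_cfg,
        rec_dir=rec_dir,
        numbatches=numbatches)
    scorer()

    with open(os.path.join(expdir, 'results_complete.json'), 'w') as fid:
        json.dump(scorer.results, fid)
    with open(os.path.join(expdir, 'results_summary.json'), 'w') as fid:
        json.dump(scorer.summarize(), fid)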
Example #10
def test(expdir, testing=False):
    '''does everything for testing

    args:
        expdir: the experiments directory
        testing: if true only the graph will be created for debugging purposes
    '''

    #read the database config file
    database_cfg = configparser.ConfigParser()
    database_cfg.read(os.path.join(expdir, 'database.conf'))

    if testing:
        model_cfg = configparser.ConfigParser()
        model_cfg.read(os.path.join(expdir, 'model.cfg'))
        trainer_cfg = configparser.ConfigParser()
        trainer_cfg.read(os.path.join(expdir, 'trainer.cfg'))
        model = Model(conf=model_cfg,
                      trainlabels=int(trainer_cfg.get('trainer',
                                                      'trainlabels')),
                      constraint=None)
    else:
        #load the model
        with open(os.path.join(expdir, 'model', 'model.pkl'), 'rb') as fid:
            model = pickle.load(fid)

    #read the evaluator config file
    evaluator_cfg = configparser.ConfigParser()
    evaluator_cfg.read(os.path.join(expdir, 'test_evaluator.cfg'))

    #create the evaluator
    evaltype = evaluator_cfg.get('evaluator', 'evaluator')
    evaluator = evaluator_factory.factory(evaltype)(conf=evaluator_cfg,
                                                    dataconf=database_cfg,
                                                    model=model)

    #create the graph
    graph = tf.Graph()

    with graph.as_default():

        #compute the loss
        loss, update_loss, numbatches = evaluator.evaluate()

        if testing:
            return

        #create a histogram for all trainable parameters
        for param in tf.trainable_variables():
            tf.summary.histogram(param.name,
                                 param,
                                 collections=['variable_summaries'])

        eval_summary = tf.summary.merge_all('eval_summaries')
        variable_summary = tf.summary.merge_all('variable_summaries')

        #create a hook that will load the model
        load_hook = LoadAtBegin(os.path.join(expdir, 'model', 'network.ckpt'),
                                model.variables)

        #start the session
        with tf.train.SingularMonitoredSession(hooks=[load_hook]) as sess:

            summary_writer = tf.summary.FileWriter(
                os.path.join(expdir, 'logdir'))

            summary = variable_summary.eval(session=sess)
            summary_writer.add_summary(summary)

            for i in range(numbatches):
                if eval_summary is not None:
                    _, summary = sess.run([update_loss, eval_summary])
                    summary_writer.add_summary(summary, i)
                else:
                    update_loss.run(session=sess)

            loss = loss.eval(session=sess)

    print 'loss = %f' % loss

    #write the result to disk
    with open(os.path.join(expdir, 'result'), 'w') as fid:
        fid.write(str(loss))
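
The test functions above read a fixed set of files from the experiments directory. A small illustrative pre-flight check, not part of the project; the file names are the ones read by the example above:

import os


def check_expdir(expdir):
    #files read by test() in the example above; model/model.pkl is only read
    #when testing=False, model.cfg and trainer.cfg only when testing=True
    expected = ['database.conf', 'test_evaluator.cfg', 'model.cfg',
                'trainer.cfg', os.path.join('model', 'model.pkl')]
    for name in expected:
        path = os.path.join(expdir, name)
        if not os.path.exists(path):
            print 'missing: %s' % path
    #the model weights are restored from the checkpoint prefix
    #model/network.ckpt, which is not itself a single file on disk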
Example #11
def test(expdir, testing=False):
    '''does everything for testing

    args:
        expdir: the experiments directory
        testing: if true only the graph will be created for debugging purposes
    '''

    #read the database config file
    database_cfg = configparser.ConfigParser()
    database_cfg.read(os.path.join(expdir, 'database.conf'))

    if testing:
        model_cfg = configparser.ConfigParser()
        model_cfg.read(os.path.join(expdir, 'model.cfg'))
        trainer_cfg = configparser.ConfigParser()
        trainer_cfg.read(os.path.join(expdir, 'trainer.cfg'))
        model = Model(conf=model_cfg,
                      trainlabels=int(trainer_cfg.get('trainer',
                                                      'trainlabels')))
    else:
        #load the model
        with open(os.path.join(expdir, 'model', 'model.pkl'), 'rb') as fid:
            model = pickle.load(fid)

    #read the evaluator config file
    evaluator_cfg = configparser.ConfigParser()
    evaluator_cfg.read(os.path.join(expdir, 'test_evaluator.cfg'))

    #create the evaluator
    evaltype = evaluator_cfg.get('evaluator', 'evaluator')
    evaluator = evaluator_factory.factory(evaltype)(conf=evaluator_cfg,
                                                    dataconf=database_cfg,
                                                    model=model)

    #create the graph
    graph = tf.Graph()

    with graph.as_default():

        #compute the loss
        batch_loss, numbatches = evaluator.evaluate()

        if testing:
            return

        #create a histogram for all trainable parameters
        for param in model.variables:
            tf.summary.histogram(param.name, param)

        #create a hook that will load the model
        load_hook = LoadAtBegin(os.path.join(expdir, 'model', 'network.ckpt'),
                                model.variables)

        #create a hook for summary writing
        summary_hook = SummaryHook(os.path.join(expdir, 'logdir'))

        #start the session
        with tf.train.SingularMonitoredSession(
                hooks=[load_hook, summary_hook]) as sess:

            loss = 0.0
            for _ in range(numbatches):
                loss += batch_loss.eval(session=sess)
            loss = loss / numbatches

    print 'loss = %f' % loss

    #write the result to disk
    with open(os.path.join(expdir, 'result'), 'w') as fid:
        fid.write(str(loss))
Example #12
    def __init__(self, task_name, trainerconf, taskconf, models, modelconf,
                 dataconf, evaluatorconf, batch_size):
        '''
        TaskTrainer constructor, gathers the dataconfigs and sets the loss_computer and
        evaluator for this task.

        Args:
            task_name: a name for the training task
            trainerconf: the trainer config
            taskconf: the config file for each task
            models: the neural net models
            modelconf: the neural net models configuration
            dataconf: the data configuration as a ConfigParser
            evaluatorconf: the evaluator configuration for evaluating
                if None no evaluation will be done
            batch_size: the size of the batch.
        '''

        self.task_name = task_name
        self.trainerconf = trainerconf
        self.taskconf = taskconf
        self.models = models
        self.modelconf = modelconf
        self.evaluatorconf = evaluatorconf
        self.batch_size = batch_size

        #get the database configurations for all inputs, outputs, intermediate model nodes and models.
        self.output_names = taskconf['outputs'].split(' ')
        self.input_names = taskconf['inputs'].split(' ')
        self.model_nodes = taskconf['nodes'].split(' ')
        self.input_dataconfs = []
        for input_name in self.input_names:
            #input config
            self.input_dataconfs.append(
                dict(dataconf.items(taskconf[input_name])))

        self.target_names = taskconf['targets'].split(' ')
        if self.target_names == ['']:
            self.target_names = []
        self.target_dataconfs = []
        for target_name in self.target_names:
            #target config
            self.target_dataconfs.append(
                dict(dataconf.items(taskconf[target_name])))

        self.model_links = dict()
        self.inputs_links = dict()
        for node in self.model_nodes:
            self.model_links[node] = taskconf['%s_model' % node]
            self.inputs_links[node] = taskconf['%s_inputs' % node].split(' ')

        #create the loss computer
        self.loss_computer = loss_computer_factory.factory(
            taskconf['loss_type'])(self.batch_size)

        #create validation evaluator
        evaltype = evaluatorconf.get('evaluator', 'evaluator')
        if evaltype != 'None':
            self.evaluator = evaluator_factory.factory(evaltype)(
                conf=evaluatorconf,
                dataconf=dataconf,
                models=self.models,
                task=task_name)
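
For reference, a hypothetical flavour of the task configuration the constructor above reads; every key and value below is invented for illustration and does not come from the original configs:

#invented taskconf, for illustration only
taskconf = {
    'inputs': 'features',            #names of the task inputs
    'targets': 'textfile',           #names of the task targets
    'outputs': 'logits',             #names of the task outputs
    'nodes': 'logits',               #intermediate model nodes
    'features': 'trainfeatures',     #dataconf section for the 'features' input
    'textfile': 'traintext',         #dataconf section for the 'textfile' target
    'logits_model': 'main',          #model that produces the 'logits' node
    'logits_inputs': 'features',     #inputs fed to that model
    'loss_type': 'cross_entropy'}    #loss computer to create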