Example #1
def create_job(config, name, work_dir, train_file, model_file,
               test_file, classif_file, memory=MEMORY):
    """\
    Save configuration to a pickle file, create a corresponding cluster job
    and run it.
    """
    train_file = os.path.abspath(train_file)
    cfg_file = append_name('config.pickle', name)
    model_file = append_name(model_file, name)
    if test_file is not None and classif_file is not None:
        test_file = os.path.abspath(test_file)
        classif_file = append_name(classif_file, name)
        test_str = "'{0}', '{1}'".format(test_file, classif_file)
    else:
        test_str = 'None, None'
    # save unfolded config file
    marshal_lambda(config, 'filter_attr')
    marshal_lambda(config, 'postprocess')
    with open(os.path.join(work_dir, cfg_file), mode='wb') as fh:
        pickle.dump(config, fh, pickle.HIGHEST_PROTOCOL)
    # create the training job
    job = Job(name=name, work_dir=work_dir)
    job.header = "from flect.experiment.train_model import run_training\n"
    job.code = "run_training('{0}', '{1}',".format(work_dir, cfg_file) + \
            "'{0}', '{1}', {2})\n".format(train_file, model_file, test_str)
    job.submit(memory=memory)
    print 'Job', job, 'submitted.'
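
For orientation, a call to create_job might look like the sketch below. The configuration object, all file names and the memory value are illustrative assumptions, not values taken from the flect codebase.

# Hypothetical invocation: every path, the config contents and the memory value
# below are assumptions made up for illustration only.
config = {'filter_attr': None, 'postprocess': None}   # assumed minimal config object
create_job(config,
           name='fold01',                    # appended to the config/model/classif file names
           work_dir='experiments/fold01',
           train_file='data/train.arff',
           model_file='model.pickle',
           test_file='data/test.arff',       # pass None here and for classif_file to skip testing
           classif_file='classif.arff',
           memory=8)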
Example #2
    def train(self, das_file, ttree_file, data_portion=1.0, context_file=None, validation_files=None):
        """Run parallel perceptron training, start and manage workers."""
        # initialize the ranker instance
        log_info('Initializing...')
        # run server to process registering clients
        self._init_server()
        # spawn training jobs
        log_info('Spawning jobs...')
        host_short, _ = self.host.split('.', 1)  # short host name for job names
        for j in xrange(self.jobs_number):
            # set up debugging logfile only if we have it on the head
            debug_logfile = ('"PRT%02d.debug-out.txt.gz"' % j) if is_debug_stream() else 'None'
            job = Job(header='from tgen.parallel_seq2seq_train import run_training',
                      code=('run_training("%s", %d, %s)' %
                            (self.host, self.port, debug_logfile)),
                      name=self.experiment_id + ("PRT%02d-%s-%d" % (j, host_short, self.port)),
                      work_dir=self.work_dir)
            job.submit(memory=self.job_memory, queue=self.queue_settings)
            self.jobs.append(job)

        # run the training passes
        try:
            cur_assign = 0
            results = [None] * self.jobs_number
            rnd_seeds = [rnd.random() for _ in xrange(self.jobs_number)]

            # assign training and wait for it to finish
            while cur_assign < self.jobs_number or self.pending_requests:
                log_debug('Starting loop over services.')

                # check if some of the pending computations have finished
                for sc, job_no, req in list(self.pending_requests):
                    res = self._check_pending_request(sc, job_no, req)
                    if res is not None:
                        results[job_no] = res, sc

                # check for free services and assign new computation
                while cur_assign < self.jobs_number and self.free_services:
                    log_debug('Assigning request %d' % cur_assign)
                    sc = self.free_services.popleft()
                    log_info('Assigning request %d to %s:%d' % (cur_assign, sc.host, sc.port))
                    if validation_files is not None:
                        validation_files = ','.join([os.path.relpath(f, self.work_dir)
                                                     for f in validation_files.split(',')])
                    train_func = async(sc.conn.root.train)
                    req = train_func(rnd_seeds[cur_assign],
                                     os.path.relpath(das_file, self.work_dir),
                                     os.path.relpath(ttree_file, self.work_dir),
                                     data_portion,
                                     os.path.relpath(context_file, self.work_dir)
                                     if context_file else None,
                                     validation_files)
                    self.pending_requests.add((sc, cur_assign, req))
                    cur_assign += 1
                    log_debug('Assigned %d' % cur_assign)

                # sleep for a while
                log_debug('Sleeping.')
                time.sleep(self.poll_interval)

            log_info("Results:\n" + "\n".join("%.5f %s:%d" % (cost, sc.host, sc.port)
                                              for cost, sc in results))

            self.model_temp_path = os.path.join(self.work_dir, self.TEMPFILE_NAME)
            results.sort(key=lambda res: res[0])
            # average the computed models
            if self.average_models:
                log_info('Creating ensemble models...')
                # use only top k if required
                results_for_ensemble = (results[:self.average_models_top_k]
                                        if self.average_models_top_k > 0
                                        else results)
                ensemble_model = self.build_ensemble_model(results_for_ensemble)
                log_info('Saving the ensemble model temporarily to %s...' % self.model_temp_path)
                ensemble_model.save_to_file(self.model_temp_path)
            # select the best result on devel data + save it
            else:
                best_cost, best_sc = results[0]
                log_info('Best cost: %f (computed at %s:%d).' % (best_cost, best_sc.host, best_sc.port))
                log_info('Saving best generator temporarily to %s...' % self.model_temp_path)
                # use relative path (working directory of worker jobs is different)
                best_sc.conn.root.save_model(os.path.relpath(self.model_temp_path, self.work_dir))

        # kill all jobs
        finally:
            for job in self.jobs:
                job.delete()
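
The head process in the example above drives its workers over RPyC: each remote call is wrapped with async so it returns immediately, and the resulting request objects are kept in pending_requests and polled in the main loop. A minimal standalone sketch of that pattern follows; the host, port and remote train method are placeholders, not the actual tgen worker interface.

import time
import rpyc
from rpyc.utils.helpers import async_   # exposed as plain `async` in the older rpyc used above

conn = rpyc.connect('worker-host', 12345)    # hypothetical worker address
train_async = async_(conn.root.train)        # wrap the remote method
req = train_async(0.42)                      # returns an AsyncResult immediately
while not req.ready:                         # poll instead of blocking on the result
    time.sleep(1)
print(req.value)                             # .value re-raises any remote exception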
Example #3
    def train(self, das_file, ttree_file, data_portion=1.0):
        """Run parallel perceptron training, start and manage workers."""
        # initialize the ranker instance
        log_info('Initializing...')
        self.loc_ranker._init_training(das_file, ttree_file, data_portion)
        # run server to process registering clients
        self._init_server()
        # spawn training jobs
        log_info('Spawning jobs...')
        host_short, _ = self.host.split('.', 1)  # short host name for job names
        for j in xrange(self.jobs_number):
            # set up debugging logfile only if we have it on the head
            debug_logfile = ('"PRT%02d.debug-out.txt.gz"' % j) if is_debug_stream() else 'None'
            job = Job(header='from tgen.parallel_percrank_train import run_worker',
                      code=('run_worker("%s", %d, %s)' %
                            (self.host, self.port, debug_logfile)),
                      name=self.experiment_id + ("PRT%02d-%s-%d" % (j, host_short, self.port)),
                      work_dir=self.work_dir)
            job.submit(self.job_memory)
            self.jobs.append(job)
        # run the training passes
        try:
            for iter_no in xrange(1, self.loc_ranker.passes + 1):

                log_info('Pass %d...' % iter_no)
                log_debug('\n***\nTR%05d:' % iter_no)

                iter_start_time = time.time()
                cur_portion = 0
                results = [None] * self.data_portions
                w_dump = pickle.dumps(self.loc_ranker.get_weights(),
                                      protocol=pickle.HIGHEST_PROTOCOL)
                rnd_seeds = [rnd.random() for _ in xrange(self.data_portions)]
                # wait for free services / assign computation
                while cur_portion < self.data_portions or self.pending_requests:
                    log_debug('Starting loop over services.')

                    # check if some of the pending computations have finished
                    for sc, req_portion, req in list(self.pending_requests):
                        res = self._check_pending_request(
                            iter_no, sc, req_portion, req)
                        if res:
                            results[req_portion] = res

                    # check for free services and assign new computation
                    while cur_portion < self.data_portions and self.free_services:
                        log_debug('Assigning request %d' % cur_portion)
                        sc = self.free_services.popleft()
                        log_info('Assigning request %d / %d to %s:%d' %
                                 (iter_no, cur_portion, sc.host, sc.port))
                        train_func = async(sc.conn.root.training_pass)
                        req = train_func(
                            w_dump, iter_no, rnd_seeds[cur_portion],
                            *self._get_portion_bounds(cur_portion))
                        self.pending_requests.add((sc, cur_portion, req))
                        cur_portion += 1
                        log_debug('Assigned %d' % cur_portion)
                    # sleep for a while
                    log_debug('Sleeping.')
                    time.sleep(self.poll_interval)

                # delete the temporary ranker dump when the 1st iteration is complete
                if self.ranker_dump_path:
                    log_info('Removing temporary ranker dump at %s.' %
                             self.ranker_dump_path)
                    os.remove(self.ranker_dump_path)
                    self.ranker_dump_path = None

                # gather/average the diagnostic statistics
                self.loc_ranker.set_diagnostics_average([d for _, d in results])

                # take an average of weights; set it as new w
                self.loc_ranker.set_weights_average([w for w, _ in results])
                # store a copy of w for averaged perceptron
                self.loc_ranker.store_iter_weights()

                # print statistics
                log_debug(self.loc_ranker._feat_val_str(), '\n***')
                self.loc_ranker._print_pass_stats(
                    iter_no, datetime.timedelta(seconds=(time.time() - iter_start_time)))

            # after all passes: average weights if set to do so
            if self.loc_ranker.averaging is True:
                self.loc_ranker.set_weights_iter_average()
        # kill all jobs
        finally:
            for job in self.jobs:
                job.delete()
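
Both the per-pass update (set_weights_average) and the final averaged-perceptron step (set_weights_iter_average) amount to an element-wise mean over weight vectors. A minimal sketch of that operation, assuming the weights are equally shaped numpy arrays (the real ranker may store them differently):

import numpy as np

def average_weights(weight_vectors):
    # element-wise mean over a list of equally shaped weight vectors
    return np.mean(np.stack(weight_vectors), axis=0)

# Per pass, w is replaced by the mean of the weights returned for each data portion;
# after all passes, the averaged perceptron takes the mean of the stored per-pass copies of w.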