def create_job(config, name, work_dir, train_file, model_file,
               test_file, classif_file, memory=MEMORY):
    """\
    Save configuration to a pickle file, create a corresponding cluster job and run it.
    """
    train_file = os.path.abspath(train_file)
    cfg_file = append_name('config.pickle', name)
    model_file = append_name(model_file, name)
    if test_file is not None and classif_file is not None:
        test_file = os.path.abspath(test_file)
        classif_file = append_name(classif_file, name)
        test_str = "'{0}', '{1}'".format(test_file, classif_file)
    else:
        test_str = 'None, None'
    # save unfolded config file
    fh = open(os.path.join(work_dir, cfg_file), mode='wb')
    marshal_lambda(config, 'filter_attr')
    marshal_lambda(config, 'postprocess')
    pickle.dump(config, fh, pickle.HIGHEST_PROTOCOL)
    fh.close()
    # create the training job
    job = Job(name=name, work_dir=work_dir)
    job.header = "from flect.experiment.train_model import run_training\n"
    job.code = "run_training('{0}', '{1}', ".format(work_dir, cfg_file) + \
               "'{0}', '{1}', {2})\n".format(train_file, model_file, test_str)
    job.submit(memory=memory)
    print 'Job', job, 'submitted.'
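
# For illustration only: the Job built above ends up executing a short Python script
# of roughly this shape on the cluster (file names below are placeholders; the exact
# names produced by append_name() are not shown):
#
#     from flect.experiment.train_model import run_training
#     run_training('<work_dir>', '<cfg_file>', '<train_file>', '<model_file>', None, None)
#
# i.e. the worker unpickles the saved configuration and runs the training itself;
# the last two arguments carry the test and classification files when they are given.
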
def train(self, das_file, ttree_file, data_portion=1.0,
          context_file=None, validation_files=None):
    """Run parallel training of the seq2seq generator, start and manage workers."""
    # initialize
    log_info('Initializing...')
    # run server to process registering clients
    self._init_server()

    # spawn training jobs
    log_info('Spawning jobs...')
    host_short, _ = self.host.split('.', 1)  # short host name for job names
    for j in xrange(self.jobs_number):
        # set up debugging logfile only if we have it on the head
        debug_logfile = ('"PRT%02d.debug-out.txt.gz"' % j) if is_debug_stream() else 'None'
        job = Job(header='from tgen.parallel_seq2seq_train import run_training',
                  code=('run_training("%s", %d, %s)' %
                        (self.host, self.port, debug_logfile)),
                  name=self.experiment_id + ("PRT%02d-%s-%d" % (j, host_short, self.port)),
                  work_dir=self.work_dir)
        job.submit(memory=self.job_memory, queue=self.queue_settings)
        self.jobs.append(job)

    # run the training passes
    try:
        cur_assign = 0
        results = [None] * self.jobs_number
        rnd_seeds = [rnd.random() for _ in xrange(self.jobs_number)]

        # assign training and wait for it to finish
        while cur_assign < self.jobs_number or self.pending_requests:
            log_debug('Starting loop over services.')

            # check if some of the pending computations have finished
            for sc, job_no, req in list(self.pending_requests):
                res = self._check_pending_request(sc, job_no, req)
                if res is not None:
                    results[job_no] = res, sc

            # check for free services and assign new computation
            while cur_assign < self.jobs_number and self.free_services:
                log_debug('Assigning request %d' % cur_assign)
                sc = self.free_services.popleft()
                log_info('Assigning request %d to %s:%d' % (cur_assign, sc.host, sc.port))
                if validation_files is not None:
                    validation_files = ','.join([os.path.relpath(f, self.work_dir)
                                                 for f in validation_files.split(',')])
                train_func = async(sc.conn.root.train)
                req = train_func(rnd_seeds[cur_assign],
                                 os.path.relpath(das_file, self.work_dir),
                                 os.path.relpath(ttree_file, self.work_dir),
                                 data_portion,
                                 os.path.relpath(context_file, self.work_dir)
                                 if context_file else None,
                                 validation_files)
                self.pending_requests.add((sc, cur_assign, req))
                cur_assign += 1
                log_debug('Assigned %d' % cur_assign)

            # sleep for a while
            log_debug('Sleeping.')
            time.sleep(self.poll_interval)

        log_info("Results:\n" + "\n".join("%.5f %s:%d" % (cost, sc.host, sc.port)
                                          for cost, sc in results))

        self.model_temp_path = os.path.join(self.work_dir, self.TEMPFILE_NAME)
        results.sort(key=lambda res: res[0])

        # average the computed models
        if self.average_models:
            log_info('Creating ensemble models...')
            # use only top k if required
            results_for_ensemble = (results[:self.average_models_top_k]
                                    if self.average_models_top_k > 0
                                    else results)
            ensemble_model = self.build_ensemble_model(results_for_ensemble)
            log_info('Saving the ensemble model temporarily to %s...' % self.model_temp_path)
            ensemble_model.save_to_file(self.model_temp_path)
        # select the best result on devel data + save it
        else:
            best_cost, best_sc = results[0]
            log_info('Best cost: %f (computed at %s:%d).' %
                     (best_cost, best_sc.host, best_sc.port))
            log_info('Saving best generator temporarily to %s...' % self.model_temp_path)
            # use relative path (working directory of worker jobs is different)
            best_sc.conn.root.save_model(os.path.relpath(self.model_temp_path, self.work_dir))

    # kill all jobs
    finally:
        for job in self.jobs:
            job.delete()

def train(self, das_file, ttree_file, data_portion=1.0):
    """Run parallel perceptron training, start and manage workers."""
    # initialize the ranker instance
    log_info('Initializing...')
    self.loc_ranker._init_training(das_file, ttree_file, data_portion)
    # run server to process registering clients
    self._init_server()

    # spawn training jobs
    log_info('Spawning jobs...')
    host_short, _ = self.host.split('.', 1)  # short host name for job names
    for j in xrange(self.jobs_number):
        # set up debugging logfile only if we have it on the head
        debug_logfile = ('"PRT%02d.debug-out.txt.gz"' % j) if is_debug_stream() else 'None'
        job = Job(header='from tgen.parallel_percrank_train import run_worker',
                  code=('run_worker("%s", %d, %s)' %
                        (self.host, self.port, debug_logfile)),
                  name=self.experiment_id + ("PRT%02d-%s-%d" % (j, host_short, self.port)),
                  work_dir=self.work_dir)
        job.submit(self.job_memory)
        self.jobs.append(job)

    # run the training passes
    try:
        for iter_no in xrange(1, self.loc_ranker.passes + 1):

            log_info('Pass %d...' % iter_no)
            log_debug('\n***\nTR%05d:' % iter_no)
            iter_start_time = time.time()
            cur_portion = 0
            results = [None] * self.data_portions
            w_dump = pickle.dumps(self.loc_ranker.get_weights(),
                                  protocol=pickle.HIGHEST_PROTOCOL)
            rnd_seeds = [rnd.random() for _ in xrange(self.data_portions)]

            # wait for free services / assign computation
            while cur_portion < self.data_portions or self.pending_requests:
                log_debug('Starting loop over services.')

                # check if some of the pending computations have finished
                for sc, req_portion, req in list(self.pending_requests):
                    res = self._check_pending_request(iter_no, sc, req_portion, req)
                    if res:
                        results[req_portion] = res

                # check for free services and assign new computation
                while cur_portion < self.data_portions and self.free_services:
                    log_debug('Assigning request %d' % cur_portion)
                    sc = self.free_services.popleft()
                    log_info('Assigning request %d / %d to %s:%d' %
                             (iter_no, cur_portion, sc.host, sc.port))
                    train_func = async(sc.conn.root.training_pass)
                    req = train_func(w_dump, iter_no, rnd_seeds[cur_portion],
                                     *self._get_portion_bounds(cur_portion))
                    self.pending_requests.add((sc, cur_portion, req))
                    cur_portion += 1
                    log_debug('Assigned %d' % cur_portion)

                # sleep for a while
                log_debug('Sleeping.')
                time.sleep(self.poll_interval)

            # delete the temporary ranker dump when the 1st iteration is complete
            if self.ranker_dump_path:
                log_info('Removing temporary ranker dump at %s.' % self.ranker_dump_path)
                os.remove(self.ranker_dump_path)
                self.ranker_dump_path = None

            # gather/average the diagnostic statistics
            self.loc_ranker.set_diagnostics_average([d for _, d in results])
            # take an average of weights; set it as the new w
            self.loc_ranker.set_weights_average([w for w, _ in results])
            self.loc_ranker.store_iter_weights()  # store a copy of w for averaged perceptron

            # print statistics
            log_debug(self.loc_ranker._feat_val_str(), '\n***')
            self.loc_ranker._print_pass_stats(
                iter_no, datetime.timedelta(seconds=(time.time() - iter_start_time)))

        # after all passes: average weights if set to do so
        if self.loc_ranker.averaging is True:
            self.loc_ranker.set_weights_iter_average()

    # kill all jobs
    finally:
        for job in self.jobs:
            job.delete()
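
# A minimal, self-contained sketch of the assign-and-poll control flow shared by both
# train() methods above: hand out work units to free workers, poll the pending
# asynchronous requests, and sleep between polls. ThreadPool.apply_async stands in
# for the asynchronous remote calls to the worker services; all names here
# (fake_training_pass, num_portions, poll_interval) are illustrative and not part of
# tgen or flect.
import random
import time
from collections import deque
from multiprocessing.pool import ThreadPool


def fake_training_pass(seed, portion_no):
    """Stand-in for a remote worker call: pretend to train on one data portion."""
    time.sleep(random.uniform(0.1, 0.3))
    return random.Random(seed).random()  # mock cost for this portion


def run_assign_and_poll(num_portions=6, num_workers=2, poll_interval=0.05):
    pool = ThreadPool(processes=num_workers)
    free_workers = deque(range(num_workers))  # analogous to self.free_services
    pending = set()                           # analogous to self.pending_requests
    results = [None] * num_portions
    cur_assign = 0

    while cur_assign < num_portions or pending:
        # check if some of the pending computations have finished
        for worker, portion, req in list(pending):
            if req.ready():
                results[portion] = req.get()
                pending.remove((worker, portion, req))
                free_workers.append(worker)   # the worker can take new work
        # check for free workers and assign new computation
        while cur_assign < num_portions and free_workers:
            worker = free_workers.popleft()
            req = pool.apply_async(fake_training_pass, (random.random(), cur_assign))
            pending.add((worker, cur_assign, req))
            cur_assign += 1
        # sleep for a while instead of busy-waiting
        time.sleep(poll_interval)

    pool.close()
    pool.join()
    return results


if __name__ == '__main__':
    print(run_assign_and_poll())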