def load_from_file(fname):
    log_info('Loading model from ' + fname)
    with file_stream(fname, mode='rb', encoding=None) as fh:
        candgen = pickle.load(fh)
        # various backward compatibility tricks
        if isinstance(candgen, dict):
            child_type_counts = candgen
            candgen = RandomCandidateGenerator({})
            candgen.child_type_counts = child_type_counts
            candgen.child_num_cdfs = pickle.load(fh)
            candgen.max_children = pickle.load(fh)
        if not hasattr(candgen, 'node_limits'):
            candgen.node_limits = None
        if not hasattr(candgen, 'child_type_counts'):
            candgen.child_type_counts = candgen.form_counts
            candgen.child_num_cdfs = candgen.child_cdfs
        if not hasattr(candgen, 'exp_child_num'):
            candgen.exp_child_num = candgen.exp_from_cdfs(candgen.child_num_cdfs)
        if not hasattr(candgen, 'compatible_dais'):
            candgen.compatible_dais = None
            candgen.compatible_dais_type = None
            candgen.compatible_dais_limit = 1000
        if not hasattr(candgen, 'compatible_slots'):
            candgen.compatible_slots = False
        if not hasattr(candgen, 'classif'):
            candgen.classif = None
    return candgen

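# Usage sketch (the path is a placeholder): thanks to the hasattr() guards above,
# a model pickled by any older version of the class comes back with all of the
# current attributes filled in:
#
#     candgen = RandomCandidateGenerator.load_from_file('models/candgen.pickle.gz')
#     assert hasattr(candgen, 'exp_child_num') and hasattr(candgen, 'classif')
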
def get_worker_registrar_for(head):
    """Return a class that will handle worker registration for the given head."""
    # create a dump of the head to be passed to workers
    log_info('Saving ranker init state...')
    tstart = time.time()
    ranker_dump_path = dump_ranker(head.loc_ranker, head.work_dir)
    log_info('Ranker init state saved in %s, it took %f secs.' %
             (ranker_dump_path, time.time() - tstart))

    class WorkerRegistrarService(Service):

        def exposed_register_worker(self, host, port):
            """Register a worker with my head, initialize it."""
            # initiate connection in the other direction
            log_info('Worker %s:%d connected, initializing training.' % (host, port))
            conn = connect(host, port, config={'allow_pickle': True})
            # initialize the remote server (with training data etc.)
            init_func = async(conn.root.init_training)
            req = init_func(ranker_dump_path)
            # add it to the list of running services
            sc = ServiceConn(host, port, conn)
            head.services.add(sc)
            head.pending_requests.add((sc, None, req))
            log_info('Worker %s:%d initialized.' % (host, port))

    return WorkerRegistrarService, ranker_dump_path

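# Head-side wiring sketch (an assumption -- it mirrors the worker-side server
# setup in run_worker()/run_training() below): the returned class is handed to
# an rpyc server, and the dump path is kept so the head can delete the file
# once all workers are initialized.
#
#     registrar_class, ranker_dump_path = get_worker_registrar_for(head)
#     server = ThreadPoolServer(service=registrar_class, nbThreads=1)
#     Thread(target=server.start).start()
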
def exposed_init_training(self, cfg):
    """Create the Seq2SeqGen object."""
    cfg = pickle.loads(cfg)
    tstart = time.time()
    log_info('Initializing training...')
    self.seq2seq = Seq2SeqGen(cfg)
    log_info('Training initialized. Time taken: %f secs.' % (time.time() - tstart))

def can_generate(self, tree, da):
    """Check if the candidate generator can generate a given tree at all.

    This is for debugging purposes only. Tests whether get_all_successors can
    always return a successor that leads to the given tree (only successors
    that are subtrees of the given tree are put on the open list).
    """
    self.init_run(da)
    open_list = CandidateList({TreeData(): 1})
    found = False
    tree_no = 0
    while open_list and not found:
        cur_st, _ = open_list.pop()
        if cur_st == tree:
            found = True
            break
        for succ in self.get_all_successors(cur_st):
            tree_no += 1
            # only push on the open list if the successor is still a subtree of the target tree
            if tree.common_subtree_size(succ) == len(succ):
                open_list.push(succ, len(succ))
    if not found:
        log_info('Did not find tree: ' + unicode(tree) + ' for DA: ' + unicode(da) +
                 (' (total %d trees)' % tree_no))
        return False
    log_info('Found tree: %s for DA: %s (as %d-th tree)' % (unicode(tree), unicode(da), tree_no))
    return tree_no

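# The pruning test above hinges on a single invariant: succ is a subtree of tree
# exactly when their common subtree covers all of succ, i.e.
# tree.common_subtree_size(succ) == len(succ), assuming len() counts nodes.
# A toy illustration:
#
#     tree = A -> (B, C);  succ = A -> B   =>  common size 2 == len(succ) 2: keep
#     tree = A -> (B, C);  succ = A -> D   =>  common size 1 != len(succ) 2: prune
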
def can_generate_greedy(self, tree, da):
    """Check if the candidate generator can generate a given tree greedily,
    always pursuing the first viable path.

    This is for debugging purposes only. Uses `get_all_successors` and always
    goes on with the first one that increases coverage of the current tree.
    """
    self.init_run(da)
    cur_subtree = TreeData()
    found = True
    while found and cur_subtree != tree:
        found = False
        for succ in self.get_all_successors(cur_subtree):
            # use the first successor that is still a subtree of the target tree
            if tree.common_subtree_size(succ) == len(succ):
                cur_subtree = succ
                found = True
                break
    # we have hit a dead end
    if cur_subtree != tree:
        log_info('Did not find tree: ' + unicode(tree) + ' for DA: ' + unicode(da))
        return False
    # everything alright
    log_info('Found tree: %s for DA: %s' % (unicode(tree), unicode(da)))
    return True

def save_to_file(self, model_file):
    """\
    Save the model to a pickle file or stream (supports GZip compression).
    """
    log_info('Saving model to file ' + str(model_file))
    fh = file_stream(model_file, mode='wb', encoding=None)
    pickle.Pickler(fh, pickle.HIGHEST_PROTOCOL).dump(self)
    fh.close()
    log_info('Model successfully saved.')

def _init_training(self, das_file, ttree_file, data_portion):
    # load data, determine number of features etc. etc.
    super(PerceptronRanker, self)._init_training(das_file, ttree_file, data_portion)
    # initialize weights
    self.w = np.ones(self.train_feats.shape[1])
    self.update_weights_sum()
    # self.w = np.array([rnd.gauss(0, self.alpha) for _ in xrange(self.train_feats.shape[1])])
    log_debug('\n***\nINIT:')
    log_debug(self._feat_val_str())
    log_info('Training ...')

def load_training_set(self, filename, encoding='UTF-8'):
    """\
    Load the given training data set into memory and strip it if configured to
    via the train_part parameter.
    """
    log_info('Loading training data set from ' + str(filename) + '...')
    train = DataSet()
    train.load_from_arff(filename, encoding)
    if self.train_part < 1:
        train = train.subset(0, int(round(self.train_part * len(train))), copy=False)
    return train

def load_from_file(model_file):
    """\
    Load the model from a pickle file or stream (supports GZip compression).
    """
    log_info('Loading model from file ' + str(model_file))
    fh = file_stream(model_file, mode='rb', encoding=None)
    unpickler = pickle.Unpickler(fh)
    model = unpickler.load()
    fh.close()
    log_info('Model loaded successfully.')
    return model

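# Round-trip sketch pairing save_to_file()/load_from_file() (the file name is a
# placeholder; the '.gz' suffix engaging GZip compression is taken from the
# docstrings above):
#
#     model.save_to_file('model.pickle.gz')
#     model = Model.load_from_file('model.pickle.gz')
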
def train(self, das_file, ttree_file, data_portion=1.0):
    """Run training on the given training data."""
    self._init_training(das_file, ttree_file, data_portion)
    for iter_no in xrange(1, self.passes + 1):
        self.train_order = range(len(self.train_trees))
        if self.randomize:
            rnd.shuffle(self.train_order)
        log_info("Train order: " + str(self.train_order))
        self._training_pass(iter_no)
        if self.evaluator.tree_accuracy() == 1:  # if tree accuracy is 1, we won't learn anything anymore
            break
    # averaged perceptron – average the weights obtained after each pass
    if self.averaging is True:
        self.set_weights_iter_average()

def train_on_data(self, train):
    """\
    Train model on the specified training data set (which must be a loaded
    DataSet object).
    """
    log_info('Preparing data set...')
    self.data_headers = train.get_headers()
    self.attr_mask = self.get_attr_mask()
    train_vect = self.__vectorize(train)
    train_classes = self.get_classes(train)
    # if all the training data have the same class, use a dummy classifier
    if train.get_attrib(self.class_attr).num_values == 1:
        self.feature_filter = None
        self.classifier = DummyClassifier(strategy='most_frequent')
    # filter features
    log_info('Filtering...')
    train_filt = self.__filter_features(train_vect, train_classes)
    # train the classifier
    log_info('Training...')
    if self.use_weights:
        self.classifier.fit(train_filt, train_classes, sample_weight=train.inst_weights)
    else:
        self.classifier.fit(train_filt, train_classes)
    self.classifier_trained = True
    log_info('Training done.')

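# Why the dummy works here: with a single observed class, sklearn's
# DummyClassifier(strategy='most_frequent') simply memorizes that class and
# predicts it for any input -- exactly the desired degenerate behaviour.
# A self-contained check on toy data:
from sklearn.dummy import DummyClassifier

_clf = DummyClassifier(strategy='most_frequent')
_clf.fit([[0], [1]], ['A', 'A'])      # only one class ever observed
assert list(_clf.predict([[42]])) == ['A']  # any input maps to that class
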
def exposed_register_worker(self, host, port):
    """Register a worker with my head, initialize it."""
    # initiate connection in the other direction
    log_info('Worker %s:%d connected, initializing training.' % (host, port))
    conn = connect(host, port, config={'allow_pickle': True})
    # initialize the remote server (with training data etc.)
    init_func = async(conn.root.init_training)
    # add a unique 'scope suffix' so that the models don't clash in ensembles
    head.cfg['scope_suffix'] = hashlib.md5("%s:%d" % (host, port)).hexdigest()
    req = init_func(pickle.dumps(head.cfg, pickle.HIGHEST_PROTOCOL))
    # add it to the list of running services
    sc = ServiceConn(host, port, conn)
    head.services.add(sc)
    head.pending_requests.add((sc, None, req))
    log_info('Worker %s:%d initialized.' % (host, port))

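# Note on the scope suffix: hashlib.md5(...).hexdigest() yields a deterministic
# 32-character hex string, so the same worker address always maps to the same
# suffix while distinct workers (almost surely) get distinct ones -- presumably
# this is what namespaces each ensemble member's model parameters.
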
def exposed_train(self, rnd_seed, das_file, ttree_file, data_portion, context_file, validation_files):
    """Run the whole training."""
    rnd.seed(rnd_seed)
    log_info('Random seed: %f' % rnd_seed)
    tstart = time.time()
    log_info('Starting training...')
    self.seq2seq.train(das_file, ttree_file, data_portion, context_file, validation_files)
    log_info('Training finished -- time taken: %f secs.' % (time.time() - tstart))
    top_cost = self.seq2seq.top_k_costs[0]
    log_info('Best cost: %f' % top_cost)
    return top_cost

def save_to_file(self, model_fname):
    """This will actually just move the best generator (which is saved in a
    temporary file) to the final location."""
    log_info('Moving generator to %s...' % model_fname)
    orig_model_fname = self.model_temp_path
    shutil.move(orig_model_fname, model_fname)
    orig_tf_session_fname = re.sub(r'(\.pickle)?(\.gz)?$', '.tfsess', orig_model_fname)
    tf_session_fname = re.sub(r'(\.pickle)?(\.gz)?$', '.tfsess', model_fname)
    if os.path.isfile(orig_tf_session_fname):
        shutil.move(orig_tf_session_fname, tf_session_fname)
    # move the reranking classifier model files as well, if they exist
    orig_clfilter_fname = re.sub(r'((\.pickle)?(\.gz)?)$', r'.tftreecl\1', orig_model_fname)
    orig_clfilter_tf_fname = re.sub(r'((\.pickle)?(\.gz)?)$', r'.tfsess', orig_clfilter_fname)
    if os.path.isfile(orig_clfilter_fname) and os.path.isfile(orig_clfilter_tf_fname):
        clfilter_fname = re.sub(r'((\.pickle)?(\.gz)?)$', r'.tftreecl\1', model_fname)
        clfilter_tf_fname = re.sub(r'((\.pickle)?(\.gz)?)$', r'.tfsess', clfilter_fname)
        shutil.move(orig_clfilter_fname, clfilter_fname)
        shutil.move(orig_clfilter_tf_fname, clfilter_tf_fname)

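# Filename mapping sketch (hypothetical names; relies on the Python 2 re.sub
# behaviour of not re-replacing the trailing empty match):
#
#     model file      'run1/model.pickle.gz'
#     TF session      'run1/model.tfsess'
#     reranker        'run1/model.tftreecl.pickle.gz'
#     reranker TF     'run1/model.tftreecl.tfsess'
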
def _init_training(self, das_file, ttree_file, data_portion):
    super(FeaturesPerceptronRanker, self)._init_training(das_file, ttree_file, data_portion)
    # precompute training data features
    X = []
    for da, tree in zip(self.train_das, self.train_trees):
        X.append(self.feats.get_features(tree, {'da': da}))
    if self.prune_feats > 1:
        self._prune_features(X)
    # vectorize and binarize or normalize (+ train vectorizer/normalizer)
    if self.binarize:
        self.vectorizer = DictVectorizer(sparse=False, binarize_numeric=True)
        self.train_feats = self.vectorizer.fit_transform(X)
    else:
        self.vectorizer = DictVectorizer(sparse=False)
        self.normalizer = StandardScaler(copy=False)
        self.train_feats = self.normalizer.fit_transform(self.vectorizer.fit_transform(X))
    log_info('Features matrix shape: %s' % str(self.train_feats.shape))

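# Shape sketch (an assumption about feats.get_features(), which is not shown
# here): each call presumably yields a dict of named feature values for one
# (tree, DA) pair, e.g. {'depth': 3, 'da_slot=food': 1}; DictVectorizer then
# assigns one matrix column per distinct key, so train_feats ends up with
# shape (num_instances, num_distinct_features).
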
def run_training(head_host, head_port, debug_out=None):
    """Main worker training routine (creates the Seq2SeqTrainingService and
    connects it to the head).

    @param head_host: hostname of the head
    @param head_port: head port number
    @param debug_out: path to the debugging output file (debug output discarded if None)
    """
    # setup debugging output, if applicable
    if debug_out is not None:
        set_debug_stream(file_stream(debug_out, mode='w'))
    # start the server (in the background)
    log_info('Creating training server...')
    server = ThreadPoolServer(service=Seq2SeqTrainingService, nbThreads=1)
    server_thread = Thread(target=server.start)
    server_thread.start()
    my_host = socket.getfqdn()
    log_info('Worker server created at %s:%d. Connecting to head at %s:%d...' %
             (my_host, server.port, head_host, head_port))
    # notify main about this server
    conn = connect(head_host, head_port, config={'allow_pickle': True})
    conn.root.register_worker(my_host, server.port)
    conn.close()
    log_info('Worker is registered with the head.')
    # now serve until we're killed (the server thread will continue to run)
    server_thread.join()

def _init_training(self, das_file, ttree_file, data_portion):
    """Initialize training (read input data, fix size, initialize candidate
    generator and planner)."""
    # read input
    log_info('Reading DAs from ' + das_file + '...')
    das = read_das(das_file)
    log_info('Reading t-trees from ' + ttree_file + '...')
    ttree_doc = read_ttrees(ttree_file)
    sents = sentences_from_doc(ttree_doc, self.language, self.selector)
    trees = trees_from_doc(ttree_doc, self.language, self.selector)
    # make training data smaller if necessary
    train_size = int(round(data_portion * len(trees)))
    self.train_trees = trees[:train_size]
    self.train_das = das[:train_size]
    self.train_sents = sents[:train_size]
    self.train_order = range(len(self.train_trees))
    log_info('Using %d training instances.' % train_size)
    # initialize candidate generator + planner if needed
    if self.candgen_model is not None:
        self.candgen = RandomCandidateGenerator.load_from_file(self.candgen_model)
        self.sampling_planner = SamplingPlanner({'language': self.language,
                                                 'selector': self.selector,
                                                 'candgen': self.candgen})
    if 'gen_cur_weights' in self.rival_gen_strategy:
        assert self.candgen is not None
        self.asearch_planner = ASearchPlanner({'candgen': self.candgen,
                                               'language': self.language,
                                               'selector': self.selector,
                                               'ranker': self})

def exposed_training_pass(self, w, pass_no, rnd_seed, data_offset, data_len):
    """(Worker) Run one pass over a part of the training data.

    @param w: initial perceptron weights (pickled)
    @param pass_no: pass number (for logging purposes)
    @param rnd_seed: random generator seed for shuffling training examples
    @param data_offset: training data portion start
    @param data_len: training data portion size
    @return: updated perceptron weights after passing the selected data portion (pickled)
    """
    log_info('Training pass %d with data portion %d + %d' %
             (pass_no, data_offset, data_len))
    # import current feature weights
    ranker = self.ranker_inst
    ranker.set_weights(pickle.loads(w))
    # save the rest of the training data to temporary variables, set just the
    # required portion for computation
    all_train_das = ranker.train_das
    ranker.train_das = ranker.train_das[data_offset:data_offset + data_len]
    all_train_trees = ranker.train_trees
    ranker.train_trees = ranker.train_trees[data_offset:data_offset + data_len]
    all_train_feats = ranker.train_feats
    ranker.train_feats = ranker.train_feats[data_offset:data_offset + data_len]
    all_train_sents = ranker.train_sents
    ranker.train_sents = ranker.train_sents[data_offset:data_offset + data_len]
    all_train_order = ranker.train_order
    ranker.train_order = range(len(ranker.train_trees))
    if ranker.randomize:
        rnd.seed(rnd_seed)
        rnd.shuffle(ranker.train_order)
    # do the actual computation (update w)
    ranker._training_pass(pass_no)
    # return the rest of the training data to member variables
    ranker.train_das = all_train_das
    ranker.train_trees = all_train_trees
    ranker.train_feats = all_train_feats
    ranker.train_sents = all_train_sents
    ranker.train_order = all_train_order
    # return the result of the computation
    log_info('Training pass %d / %d / %d done.' % (pass_no, data_offset, data_len))
    return pickle.dumps((ranker.get_weights(), ranker.get_diagnostics()),
                        pickle.HIGHEST_PROTOCOL)

def _check_pending_request(self, sc, job_no, req):
    """Check whether the given request has finished (i.e., the job is loaded or
    the job has processed its data portion). If the request is finished, the
    worker that processed it is moved to the pool of free services.

    @param sc: a ServiceConn object that stores the worker connection parameters
    @param job_no: current job number (is None for loading requests)
    @param req: the request itself
    @return: the value returned by the finished data processing request, or None \
        (for loading requests or unfinished requests)
    """
    result = None
    if job_no is not None:
        log_debug('Checking %d' % job_no)

    # check if the request has finished
    if req.ready:
        if job_no is not None:
            log_debug('Ready %d' % job_no)
            log_info('Retrieved finished request %d' % job_no)
        if req.error:
            log_info('Error found on request: job #%d, worker %s:%d' %
                     (job_no if job_no is not None else -1, sc.host, sc.port))
        result = req.value
        # remove from the list of pending requests
        # TODO return to pool of free requests (but needs to store the results somewhere)
        self.pending_requests.remove((sc, job_no, req))
        if job_no is None:
            self.free_services.append(sc)
    return result

def _check_pending_request(self, iter_no, sc, req_portion, req):
    """Check whether the given request has finished (i.e., the job is loaded or
    the job has processed its data portion). If the request is finished, the
    worker that processed it is moved to the pool of free services.

    @param iter_no: current iteration number (for logging)
    @param sc: a ServiceConn object that stores the worker connection parameters
    @param req_portion: current data portion number (is None for loading requests)
    @param req: the request itself
    @return: the value returned by the finished data processing request, or None \
        (for loading requests or unfinished requests)
    """
    result = None
    if req_portion is not None:
        log_debug('Checking %d' % req_portion)

    # check if the request has finished
    if req.ready:
        # loading requests -- do nothing (just logging)
        if req_portion is None:
            if req.error:
                log_info('Error loading on %s:%d' % (sc.host, sc.port))
            else:
                log_info('Worker %s:%d finished loading.' % (sc.host, sc.port))
        # data processing request -- retrieve the value
        else:
            log_debug('Ready %d' % req_portion)
            log_info('Retrieved finished request %d / %d' % (iter_no, req_portion))
            if req.error:
                log_info('Error found on request: IT %d PORTION %d, WORKER %s:%d' %
                         (iter_no, req_portion, sc.host, sc.port))
            result = pickle.loads(req.value)
        # add the worker to the pool of free services (both loading and data processing requests)
        self.pending_requests.remove((sc, req_portion, req))
        self.free_services.append(sc)
    if req_portion is not None:
        log_debug('Done with %d' % req_portion)
    return result

def exposed_register_worker(self, host, port):
    """Register a worker with my head, initialize it."""
    # initiate connection in the other direction
    log_info('Worker %s:%d connected, initializing training.' % (host, port))
    conn = connect(host, port, config={'allow_pickle': True})
    # initialize the remote server (with training data etc.)
    log_info('Ranker dump size: %d' % sys.getsizeof(ranker_dump))
    conn.root.init_training(ranker_dump)
    # add it to the list of running services
    sc = ServiceConn(host, port, conn)
    head.services.add(sc)
    head.free_services.append(sc)
    log_info('Worker %s:%d initialized.' % (host, port))

def _init_training(self, das_file, ttree_file, data_portion):
    """Initialize training (read input data, fix size, initialize candidate
    generator and planner)."""
    # read input
    log_info('Reading DAs from ' + das_file + '...')
    das = read_das(das_file)
    log_info('Reading t-trees from ' + ttree_file + '...')
    ttree_doc = read_ttrees(ttree_file)
    sents = sentences_from_doc(ttree_doc, self.language, self.selector)
    trees = trees_from_doc(ttree_doc, self.language, self.selector)
    # make training data smaller if necessary
    train_size = int(round(data_portion * len(trees)))
    self.train_trees = trees[:train_size]
    self.train_das = das[:train_size]
    self.train_sents = sents[:train_size]
    self.train_order = range(len(self.train_trees))
    log_info('Using %d training instances.' % train_size)
    # initialize candidate generator
    if self.candgen_model is not None:
        self.candgen = RandomCandidateGenerator.load_from_file(self.candgen_model)
        # self.sampling_planner = SamplingPlanner({'language': self.language,
        #                                          'selector': self.selector,
        #                                          'candgen': self.candgen})
    # check if the A*search planner is needed (i.e., whether any rival generation
    # strategy requires it) and initialize it
    if isinstance(self.rival_gen_strategy[0], tuple):
        asearch_needed = any(s in ['gen_cur_weights', 'gen_update']
                             for _, ss in self.rival_gen_strategy
                             for s in ss)
    else:
        asearch_needed = any(s in ['gen_cur_weights', 'gen_update']
                             for s in self.rival_gen_strategy)
    if asearch_needed:
        assert self.candgen is not None
        self.asearch_planner = ASearchPlanner({'candgen': self.candgen,
                                               'language': self.language,
                                               'selector': self.selector,
                                               'ranker': self})

def run_worker(head_host, head_port, debug_out=None):
    # setup debugging output, if applicable
    if debug_out is not None:
        set_debug_stream(file_stream(debug_out, mode='w'))
    # start the server (in the background)
    log_info('Creating worker server...')
    server = ThreadPoolServer(service=RankerTrainingService, nbThreads=1)
    server_thread = Thread(target=server.start)
    server_thread.start()
    my_host = socket.getfqdn()
    log_info('Worker server created at %s:%d. Connecting to head at %s:%d...' %
             (my_host, server.port, head_host, head_port))
    # notify main about this server
    conn = connect(head_host, head_port, config={'allow_pickle': True})
    conn.root.register_worker(my_host, server.port)
    conn.close()
    log_info('Worker is registered with the head.')
    # now serve until we're killed (the server thread will continue to run)
    server_thread.join()

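# The whole registration handshake, end to end (pieced together from the
# functions above):
#   1. the head starts a registrar service (see get_worker_registrar_for());
#   2. each worker starts its own ThreadPoolServer and calls the head's
#      exposed_register_worker() with its host and port;
#   3. the head connects back and fires an init_training request;
#   4. once that request finishes (see _check_pending_request()), the worker
#      is moved to the head's pool of free services and can receive work.
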
def exposed_init_training(self, head_ranker_path):
    """(Worker) Load the head's ranker instance from its dump file."""
    tstart = time.time()
    log_info('Initializing training...')
    self.ranker_inst = load_ranker(head_ranker_path)
    log_info('Training initialized. Time taken: %f secs.' % (time.time() - tstart))

def train(self, das_file, ttree_file, data_portion=1.0):
    """Run parallel perceptron training, start and manage workers."""
    # initialize the ranker instance
    log_info('Initializing...')
    self.loc_ranker._init_training(das_file, ttree_file, data_portion)
    # run server to process registering clients
    self._init_server()
    # spawn training jobs
    log_info('Spawning jobs...')
    host_short, _ = self.host.split('.', 1)  # short host name for job names
    for j in xrange(self.jobs_number):
        # set up debugging logfile only if we have it on the head
        debug_logfile = ('"PRT%02d.debug-out.txt.gz"' % j) if is_debug_stream() else 'None'
        job = Job(header='from tgen.parallel_percrank_train import run_worker',
                  code=('run_worker("%s", %d, %s)' % (self.host, self.port, debug_logfile)),
                  name=self.experiment_id + ("PRT%02d-%s-%d" % (j, host_short, self.port)),
                  work_dir=self.work_dir)
        job.submit(self.job_memory)
        self.jobs.append(job)
    # run the training passes
    try:
        for iter_no in xrange(1, self.loc_ranker.passes + 1):
            log_info('Pass %d...' % iter_no)
            log_debug('\n***\nTR%05d:' % iter_no)
            iter_start_time = time.time()
            cur_portion = 0
            results = [None] * self.data_portions
            w_dump = pickle.dumps(self.loc_ranker.get_weights(), protocol=pickle.HIGHEST_PROTOCOL)
            rnd_seeds = [rnd.random() for _ in xrange(self.data_portions)]
            # wait for free services / assign computation
            while cur_portion < self.data_portions or self.pending_requests:
                log_debug('Starting loop over services.')
                # check if some of the pending computations have finished
                for sc, req_portion, req in list(self.pending_requests):
                    res = self._check_pending_request(iter_no, sc, req_portion, req)
                    if res:
                        results[req_portion] = res
                # check for free services and assign new computation
                while cur_portion < self.data_portions and self.free_services:
                    log_debug('Assigning request %d' % cur_portion)
                    sc = self.free_services.popleft()
                    log_info('Assigning request %d / %d to %s:%d' %
                             (iter_no, cur_portion, sc.host, sc.port))
                    train_func = async(sc.conn.root.training_pass)
                    req = train_func(w_dump, iter_no, rnd_seeds[cur_portion],
                                     *self._get_portion_bounds(cur_portion))
                    self.pending_requests.add((sc, cur_portion, req))
                    cur_portion += 1
                    log_debug('Assigned %d' % cur_portion)
                # sleep for a while
                log_debug('Sleeping.')
                time.sleep(self.poll_interval)
            # delete the temporary ranker dump when the 1st iteration is complete
            if self.ranker_dump_path:
                log_info('Removing temporary ranker dump at %s.' % self.ranker_dump_path)
                os.remove(self.ranker_dump_path)
                self.ranker_dump_path = None
            # gather/average the diagnostic statistics
            self.loc_ranker.set_diagnostics_average([d for _, d in results])
            # take an average of weights; set it as new w
            self.loc_ranker.set_weights_average([w for w, _ in results])
            self.loc_ranker.store_iter_weights()  # store a copy of w for averaged perceptron
            # print statistics
            log_debug(self.loc_ranker._feat_val_str(), '\n***')
            self.loc_ranker._print_pass_stats(
                iter_no, datetime.timedelta(seconds=(time.time() - iter_start_time)))
        # after all passes: average weights if set to do so
        if self.loc_ranker.averaging is True:
            self.loc_ranker.set_weights_iter_average()
    # kill all jobs
    finally:
        for job in self.jobs:
            job.delete()

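# _get_portion_bounds() is referenced above but not shown in this section; a
# minimal standalone sketch of the contract it must satisfy -- returning
# (data_offset, data_len) for the i-th of n roughly equal chunks, matching the
# slicing done in exposed_training_pass() (an assumption, not the actual
# implementation):
def get_portion_bounds_sketch(portion_no, total_size, data_portions):
    chunk = -(-total_size // data_portions)  # ceiling division
    offset = portion_no * chunk
    return offset, max(0, min(chunk, total_size - offset))
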
def load_from_file(model_fname):
    """Load a pre-trained model from a file."""
    log_info("Loading ranker from %s..." % model_fname)
    with file_stream(model_fname, 'rb', encoding=None) as fh:
        return pickle.load(fh)

def save_to_file(self, model_fname):
    """Save the model to a file."""
    log_info("Saving ranker to %s..." % model_fname)
    with file_stream(model_fname, 'wb', encoding=None) as fh:
        pickle.dump(self, fh, protocol=pickle.HIGHEST_PROTOCOL)

def train(self, train_file, work_dir, memory=8, encoding='UTF-8'):
    """\
    Read training data, split them and train the individual models
    (in cluster jobs).
    """
    # load the entire data set
    train = self.load_training_set(train_file, encoding)
    self.data_headers = train.get_headers()
    self.attr_mask = self.get_attr_mask()
    # train a backoff model
    log_info('Training a backoff model...')
    self.backoff_model = self.train_backoff_model(train)
    # split it
    log_info('Split...')
    train_split = train.split(eval(self.divide_func), keep_copy=False)
    jobs = []
    model_files = {}
    # save training files and create training jobs
    for key, subset in train_split.iteritems():
        fn = re.sub(r'(\.arff(\.gz)?)?$', '-' + key + '.arff.gz', train_file)
        fn = os.path.join(work_dir, os.path.basename(fn))
        subset.save_to_arff(fn, encoding)
        job, model_file = Model.create_training_job(self.config, work_dir, fn,
                                                    memory=memory, encoding=encoding)
        jobs.append(job)
        model_files[key] = model_file
    # submit the training jobs and wait for all of them
    log_info('Submitting training jobs...')
    for job in jobs:
        job.submit()
    log_info('Waiting for jobs...')
    for job in jobs:
        job.wait()
    # load all models
    log_info('Training complete. Assembling model files...')
    for key, model_file in model_files.iteritems():
        self.models[key] = Model.load_from_file(model_file)
    self.trained = True
    log_info('Training done.')

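# divide_func arrives from the configuration as a string and is eval'd into the
# callable that DataSet.split() uses as a splitting key. A purely hypothetical
# config value (the exact signature DataSet.split() expects is not shown here):
#
#     config['divide_func'] = "lambda inst: inst['formeme']"
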
def _print_pass_stats(self, pass_no, pass_duration):
    """Print pass statistics from internal evaluator fields and given pass duration."""
    log_info('Pass %05d -- tree-level accuracy: %.4f' % (pass_no, self.evaluator.tree_accuracy()))
    log_info(' * Generated trees NODE scores: P: %.4f, R: %.4f, F: %.4f' %
             self.evaluator.p_r_f1())
    log_info(' * Generated trees DEP scores: P: %.4f, R: %.4f, F: %.4f' %
             self.evaluator.p_r_f1(EvalTypes.DEP))
    log_info(' * Gold tree BEST: %.4f, on CLOSE: %.4f, on ANY list: %.4f' %
             self.lists_analyzer.stats())
    log_info(' * Tree size stats:\n -- GOLD: %s\n -- PRED: %s\n -- DIFF: %s' %
             self.evaluator.size_stats())
    log_info(' * Common subtree stats:\n -- SIZE: %s\n -- ΔGLD: %s\n -- ΔPRD: %s' %
             self.evaluator.common_substruct_stats())
    log_info(' * Score stats\n -- GOLD: %s\n -- PRED: %s\n -- DIFF: %s' %
             self.evaluator.score_stats())
    log_info(' * Duration: %s' % str(pass_duration))