def run_training(head_host, head_port, debug_out=None):
    """Main worker training routine (creates the Seq2SeqTrainingService and connects it
    to the head).

    @param head_host: hostname of the head
    @param head_port: head port number
    @param debug_out: path to the debugging output file (debug output discarded if None)
    """
    # setup debugging output, if applicable
    if debug_out is not None:
        set_debug_stream(file_stream(debug_out, mode='w'))
    # start the server (in the background)
    log_info('Creating training server...')
    server = ThreadPoolServer(service=Seq2SeqTrainingService, nbThreads=1)
    server_thread = Thread(target=server.start)
    server_thread.start()
    my_host = socket.getfqdn()
    log_info('Worker server created at %s:%d. Connecting to head at %s:%d...' %
             (my_host, server.port, head_host, head_port))
    # notify main about this server
    conn = connect(head_host, head_port, config={'allow_pickle': True})
    conn.root.register_worker(my_host, server.port)
    conn.close()
    log_info('Worker is registered with the head.')
    # now serve until we're killed (the server thread will continue to run)
    server_thread.join()
def load_from_file(model_fname):
    """Load the generator from a file (actually two files, one for configuration and one
    for the TensorFlow graph, which must be stored separately).

    @param model_fname: file name (for the configuration file); TF graph must be stored with a \
        different extension
    """
    log_info("Loading generator from %s..." % model_fname)
    with file_stream(model_fname, 'rb', encoding=None) as fh:
        data = pickle.load(fh)
        ret = Seq2SeqGen(cfg=data['cfg'])
        ret.load_all_settings(data)

    if ret.classif_filter:
        classif_filter_fname = re.sub(r'((.pickle)?(.gz)?)$', r'.tftreecl\1', model_fname)
        if os.path.isfile(classif_filter_fname):
            ret.classif_filter = RerankingClassifier.load_from_file(classif_filter_fname)
        else:
            log_warn("Classification filter data not found, ignoring.")
            ret.classif_filter = False

    # re-build TF graph and restore the TF session
    tf_session_fname = re.sub(r'(.pickle)?(.gz)?$', '.tfsess', model_fname)
    ret._init_neural_network()
    ret.saver.restore(ret.session, tf_session_fname)

    return ret
def load_from_file(fname):
    log_info('Loading model from ' + fname)
    with file_stream(fname, mode='rb', encoding=None) as fh:
        candgen = pickle.load(fh)
        # various backward compatibility tricks
        if type(candgen) == dict:
            child_type_counts = candgen
            candgen = RandomCandidateGenerator({})
            candgen.child_type_counts = child_type_counts
            candgen.child_num_cdfs = pickle.load(fh)
            candgen.max_children = pickle.load(fh)
        if not hasattr(candgen, 'node_limits'):
            candgen.node_limits = None
        if not hasattr(candgen, 'child_type_counts'):
            candgen.child_type_counts = candgen.form_counts
            candgen.child_num_cdfs = candgen.child_cdfs
        if not hasattr(candgen, 'exp_child_num'):
            candgen.exp_child_num = candgen.exp_from_cdfs(candgen.child_num_cdfs)
        if not hasattr(candgen, 'compatible_dais'):
            candgen.compatible_dais = None
            candgen.compatible_dais_type = None
            candgen.compatible_dais_limit = 1000
        if not hasattr(candgen, 'compatible_slots'):
            candgen.compatible_slots = False
        if not hasattr(candgen, 'classif'):
            candgen.classif = None
        return candgen
def save_to_file(self, model_fname):
    """Save the generator to a file (actually two files, one for configuration and one
    for the TensorFlow graph, which must be stored separately).

    @param model_fname: file name (for the configuration file); TF graph will be stored with a \
        different extension
    """
    log_info("Saving generator to %s..." % model_fname)
    if self.classif_filter:
        classif_filter_fname = re.sub(r'((.pickle)?(.gz)?)$', r'.tftreecl\1', model_fname)
        self.classif_filter.save_to_file(classif_filter_fname)
    if self.lexicalizer:
        lexicalizer_fname = re.sub(r'((.pickle)?(.gz)?)$', r'.lexic\1', model_fname)
        self.lexicalizer.save_to_file(lexicalizer_fname)

    with file_stream(model_fname, 'wb', encoding=None) as fh:
        pickle.dump(self.get_all_settings(), fh, protocol=pickle.HIGHEST_PROTOCOL)
    tf_session_fname = re.sub(r'(.pickle)?(.gz)?$', '.tfsess', model_fname)
    if hasattr(self, 'checkpoint_path') and self.checkpoint_path:
        shutil.copyfile(self.checkpoint_path, tf_session_fname)
    else:
        self.saver.save(self.session, tf_session_fname)
def process_document(self, doc):
    "Write a CoNLL-U file"
    out = file_stream(self.get_output_file_name(doc), 'w', encoding='UTF-8')
    for bundle in doc.bundles:
        zone = bundle.get_zone(self.language, self.selector)
        nodes = zone.atree.get_descendants(ordered=1)
        # Empty sentences are not allowed in CoNLL-U.
        if len(nodes) == 0:
            continue
        comment = zone.wild['comment']
        if comment:
            out.write('#' + comment.rstrip('\r\n').replace('\n', '\n#') + '\n')
        index = 1
        for node in nodes:
            out.write('\t'.join(
                '_' if value is None else value
                for value in map((lambda x: str(x) if type(x) == int else getattr(node, x, '_')),
                                 [index, 'form', 'lemma', 'upos', 'xpos', 'feats',
                                  node.parent.ord, 'deprel', 'deps', 'misc'])) + '\n')
            index += 1
        out.write('\n')
def save_to_file(self, model_fname):
    """Save the whole ensemble into a file (get all settings and parameters, dump them
    in a pickle)."""
    log_info("Saving generator to %s..." % model_fname)
    with file_stream(model_fname, 'wb', encoding=None) as fh:
        pickle.dump(self.__class__, fh, protocol=pickle.HIGHEST_PROTOCOL)
        pickle.dump(self.cfg, fh, protocol=pickle.HIGHEST_PROTOCOL)
        gens_dump = []
        for gen in self.gens:
            setting = gen.get_all_settings()
            parset = gen.get_model_params()
            setting['classif_filter'] = self.classif_filter is not None
            gens_dump.append((setting, parset))
        pickle.dump(gens_dump, fh, protocol=pickle.HIGHEST_PROTOCOL)

        if self.classif_filter:
            pickle.dump(self.classif_filter.get_all_settings(), fh,
                        protocol=pickle.HIGHEST_PROTOCOL)
            pickle.dump(self.classif_filter.get_model_params(), fh,
                        protocol=pickle.HIGHEST_PROTOCOL)
def process_document(self, filename):
    "Read a YAML file and return its contents as a Document object"
    f = file_stream(filename, encoding=None)
    data = yaml.load(f)
    doc = Document(filename, data)
    f.close()
    return doc
def load_from_file(model_fname):
    """Detect correct model type (plain/ensemble) and start loading."""
    model_type = Seq2SeqGen  # default to plain generator
    with file_stream(model_fname, 'rb', encoding=None) as fh:
        data = pickle.load(fh)
        if isinstance(data, type):
            model_type = data

    return model_type.load_from_file(model_fname)
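# A minimal, self-contained sketch (not part of the codebase) of the dispatch pattern used by
# load_from_file above: the ensemble saver pickles its own class as the first object in the
# file, so a generic loader can peek at that object and hand off loading to the right class.
# The names PlainGen / EnsembleGen and the in-memory buffer are hypothetical stand-ins.
import io
import pickle


class PlainGen(object):
    """Hypothetical plain model: the file starts directly with a settings dict."""

    def __init__(self, cfg):
        self.cfg = cfg

    def save(self, fh):
        pickle.dump({'cfg': self.cfg}, fh)

    @staticmethod
    def load(fh):
        return PlainGen(pickle.load(fh)['cfg'])


class EnsembleGen(object):
    """Hypothetical ensemble model: the file starts with the class as a type marker."""

    def __init__(self, cfg):
        self.cfg = cfg

    def save(self, fh):
        pickle.dump(EnsembleGen, fh)  # type marker, read again by the dispatcher
        pickle.dump({'cfg': self.cfg}, fh)

    @staticmethod
    def load(fh):
        assert pickle.load(fh) is EnsembleGen  # consume the type marker
        return EnsembleGen(pickle.load(fh)['cfg'])


def load_any(fh):
    """Peek at the first pickled object, rewind, and dispatch to the right loader."""
    first = pickle.load(fh)
    fh.seek(0)
    model_type = first if isinstance(first, type) else PlainGen
    return model_type.load(fh)


# usage: the dispatcher recovers the right class without knowing it in advance
buf = io.BytesIO()
EnsembleGen({'beam_size': 10}).save(buf)
buf.seek(0)
assert isinstance(load_any(buf), EnsembleGen)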
def process_document(self, doc):
    "Write a YAML document"
    data = []
    for bundle in doc.bundles:
        data.append(self.serialize_bundle(bundle))
    out = file_stream(self.get_output_file_name(doc), 'w', encoding=None)
    out.write(yaml.safe_dump(data, allow_unicode=True, explicit_start=True))
    out.close()
def convert_model(model_fname):
    reset_default_graph()

    param_fname = re.sub(r'((.pickle)?(.gz)?)$', r'.params.gz', model_fname)
    log_info('Converting %s to %s...' % (model_fname, param_fname))

    model = Seq2SeqBase.load_from_file(model_fname)
    with file_stream(param_fname, 'wb', encoding=None) as fh:
        pickle.dump(model.get_model_params(), fh, protocol=pickle.HIGHEST_PROTOCOL)
def save_to_file(self, model_file):
    """\
    Save the model to a pickle file or stream (supports GZip compression).
    """
    log_info('Saving model to file ' + str(model_file))
    fh = file_stream(model_file, mode='wb', encoding=None)
    pickle.Pickler(fh, pickle.HIGHEST_PROTOCOL).dump(self)
    fh.close()
    log_info('Model successfully saved.')
def write_data(dir, fname_base, fname_repl, data):
    chunk_size = len(data[0])
    for chunk_idx in xrange(chunk_size):
        fname_suff = ".%d" % chunk_idx if chunk_size > 1 else ''
        file_name = os.path.join(dir, re.sub(r'^[^-._]*', fname_repl + fname_suff, fname_base))
        print 'WRITING ' + file_name
        with file_stream(file_name, 'w') as fh:
            for chunk in data:
                print >> fh, chunk[chunk_idx],
def load_from_file(model_file):
    """\
    Load the model from a pickle file or stream (supports GZip compression).
    """
    log_info('Loading model from file ' + str(model_file))
    fh = file_stream(model_file, mode='rb', encoding=None)
    unpickler = pickle.Unpickler(fh)
    model = unpickler.load()
    fh.close()
    log_info('Model loaded successfully.')
    return model
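# A self-contained sketch of the pickle round trip that save_to_file / load_from_file above
# perform. file_stream is the codebase's own helper; the only assumption here is that for a
# '.gz' file name it behaves like a binary gzip stream, so plain gzip.open stands in for it.
import gzip
import pickle


def save_model(obj, fname):
    # binary gzip stream ~ file_stream(fname, mode='wb', encoding=None) for a '.gz' name
    with gzip.open(fname, 'wb') as fh:
        pickle.dump(obj, fh, pickle.HIGHEST_PROTOCOL)


def load_model(fname):
    with gzip.open(fname, 'rb') as fh:
        return pickle.load(fh)


# usage: anything picklable survives the compressed round trip unchanged
params = {'alpha': 0.1, 'passes': 20}
save_model(params, 'toy-model.pickle.gz')
assert load_model('toy-model.pickle.gz') == params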
def save_to_file(self, model_fname):
    """Save the classifier to a file (actually two files, one for configuration and one
    for the TensorFlow graph, which must be stored separately).

    @param model_fname: file name (for the configuration file); TF graph will be stored with a \
        different extension
    """
    log_info("Saving classifier to %s..." % model_fname)
    with file_stream(model_fname, 'wb', encoding=None) as fh:
        pickle.dump(self.get_all_settings(), fh, protocol=pickle.HIGHEST_PROTOCOL)
    tf_session_fname = re.sub(r'(.pickle)?(.gz)?$', '.tfsess', model_fname)
    if self.checkpoint_path:
        shutil.copyfile(self.checkpoint_path, tf_session_fname)
    else:
        self.saver.save(self.session, tf_session_fname)
def process_document(self, filename):
    """\
    Read a Tecto-Template file and return its contents as a Document object.
    """
    fh = file_stream(filename, encoding=self.encoding)
    doc = Document(filename)
    for line in fh:
        bundle = doc.create_bundle()
        zone = bundle.create_zone(self.language, self.selector)
        ttree = zone.create_ttree()
        self.parse_line(line, ttree)
        log_info('Parsed a tree with %d nodes.' % len(ttree.get_descendants()))
    fh.close()
    return doc
def save_to_file(self, model_fname):
    """Save the classifier to a file (actually two files, one for configuration and one
    for the TensorFlow graph, which must be stored separately).

    @param model_fname: file name (for the configuration file); TF graph will be stored with a \
        different extension
    """
    log_info("Saving classifier to %s..." % model_fname)
    with file_stream(model_fname, 'wb', encoding=None) as fh:
        pickle.dump(self.get_all_settings(), fh, protocol=pickle.HIGHEST_PROTOCOL)
    tf_session_fname = re.sub(r'(.pickle)?(.gz)?$', '.tfsess', model_fname)
    if hasattr(self, 'checkpoint_path') and self.checkpoint_path:
        self.restore_checkpoint()
        shutil.rmtree(os.path.dirname(self.checkpoint_path))
    self.saver.save(self.session, tf_session_fname)
def load_from_file(model_fname):
    """Load the reranker from a file (actually two files, one for configuration and one
    for the TensorFlow graph, which must be stored separately).

    @param model_fname: file name (for the configuration file); TF graph must be stored with a \
        different extension
    """
    log_info("Loading reranker from %s..." % model_fname)
    with file_stream(model_fname, 'rb', encoding=None) as fh:
        data = pickle.load(fh)
        ret = RerankingClassifier(cfg=data['cfg'])
        ret.load_all_settings(data)

    # re-build TF graph and restore the TF session
    tf_session_fname = re.sub(r'(.pickle)?(.gz)?$', '.tfsess', model_fname)
    ret._init_neural_network()
    ret.saver.restore(ret.session, tf_session_fname)

    return ret
def save_to_arff(self, filename, encoding='UTF-8'):
    """
    Save the data set to an ARFF file
    """
    # open the file
    fh = file_stream(filename, 'w', encoding)
    # print the relation name
    print >> fh, '@relation ' + (self.relation_name
                                 if self.relation_name is not None else '<noname>')
    # print the list of attributes
    for attrib in self.attribs:
        print >> fh, '@attribute ' + attrib.name + ' ' + attrib.get_arff_type()
    # print instances
    print >> fh, '@data'
    for inst, weight in zip(self.data, self.inst_weights):
        print >> fh, self.__get_arff_line(inst, weight)
def load_from_file(model_fname):
    """Load the reranker from a file (actually two files, one for configuration and one
    for the TensorFlow graph, which must be stored separately).

    @param model_fname: file name (for the configuration file); TF graph must be stored with a \
        different extension
    """
    log_info("Loading reranker from %s..." % model_fname)
    with file_stream(model_fname, 'rb', encoding=None) as fh:
        data = pickle.load(fh)
        ret = RerankingClassifier(cfg=data['cfg'])
        ret.load_all_settings(data)

    # re-build TF graph and restore the TF session
    tf_session_fname = os.path.abspath(re.sub(r'(.pickle)?(.gz)?$', '.tfsess', model_fname))
    ret._init_neural_network()
    ret.saver.restore(ret.session, tf_session_fname)

    return ret
def create_training_job(config, work_dir, train_file,
                        name=None, memory=8, encoding='UTF-8'):
    """\
    Submit a training process on the cluster which will save the model to a pickle.
    Return the submitted job and the future location of the model pickle.
    train_file cannot be a stream, it must be an actual file.
    """
    # purge name
    if name is None:
        name = 'TR-' + re.sub(r'[^A-Za-z0-9_]', '_', train_file)
    else:
        name = re.sub(r'[^A-Za-z0-9_]', '_', name)
    # create working directory, if not existing
    if not os.path.isdir(work_dir):
        os.mkdir(work_dir)
    train_file = os.path.abspath(train_file)
    # generate model file name
    model_file = os.path.abspath(os.path.join(work_dir, name + '-model.pickle.gz'))
    config_pickle = os.path.abspath(os.path.join(work_dir, name + '-cfg.pickle.gz'))
    # create the configuration pickle
    fh = file_stream(config_pickle, mode='wb', encoding=None)
    pickle.Pickler(fh, pickle.HIGHEST_PROTOCOL).dump(config)
    fh.close()
    # create the job
    job = Job(name=name, work_dir=work_dir)
    job.code = "fh = file_stream('" + config_pickle + \
               "', mode='rb', encoding=None)\n" + \
               "cfg = pickle.Unpickler(fh).load()\n" + \
               "fh.close()\n" + \
               "model = Model(cfg)\n" + \
               "model.train('" + train_file + "', encoding='" + encoding + "')\n" + \
               "model.save_to_file('" + model_file + "')\n"
    job.header += "from pytreex.tool.ml.model import Model\n" + \
                  "import pickle\n" + \
                  "from pytreex.core.util import file_stream\n"
    return job, model_file
def run_worker(head_host, head_port, debug_out=None):
    # setup debugging output, if applicable
    if debug_out is not None:
        set_debug_stream(file_stream(debug_out, mode='w'))
    # start the server (in the background)
    log_info('Creating worker server...')
    server = ThreadPoolServer(service=RankerTrainingService, nbThreads=1)
    server_thread = Thread(target=server.start)
    server_thread.start()
    my_host = socket.getfqdn()
    log_info('Worker server created at %s:%d. Connecting to head at %s:%d...' %
             (my_host, server.port, head_host, head_port))
    # notify main about this server
    conn = connect(head_host, head_port, config={'allow_pickle': True})
    conn.root.register_worker(my_host, server.port)
    conn.close()
    log_info('Worker is registered with the head.')
    # now serve until we're killed (the server thread will continue to run)
    server_thread.join()
def load_from_arff(self, filename, encoding='UTF-8'):
    """
    Load an ARFF file/stream, filling the data structures.
    """
    # initialize
    if not self.is_empty:
        raise IOError('Cannot store second data set into the same object.')
    status = 'header'  # we first assume to read the header
    line_num = 1  # line counter
    instances = []
    weights = []
    # open the file
    fh = file_stream(filename, encoding=encoding)
    # parse the file
    for line in fh:
        line = line.strip()
        # skip comments
        if line.startswith('%'):
            continue
        # relation name
        elif line.lower().startswith('@relation'):
            self.relation_name = line.split(None, 1)[1]
        # attribute definition
        elif line.lower().startswith('@attribute'):
            attr_name, attr_type = line.split(None, 2)[1:]
            self.attribs.append(Attribute(attr_name, attr_type))
        # data section start
        elif line.lower().startswith('@data'):
            status = 'data'
        # data lines
        elif status == 'data' and line != '':
            inst, weight = self.__parse_line(line, line_num)
            instances.append(inst)
            weights.append(weight)
        line_num += 1
    fh.close()
    # store the resulting matrix
    self.data = instances
    self.inst_weights = weights
    # remember attribute names
    self.attribs_by_name = {attr.name: idx for idx, attr in enumerate(self.attribs)}
def load_from_file(model_fname):
    """Load the whole ensemble from a file (load settings and model parameters, then build
    the ensemble network)."""
    # TODO support for lexicalizer
    log_info("Loading ensemble generator from %s..." % model_fname)

    with file_stream(model_fname, 'rb', encoding=None) as fh:
        typeid = pickle.load(fh)
        if typeid != Seq2SeqEnsemble:
            raise ValueError('Wrong type identifier in file %s' % model_fname)
        cfg = pickle.load(fh)
        ret = Seq2SeqEnsemble(cfg)
        gens_dump = pickle.load(fh)
        if 'classif_filter' in cfg:
            rerank_settings = pickle.load(fh)
            rerank_params = pickle.load(fh)
        else:
            rerank_settings = None
            rerank_params = None

    ret.build_ensemble(gens_dump, rerank_settings, rerank_params)
    return ret
def save_to_file(self, fname):
    log_info('Saving model to ' + fname)
    with file_stream(fname, mode='wb', encoding=None) as fh:
        pickle.dump(self, fh, pickle.HIGHEST_PROTOCOL)
def load_from_file(fname):
    log_info('Loading model from ' + fname)
    with file_stream(fname, mode='rb', encoding=None) as fh:
        classif = pickle.load(fh)
        return classif
def process_document(self, filename):
    """\
    Read a CoNLL-U file and return its contents as a Document object.
    """
    fh = file_stream(filename, encoding=self.encoding)
    doc = Document(filename)
    bundle = doc.create_bundle()
    zone = bundle.create_zone(self.language, self.selector)
    root = zone.create_atree()
    last_node = root
    nodes = [root]
    parents = [0]
    comment = ''
    for line in fh:
        # Strip newline character (\n or \r\n)
        line = line.rstrip('\r\n')
        # Empty line as an end of sentence
        if not line:
            # Ignore (multiple) empty lines before start of sentence (invalid CoNLL-U)
            if len(nodes) == 1:
                continue
            # Rehang to correct parents and save nonempty comment to root
            for i in xrange(1, len(nodes)):
                nodes[i].parent = nodes[parents[i]]
            if len(comment):
                zone.wild['comment'] = comment
            # Prepare a new bundle
            bundle = doc.create_bundle()
            zone = bundle.create_zone(self.language, self.selector)
            root = zone.create_atree()
            last_node = root
            nodes = [root]
            parents = [0]
            comment = ''
        # Comment
        elif line[0] == '#':
            comment += line[1:] + "\n"
        # A normal line with one token
        else:
            columns = line.split('\t')
            # TODO: multi-word tokens
            if '-' in columns[0]:
                continue
            # Create new node
            new_node = root.create_child(data=dict(
                (key, value) for key, value in
                zip(['form', 'lemma', 'upos', 'xpos', 'feats', 'deprel', 'deps', 'misc'],
                    columns[1:6] + columns[7:10])
                if value is not None and value != '_'))
            nodes.append(new_node)
            try:
                parent_index = int(columns[6])
            except (ValueError, TypeError):
                # TODO: warning?
                parent_index = 0
            parents.append(parent_index)
            # Word order TODO is this needed?
            new_node.shift_after_subtree(last_node)
            last_node = new_node
    # The last bundle should be empty (if the file ended with an empty line),
    # so we need to remove it. But let's check it.
    if len(nodes) == 1:
        doc.bundles.pop()
    else:
        for i in xrange(1, len(nodes)):
            nodes[i].parent = nodes[parents[i]]
        if len(comment):
            zone.wild['comment'] = comment
    fh.close()
    return doc
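# For reference, a tiny hypothetical CoNLL-U fragment showing the 10 tab-separated columns
# the reader above indexes (ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC);
# '#' lines are comments and a blank line ends the sentence. The loop below is only a
# column-splitting sketch, not the Document/zone machinery used in the codebase.
sample = ("# sent_id = 1\n"
          "1\tHello\thello\tINTJ\t_\t_\t0\troot\t_\t_\n"
          "2\tworld\tworld\tNOUN\t_\t_\t1\tvocative\t_\t_\n"
          "\n")
for line in sample.splitlines():
    if not line or line.startswith('#'):
        continue
    columns = line.split('\t')
    print('%s -> head %s (%s)' % (columns[1], columns[6], columns[7]))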
def main(argv):
    opts, files = getopt(argv, 'f:c:d:')
    folds = 10
    chunk_size = 2
    dir_prefix = 'cv'

    for opt, arg in opts:
        if opt == '-f':
            folds = int(arg)
        elif opt == '-c':
            chunk_size = int(arg)
        elif opt == '-d':
            dir_prefix = arg

    if not files:
        sys.exit(__doc__)

    random.seed(1206)
    ordering = None

    for file in files:
        # read all data
        data = []
        with file_stream(file) as fh:
            chunk = []
            for line in fh:
                chunk.append(line)
                if len(chunk) == chunk_size:
                    data.append(chunk)
                    chunk = []
            if chunk:
                log_warn('Incomplete chunk at end of file %s, size %d' % (file, len(chunk)))

        if ordering is None:
            # create ordering
            ordering = range(len(data))
            random.shuffle(ordering)
            # create directories
            for fold_no in xrange(folds):
                os.mkdir(dir_prefix + "%02d" % fold_no)

        # output as train and test into all CV portions
        fold_size, bigger_folds = divmod(len(data), folds)
        for fold_no in xrange(folds):
            # compute test data bounds
            if fold_no < bigger_folds:
                test_lo = (fold_size + 1) * fold_no
                test_hi = (fold_size + 1) * (fold_no + 1)
            else:
                test_lo = fold_size * fold_no + bigger_folds
                test_hi = fold_size * (fold_no + 1) + bigger_folds
            # select train and test data instances
            train_data = [data[idx] for ord, idx in enumerate(ordering)
                          if ord < test_lo or ord >= test_hi]
            test_data = [data[idx] for ord, idx in enumerate(ordering)
                         if ord >= test_lo and ord < test_hi]
            # write them out to a file (replace `all' in name with train/test)
            fname_base = os.path.basename(file)
            write_data(dir_prefix + "%02d" % fold_no, fname_base, 'train', train_data)
            write_data(dir_prefix + "%02d" % fold_no, fname_base, 'test', test_data)
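# A small self-contained check (not part of the original script) of the fold-boundary
# arithmetic above: divmod yields the base fold size plus the number of leading folds that
# get one extra item, so the test ranges tile 0..len(data) without gaps or overlaps.
def fold_bounds(n_items, folds):
    fold_size, bigger_folds = divmod(n_items, folds)
    bounds = []
    for fold_no in range(folds):
        if fold_no < bigger_folds:
            lo = (fold_size + 1) * fold_no
            hi = (fold_size + 1) * (fold_no + 1)
        else:
            lo = fold_size * fold_no + bigger_folds
            hi = fold_size * (fold_no + 1) + bigger_folds
        bounds.append((lo, hi))
    return bounds


# e.g. 23 chunks in 5 folds -> test sizes 5, 5, 5, 4, 4, covering indices 0..23
assert [hi - lo for lo, hi in fold_bounds(23, 5)] == [5, 5, 5, 4, 4]
assert fold_bounds(23, 5)[0][0] == 0 and fold_bounds(23, 5)[-1][1] == 23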
def save_to_file(self, model_fname):
    """Save the model to a file."""
    log_info("Saving ranker to %s..." % model_fname)
    with file_stream(model_fname, 'wb', encoding=None) as fh:
        pickle.dump(self, fh, protocol=pickle.HIGHEST_PROTOCOL)
def load_from_file(model_fname):
    """Load a pre-trained model from a file."""
    log_info("Loading ranker from %s..." % model_fname)
    with file_stream(model_fname, 'rb', encoding=None) as fh:
        return pickle.load(fh)