def save_to_file(self, model_file):
    """\
    Save the model to a pickle file or stream (supports GZip compression).
    """
    log_info('Saving model to file ' + str(model_file))
    fh = file_stream(model_file, mode='wb', encoding=None)
    pickle.Pickler(fh, pickle.HIGHEST_PROTOCOL).dump(self)
    fh.close()
    log_info('Model successfully saved.')
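# Usage sketch (illustrative, not part of the original source): persisting a
# trained model. Assumes `model` is a trained flect Model instance and that
# file_stream() switches on GZip compression for file names ending in '.gz'.
def _demo_save_model(model):
    model.save_to_file('model.pickle.gz')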
def save_to_csv(self, filename, encoding='UTF-8'):
    """\
    Save the data set to a CSV file (not supported for sparse data sets).
    """
    if self.is_sparse:
        raise Exception('CSV output not supported for sparse data sets!')
    fh = file_stream(filename, 'w', encoding)
    # header: comma-separated attribute names
    print >> fh, ','.join([attrib.name for attrib in self.attribs])
    # instances
    for inst in self.data:
        # undo some ARFF escaping (change \' to '', drop other backslashes)
        line = re.sub(r"\\'", "''", self.__get_arff_line(inst))
        line = re.sub(r'\\', '', line)
        print >> fh, line
    fh.close()
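# A quick standalone check of the escaping above (illustrative): the two
# substitutions turn ARFF quote escapes into CSV-style doubled quotes and
# then drop any remaining backslash escapes.
import re

def _demo_csv_escaping(arff_field):
    line = re.sub(r"\\'", "''", arff_field)  # \' -> ''
    return re.sub(r'\\', '', line)           # drop remaining backslashes

# _demo_csv_escaping("O\\'Hara,\\,x")  ->  "O''Hara,,x"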
def load_from_arff(self, filename, encoding='UTF-8', headers_only=False):
    """\
    Load an ARFF file/stream, filling the data structures.

    @param filename: the ARFF file to read
    @param encoding: the encoding (defaults to UTF-8)
    @param headers_only: read just the headers, ignore data
    """
    # initialize
    if not self.is_empty:
        raise IOError('Cannot store second data set into the same object.')
    status = 'header'  # we first assume to read the header
    line_num = 0  # line counter
    instances = []
    weights = []
    # open the file
    fh = file_stream(filename, encoding=encoding)
    # parse the file
    for line in fh:
        line_num += 1  # count every physical line, comments included
        line = line.strip()
        # skip comments
        if line.startswith('%'):
            continue
        # relation name
        elif line.lower().startswith('@relation'):
            tokens = line.split(None, 1)
            if len(tokens) > 1:
                self.relation_name = tokens[1]
        # attribute definition
        elif line.lower().startswith('@attribute'):
            attr_name, attr_type = line.split(None, 2)[1:]
            self.attribs.append(Attribute(attr_name, attr_type))
        # data section start
        elif line.lower().startswith('@data'):
            status = 'data'
            if headers_only:  # stop after reading headers
                break
        # data lines
        elif status == 'data' and line != '':
            inst, weight = self.__parse_line(line, line_num)
            instances.append(inst)
            weights.append(weight)
    fh.close()
    # store the resulting matrix
    self.data = instances
    self.inst_weights = weights
    # remember attribute names
    self.attribs_by_name = {attr.name: idx
                            for idx, attr in enumerate(self.attribs)}
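# Usage sketch (illustrative): loading a tiny ARFF file. Assumes this method
# lives on flect's DataSet class (flect.data); file name and contents are
# made up for the example.
def _demo_load_arff():
    from flect.data import DataSet
    with open('example.arff', 'w') as f:
        f.write("@relation demo\n"
                "@attribute word string\n"
                "@attribute tag {NN,VB}\n"
                "@data\n"
                "'dog',NN\n")
    ds = DataSet()
    ds.load_from_arff('example.arff')
    return ds.attribs_by_name['tag']  # -> 1 (attribute order is kept)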
def load_from_file(model_file):
    """\
    Load the model from a pickle file or stream (supports GZip compression).
    """
    log_info('Loading model from file ' + str(model_file))
    fh = file_stream(model_file, mode='rb', encoding=None)
    unpickler = pickle.Unpickler(fh)
    model = unpickler.load()
    # backward compatibility: older pickles lack the attr_mask member
    if not hasattr(model, 'attr_mask'):
        model.attr_mask = model.get_attr_mask()
    fh.close()
    log_info('Model loaded successfully.')
    return model
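# Usage sketch (illustrative): restoring a model saved by save_to_file().
# Assumes load_from_file() is exposed as a static method on flect's Model
# class; older pickles gain attr_mask on load via the hasattr() fallback.
def _demo_load_model():
    from flect.model import Model
    return Model.load_from_file('model.pickle.gz')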
def save_to_arff(self, filename, encoding='UTF-8'):
    """\
    Save the data set to an ARFF file.
    """
    # open the file
    fh = file_stream(filename, 'w', encoding)
    # print the relation name
    print >> fh, '@relation ' + (self.relation_name
                                 if self.relation_name is not None
                                 else '<noname>')
    # print the list of attributes
    for attrib in self.attribs:
        print >> fh, '@attribute ' + attrib.name + ' ' + attrib.get_arff_type()
    # print instances, one per line, with their weights
    print >> fh, '@data'
    for inst, weight in zip(self.data, self.inst_weights):
        print >> fh, self.__get_arff_line(inst, weight)
    fh.close()
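# Round-trip sketch (illustrative): re-serializing a loaded data set.
# Assumes the DataSet class from the loading example above.
def _demo_arff_roundtrip():
    from flect.data import DataSet
    ds = DataSet()
    ds.load_from_arff('example.arff')
    ds.save_to_arff('copy.arff')  # writes @relation, @attribute list, @data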
def create_training_job(config, work_dir, train_file,
                        name=None, memory=8, encoding='UTF-8'):
    """\
    Create a cluster training job that, once submitted, will train a model
    and save it to a pickle. Return the job and the future location of the
    model pickle.

    train_file cannot be a stream, it must be an actual file.
    """
    # NB: the `memory` parameter is currently not passed on to the job
    # purge the name of unsafe characters
    if name is None:
        name = 'TR-' + re.sub(r'[^A-Za-z0-9_]', '_', train_file)
    else:
        name = re.sub(r'[^A-Za-z0-9_]', '_', name)
    # create the working directory, if it does not exist
    if not os.path.isdir(work_dir):
        os.mkdir(work_dir)
    train_file = os.path.abspath(train_file)
    # generate the model and configuration pickle paths
    model_file = os.path.abspath(os.path.join(work_dir,
                                              name + '-model.pickle.gz'))
    config_pickle = os.path.abspath(os.path.join(work_dir,
                                                 name + '-cfg.pickle.gz'))
    # save the configuration to a pickle for the job to read
    fh = file_stream(config_pickle, mode='wb', encoding=None)
    pickle.Pickler(fh, pickle.HIGHEST_PROTOCOL).dump(config)
    fh.close()
    # create the job: it unpickles the config, then trains and saves a model
    job = Job(name=name, work_dir=work_dir)
    job.code = ("fh = file_stream('" + config_pickle +
                "', mode='rb', encoding=None)\n"
                "cfg = pickle.Unpickler(fh).load()\n"
                "fh.close()\n"
                "model = Model(cfg)\n"
                "model.train('" + train_file +
                "', encoding='" + encoding + "')\n"
                "model.save_to_file('" + model_file + "')\n")
    job.header += ("from flect.model import Model\n"
                   "import pickle\n"
                   "from flect.varutil import file_stream\n")
    return job, model_file
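# Usage sketch (illustrative): building and submitting a training job.
# Assumes a config object accepted by Model(cfg) and that Job exposes a
# submit() method; create_training_job() itself does not submit anything.
def _demo_training_job(cfg):
    job, model_file = create_training_job(cfg, 'work', 'train.arff.gz')
    job.submit()
    # ... after the job finishes, the trained model can be loaded:
    # model = flect.model.Model.load_from_file(model_file)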