Example #1
File: model.py Project: imclab/flect
 def save_to_file(self, model_file):
     """\
     Save the model to a pickle file or stream (supports GZip compression).
     """
     log_info('Saving model to file ' + str(model_file))
     fh = file_stream(model_file, mode='wb', encoding=None)
     pickle.Pickler(fh, pickle.HIGHEST_PROTOCOL).dump(self)
     fh.close()
     log_info('Model successfully saved.')
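All of these examples funnel I/O through flect's file_stream helper, whose implementation is not shown on this page. Judging only from the call sites, it accepts a path or an already-open stream, takes a mode and an encoding (encoding=None for binary data such as pickles), and adds transparent GZip support. A minimal compatible sketch, assuming extension-based GZip detection:

    import gzip
    import io

    def file_stream(filename, mode='r', encoding='UTF-8'):
        # Pass through anything that already behaves like a stream.
        if hasattr(filename, 'read') or hasattr(filename, 'write'):
            return filename
        # Assumption: GZip files are recognized by the .gz extension.
        if filename.endswith('.gz'):
            return gzip.open(filename, mode)
        # encoding=None means raw bytes (used for the pickle data above).
        if encoding is None:
            return open(filename, mode)
        return io.open(filename, mode, encoding=encoding)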
Example #2
File: model.py Project: mkorvas/flect
 def save_to_file(self, model_file):
     """\
     Save the model to a pickle file or stream (supports GZip compression).
     """
     log_info('Saving model to file ' + str(model_file))
     fh = file_stream(model_file, mode='wb', encoding=None)
     pickle.Pickler(fh, pickle.HIGHEST_PROTOCOL).dump(self)
     fh.close()
     log_info('Model successfully saved.')
Example #3
File: model.py Project: imclab/flect
 def load_from_file(model_file):
     """\
     Load the model from a pickle file or stream
     (supports GZip compression).
     """
     log_info('Loading model from file ' + str(model_file))
     fh = file_stream(model_file, mode='rb', encoding=None)
     unpickler = pickle.Unpickler(fh)
     model = unpickler.load()
     fh.close()
     log_info('Model loaded successfully.')
     return model
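Examples #1 and #3 form a save/load round trip. A usage sketch (Model stands in for the class these methods belong to; since load_from_file takes no self, it is presumably a static or module-level function):

    model.save_to_file('model.pickle.gz')    # .gz => GZip-compressed pickle
    restored = Model.load_from_file('model.pickle.gz')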
Example #4
File: dataset.py Project: UFAL-DSG/flect
 def save_to_csv(self, filename, encoding='UTF-8'):
     if self.is_sparse:
         raise Exception('CSV output not supported for sparse data sets!')
     fh = file_stream(filename, 'w', encoding)
     # header
     print >> fh, ','.join([attrib.name for attrib in self.attribs])
     # instances
     for inst in self.data:
         # undo some ARFF escaping (change \' to '')
         line = re.sub(r'\\\'', r'\'\'', self.__get_arff_line(inst))
         line = re.sub(r'\\', '', line)
         print >> fh, line
     # close explicitly so that (GZip) output is flushed to disk
     fh.close()
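The two re.sub calls rewrite ARFF escaping into CSV conventions. A quick illustration on one hypothetical escaped line:

    import re

    line = "it\\'s,a \\{test\\}"           # hypothetical ARFF-escaped line: it\'s,a \{test\}
    line = re.sub(r'\\\'', r'\'\'', line)  # double the quote (backslashes go next)
    line = re.sub(r'\\', '', line)         # strip all remaining backslash escapes
    print(line)                            # it''s,a {test}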
Example #5
File: dataset.py Project: UFAL-DSG/flect
    def load_from_arff(self, filename, encoding='UTF-8', headers_only=False):
        """\
        Load an ARFF file/stream, filling the data structures.

        @param filename: the ARFF file to read
        @param encoding: the encoding (defaults to UTF-8)
        @param headers_only: read just the headers, ignore data
        """
        # initialize
        if not self.is_empty:
            raise IOError('Cannot store second data set into the same object.')
        status = 'header'  # we first assume to read the header
        line_num = 1  # line counter
        instances = []
        weights = []
        # open the file
        fh = file_stream(filename, encoding=encoding)
        # parse the file
        for line in fh:
            line = line.strip()
            # skip comments
            if line.startswith('%'):
                line_num += 1  # keep the file line counter in sync on comments
                continue
            # relation name
            elif line.lower().startswith('@relation'):
                tokens = line.split(None, 1)
                if len(tokens) > 1:
                    self.relation_name = tokens[1]
            # attribute definition
            elif line.lower().startswith('@attribute'):
                attr_name, attr_type = line.split(None, 2)[1:]
                self.attribs.append(Attribute(attr_name, attr_type))
            # data section start
            elif line.lower().startswith('@data'):
                status = 'data'
                if headers_only:  # stop after reading headers
                    break
            # data lines
            elif status == 'data' and line != '':
                inst, weight = self.__parse_line(line, line_num)
                instances.append(inst)
                weights.append(weight)
            line_num += 1
        fh.close()
        # store the resulting matrix
        self.data = instances
        self.inst_weights = weights
        # remember attribute names
        self.attribs_by_name = {attr.name: idx
                                for idx, attr in enumerate(self.attribs)}
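A usage sketch for the parser above (DataSet is an assumed name for the class defined in dataset.py; the class itself is not shown on this page):

    ds = DataSet()
    ds.load_from_arff('weather.arff')    # headers_only=True would read just the schema
    print(ds.relation_name)
    print([a.name for a in ds.attribs])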
Example #6
File: model.py Project: mkorvas/flect
 def load_from_file(model_file):
     """\
     Load the model from a pickle file or stream
     (supports GZip compression).
     """
     log_info('Loading model from file ' + str(model_file))
     fh = file_stream(model_file, mode='rb', encoding=None)
     unpickler = pickle.Unpickler(fh)
     model = unpickler.load()
     if not hasattr(model, 'attr_mask'):
         model.attr_mask = model.get_attr_mask()
     fh.close()
     log_info('Model loaded successfully.')
     return model
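Note the one difference from Example #3: this fork backfills the attr_mask attribute after unpickling, so models pickled before attr_mask was introduced still load with a usable mask.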
Example #7
File: dataset.py Project: UFAL-DSG/flect
 def save_to_arff(self, filename, encoding='UTF-8'):
     """\
     Save the data set to an ARFF file
     """
     # open the file
     fh = file_stream(filename, 'w', encoding)
     # print the relation name
     print >> fh, '@relation ' + (self.relation_name
                                  if self.relation_name is not None
                                  else '<noname>')
     # print the list of attributes
     for attrib in self.attribs:
         print >> fh, '@attribute ' + attrib.name + ' ' + \
                 attrib.get_arff_type()
     # print instances
     print >> fh, '@data'
     for inst, weight in zip(self.data, self.inst_weights):
         print >> fh, self.__get_arff_line(inst, weight)
     # close explicitly so that (GZip) output is flushed to disk
     fh.close()
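The writer above emits the standard ARFF skeleton: one @relation line (falling back to <noname>), one @attribute line per attribute, then @data followed by one line per weighted instance. Given the pickle examples above, a .gz filename should again produce a compressed file (an assumption about file_stream, not something this page shows):

    ds.save_to_arff('out.arff.gz')    # assumed: compressed via the .gz extension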
Example #8
File: model.py Project: mkorvas/flect
 def create_training_job(config,
                         work_dir,
                         train_file,
                         name=None,
                         memory=8,
                         encoding='UTF-8'):
     """\
     Create a training job for the cluster which will save the model
     to a pickle. Return the job (ready for submission) and the future
     location of the model pickle.
     train_file cannot be a stream, it must be an actual file.
     """
     # purge name
     if name is None:
         name = 'TR-' + re.sub(r'[^A-Za-z0-9_]', '_', train_file)
     else:
         name = re.sub(r'[^A-Za-z0-9_]', '_', name)
     # create working directory, if not existing
     if not os.path.isdir(work_dir):
         os.mkdir(work_dir)
     train_file = os.path.abspath(train_file)
     # generate model file name
     model_file = os.path.abspath(
         os.path.join(work_dir, name + '-model.pickle.gz'))
     config_pickle = os.path.abspath(
         os.path.join(work_dir, name + '-cfg.pickle.gz'))
     # create the configuration pickle
     fh = file_stream(config_pickle, mode='wb', encoding=None)
     pickle.Pickler(fh, pickle.HIGHEST_PROTOCOL).dump(config)
     fh.close()
     # create the job
     job = Job(name=name, work_dir=work_dir)
     job.code = "fh = file_stream('" + config_pickle + \
             "', mode='rb', encoding=None)\n" + \
             "cfg = pickle.Unpickler(fh).load()\n" + \
             "fh.close()\n" + \
             "model = Model(cfg)\n" + \
             "model.train('" + train_file + "', encoding='" + \
             encoding + "')\n" \
             "model.save_to_file('" + model_file + "')\n"
     job.header += "from flect.model import Model\n" + \
             "import pickle\n" + \
             "from flect.varutil import file_stream\n"
     return job, model_file
Example #9
File: model.py Project: imclab/flect
 def create_training_job(config, work_dir, train_file,
                         name=None, memory=8, encoding='UTF-8'):
     """\
     Create a training job for the cluster which will save the model
     to a pickle. Return the job (ready for submission) and the future
     location of the model pickle.
     train_file cannot be a stream, it must be an actual file.
     """
     # purge name
     if name is None:
         name = 'TR-' + re.sub(r'[^A-Za-z0-9_]', '_', train_file)
     else:
         name = re.sub(r'[^A-Za-z0-9_]', '_', name)
     # create working directory, if not existing
     if not os.path.isdir(work_dir):
         os.mkdir(work_dir)
     train_file = os.path.abspath(train_file)
     # generate model file name
     model_file = os.path.abspath(os.path.join(work_dir,
                                               name + '-model.pickle.gz'))
     config_pickle = os.path.abspath(os.path.join(work_dir,
                                                  name + '-cfg.pickle.gz'))
     # create the configuration pickle
     fh = file_stream(config_pickle, mode='wb', encoding=None)
     pickle.Pickler(fh, pickle.HIGHEST_PROTOCOL).dump(config)
     fh.close()
     # create the job
     job = Job(name=name, work_dir=work_dir)
     job.code = "fh = file_stream('" + config_pickle + \
             "', mode='rb', encoding=None)\n" + \
             "cfg = pickle.Unpickler(fh).load()\n" + \
             "fh.close()\n" + \
             "model = Model(cfg)\n" + \
             "model.train('" + train_file + "', encoding='" + \
             encoding + "')\n" \
             "model.save_to_file('" + model_file + "')\n"
     job.header += "from flect.model import Model\n" + \
             "import pickle\n" + \
             "from flect.varutil import file_stream\n"
     return job, model_file
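A usage sketch for the job builder (model.py and the missing self suggest create_training_job is a static method on Model; the Job submission API is not shown on this page, so job.submit() below is only a placeholder):

    cfg = {'class_attr': 'target'}    # hypothetical configuration object
    job, model_path = Model.create_training_job(cfg, 'work', 'train.arff.gz')
    job.submit()                      # placeholder for flect's real submission call
    # ... once the cluster job has finished:
    model = Model.load_from_file(model_path)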