Example #1
def run_training(head_host, head_port, debug_out=None):
    """Main worker training routine (creates the Seq2SeqTrainingService and connects it to the
    head.

    @param head_host: hostname of the head
    @param head_port: head port number
    @param debug_out: path to the debugging output file (debug output discarded if None)
    """
    # setup debugging output, if applicable
    if debug_out is not None:
        set_debug_stream(file_stream(debug_out, mode='w'))
    # start the server (in the background)
    log_info('Creating training server...')
    server = ThreadPoolServer(service=Seq2SeqTrainingService, nbThreads=1)
    server_thread = Thread(target=server.start)
    server_thread.start()
    my_host = socket.getfqdn()
    log_info('Worker server created at %s:%d. Connecting to head at %s:%d...' %
             (my_host, server.port, head_host, head_port))
    # notify main about this server
    conn = connect(head_host, head_port, config={'allow_pickle': True})
    conn.root.register_worker(my_host, server.port)
    conn.close()
    log_info('Worker is registered with the head.')
    # now serve until we're killed (the server thread will continue to run)
    server_thread.join()
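For context, the head that this worker registers with is an RPyC service; a minimal sketch of that counterpart (hypothetical, not taken from tgen) shows how conn.root.register_worker resolves to a method with the exposed_ prefix:

import rpyc

class HeadService(rpyc.Service):
    # one shared list for this sketch; a real head would track workers per training run
    workers = []

    def exposed_register_worker(self, host, port):
        # invoked remotely by the worker above; remember its address for job dispatch
        HeadService.workers.append((host, port))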
Example #3
File: seq2seq.py Project: pdsujnow/tgen
    def load_from_file(model_fname):
        """Load the generator from a file (actually two files, one for configuration and one
        for the TensorFlow graph, which must be stored separately).

        @param model_fname: file name (for the configuration file); TF graph must be stored with a \
            different extension
        """
        log_info("Loading generator from %s..." % model_fname)
        with file_stream(model_fname, 'rb', encoding=None) as fh:
            data = pickle.load(fh)
            ret = Seq2SeqGen(cfg=data['cfg'])
            ret.load_all_settings(data)

        if ret.classif_filter:
            classif_filter_fname = re.sub(r'((.pickle)?(.gz)?)$', r'.tftreecl\1', model_fname)
            if os.path.isfile(classif_filter_fname):
                ret.classif_filter = RerankingClassifier.load_from_file(classif_filter_fname)
            else:
                log_warn("Classification filter data not found, ignoring.")
                ret.classif_filter = False

        # re-build TF graph and restore the TF session
        tf_session_fname = re.sub(r'(.pickle)?(.gz)?$', '.tfsess', model_fname)
        ret._init_neural_network()
        ret.saver.restore(ret.session, tf_session_fname)

        return ret
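All of these save/load examples derive companion file names with the same re.sub idiom: splice an extra suffix in front of the optional .pickle/.gz extensions. A small demonstration (note that the dots inside the original groups are unescaped, so '(.pickle)?' also matches e.g. 'Xpickle'; the escaped pattern below is the strict variant):

import re

m = re.search(r'((\.pickle)?(\.gz)?)$', 'model.pickle.gz')
print(m.group(1))  # '.pickle.gz' -- the intended classifier companion is 'model.tftreecl.pickle.gz'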
Example #4
 def load_from_file(fname):
     log_info('Loading model from ' + fname)
     with file_stream(fname, mode='rb', encoding=None) as fh:
         candgen = pickle.load(fh)
         # various backward compatibility tricks
         if type(candgen) == dict:
             child_type_counts = candgen
             candgen = RandomCandidateGenerator({})
             candgen.child_type_counts = child_type_counts
             candgen.child_num_cdfs = pickle.load(fh)
             candgen.max_children = pickle.load(fh)
         if not hasattr(candgen, 'node_limits'):
             candgen.node_limits = None
         if not hasattr(candgen, 'child_type_counts'):
             candgen.child_type_counts = candgen.form_counts
             candgen.child_num_cdfs = candgen.child_cdfs
         if not hasattr(candgen, 'exp_child_num'):
             candgen.exp_child_num = candgen.exp_from_cdfs(
                 candgen.child_num_cdfs)
         if not hasattr(candgen, 'compatible_dais'):
             candgen.compatible_dais = None
             candgen.compatible_dais_type = None
             candgen.compatible_dais_limit = 1000
         if not hasattr(candgen, 'compatible_slots'):
             candgen.compatible_slots = False
         if not hasattr(candgen, 'classif'):
             candgen.classif = None
         return candgen
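The chain of hasattr() checks above is the usual backward-compatibility idiom for unpickling objects saved by older code: attributes introduced later are filled in with defaults after loading. The same pattern factored into a hypothetical helper (not part of tgen):

def fill_defaults(obj, defaults):
    # set every attribute from `defaults` that `obj` is still missing
    for attr, value in defaults.items():
        if not hasattr(obj, attr):
            setattr(obj, attr, value)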
Example #5
File: seq2seq.py Project: tonydeep/tgen
    def save_to_file(self, model_fname):
        """Save the generator to a file (actually two files, one for configuration and one
        for the TensorFlow graph, which must be stored separately).

        @param model_fname: file name (for the configuration file); TF graph will be stored with a \
            different extension
        """
        log_info("Saving generator to %s..." % model_fname)
        if self.classif_filter:
            classif_filter_fname = re.sub(r'((.pickle)?(.gz)?)$',
                                          r'.tftreecl\1', model_fname)
            self.classif_filter.save_to_file(classif_filter_fname)
        if self.lexicalizer:
            lexicalizer_fname = re.sub(r'((.pickle)?(.gz)?)$', r'.lexic\1',
                                       model_fname)
            self.lexicalizer.save_to_file(lexicalizer_fname)

        with file_stream(model_fname, 'wb', encoding=None) as fh:
            pickle.dump(self.get_all_settings(),
                        fh,
                        protocol=pickle.HIGHEST_PROTOCOL)
        tf_session_fname = re.sub(r'(.pickle)?(.gz)?$', '.tfsess', model_fname)
        if hasattr(self, 'checkpoint_path') and self.checkpoint_path:
            shutil.copyfile(self.checkpoint_path, tf_session_fname)
        else:
            self.saver.save(self.session, tf_session_fname)
Example #6
 def process_document(self, doc):
     "Write a CoNLL-U file"
     out = file_stream(self.get_output_file_name(doc),
                       'w',
                       encoding='UTF-8')
     for bundle in doc.bundles:
         zone = bundle.get_zone(self.language, self.selector)
         nodes = zone.atree.get_descendants(ordered=1)
         # Empty sentences are not allowed in CoNLL-U.
         if len(nodes) == 0:
             continue
         comment = zone.wild['comment']
         if comment:
             out.write('#' + comment.rstrip('\r\n').replace('\n', '\n#') +
                       '\n')
         index = 1
         for node in nodes:
             out.write('\t'.join(
                 '_' if value is None else value
                 for value in map((lambda x: str(x) if type(
                     x) == int else getattr(node, x, '_')), [
                         index, 'form', 'lemma', 'upos', 'xpos', 'feats',
                         node.parent.ord, 'deprel', 'deps', 'misc'
                     ])) + '\n')
             index += 1
         out.write('\n')
Example #7
    def save_to_file(self, model_fname):
        """Save the whole ensemble into a file (get all settings and parameters, dump them in a
        pickle)."""

        log_info("Saving generator to %s..." % model_fname)
        with file_stream(model_fname, 'wb', encoding=None) as fh:
            pickle.dump(self.__class__, fh, protocol=pickle.HIGHEST_PROTOCOL)
            pickle.dump(self.cfg, fh, protocol=pickle.HIGHEST_PROTOCOL)

            gens_dump = []
            for gen in self.gens:
                setting = gen.get_all_settings()
                parset = gen.get_model_params()
                setting['classif_filter'] = self.classif_filter is not None
                gens_dump.append((setting, parset))

            pickle.dump(gens_dump, fh, protocol=pickle.HIGHEST_PROTOCOL)

            if self.classif_filter:
                pickle.dump(self.classif_filter.get_all_settings(),
                            fh,
                            protocol=pickle.HIGHEST_PROTOCOL)
                pickle.dump(self.classif_filter.get_model_params(),
                            fh,
                            protocol=pickle.HIGHEST_PROTOCOL)
Example #8
    def load_from_file(model_fname):
        """Load the generator from a file (actually two files, one for configuration and one
        for the TensorFlow graph, which must be stored separately).

        @param model_fname: file name (for the configuration file); TF graph must be stored with a \
            different extension
        """
        log_info("Loading generator from %s..." % model_fname)
        with file_stream(model_fname, 'rb', encoding=None) as fh:
            data = pickle.load(fh)
            ret = Seq2SeqGen(cfg=data['cfg'])
            ret.load_all_settings(data)

        if ret.classif_filter:
            classif_filter_fname = re.sub(r'((.pickle)?(.gz)?)$',
                                          r'.tftreecl\1', model_fname)
            if os.path.isfile(classif_filter_fname):
                ret.classif_filter = RerankingClassifier.load_from_file(
                    classif_filter_fname)
            else:
                log_warn("Classification filter data not found, ignoring.")
                ret.classif_filter = False

        # re-build TF graph and restore the TF session
        tf_session_fname = re.sub(r'(.pickle)?(.gz)?$', '.tfsess', model_fname)
        ret._init_neural_network()
        ret.saver.restore(ret.session, tf_session_fname)

        return ret
Example #9
File: yaml.py Project: ufal/pytreex
 def process_document(self, filename):
     "Read a YAML file and return its contents as a Document object"
     f = file_stream(filename, encoding=None)
     data = yaml.load(f)
     doc = Document(filename, data)
     f.close()
     return doc
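Note that newer PyYAML releases warn about (and eventually reject) yaml.load without an explicit Loader, since it can construct arbitrary Python objects. If the stored documents contain only plain YAML types, the drop-in safer call would be:

data = yaml.safe_load(f)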
Example #10
File: candgen.py Project: UFAL-DSG/tgen
 def load_from_file(fname):
     log_info('Loading model from ' + fname)
     with file_stream(fname, mode='rb', encoding=None) as fh:
         candgen = pickle.load(fh)
         # various backward compatibility tricks
         if type(candgen) == dict:
             child_type_counts = candgen
             candgen = RandomCandidateGenerator({})
             candgen.child_type_counts = child_type_counts
             candgen.child_num_cdfs = pickle.load(fh)
             candgen.max_children = pickle.load(fh)
         if not hasattr(candgen, 'node_limits'):
             candgen.node_limits = None
         if not hasattr(candgen, 'child_type_counts'):
             candgen.child_type_counts = candgen.form_counts
             candgen.child_num_cdfs = candgen.child_cdfs
         if not hasattr(candgen, 'exp_child_num'):
             candgen.exp_child_num = candgen.exp_from_cdfs(candgen.child_num_cdfs)
         if not hasattr(candgen, 'compatible_dais'):
             candgen.compatible_dais = None
             candgen.compatible_dais_type = None
             candgen.compatible_dais_limit = 1000
         if not hasattr(candgen, 'compatible_slots'):
             candgen.compatible_slots = False
         if not hasattr(candgen, 'classif'):
             candgen.classif = None
         return candgen
Example #11
File: yaml.py Project: leotilli/pytreex
 def process_document(self, filename):
     "Read a YAML file and return its contents as a Document object"
     f = file_stream(filename, encoding=None)
     data = yaml.load(f)
     doc = Document(filename, data)
     f.close()
     return doc
Example #12
File: seq2seq.py Project: pdsujnow/tgen
    def load_from_file(model_fname):
        """Detect correct model type (plain/ensemble) and start loading."""
        model_type = Seq2SeqGen  # default to plain generator
        with file_stream(model_fname, 'rb', encoding=None) as fh:
            data = pickle.load(fh)
            if isinstance(data, type):
                model_type = data

        return model_type.load_from_file(model_fname)
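This dispatch works because the first pickle record in the file is either ordinary settings data (plain generator) or a class object (the ensemble's save_to_file pickles self.__class__ first, as shown elsewhere on this page). The pattern in isolation, as a hypothetical helper (plain open() here; the examples use file_stream to get transparent gzip support):

import pickle

def pickled_model_type(fname, default_type):
    # peek at the first record; a stored class object overrides the default
    with open(fname, 'rb') as fh:
        first = pickle.load(fh)
    return first if isinstance(first, type) else default_type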
Example #13
File: yaml.py Project: ufal/pytreex
 def process_document(self, doc):
     "Write a YAML document"
     data = []
     for bundle in doc.bundles:
         data.append(self.serialize_bundle(bundle))
     out = file_stream(self.get_output_file_name(doc), 'w', encoding=None)
     out.write(yaml.safe_dump(data, allow_unicode=True,
                              explicit_start=True))
     out.close()
Example #14
def convert_model(model_fname):

    reset_default_graph()

    param_fname = re.sub(r'((.pickle)?(.gz)?)$', r'.params.gz', model_fname)
    log_info('Converting %s to %s...' % (model_fname, param_fname))
    model = Seq2SeqBase.load_from_file(model_fname)
    with file_stream(param_fname, 'wb', encoding=None) as fh:
        pickle.dump(model.get_model_params(), fh, protocol=pickle.HIGHEST_PROTOCOL)
Example #15
 def process_document(self, doc):
     "Write a YAML document"
     data = []
     for bundle in doc.bundles:
         data.append(self.serialize_bundle(bundle))
     out = file_stream(self.get_output_file_name(doc), 'w', encoding=None)
     out.write(yaml.safe_dump(data, allow_unicode=True,
                              explicit_start=True))
     out.close()
Example #16
File: model.py Project: leotilli/pytreex
 def save_to_file(self, model_file):
     """\
     Save the model to a pickle file or stream (supports GZip compression).
     """
     log_info('Saving model to file ' + str(model_file))
     fh = file_stream(model_file, mode='wb', encoding=None)
     pickle.Pickler(fh, pickle.HIGHEST_PROTOCOL).dump(self)
     fh.close()
     log_info('Model successfully saved.')
Example #17
File: cv_split.py Project: UFAL-DSG/tgen
def write_data(dir, fname_base, fname_repl, data):
    chunk_size = len(data[0])
    for chunk_idx in xrange(chunk_size):
        fname_suff = ".%d" % chunk_idx if chunk_size > 1 else ''
        file_name = os.path.join(dir, re.sub(r'^[^-._]*', fname_repl + fname_suff, fname_base))
        print 'WRITING ' + file_name
        with file_stream(file_name, 'w') as fh:
            for chunk in data:
                print >> fh, chunk[chunk_idx],
Example #18
File: model.py Project: ufal/pytreex
 def save_to_file(self, model_file):
     """\
     Save the model to a pickle file or stream (supports GZip compression).
     """
     log_info('Saving model to file ' + str(model_file))
     fh = file_stream(model_file, mode='wb', encoding=None)
     pickle.Pickler(fh, pickle.HIGHEST_PROTOCOL).dump(self)
     fh.close()
     log_info('Model successfully saved.')
Example #19
    def load_from_file(model_fname):
        """Detect correct model type (plain/ensemble) and start loading."""
        model_type = Seq2SeqGen  # default to plain generator
        with file_stream(model_fname, 'rb', encoding=None) as fh:
            data = pickle.load(fh)
            if isinstance(data, type):
                model_type = data

        return model_type.load_from_file(model_fname)
Example #20
def write_data(dir, fname_base, fname_repl, data):
    chunk_size = len(data[0])
    for chunk_idx in xrange(chunk_size):
        fname_suff = ".%d" % chunk_idx if chunk_size > 1 else ''
        file_name = os.path.join(
            dir, re.sub(r'^[^-._]*', fname_repl + fname_suff, fname_base))
        print 'WRITING ' + file_name
        with file_stream(file_name, 'w') as fh:
            for chunk in data:
                print >> fh, chunk[chunk_idx],
Example #21
def convert_model(model_fname):

    reset_default_graph()

    param_fname = re.sub(r'((.pickle)?(.gz)?)$', r'.params.gz', model_fname)
    log_info('Converting %s to %s...' % (model_fname, param_fname))
    model = Seq2SeqBase.load_from_file(model_fname)
    with file_stream(param_fname, 'wb', encoding=None) as fh:
        pickle.dump(model.get_model_params(),
                    fh,
                    protocol=pickle.HIGHEST_PROTOCOL)
Example #22
File: model.py Project: ufal/pytreex
 def load_from_file(model_file):
     """\
     Load the model from a pickle file or stream
     (supports GZip compression).
     """
     log_info('Loading model from file ' + str(model_file))
     fh = file_stream(model_file, mode='rb', encoding=None)
     unpickler = pickle.Unpickler(fh)
     model = unpickler.load()
     fh.close()
     log_info('Model loaded successfully.')
     return model
Example #23
File: model.py Project: leotilli/pytreex
 def load_from_file(model_file):
     """\
     Load the model from a pickle file or stream
     (supports GZip compression).
     """
     log_info('Loading model from file ' + str(model_file))
     fh = file_stream(model_file, mode='rb', encoding=None)
     unpickler = pickle.Unpickler(fh)
     model = unpickler.load()
     fh.close()
     log_info('Model loaded successfully.')
     return model
Example #24
    def save_to_file(self, model_fname):
        """Save the generator to a file (actually two files, one for configuration and one
        for the TensorFlow graph, which must be stored separately).

        @param model_fname: file name (for the configuration file); TF graph will be stored with a \
            different extension
        """
        log_info("Saving classifier to %s..." % model_fname)
        with file_stream(model_fname, 'wb', encoding=None) as fh:
            pickle.dump(self.get_all_settings(), fh, protocol=pickle.HIGHEST_PROTOCOL)
        tf_session_fname = re.sub(r'(.pickle)?(.gz)?$', '.tfsess', model_fname)
        if self.checkpoint_path:
            shutil.copyfile(self.checkpoint_path, tf_session_fname)
        else:
            self.saver.save(self.session, tf_session_fname)
Example #25
 def process_document(self, filename):
     """\
     Read a Tecto-Template file and return its contents as
     a Document object.
     """
     fh = file_stream(filename, encoding=self.encoding)
     doc = Document(filename)
     for line in fh:
         bundle = doc.create_bundle()
         zone = bundle.create_zone(self.language, self.selector)
         ttree = zone.create_ttree()
         self.parse_line(line, ttree)
         log_info('Parsed a tree with %d nodes.' %
                  len(ttree.get_descendants()))
     fh.close()
     return doc
Example #26
    def save_to_file(self, model_fname):
        """Save the classifier  to a file (actually two files, one for configuration and one
        for the TensorFlow graph, which must be stored separately).

        @param model_fname: file name (for the configuration file); TF graph will be stored with a \
            different extension
        """
        log_info("Saving classifier to %s..." % model_fname)
        with file_stream(model_fname, 'wb', encoding=None) as fh:
            pickle.dump(self.get_all_settings(),
                        fh,
                        protocol=pickle.HIGHEST_PROTOCOL)
        tf_session_fname = re.sub(r'(.pickle)?(.gz)?$', '.tfsess', model_fname)
        if hasattr(self, 'checkpoint_path') and self.checkpoint_path:
            self.restore_checkpoint()
            shutil.rmtree(os.path.dirname(self.checkpoint_path))
        self.saver.save(self.session, tf_session_fname)
Example #27
    def load_from_file(model_fname):
        """Load the reranker from a file (actually two files, one for configuration and one
        for the TensorFlow graph, which must be stored separately).

        @param model_fname: file name (for the configuration file); TF graph must be stored with a \
            different extension
        """
        log_info("Loading reranker from %s..." % model_fname)
        with file_stream(model_fname, 'rb', encoding=None) as fh:
            data = pickle.load(fh)
            ret = RerankingClassifier(cfg=data['cfg'])
            ret.load_all_settings(data)

        # re-build TF graph and restore the TF session
        tf_session_fname = re.sub(r'(.pickle)?(.gz)?$', '.tfsess', model_fname)
        ret._init_neural_network()
        ret.saver.restore(ret.session, tf_session_fname)
        return ret
Example #28
 def save_to_arff(self, filename, encoding='UTF-8'):
     """
     Save the data set to an ARFF file
     """
     # open the file
     fh = file_stream(filename, 'w', encoding)
     # print the relation name
     print >> fh, '@relation ' + (self.relation_name
                                  if self.relation_name is not None
                                  else '<noname>')
     # print the list of attributes
     for attrib in self.attribs:
         print >> fh, '@attribute ' + attrib.name + ' ' + \
                 attrib.get_arff_type()
     # print instances
     print >> fh, '@data'
     for inst, weight in zip(self.data, self.inst_weights):
         print >> fh, self.__get_arff_line(inst, weight)
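For reference, the file written by save_to_arff has the standard ARFF layout, roughly (attribute names and values illustrative):

@relation <relation_name>
@attribute temperature numeric
@attribute outlook {sunny,rainy}
@data
75.0,sunny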
Example #29
    def load_from_file(model_fname):
        """Load the reranker from a file (actually two files, one for configuration and one
        for the TensorFlow graph, which must be stored separately).

        @param model_fname: file name (for the configuration file); TF graph must be stored with a \
            different extension
        """
        log_info("Loading reranker from %s..." % model_fname)
        with file_stream(model_fname, 'rb', encoding=None) as fh:
            data = pickle.load(fh)
            ret = RerankingClassifier(cfg=data['cfg'])
            ret.load_all_settings(data)

        # re-build TF graph and restore the TF session
        tf_session_fname = os.path.abspath(re.sub(r'(.pickle)?(.gz)?$', '.tfsess', model_fname))
        ret._init_neural_network()
        ret.saver.restore(ret.session, tf_session_fname)
        return ret
Example #30
File: model.py Project: leotilli/pytreex
 def create_training_job(config,
                         work_dir,
                         train_file,
                         name=None,
                         memory=8,
                         encoding='UTF-8'):
     """\
     Submit a training process on the cluster which will save the
     model to a pickle. Return the submitted job and the future location of
     the model pickle.
     train_file cannot be a stream, it must be an actual file.
     """
     # purge name
     if name is None:
         name = 'TR-' + re.sub(r'[^A-Za-z0-9_]', '_', train_file)
     else:
         name = re.sub(r'[^A-Za-z0-9_]', '_', name)
     # create working directory, if not existing
     if not os.path.isdir(work_dir):
         os.mkdir(work_dir)
     train_file = os.path.abspath(train_file)
     # generate model file name
     model_file = os.path.abspath(
         os.path.join(work_dir, name + '-model.pickle.gz'))
     config_pickle = os.path.abspath(
         os.path.join(work_dir, name + '-cfg.pickle.gz'))
     # create the configuration pickle
     fh = file_stream(config_pickle, mode='wb', encoding=None)
     pickle.Pickler(fh, pickle.HIGHEST_PROTOCOL).dump(config)
     fh.close()
     # create the job
     job = Job(name=name, work_dir=work_dir)
     job.code = "fh = file_stream('" + config_pickle + \
             "', mode='rb', encoding=None)\n" + \
             "cfg = pickle.Unpickler(fh).load()\n" + \
             "fh.close()\n" + \
             "model = Model(cfg)\n" + \
             "model.train('" + train_file + "', encoding='" + \
             encoding + "')\n" \
             "model.save_to_file('" + model_file + "')\n"
     job.header += "from pytreex.tool.ml.model import Model\n" + \
             "import pickle\n" + \
             "from pytreex.core.util import file_stream\n"
     return job, model_file
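A usage sketch for the helper above (file names hypothetical; submitting the returned job is left to whatever interface Job provides in this codebase):

job, model_file = create_training_job(config, 'work_dir', 'train.data', name='demo')
# submit `job` to the cluster, then load the pickle from `model_file` once it finishes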
Example #31
def run_worker(head_host, head_port, debug_out=None):
    # setup debugging output, if applicable
    if debug_out is not None:
        set_debug_stream(file_stream(debug_out, mode='w'))
    # start the server (in the background)
    log_info('Creating worker server...')
    server = ThreadPoolServer(service=RankerTrainingService, nbThreads=1)
    server_thread = Thread(target=server.start)
    server_thread.start()
    my_host = socket.getfqdn()
    log_info('Worker server created at %s:%d. Connecting to head at %s:%d...' %
             (my_host, server.port, head_host, head_port))
    # notify main about this server
    conn = connect(head_host, head_port, config={'allow_pickle': True})
    conn.root.register_worker(my_host, server.port)
    conn.close()
    log_info('Worker is registered with the head.')
    # now serve until we're killed (the server thread will continue to run)
    server_thread.join()
Example #33
 def load_from_arff(self, filename, encoding='UTF-8'):
     """
     Load an ARFF file/stream, filling the data structures.
     """
     # initialize
     if not self.is_empty:
         raise IOError('Cannot store second data set into the same object.')
     status = 'header'  # we first assume to read the header
     line_num = 1  # line counter
     instances = []
     weights = []
     # open the file
     fh = file_stream(filename, encoding=encoding)
     # parse the file
     for line in fh:
         line = line.strip()
         # skip comments
         if line.startswith('%'):
             continue
         # relation name
         elif line.lower().startswith('@relation'):
             self.relation_name = line.split(None, 1)[1]
         # attribute definition
         elif line.lower().startswith('@attribute'):
             attr_name, attr_type = line.split(None, 2)[1:]
             self.attribs.append(Attribute(attr_name, attr_type))
         # data section start
         elif line.lower().startswith('@data'):
             status = 'data'
         # data lines
         elif status == 'data' and line != '':
             inst, weight = self.__parse_line(line, line_num)
             instances.append(inst)
             weights.append(weight)
         line_num += 1
     fh.close()
     # store the resulting matrix
     self.data = instances
     self.inst_weights = weights
     # remember attribute names
     self.attribs_by_name = {attr.name: idx
                             for idx, attr in enumerate(self.attribs)}
Example #34
File: conllu.py Project: ufal/pytreex
 def process_document(self, doc):
     "Write a CoNLL-U file"
     out = file_stream(self.get_output_file_name(doc), 'w', encoding='UTF-8')
     for bundle in doc.bundles:
         zone = bundle.get_zone(self.language, self.selector)
         nodes = zone.atree.get_descendants(ordered=1)
         # Empty sentences are not allowed in CoNLL-U.
         if len(nodes)==0:
             continue
         comment = zone.wild['comment']
         if comment:
             out.write('#' + comment.rstrip('\r\n').replace('\n','\n#') + '\n')
         index = 1
         for node in nodes:
             out.write('\t'.join(
                 '_' if value is None else value for value in
                 map((lambda x: str(x) if type(x)==int else getattr(node, x, '_')),
                     [index, 'form', 'lemma', 'upos', 'xpos', 'feats', node.parent.ord, 'deprel', 'deps', 'misc'])
             ) + '\n')
             index += 1
         out.write('\n')
Example #35
File: model.py Project: ufal/pytreex
 def create_training_job(config, work_dir, train_file,
                         name=None, memory=8, encoding='UTF-8'):
     """\
     Submit a training process on the cluster which will save the
     model to a pickle. Return the submitted job and the future location of
     the model pickle.
     train_file cannot be a stream, it must be an actual file.
     """
     # purge name
     if name is None:
         name = 'TR-' + re.sub(r'[^A-Za-z0-9_]', '_', train_file)
     else:
         name = re.sub(r'[^A-Za-z0-9_]', '_', name)
     # create working directory, if not existing
     if not os.path.isdir(work_dir):
         os.mkdir(work_dir)
     train_file = os.path.abspath(train_file)
     # generate model file name
     model_file = os.path.abspath(os.path.join(work_dir,
                                               name + '-model.pickle.gz'))
     config_pickle = os.path.abspath(os.path.join(work_dir,
                                                  name + '-cfg.pickle.gz'))
     # create the configuration pickle
     fh = file_stream(config_pickle, mode='wb', encoding=None)
     pickle.Pickler(fh, pickle.HIGHEST_PROTOCOL).dump(config)
     fh.close()
     # create the job
     job = Job(name=name, work_dir=work_dir)
     job.code = "fh = file_stream('" + config_pickle + \
             "', mode='rb', encoding=None)\n" + \
             "cfg = pickle.Unpickler(fh).load()\n" + \
             "fh.close()\n" + \
             "model = Model(cfg)\n" + \
             "model.train('" + train_file + "', encoding='" + \
             encoding + "')\n" \
             "model.save_to_file('" + model_file + "')\n"
     job.header += "from pytreex.tool.ml.model import Model\n" + \
             "import pickle\n" + \
             "from pytreex.core.util import file_stream\n"
     return job, model_file
Example #36
    def load_from_file(model_fname):
        """Load the whole ensemble from a file (load settings and model parameters, then build the
        ensemble network)."""

        log_info("Loading ensemble generator from %s..." % model_fname)

        with file_stream(model_fname, 'rb', encoding=None) as fh:
            typeid = pickle.load(fh)
            if typeid != Seq2SeqEnsemble:
                raise ValueError('Wrong type identifier in file %s' % model_fname)
            cfg = pickle.load(fh)
            ret = Seq2SeqEnsemble(cfg)
            gens_dump = pickle.load(fh)
            if 'classif_filter' in cfg:
                rerank_settings = pickle.load(fh)
                rerank_params = pickle.load(fh)
            else:
                rerank_settings = None
                rerank_params = None

        ret.build_ensemble(gens_dump, rerank_settings, rerank_params)
        return ret
Example #37
    def save_to_file(self, model_fname):
        """Save the whole ensemble into a file (get all settings and parameters, dump them in a
        pickle)."""

        log_info("Saving generator to %s..." % model_fname)
        with file_stream(model_fname, 'wb', encoding=None) as fh:
            pickle.dump(self.__class__, fh, protocol=pickle.HIGHEST_PROTOCOL)
            pickle.dump(self.cfg, fh, protocol=pickle.HIGHEST_PROTOCOL)

            gens_dump = []
            for gen in self.gens:
                setting = gen.get_all_settings()
                parset = gen.get_model_params()
                setting['classif_filter'] = self.classif_filter is not None
                gens_dump.append((setting, parset))

            pickle.dump(gens_dump, fh, protocol=pickle.HIGHEST_PROTOCOL)

            if self.classif_filter:
                pickle.dump(self.classif_filter.get_all_settings(), fh,
                            protocol=pickle.HIGHEST_PROTOCOL)
                pickle.dump(self.classif_filter.get_model_params(), fh,
                            protocol=pickle.HIGHEST_PROTOCOL)
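For reference, the pickle stream written above carries its records in a fixed order, which the ensemble load_from_file examples consume with one pickle.load call each:

# 1. self.__class__          -- type marker used for load-time dispatch
# 2. self.cfg                -- configuration dict
# 3. gens_dump               -- list of (settings, params) pairs, one per generator
# 4. classif_filter settings -- only written when a classification filter is present
# 5. classif_filter params   -- likewise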
Example #38
    def load_from_file(model_fname):
        """Load the whole ensemble from a file (load settings and model parameters, then build the
        ensemble network)."""
        # TODO support for lexicalizer

        log_info("Loading ensemble generator from %s..." % model_fname)

        with file_stream(model_fname, 'rb', encoding=None) as fh:
            typeid = pickle.load(fh)
            if typeid != Seq2SeqEnsemble:
                raise ValueError('Wrong type identifier in file %s' % model_fname)
            cfg = pickle.load(fh)
            ret = Seq2SeqEnsemble(cfg)
            gens_dump = pickle.load(fh)
            if 'classif_filter' in cfg:
                rerank_settings = pickle.load(fh)
                rerank_params = pickle.load(fh)
            else:
                rerank_settings = None
                rerank_params = None

        ret.build_ensemble(gens_dump, rerank_settings, rerank_params)
        return ret
Example #39
 def save_to_file(self, fname):
     log_info('Saving model to ' + fname)
     with file_stream(fname, mode='wb', encoding=None) as fh:
         pickle.dump(self, fh, pickle.HIGHEST_PROTOCOL)
Example #40
 def load_from_file(fname):
     log_info('Loading model from ' + fname)
     with file_stream(fname, mode='rb', encoding=None) as fh:
         classif = pickle.load(fh)
     return classif
Example #41
    def process_document(self, filename):
        """\
        Read a CoNLL-U file and return its contents as a Document object.
        """
        fh = file_stream(filename, encoding=self.encoding)
        doc = Document(filename)
        bundle = doc.create_bundle()
        zone = bundle.create_zone(self.language, self.selector)
        root = zone.create_atree()
        last_node = root
        nodes = [root]
        parents = [0]
        comment = ''
        
        for line in fh:
            
            # Strip newline character (\n or \r\n)
            line = line.rstrip('\r\n')

            # An empty line marks the end of a sentence
            if not line:
                # Ignore (multiple) empty lines before start of sentence (invalid CoNLL-U)
                if len(nodes)==1:
                    continue

                # Rehang to correct parents and save nonempty comment to root
                for i in xrange(1,len(nodes)):
                    nodes[i].parent = nodes[parents[i]]
                if len(comment):
                    zone.wild['comment'] = comment

                # Prepare a new bundle
                bundle = doc.create_bundle()
                zone = bundle.create_zone(self.language, self.selector)
                root = zone.create_atree()
                last_node = root
                nodes = [root]
                parents = [0]
                comment = ''
            
            # Comment
            elif line[0] == '#':
                comment += line[1:] + "\n"

            # A normal line with one token
            else:
                columns = line.split('\t')
            
                # TODO: multi-word tokens
                if '-' in columns[0]:
                    continue
            
                # Create new node
                new_node = root.create_child(data = dict(
                    (key, value) for key, value in
                    zip(['form', 'lemma', 'upos', 'xpos', 'feats',    'deprel', 'deps', 'misc'],
                        columns[1:6]                                 + columns[7:10]  )
                    if value is not None and value != '_'
                    ) )
                nodes.append(new_node)
                try:
                    parent_index = int(columns[6])
                except (ValueError, TypeError):
                    # TODO: warning?
                    parent_index = 0
                parents.append(parent_index)

                # Word order TODO is this needed?
                new_node.shift_after_subtree(last_node)
                last_node = new_node

        # The last bundle should be empty (if the file ended with an empty line),
        # so we need to remove it. But let's check it.
        if len(nodes)==1:
            doc.bundles.pop()
        else:
            for i in xrange(1,len(nodes)):
                nodes[i].parent = nodes[parents[i]]
            if len(comment):
                zone.wild['comment'] = comment

        fh.close()
        return doc
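A minimal fragment of the CoNLL-U input this reader consumes: ten tab-separated columns per token (ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC), '_' for empty values, '#' comments, and a blank line terminating each sentence (values illustrative):

# sent_id = 1
1	Hello	hello	INTJ	_	_	0	root	_	_
2	world	world	NOUN	_	_	1	vocative	_	_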
Example #42
File: cv_split.py Project: UFAL-DSG/tgen
def main(argv):

    opts, files = getopt(argv, 'f:c:d:')

    folds = 10
    chunk_size = 2
    dir_prefix = 'cv'

    for opt, arg in opts:
        if opt == '-f':
            folds = int(arg)
        elif opt == '-c':
            chunk_size = int(arg)
        elif opt == '-d':
            dir_prefix = arg

    if not files:
        sys.exit(__doc__)
    
    random.seed(1206)
    ordering = None

    for file in files:
        # read all data
        data = []
        with file_stream(file) as fh:
            chunk = []
            for line in fh:
                chunk.append(line)
                if len(chunk) == chunk_size:
                    data.append(chunk)
                    chunk = []
            if chunk:
                log_warn('Incomplete chunk at end of file %s, size %d' % (file, len(chunk)))

        if ordering is None:
            # create ordering
            ordering = range(len(data))
            random.shuffle(ordering)

            # create directories
            for fold_no in xrange(folds):
                os.mkdir(dir_prefix + "%02d" % fold_no)
            
        # output as train and test into all CV portions
        fold_size, bigger_folds = divmod(len(data), folds)
        for fold_no in xrange(folds):
            # compute test data bounds
            if fold_no < bigger_folds:
                test_lo = (fold_size + 1) * fold_no
                test_hi = (fold_size + 1) * (fold_no + 1)
            else:
                test_lo = fold_size * fold_no + bigger_folds
                test_hi = fold_size * (fold_no + 1) + bigger_folds
            # select train and test data instances
            train_data = [data[idx] for ord, idx in enumerate(ordering)
                          if ord < test_lo or ord >= test_hi]
            test_data = [data[idx] for ord, idx in enumerate(ordering)
                         if ord >= test_lo and ord < test_hi]

            # write them out to a file (replace `all' in name with train/test)
            fname_base = os.path.basename(file)
            write_data(dir_prefix + "%02d" % fold_no, fname_base, 'train', train_data)
            write_data(dir_prefix + "%02d" % fold_no, fname_base, 'test', test_data)
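A worked instance of the fold-bound arithmetic above, assuming 23 chunks and 10 folds:

fold_size, bigger_folds = divmod(23, 10)   # (2, 3)
# folds 0-2 get fold_size+1 = 3 test chunks: bounds 0-3, 3-6, 6-9
# folds 3-9 get fold_size = 2 test chunks:   bounds 9-11, 11-13, ..., 21-23
# together the test ranges cover all 23 chunks exactly once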
Example #43
 def save_to_file(self, model_fname):
     """Save the model to a file."""
     log_info("Saving ranker to %s..." % model_fname)
     with file_stream(model_fname, 'wb', encoding=None) as fh:
         pickle.dump(self, fh, protocol=pickle.HIGHEST_PROTOCOL)
Example #44
File: classif.py Project: pdsujnow/tgen
 def load_from_file(fname):
     log_info('Loading model from ' + fname)
     with file_stream(fname, mode='rb', encoding=None) as fh:
         classif = pickle.load(fh)
     return classif
Example #45
File: rank.py Project: UFAL-DSG/tgen
 def save_to_file(self, model_fname):
     """Save the model to a file."""
     log_info("Saving ranker to %s..." % model_fname)
     with file_stream(model_fname, 'wb', encoding=None) as fh:
         pickle.dump(self, fh, protocol=pickle.HIGHEST_PROTOCOL)
Example #46
File: rank.py Project: UFAL-DSG/tgen
 def load_from_file(model_fname):
     """Load a pre-trained model from a file."""
     log_info("Loading ranker from %s..." % model_fname)
     with file_stream(model_fname, 'rb', encoding=None) as fh:
         return pickle.load(fh)
Example #47
 def load_from_file(model_fname):
     """Load a pre-trained model from a file."""
     log_info("Loading ranker from %s..." % model_fname)
     with file_stream(model_fname, 'rb', encoding=None) as fh:
         return pickle.load(fh)
Example #48
File: classif.py Project: pdsujnow/tgen
 def save_to_file(self, fname):
     log_info('Saving model to ' + fname)
     with file_stream(fname, mode='wb', encoding=None) as fh:
         pickle.dump(self, fh, pickle.HIGHEST_PROTOCOL)
Example #49
def main(argv):

    opts, files = getopt(argv, 'f:c:d:')

    folds = 10
    chunk_size = 2
    dir_prefix = 'cv'

    for opt, arg in opts:
        if opt == '-f':
            folds = int(arg)
        elif opt == '-c':
            chunk_size = int(arg)
        elif opt == '-d':
            dir_prefix = arg

    if not files:
        sys.exit(__doc__)

    random.seed(1206)
    ordering = None

    for file in files:
        # read all data
        data = []
        with file_stream(file) as fh:
            chunk = []
            for line in fh:
                chunk.append(line)
                if len(chunk) == chunk_size:
                    data.append(chunk)
                    chunk = []
            if chunk:
                log_warn('Incomplete chunk at end of file %s, size %d' %
                         (file, len(chunk)))

        if ordering is None:
            # create ordering
            ordering = range(len(data))
            random.shuffle(ordering)

            # create directories
            for fold_no in xrange(folds):
                os.mkdir(dir_prefix + "%02d" % fold_no)

        # output as train and test into all CV portions
        fold_size, bigger_folds = divmod(len(data), folds)
        for fold_no in xrange(folds):
            # compute test data bounds
            if fold_no < bigger_folds:
                test_lo = (fold_size + 1) * fold_no
                test_hi = (fold_size + 1) * (fold_no + 1)
            else:
                test_lo = fold_size * fold_no + bigger_folds
                test_hi = fold_size * (fold_no + 1) + bigger_folds
            # select train and test data instances
            train_data = [
                data[idx] for ord, idx in enumerate(ordering)
                if ord < test_lo or ord >= test_hi
            ]
            test_data = [
                data[idx] for ord, idx in enumerate(ordering)
                if ord >= test_lo and ord < test_hi
            ]

            # write them out to a file (replace `all' in name with train/test)
            fname_base = os.path.basename(file)
            write_data(dir_prefix + "%02d" % fold_no, fname_base, 'train',
                       train_data)
            write_data(dir_prefix + "%02d" % fold_no, fname_base, 'test',
                       test_data)