Example #1
 def load_from_file(fname):
     log_info('Loading model from ' + fname)
     with file_stream(fname, mode='rb', encoding=None) as fh:
         candgen = pickle.load(fh)
         # various backward compatibility tricks
         if type(candgen) == dict:
             child_type_counts = candgen
             candgen = RandomCandidateGenerator({})
             candgen.child_type_counts = child_type_counts
             candgen.child_num_cdfs = pickle.load(fh)
             candgen.max_children = pickle.load(fh)
         if not hasattr(candgen, 'node_limits'):
             candgen.node_limits = None
         if not hasattr(candgen, 'child_type_counts'):
             candgen.child_type_counts = candgen.form_counts
             candgen.child_num_cdfs = candgen.child_cdfs
         if not hasattr(candgen, 'exp_child_num'):
             candgen.exp_child_num = candgen.exp_from_cdfs(candgen.child_num_cdfs)
         if not hasattr(candgen, 'compatible_dais'):
             candgen.compatible_dais = None
             candgen.compatible_dais_type = None
             candgen.compatible_dais_limit = 1000
         if not hasattr(candgen, 'compatible_slots'):
             candgen.compatible_slots = False
         if not hasattr(candgen, 'classif'):
             candgen.classif = None
         return candgen
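
The loader above back-fills attributes that older pickles lack, so models saved by previous versions still load. A minimal, self-contained sketch of the same hasattr back-fill pattern, using a hypothetical Model class instead of tgen's RandomCandidateGenerator:

import pickle

class Model(object):
    """Toy stand-in for a class whose attributes changed over time (not tgen code)."""
    def __init__(self):
        self.weights = {}

def load_model(fname):
    with open(fname, 'rb') as fh:
        model = pickle.load(fh)
    # back-fill attributes introduced after the pickle was written
    if not hasattr(model, 'bias'):
        model.bias = 0.0
    if not hasattr(model, 'feature_names'):
        model.feature_names = None
    return model
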
Example #2
def get_worker_registrar_for(head):
    """Return a class that will handle worker registration for the given head."""

    # create a dump of the head to be passed to workers
    log_info('Saving ranker init state...')
    tstart = time.time()
    ranker_dump_path = dump_ranker(head.loc_ranker, head.work_dir)
    log_info('Ranker init state saved in %s, it took %f secs.' % (ranker_dump_path,
                                                                  time.time() - tstart))

    class WorkerRegistrarService(Service):

        def exposed_register_worker(self, host, port):
            """Register a worker with my head, initialize it."""
            # initiate connection in the other direction
            log_info('Worker %s:%d connected, initializing training.' % (host, port))
            conn = connect(host, port, config={'allow_pickle': True})
            # initialize the remote server (with training data etc.)
            init_func = async(conn.root.init_training)
            req = init_func(ranker_dump_path)
            # add it to the list of running services
            sc = ServiceConn(host, port, conn)
            head.services.add(sc)
            head.pending_requests.add((sc, None, req))
            log_info('Worker %s:%d initialized.' % (host, port))

    return WorkerRegistrarService, ranker_dump_path
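
On the head side, the returned registrar class is what gets served to incoming workers. A rough usage sketch, assuming an rpyc ThreadPoolServer as in the worker examples further below (the port number and the head object are placeholders, not taken from tgen):

from rpyc.utils.server import ThreadPoolServer

registrar_class, ranker_dump_path = get_worker_registrar_for(head)
server = ThreadPoolServer(service=registrar_class, port=25125, nbThreads=1)
server.start()  # blocks; each connecting worker ends up in exposed_register_worker()
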
Example #3
 def exposed_init_training(self, cfg):
     """Create the Seq2SeqGen object."""
     cfg = pickle.loads(cfg)
     tstart = time.time()
     log_info('Initializing training...')
     self.seq2seq = Seq2SeqGen(cfg)
     log_info('Training initialized. Time taken: %f secs.' % (time.time() - tstart))
Example #4
    def can_generate(self, tree, da):
        """Check if the candidate generator can generate a given tree at all.

        This is for debugging purposes only.
        Tests whether get_all_successors always returns a successor that leads to the given tree
        (puts on the open list only successors that are subtrees of the given tree).
        """
        self.init_run(da)
        open_list = CandidateList({TreeData(): 1})
        found = False
        tree_no = 0

        while open_list and not found:
            cur_st, _ = open_list.pop()
            if cur_st == tree:
                found = True
                break
            for succ in self.get_all_successors(cur_st):
                tree_no += 1
                # only push on the open list if the successor is still a subtree of the target tree
                if tree.common_subtree_size(succ) == len(succ):
                    open_list.push(succ, len(succ))

        if not found:
            log_info('Did not find tree: ' + unicode(tree) + ' for DA: ' + unicode(da) + (' (total %d trees)' % tree_no))
            return False
        log_info('Found tree: %s for DA: %s (as %d-th tree)' % (unicode(tree), unicode(da), tree_no))
        return tree_no
Example #5
    def can_generate_greedy(self, tree, da):
        """Check if the candidate generator can generate a given tree greedily, always
        pursuing the first viable path.

        This is for debugging purposes only.
        Uses `get_all_successors` and always goes on with the first one that increases coverage
        of the current tree.
        """
        self.init_run(da)
        cur_subtree = TreeData()
        found = True

        while found and cur_subtree != tree:
            found = False
            for succ in self.get_all_successors(cur_subtree):
                # use the first successor that is still a subtree of the target tree
                if tree.common_subtree_size(succ) == len(succ):
                    cur_subtree = succ
                    found = True
                    break

        # we have hit a dead end
        if cur_subtree != tree:
            log_info('Did not find tree: ' + unicode(tree) + ' for DA: ' + unicode(da))
            return False

        # everything alright
        log_info('Found tree: %s for DA: %s' % (unicode(tree), unicode(da)))
        return True
Example #6
 def save_to_file(self, model_file):
     """\
     Save the model to a pickle file or stream (supports GZip compression).
     """
     log_info('Saving model to file ' + str(model_file))
     fh = file_stream(model_file, mode='wb', encoding=None)
     pickle.Pickler(fh, pickle.HIGHEST_PROTOCOL).dump(self)
     fh.close()
     log_info('Model successfully saved.')
Example #7
    def _init_training(self, das_file, ttree_file, data_portion):
        # load data, determine number of features etc. etc.
        super(PerceptronRanker, self)._init_training(das_file, ttree_file, data_portion)
        # initialize weights
        self.w = np.ones(self.train_feats.shape[1])
        self.update_weights_sum()
        # self.w = np.array([rnd.gauss(0, self.alpha) for _ in xrange(self.train_feats.shape[1])])

        log_debug('\n***\nINIT:')
        log_debug(self._feat_val_str())
        log_info('Training ...')
Example #8
 def load_training_set(self, filename, encoding='UTF-8'):
     """\
     Load the given training data set into memory and strip it if
     configured to via the train_part parameter.
     """
     log_info('Loading training data set from ' + str(filename) + '...')
     train = DataSet()
     train.load_from_arff(filename, encoding)
     if self.train_part < 1:
         train = train.subset(0, int(round(self.train_part * len(train))),
                              copy=False)
     return train
Example #9
 def load_from_file(model_file):
     """\
     Load the model from a pickle file or stream
     (supports GZip compression).
     """
     log_info('Loading model from file ' + str(model_file))
     fh = file_stream(model_file, mode='rb', encoding=None)
     unpickler = pickle.Unpickler(fh)
     model = unpickler.load()
     fh.close()
     log_info('Model loaded successfully.')
     return model
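
Both helpers above delegate the optional GZip handling to file_stream. A self-contained sketch of the same round trip using only the standard library, assuming (as the '.pickle.gz' file names elsewhere in this listing suggest) that compression is chosen by file suffix:

import gzip
import pickle

def save_pickle(obj, fname):
    # choose gzip or plain binary output based on the file suffix
    opener = gzip.open if fname.endswith('.gz') else open
    with opener(fname, 'wb') as fh:
        pickle.dump(obj, fh, protocol=pickle.HIGHEST_PROTOCOL)

def load_pickle(fname):
    opener = gzip.open if fname.endswith('.gz') else open
    with opener(fname, 'rb') as fh:
        return pickle.load(fh)

save_pickle({'w': [0.1, 0.2]}, 'model.pickle.gz')
print(load_pickle('model.pickle.gz'))  # {'w': [0.1, 0.2]}
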
Example #10
    def _init_training(self, das_file, ttree_file, data_portion):
        # load data, determine number of features etc. etc.
        super(PerceptronRanker, self)._init_training(das_file, ttree_file,
                                                     data_portion)
        # initialize weights
        self.w = np.ones(self.train_feats.shape[1])
        self.update_weights_sum()
        # self.w = np.array([rnd.gauss(0, self.alpha) for _ in xrange(self.train_feats.shape[1])])

        log_debug('\n***\nINIT:')
        log_debug(self._feat_val_str())
        log_info('Training ...')
Example #11
 def exposed_register_worker(self, host, port):
     """Register a worker with my head, initialize it."""
     # initiate connection in the other direction
     log_info('Worker %s:%d connected, initializing training.' % (host, port))
     conn = connect(host, port, config={'allow_pickle': True})
     # initialize the remote server (with training data etc.)
     init_func = async(conn.root.init_training)
     req = init_func(ranker_dump_path)
     # add it to the list of running services
     sc = ServiceConn(host, port, conn)
     head.services.add(sc)
     head.pending_requests.add((sc, None, req))
     log_info('Worker %s:%d initialized.' % (host, port))
Example #12
 def train(self, das_file, ttree_file, data_portion=1.0):
     """Run training on the given training data."""
     self._init_training(das_file, ttree_file, data_portion)
     for iter_no in xrange(1, self.passes + 1):
         self.train_order = range(len(self.train_trees))
         if self.randomize:
             rnd.shuffle(self.train_order)
         log_info("Train order: " + str(self.train_order))
         self._training_pass(iter_no)
         if self.evaluator.tree_accuracy() == 1:  # if tree accuracy is 1, we won't learn anything anymore
             break
     # averaged perceptron – average the weights obtained after each pass
     if self.averaging is True:
         self.set_weights_iter_average()
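
The final step above is the averaged perceptron: a snapshot of the weight vector is kept after every pass and their mean becomes the final model. A minimal sketch of that bookkeeping with hypothetical names, not the actual tgen ranker internals:

import numpy as np

class AveragedWeights(object):
    """Keep a copy of the weights after each pass and average them at the end."""

    def __init__(self, dim):
        self.w = np.zeros(dim)
        self.iter_weights = []

    def store_iter_weights(self):
        self.iter_weights.append(self.w.copy())

    def set_weights_iter_average(self):
        self.w = np.mean(self.iter_weights, axis=0)
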
Example #13
 def train_on_data(self, train):
     """\
     Train model on the specified training data set (which must be a loaded
     DataSet object).
     """
     log_info('Preparing data set...')
     self.data_headers = train.get_headers()
     self.attr_mask = self.get_attr_mask()
     train_vect = self.__vectorize(train)
     train_classes = self.get_classes(train)
     # if all the training data have the same class, use a dummy classifier
     if train.get_attrib(self.class_attr).num_values == 1:
         self.feature_filter = None
         self.classifier = DummyClassifier(strategy='most_frequent')
     # filter features
     log_info('Filtering...')
     train_filt = self.__filter_features(train_vect, train_classes)
     # train the classifier
     log_info('Training...')
     if self.use_weights:
         self.classifier.fit(train_filt, train_classes,
                             sample_weight=train.inst_weights)
     else:
         self.classifier.fit(train_filt, train_classes)
     self.classifier_trained = True
     log_info('Training done.')
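
scikit-learn's DummyClassifier(strategy='most_frequent') serves above as a degenerate fallback when every training instance carries the same class. A tiny standalone illustration of what that fallback does (toy data, unrelated to the tgen feature pipeline):

import numpy as np
from sklearn.dummy import DummyClassifier

X = np.zeros((4, 3))                   # toy feature matrix
y = ['yes', 'yes', 'yes', 'yes']       # a single class in the training data
clf = DummyClassifier(strategy='most_frequent')
clf.fit(X, y)
print(clf.predict(np.zeros((2, 3))))   # -> ['yes' 'yes']
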
Example #14
 def exposed_register_worker(self, host, port):
     """Register a worker with my head, initialize it."""
     # initiate connection in the other direction
     log_info('Worker %s:%d connected, initializing training.' % (host, port))
     conn = connect(host, port, config={'allow_pickle': True})
     # initialize the remote server (with training data etc.)
     init_func = async(conn.root.init_training)
     # add unique 'scope suffix' so that the models don't clash in ensembles
     head.cfg['scope_suffix'] = hashlib.md5("%s:%d" % (host, port)).hexdigest()
     req = init_func(pickle.dumps(head.cfg, pickle.HIGHEST_PROTOCOL))
     # add it to the list of running services
     sc = ServiceConn(host, port, conn)
     head.services.add(sc)
     head.pending_requests.add((sc, None, req))
     log_info('Worker %s:%d initialized.' % (host, port))
Example #15
 def train(self, das_file, ttree_file, data_portion=1.0):
     """Run training on the given training data."""
     self._init_training(das_file, ttree_file, data_portion)
     for iter_no in xrange(1, self.passes + 1):
         self.train_order = range(len(self.train_trees))
         if self.randomize:
             rnd.shuffle(self.train_order)
         log_info("Train order: " + str(self.train_order))
         self._training_pass(iter_no)
         if self.evaluator.tree_accuracy() == 1:  # if tree accuracy is 1, we won't learn anything anymore
             break
     # averaged perceptron – average the weights obtained after each pass
     if self.averaging is True:
         self.set_weights_iter_average()
Example #16
 def exposed_train(self, rnd_seed, das_file, ttree_file, data_portion, context_file, validation_files):
     """Run the whole training.
     """
     rnd.seed(rnd_seed)
     log_info('Random seed: %f' % rnd_seed)
     tstart = time.time()
     log_info('Starting training...')
     self.seq2seq.train(das_file, ttree_file, data_portion, context_file, validation_files)
     log_info('Training finished -- time taken: %f secs.' % (time.time() - tstart))
     top_cost = self.seq2seq.top_k_costs[0]
     log_info('Best cost: %f' % top_cost)
     return top_cost
Example #17
    def save_to_file(self, model_fname):
        """This will actually just move the best generator (which is saved in a temporary file)
        to the final location."""
        log_info('Moving generator to %s...' % model_fname)
        orig_model_fname = self.model_temp_path
        shutil.move(orig_model_fname, model_fname)
        orig_tf_session_fname = re.sub(r'(.pickle)?(.gz)?$', '.tfsess', orig_model_fname)
        tf_session_fname = re.sub(r'(.pickle)?(.gz)?$', '.tfsess', model_fname)
        if os.path.isfile(orig_tf_session_fname):
            shutil.move(orig_tf_session_fname, tf_session_fname)

        # move the reranking classifier model files as well, if they exist
        orig_clfilter_fname = re.sub(r'((.pickle)?(.gz)?)$', r'.tftreecl\1', orig_model_fname)
        orig_clfilter_tf_fname = re.sub(r'((.pickle)?(.gz)?)$', r'.tfsess', orig_clfilter_fname)

        if os.path.isfile(orig_clfilter_fname) and os.path.isfile(orig_clfilter_tf_fname):
            clfilter_fname = re.sub(r'((.pickle)?(.gz)?)$', r'.tftreecl\1', model_fname)
            clfilter_tf_fname = re.sub(r'((.pickle)?(.gz)?)$', r'.tfsess', clfilter_fname)
            shutil.move(orig_clfilter_fname, clfilter_fname)
            shutil.move(orig_clfilter_tf_fname, clfilter_tf_fname)
Example #18
    def _init_training(self, das_file, ttree_file, data_portion):

        super(FeaturesPerceptronRanker, self)._init_training(das_file, ttree_file, data_portion)

        # precompute training data features
        X = []
        for da, tree in zip(self.train_das, self.train_trees):
            X.append(self.feats.get_features(tree, {'da': da}))
        if self.prune_feats > 1:
            self._prune_features(X)
        # vectorize and binarize or normalize (+train vectorizer/normalizer)
        if self.binarize:
            self.vectorizer = DictVectorizer(sparse=False, binarize_numeric=True)
            self.train_feats = self.vectorizer.fit_transform(X)
        else:
            self.vectorizer = DictVectorizer(sparse=False)
            self.normalizer = StandardScaler(copy=False)
            self.train_feats = self.normalizer.fit_transform(self.vectorizer.fit_transform(X))

        log_info('Features matrix shape: %s' % str(self.train_feats.shape))
Example #19
def run_training(head_host, head_port, debug_out=None):
    """Main worker training routine (creates the Seq2SeqTrainingService and connects it to the
    head).

    @param head_host: hostname of the head
    @param head_port: head port number
    @param debug_out: path to the debugging output file (debug output discarded if None)
    """
    # setup debugging output, if applicable
    if debug_out is not None:
        set_debug_stream(file_stream(debug_out, mode='w'))
    # start the server (in the background)
    log_info('Creating training server...')
    server = ThreadPoolServer(service=Seq2SeqTrainingService, nbThreads=1)
    server_thread = Thread(target=server.start)
    server_thread.start()
    my_host = socket.getfqdn()
    log_info('Worker server created at %s:%d. Connecting to head at %s:%d...' %
             (my_host, server.port, head_host, head_port))
    # notify main about this server
    conn = connect(head_host, head_port, config={'allow_pickle': True})
    conn.root.register_worker(my_host, server.port)
    conn.close()
    log_info('Worker is registered with the head.')
    # now serve until we're killed (the server thread will continue to run)
    server_thread.join()
Example #20
File: rank.py Project: fooyou/tgen
    def _init_training(self, das_file, ttree_file, data_portion):
        """Initialize training (read input data, fix size, initialize candidate generator
        and planner)"""
        # read input
        log_info('Reading DAs from ' + das_file + '...')
        das = read_das(das_file)
        log_info('Reading t-trees from ' + ttree_file + '...')
        ttree_doc = read_ttrees(ttree_file)
        sents = sentences_from_doc(ttree_doc, self.language, self.selector)
        trees = trees_from_doc(ttree_doc, self.language, self.selector)

        # make training data smaller if necessary
        train_size = int(round(data_portion * len(trees)))
        self.train_trees = trees[:train_size]
        self.train_das = das[:train_size]
        self.train_sents = sents[:train_size]
        self.train_order = range(len(self.train_trees))
        log_info('Using %d training instances.' % train_size)

        # initialize candidate generator + planner if needed
        if self.candgen_model is not None:
            self.candgen = RandomCandidateGenerator.load_from_file(self.candgen_model)
            self.sampling_planner = SamplingPlanner({'language': self.language,
                                                     'selector': self.selector,
                                                     'candgen': self.candgen})
        if 'gen_cur_weights' in self.rival_gen_strategy:
            assert self.candgen is not None
            self.asearch_planner = ASearchPlanner({'candgen': self.candgen,
                                                   'language': self.language,
                                                   'selector': self.selector,
                                                   'ranker': self, })
Example #21
 def exposed_training_pass(self, w, pass_no, rnd_seed, data_offset, data_len):
     """(Worker) Run one pass over a part of the training data.
     @param w: initial perceptron weights (pickled)
     @param pass_no: pass number (for logging purposes)
     @param rnd_seed: random generator seed for shuffling training examples
     @param data_offset: training data portion start
     @param data_len: training data portion size
     @return: updated perceptron weights after passing the selected data portion (pickled)
     """
     log_info('Training pass %d with data portion %d + %d' %
              (pass_no, data_offset, data_len))
     # import current feature weights
     ranker = self.ranker_inst
     ranker.set_weights(pickle.loads(w))
     # save rest of the training data to temporary variables, set just the
     # required portion for computation
     all_train_das = ranker.train_das
     ranker.train_das = ranker.train_das[data_offset:data_offset + data_len]
     all_train_trees = ranker.train_trees
     ranker.train_trees = ranker.train_trees[data_offset:data_offset + data_len]
     all_train_feats = ranker.train_feats
     ranker.train_feats = ranker.train_feats[data_offset:data_offset + data_len]
     all_train_sents = ranker.train_sents
     ranker.train_sents = ranker.train_sents[data_offset:data_offset + data_len]
     all_train_order = ranker.train_order
     ranker.train_order = range(len(ranker.train_trees))
     if ranker.randomize:
         rnd.seed(rnd_seed)
         rnd.shuffle(ranker.train_order)
     # do the actual computation (update w)
     ranker._training_pass(pass_no)
     # return the rest of the training data to member variables
     ranker.train_das = all_train_das
     ranker.train_trees = all_train_trees
     ranker.train_feats = all_train_feats
     ranker.train_sents = all_train_sents
     ranker.train_order = all_train_order
     # return the result of the computation
     log_info('Training pass %d / %d / %d done.' % (pass_no, data_offset, data_len))
     return pickle.dumps((ranker.get_weights(), ranker.get_diagnostics()), pickle.HIGHEST_PROTOCOL)
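
The pass above temporarily narrows the ranker's training arrays to one portion and restores them by hand afterwards. The same idea expressed as a small, self-contained context manager (a hypothetical helper, not part of tgen):

from contextlib import contextmanager

@contextmanager
def data_portion(obj, attr, offset, length):
    """Temporarily replace obj.<attr> with a slice of itself, restoring it on exit."""
    full = getattr(obj, attr)
    setattr(obj, attr, full[offset:offset + length])
    try:
        yield
    finally:
        setattr(obj, attr, full)

With such a helper, each of the train_das / train_trees / train_feats / train_sents swaps above would shrink to one nested with-block around the call to _training_pass.
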
Example #22
    def _init_training(self, das_file, ttree_file, data_portion):

        super(FeaturesPerceptronRanker,
              self)._init_training(das_file, ttree_file, data_portion)

        # precompute training data features
        X = []
        for da, tree in zip(self.train_das, self.train_trees):
            X.append(self.feats.get_features(tree, {'da': da}))
        if self.prune_feats > 1:
            self._prune_features(X)
        # vectorize and binarize or normalize (+train vectorizer/normalizer)
        if self.binarize:
            self.vectorizer = DictVectorizer(sparse=False,
                                             binarize_numeric=True)
            self.train_feats = self.vectorizer.fit_transform(X)
        else:
            self.vectorizer = DictVectorizer(sparse=False)
            self.normalizer = StandardScaler(copy=False)
            self.train_feats = self.normalizer.fit_transform(
                self.vectorizer.fit_transform(X))

        log_info('Features matrix shape: %s' % str(self.train_feats.shape))
Example #23
    def _check_pending_request(self, sc, job_no, req):
        """Check whether the given request has finished (i.e., job is loaded or job has
        processed the given data portion).

        If the request is finished, the worker that processed it is moved to the pool
        of free services.

        @param sc: a ServiceConn object that stores the worker connection parameters
        @param job_no: current job number (is None for jobs loading)
        @param req: the request itself

        @return: the value returned by the finished data processing request, or None \
            (for loading requests or unfinished requests)
        """
        result = None
        if job_no is not None:
            log_debug('Checking %d' % job_no)

        # checking if the request has finished
        if req.ready:
            if job_no is not None:
                log_debug('Ready %d' % job_no)
                log_info('Retrieved finished request %d' % job_no)
            if req.error:
                log_info('Error found on request: job #%d, worker %s:%d' %
                         (job_no if job_no is not None else -1, sc.host, sc.port))
            result = req.value

            # remove from list of pending requests
            # TODO return to pool of free requests (but needs to store the results somewhere)
            self.pending_requests.remove((sc, job_no, req))
            if job_no is None:
                self.free_services.append(sc)

        return result
Example #24
    def _check_pending_request(self, iter_no, sc, req_portion, req):
        """Check whether the given request has finished (i.e., job is loaded or job has
        processed the given data portion).

        If the request is finished, the worker that processed it is moved to the pool
        of free services.

        @param iter_no: current iteration number (for logging)
        @param sc: a ServiceConn object that stores the worker connection parameters
        @param req_portion: current data portion number (is None for jobs loading)
        @param req: the request itself

        @return: the value returned by the finished data processing request, or None \
            (for loading requests or unfinished requests)
        """
        result = None
        if req_portion is not None:
            log_debug('Checking %d' % req_portion)

        # checking if the request has finished
        if req.ready:
            # loading requests -- do nothing (just logging)
            if req_portion is None:
                if req.error:
                    log_info('Error loading on %s:%d' % (sc.host, sc.port))
                else:
                    log_info('Worker %s:%d finished loading.' % (sc.host, sc.port))
            # data processing request -- retrieve the value
            else:
                log_debug('Ready %d' % req_portion)
                log_info('Retrieved finished request %d / %d' % (iter_no, req_portion))
                if req.error:
                    log_info('Error found on request: IT %d PORTION %d, WORKER %s:%d' %
                             (iter_no, req_portion, sc.host, sc.port))
                result = pickle.loads(req.value)

            # add the worker to the pool of free services (both loading and data processing requests)
            self.pending_requests.remove((sc, req_portion, req))
            self.free_services.append(sc)

        if req_portion is not None:
            log_debug('Done with %d' % req_portion)
        return result
Example #25
 def exposed_register_worker(self, host, port):
     """Register a worker with my head, initialize it."""
     # initiate connection in the other direction
     log_info('Worker %s:%d connected, initializing training.' % (host, port))
     conn = connect(host, port, config={'allow_pickle': True})
     # initialize the remote server (with training data etc.)
     log_info('Ranker dump size: %d' % sys.getsizeof(ranker_dump))
     conn.root.init_training(ranker_dump)
     # add it to the list of running services
     sc = ServiceConn(host, port, conn)
     head.services.add(sc)
     head.free_services.append(sc)
     log_info('Worker %s:%d initialized.' % (host, port))
Example #26
    def _init_training(self, das_file, ttree_file, data_portion):
        """Initialize training (read input data, fix size, initialize candidate generator
        and planner)"""
        # read input
        log_info('Reading DAs from ' + das_file + '...')
        das = read_das(das_file)
        log_info('Reading t-trees from ' + ttree_file + '...')
        ttree_doc = read_ttrees(ttree_file)
        sents = sentences_from_doc(ttree_doc, self.language, self.selector)
        trees = trees_from_doc(ttree_doc, self.language, self.selector)

        # make training data smaller if necessary
        train_size = int(round(data_portion * len(trees)))
        self.train_trees = trees[:train_size]
        self.train_das = das[:train_size]
        self.train_sents = sents[:train_size]
        self.train_order = range(len(self.train_trees))
        log_info('Using %d training instances.' % train_size)

        # initialize candidate generator
        if self.candgen_model is not None:
            self.candgen = RandomCandidateGenerator.load_from_file(
                self.candgen_model)
#             self.sampling_planner = SamplingPlanner({'language': self.language,
#                                                      'selector': self.selector,
#                                                      'candgen': self.candgen})

        # check if A*search planner is needed (i.e., any rival generation strategy requires it)
        # and initialize it
        if isinstance(self.rival_gen_strategy[0], tuple):
            asearch_needed = any([
                s in ['gen_cur_weights', 'gen_update']
                for _, ss in self.rival_gen_strategy for s in ss
            ])
        else:
            asearch_needed = any([
                s in ['gen_cur_weights', 'gen_update']
                for s in self.rival_gen_strategy
            ])
        if asearch_needed:
            assert self.candgen is not None
            self.asearch_planner = ASearchPlanner({
                'candgen': self.candgen,
                'language': self.language,
                'selector': self.selector,
                'ranker': self,
            })
Example #27
def run_worker(head_host, head_port, debug_out=None):
    # setup debugging output, if applicable
    if debug_out is not None:
        set_debug_stream(file_stream(debug_out, mode='w'))
    # start the server (in the background)
    log_info('Creating worker server...')
    server = ThreadPoolServer(service=RankerTrainingService, nbThreads=1)
    server_thread = Thread(target=server.start)
    server_thread.start()
    my_host = socket.getfqdn()
    log_info('Worker server created at %s:%d. Connecting to head at %s:%d...' %
             (my_host, server.port, head_host, head_port))
    # notify main about this server
    conn = connect(head_host, head_port, config={'allow_pickle': True})
    conn.root.register_worker(my_host, server.port)
    conn.close()
    log_info('Worker is registered with the head.')
    # now serve until we're killed (the server thread will continue to run)
    server_thread.join()
Example #28
    def _init_training(self, das_file, ttree_file, data_portion):
        """Initialize training (read input data, fix size, initialize candidate generator
        and planner)"""
        # read input
        log_info('Reading DAs from ' + das_file + '...')
        das = read_das(das_file)
        log_info('Reading t-trees from ' + ttree_file + '...')
        ttree_doc = read_ttrees(ttree_file)
        sents = sentences_from_doc(ttree_doc, self.language, self.selector)
        trees = trees_from_doc(ttree_doc, self.language, self.selector)

        # make training data smaller if necessary
        train_size = int(round(data_portion * len(trees)))
        self.train_trees = trees[:train_size]
        self.train_das = das[:train_size]
        self.train_sents = sents[:train_size]
        self.train_order = range(len(self.train_trees))
        log_info('Using %d training instances.' % train_size)

        # initialize candidate generator
        if self.candgen_model is not None:
            self.candgen = RandomCandidateGenerator.load_from_file(self.candgen_model)
#             self.sampling_planner = SamplingPlanner({'language': self.language,
#                                                      'selector': self.selector,
#                                                      'candgen': self.candgen})

        # check if A*search planner is needed (i.e., any rival generation strategy requires it)
        # and initialize it
        if isinstance(self.rival_gen_strategy[0], tuple):
            asearch_needed = any([s in ['gen_cur_weights', 'gen_update']
                                  for _, ss in self.rival_gen_strategy
                                  for s in ss])
        else:
            asearch_needed = any([s in ['gen_cur_weights', 'gen_update']
                                  for s in self.rival_gen_strategy])
        if asearch_needed:
            assert self.candgen is not None
            self.asearch_planner = ASearchPlanner({'candgen': self.candgen,
                                                   'language': self.language,
                                                   'selector': self.selector,
                                                   'ranker': self, })
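
The strategy check above accepts either a flat list of strategy names or a list of (threshold, strategies) tuples. A small standalone illustration of the two accepted shapes (the values are made up):

def asearch_needed(rival_gen_strategy):
    # strategies come either as a flat list of names or as (threshold, [names]) tuples
    if rival_gen_strategy and isinstance(rival_gen_strategy[0], tuple):
        names = [s for _, ss in rival_gen_strategy for s in ss]
    else:
        names = rival_gen_strategy
    return any(s in ('gen_cur_weights', 'gen_update') for s in names)

print(asearch_needed(['random', 'gen_update']))                      # True
print(asearch_needed([(2, ['random']), (5, ['gen_cur_weights'])]))   # True
print(asearch_needed(['random']))                                    # False
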
Example #29
 def exposed_init_training(self, head_ranker_path):
     """(Worker) Just deep-copy all necessary attributes from the head instance."""
     tstart = time.time()
     log_info('Initializing training...')
     self.ranker_inst = load_ranker(head_ranker_path)
     log_info('Training initialized. Time taken: %f secs.' % (time.time() - tstart))
Example #30
    def train(self, das_file, ttree_file, data_portion=1.0):
        """Run parallel perceptron training, start and manage workers."""
        # initialize the ranker instance
        log_info('Initializing...')
        self.loc_ranker._init_training(das_file, ttree_file, data_portion)
        # run server to process registering clients
        self._init_server()
        # spawn training jobs
        log_info('Spawning jobs...')
        host_short, _ = self.host.split('.', 1)  # short host name for job names
        for j in xrange(self.jobs_number):
            # set up debugging logfile only if we have it on the head
            debug_logfile = ('"PRT%02d.debug-out.txt.gz"' % j) if is_debug_stream() else 'None'
            job = Job(header='from tgen.parallel_percrank_train import run_worker',
                      code=('run_worker("%s", %d, %s)' %
                            (self.host, self.port, debug_logfile)),
                      name=self.experiment_id + ("PRT%02d-%s-%d" % (j, host_short, self.port)),
                      work_dir=self.work_dir)
            job.submit(self.job_memory)
            self.jobs.append(job)
        # run the training passes
        try:
            for iter_no in xrange(1, self.loc_ranker.passes + 1):

                log_info('Pass %d...' % iter_no)
                log_debug('\n***\nTR%05d:' % iter_no)

                iter_start_time = time.time()
                cur_portion = 0
                results = [None] * self.data_portions
                w_dump = pickle.dumps(self.loc_ranker.get_weights(), protocol=pickle.HIGHEST_PROTOCOL)
                rnd_seeds = [rnd.random() for _ in xrange(self.data_portions)]
                # wait for free services / assign computation
                while cur_portion < self.data_portions or self.pending_requests:
                    log_debug('Starting loop over services.')

                    # check if some of the pending computations have finished
                    for sc, req_portion, req in list(self.pending_requests):
                        res = self._check_pending_request(iter_no, sc, req_portion, req)
                        if res:
                            results[req_portion] = res

                    # check for free services and assign new computation
                    while cur_portion < self.data_portions and self.free_services:
                        log_debug('Assigning request %d' % cur_portion)
                        sc = self.free_services.popleft()
                        log_info('Assigning request %d / %d to %s:%d' %
                                 (iter_no, cur_portion, sc.host, sc.port))
                        train_func = async(sc.conn.root.training_pass)
                        req = train_func(w_dump, iter_no, rnd_seeds[cur_portion],
                                         * self._get_portion_bounds(cur_portion))
                        self.pending_requests.add((sc, cur_portion, req))
                        cur_portion += 1
                        log_debug('Assigned %d' % cur_portion)
                    # sleep for a while
                    log_debug('Sleeping.')
                    time.sleep(self.poll_interval)

                # delete the temporary ranker dump when the 1st iteration is complete
                if self.ranker_dump_path:
                    log_info('Removing temporary ranker dump at %s.' % self.ranker_dump_path)
                    os.remove(self.ranker_dump_path)
                    self.ranker_dump_path = None

                # gather/average the diagnostic statistics
                self.loc_ranker.set_diagnostics_average([d for _, d in results])

                # take an average of weights; set it as new w
                self.loc_ranker.set_weights_average([w for w, _ in results])
                self.loc_ranker.store_iter_weights()  # store a copy of w for averaged perceptron

                # print statistics
                log_debug(self.loc_ranker._feat_val_str(), '\n***')
                self.loc_ranker._print_pass_stats(iter_no, datetime.timedelta(seconds=(time.time() - iter_start_time)))

            # after all passes: average weights if set to do so
            if self.loc_ranker.averaging is True:
                self.loc_ranker.set_weights_iter_average()
        # kill all jobs
        finally:
            for job in self.jobs:
                job.delete()
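
_get_portion_bounds is referenced above but not shown in this listing; a plausible stand-in that splits the training data into roughly equal contiguous chunks (an assumption about its behaviour, not taken from tgen) could look like this:

def get_portion_bounds(total_size, num_portions, portion_no):
    """Return (offset, length) of the portion_no-th of num_portions contiguous chunks."""
    base, rem = divmod(total_size, num_portions)
    offset = portion_no * base + min(portion_no, rem)
    length = base + (1 if portion_no < rem else 0)
    return offset, length

print(get_portion_bounds(10, 3, 0))  # (0, 4)
print(get_portion_bounds(10, 3, 1))  # (4, 3)
print(get_portion_bounds(10, 3, 2))  # (7, 3)
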
Example #31
 def load_from_file(model_fname):
     """Load a pre-trained model from a file."""
     log_info("Loading ranker from %s..." % model_fname)
     with file_stream(model_fname, 'rb', encoding=None) as fh:
         return pickle.load(fh)
Example #32
 def save_to_file(self, model_fname):
     """Save the model to a file."""
     log_info("Saving ranker to %s..." % model_fname)
     with file_stream(model_fname, 'wb', encoding=None) as fh:
         pickle.dump(self, fh, protocol=pickle.HIGHEST_PROTOCOL)
Example #33
 def train(self, train_file, work_dir, memory=8, encoding='UTF-8'):
     """\
     Read training data, split them and train the individual models
     (in cluster jobs).
     """
     # load the entire data set
     train = self.load_training_set(train_file, encoding)
     self.data_headers = train.get_headers()
     self.attr_mask = self.get_attr_mask()
     # train a backoff model
     log_info('Training a backoff model...')
     self.backoff_model = self.train_backoff_model(train)
     # split it
     log_info('Split...')
     train_split = train.split(eval(self.divide_func), keep_copy=False)
     jobs = []
     model_files = {}
     # save training files and create training jobs
     for key, subset in train_split.iteritems():
         fn = re.sub(r'(.arff(.gz)?)?$', '-' + key + '.arff.gz', train_file)
         fn = os.path.join(work_dir, os.path.basename(fn))
         subset.save_to_arff(fn, encoding)
         job, model_file = Model.create_training_job(self.config, work_dir,
                                                     fn, memory=memory,
                                                     encoding=encoding)
         jobs.append(job)
         model_files[key] = model_file
     # submit the training jobs and wait for all of them
     log_info('Submitting training jobs...')
     for job in jobs:
         job.submit()
     log_info('Waiting for jobs...')
     for job in jobs:
         job.wait()
     # load all models
     log_info('Training complete. Assembling model files...')
     for key, model_file in model_files.iteritems():
         self.models[key] = Model.load_from_file(model_file)
     self.trained = True
     log_info('Training done.')
Example #34
 def _print_pass_stats(self, pass_no, pass_duration):
     """Print pass statistics from internal evaluator fields and given pass duration."""
     log_info('Pass %05d -- tree-level accuracy: %.4f' % (pass_no, self.evaluator.tree_accuracy()))
     log_info(' * Generated trees NODE scores: P: %.4f, R: %.4f, F: %.4f' %
              self.evaluator.p_r_f1())
     log_info(' * Generated trees DEP  scores: P: %.4f, R: %.4f, F: %.4f' %
              self.evaluator.p_r_f1(EvalTypes.DEP))
     log_info(' * Gold tree BEST: %.4f, on CLOSE: %.4f, on ANY list: %.4f' %
              self.lists_analyzer.stats())
     log_info(' * Tree size stats:\n -- GOLD: %s\n -- PRED: %s\n -- DIFF: %s' %
              self.evaluator.size_stats())
     log_info(' * Common subtree stats:\n -- SIZE: %s\n -- ΔGLD: %s\n -- ΔPRD: %s' %
              self.evaluator.common_substruct_stats())
     log_info(' * Score stats\n -- GOLD: %s\n -- PRED: %s\n -- DIFF: %s'
              % self.evaluator.score_stats())
     log_info(' * Duration: %s' % str(pass_duration))