Example #1
    def generate_one_preprocessed_sample(self):
        """ Yield preprocessed samples from the dataset one by one.

        Relies on self.preprocessor, which must implement a `preprocess`
        method. Samples are preprocessed on the first full pass and cached
        in self._preprocessed_samples; later passes replay from memory.

        Yields:
            (preprocessed_sentence, entities, data) tuples, one per sample.
        """
        if len(self._preprocessed_samples) == len(self._data):
            # The cache is complete: replay it without recomputing.
            for preprocessed_sentence, entities, data in self._preprocessed_samples:
                logging.debug(
                    "Yielding following data from memory:\n"
                    "preprocessed_sentence: %s\nentities: %s\ndata: %s",
                    preprocessed_sentence, entities, data)
                yield preprocessed_sentence, entities, data
        else:
            # (Re)build the cache while yielding.
            self._preprocessed_samples = []
            for data in self.generate_one_sample():
                if self.data_type == "one_line_one_sentence":
                    preprocessed_sentence, entities = self.preprocessor.preprocess(
                        data)
                    logging.debug(
                        "Yielding and saving following data:\n"
                        "preprocessed_sentence: %s\nentities: %s\ndata: %s",
                        preprocessed_sentence, entities, data)
                    self._preprocessed_samples.append(
                        (preprocessed_sentence, entities, data))
                    yield preprocessed_sentence, entities, data
                else:
                    raise NotImplementedError
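
The compute-once, replay-from-memory pattern above is worth isolating. A minimal self-contained sketch of the same idea; `CachingGenerator`, its `_cache` list, and the `str.lower` stand-in for `preprocessor.preprocess` are illustrative names, not part of the original code:

class CachingGenerator:
    """Sketch of the compute-once, replay-from-memory generator pattern."""

    def __init__(self, data):
        self._data = data
        self._cache = []

    def generate(self):
        if len(self._cache) == len(self._data):
            # Cache is complete: replay previously computed results.
            for item in self._cache:
                yield item
        else:
            # First (or interrupted) pass: compute, cache, and yield.
            self._cache = []
            for raw in self._data:
                processed = raw.lower()  # stand-in for preprocessor.preprocess
                self._cache.append(processed)
                yield processed

gen = CachingGenerator(["A B", "C D"])
first = list(gen.generate())   # computed and cached
second = list(gen.generate())  # replayed from the cache
assert first == second

Note that if a pass is abandoned midway, the cache stays shorter than the data, so the next call rebuilds it from scratch, exactly as the length check in the original method implies.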
Example #2
    def generate_one_sample(self):
        """ Yield raw data samples one by one. """
        if self.data_type == "one_line_one_sentence":
            for data in self._data:
                logging.debug("Yielding sentence: %s", data)
                yield data
        else:
            raise NotImplementedError
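
A detail shared by the logging calls in these snippets: `logging.debug("...".format(...))` builds the message string even when DEBUG is disabled, and the Python 2 era `.encode("utf8")` calls would log byte reprs under Python 3. The stdlib's lazy %-style interpolation sidesteps both; a minimal sketch:

import logging

logging.basicConfig(level=logging.INFO)
sentence = "пример"  # a plain unicode str; no .encode needed in Python 3
# The %s argument is only interpolated if DEBUG is enabled, so this
# line is nearly free when the logger runs at INFO.
logging.debug("Yielding sentence: %s", sentence)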
Example #3
    def _train(cls, config, train_state, examples):
        model = train_state.model
        optimizer = train_state.optimizer
        train_batches = similar_size_batches(examples.train,
                                             config.optim.batch_size,
                                             size=lambda ex: len(ex))

        while True:
            random.shuffle(train_batches)
            i = 0  # cannot enumerate(verboserate(...))
            for batch in verboserate(train_batches,
                                     desc='Streaming training examples'):
                loss = model.loss(batch, train_state.train_steps)
                cls._take_grad_step(train_state, loss)
                if (i % 5) == 0:  # evaluate every 5 batches
                    cls.evaluate()
                if (i % 1000) == 0:
                    if config.model.type == 1:  # SVAE
                        # write interpolations to file
                        fname = "interps_batches_{}".format(i)
                        num_ex = 10
                        a_idx = np.random.randint(len(batch), size=num_ex)
                        b_idx = np.random.randint(len(batch), size=num_ex)
                        interps = []
                        for a, b in zip(a_idx, b_idx):
                            ex_a = batch[a]
                            ex_b = batch[b]
                            interpolation = model._interpolate_examples(
                                ex_a, ex_b)
                            interpolation_repr = []
                            interpolation_repr.append(" ".join(ex_a))
                            interpolation_repr.extend(
                                [" ".join(ex) for ex in interpolation])
                            interpolation_repr.append(" ".join(ex_b))
                            interps.append(interpolation_repr)
                        with open(join(cls._interps_dir, fname), 'w',
                                  encoding='utf-8') as fout:
                            data = "\n\n".join(
                                ["\n".join(ex) for ex in interps])
                            fout.write(data)
                if (i % 5000) == 0:  # checkpoint every 5000 batches
                    cls.checkpoints.save(train_state)
                i += 1
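
`similar_size_batches` is not shown on this page; from its call signature it appears to group examples of similar length into batches, which reduces padding waste when variable-length sequences are batched together. A minimal sort-and-chunk sketch under that assumption (the real helper may balance batch contents differently):

def similar_size_batches(examples, batch_size, size=len):
    """Group examples of similar size to reduce padding. Illustrative
    sketch only; not the actual implementation used above."""
    ordered = sorted(examples, key=size)
    return [ordered[i:i + batch_size]
            for i in range(0, len(ordered), batch_size)]

sents = [["a"], ["b", "c", "d"], ["e", "f"], ["g"]]
print(similar_size_batches(sents, batch_size=2, size=lambda ex: len(ex)))
# [[['a'], ['g']], [['e', 'f'], ['b', 'c', 'd']]]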
Example #4
    def preprocess_all(self, force_preprocessing):
        """ Preprocess all the data, or load them from a pickle file.

        Args:
            force_preprocessing: Bool, whether to force preprocessing or to
                auto-load from the pickle file if it exists.
        """
        pickle_path = self._file_path + "_preprocessed.pickle"
        if not force_preprocessing:
            try:
                with open(pickle_path, "rb") as f:
                    self._preprocessed_samples = pickle.load(f)
                    logging.info(
                        "Preprocessed sample pickle file successfully loaded.")
                    return True
            except IOError:
                logging.info("Couldn't find a preprocessed pickle file.")

        logging.info("Preprocessing all the data and saving a new pickle.")
        for data in self.generate_one_sample():
            if self.data_type == "one_line_one_sentence":
                preprocessed_sentence, entities = self.preprocessor.preprocess(
                    data)
                logging.debug(
                    "Saving Following Preprocessed Data:\npreprocessed_sentence:{}\nentities:{}\ndata:{}"
                    .format(preprocessed_sentence.encode("utf8"), entities,
                            data.encode("utf8")))
                self._preprocessed_samples.append(
                    (preprocessed_sentence, entities, data))
            else:
                raise NotImplementedError

        with open(pickle_path, "wb") as f:
            pickle.dump(self._preprocessed_samples, f)
        logging.info(
            "Preprocessing done and preprocessed samples saved in `%s`.",
            pickle_path)
        return True
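
The try-to-load, otherwise build-and-dump caching around `pickle` generalizes beyond this class. A minimal self-contained sketch; `load_or_build` and the `/tmp/demo.pickle` path are illustrative, not from the original code:

import logging
import pickle

def load_or_build(path, build):
    """Return the pickled object at `path`, or build, cache, and return it."""
    try:
        with open(path, "rb") as f:
            logging.info("Loaded cached result from %s", path)
            return pickle.load(f)
    except IOError:  # alias of OSError in Python 3; covers a missing file
        logging.info("No cache at %s; building from scratch.", path)
    result = build()
    with open(path, "wb") as f:
        pickle.dump(result, f)
    return result

samples = load_or_build("/tmp/demo.pickle", lambda: [1, 2, 3])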