Example #1
def check_dataset_and_coders(dataset, runners):
    # pylint: disable=protected-access

    data_list = []
    for runner in runners:
        for c in runner.all_coders:
            if hasattr(c, "data_id"):
                data_list.append((c.data_id, c))
            elif hasattr(c, "data_ids"):
                data_list.extend([(d, c) for d in c.data_ids])
            else:
                warn(("Coder: {} does not have" "a data attribute").format(c))

    debug("Found series: {}".format(str(data_list)), "checking")
    missing = []

    for (serie, coder) in data_list:
        if not dataset.has_series(serie):
            log("dataset {} does not have serie {}".format(
                dataset.name, serie))
            missing.append((coder, serie))

    if len(missing) > 0:
        formatted = [
            "{} ({}, {}.{})".format(serie, cod.name, cod.__class__.__module__,
                                    cod.__class__.__name__)
            for cod, serie in missing
        ]

        raise CheckingException("Dataset '{}' is missing series {}".format(
            dataset.name, ", ".join(formatted)))
Example #2
def from_dataset(datasets: List[Dataset],
                 series_ids: List[str],
                 max_size: int,
                 save_file: str = None,
                 overwrite: bool = False,
                 min_freq: Optional[int] = None,
                 unk_sample_prob: float = 0.5) -> 'Vocabulary':
    """Loads vocabulary from a dataset with an option to save it.

    Arguments:
        datasets: A list of datasets from which to create the vocabulary
        series_ids: A list of ids of series of the datasets that should be
                    used for producing the vocabulary
        max_size: The maximum size of the vocabulary
        save_file: A file to save the vocabulary to. If None (default),
                   the vocabulary will not be saved.
        overwrite: Overwrite existing file.
        min_freq: Do not include words with frequency smaller than this.
        unk_sample_prob: The probability with which to sample unks out of
                         words with frequency 1. Defaults to 0.5.

    Returns:
        The new Vocabulary instance.
    """
    check_argument_types()

    vocabulary = Vocabulary(unk_sample_prob=unk_sample_prob)

    for dataset in datasets:
        if isinstance(dataset, LazyDataset):
            warn("Inferring vocabulary from lazy dataset!")

        for series_id in series_ids:
            if not dataset.has_series(series_id):
                warn("Data series '{}' not present in the dataset".format(
                    series_id))

            series = dataset.get_series(series_id, allow_none=True)
            if series:
                vocabulary.add_tokenized_text(
                    [token for sent in series for token in sent])

    vocabulary.truncate(max_size)

    if min_freq is not None:
        if min_freq > 1:
            vocabulary.truncate_by_min_freq(min_freq)

    log("Vocabulary for series {} initialized, containing {} words".format(
        series_ids, len(vocabulary)))

    vocabulary.log_sample()

    if save_file is not None:
        directory = os.path.dirname(save_file)
        if directory and not os.path.exists(directory):
            os.makedirs(directory)
        vocabulary.save_to_file(save_file, overwrite)

    return vocabulary
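
A hypothetical usage sketch of from_dataset; the import paths, file names, and the "source" series below are illustrative assumptions, not part of the example above.

# Assumed import locations; the actual package layout may differ.
from neuralmonkey.dataset import load_dataset_from_files
from neuralmonkey.vocabulary import from_dataset

# Hypothetical training data exposing a single "source" series.
train_data = load_dataset_from_files(name="train", s_source="data/train.src")

vocab = from_dataset(datasets=[train_data],
                     series_ids=["source"],
                     max_size=30000,
                     save_file="output/vocabulary.tsv",
                     overwrite=True,
                     min_freq=2)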
Example #3

def reader(files: List[str]) -> Iterable[List[str]]:
    column_count = None
    text_reader = string_reader(encoding)
    for line in text_reader(files):
        io_line = io.StringIO(line.strip())
        if quotechar is not None:
            parsed_csv = list(
                csv.reader(io_line,
                           delimiter=delimiter,
                           quotechar=quotechar,
                           skipinitialspace=True))
        else:
            parsed_csv = list(
                csv.reader(io_line,
                           delimiter=delimiter,
                           quoting=csv.QUOTE_NONE,
                           skipinitialspace=True))
        columns = len(parsed_csv[0])
        if column_count is None:
            column_count = columns
        elif column_count != columns:
            warn("A mismatch in number of columns. Expected {} got {}".
                 format(column_count, columns))
        if columns < column:
            warn("There is a missing column number {} in the dataset.".
                 format(column))
            yield []
        else:
            yield parsed_csv[0][column - 1].split()
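
The parsing step above can be tried in isolation with the standard library only; delimiter, quotechar, and column are closure variables of the surrounding reader factory, so illustrative values are substituted here.

import csv
import io

line = 'hello world\t"a, quoted\tfield"\tlast'
parsed_csv = list(csv.reader(io.StringIO(line.strip()),
                             delimiter="\t", quotechar='"',
                             skipinitialspace=True))
column = 2                                # 1-based, as in the reader above
print(parsed_csv[0][column - 1].split())  # ['a,', 'quoted', 'field']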
Example #4
def load_dataset_from_files(name: str,
                            lazy: bool = False,
                            preprocessors: List[Tuple[str, str,
                                                      Callable]] = None,
                            **kwargs) -> "Dataset":
    """Load a dataset from the files specified by the provided arguments.

    Paths to the data are provided in a form of dictionary.

    Keyword arguments:
        name: The name of the dataset to use. If None (default), the name will
              be inferred from the file names.
        lazy: Boolean flag specifying whether to use lazy loading (useful for
              large files). Note that the lazy dataset cannot be shuffled.
              Defaults to False.
        preprocessors: A list of (source series, target series, callable)
                       tuples defining series-level preprocessors.
        kwargs: Dataset keyword argument specs. These parameters should begin
                with the 's_' prefix and may end with the '_out' suffix.  For
                example, a data series 'source' which specifies the source
                sentences should be initialized with the 's_source' parameter,
                which specifies the path and optionally the reader of the
                source file. If runners generate data of the 'target' series,
                the output file should be initialized with the 's_target_out'
                parameter.
                Series identifiers should not contain underscores.
                Dataset-level preprocessors are defined with 'pre_' prefix
                followed by a new series name. In case of the pre-processed
                series, a callable taking the dataset and returning a new
                series is expected as a value.

    Returns:
        The newly created dataset.
    """
    warn("Use of deprecated function. Consider using dataset.load instead.")
    check_argument_types()

    series_paths_and_readers = _get_series_paths_and_readers(kwargs)
    outputs = _get_series_outputs(kwargs)

    if not series_paths_and_readers:
        raise ValueError("No input files were provided.")

    series, data = [list(x) for x in zip(*series_paths_and_readers.items())]

    # Series-level preprocessors
    if preprocessors:
        for src, tgt, fun in preprocessors:
            series.append(tgt)
            data.append((fun, src))

    # Dataset-level preprocessors
    keys = [key for key in kwargs if PREPROCESSED_SERIES.match(key)]

    for key in keys:
        s_name = get_first_match(PREPROCESSED_SERIES, key)
        preprocessor = cast(DatasetPreprocess, kwargs[key])
        series.append(s_name)
        data.append(preprocessor)

    buffer_size = None if not lazy else 5000
    return load(name, series, data, outputs, buffer_size, False)
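
A hedged sketch of the 's_' keyword convention described in the docstring; the paths and series names below are made up.

# 's_<series>' points at an input file, 's_<series>_out' at an output path.
val_data = load_dataset_from_files(name="val",
                                   lazy=False,
                                   s_source="data/val.src",
                                   s_target="data/val.tgt",
                                   s_target_out="output/val.translated")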
Example #5
    def __init__(self,
                 output_series: str,
                 encoder: GenericModelPart,
                 attribute: str = "output",
                 select_session: int = None) -> None:
        """Initialize the representation runner.

        Args:
            output_series: Name of the output series with vectors.
            encoder: The encoder to use. This can be any ``GenericModelPart``
                object.
            attribute: The name of the encoder attribute that contains the
                data.
            select_session: Id of the TensorFlow session used in case of
                model ensembles.
        """
        check_argument_types()

        if attribute not in dir(encoder):
            warn("The encoder '{}' seems not to have the specified "
                 "attribute '{}'".format(encoder, attribute))

        TensorRunner.__init__(self,
                              output_series,
                              modelparts=[encoder],
                              tensors=[attribute],
                              batch_dims=[0],
                              tensors_by_name=[],
                              batch_dims_by_name=[],
                              select_session=select_session,
                              single_tensor=True)
Example #6

    def func(
            train_mode: tf.Tensor,
            rnn_size: int,
            encoders: List[TemporalStatefulWithOutput]) -> tf.Tensor:

        if len(encoders) != 1:
            raise ValueError("Exactly one encoder required for this type of "
                             "projection. {} given.".format(len(encoders)))
        encoder = encoders[0]

        # shape (batch, time)
        masked_sum = tf.reduce_sum(
            encoder.temporal_states
            * tf.expand_dims(encoder.temporal_mask, 2), 1)

        # shape (batch, 1)
        lengths = tf.reduce_sum(encoder.temporal_mask, 1, keep_dims=True)

        means = masked_sum / lengths
        means = dropout(means, dropout_keep_prob, train_mode)

        encoder_rnn_size = means.get_shape()[1].value

        kernel_initializer = orthogonal_initializer()
        if encoder_rnn_size != rnn_size:
            kernel_initializer = tf.glorot_normal_initializer()
            warn("Using nematus projection on nonequal encoder and decoder "
                 "state sizes ({} vs {})".format(encoder_rnn_size, rnn_size))

        return tf.layers.dense(means, rnn_size,
                               activation=tf.tanh,
                               kernel_initializer=kernel_initializer,
                               name="encoders_projection")
Example #7
def reader(files: List[str]) -> Iterable[List[str]]:
    column_count = None
    text_reader = string_reader(encoding)
    for line in text_reader(files):
        io_line = io.StringIO(line.strip())
        if quotechar is not None:
            parsed_csv = list(csv.reader(io_line, delimiter=delimiter,
                                         quotechar=quotechar,
                                         skipinitialspace=True))
        else:
            parsed_csv = list(csv.reader(io_line, delimiter=delimiter,
                                         quoting=csv.QUOTE_NONE,
                                         skipinitialspace=True))
        columns = len(parsed_csv[0])
        if column_count is None:
            column_count = columns
        elif column_count != columns:
            warn("A mismatch in number of columns. Expected {} got {}"
                 .format(column_count, columns))
        if columns < column:
            warn("There is a missing column number {} in the dataset."
                 .format(column))
            yield []
        else:
            yield parsed_csv[0][column - 1].split()
Example #8
    def __init__(self,
                 output_series: str,
                 encoder: GenericModelPart,
                 attribute: str = "output",
                 select_session: int = None) -> None:
        """Initialize the representation runner.

        Args:
            output_series: Name of the output series with vectors.
            encoder: The encoder to use. This can be any ``GenericModelPart``
                object.
            attribute: The name of the encoder attribute that contains the
                data.
            select_session: Id of the TensorFlow session used in case of
                model ensembles.
        """
        check_argument_types()

        if attribute not in dir(encoder):
            warn("The encoder '{}' seems not to have the specified "
                 "attribute '{}'".format(encoder, attribute))

        TensorRunner.__init__(
            self,
            output_series,
            modelparts=[encoder],
            tensors=[attribute],
            batch_dims=[0],
            tensors_by_name=[],
            batch_dims_by_name=[],
            select_session=select_session,
            single_tensor=True)
Example #9
    def __init__(self,
                 wrapper: str,
                 name: str = "MultEval",
                 encoding: str = "utf-8",
                 metric: str = "bleu",
                 language: str = "en") -> None:
        """Initialize the wrapper.

        Arguments:
            wrapper: Path to multeval.sh script
            name: Name of the evaluator
            encoding: Encoding of input files
            language: Language of hypotheses and references
            metric: Evaluation metric "bleu", "ter", "meteor"
        """
        check_argument_types()
        super().__init__("{}_{}_{}".format(name, metric, language))

        self.wrapper = wrapper
        self.encoding = encoding
        self.language = language
        self.metric = metric

        if self.metric not in ["bleu", "ter", "meteor"]:
            warn("{} metric is not valid. Using bleu instead.".
                 format(self.metric))
            self.metric = "bleu"
Example #10
def from_wordlist(path: str,
                  encoding: str = "utf-8",
                  contains_header: bool = True,
                  contains_frequencies: bool = True) -> "Vocabulary":
    """Load a vocabulary from a wordlist.

    The file can contain either a plain list of words with no header, or
    words and their counts separated by a tab, with a header on the first
    line.

    Arguments:
        path: The path to the wordlist file
        encoding: The encoding of the wordlist file (defaults to UTF-8)
        contains_header: Whether the file has a header on the first line
        contains_frequencies: Whether the file contains frequencies in the
                              second column

    Returns:
        The new Vocabulary instance.
    """
    vocabulary = Vocabulary()

    with open(path, encoding=encoding) as wordlist:
        line_number = 1
        if contains_header:
            # skip the header
            line_number += 1
            next(wordlist)

        for line in wordlist:
            line = line.strip()
            # check if line is empty
            if not line:
                warn("Vocabulary file {}:{}: line empty".format(
                    path, line_number))
                line_number += 1
                continue

            if contains_frequencies:
                info = line.split("\t")
                if len(info) != 2:
                    raise ValueError(
                        "Vocabulary file {}:{}: line does not have two columns"
                        .format(path, line_number))
                vocabulary.add_word(info[0])
            else:
                if "\t" in line:
                    warn("Vocabulary file {}:{}: line contains a tabulator"
                         .format(path, line_number))
                vocabulary.add_word(line)

            line_number += 1

    log("Vocabulary from wordlist loaded, containing {} words".format(
        len(vocabulary)))
    vocabulary.log_sample()
    return vocabulary
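
A minimal sketch of the wordlist format this loader expects, together with the corresponding call; the file contents are illustrative.

# vocab.tsv (header on the first line, tab-separated counts):
#     word<TAB>count
#     the<TAB>154211
#     of<TAB>90122
vocab = from_wordlist("vocab.tsv",
                      encoding="utf-8",
                      contains_header=True,
                      contains_frequencies=True)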
Example #11
def rnn_layer(rnn_input: tf.Tensor, lengths: tf.Tensor, rnn_spec: RNNSpec,
              add_residual: bool) -> Tuple[tf.Tensor, tf.Tensor]:
    """Construct a RNN layer given its inputs and specs.

    Arguments:
        rnn_inputs: The input sequence to the RNN.
        lengths: Lengths of input sequences.
        rnn_spec: A valid RNNSpec tuple specifying the network architecture.
        add_residual: Add residual connections to the layer output.
    """
    if rnn_spec.direction == "bidirectional":
        fw_cell = _make_rnn_cell(rnn_spec)
        bw_cell = _make_rnn_cell(rnn_spec)

        outputs_tup, states_tup = tf.nn.bidirectional_dynamic_rnn(
            fw_cell,
            bw_cell,
            rnn_input,
            sequence_length=lengths,
            dtype=tf.float32)

        outputs = tf.concat(outputs_tup, 2)

        if rnn_spec.cell_type == "LSTM":
            states_tup = (state.h for state in states_tup)

        final_state = tf.concat(list(states_tup), 1)
    else:
        if rnn_spec.direction == "backward":
            rnn_input = tf.reverse_sequence(rnn_input, lengths, seq_axis=1)

        cell = _make_rnn_cell(rnn_spec)
        outputs, final_state = tf.nn.dynamic_rnn(cell,
                                                 rnn_input,
                                                 sequence_length=lengths,
                                                 dtype=tf.float32)

        if rnn_spec.direction == "backward":
            outputs = tf.reverse_sequence(outputs, lengths, seq_axis=1)

        if rnn_spec.cell_type == "LSTM":
            final_state = final_state.h

    if add_residual:
        if outputs.get_shape()[-1].value != rnn_input.get_shape()[-1].value:
            warn("Size of the RNN layer input ({}) and layer output ({}) "
                 "must match when applying residual connection. Reshaping "
                 "the rnn output using linear projection.".format(
                     outputs.get_shape(), rnn_input.get_shape()))
            # pylint: disable=redefined-variable-type
            outputs = tf.layers.dense(outputs, rnn_input.shape.as_list()[-1])
            # pylint: enable=redefined-variable-type
        outputs += rnn_input

    return outputs, final_state
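
A hedged TF1-style sketch of calling rnn_layer; the RNNSpec field names (size, direction, cell_type) are inferred from the attribute accesses above and should be treated as assumptions.

import tensorflow as tf

spec = RNNSpec(size=256, direction="bidirectional", cell_type="GRU")

# (batch, time, features) inputs and per-example lengths.
rnn_input = tf.placeholder(tf.float32, [None, None, 512])
lengths = tf.placeholder(tf.int32, [None])

outputs, final_state = rnn_layer(rnn_input, lengths, spec, add_residual=False)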
Example #12
def initialize_model(tf_manager: TensorFlowManager,
                     initial_variables: Optional[List[str]],
                     executables: List[GraphExecutor]):

    if initial_variables is None:
        # Assume we don't look at coder checkpoints when global
        # initial variables are supplied
        tf_manager.initialize_model_parts(executables, save=True)
    else:
        try:
            tf_manager.restore(initial_variables)
        except tf.errors.NotFoundError:
            warn("Some variables were not found in checkpoint.)")
Example #13

    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = ModelPart.feed_dict(self, dataset, train)

        alignment = dataset.maybe_get_series(self.data_id)
        if alignment is None:
            if train:
                warn("Training alignment not present!")

            alignment = np.zeros((len(dataset), self.decoder.max_output_len,
                                  self.enc_input.max_length), np.float32)

        fd[self.ref_alignment] = alignment

        return fd
Example #14
    def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
        fd = {}

        alignment = dataset.get_series(self.data_id, allow_none=True)
        if alignment is None:
            if train:
                warn("Training alignment not present!")

            alignment = np.zeros((len(dataset), self.decoder.max_output_len,
                                  self.encoder.max_input_len), np.float32)

        fd[self.ref_alignment] = alignment

        return fd
Example #15
    def embedding_size(self) -> int:
        if self.embeddings_source is None:
            if self._embedding_size is None:
                raise ValueError(
                    "You must specify either embedding size or the embedded "
                    "sequence from which to reuse the embeddings (e.g. set "
                    "'embedding_size' or 'embeddings_source' parameter)")
            return self._embedding_size

        if self.embeddings_source is not None:
            if self._embedding_size is not None:
                warn("Overriding the embedding_size parameter with the "
                     "size of the reused embeddings from the encoder.")

        return self.embeddings_source.embedding_matrix.get_shape()[1].value
Example #16
    def embedding_size(self) -> int:
        if self.embeddings_source is None:
            if self._embedding_size is None:
                raise ValueError(
                    "You must specify either embedding size or the embedded "
                    "sequence from which to reuse the embeddings (e.g. set "
                    "'embedding_size' or 'embeddings_source' parameter)")
            return self._embedding_size

        if self.embeddings_source is not None:
            if self._embedding_size is not None:
                warn("Overriding the embedding_size parameter with the "
                     "size of the reused embeddings from the encoder.")

        return self.embeddings_source.embedding_matrix.get_shape()[1].value
Example #17
    def __init__(self,
                 name: str,
                 reuse: "Parameterized" = None,
                 save_checkpoint: str = None,
                 load_checkpoint: str = None,
                 initializers: InitializerSpecs = None) -> None:
        """Construct a new parameterized object.

        Arguments:
            name: The name for the model part. Will be used in the variable
                and name scopes.
            reuse: Optional parameterized part with which to share parameters.
            save_checkpoint: Optional path to a checkpoint file which will
                store the parameters of this object.
            load_checkpoint: Optional path to a checkpoint file from which to
                load initial variables for this object.
            initializers: An `InitializerSpecs` instance with specification
                of the initializers.
        """
        self._name = name
        self._save_checkpoint = save_checkpoint
        self._load_checkpoint = load_checkpoint

        self._saver = None  # type: tf.train.Saver
        self._reuse = reuse is not None

        if reuse is not None:
            # pylint: disable=unidiomatic-typecheck
            # Here we need an exact match of types
            if type(self) != type(reuse):
                warn("Warning: sharing parameters between model parts of "
                     "different types.")
            # pylint: enable=unidiomatic-typecheck

            if initializers is not None:
                raise ValueError("Cannot use initializers in model part '{}' "
                                 "that reuses variables from '{}'.".format(
                                     name, reuse.name))

            # pylint: disable=protected-access
            self._variable_scope = reuse._variable_scope  # type: ignore
            # pylint: enable=protected-access
        else:
            with tf.variable_scope(name) as scope:
                self._variable_scope = scope
                if initializers is not None:
                    update_initializers((scope.name + "/" + name, initializer)
                                        for name, initializer in initializers)
Example #18
    def __init__(self,
                 name: str,
                 reuse: "Parameterized" = None,
                 save_checkpoint: str = None,
                 load_checkpoint: str = None,
                 initializers: InitializerSpecs = None) -> None:
        """Construct a new parameterized object.

        Arguments:
            name: The name for the model part. Will be used in the variable
                and name scopes.
            reuse: Optional parameterized part with which to share parameters.
            save_checkpoint: Optional path to a checkpoint file which will
                store the parameters of this object.
            load_checkpoint: Optional path to a checkpoint file from which to
                load initial variables for this object.
            initializers: An `InitializerSpecs` instance with specification
                of the initializers.
        """
        self._name = name
        self._save_checkpoint = save_checkpoint
        self._load_checkpoint = load_checkpoint

        self._saver = None  # type: tf.train.Saver
        self._reuse = reuse is not None

        if reuse is not None:
            # pylint: disable=unidiomatic-typecheck
            # Here we need an exact match of types
            if type(self) != type(reuse):
                warn("Warning: sharing parameters between model parts of "
                     "different types.")
            # pylint: enable=unidiomatic-typecheck

            if initializers is not None:
                raise ValueError("Cannot use initializers in model part '{}' "
                                 "that reuses variables from '{}'."
                                 .format(name, reuse.name))

            # pylint: disable=protected-access
            self._variable_scope = reuse._variable_scope  # type: ignore
            # pylint: enable=protected-access
        else:
            with tf.variable_scope(name) as scope:
                self._variable_scope = scope
                if initializers is not None:
                    update_initializers((scope.name + "/" + name, initializer)
                                        for name, initializer in initializers)
Example #19
    def regularization_losses(self) -> Tuple[tf.Tensor, tf.Tensor]:
        """Compute the regularization losses, e.g. L1 and L2."""
        regularizable = [v for v in tf.trainable_variables()
                         if not BIAS_REGEX.findall(v.name)
                         and not v.name.startswith("vgg")
                         and not v.name.startswith("Inception")
                         and not v.name.startswith("resnet")]

        if not regularizable:
            warn("It seems that there are no trainable variables in the model")
            return tf.zeros([]), tf.zeros([])

        with tf.name_scope("regularization"):
            l1_norm = sum(tf.reduce_sum(abs(v)) for v in regularizable)
            l2_norm = sum(tf.reduce_sum(v ** 2) for v in regularizable)

        return l1_norm, l2_norm
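
The two norms above reduce to a sum of absolute values and a sum of squares over the regularizable variables; a small NumPy illustration of the same quantities:

import numpy as np

weights = [np.array([0.5, -1.0]), np.array([[2.0, 0.0], [0.0, -3.0]])]

l1_norm = sum(np.sum(np.abs(w)) for w in weights)  # 1.5 + 5.0 = 6.5
l2_norm = sum(np.sum(w ** 2) for w in weights)     # 1.25 + 13.0 = 14.25

# A trainer would typically add l1_weight * l1_norm + l2_weight * l2_norm
# to the training objective.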
Example #20
def initialize_vocabulary(directory: str,
                          name: str,
                          datasets: List[Dataset] = None,
                          series_ids: List[str] = None,
                          max_size: int = None) -> "Vocabulary":
    """Initialize a vocabulary.

    This function is supposed to initialize vocabulary when called from
    the configuration file. It first checks whether the vocabulary is already
    loaded on the provided path and if not, it tries to generate it from
    the provided dataset.

    Args:
        directory: Directory where the vocabulary should be stored.

        name: Name of the vocabulary, which is also the name of the file
              it is stored in.

        datasets: A list of datasets from which the vocabulary can be
                  created.

        series_ids: A list of ids of series of the datasets that should be used
                    for producing the vocabulary.

        max_size: The maximum size of the vocabulary

    Returns:
        The new vocabulary
    """
    warn("Use of deprecated initialize_vocabulary method. "
         "Did you think this through?")

    file_name = os.path.join(directory, name + ".pickle")
    if os.path.exists(file_name):
        return from_wordlist(file_name)

    if datasets is None or series_ids is None or max_size is None:
        raise Exception("Vocabulary does not exist in '{}', "
                        "neither dataset and series_id were provided.")

    return from_dataset(datasets,
                        series_ids,
                        max_size,
                        save_file=file_name,
                        overwrite=False)
Example #21
    def truncate(self, size: int) -> None:
        """Truncate the vocabulary to the requested size.

        The infrequent tokens are discarded.

        Arguments:
            size: The final size of the vocabulary
        """

        if not self.correct_counts:
            raise ValueError("The vocabulary does not have correct "
                             "word_counts to use for vocabulary truncate")

        # sort by frequency
        # sorting words first makes vocabulary generation deterministic
        words_by_freq = sorted(list(sorted(self.word_count.keys())),
                               key=lambda w: self.word_count[w])

        # keep the least frequent words which are not special symbols
        to_delete = len(self) - size
        if to_delete < 0:
            to_delete = 0
            warn("Actual vocabulary size ({}) is smaller than max_size ({})".
                 format(len(self), size))
        words_to_delete = []  # type: List[str]
        for word in words_by_freq:
            if len(words_to_delete) == to_delete:
                break
            if not is_special_token(word):
                words_to_delete.append(word)

        # sort by index ... bigger indices need to be removed first
        # to keep the lists properly shaped
        delete_words_by_index = sorted([(w, self.word_to_index[w])
                                        for w in words_to_delete],
                                       key=lambda p: -p[1])

        for word, index in delete_words_by_index:
            del self.word_count[word]
            del self.index_to_word[index]

        self.word_to_index = {}
        for index, word in enumerate(self.index_to_word):
            self.word_to_index[word] = index
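
A pure-Python illustration of the removal order implemented above: words are sorted alphabetically first (for determinism), then by frequency, and the least frequent non-special words are deleted first.

word_count = {"the": 50, "cat": 3, "sat": 3, "zzz": 1, "<pad>": 0}

def is_special_token(word):       # stand-in for the real helper
    return word.startswith("<")

words_by_freq = sorted(sorted(word_count), key=lambda w: word_count[w])
# ['<pad>', 'zzz', 'cat', 'sat', 'the']

to_delete = [w for w in words_by_freq if not is_special_token(w)][:2]
print(to_delete)  # ['zzz', 'cat'] -- removed when truncating by two words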
Example #22
def from_wordlist(path: str,
                  encoding: str = "utf-8",
                  contains_header: bool = True,
                  contains_frequencies: bool = True) -> 'Vocabulary':
    """Loads vocabulary from a wordlist. The file can contain either list of
    words with no header. Or it can contain words and their counts separated
    by tab and a header on the first line.

    Arguments:
        path: The path to the wordlist file
        encoding: The encoding of the merge file (defaults to UTF-8)
        contains_header: if the file have a header on first line
        contains_frequencies: if the file contains frequencies in second column

    Returns:
        The new Vocabulary instance.
    """
    vocabulary = Vocabulary()

    with open(path, encoding=encoding) as wordlist:
        if contains_header:
            # skip the header
            next(wordlist)

        for line in wordlist:
            line = line.strip()
            # check if line is empty
            if not line:
                continue

            if contains_frequencies:
                info = line.split('\t')
                if len(info) != 2:
                    raise ValueError("Vocabulary file do not have two columns")
                vocabulary.add_word(info[0], int(info[1]))
            else:
                if '\t' in line:
                    warn("The vocabulary contains a tabulator")
                vocabulary.add_word(line)

    log("Vocabulary from wordlist loaded, containing {} words".format(
        len(vocabulary)))
    vocabulary.log_sample()
    return vocabulary
Example #23
def save_git_info(git_commit_file: str, git_diff_file: str,
                  branch: str = "HEAD", repo_dir: str = None) -> None:
    if shutil.which("git") is not None:
        if repo_dir is None:
            # This points inside the neuralmonkey/ dir inside the repo, but
            # it does not matter for git.
            repo_dir = os.path.dirname(os.path.realpath(__file__))

        with open(git_commit_file, "wb") as file:
            subprocess.run(["git", "log", "-1", "--format=%H", branch],
                           cwd=repo_dir, stdout=file)

        with open(git_diff_file, "wb") as file:
            subprocess.run(
                ["git", "--no-pager", "diff", "--color=always", branch],
                cwd=repo_dir, stdout=file
            )
    else:
        warn("No git executable found. Not storing git commit and diffs")
Example #24
def build_config(config_dicts: Dict[str, Any],
                 ignore_names: Set[str],
                 warn_unused: bool = False) -> Tuple[Dict[str, Any],
                                                     Dict[str, Any]]:
    """Build the model from the configuration.

    Arguments:
        config_dicts: The parsed configuration file
        ignore_names: A set of names that should be ignored during the loading.
        warn_unused: Emit a warning if there are unused sections.

    Returns:
        A tuple containing a dictionary corresponding to the main section and
        a dictionary mapping section names to objects.
    """
    if "main" not in config_dicts:
        raise Exception("Configuration does not contain the main block.")

    existing_objects = collections.OrderedDict()  # type: Dict[str, Any]

    main_config = config_dicts["main"]
    existing_objects["main"] = Namespace(**main_config)

    configuration = collections.OrderedDict()  # type: Dict[str, Any]
    # TODO ensure tf_manager goes last in a better way
    for key, value in sorted(main_config.items(),
                             key=lambda t: t[0] if t[0] != "tf_manager"
                             else "zzz"):
        if key not in ignore_names:
            try:
                configuration[key] = build_object(
                    value, config_dicts, existing_objects, 0)
            except Exception as exc:
                raise ConfigBuildException(key, exc) from None

    if warn_unused:
        existing_names = set(existing_objects.keys()) | {"main"}
        unused = config_dicts.keys() - existing_names
        if unused:
            warn("Configuration contains unused sections: "
                 + str(unused) + ".")

    return configuration, existing_objects
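
A hedged sketch of the expected input: config_dicts maps section names to dictionaries of already-parsed values (in Neural Monkey these usually come from an INI file). Object references between sections and their exact parsed representation are omitted here.

import collections

config_dicts = collections.OrderedDict([
    ("main", {"name": "demo experiment",
              "output": "out_dir",
              "batch_size": 64}),
])

main_cfg, objects = build_config(config_dicts,
                                 ignore_names=set(),
                                 warn_unused=True)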
Example #25
    def get_executable(self, compute_losses: bool, summaries: bool,
                       num_sessions: int) -> TensorExecutable:
        fetches = {}
        batch_ids = {}

        for name, bid in zip(self._names, self._batch_dims_name):
            try:
                fetches[name] = tf.get_default_graph().get_tensor_by_name(name)
                batch_ids[name] = bid
            except KeyError:
                warn(("The tensor of name '{}' is not present in the "
                      "graph.").format(name))

        for tensor, bid in zip(self._tensors, self._batch_dims_ref):
            fetches[tensor.name] = tensor
            batch_ids[tensor.name] = bid

        return TensorExecutable(self.all_coders, fetches, batch_ids,
                                self._select_session)
Example #26
def build_config(
        config_dicts: Dict[str, Any],
        ignore_names: Set[str],
        warn_unused: bool = False) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Build the model from the configuration.

    Arguments:
        config_dicts: The parsed configuration file
        ignore_names: A set of names that should be ignored during the loading.
        warn_unused: Emit a warning if there are unused sections.

    Returns:
        A tuple containing a dictionary corresponding to the main section and
        a dictionary mapping section names to objects.
    """
    if "main" not in config_dicts:
        raise Exception("Configuration does not contain the main block.")

    existing_objects = collections.OrderedDict()  # type: Dict[str, Any]

    main_config = config_dicts["main"]
    existing_objects["main"] = Namespace(**main_config)

    configuration = collections.OrderedDict()  # type: Dict[str, Any]
    # TODO ensure tf_manager goes last in a better way
    for key, value in sorted(main_config.items(),
                             key=lambda t: t[0]
                             if t[0] != "tf_manager" else "zzz"):
        if key not in ignore_names:
            try:
                configuration[key] = build_object(value, config_dicts,
                                                  existing_objects, 0)
            except Exception as exc:
                raise ConfigBuildException(key, exc) from None

    if warn_unused:
        existing_names = set(existing_objects.keys()) | {"main"}
        unused = config_dicts.keys() - existing_names
        if unused:
            warn("Configuration contains unused sections: " + str(unused) +
                 ".")

    return configuration, existing_objects
Example #27
    def load(list_files: List[str]) -> Iterable[np.ndarray]:
        for list_file in list_files:
            with open(list_file) as f_list:
                for i, image_file in enumerate(f_list):
                    path = os.path.join(prefix, image_file.rstrip())

                    if not os.path.exists(path):
                        raise Exception(
                            "Image file '{}' no. {} does not exist.".format(
                                path, i + 1))

                    try:
                        image = Image.open(path).convert(mode)
                    except IOError:
                        warn("Skipping image from file '{}' no. '{}'.".format(
                            path, i + 1))
                        image = Image.new(mode, (pad_w, pad_h))

                    image = _rescale_or_crop(image, pad_w, pad_h,
                                             rescale_w, rescale_h,
                                             keep_aspect_ratio)
                    image_np = np.array(image)

                    if len(image_np.shape) == 2:
                        img_channels = 1
                        image_np = np.expand_dims(image_np, 2)
                    elif len(image_np.shape) == 3:
                        img_channels = image_np.shape[2]
                    else:
                        raise ValueError(
                            ("Image should have either two (black and white) "
                             "or three dimensions (color channels), but has "
                             "{} dimensions.").format(len(image_np.shape)))

                    if channels != img_channels:
                        raise ValueError(
                            "Image does not have the pre-declared number of "
                            "channels {}, but {}.".format(
                                channels, img_channels))

                    yield _pad(image_np, pad_w, pad_h, channels)
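
A small NumPy check of the channel handling above: a two-dimensional (grayscale) array receives an explicit channel axis before its channel count is compared with the declared one.

import numpy as np

gray = np.zeros((32, 32), dtype=np.uint8)    # H x W, no channel axis
gray = np.expand_dims(gray, 2)               # -> shape (32, 32, 1), 1 channel

rgb = np.zeros((32, 32, 3), dtype=np.uint8)  # 3 channels
print(gray.shape, rgb.shape[2])              # (32, 32, 1) 3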
Example #28
    def load(list_files: List[str]) -> Iterable[np.ndarray]:
        for list_file in list_files:
            with open(list_file) as f_list:
                for i, image_file in enumerate(f_list):
                    path = os.path.join(prefix, image_file.rstrip())

                    if not os.path.exists(path):
                        raise Exception(
                            "Image file '{}' no. {} does not exist.".format(
                                path, i + 1))

                    try:
                        image = Image.open(path).convert(mode)
                    except IOError:
                        warn("Skipping image from file '{}' no. '{}'.".format(
                            path, i + 1))
                        image = Image.new(mode, (pad_w, pad_h))

                    image = _rescale_or_crop(image, pad_w, pad_h, rescale_w,
                                             rescale_h, keep_aspect_ratio)
                    image_np = np.array(image)

                    if len(image_np.shape) == 2:
                        img_channels = 1
                        image_np = np.expand_dims(image_np, 2)
                    elif len(image_np.shape) == 3:
                        img_channels = image_np.shape[2]
                    else:
                        raise ValueError(
                            ("Image should have either two (black and white) "
                             "or three dimensions (color channels), but has "
                             "{} dimensions.").format(len(image_np.shape)))

                    if channels != img_channels:
                        raise ValueError(
                            "Image does not have the pre-declared number of "
                            "channels {}, but {}.".format(
                                channels, img_channels))

                    yield _pad(image_np, pad_w, pad_h, channels)
Example #29
def save_git_info(git_commit_file: str,
                  git_diff_file: str,
                  branch: str = "HEAD",
                  repo_dir: str = None) -> None:
    if shutil.which("git") is not None:
        if repo_dir is None:
            # This points inside the neuralmonkey/ dir inside the repo, but
            # it does not matter for git.
            repo_dir = os.path.dirname(os.path.realpath(__file__))

        with open(git_commit_file, "wb") as file:
            subprocess.run(["git", "log", "-1", "--format=%H", branch],
                           cwd=repo_dir,
                           stdout=file)

        with open(git_diff_file, "wb") as file:
            subprocess.run(
                ["git", "--no-pager", "diff", "--color=always", branch],
                cwd=repo_dir,
                stdout=file)
    else:
        warn("No git executable found. Not storing git commit and diffs")
Example #30
def build_config(config_dicts: Dict[str, Any],
                 ignore_names: Set[str],
                 warn_unused: bool = False) -> Dict[str, Any]:
    """ Builds the model from the configuration

    Arguments:
        config_dicts: The parsed configuration file
        ignore_names: A set of names that should be ignored during the loading.
        warn_unused: Emit a warning if there are unused sections.
    """
    if "main" not in config_dicts:
        raise Exception("Configuration does not contain the main block.")

    existing_objects = collections.OrderedDict()  # type: Dict[str, Any]

    main_config = config_dicts['main']

    configuration = collections.OrderedDict()  # type: Dict[str, Any]
    # TODO ensure tf_manager goes last in a better way
    for key, value in sorted(main_config.items(),
                             key=lambda t: t[0]
                             if t[0] != 'tf_manager' else 'zzz'):
        if key not in ignore_names:
            try:
                configuration[key] = build_object(value, config_dicts,
                                                  existing_objects, 0)
            except Exception as exc:
                raise ConfigBuildException(key, exc) from None

    if warn_unused:
        existing_names = {x[7:] for x in existing_objects.keys()} | {'main'}
        unused = config_dicts.keys() - existing_names
        if unused:
            warn("Configuration contains unused sections: " + str(unused) +
                 ".")

    return configuration
Example #31
    def fetches(self) -> Dict[str, tf.Tensor]:

        fetches = {}  # type: Dict[str, tf.Tensor]
        for name, bid in zip(self._names, self._batch_dims_name):
            try:
                fetches[name] = (
                    Experiment.get_current().graph.get_tensor_by_name(name))
                self.batch_ids[name] = bid
            except KeyError:
                warn(("The tensor of name '{}' is not present in the "
                      "graph.").format(name))

        for mpart, tname, bid in zip(self._modelparts, self._tensors,
                                     self.batch_dims):
            if not hasattr(mpart, tname):
                raise ValueError("Model part {} does not have a tensor called "
                                 "{}.".format(mpart, tname))

            tensorval = getattr(mpart, tname)

            fetches[tensorval.name] = tensorval
            self.batch_ids[tensorval.name] = bid

        return fetches
Example #32
    def fetches(self) -> Dict[str, tf.Tensor]:

        fetches = {}  # type: Dict[str, tf.Tensor]
        for name, bid in zip(self._names, self._batch_dims_name):
            try:
                fetches[name] = (
                    Experiment.get_current().graph.get_tensor_by_name(name))
                self.batch_ids[name] = bid
            except KeyError:
                warn(("The tensor of name '{}' is not present in the "
                      "graph.").format(name))

        for mpart, tname, bid in zip(self._modelparts, self._tensors,
                                     self.batch_dims):
            if not hasattr(mpart, tname):
                raise ValueError("Model part {} does not have a tensor called "
                                 "{}.".format(mpart, tname))

            tensorval = getattr(mpart, tname)

            fetches[tensorval.name] = tensorval
            self.batch_ids[tensorval.name] = bid

        return fetches
Example #33
    def score_batch(self,
                    hypotheses: List[List[str]],
                    references: List[List[str]]) -> float:

        ref_bytes = self.serialize_to_bytes(references)
        hyp_bytes = self.serialize_to_bytes(hypotheses)

        with tempfile.NamedTemporaryFile() as reffile, \
                tempfile.NamedTemporaryFile() as hypfile:

            reffile.write(ref_bytes)
            reffile.flush()

            hypfile.write(hyp_bytes)
            hypfile.flush()

            args = [self.wrapper, "eval", "--refs", reffile.name,
                    "--hyps-baseline", hypfile.name, "--metrics", self.metric]
            if self.metric == "meteor":
                args.extend(["--meteor.language", self.language])
                # problem: if meteor is run for the first time,
                # paraphrase tables are downloaded

            output_proc = subprocess.run(
                args, stderr=subprocess.PIPE, stdout=subprocess.PIPE)

            proc_stdout = output_proc.stdout.decode("utf-8")  # type: ignore
            lines = proc_stdout.splitlines()

            if not lines:
                return 0.0
            try:
                filtered = float(lines[1].split()[1])
                eval_score = filtered / 100.
                return eval_score
            except IndexError:
                warn("Error: Malformed output from MultEval wrapper:")
                warn(proc_stdout)
                warn("=======")
                return 0.0
            except ValueError:
                warn("Value error - '{}' is not a number.".format(lines[0]))
                return 0.0
Example #34
def _normalize_train_cfg(cfg: Namespace) -> None:
    """Given a configuration namespace, normalize the values it contains.

    This function is only executed when training mode has been invoked.

    Arguments:
        cfg: The namespace object returned by `Configuration.make_namespace`
    """
    if not isinstance(cfg.val_dataset, List):
        cfg.val_datasets = [cfg.val_dataset]
    else:
        cfg.val_datasets = cfg.val_dataset

    if not isinstance(cfg.trainer, List):
        cfg.trainers = [cfg.trainer]
    else:
        cfg.trainers = cfg.trainer

    # deal with delayed trainer and logging periods
    # the correct way if there are more trainers is perhaps to do a
    # lowest common denominator of their batches_per_update.
    # But we can also warn because it is a very weird setup.

    delayed_trainers = [
        t for t in cfg.trainers if isinstance(t, DelayedUpdateTrainer)
    ]

    denominator = 1
    if len(cfg.trainers) > 1 and delayed_trainers:
        warn("Weird setup: using more trainers and one of them is delayed "
             "update trainer. No-one can vouch for your safety, user!")
        warn("Using the lowest common denominator of all delayed trainers'"
             " batches_per_update parameters for logging period")
        warn("Note that if you are using a multi-task trainer, it is on "
             "your own risk")

        denominator = np.lcm.reduce(
            [t.batches_per_update for t in delayed_trainers])
    elif delayed_trainers:
        assert len(cfg.trainers) == 1
        denominator = cfg.trainers[0].batches_per_update

    cfg.log_timer = _resolve_period(cfg.logging_period, denominator)
    cfg.val_timer = _resolve_period(cfg.validation_period, denominator)
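
The denominator computed above is simply the least common multiple of the delayed trainers' batches_per_update values, for example:

import numpy as np

print(np.lcm.reduce([4, 6, 10]))  # 60 -> periods are counted in 60-batch steps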
Example #35
def _normalize_train_cfg(cfg: Namespace) -> None:
    """Given a configuration namespace, normalize the values it contains.

    This function is only executed when training mode has been invoked.

    Arguments:
        cfg: The namespace object returned by `Configuration.make_namespace`
    """
    if not isinstance(cfg.val_dataset, List):
        cfg.val_datasets = [cfg.val_dataset]
    else:
        cfg.val_datasets = cfg.val_dataset

    if not isinstance(cfg.trainer, List):
        cfg.trainers = [cfg.trainer]
    else:
        cfg.trainers = cfg.trainer

    # deal with delayed trainer and logging periods
    # the correct way if there are more trainers is perhaps to do a
    # lowest common denominator of their batches_per_update.
    # But we can also warn because it is a very weird setup.

    delayed_trainers = [t for t in cfg.trainers
                        if isinstance(t, DelayedUpdateTrainer)]

    denominator = 1
    if len(cfg.trainers) > 1 and delayed_trainers:
        warn("Weird setup: using more trainers and one of them is delayed "
             "update trainer. No-one can vouch for your safety, user!")
        warn("Using the lowest common denominator of all delayed trainers'"
             " batches_per_update parameters for logging period")
        warn("Note that if you are using a multi-task trainer, it is on "
             "your own risk")

        denominator = np.lcm.reduce([t.batches_per_update
                                     for t in delayed_trainers])
    elif delayed_trainers:
        assert len(cfg.trainers) == 1
        denominator = cfg.trainers[0].batches_per_update

    cfg.log_timer = _resolve_period(cfg.logging_period, denominator)
    cfg.val_timer = _resolve_period(cfg.validation_period, denominator)
Example #36
    def batches(self) -> Iterator["Dataset"]:
        """Split the dataset into batches.

        Returns:
            Generator yielding the batches.
        """
        if self.batching.batch_size is not None:
            max_bs = self.batching.batch_size
        else:
            assert self.batching.bucket_batch_sizes is not None
            max_bs = max(self.batching.bucket_batch_sizes)

        if self.lazy and self.buffer_min_size < max_bs:
            warn("Minimum buffer size ({}) lower than batch size ({}). "
                 "It is recommended to use large buffer size."
                 .format(self.buffer_min_size, max_bs))

        # Initialize iterators
        iterators = {s: it() for s, it in self.iterators.items()}

        # Create iterator over instances
        zipped_iterator = (
            dict(zip(iterators, row)) for row in zip(*iterators.values()))

        # Fill the buffer with initial values, shuffle optionally
        if self.lazy:
            # pylint: disable=stop-iteration-return
            # This is pylint issue https://github.com/PyCQA/pylint/issues/2158
            lbuf = list(next(zipped_iterator) for _ in range(self.buffer_size))
            # pylint: enable=stop-iteration-return
        else:
            lbuf = list(zipped_iterator)
        if self.shuffled:
            random.shuffle(lbuf)
        buf = deque(lbuf)

        def _make_datagen(rows, key):
            def itergen():
                return (row[key] for row in rows)
            return itergen

        # Iterate over the rest of the data until buffer is empty
        batch_index = 0
        buckets = [[]]  # type: List[List[DataExample]]

        if self.batching.bucket_boundaries is not None:
            buckets += [[] for _ in self.batching.bucket_boundaries]

        while buf:
            row = buf.popleft()

            if self.batching.bucket_boundaries is None:
                bucket_id = 0
            else:
                # TODO: use only specific series to determine the bucket number
                length = max(len(row[key]) for key in row)

                bucket_id = -1
                for b_id, limit in enumerate(self.batching.bucket_boundaries):
                    fits_in = length <= limit
                    tighter_fit = (
                        bucket_id == -1
                        or limit < self.batching.bucket_boundaries[
                            bucket_id])

                    if fits_in and tighter_fit:
                        bucket_id = b_id

            buckets[bucket_id].append(row)

            if self.batching.bucket_batch_sizes is None:
                assert self.batching.batch_size is not None
                is_full = len(buckets[bucket_id]) >= self.batching.batch_size
            else:
                is_full = (len(buckets[bucket_id])
                           >= self.batching.bucket_batch_sizes[bucket_id])

            if is_full:
                # Create the batch
                name = "{}.batch.{}".format(self.name, batch_index)
                data = {key: _make_datagen(buckets[bucket_id], key)
                        for key in buckets[bucket_id][0]}

                yield Dataset(
                    name=name, iterators=data, batching=self.batching)
                batch_index += 1
                buckets[bucket_id] = []

            # If lazy, refill buffer & shuffle if needed
            # Otherwise, all of the data is already loaded in the buffer.
            if self.lazy and len(buf) < self.buffer_min_size:
                # In case buffer_size is lower than batch_size
                to_add = self.buffer_size - len(buf)

                for _, item in zip(range(to_add), zipped_iterator):
                    buf.append(item)

                if self.shuffled:
                    lbuf = list(buf)
                    random.shuffle(lbuf)
                    buf = deque(lbuf)

        if not self.batching.drop_remainder:
            for bucket in buckets:
                if bucket:
                    name = "{}.batch.{}".format(self.name, batch_index)
                    data = {key: _make_datagen(bucket, key)
                            for key in bucket[0]}

                    yield Dataset(
                        name=name, iterators=data, batching=self.batching)
                    batch_index += 1
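
A pure-Python illustration of the bucket selection rule used above: an example is placed in the tightest boundary it fits under, and -1 (no boundary fits) ends up in the last bucket via negative indexing.

bucket_boundaries = [10, 5, 20]        # boundaries need not be sorted

def pick_bucket(length):
    bucket_id = -1
    for b_id, limit in enumerate(bucket_boundaries):
        fits_in = length <= limit
        tighter_fit = (bucket_id == -1
                       or limit < bucket_boundaries[bucket_id])
        if fits_in and tighter_fit:
            bucket_id = b_id
    return bucket_id

print(pick_bucket(4), pick_bucket(7), pick_bucket(25))  # 1 0 -1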
Example #37
    def __init__(self,
                 encoders: List[Stateful],
                 vocabulary: Vocabulary,
                 data_id: str,
                 name: str,
                 max_output_len: int,
                 dropout_keep_prob: float = 1.0,
                 rnn_size: int = None,
                 embedding_size: int = None,
                 output_projection: OutputProjectionSpec = None,
                 encoder_projection: EncoderProjection = None,
                 attentions: List[BaseAttention] = None,
                 embeddings_source: EmbeddedSequence = None,
                 attention_on_input: bool = True,
                 rnn_cell: str = "GRU",
                 conditional_gru: bool = False,
                 save_checkpoint: str = None,
                 load_checkpoint: str = None) -> None:
        """Create a refactored version of monster decoder.

        Arguments:
            encoders: Input encoders of the decoder
            vocabulary: Target vocabulary
            data_id: Target data series
            name: Name of the decoder. Should be unique across all Neural
                Monkey objects
            max_output_len: Maximum length of an output sequence
            dropout_keep_prob: Probability of keeping a value during dropout

        Keyword arguments:
            rnn_size: Size of the decoder hidden state, if None set
                according to encoders.
            embedding_size: Size of embedding vectors for target words
            output_projection: How to generate distribution over vocabulary
                from decoder rnn_outputs
            encoder_projection: How to construct initial state from encoders
            attentions: The attention objects to use. Optional.
            embeddings_source: Embedded sequence to take embeddings from
            rnn_cell: RNN Cell used by the decoder (GRU or LSTM)
            conditional_gru: Flag whether to use the Conditional GRU
                architecture
            attention_on_input: Flag whether attention from previous decoding
                step should be combined with the input in the next step.
        """
        ModelPart.__init__(self, name, save_checkpoint, load_checkpoint)
        check_argument_types()

        log("Initializing decoder, name: '{}'".format(name))

        self.encoders = encoders
        self.vocabulary = vocabulary
        self.data_id = data_id
        self.max_output_len = max_output_len
        self.dropout_keep_prob = dropout_keep_prob
        self.embedding_size = embedding_size
        self.rnn_size = rnn_size
        self.output_projection_spec = output_projection
        self.encoder_projection = encoder_projection
        self.attentions = attentions
        self.embeddings_source = embeddings_source
        self._conditional_gru = conditional_gru
        self._attention_on_input = attention_on_input
        self._rnn_cell_str = rnn_cell

        if self.attentions is None:
            self.attentions = []

        if self.embedding_size is None and self.embeddings_source is None:
            raise ValueError("You must specify either embedding size or the "
                             "embedded sequence from which to reuse the "
                             "embeddings (e.g. set either 'embedding_size' or "
                             " 'embeddings_source' parameter)")

        if self.embeddings_source is not None:
            if self.embedding_size is not None:
                warn("Overriding the embedding_size parameter with the"
                     " size of the reused embeddings from the encoder.")

            self.embedding_size = (
                self.embeddings_source.embedding_matrix.get_shape()[1].value)

        if self.encoder_projection is None:
            if not self.encoders:
                log("No encoder - language model only.")
                self.encoder_projection = empty_initial_state
            elif rnn_size is None:
                log("No rnn_size or encoder_projection: Using concatenation of"
                    " encoded states")
                self.encoder_projection = concat_encoder_projection
                self.rnn_size = sum(e.output.get_shape()[1].value
                                    for e in encoders)
            else:
                log("Using linear projection of encoders as the initial state")
                self.encoder_projection = linear_encoder_projection(
                    self.dropout_keep_prob)

        assert self.rnn_size is not None

        if self._rnn_cell_str not in RNN_CELL_TYPES:
            raise ValueError("RNN cell must be a either 'GRU', 'LSTM', or "
                             "'NematusGRU'. Not {}".format(self._rnn_cell_str))

        if self.output_projection_spec is None:
            log("No output projection specified - using tanh projection")
            self.output_projection = nonlinear_output(
                self.rnn_size, tf.tanh)[0]
            self.output_projection_size = self.rnn_size
        elif isinstance(self.output_projection_spec, tuple):
            (self.output_projection,
             self.output_projection_size) = tuple(self.output_projection_spec)
        else:
            self.output_projection = self.output_projection_spec
            self.output_projection_size = self.rnn_size

        if self._attention_on_input:
            self.input_projection = self.input_plus_attention
        else:
            self.input_projection = self.embed_input_symbol

        with self.use_scope():
            with tf.variable_scope("attention_decoder") as self.step_scope:
                pass

        # TODO when it is possible, remove the printing of the cost var
        log("Decoder initalized. Cost var: {}".format(str(self.cost)))
        log("Runtime logits tensor: {}".format(str(self.runtime_logits)))
Example #38
    def __init__(self,
                 output_series: str,
                 toplevel_modelpart: ModelPart,
                 toplevel_tensors: List[tf.Tensor],
                 tensors_by_name: List[str],
                 tensors_by_ref: List[tf.Tensor],
                 batch_dims_by_name: List[int],
                 batch_dims_by_ref: List[int],
                 select_session: int = None,
                 single_tensor: bool = False) -> None:
        """Construct a new ``TensorRunner`` object.

        Note that at this time, one must specify the toplevel objects so that
        it is ensured that the graph is built. The reason for this behavior is
        that the graph is constructed lazily and therefore if the tensors to
        store are provided by indirect reference (name), the system does not
        know early enough that it needs to create them.

        Args:
            output_series: The name of the generated output data series.
            toplevel_modelpart: A ``ModelPart`` object that is used as the
                top-level component of the model. This object should depend on
                values of all the wanted tensors.
            toplevel_tensors: A list of tensors that should be constructed. Use
                this when the toplevel model part does not depend on these
                tensors. The tensors are constructed while this constructor
                runs, and they are logged.
            tensors_by_name: A list of tensor names to fetch. If a tensor
                is not in the graph, a warning is generated and the tensor is
                ignored.
            tensors_by_ref: A list of tensor objects to fetch.
            batch_dims_by_name: A list of integers that correspond to the
                batch dimension in each wanted tensor specified by name.
            batch_dims_by_ref: A list of integers that correspond to the
                batch dimension in each wanted tensor specified by reference.
            select_session: An optional integer specifying the session to use
                in case of ensembling. When not used, tensors from all sessions
                are stored. In case of a single session, this option has no
                effect.
            single_tensor: If `True`, it is assumed that only one tensor is to
                be fetched, and the execution result will consist of this
                tensor only. If `False`, the result will be a dict mapping
                tensor names to NumPy arrays.
        """
        check_argument_types()
        BaseRunner[ModelPart].__init__(self, output_series, toplevel_modelpart)

        total_tensors = len(tensors_by_name) + len(tensors_by_ref)
        if single_tensor and total_tensors > 1:
            raise ValueError(
                "single_tensor is True, but {} tensors were given".format(
                    total_tensors))

        self._names = tensors_by_name
        self._tensors = tensors_by_ref
        self._batch_dims_name = batch_dims_by_name
        self._batch_dims_ref = batch_dims_by_ref
        self._select_session = select_session
        self._single_tensor = single_tensor

        log("Blessing toplevel tensors for tensor runner:")
        for tensor in toplevel_tensors:
            log("Toplevel tensor: {}".format(tensor))

        self._fetches = {}  # type: Dict[str, tf.Tensor]
        self._batch_ids = {}  # type: Dict[str, int]

        for name, bid in zip(self._names, self._batch_dims_name):
            try:
                self._fetches[name] = (
                    Experiment.get_current().graph.get_tensor_by_name(name))
                self._batch_ids[name] = bid
            except KeyError:
                warn(("The tensor of name '{}' is not present in the "
                      "graph.").format(name))
Example #39
def run_on_dataset(
    tf_manager: TensorFlowManager,
    runners: List[BaseRunner],
    dataset: Dataset,
    postprocess: Postprocess,
    write_out: bool = False,
    batch_size: Optional[int] = None,
    log_progress: int = 0
) -> Tuple[List[ExecutionResult], Dict[str, List[Any]]]:
    """Apply the model on a dataset and optionally write outputs to files.

    Args:
        tf_manager: TensorFlow manager with initialized sessions.
        runners: A list of runners that produce the output data series.
        dataset: The dataset on which the model will be executed.
        postprocess: A list of pairs of an output series name and a
            postprocessing function used to generate additional series
            from the outputs.
        write_out: Flag whether the outputs should be printed to a file defined
            in the dataset object.
        batch_size: size of the minibatch
        log_progress: log progress every X seconds

    Returns:
        A tuple of the list of raw execution results and a dictionary mapping
        output series names to the resulting sentences / NumPy arrays.

    """
    contains_targets = all(
        dataset.has_series(runner.decoder_data_id) for runner in runners
        if runner.decoder_data_id is not None)

    all_results = tf_manager.execute(dataset,
                                     runners,
                                     compute_losses=contains_targets,
                                     batch_size=batch_size,
                                     log_progress=log_progress)

    result_data = {
        runner.output_series: result.outputs
        for runner, result in zip(runners, all_results)
    }

    if postprocess is not None:
        for series_name, postprocessor in postprocess:
            postprocessed = postprocessor(dataset, result_data)
            if not hasattr(postprocessed, "__len__"):
                postprocessed = list(postprocessed)

            result_data[series_name] = postprocessed

    # check output series lengths
    for series_id, data in result_data.items():
        if len(data) != len(dataset):
            warn("Output '{}' for dataset '{}' has length {}, but "
                 "len(dataset) == {}".format(series_id, dataset.name,
                                             len(data), len(dataset)))

    def _check_savable_dict(data):
        """Check if the data is of savable type."""
        if not (data and data[0]):
            return False

        supported_type = Union[List[Dict[str, np.ndarray]],
                               List[List[Dict[str, np.ndarray]]]]

        try:
            check_type("data", data, supported_type, None)
        except TypeError:
            return False
        return True

    if write_out:
        for series_id, data in result_data.items():
            if series_id in dataset.series_outputs:
                path = dataset.series_outputs[series_id]
                if isinstance(data, np.ndarray):
                    np.save(path, data)
                    log("Result saved as numpy array to '{}'".format(path))
                elif _check_savable_dict(data):
                    unbatched = dict(
                        zip(data[0], zip(*[d.values() for d in data])))

                    np.savez(path, **unbatched)
                    log("Result saved as numpy data to '{}.npz'".format(path))
                else:
                    with open(path, "w", encoding="utf-8") as f_out:
                        f_out.writelines([
                            " ".join(sent) + "\n" if isinstance(
                                sent, collections.Iterable) else str(sent) +
                            "\n" for sent in data
                        ])
                    log("Result saved as plain text '{}'".format(path))
            else:
                log("There is no output file for dataset: {}".format(
                    dataset.name),
                    color="red")

    return all_results, result_data
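The np.savez branch above uses a compact un-batching trick: a list of per-batch dicts is transposed into a single dict mapping each key to the sequence of its per-batch values. A plain-Python illustration with made-up keys and values; it assumes every dict has the same keys in the same insertion order.

# Plain-Python illustration of the un-batching trick used before np.savez;
# the keys and values are made up.
data = [{"logits": "batch0_logits", "states": "batch0_states"},
        {"logits": "batch1_logits", "states": "batch1_states"}]

unbatched = dict(zip(data[0], zip(*[d.values() for d in data])))
print(unbatched)
# {'logits': ('batch0_logits', 'batch1_logits'),
#  'states': ('batch0_states', 'batch1_states')}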
Example #40
def training_loop(
        tf_manager: TensorFlowManager,
        epochs: int,
        trainer: GenericTrainer,  # TODO better annotate
        batch_size: int,
        log_directory: str,
        evaluators: EvalConfiguration,
        runners: List[BaseRunner],
        train_dataset: Dataset,
        val_dataset: Union[Dataset, List[Dataset]],
        test_datasets: Optional[List[Dataset]] = None,
        logging_period: Union[str, int] = 20,
        validation_period: Union[str, int] = 500,
        val_preview_input_series: Optional[List[str]] = None,
        val_preview_output_series: Optional[List[str]] = None,
        val_preview_num_examples: int = 15,
        train_start_offset: int = 0,
        runners_batch_size: Optional[int] = None,
        initial_variables: Optional[Union[str, List[str]]] = None,
        postprocess: Postprocess = None) -> None:
    """Execute the training loop for given graph and data.

    Args:
        tf_manager: TensorFlowManager with initialized sessions.
        epochs: Number of epochs for which the algorithm will learn.
        trainer: The trainer object containing the TensorFlow code for
            computing the loss and the optimization operation.
        batch_size: number of examples in one mini-batch
        log_directory: Directory where the TensorBoard log will be generated.
            If None, nothing will be done.
        evaluators: List of evaluators. The last evaluator is used as the
            main one. An evaluator is a tuple of the name of the generated
            series, the name of the dataset series it is evaluated against,
            and the evaluation function. If only one series name is provided,
            the generated and the dataset series are assumed to have the same
            name.
        runners: List of runners for logging and evaluation runs
        train_dataset: Dataset used for training
        val_dataset: Dataset used for validation. Can be a Dataset or a list
            of datasets. The last dataset is used as the main one for storing
            the best results. When using multiple datasets, it is recommended
            to name them for better TensorBoard visualization.
        test_datasets: List of datasets used for testing
        logging_period: After how many batches the logging should happen. It
            can also be defined as a time period in a format like: 3s; 4m; 6h;
            1d; 3m15s; 3seconds; 4minutes; 6hours; 1days
        validation_period: After how many batches the validation should
            happen. It can also be defined as a time period in the same format
            as logging_period
        val_preview_input_series: which input series to preview in validation
        val_preview_output_series: which output series to preview in validation
        val_preview_num_examples: how many examples should be printed during
            validation
        train_start_offset: how many lines from the training dataset should be
            skipped. The training starts from the next batch.
        runners_batch_size: batch size of runners. It is the same as batch_size
            if not specified
        initial_variables: variables used for initialization, for example for
            continuation of training. Provide it with a path to your model
            directory and its checkpoint file group common prefix, e.g.
            "variables.data", or "variables.data.3" in case of multiple
            checkpoints per experiment.
        postprocess: A function which takes the dataset with its output series
            and generates additional series from them.
    """
    check_argument_types()

    if isinstance(val_dataset, Dataset):
        val_datasets = [val_dataset]
    else:
        val_datasets = val_dataset

    log_period_batch, log_period_time = _resolve_period(logging_period)
    val_period_batch, val_period_time = _resolve_period(validation_period)

    _check_series_collisions(runners, postprocess)

    _log_model_variables(var_list=trainer.var_list)

    if runners_batch_size is None:
        runners_batch_size = batch_size

    evaluators = [(e[0], e[0], e[1]) if len(e) == 2 else e for e in evaluators]

    if evaluators:
        main_metric = "{}/{}".format(evaluators[-1][0],
                                     evaluators[-1][-1].name)
    else:
        main_metric = "{}/{}".format(runners[-1].decoder_data_id,
                                     runners[-1].loss_names[0])

        if not tf_manager.minimize_metric:
            raise ValueError("minimize_metric must be set to True in "
                             "TensorFlowManager when using loss as "
                             "the main metric")

    step = 0
    seen_instances = 0
    last_seen_instances = 0

    if initial_variables is None:
        # Assume we don't look at coder checkpoints when global
        # initial variables are supplied
        tf_manager.initialize_model_parts(runners + [trainer],
                                          save=True)  # type: ignore
    else:
        try:
            tf_manager.restore(initial_variables)
        except tf.errors.NotFoundError:
            warn("Some variables were not found in checkpoint.)")

    if log_directory:
        log("Initializing TensorBoard summary writer.")
        tb_writer = tf.summary.FileWriter(log_directory,
                                          tf_manager.sessions[0].graph)
        log("TensorBoard writer initialized.")

    log("Starting training")
    last_log_time = time.process_time()
    last_val_time = time.process_time()
    interrupt = None
    try:
        for epoch_n in range(1, epochs + 1):
            log_print("")
            log("Epoch {} begins".format(epoch_n), color="red")

            train_dataset.shuffle()
            train_batched_datasets = train_dataset.batch_dataset(batch_size)

            if epoch_n == 1 and train_start_offset:
                if not isinstance(train_dataset, LazyDataset):
                    warn("Not skipping training instances with "
                         "shuffled in-memory dataset")
                else:
                    _skip_lines(train_start_offset, train_batched_datasets)

            for batch_n, batch_dataset in enumerate(train_batched_datasets):
                step += 1
                seen_instances += len(batch_dataset)
                if _is_logging_time(step, log_period_batch, last_log_time,
                                    log_period_time):
                    trainer_result = tf_manager.execute(batch_dataset,
                                                        [trainer],
                                                        train=True,
                                                        summaries=True)
                    train_results, train_outputs = run_on_dataset(
                        tf_manager,
                        runners,
                        batch_dataset,
                        postprocess,
                        write_out=False,
                        batch_size=runners_batch_size)
                    # ensure train outputs are iterable more than once
                    train_outputs = {
                        k: list(v)
                        for k, v in train_outputs.items()
                    }
                    train_evaluation = evaluation(evaluators, batch_dataset,
                                                  runners, train_results,
                                                  train_outputs)

                    _log_continuous_evaluation(tb_writer,
                                               main_metric,
                                               train_evaluation,
                                               seen_instances,
                                               epoch_n,
                                               epochs,
                                               trainer_result,
                                               train=True)
                    last_log_time = time.process_time()
                else:
                    tf_manager.execute(batch_dataset, [trainer],
                                       train=True,
                                       summaries=False)

                if _is_logging_time(step, val_period_batch, last_val_time,
                                    val_period_time):
                    log_print("")
                    val_duration_start = time.process_time()
                    val_examples = 0
                    for val_id, valset in enumerate(val_datasets):
                        val_examples += len(valset)

                        val_results, val_outputs = run_on_dataset(
                            tf_manager,
                            runners,
                            valset,
                            postprocess,
                            write_out=False,
                            batch_size=runners_batch_size)
                        # ensure val outputs are iterable more than once
                        val_outputs = {
                            k: list(v)
                            for k, v in val_outputs.items()
                        }
                        val_evaluation = evaluation(evaluators, valset,
                                                    runners, val_results,
                                                    val_outputs)

                        valheader = (
                            "Validation (epoch {}, batch number {}):".format(
                                epoch_n, batch_n))
                        log(valheader, color="blue")
                        _print_examples(valset, val_outputs,
                                        val_preview_input_series,
                                        val_preview_output_series,
                                        val_preview_num_examples)
                        log_print("")
                        log(valheader, color="blue")

                        # The last validation set is selected to be the main
                        if val_id == len(val_datasets) - 1:
                            this_score = val_evaluation[main_metric]
                            tf_manager.validation_hook(this_score, epoch_n,
                                                       batch_n)

                            if this_score == tf_manager.best_score:
                                best_score_str = colored("{:.4g}".format(
                                    tf_manager.best_score),
                                                         attrs=["bold"])

                                # store also graph parts
                                all_coders = set.union(*[
                                    rnr.all_coders
                                    for rnr in runners + [trainer]
                                ])  # type: ignore
                                for coder in all_coders:
                                    for session in tf_manager.sessions:
                                        coder.save(session)
                            else:
                                best_score_str = "{:.4g}".format(
                                    tf_manager.best_score)

                            log("best {} on validation: {} (in epoch {}, "
                                "after batch number {})".format(
                                    main_metric, best_score_str,
                                    tf_manager.best_score_epoch,
                                    tf_manager.best_score_batch),
                                color="blue")

                        v_name = valset.name if len(val_datasets) > 1 else None
                        _log_continuous_evaluation(tb_writer,
                                                   main_metric,
                                                   val_evaluation,
                                                   seen_instances,
                                                   epoch_n,
                                                   epochs,
                                                   val_results,
                                                   train=False,
                                                   dataset_name=v_name)

                    # how long was the training between validations
                    training_duration = val_duration_start - last_val_time
                    val_duration = time.process_time() - val_duration_start

                    # the training should take at least twice the time of val.
                    steptime = (training_duration /
                                (seen_instances - last_seen_instances))
                    valtime = val_duration / val_examples
                    last_seen_instances = seen_instances
                    log("Validation time: {:.2f}s, inter-validation: {:.2f}s, "
                        "per-instance (train): {:.2f}s, per-instance (val): "
                        "{:.2f}s".format(val_duration, training_duration,
                                         steptime, valtime),
                        color="blue")
                    if training_duration < 2 * val_duration:
                        notice("Validation period setting is inefficient.")

                    log_print("")
                    last_val_time = time.process_time()

    except KeyboardInterrupt as ex:
        interrupt = ex

    log("Training finished. Maximum {} on validation data: {:.4g}, epoch {}".
        format(main_metric, tf_manager.best_score,
               tf_manager.best_score_epoch))

    if test_datasets:
        tf_manager.restore_best_vars()

        for dataset in test_datasets:
            test_results, test_outputs = run_on_dataset(
                tf_manager,
                runners,
                dataset,
                postprocess,
                write_out=True,
                batch_size=runners_batch_size)
            # ensure test outputs are iterable more than once
            test_outputs = {k: list(v) for k, v in test_outputs.items()}
            eval_result = evaluation(evaluators, dataset, runners,
                                     test_results, test_outputs)
            print_final_evaluation(dataset.name, eval_result)

    log("Finished.")

    if interrupt is not None:
        raise interrupt  # pylint: disable=raising-bad-type
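The logging_period and validation_period arguments documented above accept either a batch count or a time string such as "3s", "4m", "1d" or "3m15s". The parser below is only an illustration of that documented format; it is a sketch, not Neural Monkey's actual _resolve_period implementation.

# Illustrative parser for the documented period format; this is a sketch, not
# the library's _resolve_period.
import re
from typing import Optional, Tuple, Union

_UNIT_SECONDS = {"s": 1, "m": 60, "h": 3600, "d": 86400}


def parse_period(period: Union[int, str]) -> Tuple[Optional[int],
                                                   Optional[float]]:
    """Return (batch_period, time_period_in_seconds); exactly one is None."""
    if isinstance(period, int):
        return period, None

    seconds = 0.0
    for number, unit in re.findall(r"(\d+)\s*([smhd])", period):
        seconds += int(number) * _UNIT_SECONDS[unit]
    if seconds == 0:
        raise ValueError("Cannot parse period: {!r}".format(period))
    return None, seconds


print(parse_period(500))       # (500, None)
print(parse_period("3m15s"))   # (None, 195.0)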
Example #41
def xent_objective(decoder, weight=None) -> Objective:
    """Get XENT objective from decoder with cost."""
    warn("Using deprecated xent_objective function. Use the CostObjective "
         "class directly.")
    return CostObjective(decoder, weight)
Example #42
def rl_objective(*args, **kwargs) -> ReinforceObjective:
    warn("Using deprecated rl_objective function. Use ReinforceObjective class"
         " directly.")
    return ReinforceObjective(*args, **kwargs)
Example #43
    def __init__(self,
                 name: str,
                 encoders: List[Attendable],
                 vocabulary: Vocabulary,
                 data_id: str,
                 # TODO infer the default for these three from the encoder
                 ff_hidden_size: int,
                 n_heads_self: int,
                 n_heads_enc: Union[List[int], int],
                 depth: int,
                 max_output_len: int,
                 attention_combination_strategy: str = "serial",
                 n_heads_hier: int = None,
                 dropout_keep_prob: float = 1.0,
                 embedding_size: int = None,
                 embeddings_source: EmbeddedSequence = None,
                 tie_embeddings: bool = True,
                 label_smoothing: float = None,
                 self_attention_dropout_keep_prob: float = 1.0,
                 attention_dropout_keep_prob: Union[float, List[float]] = 1.0,
                 use_att_transform_bias: bool = False,
                 supress_unk: bool = False,
                 reuse: ModelPart = None,
                 save_checkpoint: str = None,
                 load_checkpoint: str = None,
                 initializers: InitializerSpecs = None) -> None:
        """Create a decoder of the Transformer model.

        Described in Vaswani et al. (2017), arxiv.org/abs/1706.03762

        Arguments:
            encoders: Input encoders for the decoder.
            vocabulary: Target vocabulary.
            data_id: Target data series.
            name: Name of the decoder. Should be unique across all Neural
                Monkey objects.
            max_output_len: Maximum length of an output sequence.
            dropout_keep_prob: Probability of keeping a value during dropout.
            embedding_size: Size of embedding vectors for target words.
            embeddings_source: Embedded sequence to take embeddings from.
            tie_embeddings: Use decoder.embedding_matrix also in place
                of the output decoding matrix.
            ff_hidden_size: Size of the feedforward sublayers.
            n_heads_self: Number of the self-attention heads.
            n_heads_enc: Number of the attention heads over each encoder.
                Either a list whose length must be equal to the number of
                ``encoders``, or a single integer. In the latter case, the
                number of heads is equal for all encoders.
            attention_combination_strategy: One of ``serial``, ``parallel``,
                ``flat``, ``hierarchical``. Controls the attention combination
                strategy for enc-dec attention.
            n_heads_hier: Number of the attention heads for the second
                attention in the ``hierarchical`` attention combination.
            depth: Number of sublayers.
            label_smoothing: A label smoothing parameter for cross entropy
                loss computation.
            attention_dropout_keep_prob: Probability of keeping a value
                during dropout on the attention output.
            supress_unk: If True, the decoder will not produce symbols for
                unknown tokens.
            reuse: Reuse the variables from the given model part.
        """
        check_argument_types()
        AutoregressiveDecoder.__init__(
            self,
            name=name,
            vocabulary=vocabulary,
            data_id=data_id,
            max_output_len=max_output_len,
            dropout_keep_prob=dropout_keep_prob,
            embedding_size=embedding_size,
            embeddings_source=embeddings_source,
            tie_embeddings=tie_embeddings,
            label_smoothing=label_smoothing,
            supress_unk=supress_unk,
            reuse=reuse,
            save_checkpoint=save_checkpoint,
            load_checkpoint=load_checkpoint)

        self.encoders = encoders
        self.ff_hidden_size = ff_hidden_size
        self.n_heads_self = n_heads_self

        if isinstance(n_heads_enc, int):
            if attention_combination_strategy == "flat":
                self.n_heads_enc = [n_heads_enc]
            else:
                self.n_heads_enc = [n_heads_enc for _ in self.encoders]
        else:
            self.n_heads_enc = n_heads_enc

        self.depth = depth
        if isinstance(attention_dropout_keep_prob, float):
            self.attention_dropout_keep_prob = [
                attention_dropout_keep_prob for _ in encoders]
        else:
            self.attention_dropout_keep_prob = attention_dropout_keep_prob
        self.self_att_dropout_keep_prob = self_attention_dropout_keep_prob
        self.use_att_transform_bias = use_att_transform_bias
        self.attention_combination_strategy = attention_combination_strategy
        self.n_heads_hier = n_heads_hier

        self.encoder_states = lambda: [get_attention_states(e)
                                       for e in self.encoders]
        self.encoder_masks = lambda: [get_attention_mask(e)
                                      for e in self.encoders]

        if self.attention_combination_strategy not in STRATEGIES:
            raise ValueError(
                "Unknown attention combination strategy '{}'. "
                "Allowed: {}.".format(self.attention_combination_strategy,
                                      ", ".join(STRATEGIES)))

        if (self.attention_combination_strategy == "hierarchical"
                and self.n_heads_hier is None):
            raise ValueError(
                "You must provide n_heads_hier when using the hierarchical "
                "attention combination strategy.")

        if (self.attention_combination_strategy != "hierarchical"
                and self.n_heads_hier is not None):
            warn("Ignoring n_heads_hier parameter -- use the hierarchical "
                 "attention combination strategy instead.")

        if (self.attention_combination_strategy == "flat"
                and len(self.n_heads_enc) != 1):
            raise ValueError(
                "For the flat attention combination strategy, only a single "
                "value is permitted in n_heads_enc.")

        self._variable_scope.set_initializer(tf.variance_scaling_initializer(
            mode="fan_avg", distribution="uniform"))
Example #44
def from_wordlist(path: str,
                  encoding: str = "utf-8",
                  contains_header: bool = True,
                  contains_frequencies: bool = True) -> "Vocabulary":
    """Load a vocabulary from a wordlist.

    The file can contain either a plain list of words with no header, or
    words and their counts separated by a tab, with a header on the first
    line.

    Arguments:
        path: The path to the wordlist file
        encoding: The encoding of the wordlist file (defaults to UTF-8)
        contains_header: Whether the file has a header on the first line
        contains_frequencies: Whether the file contains a second column with
            word frequencies

    Returns:
        The new Vocabulary instance.
    """
    check_argument_types()
    vocabulary = []  # type: List[str]

    with open(path, encoding=encoding) as wordlist:
        line_number = 1
        if contains_header:
            # skip the header
            line_number += 1
            next(wordlist)

        for line in wordlist:
            line = line.strip()
            # check if line is empty
            if not line:
                warn("Vocabulary file {}:{}: line empty"
                     .format(path, line_number))
                line_number += 1
                continue

            if contains_frequencies:
                info = line.split("\t")
                if len(info) != 2:
                    raise ValueError(
                        "Vocabulary file {}:{}: line does not have two columns"
                        .format(path, line_number))
                word = info[0]
            else:
                if "\t" in line:
                    warn("Vocabulary file {}:{}: line contains a tabulator"
                         .format(path, line_number))
                word = line

            if line_number <= len(SPECIAL_TOKENS) + int(contains_header):
                should_be = SPECIAL_TOKENS[
                    line_number - 1 - int(contains_header)]
                if word != should_be:
                    notice("Expected special token {} but encountered a "
                           "different word: {}".format(should_be, word))
                    vocabulary.append(word)
                line_number += 1
                continue

            vocabulary.append(word)
            line_number += 1

    log("Vocabulary from wordlist loaded, containing {} words"
        .format(len(vocabulary)))
    log_sample(vocabulary)
    return Vocabulary(vocabulary)
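A sketch of the wordlist layout that from_wordlist expects when contains_header and contains_frequencies are both True. The exact special tokens and their order depend on SPECIAL_TOKENS in the library, so the four special-token lines below are an assumption; the call at the end is commented out because it needs the surrounding module.

# Sketch of a tab-separated wordlist with a header; the special-token lines
# are assumptions about SPECIAL_TOKENS and may differ in your version.
import tempfile

wordlist_content = "".join(line + "\n" for line in [
    "word\tcount",      # header row (contains_header=True)
    "<pad>\t0",         # assumed special tokens, expected first
    "<s>\t0",
    "</s>\t0",
    "<unk>\t0",
    "the\t1021",
    "cat\t33",
])

with tempfile.NamedTemporaryFile("w", suffix=".tsv", delete=False,
                                 encoding="utf-8") as tmp:
    tmp.write(wordlist_content)
    path = tmp.name

# vocabulary = from_wordlist(path)   # hypothetical call to the function above
print("Wrote example wordlist to", path)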