def check_dataset_and_coders(dataset, runners):
    """Verify that the dataset provides every series the coders refer to.

    Every coder reachable through ``runner.all_coders`` is inspected for a
    ``data_id`` attribute or, failing that, a ``data_ids`` attribute. A coder
    with neither of them only triggers a warning.

    Arguments:
        dataset: Object exposing ``has_series(series_id)`` and ``name``.
        runners: Iterable of objects exposing ``all_coders``.

    Raises:
        CheckingException: If at least one collected series is not present
            in the dataset. The message lists each missing series together
            with the name, module and class of the coder that refers to it.
    """
    # pylint: disable=protected-access
    data_list = []
    for runner in runners:
        for coder in runner.all_coders:
            if hasattr(coder, "data_id"):
                data_list.append((coder.data_id, coder))
            elif hasattr(coder, "data_ids"):
                data_list.extend((d, coder) for d in coder.data_ids)
            else:
                # The two message fragments used to be concatenated without
                # a separating space ("...does not havea data attribute").
                warn(("Coder: {} does not have "
                      "a data attribute").format(coder))

    debug("Found series: {}".format(str(data_list)), "checking")

    missing = []
    for serie, coder in data_list:
        if not dataset.has_series(serie):
            log("dataset {} does not have serie {}".format(
                dataset.name, serie))
            missing.append((coder, serie))

    if missing:
        formated = [
            "{} ({}, {}.{})".format(serie, cod.name,
                                    cod.__class__.__module__,
                                    cod.__class__.__name__)
            for cod, serie in missing]
        raise CheckingException("Dataset '{}' is missing series {}:".format(
            dataset.name, ", ".join(formated)))
def from_dataset(datasets: List[Dataset],
                 series_ids: List[str],
                 max_size: int,
                 save_file: str = None,
                 overwrite: bool = False,
                 min_freq: Optional[int] = None,
                 unk_sample_prob: float = 0.5) -> 'Vocabulary':
    """Load a vocabulary from datasets with an option to save it.

    Arguments:
        datasets: A list of datasets from which to create the vocabulary
        series_ids: A list of ids of series of the datasets that should be
            used producing the vocabulary
        max_size: The maximum size of the vocabulary
        save_file: A file to save the vocabulary to. If None (default),
            the vocabulary will not be saved.
        overwrite: Overwrite existing file.
        min_freq: Do not include words with frequency smaller than this.
            Values of None or at most 1 disable this filtering.
        unk_sample_prob: The probability with which to sample unks out of
            words with frequency 1. Defaults to 0.5.

    Returns:
        The new Vocabulary instance.
    """
    check_argument_types()
    vocabulary = Vocabulary(unk_sample_prob=unk_sample_prob)

    for dataset in datasets:
        if isinstance(dataset, LazyDataset):
            warn("Inferring vocabulary from lazy dataset!")

        for series_id in series_ids:
            if not dataset.has_series(series_id):
                warn("Data series '{}' not present in the dataset".format(
                    series_id))

            series = dataset.get_series(series_id, allow_none=True)
            if series:
                vocabulary.add_tokenized_text(
                    [token for sent in series for token in sent])

    vocabulary.truncate(max_size)
    if min_freq is not None and min_freq > 1:
        vocabulary.truncate_by_min_freq(min_freq)

    log("Vocabulary for series {} initialized, containing {} words".format(
        series_ids, len(vocabulary)))
    vocabulary.log_sample()

    if save_file is not None:
        directory = os.path.dirname(save_file)
        # dirname() is "" for a bare file name; os.makedirs("") would raise,
        # so a directory is only created when the path really contains one.
        if directory and not os.path.exists(directory):
            os.makedirs(directory)
        vocabulary.save_to_file(save_file, overwrite)

    return vocabulary
def reader(files: List[str]) -> Iterable[List[str]]:
    """Yield the whitespace-split content of one CSV column for each line.

    ``encoding``, ``delimiter``, ``quotechar`` and the 1-based ``column``
    index are taken from the enclosing scope. Lines are obtained from the
    callable returned by ``string_reader(encoding)``.

    A warning is emitted when the number of columns differs from the first
    parsed line. When the requested column is missing, or the line is blank,
    a warning is emitted and an empty list is yielded, so one item is
    produced for every input line.

    Arguments:
        files: The files passed to the line reader.

    Returns:
        An iterable with one list of tokens per input line.
    """
    column_count = None
    text_reader = string_reader(encoding)

    if quotechar is not None:
        csv_options = {"quotechar": quotechar}
    else:
        csv_options = {"quoting": csv.QUOTE_NONE}

    for line in text_reader(files):
        rows = list(csv.reader(io.StringIO(line.strip()),
                               delimiter=delimiter,
                               skipinitialspace=True,
                               **csv_options))

        if rows:
            fields = rows[0]
            if column_count is None:
                column_count = len(fields)
            elif column_count != len(fields):
                warn("A mismatch in number of columns. Expected {} got {}".
                     format(column_count, len(fields)))
        else:
            # csv.reader produces no row at all for a blank line; indexing
            # rows[0] used to raise IndexError here.
            fields = []

        if len(fields) < column:
            warn("There is a missing column number {} in the dataset.".
                 format(column))
            yield []
        else:
            yield fields[column - 1].split()
def load_dataset_from_files(
        name: str,
        lazy: bool = False,
        preprocessors: List[Tuple[str, str, Callable]] = None,
        **kwargs) -> "Dataset":
    """Load a dataset from the files specified by the provided arguments.

    Deprecated: a warning recommending ``dataset.load`` is emitted on every
    call. The arguments are translated into a call to ``load``.

    Keyword arguments:
        name: The name of the dataset, handed over to ``load``.
        lazy: Boolean flag specifying whether to use lazy loading. When set,
            a buffer size of 5000 is passed to ``load``, otherwise None.
            Defaults to False.
        preprocessors: Optional list of ``(source, target, function)``
            triples. Each one adds the series ``target`` whose data entry
            is the pair ``(function, source)``.
        kwargs: Dataset keyword argument specs. These parameters should
            begin with 's_' prefix and may end with '_out' suffix. For
            example, a data series 'source' which specifies the source
            sentences should be initialized with the 's_source' parameter,
            which specifies the path and optionally reader of the source
            file. If runners generate data of the 'target' series, the
            output file should be initialized with the 's_target_out'
            parameter. Series identifiers should not contain underscores.
            Dataset-level preprocessors are defined with 'pre_' prefix
            followed by a new series name. In case of the pre-processed
            series, a callable taking the dataset and returning a new
            series is expected as a value.

    Returns:
        The newly created dataset.

    Raises:
        ValueError: If the keyword arguments define no input series.
    """
    warn("Use of deprecated function. "
         "Consider using dataset.load instead.")
    check_argument_types()

    inputs = _get_series_paths_and_readers(kwargs)
    outputs = _get_series_outputs(kwargs)

    if not inputs:
        raise ValueError("No input files were provided.")

    series = list(inputs.keys())
    data = list(inputs.values())

    # Series-level preprocessors
    for src, tgt, fun in preprocessors or []:
        series.append(tgt)
        data.append((fun, src))

    # Dataset-level preprocessors
    for key in [k for k in kwargs if PREPROCESSED_SERIES.match(k)]:
        series.append(get_first_match(PREPROCESSED_SERIES, key))
        data.append(cast(DatasetPreprocess, kwargs[key]))

    buffer_size = 5000 if lazy else None
    return load(name, series, data, outputs, buffer_size, False)
def __init__(self,
             output_series: str,
             encoder: GenericModelPart,
             attribute: str = "output",
             select_session: int = None) -> None:
    """Initialize the representation runner.

    Args:
        output_series: Name of the output series with vectors.
        encoder: The encoder to use. This can be any ``GenericModelPart``
            object.
        attribute: The name of the encoder attribute that contains the
            data. A warning is emitted if it is not listed by
            ``dir(encoder)``.
        select_session: Id of the TensorFlow session used in case of model
            ensembles.
    """
    check_argument_types()

    attribute_known = attribute in dir(encoder)
    if not attribute_known:
        warn("The encoder '{}' seems not to have the specified "
             "attribute '{}'".format(encoder, attribute))

    runner_options = dict(
        modelparts=[encoder],
        tensors=[attribute],
        batch_dims=[0],
        tensors_by_name=[],
        batch_dims_by_name=[],
        select_session=select_session,
        single_tensor=True)
    TensorRunner.__init__(self, output_series, **runner_options)
def func(train_mode: tf.Tensor,
         rnn_size: int,
         encoders: List[TemporalStatefulWithOutput]) -> tf.Tensor:
    """Project the masked mean of the encoder states to ``rnn_size``.

    ``dropout_keep_prob`` is taken from the enclosing scope.

    Arguments:
        train_mode: Tensor passed to ``dropout``.
        rnn_size: Number of units of the resulting dense projection.
        encoders: A list that must contain exactly one encoder exposing
            ``temporal_states`` and ``temporal_mask``.

    Returns:
        The output of a tanh dense layer named ``encoders_projection``.

    Raises:
        ValueError: If the number of encoders is not exactly one.
    """
    if len(encoders) != 1:
        raise ValueError("Exactly one encoder required for this type of "
                         "projection. {} given.".format(len(encoders)))
    encoder = encoders[0]

    # Zero out the states at masked positions and sum over axis 1
    # (presumably the time axis -- TODO confirm against the encoder).
    weighted_states = (encoder.temporal_states
                       * tf.expand_dims(encoder.temporal_mask, 2))
    masked_sum = tf.reduce_sum(weighted_states, 1)

    # Sum of the mask along axis 1 with the reduced dimension kept.
    lengths = tf.reduce_sum(encoder.temporal_mask, 1, keep_dims=True)

    means = dropout(masked_sum / lengths, dropout_keep_prob, train_mode)
    encoder_rnn_size = means.get_shape()[1].value

    kernel_initializer = orthogonal_initializer()
    sizes_match = encoder_rnn_size == rnn_size
    if not sizes_match:
        # Different initializer when the projection is not square.
        kernel_initializer = tf.glorot_normal_initializer()
        warn("Using nematus projection on nonequal encoder and decoder "
             "state sizes ({} vs {})".format(encoder_rnn_size, rnn_size))

    return tf.layers.dense(means,
                           rnn_size,
                           activation=tf.tanh,
                           kernel_initializer=kernel_initializer,
                           name="encoders_projection")
def reader(files: List[str]) -> Iterable[List[str]]:
    """Yield the whitespace-split content of one CSV column for each line.

    ``encoding``, ``delimiter``, ``quotechar`` and the 1-based ``column``
    index are taken from the enclosing scope. Lines are obtained from the
    callable returned by ``string_reader(encoding)``.

    A warning is emitted when the number of columns differs from the first
    parsed line. When the requested column is missing, or the line is blank,
    a warning is emitted and an empty list is yielded, so one item is
    produced for every input line.

    Arguments:
        files: The files passed to the line reader.

    Returns:
        An iterable with one list of tokens per input line.
    """
    column_count = None
    text_reader = string_reader(encoding)

    if quotechar is not None:
        csv_options = {"quotechar": quotechar}
    else:
        csv_options = {"quoting": csv.QUOTE_NONE}

    for line in text_reader(files):
        rows = list(csv.reader(io.StringIO(line.strip()),
                               delimiter=delimiter,
                               skipinitialspace=True,
                               **csv_options))

        if rows:
            fields = rows[0]
            if column_count is None:
                column_count = len(fields)
            elif column_count != len(fields):
                warn("A mismatch in number of columns. Expected {} got {}"
                     .format(column_count, len(fields)))
        else:
            # csv.reader produces no row at all for a blank line; indexing
            # rows[0] used to raise IndexError here.
            fields = []

        if len(fields) < column:
            warn("There is a missing column number {} in the dataset."
                 .format(column))
            yield []
        else:
            yield fields[column - 1].split()
def __init__(self,
             output_series: str,
             encoder: GenericModelPart,
             attribute: str = "output",
             select_session: int = None) -> None:
    """Initialize the representation runner.

    Args:
        output_series: Name of the output series with vectors.
        encoder: The encoder to use. This can be any ``GenericModelPart``
            object.
        attribute: The name of the encoder attribute that contains the
            data. A warning is emitted if it is not listed by
            ``dir(encoder)``.
        select_session: Id of the TensorFlow session used in case of model
            ensembles.
    """
    check_argument_types()

    if attribute in dir(encoder):
        pass
    else:
        warn("The encoder '{}' seems not to have the specified "
             "attribute '{}'".format(encoder, attribute))

    # A single tensor, looked up by attribute name on the single model part.
    model_parts = [encoder]
    tensor_names = [attribute]
    TensorRunner.__init__(self,
                          output_series,
                          modelparts=model_parts,
                          tensors=tensor_names,
                          batch_dims=[0],
                          tensors_by_name=[],
                          batch_dims_by_name=[],
                          select_session=select_session,
                          single_tensor=True)
def __init__(self,
             wrapper: str,
             name: str = "MultEval",
             encoding: str = "utf-8",
             metric: str = "bleu",
             language: str = "en") -> None:
    """Initialize the wrapper.

    The parent initializer receives ``"<name>_<metric>_<language>"`` built
    from the arguments as they were passed in.

    Arguments:
        wrapper: Path to multeval.sh script
        name: Name of the evaluator
        encoding: Encoding of input files
        metric: Evaluation metric "bleu", "ter", "meteor". Any other value
            triggers a warning and "bleu" is stored instead.
        language: Language of hypotheses and references
    """
    check_argument_types()
    evaluator_name = "{}_{}_{}".format(name, metric, language)
    super().__init__(evaluator_name)

    self.wrapper = wrapper
    self.encoding = encoding
    self.language = language
    self.metric = metric

    supported_metrics = ["bleu", "ter", "meteor"]
    if self.metric in supported_metrics:
        return

    warn("{} metric is not valid. Using bleu instead.".
         format(self.metric))
    self.metric = "bleu"
def from_wordlist(path: str,
                  encoding: str = "utf-8",
                  contains_header: bool = True,
                  contains_frequencies: bool = True) -> "Vocabulary":
    """Load a vocabulary from a wordlist.

    The file can contain either list of words with no header. Or it can
    contain words and their counts separated by tab and a header on the
    first line.

    Arguments:
        path: The path to the wordlist file
        encoding: The encoding of the wordlist file (defaults to UTF-8)
        contains_header: if the file have a header on first line
        contains_frequencies: if the file contains frequencies in second
            column. Only the first tab-separated column (the word) is
            used; the count itself is not passed to the vocabulary.

    Returns:
        The new Vocabulary instance.
    """
    vocabulary = Vocabulary()

    with open(path, encoding=encoding) as wordlist:
        first_line_number = 1
        if contains_header:
            # Skip the header; the default keeps an empty file from
            # leaking StopIteration to the caller.
            next(wordlist, None)
            first_line_number = 2

        for line_number, line in enumerate(wordlist, first_line_number):
            line = line.strip()
            if not line:
                warn("Vocabulary file {}:{}: line empty".format(
                    path, line_number))
                continue

            if contains_frequencies:
                word = line.split("\t")[0]
            else:
                # Plain wordlist: previously such lines were read but the
                # words were never added, yielding an empty vocabulary.
                if "\t" in line:
                    warn("Vocabulary file {}:{}: line contains a tabulator"
                         .format(path, line_number))
                word = line
            vocabulary.add_word(word)

    log("Vocabulary from wordlist loaded, containing {} words".format(
        len(vocabulary)))
    vocabulary.log_sample()
    return vocabulary
def rnn_layer(rnn_input: tf.Tensor,
              lengths: tf.Tensor,
              rnn_spec: RNNSpec,
              add_residual: bool) -> Tuple[tf.Tensor, tf.Tensor]:
    """Construct a RNN layer given its inputs and specs.

    Arguments:
        rnn_input: The input sequence to the RNN.
        lengths: Lengths of input sequences.
        rnn_spec: A valid RNNSpec tuple specifying the network architecture.
        add_residual: Add residual connections to the layer output. If the
            last dimensions of input and output differ, a warning is
            emitted and the output is first passed through a dense layer
            with the input's last dimension.

    Returns:
        A tuple of the layer outputs and the final state. For LSTM cells
        the ``h`` part of the state is returned; for bidirectional layers
        both directions are concatenated.
    """
    if rnn_spec.direction == "bidirectional":
        fw_cell = _make_rnn_cell(rnn_spec)
        bw_cell = _make_rnn_cell(rnn_spec)

        outputs_tup, states_tup = tf.nn.bidirectional_dynamic_rnn(
            fw_cell, bw_cell, rnn_input, sequence_length=lengths,
            dtype=tf.float32)

        outputs = tf.concat(outputs_tup, 2)

        if rnn_spec.cell_type == "LSTM":
            states_tup = [state.h for state in states_tup]

        final_state = tf.concat(list(states_tup), 1)
    else:
        backward = rnn_spec.direction == "backward"

        # Keep ``rnn_input`` untouched: the residual connection below must
        # add the input in its original time order. Previously the reversed
        # input overwrote ``rnn_input`` and was added to the outputs that
        # had already been reversed back.
        cell_input = rnn_input
        if backward:
            cell_input = tf.reverse_sequence(rnn_input, lengths, seq_axis=1)

        cell = _make_rnn_cell(rnn_spec)
        outputs, final_state = tf.nn.dynamic_rnn(
            cell, cell_input, sequence_length=lengths, dtype=tf.float32)

        if backward:
            outputs = tf.reverse_sequence(outputs, lengths, seq_axis=1)

        if rnn_spec.cell_type == "LSTM":
            final_state = final_state.h

    if add_residual:
        if outputs.get_shape()[-1].value != rnn_input.get_shape()[-1].value:
            # Format arguments follow the order of the message: input
            # shape first, output shape second (they used to be swapped).
            warn("Size of the RNN layer input ({}) and layer output ({}) "
                 "must match when applying residual connection. Reshaping "
                 "the rnn output using linear projection.".format(
                     rnn_input.get_shape(), outputs.get_shape()))

            # pylint: disable=redefined-variable-type
            outputs = tf.layers.dense(outputs,
                                      rnn_input.shape.as_list()[-1])
            # pylint: enable=redefined-variable-type
        outputs += rnn_input

    return outputs, final_state
def initialize_model(tf_manager: TensorFlowManager,
                     initial_variables: Optional[List[str]],
                     executables: List[GraphExecutor]) -> None:
    """Initialize the model or restore it from the given variables.

    Arguments:
        tf_manager: The manager used for the initialization or restoring.
        initial_variables: Passed to ``tf_manager.restore`` when not None.
            When None, ``tf_manager.initialize_model_parts`` is called on
            the executables instead (with ``save=True``).
        executables: The graph executors whose model parts are initialized
            when no initial variables are given.
    """
    if initial_variables is None:
        tf_manager.initialize_model_parts(executables, save=True)
        return

    # Global initial variables were supplied, so the per-part
    # initialization above (and with it, presumably, the coder
    # checkpoints) is skipped.
    try:
        tf_manager.restore(initial_variables)
    except tf.errors.NotFoundError:
        # Best effort: a partially matching checkpoint only warns.
        warn("Some variables were not found in checkpoint.")
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    """Extend the ``ModelPart`` feed dict with the reference alignment.

    Arguments:
        dataset: The dataset; the series ``self.data_id`` is looked up with
            ``maybe_get_series``.
        train: When True and the series is missing, a warning is emitted.

    Returns:
        The feed dict with ``self.ref_alignment`` set either to the series
        or, when the series is missing, to a float32 array of zeros.
    """
    fd = ModelPart.feed_dict(self, dataset, train)

    reference = dataset.maybe_get_series(self.data_id)
    if reference is not None:
        fd[self.ref_alignment] = reference
        return fd

    if train:
        warn("Training alignment not present!")

    # Fall back to an all-zero alignment.
    zeros_shape = (len(dataset),
                   self.decoder.max_output_len,
                   self.enc_input.max_length)
    fd[self.ref_alignment] = np.zeros(zeros_shape, np.float32)
    return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    """Build a feed dict holding only the reference alignment.

    Arguments:
        dataset: The dataset; the series ``self.data_id`` is requested with
            ``allow_none=True``.
        train: When True and the series is missing, a warning is emitted.

    Returns:
        A dict mapping ``self.ref_alignment`` either to the series or,
        when the series is missing, to a float32 array of zeros.
    """
    reference = dataset.get_series(self.data_id, allow_none=True)

    if reference is None:
        if train:
            warn("Training alignment not present!")

        # Fall back to an all-zero alignment.
        zeros_shape = (len(dataset),
                       self.decoder.max_output_len,
                       self.encoder.max_input_len)
        reference = np.zeros(zeros_shape, np.float32)

    return {self.ref_alignment: reference}
def embedding_size(self) -> int:
    """Return the embedding size of this object.

    With ``embeddings_source`` set, the size is read from the second
    dimension of its ``embedding_matrix``; an explicitly configured
    ``_embedding_size`` is then ignored with a warning. Without a source,
    the configured ``_embedding_size`` is returned.

    Raises:
        ValueError: If neither the embeddings source nor the embedding
            size is set.
    """
    if self.embeddings_source is not None:
        if self._embedding_size is not None:
            warn("Overriding the embedding_size parameter with the "
                 "size of the reused embeddings from the encoder.")
        matrix_shape = self.embeddings_source.embedding_matrix.get_shape()
        return matrix_shape[1].value

    if self._embedding_size is None:
        raise ValueError(
            "You must specify either embedding size or the embedded "
            "sequence from which to reuse the embeddings (e.g. set "
            "'embedding_size' or 'embeddings_source' parameter)")
    return self._embedding_size
def embedding_size(self) -> int:
    """Return the embedding size of this object.

    Without ``embeddings_source`` the configured ``_embedding_size`` is
    returned. Otherwise the size is the second dimension of the source's
    ``embedding_matrix``, and a configured ``_embedding_size`` is
    overridden with a warning.

    Raises:
        ValueError: If neither the embeddings source nor the embedding
            size is set.
    """
    reuses_embeddings = self.embeddings_source is not None
    own_size_given = self._embedding_size is not None

    if not reuses_embeddings:
        if not own_size_given:
            raise ValueError(
                "You must specify either embedding size or the embedded "
                "sequence from which to reuse the embeddings (e.g. set "
                "'embedding_size' or 'embeddings_source' parameter)")
        return self._embedding_size

    if own_size_given:
        warn("Overriding the embedding_size parameter with the "
             "size of the reused embeddings from the encoder.")

    return self.embeddings_source.embedding_matrix.get_shape()[1].value
def __init__(self,
             name: str,
             reuse: "Parameterized" = None,
             save_checkpoint: str = None,
             load_checkpoint: str = None,
             initializers: InitializerSpecs = None) -> None:
    """Construct a new parameterized object.

    Arguments:
        name: The name for the model part. Will be used in the variable
            and name scopes.
        reuse: Optional parameterized part with which to share
            parameters. Its variable scope is taken over; sharing between
            objects of different types only emits a warning.
        save_checkpoint: Optional path to a checkpoint file which will
            store the parameters of this object.
        load_checkpoint: Optional path to a checkpoint file from which to
            load initial variables for this object.
        initializers: An `InitializerSpecs` instance with specification
            of the initializers.

    Raises:
        ValueError: If initializers are given together with ``reuse``.
    """
    self._name = name
    self._save_checkpoint = save_checkpoint
    self._load_checkpoint = load_checkpoint
    self._saver = None  # type: tf.train.Saver
    self._reuse = reuse is not None

    if reuse is None:
        # Fresh variable scope named after this object.
        with tf.variable_scope(name) as scope:
            self._variable_scope = scope
            if initializers is not None:
                # Initializer names are registered under the scope name.
                update_initializers(
                    (scope.name + "/" + var_name, var_init)
                    for var_name, var_init in initializers)
        return

    # pylint: disable=unidiomatic-typecheck
    # Here we need an exact match of types
    if type(self) != type(reuse):
        warn("Warning: sharing parameters between model parts of "
             "different types.")
    # pylint: enable=unidiomatic-typecheck

    if initializers is not None:
        raise ValueError("Cannot use initializers in model part '{}' "
                         "that reuses variables from '{}'.".format(
                             name, reuse.name))

    # pylint: disable=protected-access
    self._variable_scope = reuse._variable_scope  # type: ignore
    # pylint: enable=protected-access
def __init__(self,
             name: str,
             reuse: "Parameterized" = None,
             save_checkpoint: str = None,
             load_checkpoint: str = None,
             initializers: InitializerSpecs = None) -> None:
    """Construct a new parameterized object.

    Arguments:
        name: The name for the model part. Will be used in the variable
            and name scopes.
        reuse: Optional parameterized part with which to share
            parameters. Its variable scope is taken over; sharing between
            objects of different types only emits a warning.
        save_checkpoint: Optional path to a checkpoint file which will
            store the parameters of this object.
        load_checkpoint: Optional path to a checkpoint file from which to
            load initial variables for this object.
        initializers: An `InitializerSpecs` instance with specification
            of the initializers.

    Raises:
        ValueError: If initializers are given together with ``reuse``.
    """
    self._name = name
    self._save_checkpoint = save_checkpoint
    self._load_checkpoint = load_checkpoint
    self._saver = None  # type: tf.train.Saver

    shares_variables = reuse is not None
    self._reuse = shares_variables

    if shares_variables:
        # pylint: disable=unidiomatic-typecheck
        # Here we need an exact match of types
        same_type = type(self) == type(reuse)
        # pylint: enable=unidiomatic-typecheck
        if not same_type:
            warn("Warning: sharing parameters between model parts of "
                 "different types.")

        if initializers is not None:
            raise ValueError("Cannot use initializers in model part '{}' "
                             "that reuses variables from '{}'."
                             .format(name, reuse.name))

        # pylint: disable=protected-access
        self._variable_scope = reuse._variable_scope  # type: ignore
        # pylint: enable=protected-access
    else:
        with tf.variable_scope(name) as scope:
            self._variable_scope = scope
            if initializers is not None:
                # Initializer names are registered under the scope name.
                update_initializers(
                    (scope.name + "/" + spec_name, spec_init)
                    for spec_name, spec_init in initializers)
def regularization_losses(self) -> Tuple[tf.Tensor, tf.Tensor]:
    """Compute the regularization losses, e.g. L1 and L2.

    Trainable variables are skipped when their name matches ``BIAS_REGEX``
    or starts with "vgg", "Inception" or "resnet".

    Returns:
        A pair ``(l1_norm, l2_norm)``: the sums of absolute values and of
        squares over the remaining variables. Two scalar zero tensors are
        returned (with a warning) when no variable remains.
    """
    def _is_regularized(var_name: str) -> bool:
        if BIAS_REGEX.findall(var_name):
            return False
        if var_name.startswith("vgg"):
            return False
        if var_name.startswith("Inception"):
            return False
        return not var_name.startswith("resnet")

    regularizable = [var for var in tf.trainable_variables()
                     if _is_regularized(var.name)]

    if not regularizable:
        warn("It seems that there are no trainable variables in the model")
        return tf.zeros([]), tf.zeros([])

    with tf.name_scope("regularization"):
        l1_norm = sum(tf.reduce_sum(abs(var)) for var in regularizable)
        l2_norm = sum(tf.reduce_sum(var ** 2) for var in regularizable)

    return l1_norm, l2_norm
def initialize_vocabulary(directory: str,
                          name: str,
                          datasets: List[Dataset] = None,
                          series_ids: List[str] = None,
                          max_size: int = None) -> "Vocabulary":
    """Initialize a vocabulary.

    This function is supposed to initialize vocabulary when called from
    the configuration file. It first checks whether the vocabulary file
    ``<directory>/<name>.pickle`` exists and loads it with
    ``from_wordlist``. If not, it generates the vocabulary from the
    provided datasets with ``from_dataset`` and saves it to that file.

    A deprecation warning is emitted on every call.

    Args:
        directory: Directory where the vocabulary should be stored.
        name: Name of the vocabulary which is also the name of the file
            it is stored it.
        datasets: A list of datasets from which the vocabulary can be
            created.
        series_ids: A list of ids of series of the datasets that should be
            used for producing the vocabulary.
        max_size: The maximum size of the vocabulary

    Returns:
        The new vocabulary

    Raises:
        Exception: If the file does not exist and any of ``datasets``,
            ``series_ids`` or ``max_size`` is missing.
    """
    warn("Use of deprecated initialize_vocabulary method. "
         "Did you think this through?")

    file_name = os.path.join(directory, name + ".pickle")
    if os.path.exists(file_name):
        return from_wordlist(file_name)

    if datasets is None or series_ids is None or max_size is None:
        # The placeholder used to be left unformatted, so the message
        # literally contained '{}' instead of the file path.
        raise Exception("Vocabulary does not exist in '{}', "
                        "neither dataset and series_id were provided."
                        .format(file_name))

    return from_dataset(datasets, series_ids, max_size,
                        save_file=file_name, overwrite=False)
def truncate(self, size: int) -> None:
    """Truncate the vocabulary to the requested size.

    The least frequent tokens that are not special tokens are discarded.
    If the vocabulary is already smaller than ``size``, a warning is
    emitted and nothing is removed.

    Arguments:
        size: The final size of the vocabulary

    Raises:
        ValueError: If ``self.correct_counts`` is not set.
    """
    if not self.correct_counts:
        raise ValueError("The vocabulary does not have correct "
                         "word_counts to use for vocabulary truncate")

    # Alphabetical order first, then a stable sort by frequency: ties are
    # broken deterministically.
    candidates = sorted(self.word_count.keys())
    candidates.sort(key=lambda w: self.word_count[w])

    to_delete = len(self) - size
    if to_delete < 0:
        to_delete = 0
        warn("Actual vocabulary size ({}) is smaller than max_size ({})".
             format(len(self), size))

    # Collect the least frequent words which are not special symbols.
    words_to_delete = []  # type: List[str]
    for candidate in candidates:
        if len(words_to_delete) >= to_delete:
            break
        if is_special_token(candidate):
            continue
        words_to_delete.append(candidate)

    # Bigger indices need to be removed first so that the remaining
    # indices stay valid while deleting from the list.
    indexed = [(w, self.word_to_index[w]) for w in words_to_delete]
    indexed.sort(key=lambda pair: -pair[1])
    for word, index in indexed:
        del self.word_count[word]
        del self.index_to_word[index]

    # Rebuild the reverse mapping from the shrunken list.
    self.word_to_index = {}
    for index, word in enumerate(self.index_to_word):
        self.word_to_index[word] = index
def from_wordlist(path: str,
                  encoding: str = "utf-8",
                  contains_header: bool = True,
                  contains_frequencies: bool = True) -> 'Vocabulary':
    """Load a vocabulary from a wordlist.

    The file can contain either list of words with no header. Or it can
    contain words and their counts separated by tab and a header on the
    first line. Empty lines are skipped.

    Arguments:
        path: The path to the wordlist file
        encoding: The encoding of the wordlist file (defaults to UTF-8)
        contains_header: if the file have a header on first line
        contains_frequencies: if the file contains frequencies in second
            column

    Returns:
        The new Vocabulary instance.

    Raises:
        ValueError: If frequencies are expected and a line does not
            consist of exactly two tab-separated columns.
    """
    vocabulary = Vocabulary()

    with open(path, encoding=encoding) as wordlist:
        if contains_header:
            # skip the header
            next(wordlist)

        for raw_line in wordlist:
            entry = raw_line.strip()
            if not entry:
                continue

            if not contains_frequencies:
                if '\t' in entry:
                    warn("The vocabulary contains a tabulator")
                vocabulary.add_word(entry)
                continue

            columns = entry.split('\t')
            if len(columns) != 2:
                raise ValueError("Vocabulary file do not have two columns")
            word, count = columns
            vocabulary.add_word(word, int(count))

    log("Vocabulary from wordlist loaded, containing {} words".format(
        len(vocabulary)))
    vocabulary.log_sample()
    return vocabulary
def save_git_info(git_commit_file: str,
                  git_diff_file: str,
                  branch: str = "HEAD",
                  repo_dir: str = None) -> None:
    """Store the commit hash and the diff against ``branch`` into files.

    When no ``git`` executable is found, only a warning is emitted and no
    file is written.

    Arguments:
        git_commit_file: File receiving the output of
            ``git log -1 --format=%H <branch>``.
        git_diff_file: File receiving the output of
            ``git --no-pager diff --color=always <branch>``.
        branch: The revision both commands refer to.
        repo_dir: Working directory for the git commands. Defaults to the
            directory of this source file.
    """
    if shutil.which("git") is None:
        warn("No git executable found. Not storing git commit and diffs")
        return

    if repo_dir is None:
        # This points inside the neuralmonkey/ dir inside the repo, but
        # it does not matter for git.
        repo_dir = os.path.dirname(os.path.realpath(__file__))

    commit_command = ["git", "log", "-1", "--format=%H", branch]
    diff_command = ["git", "--no-pager", "diff", "--color=always", branch]

    with open(git_commit_file, "wb") as commit_out:
        subprocess.run(commit_command, cwd=repo_dir, stdout=commit_out)

    with open(git_diff_file, "wb") as diff_out:
        subprocess.run(diff_command, cwd=repo_dir, stdout=diff_out)
def build_config(config_dicts: Dict[str, Any],
                 ignore_names: Set[str],
                 warn_unused: bool = False) -> Tuple[Dict[str, Any],
                                                     Dict[str, Any]]:
    """Build the model from the configuration.

    The entries of the main section are built in alphabetical order of
    their keys, except for ``tf_manager`` which is sorted as "zzz".

    Arguments:
        config_dicts: The parsed configuration file
        ignore_names: A set of names that should be ignored during the
            loading.
        warn_unused: Emit a warning if there are unused sections.

    Returns:
        A tuple containing a dictionary corresponding to the main section
        and a dictionary mapping section names to objects.

    Raises:
        Exception: If there is no "main" section.
        ConfigBuildException: If building any main entry fails.
    """
    if "main" not in config_dicts:
        raise Exception("Configuration does not contain the main block.")

    main_config = config_dicts["main"]

    existing_objects = collections.OrderedDict()  # type: Dict[str, Any]
    existing_objects["main"] = Namespace(**main_config)

    configuration = collections.OrderedDict()  # type: Dict[str, Any]

    def _build_order(key: str) -> str:
        # TODO ensure tf_manager goes last in a better way
        return "zzz" if key == "tf_manager" else key

    for key in sorted(main_config, key=_build_order):
        if key in ignore_names:
            continue
        try:
            configuration[key] = build_object(
                main_config[key], config_dicts, existing_objects, 0)
        except Exception as exc:
            raise ConfigBuildException(key, exc) from None

    if warn_unused:
        used_sections = set(existing_objects.keys()) | {"main"}
        unused = config_dicts.keys() - used_sections
        if unused:
            warn("Configuration contains unused sections: "
                 + str(unused) + ".")

    return configuration, existing_objects
def get_executable(self,
                   compute_losses: bool,
                   summaries: bool,
                   num_sessions: int) -> TensorExecutable:
    """Create the executable fetching the configured tensors.

    Tensors listed in ``self._names`` are looked up by name in the default
    graph; a name that is not found only triggers a warning. Tensors in
    ``self._tensors`` are fetched under their own ``name``.

    Arguments:
        compute_losses: Not used by this method.
        summaries: Not used by this method.
        num_sessions: Not used by this method.

    Returns:
        A ``TensorExecutable`` built from the coders, the fetches, the
        batch dimension of each fetch and the selected session.
    """
    fetches = {}
    batch_ids = {}

    for name, bid in zip(self._names, self._batch_dims_name):
        try:
            tensor = tf.get_default_graph().get_tensor_by_name(name)
        except KeyError:
            warn(("The tensor of name '{}' is not present in the "
                  "graph.").format(name))
            continue
        fetches[name] = tensor
        batch_ids[name] = bid

    for tensor, bid in zip(self._tensors, self._batch_dims_ref):
        tensor_name = tensor.name
        fetches[tensor_name] = tensor
        batch_ids[tensor_name] = bid

    return TensorExecutable(
        self.all_coders, fetches, batch_ids, self._select_session)
def build_config(
        config_dicts: Dict[str, Any],
        ignore_names: Set[str],
        warn_unused: bool = False) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Build the model from the configuration.

    The entries of the main section are built in alphabetical order of
    their keys, except for ``tf_manager`` which is sorted as "zzz".

    Arguments:
        config_dicts: The parsed configuration file
        ignore_names: A set of names that should be ignored during the
            loading.
        warn_unused: Emit a warning if there are unused sections.

    Returns:
        A tuple containing a dictionary corresponding to the main section
        and a dictionary mapping section names to objects.

    Raises:
        Exception: If there is no "main" section.
        ConfigBuildException: If building any main entry fails.
    """
    if "main" not in config_dicts:
        raise Exception("Configuration does not contain the main block.")

    existing_objects = collections.OrderedDict()  # type: Dict[str, Any]
    configuration = collections.OrderedDict()  # type: Dict[str, Any]

    main_config = config_dicts["main"]
    existing_objects["main"] = Namespace(**main_config)

    # TODO ensure tf_manager goes last in a better way
    ordered_keys = sorted(
        main_config,
        key=lambda name: name if name != "tf_manager" else "zzz")

    for key in ordered_keys:
        if key not in ignore_names:
            value = main_config[key]
            try:
                configuration[key] = build_object(value, config_dicts,
                                                  existing_objects, 0)
            except Exception as exc:
                raise ConfigBuildException(key, exc) from None

    if not warn_unused:
        return configuration, existing_objects

    existing_names = set(existing_objects.keys()) | {"main"}
    unused = config_dicts.keys() - existing_names
    if unused:
        warn("Configuration contains unused sections: "
             + str(unused) + ".")

    return configuration, existing_objects
def load(list_files: List[str]) -> Iterable[np.ndarray]:
    """Yield one padded image array per line of the given list files.

    ``prefix``, ``mode``, ``pad_w``, ``pad_h``, ``rescale_w``,
    ``rescale_h``, ``keep_aspect_ratio`` and ``channels`` are taken from
    the enclosing scope. An image that cannot be opened is replaced by a
    new ``pad_w`` x ``pad_h`` image and a warning is emitted.

    Arguments:
        list_files: Files with one image path per line; each path is
            joined with ``prefix``.

    Returns:
        An iterable over the results of ``_pad`` for each image.

    Raises:
        Exception: If an image file does not exist.
        ValueError: If the image array is neither 2- nor 3-dimensional or
            its number of channels differs from ``channels``.
    """
    for list_file in list_files:
        with open(list_file) as f_list:
            for line_no, line in enumerate(f_list, start=1):
                path = os.path.join(prefix, line.rstrip())

                if not os.path.exists(path):
                    raise Exception(
                        ("Image file '{}' no."
                         "{} does not exist.").format(path, line_no))

                try:
                    image = Image.open(path).convert(mode)
                except IOError:
                    warn("Skipping image from file '{}' no. '{}'.".format(
                        path, line_no))
                    image = Image.new(mode, (pad_w, pad_h))

                image = _rescale_or_crop(image, pad_w, pad_h,
                                         rescale_w, rescale_h,
                                         keep_aspect_ratio)
                image_np = np.array(image)

                rank = len(image_np.shape)
                if rank == 2:
                    # No channel axis: add one of size 1.
                    img_channels = 1
                    image_np = np.expand_dims(image_np, 2)
                elif rank == 3:
                    img_channels = image_np.shape[2]
                else:
                    raise ValueError(
                        ("Image should have either 2 (black and white) "
                         "or three dimensions (color channels), has {} "
                         "dimension.").format(rank))

                if channels != img_channels:
                    raise ValueError(
                        "Image does not have the pre-declared number of "
                        "channels {}, but {}.".format(
                            channels, img_channels))

                yield _pad(image_np, pad_w, pad_h, channels)
def load(list_files: List[str]) -> Iterable[np.ndarray]:
    """Yield one padded image array per line of the given list files.

    ``prefix``, ``mode``, ``pad_w``, ``pad_h``, ``rescale_w``,
    ``rescale_h``, ``keep_aspect_ratio`` and ``channels`` are taken from
    the enclosing scope. An image that cannot be opened is replaced by a
    new ``pad_w`` x ``pad_h`` image and a warning is emitted.

    Arguments:
        list_files: Files with one image path per line; each path is
            joined with ``prefix``.

    Returns:
        An iterable over the results of ``_pad`` for each image.

    Raises:
        Exception: If an image file does not exist.
        ValueError: If the image array is neither 2- nor 3-dimensional or
            its number of channels differs from ``channels``.
    """
    for list_file in list_files:
        with open(list_file) as f_list:
            for position, image_file in enumerate(f_list):
                ordinal = position + 1
                path = os.path.join(prefix, image_file.rstrip())

                if not os.path.exists(path):
                    raise Exception(
                        ("Image file '{}' no."
                         "{} does not exist.").format(path, ordinal))

                try:
                    image = Image.open(path).convert(mode)
                except IOError:
                    warn("Skipping image from file '{}' no. '{}'.".format(
                        path, ordinal))
                    image = Image.new(mode, (pad_w, pad_h))

                image = _rescale_or_crop(image, pad_w, pad_h,
                                         rescale_w, rescale_h,
                                         keep_aspect_ratio)
                image_np = np.array(image)
                n_dims = len(image_np.shape)

                if n_dims not in (2, 3):
                    raise ValueError(
                        ("Image should have either 2 (black and white) "
                         "or three dimensions (color channels), has {} "
                         "dimension.").format(n_dims))

                if n_dims == 2:
                    # No channel axis: add one of size 1.
                    img_channels = 1
                    image_np = np.expand_dims(image_np, 2)
                else:
                    img_channels = image_np.shape[2]

                if channels != img_channels:
                    raise ValueError(
                        "Image does not have the pre-declared number of "
                        "channels {}, but {}.".format(
                            channels, img_channels))

                yield _pad(image_np, pad_w, pad_h, channels)
def save_git_info(git_commit_file: str,
                  git_diff_file: str,
                  branch: str = "HEAD",
                  repo_dir: str = None) -> None:
    """Store the commit hash and the diff against ``branch`` into files.

    When no ``git`` executable is found, only a warning is emitted and no
    file is written.

    Arguments:
        git_commit_file: File receiving the output of
            ``git log -1 --format=%H <branch>``.
        git_diff_file: File receiving the output of
            ``git --no-pager diff --color=always <branch>``.
        branch: The revision both commands refer to.
        repo_dir: Working directory for the git commands. Defaults to the
            directory of this source file.
    """
    git_available = shutil.which("git") is not None
    if not git_available:
        warn("No git executable found. Not storing git commit and diffs")
        return

    if repo_dir is None:
        # This points inside the neuralmonkey/ dir inside the repo, but
        # it does not matter for git.
        repo_dir = os.path.dirname(os.path.realpath(__file__))

    # (target file, git command) pairs, executed in this order.
    jobs = [
        (git_commit_file,
         ["git", "log", "-1", "--format=%H", branch]),
        (git_diff_file,
         ["git", "--no-pager", "diff", "--color=always", branch]),
    ]
    for target, command in jobs:
        with open(target, "wb") as sink:
            subprocess.run(command, cwd=repo_dir, stdout=sink)
def build_config(config_dicts: Dict[str, Any],
                 ignore_names: Set[str],
                 warn_unused: bool = False) -> Dict[str, Any]:
    """Build the model from the configuration.

    The entries of the main section are built in alphabetical order of
    their keys, except for ``tf_manager`` which is sorted as "zzz".

    Arguments:
        config_dicts: The parsed configuration file
        ignore_names: A set of names that should be ignored during the
            loading.
        warn_unused: Emit a warning if there are unused sections.

    Returns:
        A dictionary with the built values of the main section.

    Raises:
        Exception: If there is no "main" section.
        ConfigBuildException: If building any main entry fails.
    """
    if "main" not in config_dicts:
        raise Exception("Configuration does not contain the main block.")

    existing_objects = collections.OrderedDict()  # type: Dict[str, Any]
    configuration = collections.OrderedDict()  # type: Dict[str, Any]
    main_config = config_dicts['main']

    def _build_order(key: str) -> str:
        # TODO ensure tf_manager goes last in a better way
        return 'zzz' if key == 'tf_manager' else key

    for key in sorted(main_config, key=_build_order):
        if key in ignore_names:
            continue
        try:
            configuration[key] = build_object(
                main_config[key], config_dicts, existing_objects, 0)
        except Exception as exc:
            raise ConfigBuildException(key, exc) from None

    if warn_unused:
        # The first seven characters of each object key are dropped to get
        # the section name (presumably a fixed prefix -- TODO confirm).
        existing_names = {'main'}
        existing_names.update(x[7:] for x in existing_objects.keys())
        unused = config_dicts.keys() - existing_names
        if unused:
            warn("Configuration contains unused sections: "
                 + str(unused) + ".")

    return configuration
def fetches(self) -> Dict[str, tf.Tensor]:
    """Collect the tensors to fetch and record their batch dimensions.

    Tensors listed in ``self._names`` are looked up by name in the graph
    of the current experiment; a name that is not found only triggers a
    warning. Tensors listed in ``self._tensors`` are read as attributes of
    the corresponding model parts. ``self.batch_ids`` is updated with the
    batch dimension of every collected tensor.

    Returns:
        A dictionary mapping tensor names to tensors.

    Raises:
        ValueError: If a model part lacks the requested attribute.
    """
    collected = {}  # type: Dict[str, tf.Tensor]

    for name, bid in zip(self._names, self._batch_dims_name):
        try:
            graph = Experiment.get_current().graph
            collected[name] = graph.get_tensor_by_name(name)
            self.batch_ids[name] = bid
        except KeyError:
            warn(("The tensor of name '{}' is not present in the "
                  "graph.").format(name))

    part_specs = zip(self._modelparts, self._tensors, self.batch_dims)
    for mpart, tname, bid in part_specs:
        if not hasattr(mpart, tname):
            raise ValueError("Model part {} does not have a tensor called "
                             "{}.".format(mpart, tname))

        tensorval = getattr(mpart, tname)
        collected[tensorval.name] = tensorval
        self.batch_ids[tensorval.name] = bid

    return collected
def fetches(self) -> Dict[str, tf.Tensor]:
    """Collect the tensors to fetch and record their batch dimensions.

    Tensors listed in ``self._names`` are looked up by name in the graph
    of the current experiment; a name that is not found only triggers a
    warning. Tensors listed in ``self._tensors`` are read as attributes of
    the corresponding model parts. ``self.batch_ids`` is updated with the
    batch dimension of every collected tensor.

    Returns:
        A dictionary mapping tensor names to tensors.

    Raises:
        ValueError: If a model part lacks the requested attribute.
    """
    by_name = {}  # type: Dict[str, tf.Tensor]

    # Tensors addressed by their graph name.
    for tensor_name, batch_dim in zip(self._names, self._batch_dims_name):
        try:
            by_name[tensor_name] = (
                Experiment.get_current().graph.get_tensor_by_name(
                    tensor_name))
            self.batch_ids[tensor_name] = batch_dim
        except KeyError:
            warn(("The tensor of name '{}' is not present in the "
                  "graph.").format(tensor_name))

    # Tensors addressed as attributes of model parts.
    for mpart, attr_name, batch_dim in zip(self._modelparts,
                                           self._tensors,
                                           self.batch_dims):
        if hasattr(mpart, attr_name):
            tensorval = getattr(mpart, attr_name)
            by_name[tensorval.name] = tensorval
            self.batch_ids[tensorval.name] = batch_dim
        else:
            raise ValueError("Model part {} does not have a tensor called "
                             "{}.".format(mpart, attr_name))

    return by_name
def score_batch(self,
                hypotheses: List[List[str]],
                references: List[List[str]]) -> float:
    """Score a batch of hypotheses by calling the external MultEval wrapper.

    Both inputs are serialized with ``self.serialize_to_bytes`` into
    temporary files which are passed to the ``self.wrapper`` executable.
    The score is read from the second whitespace-separated field of the
    second line of the wrapper's standard output and divided by 100.

    Arguments:
        hypotheses: Tokenized hypothesis sentences.
        references: Tokenized reference sentences.

    Returns:
        The parsed score divided by 100, or 0.0 when the wrapper prints
        nothing or its output cannot be parsed (a warning is emitted in the
        latter case).
    """
    ref_bytes = self.serialize_to_bytes(references)
    hyp_bytes = self.serialize_to_bytes(hypotheses)

    with tempfile.NamedTemporaryFile() as reffile, \
            tempfile.NamedTemporaryFile() as hypfile:

        # Flush so that the subprocess sees the complete content.
        reffile.write(ref_bytes)
        reffile.flush()

        hypfile.write(hyp_bytes)
        hypfile.flush()

        args = [self.wrapper, "eval", "--refs", reffile.name,
                "--hyps-baseline", hypfile.name, "--metrics", self.metric]
        if self.metric == "meteor":
            args.extend(["--meteor.language", self.language])
            # problem: if meteor run for the first time,
            # paraphrase tables are downloaded

        # The temporary files must still exist while the wrapper runs.
        output_proc = subprocess.run(
            args, stderr=subprocess.PIPE, stdout=subprocess.PIPE)

    proc_stdout = output_proc.stdout.decode("utf-8")  # type: ignore
    lines = proc_stdout.splitlines()

    if not lines:
        return 0.0

    try:
        raw_score = lines[1].split()[1]
    except IndexError:
        warn("Error: Malformed output from MultEval wrapper:")
        warn(proc_stdout)
        warn("=======")
        return 0.0

    try:
        return float(raw_score) / 100.
    except ValueError:
        # Report the token that failed to parse, not the header line.
        warn("Value error - '{}' is not a number.".format(raw_score))
        return 0.0
def _normalize_train_cfg(cfg: Namespace) -> None:
    """Normalize the values of a training configuration namespace in place.

    This function is only executed when training mode has been invoked.
    It stores list-valued ``val_datasets`` and ``trainers`` attributes and
    sets ``log_timer`` and ``val_timer`` from the logging and validation
    periods.

    Arguments:
        cfg: The namespace object returned by `Configuration.make_namespace`
    """
    validation = cfg.val_dataset
    cfg.val_datasets = validation if isinstance(validation, List) \
        else [validation]

    trainer = cfg.trainer
    cfg.trainers = trainer if isinstance(trainer, List) else [trainer]

    # Delayed-update trainers determine the period denominator. With several
    # trainers the least common multiple of the delayed trainers'
    # batches_per_update is used, together with a warning, because it is a
    # very weird setup.
    delayed = [
        t for t in cfg.trainers if isinstance(t, DelayedUpdateTrainer)]

    if not delayed:
        denominator = 1
    elif len(cfg.trainers) > 1:
        warn("Weird setup: using more trainers and one of them is delayed "
             "update trainer. No-one can vouch for your safety, user!")
        warn("Using the lowest common denominator of all delayed trainers'"
             " batches_per_update parameters for logging period")
        warn("Note that if you are using a multi-task trainer, it is on "
             "your own risk")
        denominator = np.lcm.reduce(
            [t.batches_per_update for t in delayed])
    else:
        assert len(cfg.trainers) == 1
        denominator = cfg.trainers[0].batches_per_update

    cfg.log_timer = _resolve_period(cfg.logging_period, denominator)
    cfg.val_timer = _resolve_period(cfg.validation_period, denominator)
def _normalize_train_cfg(cfg: Namespace) -> None:
    """Normalize the values of a training configuration namespace in place.

    This function is only executed when training mode has been invoked.
    It stores list-valued ``val_datasets`` and ``trainers`` attributes and
    sets ``log_timer`` and ``val_timer`` from the logging and validation
    periods.

    Arguments:
        cfg: The namespace object returned by `Configuration.make_namespace`
    """
    validation = cfg.val_dataset
    cfg.val_datasets = validation if isinstance(validation, List) \
        else [validation]

    trainer = cfg.trainer
    cfg.trainers = trainer if isinstance(trainer, List) else [trainer]

    # Delayed-update trainers determine the period denominator. With several
    # trainers the least common multiple of the delayed trainers'
    # batches_per_update is used, together with a warning, because it is a
    # very weird setup.
    delayed = [
        t for t in cfg.trainers if isinstance(t, DelayedUpdateTrainer)]

    if not delayed:
        denominator = 1
    elif len(cfg.trainers) > 1:
        warn("Weird setup: using more trainers and one of them is delayed "
             "update trainer. No-one can vouch for your safety, user!")
        warn("Using the lowest common denominator of all delayed trainers'"
             " batches_per_update parameters for logging period")
        warn("Note that if you are using a multi-task trainer, it is on "
             "your own risk")
        denominator = np.lcm.reduce(
            [t.batches_per_update for t in delayed])
    else:
        assert len(cfg.trainers) == 1
        denominator = cfg.trainers[0].batches_per_update

    cfg.log_timer = _resolve_period(cfg.logging_period, denominator)
    cfg.val_timer = _resolve_period(cfg.validation_period, denominator)
def batches(self) -> Iterator["Dataset"]:
    """Split the dataset into batches.

    Rows are taken from a buffer (the whole data for in-memory datasets, a
    window of ``self.buffer_size`` rows refilled on the fly for lazy ones),
    assigned to a bucket and emitted as a new ``Dataset`` as soon as the
    bucket reaches its batch size. Unless ``self.batching.drop_remainder``
    is set, the incomplete buckets are emitted at the end.

    Returns:
        Generator yielding the batches.
    """
    if self.batching.batch_size is not None:
        max_bs = self.batching.batch_size
    else:
        assert self.batching.bucket_batch_sizes is not None
        max_bs = max(self.batching.bucket_batch_sizes)

    if self.lazy and self.buffer_min_size < max_bs:
        warn("Minimum buffer size ({}) lower than batch size ({}). "
             "It is recommended to use large buffer size."
             .format(self.buffer_min_size, max_bs))

    # Initialize iterators
    iterators = {s: it() for s, it in self.iterators.items()}

    # Create iterator over instances
    zipped_iterator = (
        dict(zip(iterators, row)) for row in zip(*iterators.values()))

    # Fill the buffer with initial values, shuffle optionally
    if self.lazy:
        # Take at most buffer_size rows. Calling next() inside a generator
        # expression would turn the StopIteration of a short dataset into
        # a RuntimeError (PEP 479); zipping with a range stops cleanly.
        lbuf = [item for _, item
                in zip(range(self.buffer_size), zipped_iterator)]
    else:
        lbuf = list(zipped_iterator)
    if self.shuffled:
        random.shuffle(lbuf)
    buf = deque(lbuf)

    def _make_datagen(rows, key):
        # Return a factory of iterators over one series of the given rows.
        def itergen():
            return (row[key] for row in rows)
        return itergen

    # Iterate over the rest of the data until buffer is empty
    batch_index = 0
    buckets = [[]]  # type: List[List[DataExample]]

    if self.batching.bucket_boundaries is not None:
        buckets += [[] for _ in self.batching.bucket_boundaries]

    while buf:
        row = buf.popleft()

        if self.batching.bucket_boundaries is None:
            bucket_id = 0
        else:
            # TODO: use only specific series to determine the bucket number
            length = max(len(row[key]) for key in row)

            # Pick the bucket with the smallest boundary the row fits in.
            # When it fits nowhere, bucket_id stays -1, i.e. the extra
            # last bucket.
            bucket_id = -1
            for b_id, limit in enumerate(self.batching.bucket_boundaries):
                fits_in = length <= limit
                tighter_fit = (
                    bucket_id == -1
                    or limit < self.batching.bucket_boundaries[bucket_id])

                if fits_in and tighter_fit:
                    bucket_id = b_id

        buckets[bucket_id].append(row)

        if self.batching.bucket_batch_sizes is None:
            assert self.batching.batch_size is not None
            is_full = (len(buckets[bucket_id])
                       >= self.batching.batch_size)
        else:
            is_full = (len(buckets[bucket_id])
                       >= self.batching.bucket_batch_sizes[bucket_id])

        if is_full:
            # Create the batch
            name = "{}.batch.{}".format(self.name, batch_index)
            data = {key: _make_datagen(buckets[bucket_id], key)
                    for key in buckets[bucket_id][0]}

            yield Dataset(
                name=name, iterators=data, batching=self.batching)
            batch_index += 1
            # Rebind rather than clear: the emitted batch keeps its rows.
            buckets[bucket_id] = []

        # If lazy, refill buffer & shuffle if needed
        # Otherwise, all of the data is already loaded in the buffer.
        if self.lazy and len(buf) < self.buffer_min_size:
            # In case buffer_size is lower than batch_size
            to_add = self.buffer_size - len(buf)

            for _, item in zip(range(to_add), zipped_iterator):
                buf.append(item)

            if self.shuffled:
                lbuf = list(buf)
                random.shuffle(lbuf)
                buf = deque(lbuf)

    if not self.batching.drop_remainder:
        for bucket in buckets:
            if bucket:
                name = "{}.batch.{}".format(self.name, batch_index)
                data = {key: _make_datagen(bucket, key)
                        for key in bucket[0]}

                yield Dataset(
                    name=name, iterators=data, batching=self.batching)
                batch_index += 1
def __init__(self,
             encoders: List[Stateful],
             vocabulary: Vocabulary,
             data_id: str,
             name: str,
             max_output_len: int,
             dropout_keep_prob: float = 1.0,
             rnn_size: int = None,
             embedding_size: int = None,
             output_projection: OutputProjectionSpec = None,
             encoder_projection: EncoderProjection = None,
             attentions: List[BaseAttention] = None,
             embeddings_source: EmbeddedSequence = None,
             attention_on_input: bool = True,
             rnn_cell: str = "GRU",
             conditional_gru: bool = False,
             save_checkpoint: str = None,
             load_checkpoint: str = None) -> None:
    """Create a refactored version of monster decoder.

    Arguments:
        encoders: Input encoders of the decoder
        vocabulary: Target vocabulary
        data_id: Target data series
        name: Name of the decoder. Should be unique across all Neural
            Monkey objects
        max_output_len: Maximum length of an output sequence
        dropout_keep_prob: Probability of keeping a value during dropout

    Keyword arguments:
        rnn_size: Size of the decoder hidden state, if None set
            according to encoders.
        embedding_size: Size of embedding vectors for target words.
            Overridden (with a warning) when `embeddings_source` is given.
        output_projection: How to generate distribution over vocabulary
            from decoder rnn_outputs. Either a projection, or a tuple of
            a projection and its output size. Defaults to a tanh
            projection.
        encoder_projection: How to construct initial state from encoders
        attentions: The attention objects to use. Optional.
        embeddings_source: Embedded sequence to take embeddings from
        attention_on_input: Flag whether attention from previous decoding
            step should be combined with the input in the next step.
        rnn_cell: RNN Cell used by the decoder; must be one of
            `RNN_CELL_TYPES`.
        conditional_gru: Flag whether to use the Conditional GRU
            architecture
        save_checkpoint: Passed on to `ModelPart.__init__`.
        load_checkpoint: Passed on to `ModelPart.__init__`.

    Raises:
        ValueError: If neither `embedding_size` nor `embeddings_source` is
            given, or if `rnn_cell` is not a known cell type.
    """
    ModelPart.__init__(self, name, save_checkpoint, load_checkpoint)
    check_argument_types()

    log("Initializing decoder, name: '{}'".format(name))

    self.encoders = encoders
    self.vocabulary = vocabulary
    self.data_id = data_id
    self.max_output_len = max_output_len
    self.dropout_keep_prob = dropout_keep_prob
    self.embedding_size = embedding_size
    self.rnn_size = rnn_size
    self.output_projection_spec = output_projection
    self.encoder_projection = encoder_projection
    self.attentions = attentions
    self.embeddings_source = embeddings_source
    self._conditional_gru = conditional_gru
    self._attention_on_input = attention_on_input
    self._rnn_cell_str = rnn_cell

    # A fresh list per instance instead of a mutable default argument.
    if self.attentions is None:
        self.attentions = []

    if self.embedding_size is None and self.embeddings_source is None:
        raise ValueError("You must specify either embedding size or the "
                         "embedded sequence from which to reuse the "
                         "embeddings (e.g. set either 'embedding_size' or "
                         " 'embeddings_source' parameter)")

    # Reused embeddings dictate the embedding size.
    if self.embeddings_source is not None:
        if self.embedding_size is not None:
            warn("Overriding the embedding_size parameter with the"
                 " size of the reused embeddings from the encoder.")

        self.embedding_size = (
            self.embeddings_source.embedding_matrix.get_shape()[1].value)

    # Pick a default way of building the initial state when none is given.
    if self.encoder_projection is None:
        if not self.encoders:
            log("No encoder - language model only.")
            self.encoder_projection = empty_initial_state
        elif rnn_size is None:
            log("No rnn_size or encoder_projection: Using concatenation of"
                " encoded states")
            self.encoder_projection = concat_encoder_projection
            # The state size is the sum of the encoders' output sizes.
            self.rnn_size = sum(e.output.get_shape()[1].value
                                for e in encoders)
        else:
            log("Using linear projection of encoders as the initial state")
            self.encoder_projection = linear_encoder_projection(
                self.dropout_keep_prob)

    # rnn_size is only inferred in the concatenation branch above; in all
    # other configurations it must have been passed explicitly.
    assert self.rnn_size is not None

    if self._rnn_cell_str not in RNN_CELL_TYPES:
        raise ValueError("RNN cell must be a either 'GRU', 'LSTM', or "
                         "'NematusGRU'. Not {}".format(self._rnn_cell_str))

    if self.output_projection_spec is None:
        log("No output projection specified - using tanh projection")
        self.output_projection = nonlinear_output(
            self.rnn_size, tf.tanh)[0]
        self.output_projection_size = self.rnn_size
    elif isinstance(self.output_projection_spec, tuple):
        # The spec carries both the projection and its output size.
        (self.output_projection,
         self.output_projection_size) = tuple(self.output_projection_spec)
    else:
        self.output_projection = self.output_projection_spec
        self.output_projection_size = self.rnn_size

    if self._attention_on_input:
        self.input_projection = self.input_plus_attention
    else:
        self.input_projection = self.embed_input_symbol

    # Only creates the variable scope and stores it in self.step_scope.
    with self.use_scope():
        with tf.variable_scope("attention_decoder") as self.step_scope:
            pass

    # NOTE(review): reading self.cost and self.runtime_logits here
    # presumably triggers their construction -- confirm.
    # TODO when it is possible, remove the printing of the cost var
    log("Decoder initalized. Cost var: {}".format(str(self.cost)))
    log("Runtime logits tensor: {}".format(str(self.runtime_logits)))
def __init__(self,
             output_series: str,
             toplevel_modelpart: ModelPart,
             toplevel_tensors: List[tf.Tensor],
             tensors_by_name: List[str],
             tensors_by_ref: List[tf.Tensor],
             batch_dims_by_name: List[int],
             batch_dims_by_ref: List[int],
             select_session: int = None,
             single_tensor: bool = False) -> None:
    """Construct a new ``TensorRunner`` object.

    The toplevel objects have to be given explicitly to make sure the graph
    gets built: the graph is constructed lazily, so tensors requested only
    indirectly (by name) would otherwise not be known early enough to be
    created.

    Args:
        output_series: The name of the generated output data series.
        toplevel_modelpart: A ``ModelPart`` object that is used as the
            top-level component of the model. This object should depend on
            values of all the wanted tensors.
        toplevel_tensors: A list of tensors that should be constructed. Use
            this when the toplevel model part does not depend on this
            tensor. The tensors are constructed while this constructor
            runs and logs them.
        tensors_by_name: A list of tensor names to fetch. If a tensor
            is not in the graph, a warning is generated and the tensor is
            ignored.
        tensors_by_ref: A list of tensor objects to fetch.
        batch_dims_by_name: A list of integers that correspond to the
            batch dimension in each wanted tensor specified by name.
        batch_dims_by_ref: A list of integers that correspond to the
            batch dimension in each wanted tensor specified by reference.
        select_session: An optional integer specifying the session to use
            in case of ensembling. When not used, tensors from all sessions
            are stored. In case of a single session, this option has no
            effect.
        single_tensor: If `True`, it is assumed that only one tensor is to
            be fetched, and the execution result will consist of this
            tensor only. If `False`, the result will be a dict mapping
            tensor names to NumPy arrays.

    Raises:
        ValueError: If ``single_tensor`` is set and more than one tensor is
            requested.
    """
    check_argument_types()
    BaseRunner[ModelPart].__init__(self, output_series, toplevel_modelpart)

    requested = len(tensors_by_name) + len(tensors_by_ref)
    if requested > 1 and single_tensor:
        raise ValueError(
            "single_tensor is True, but {} tensors were given".format(
                requested))

    self._single_tensor = single_tensor
    self._select_session = select_session
    self._names = tensors_by_name
    self._batch_dims_name = batch_dims_by_name
    self._tensors = tensors_by_ref
    self._batch_dims_ref = batch_dims_by_ref

    log("Blessing toplevel tensors for tensor runner:")
    for toplevel in toplevel_tensors:
        log("Toplevel tensor: {}".format(toplevel))

    self._fetches = {}  # type: Dict[str, tf.Tensor]
    self._batch_ids = {}  # type: Dict[str, int]

    # Resolve the tensors requested by name; unknown names are skipped.
    not_found = ("The tensor of name '{}' is not present in the "
                 "graph.")
    for tensor_name, batch_dim in zip(self._names, self._batch_dims_name):
        try:
            graph = Experiment.get_current().graph
            self._fetches[tensor_name] = graph.get_tensor_by_name(
                tensor_name)
            self._batch_ids[tensor_name] = batch_dim
        except KeyError:
            warn(not_found.format(tensor_name))
def run_on_dataset(
        tf_manager: TensorFlowManager,
        runners: List[BaseRunner],
        dataset: Dataset,
        postprocess: Postprocess,
        write_out: bool = False,
        batch_size: Optional[int] = None,
        log_progress: int = 0
) -> Tuple[List[ExecutionResult], Dict[str, List[Any]]]:
    """Apply the model on a dataset and optionally write outputs to files.

    Args:
        tf_manager: TensorFlow manager with initialized sessions.
        runners: The runners that are executed on the dataset.
        dataset: The dataset on which the model will be executed.
        postprocess: Pairs of a series name and a callable. Each callable
            receives the dataset and the outputs collected so far, and its
            result is stored under the series name. May be None.
        write_out: Flag whether the outputs should be printed to a file
            defined in the dataset object.
        batch_size: size of the minibatch
        log_progress: log progress every X seconds

    Returns:
        Tuple of the execution results of the runners and a dictionary
        mapping output series names to the generated (and postprocessed)
        data.
    """
    # collections.Iterable was removed in Python 3.10; the abstract base
    # classes live in collections.abc.
    from collections.abc import Iterable

    # Losses can be computed only if every runner with target data finds
    # its series in the dataset.
    contains_targets = all(
        dataset.has_series(runner.decoder_data_id)
        for runner in runners
        if runner.decoder_data_id is not None)

    all_results = tf_manager.execute(dataset, runners,
                                     compute_losses=contains_targets,
                                     batch_size=batch_size,
                                     log_progress=log_progress)

    result_data = {
        runner.output_series: result.outputs
        for runner, result in zip(runners, all_results)
    }

    if postprocess is not None:
        for series_name, postprocessor in postprocess:
            postprocessed = postprocessor(dataset, result_data)
            # Materialize generators so that the length check below works.
            if not hasattr(postprocessed, "__len__"):
                postprocessed = list(postprocessed)

            result_data[series_name] = postprocessed

    # check output series lengths
    for series_id, data in result_data.items():
        if len(data) != len(dataset):
            warn("Output '{}' for dataset '{}' has length {}, but "
                 "len(dataset) == {}".format(series_id, dataset.name,
                                             len(data), len(dataset)))

    def _check_savable_dict(data):
        """Check if the data is of savable type."""
        if not (data and data[0]):
            return False

        supported_type = Union[List[Dict[str, np.ndarray]],
                               List[List[Dict[str, np.ndarray]]]]

        try:
            check_type("data", data, supported_type, None)
        except TypeError:
            return False
        return True

    if write_out:
        for series_id, data in result_data.items():
            if series_id in dataset.series_outputs:
                path = dataset.series_outputs[series_id]
                if isinstance(data, np.ndarray):
                    np.save(path, data)
                    log("Result saved as numpy array to '{}'".format(path))
                elif _check_savable_dict(data):
                    # Turn the list of dicts into a dict of tuples.
                    unbatched = dict(
                        zip(data[0], zip(*[d.values() for d in data])))

                    np.savez(path, **unbatched)
                    log("Result saved as numpy data to '{}.npz'"
                        .format(path))
                else:
                    with open(path, "w", encoding="utf-8") as f_out:
                        f_out.writelines([
                            " ".join(sent) + "\n"
                            if isinstance(sent, Iterable)
                            else str(sent) + "\n"
                            for sent in data
                        ])
                    log("Result saved as plain text '{}'".format(path))
            else:
                log("There is no output file for dataset: {}"
                    .format(dataset.name), color="red")

    return all_results, result_data
def training_loop(
        tf_manager: TensorFlowManager,
        epochs: int,
        trainer: GenericTrainer,  # TODO better annotate
        batch_size: int,
        log_directory: str,
        evaluators: EvalConfiguration,
        runners: List[BaseRunner],
        train_dataset: Dataset,
        val_dataset: Union[Dataset, List[Dataset]],
        test_datasets: Optional[List[Dataset]] = None,
        logging_period: Union[str, int] = 20,
        validation_period: Union[str, int] = 500,
        val_preview_input_series: Optional[List[str]] = None,
        val_preview_output_series: Optional[List[str]] = None,
        val_preview_num_examples: int = 15,
        train_start_offset: int = 0,
        runners_batch_size: Optional[int] = None,
        initial_variables: Optional[Union[str, List[str]]] = None,
        postprocess: Postprocess = None) -> None:
    """Execute the training loop for given graph and data.

    Args:
        tf_manager: TensorFlowManager with initialized sessions.
        epochs: Number of epochs for which the algorithm will learn.
        trainer: The trainer object containing the TensorFlow code for
            computing the loss and optimization operation.
        batch_size: number of examples in one mini-batch
        log_directory: Directory where the TensorBoard log will be
            generated. If None, no summary writer is created.
        evaluators: List of evaluators. The last evaluator is used as the
            main. An evaluator is a tuple of the name of the generated
            series, the name of the dataset series the generated one is
            evaluated with and the evaluation function. If only one
            series name is provided, it means the generated and
            dataset series have the same name.
        runners: List of runners for logging and evaluation runs
        train_dataset: Dataset used for training
        val_dataset: used for validation. Can be Dataset or a list of
            datasets. The last dataset is used as the main one for storing
            best results. When using multiple datasets, it is recommended
            to name them for better Tensorboard visualization.
        test_datasets: List of datasets used for testing
        logging_period: after how many batches should the logging happen. It
            can also be defined as a time period in format like: 3s; 4m;
            6h; 1d; 3m15s; 3seconds; 4minutes; 6hours; 1days
        validation_period: after how many batches should the validation
            happen. It can also be defined as a time period in same format
            as logging
        val_preview_input_series: which input series to preview in
            validation
        val_preview_output_series: which output series to preview in
            validation
        val_preview_num_examples: how many examples should be printed during
            validation
        train_start_offset: how many lines from the training dataset should
            be skipped. The training starts from the next batch.
        runners_batch_size: batch size of runners. It is the same as
            batch_size if not specified
        initial_variables: variables used for initialization, for example
            for continuation of training. Provide it with a path to your
            model directory and its checkpoint file group common prefix,
            e.g. "variables.data", or "variables.data.3" in case of multiple
            checkpoints per experiment.
        postprocess: A function which takes the dataset with its output
            series and generates additional series from them.

    Raises:
        ValueError: If there are no evaluators (so a loss is used as the
            main metric) and the manager is not set to minimize the metric.
        KeyboardInterrupt: Re-raised after the final evaluation when the
            training was interrupted.
    """
    check_argument_types()

    if isinstance(val_dataset, Dataset):
        val_datasets = [val_dataset]
    else:
        val_datasets = val_dataset

    log_period_batch, log_period_time = _resolve_period(logging_period)
    val_period_batch, val_period_time = _resolve_period(validation_period)

    _check_series_collisions(runners, postprocess)

    _log_model_variables(var_list=trainer.var_list)

    if runners_batch_size is None:
        runners_batch_size = batch_size

    # Expand two-element evaluators: the generated series and the reference
    # series share one name.
    evaluators = [(e[0], e[0], e[1]) if len(e) == 2 else e
                  for e in evaluators]

    if evaluators:
        main_metric = "{}/{}".format(evaluators[-1][0],
                                     evaluators[-1][-1].name)
    else:
        # Without evaluators, the first loss of the last runner is used.
        main_metric = "{}/{}".format(runners[-1].decoder_data_id,
                                     runners[-1].loss_names[0])

        if not tf_manager.minimize_metric:
            raise ValueError("minimize_metric must be set to True in "
                             "TensorFlowManager when using loss as "
                             "the main metric")

    step = 0
    seen_instances = 0
    last_seen_instances = 0

    if initial_variables is None:
        # Assume we don't look at coder checkpoints when global
        # initial variables are supplied
        tf_manager.initialize_model_parts(
            runners + [trainer], save=True)  # type: ignore
    else:
        try:
            tf_manager.restore(initial_variables)
        except tf.errors.NotFoundError:
            warn("Some variables were not found in checkpoint.)")

    # NOTE(review): tb_writer is bound only when log_directory is truthy,
    # yet it is passed to _log_continuous_evaluation unconditionally below.
    if log_directory:
        log("Initializing TensorBoard summary writer.")
        tb_writer = tf.summary.FileWriter(log_directory,
                                          tf_manager.sessions[0].graph)
        log("TensorBoard writer initialized.")

    log("Starting training")
    last_log_time = time.process_time()
    last_val_time = time.process_time()
    # A keyboard interrupt is stored and re-raised only after the final
    # evaluation on the test datasets.
    interrupt = None
    try:
        for epoch_n in range(1, epochs + 1):
            log_print("")
            log("Epoch {} begins".format(epoch_n), color="red")

            train_dataset.shuffle()
            train_batched_datasets = train_dataset.batch_dataset(batch_size)

            if epoch_n == 1 and train_start_offset:
                if not isinstance(train_dataset, LazyDataset):
                    warn("Not skipping training instances with "
                         "shuffled in-memory dataset")
                else:
                    _skip_lines(train_start_offset, train_batched_datasets)

            for batch_n, batch_dataset in enumerate(train_batched_datasets):
                step += 1
                seen_instances += len(batch_dataset)
                if _is_logging_time(step, log_period_batch,
                                    last_log_time, log_period_time):
                    # Training step with summaries, followed by an
                    # evaluation of the runners on the same batch.
                    trainer_result = tf_manager.execute(batch_dataset,
                                                        [trainer],
                                                        train=True,
                                                        summaries=True)
                    train_results, train_outputs = run_on_dataset(
                        tf_manager, runners, batch_dataset,
                        postprocess, write_out=False,
                        batch_size=runners_batch_size)
                    # ensure train outputs are iterable more than once
                    train_outputs = {
                        k: list(v) for k, v in train_outputs.items()
                    }
                    train_evaluation = evaluation(evaluators,
                                                  batch_dataset, runners,
                                                  train_results,
                                                  train_outputs)

                    _log_continuous_evaluation(tb_writer, main_metric,
                                               train_evaluation,
                                               seen_instances, epoch_n,
                                               epochs, trainer_result,
                                               train=True)
                    last_log_time = time.process_time()
                else:
                    tf_manager.execute(batch_dataset, [trainer],
                                       train=True, summaries=False)

                if _is_logging_time(step, val_period_batch,
                                    last_val_time, val_period_time):
                    log_print("")
                    val_duration_start = time.process_time()
                    val_examples = 0
                    for val_id, valset in enumerate(val_datasets):
                        val_examples += len(valset)

                        val_results, val_outputs = run_on_dataset(
                            tf_manager, runners, valset,
                            postprocess, write_out=False,
                            batch_size=runners_batch_size)
                        # ensure val outputs are iterable more than once
                        val_outputs = {
                            k: list(v) for k, v in val_outputs.items()
                        }
                        val_evaluation = evaluation(evaluators, valset,
                                                    runners, val_results,
                                                    val_outputs)

                        valheader = (
                            "Validation (epoch {}, batch number {}):".format(
                                epoch_n, batch_n))
                        log(valheader, color="blue")
                        _print_examples(valset, val_outputs,
                                        val_preview_input_series,
                                        val_preview_output_series,
                                        val_preview_num_examples)
                        log_print("")
                        log(valheader, color="blue")

                        # The last validation set is selected to be the main
                        if val_id == len(val_datasets) - 1:
                            this_score = val_evaluation[main_metric]
                            tf_manager.validation_hook(this_score, epoch_n,
                                                       batch_n)

                            # A new best score: highlight it and save all
                            # model parts in every session.
                            if this_score == tf_manager.best_score:
                                best_score_str = colored("{:.4g}".format(
                                    tf_manager.best_score), attrs=["bold"])

                                # store also graph parts
                                all_coders = set.union(*[
                                    rnr.all_coders
                                    for rnr in runners + [trainer]
                                ])  # type: ignore
                                for coder in all_coders:
                                    for session in tf_manager.sessions:
                                        coder.save(session)
                            else:
                                best_score_str = "{:.4g}".format(
                                    tf_manager.best_score)

                            log("best {} on validation: {} (in epoch {}, "
                                "after batch number {})".format(
                                    main_metric, best_score_str,
                                    tf_manager.best_score_epoch,
                                    tf_manager.best_score_batch),
                                color="blue")

                        # Name the dataset only when there are several.
                        v_name = valset.name if len(val_datasets) > 1 else None
                        _log_continuous_evaluation(tb_writer, main_metric,
                                                   val_evaluation,
                                                   seen_instances, epoch_n,
                                                   epochs, val_results,
                                                   train=False,
                                                   dataset_name=v_name)

                    # how long was the training between validations
                    training_duration = val_duration_start - last_val_time
                    val_duration = time.process_time() - val_duration_start

                    # the training should take at least twice the time of val.
                    steptime = (training_duration /
                                (seen_instances - last_seen_instances))
                    valtime = val_duration / val_examples
                    last_seen_instances = seen_instances
                    log("Validation time: {:.2f}s, inter-validation: {:.2f}s, "
                        "per-instance (train): {:.2f}s, per-instance (val): "
                        "{:.2f}s".format(val_duration, training_duration,
                                         steptime, valtime),
                        color="blue")
                    if training_duration < 2 * val_duration:
                        notice("Validation period setting is inefficient.")

                    log_print("")
                    last_val_time = time.process_time()

    except KeyboardInterrupt as ex:
        interrupt = ex

    log("Training finished. Maximum {} on validation data: {:.4g}, epoch {}".
        format(main_metric, tf_manager.best_score,
               tf_manager.best_score_epoch))

    # The final evaluation uses the best variables found during validation.
    if test_datasets:
        tf_manager.restore_best_vars()

        for dataset in test_datasets:
            test_results, test_outputs = run_on_dataset(
                tf_manager, runners, dataset, postprocess,
                write_out=True, batch_size=runners_batch_size)
            # ensure test outputs are iterable more than once
            test_outputs = {k: list(v) for k, v in test_outputs.items()}
            eval_result = evaluation(evaluators, dataset, runners,
                                     test_results, test_outputs)
            print_final_evaluation(dataset.name, eval_result)

    log("Finished.")

    if interrupt is not None:
        raise interrupt  # pylint: disable=raising-bad-type
def xent_objective(decoder, weight=None) -> Objective:
    """Deprecated factory of the cross-entropy (cost) objective.

    Emits a deprecation warning and returns ``CostObjective(decoder,
    weight)``.

    Arguments:
        decoder: The decoder passed on to ``CostObjective``.
        weight: The weight passed on to ``CostObjective``.

    Returns:
        The newly created ``CostObjective`` instance.
    """
    deprecation_note = ("Using deprecated xent_objective function. "
                        "Use the CostObjective class directly.")
    warn(deprecation_note)

    objective = CostObjective(decoder, weight)
    return objective
def rl_objective(*args, **kwargs) -> ReinforceObjective:
    """Deprecated factory of the reinforce objective.

    Emits a deprecation warning and forwards all positional and keyword
    arguments to the ``ReinforceObjective`` constructor.

    Returns:
        The newly created ``ReinforceObjective`` instance.
    """
    deprecation_note = ("Using deprecated rl_objective function. "
                        "Use ReinforceObjective class directly.")
    warn(deprecation_note)

    objective = ReinforceObjective(*args, **kwargs)
    return objective
def __init__(self,
             name: str,
             encoders: List[Attendable],
             vocabulary: Vocabulary,
             data_id: str,
             # TODO infer the default for these three from the encoder
             ff_hidden_size: int,
             n_heads_self: int,
             n_heads_enc: Union[List[int], int],
             depth: int,
             max_output_len: int,
             attention_combination_strategy: str = "serial",
             n_heads_hier: int = None,
             dropout_keep_prob: float = 1.0,
             embedding_size: int = None,
             embeddings_source: EmbeddedSequence = None,
             tie_embeddings: bool = True,
             label_smoothing: float = None,
             self_attention_dropout_keep_prob: float = 1.0,
             attention_dropout_keep_prob: Union[float, List[float]] = 1.0,
             use_att_transform_bias: bool = False,
             supress_unk: bool = False,
             reuse: ModelPart = None,
             save_checkpoint: str = None,
             load_checkpoint: str = None,
             initializers: InitializerSpecs = None) -> None:
    """Create a decoder of the Transformer model.

    Described in Vaswani et al. (2017), arxiv.org/abs/1706.03762

    Arguments:
        encoders: Input encoders for the decoder.
        vocabulary: Target vocabulary.
        data_id: Target data series.
        name: Name of the decoder. Should be unique across all Neural
            Monkey objects.
        max_output_len: Maximum length of an output sequence.
        dropout_keep_prob: Probability of keeping a value during dropout.
        embedding_size: Size of embedding vectors for target words.
        embeddings_source: Embedded sequence to take embeddings from.
        tie_embeddings: Use decoder.embedding_matrix also in place of the
            output decoding matrix.
        ff_hidden_size: Size of the feedforward sublayers.
        n_heads_self: Number of the self-attention heads.
        n_heads_enc: Number of the attention heads over each encoder.
            Either a list which size must be equal to ``encoders``, or a
            single integer. In the latter case, the number of heads is
            equal for all encoders.
        attention_combination_strategy: One of ``serial``, ``parallel``,
            ``flat``, ``hierarchical``. Controls the attention
            combination strategy for enc-dec attention.
        n_heads_hier: Number of the attention heads for the second
            attention in the ``hierarchical`` attention combination.
        depth: Number of sublayers.
        label_smoothing: A label smoothing parameter for cross entropy
            loss computation.
        self_attention_dropout_keep_prob: Stored as
            ``self_att_dropout_keep_prob``; presumably the keep
            probability for dropout on the self-attention output.
        attention_dropout_keep_prob: Probability of keeping a value
            during dropout on the attention output. A single float is
            repeated for every encoder.
        use_att_transform_bias: Stored on the instance; not used in this
            constructor.
        supress_unk: If true, decoder will not produce symbols for unknown
            tokens.
        reuse: Reuse the variables from the given model part.
        save_checkpoint: Passed on to ``AutoregressiveDecoder.__init__``.
        load_checkpoint: Passed on to ``AutoregressiveDecoder.__init__``.
        initializers: Not referenced in the body of this constructor.

    Raises:
        ValueError: If the attention combination strategy is unknown, if
            ``n_heads_hier`` is missing for the hierarchical strategy, or
            if more than one value is given in ``n_heads_enc`` for the
            flat strategy.
    """
    check_argument_types()
    # NOTE(review): `initializers` is not forwarded to the parent
    # constructor -- confirm whether that is intended.
    AutoregressiveDecoder.__init__(
        self,
        name=name,
        vocabulary=vocabulary,
        data_id=data_id,
        max_output_len=max_output_len,
        dropout_keep_prob=dropout_keep_prob,
        embedding_size=embedding_size,
        embeddings_source=embeddings_source,
        tie_embeddings=tie_embeddings,
        label_smoothing=label_smoothing,
        supress_unk=supress_unk,
        reuse=reuse,
        save_checkpoint=save_checkpoint,
        load_checkpoint=load_checkpoint)

    self.encoders = encoders
    self.ff_hidden_size = ff_hidden_size
    self.n_heads_self = n_heads_self

    # Normalize n_heads_enc to a list: a single item for the flat strategy,
    # one item per encoder otherwise.
    if isinstance(n_heads_enc, int):
        if attention_combination_strategy == "flat":
            self.n_heads_enc = [n_heads_enc]
        else:
            self.n_heads_enc = [n_heads_enc for _ in self.encoders]
    else:
        self.n_heads_enc = n_heads_enc

    self.depth = depth

    # A single keep probability is repeated for every encoder.
    if isinstance(attention_dropout_keep_prob, float):
        self.attention_dropout_keep_prob = [
            attention_dropout_keep_prob for _ in encoders]
    else:
        self.attention_dropout_keep_prob = attention_dropout_keep_prob

    self.self_att_dropout_keep_prob = self_attention_dropout_keep_prob
    self.use_att_transform_bias = use_att_transform_bias
    self.attention_combination_strategy = attention_combination_strategy
    self.n_heads_hier = n_heads_hier

    # Callables, so the attention states and masks of the encoders are
    # retrieved only when they are asked for.
    self.encoder_states = lambda: [get_attention_states(e)
                                   for e in self.encoders]
    self.encoder_masks = lambda: [get_attention_mask(e)
                                  for e in self.encoders]

    if self.attention_combination_strategy not in STRATEGIES:
        raise ValueError(
            "Unknown attention combination strategy '{}'. "
            "Allowed: {}.".format(self.attention_combination_strategy,
                                  ", ".join(STRATEGIES)))

    if (self.attention_combination_strategy == "hierarchical"
            and self.n_heads_hier is None):
        raise ValueError(
            "You must provide n_heads_hier when using the hierarchical "
            "attention combination strategy.")

    if (self.attention_combination_strategy != "hierarchical"
            and self.n_heads_hier is not None):
        warn("Ignoring n_heads_hier parameter -- use the hierarchical "
             "attention combination strategy instead.")

    if (self.attention_combination_strategy == "flat"
            and len(self.n_heads_enc) != 1):
        raise ValueError(
            "For the flat attention combination strategy, only a single "
            "value is permitted in n_heads_enc.")

    self._variable_scope.set_initializer(tf.variance_scaling_initializer(
        mode="fan_avg", distribution="uniform"))
def from_wordlist(path: str,
                  encoding: str = "utf-8",
                  contains_header: bool = True,
                  contains_frequencies: bool = True) -> "Vocabulary":
    """Load a vocabulary from a wordlist.

    The file is either a plain list of words, or a list of words and their
    counts separated by a tab. In both cases, the first line can be a
    header. Empty lines are skipped with a warning. The leading lines are
    compared with ``SPECIAL_TOKENS`` and a notice is printed on a mismatch.

    Arguments:
        path: The path to the wordlist file
        encoding: The encoding of the wordlist file (defaults to UTF-8)
        contains_header: if the file have a header on first line
        contains_frequencies: if the file contains a second column

    Returns:
        The new Vocabulary instance.

    Raises:
        ValueError: If frequencies are expected and a line does not consist
            of exactly two tab-separated columns.
    """
    check_argument_types()

    header_lines = int(contains_header)
    words = []  # type: List[str]

    with open(path, encoding=encoding) as wordlist:
        if contains_header:
            # skip the header
            next(wordlist)

        # Line numbers are one-based and count the header as well.
        numbered_lines = enumerate(wordlist, start=1 + header_lines)
        for line_number, raw_line in numbered_lines:
            entry = raw_line.strip()

            if not entry:
                warn("Vocabulary file {}:{}: line empty"
                     .format(path, line_number))
                continue

            if contains_frequencies:
                columns = entry.split("\t")
                if len(columns) != 2:
                    raise ValueError(
                        "Vocabulary file {}:{}: line does not have two columns"
                        .format(path, line_number))
                word = columns[0]
            else:
                if "\t" in entry:
                    warn("Vocabulary file {}:{}: line contains a tabulator"
                         .format(path, line_number))
                word = entry

            # The first lines after the header should hold special tokens.
            special_index = line_number - 1 - header_lines
            if special_index < len(SPECIAL_TOKENS):
                expected = SPECIAL_TOKENS[special_index]
                if word != expected:
                    notice("Expected special token {} but encountered a "
                           "different word: {}".format(expected, word))

            words.append(word)

    log("Vocabulary from wordlist loaded, containing {} words"
        .format(len(words)))
    log_sample(words)
    return Vocabulary(words)