def _bless_graph_executors(self) -> None:
    """Reference every tensor the executors may fetch so the graph is built.

    The computational graph components are lazy: a tensor (and everything
    needed to evaluate it) is only added to the graph once it is referenced,
    i.e. "blessed". Here the blessing is done by formatting the `fetches`
    property of each executor into a debug message.

    The fetches of the trainer(s) are blessed only when the model has a
    `trainer` attribute; the fetches of all runners are blessed always.
    """
    log("Building TF Graph")

    if hasattr(self.model, "trainer"):
        trainer_spec = self.model.trainer
        # A single trainer is wrapped so both cases share one loop.
        if not isinstance(trainer_spec, List):
            trainer_spec = [self.model.trainer]

        for one_trainer in trainer_spec:
            debug("Trainer fetches: {}".format(one_trainer.fetches), "bless")

    for one_runner in self.model.runners:
        debug("Runner fetches: {}".format(one_runner.fetches), "bless")

    log("TF Graph built")
def dataset_from_files(**kwargs):
    """Build a `Dataset` from keyword arguments describing the data files.

    Args:
        kwargs: Treated as a dictionary. The paths of the data series are
            extracted from it by `_get_series_paths`. Other keys read here:
            `random_seed` (default `None`), `preprocessor` (a callable
            passed to `Dataset.create_series` for every series; identity by
            default), `name` (defaults to a name derived from the paths),
            and every key matching the `SERIES_OUTPUT` pattern, whose first
            group is used as the key of the series-output mapping.

    Returns:
        The constructed `Dataset`.
    """
    seed = kwargs.get('random_seed', None)
    preprocessor = kwargs.get('preprocessor', lambda x: x)
    paths = _get_series_paths(kwargs)

    if len(paths) > 0:
        log("Initializing dataset with: {}".format(", ".join(paths)))

    loaded_series = {}
    for series_id in paths:
        loaded_series[series_id] = Dataset.create_series(
            paths[series_id], preprocessor)

    dataset_name = kwargs.get('name', _get_name_from_paths(paths))

    outputs = {}
    for key, value in kwargs.items():
        matched = SERIES_OUTPUT.match(key)
        if matched:
            outputs[matched[1]] = value

    dataset = Dataset(dataset_name, loaded_series, outputs, seed)
    log("Dataset length: {}".format(len(dataset)))
    return dataset
def create_serie(self, name, args):
    """Lazily load a data serie from a file.

    This is a generator function: none of the body (including the log call
    and the renaming of the dataset) runs before the first item is
    requested.

    Args:
        name: Key of the serie; `args[name]` is the path of the file.
        args: Mapping with the dataset arguments. If it has no `'name'`
            key, the path is appended to `self.name`. For text files, an
            optional `<name>_preprocess` entry is the callable applied to
            every line; by default lines are split on single spaces.

    Yields:
        For `text/*` files, the preprocessed lines (trailing whitespace
        removed first). For gzip files, the consecutive pickled objects
        until the end of the file.

    Raises:
        Exception: If the detected MIME type is neither text nor gzip.
    """
    path = args[name]
    log("Loading {}".format(path))
    file_type = magic.from_file(path, mime=True)

    # if the dataset has no name, generate it from files
    if 'name' not in args:
        self.name += "-" + path

    if file_type.startswith('text/'):
        if name + "_preprocess" in args:
            preprocess = args[name + "_preprocess"]
        else:
            preprocess = lambda s: s.split(" ")

        with codecs.open(path, 'r', 'utf-8') as f_data:
            for line in f_data:
                yield preprocess(line.rstrip())
    elif file_type == 'application/gzip':
        # NOTE(security): unpickling executes arbitrary code embedded in the
        # file; only load gzip series that come from a trusted source.
        with gzip.open(path, 'rb') as f_data:
            try:
                while True:
                    yield pickle.load(f_data)
            except EOFError:
                # Reaching the end of the file is the normal termination.
                pass
    else:
        raise Exception("\"{}\" has Unsupported data type: {}"
                        .format(path, file_type))
def main():
    """Run a stored model on the test datasets and print the evaluation.

    Expects exactly two command-line arguments: the run INI file and the
    INI file with the test datasets. Exits with status 1 on wrong usage or
    when a dataset does not pass `check_dataset_and_coders`.
    """
    # pylint: disable=no-member,broad-except
    if len(sys.argv) != 3:
        print("Usage: run.py <run_ini_file> <test_datasets>")
        exit(1)

    test_datasets = Configuration()
    test_datasets.add_argument('test_datasets')

    args, sess = initialize_for_running(sys.argv[1])
    datasets_args = test_datasets.load_file(sys.argv[2])
    print("")

    try:
        for dataset in datasets_args.test_datasets:
            check_dataset_and_coders(dataset, args.encoders)
    except Exception as exc:
        # Exceptions have no `message` attribute in Python 3; `str(exc)`
        # gives the message without raising AttributeError in the handler.
        log(str(exc), color='red')
        exit(1)

    for dataset in datasets_args.test_datasets:
        _, _, evaluation = run_on_dataset(
            sess, args.runner, args.encoders + [args.decoder], args.decoder,
            dataset, args.evaluation, args.postprocess, write_out=True)

        if evaluation:
            print_dataset_evaluation(dataset.name, evaluation)
def save_wordlist(self, path: str, overwrite: bool = False,
                  encoding: str = "utf-8") -> None:
    """Write the vocabulary to a text file, one word per line.

    The words are written in the iteration order of `self._vocabulary`,
    without frequencies.

    Arguments:
        path: The path to save the file to.
        overwrite: Flag whether to overwrite an existing file. Defaults
            to False.
        encoding: The encoding of the written file.

    Raises:
        FileExistsError if the file exists and overwrite flag is disabled.
    """
    already_there = os.path.exists(path)
    if already_there and not overwrite:
        message = ("Cannot save vocabulary: File exists and "
                   "overwrite is disabled. {}")
        raise FileExistsError(message.format(path))

    with open(path, "w", encoding=encoding) as wordlist_file:
        log("Storing vocabulary without frequencies.")
        wordlist_file.writelines(
            "{}\n".format(entry) for entry in self._vocabulary)
def load_config_file(config_file, ignore_names):
    """Load the configuration and build the objects of its main block.

    Arguments:
        config_file: The opened configuration file. It is closed by this
            function, also when parsing fails.
        ignore_names: A set of names that should be ignored during the
            loading.

    Returns:
        A dictionary mapping each non-ignored key of the `main` section to
        the object built from its value.

    Raises:
        Exception: If the configuration has no `main` section.
        ConfigBuildException: If building the value of a key fails.
    """
    # Close the file even when the parser raises, so the handle is not
    # leaked on a malformed configuration.
    try:
        config_dicts = parsing.parse_file(config_file)
    finally:
        config_file.close()
    log("INI file is parsed.")

    # first load the configuration into a dictionary
    if "main" not in config_dicts:
        raise Exception("Configuration does not contain the main block.")

    # Shared across all build_object calls of this configuration.
    existing_objects = dict()

    main_config = config_dicts['main']

    configuration = dict()
    for key, value in main_config.items():
        if key not in ignore_names:
            try:
                configuration[key] = build_object(
                    value, config_dicts, existing_objects, 0)
            except Exception as exc:
                raise ConfigBuildException(key, exc) from None

    return configuration
def initialize_for_running(ini_file):
    """Prepare everything that is necessary for running a model.

    The variables are loaded from `variables.data.best` in the output
    directory, or from the last existing `variables.data.cont-<i>.best`
    in the unbroken sequence i = 1, 2, ... if there is any. The process
    exits with status 1 when the chosen file does not exist.

    Arguments:
        ini_file: Path to the configuration file.

    Returns:
        A tuple of the parsed configuration and a TensorFlow session
        created by `initialize_tf` from the chosen variables file.
    """
    # pylint: disable=no-member
    args = CONFIG.load_file(ini_file)
    print("")

    variables_file = os.path.join(args.output, "variables.data.best")

    # Walk the continuation files until the first missing index.
    index = 1
    while True:
        candidate = os.path.join(
            args.output, "variables.data.cont-{}.best".format(index))
        if not os.path.exists(candidate):
            break
        variables_file = candidate
        index += 1

    if not os.path.exists(variables_file):
        log("No variables file is stored in {}".format(args.output),
            color="red")
        exit(1)

    session_and_saver = initialize_tf(variables_file, args.threads)
    sess = session_and_saver[0]
    print("")

    return args, sess
def main() -> None:
    """Run `_main`, turning a keyboard interrupt into exit status 1.

    On `KeyboardInterrupt` a message is logged, the traceback is sent to
    the debug output, and the process exits.
    """
    try:
        _main()
    except KeyboardInterrupt:
        log("Training interrupted by user.")
        interrupt_trace = traceback.format_exc()
        debug(interrupt_trace)
        exit(1)
def from_t2t_vocabulary(path: str, encoding: str = "utf-8") -> "Vocabulary":
    """Load a vocabulary generated during tensor2tensor training.

    Every line of the file is one word. Surrounding whitespace and one
    pair of enclosing quotes (single or double) are removed; the lines
    `<pad>` and `<EOS>` are skipped.

    Arguments:
        path: The path to the vocabulary file.
        encoding: The encoding of the vocabulary file (defaults to UTF-8).

    Returns:
        The new Vocabulary instance.
    """
    check_argument_types()
    vocabulary = []  # type: List[str]

    with open(path, encoding=encoding) as wordlist:
        for line in wordlist:
            line = line.strip()

            # T2T vocab tends to wrap words in quotes (single or double).
            # At least two characters are required so that a line holding
            # a lone quote character is kept instead of being emptied.
            if len(line) >= 2 and (
                    (line.startswith("'") and line.endswith("'"))
                    or (line.startswith('"') and line.endswith('"'))):
                line = line[1:-1]

            if line in ["<pad>", "<EOS>"]:
                continue

            vocabulary.append(line)

    log("Vocabulary form wordlist loaded, containing {} words"
        .format(len(vocabulary)))
    log_sample(vocabulary)

    return Vocabulary(vocabulary)
def __init__(self,
             merge_file: str,
             separator: str = "@@",
             encoding: str = "utf-8") -> None:
    """Create the BPE preprocessor from a file with merges.

    Arguments:
        merge_file: Path of the file handed (opened for reading) to `BPE`.
        separator: Separator string passed to `BPE`.
        encoding: The encoding used to open the merge file.
    """
    log("Initializing BPE preprocessor")

    with open(merge_file, "r", encoding=encoding) as merges:
        bpe_model = BPE(merges, separator)
        self.bpe = bpe_model
def save(self, session: tf.Session) -> None:
    """Save model part to a checkpoint file.

    Does nothing when no save checkpoint is configured for this part.
    """
    if not self._save_checkpoint:
        return

    self._init_saver()
    target = self._save_checkpoint
    self._saver.save(session, target)

    message = "Variables of '{}' saved to '{}'"
    log(message.format(self.name, self._save_checkpoint))
def load(self, session: tf.Session) -> None:
    """Load model part from a checkpoint file.

    Does nothing when no load checkpoint is configured for this part.
    """
    if not self._load_checkpoint:
        return

    self._init_saver()
    source = self._load_checkpoint
    self._saver.restore(session, source)

    message = "Variables of '{}' loaded from '{}'"
    log(message.format(self.name, self._load_checkpoint))
def output_projection_spec(self) -> Tuple[OutputProjection, int]:
    """Return the output projection together with its output size.

    Without a configured projection, a tanh `nonlinear_output` of size
    `rnn_size` is used. A configured tuple is returned as is; any other
    configured value is paired with `rnn_size`.
    """
    spec = self._output_projection_spec

    if spec is None:
        log("No output projection specified - using tanh projection")
        tanh_projection = nonlinear_output(self.rnn_size, tf.tanh)[0]
        return (tanh_projection, self.rnn_size)

    if not isinstance(spec, tuple):
        return cast(OutputProjection, self._output_projection_spec), self.rnn_size

    return self._output_projection_spec
def initialize_sessions(self) -> None:
    """Initialize variables and tables in all sessions and create the saver.

    The saver covers all global variables except those whose name
    contains `reward_`, and keeps an unlimited number of checkpoints.
    """
    log("Initializing variables")
    variables_init = tf.global_variables_initializer()
    tables_init = tf.tables_initializer()

    for session in self.sessions:
        session.run([variables_init, tables_init])

    log("Initializing tf.train.Saver")
    saved_variables = []
    for variable in tf.global_variables():
        if "reward_" in variable.name:
            continue
        saved_variables.append(variable)

    self.saver = tf.train.Saver(max_to_keep=None, var_list=saved_variables)
def log_sample(vocabulary: List[str], size: int = 5) -> None:
    """Log a random sample of the vocabulary.

    When the vocabulary has fewer than `size` words, the whole vocabulary
    is logged instead.

    Arguments:
        vocabulary: The words to sample from.
        size: How many sample words to log.
    """
    word_count = len(vocabulary)
    if size > word_count:
        log("Vocabulary: {}".format(vocabulary))
        return

    shuffled_ids = np.random.permutation(np.arange(word_count))
    sampled_words = []
    for word_id in shuffled_ids[:size]:
        sampled_words.append(vocabulary[word_id])

    log("Sample of the vocabulary: {}".format(sampled_words))
def __init__(self,
             config_path: str,
             train_mode: bool = False,
             overwrite_output_dir: bool = False,
             config_changes: List[str] = None) -> None:
    """Initialize a Neural Monkey experiment.

    Loads the configuration, in training mode makes sure the output
    directory exists, and sets `cont_index` according to the experiment
    files already present.

    Arguments:
        config_path: The path to the experiment configuration file.
        train_mode: Indicates whether the model should be prepared for
            training.
        overwrite_output_dir: Indicates whether an existing experiment
            should be reused. If `True`, this overrides the setting in
            the configuration file.
        config_changes: A list of modifications that will be made to the
            loaded configuration file before parsing.

    Raises:
        RuntimeError: In training mode, when the output directory already
            contains `experiment.ini` and overwriting is enabled neither
            by the argument nor by the configuration.
    """
    self.train_mode = train_mode
    self._config_path = config_path

    self.graph = tf.Graph()
    self._initializers = {}  # type: Dict[str, Callable]
    self._initialized_variables = set()  # type: Set[str]
    # -1 means that no experiment files have been found (yet).
    self.cont_index = -1
    self._model_built = False
    self._vars_loaded = False
    self._model = None  # type: Optional[Namespace]

    self.config = create_config(train_mode)
    self.config.load_file(config_path, config_changes)
    args = self.config.args

    if self.train_mode:
        # We may need to create the experiment directory.
        if (os.path.isdir(args.output)
                and os.path.exists(
                    os.path.join(args.output, "experiment.ini"))):
            if args.overwrite_output_dir or overwrite_output_dir:
                # we do not want to delete the directory contents
                log("Directory with experiment.ini '{}' exists, "
                    "overwriting enabled, proceeding."
                    .format(args.output))
            else:
                raise RuntimeError(
                    "Directory with experiment.ini '{}' exists, "
                    "overwriting disabled."
                    .format(args.output))

        if not os.path.isdir(args.output):
            os.mkdir(args.output)

    # Find how many times the experiment has been continued.
    # NOTE(review): placed outside the `if self.train_mode` branch, since
    # the collapsed source does not show the nesting — confirm.
    # The index is advanced while any of the experiment files exists for
    # the next index.
    while any(os.path.exists(self.get_path(f, self.cont_index + 1))
              for f in _EXPERIMENT_FILES):
        self.cont_index += 1
def __missing__(self, key):
    """Resolve an unknown variable from the process environment.

    The environment value is parsed with `_parse_value`; if that fails
    with `ParseError`, the raw string is used. The resolved value is
    logged and returned.

    Raises:
        ParseError: If the variable is not in `os.environ` either.
    """
    if key not in os.environ:
        raise ParseError("Undefined variable: {}".format(key))

    raw_value = os.environ[key]
    try:
        value = _parse_value(raw_value, self)
    except ParseError:
        # If we cannot parse it, use it as a string.
        value = os.environ[key]

    message = "Variable {}={!r} taken from the environment."
    log(message.format(key, value))
    return value
def score_batch(self,
                hypotheses: List[List[str]],
                references: List[List[str]]) -> float:
    """Compute BLEU of the hypotheses by calling the external perl wrapper.

    The references are written to a temporary file whose name is passed
    to the wrapper; the hypotheses are sent to its standard input. The
    first line of the wrapper's standard output is parsed as the score.

    Arguments:
        hypotheses: Tokenized hypothesis sentences.
        references: Tokenized reference sentences.

    Returns:
        The score as a float, or 0.0 (after logging the problem) when the
        wrapper prints nothing or its first line is not a number.
    """
    ref_bytes = self.serialize_to_bytes(references)
    hyp_bytes = self.serialize_to_bytes(hypotheses)

    # The context manager closes (and thereby removes) the temporary file
    # as soon as the wrapper has finished, instead of leaking the handle.
    with tempfile.NamedTemporaryFile() as reffile:
        reffile.write(ref_bytes)
        # Make sure the data are on disk before the subprocess reads them.
        reffile.flush()

        output_proc = subprocess.run(["perl", self.wrapper, reffile.name],
                                     input=hyp_bytes,
                                     stderr=subprocess.PIPE,
                                     stdout=subprocess.PIPE)

    proc_stdout = output_proc.stdout.decode("utf-8")  # type: ignore
    lines = proc_stdout.splitlines()

    try:
        bleu_score = float(lines[0])
        return bleu_score
    except IndexError:
        # The wrapper produced no output at all.
        log("Error: Malformed output from BLEU wrapper:", color="red")
        log(proc_stdout, color="red")
        log("=======", color="red")
        return 0.0
    except ValueError:
        log("Value error - bleu '{}' is not a number.".format(lines[0]),
            color="red")
        return 0.0
def create_series(path, preprocess=lambda x: x):
    """Load a data series from a file, based on its detected MIME type.

    Args:
        path: Path of the file with the series.
        preprocess: Callable applied to every line of a text file.

    Returns:
        For `text/*` files a list of the preprocessed lines read by
        `PlainTextFileReader`; for `application/octet-stream` files the
        result of `np.load`.

    Raises:
        Exception: For any other MIME type.
    """
    log("Loading {}".format(path))
    mime_type = magic.from_file(path, mime=True)

    if mime_type == 'application/octet-stream':
        return np.load(path)

    if not mime_type.startswith('text/'):
        raise Exception("\"{}\" has Unsupported data type: {}"
                        .format(path, mime_type))

    text_reader = PlainTextFileReader(path)
    return [preprocess(text_line) for text_line in text_reader.read()]
def restore(self, variable_files: Union[str, List[str]]) -> None:
    """Restore the variables of every session from the given file(s).

    Arguments:
        variable_files: One file name, or a list with one file name per
            session (paired with `self.sessions` in order).

    Raises:
        RuntimeError: If the saver has not been initialized.
        Exception: If the number of files differs from the number of
            sessions.
    """
    if self.saver is None:
        raise RuntimeError("Saver uninitialized")

    # A single file name is treated as a one-element list.
    files = ([variable_files] if isinstance(variable_files, str)
             else variable_files)

    file_count = len(files)
    session_count = len(self.sessions)
    if file_count != session_count:
        raise Exception(
            "Provided {} files for restoring {} sessions.".format(
                file_count, session_count))

    for session, checkpoint in zip(self.sessions, files):
        log("Loading variables from {}".format(checkpoint))
        self.saver.restore(session, checkpoint)
        log("Variables loaded from {}".format(checkpoint))
def log_after_validation(self, val_examples: int,
                         train_examples: int) -> None:
    """Log the speed of the last validation and of the training before it.

    Uses the last entries of `self.inter_val_times` and
    `self.validation_times`. A notice is emitted when the inter-validation
    time is less than twice the validation time.

    Arguments:
        val_examples: Number of instances divided by the validation time.
        train_examples: Number of instances divided by the
            inter-validation time.
    """
    inter_val_secs = self.inter_val_times[-1]
    validation_secs = self.validation_times[-1]

    training_rate = train_examples / inter_val_secs
    validation_rate = val_examples / validation_secs

    template = ("Validation time: {:.2f}s ({:.1f} instances/sec), "
                "inter-validation: {:.2f}s, ({:.1f} instances/sec)")
    log(template.format(validation_secs, validation_rate,
                        inter_val_secs, training_rate),
        color="blue")

    if inter_val_secs < 2 * validation_secs:
        notice("Validation period setting is inefficient.")
def _apply_change(config_dict: Dict[str, Any], setting: str) -> None: if "=" not in setting: raise ParseError("Invalid setting '{}'".format(setting)) key, value = (s.strip() for s in setting.split("=", maxsplit=1)) if "." in key: section, option = key.split(".", maxsplit=1) else: section = "main" option = key if section not in config_dict: log("Creating new section '{}'".format(section)) config_dict[section] = OrderedDict() config_dict[section][option] = -1, value # no line number
def train(self) -> None:
    """Train the model specified by this experiment.

    Builds the model if it has not been built yet, increments
    `cont_index`, writes the experiment files (configuration, original
    INI file, git information, log), runs the training loop and saves
    the final variables. If the model has test datasets, the best
    variables are restored (when a best score index exists) and every
    test dataset is evaluated.

    Raises:
        `RuntimeError` when the experiment is not intended for training.
    """
    if not self.train_mode:
        raise RuntimeError("train() was called, but the experiment was "
                           "created with train_mode=False")

    if not self._model_built:
        self.build_model()

    self.cont_index += 1

    # Initialize the experiment directory.
    self.config.save_file(self.get_path("experiment.ini"))
    shutil.copyfile(self._config_path, self.get_path("original.ini"))
    commit_path = self.get_path("git_commit")
    diff_path = self.get_path("git_diff")
    save_git_info(commit_path, diff_path)
    Logging.set_log_file(self.get_path("experiment.log"))

    Logging.print_header(self.model.name, self.model.output)

    with self.graph.as_default():
        manager = self.model.tf_manager
        manager.init_saving(self.get_path("variables.data"))

        training_loop(cfg=self.model)

        final_variables = self.get_path("variables.data.final")
        log("Saving final variables in {}".format(final_variables))
        self.model.tf_manager.save(final_variables)

        if self.model.test_datasets:
            if self.model.tf_manager.best_score_index is not None:
                self.model.tf_manager.restore_best_vars()

            test_id = 0
            for dataset in self.model.test_datasets:
                self.evaluate(dataset, write_out=True,
                              name="test_{}".format(test_id))
                test_id += 1

        log("Finished.")
        self._vars_loaded = True
def load_variables(self, variable_files: List[str] = None) -> None:
    """Load variables of the built model from file(s).

    The model is built first if needed. When no variable files are
    given, a default checkpoint is chosen by the first match of:

    1. `variables.data.avg-0` (if its `.index` file exists),
    2. `variables.data.avg` (if its `.index` file exists),
    3. the checkpoint whose name (relative to the output directory) is
       stored in the `variables.data.best` file,
    4. `variables.data.final` (if its `.index` file exists).

    Arguments:
        variable_files: A list of variable files to load. The length of
            this list should match the number of sessions.

    Raises:
        RuntimeError: If no default checkpoint can be inferred, or if the
            `.index` file of any variable file does not exist.
    """
    if not self._model_built:
        self.build_model()

    file_exists = os.path.exists

    if variable_files is None:
        if file_exists(self.get_path("variables.data.avg-0.index")):
            variable_files = [self.get_path("variables.data.avg-0")]
        elif file_exists(self.get_path("variables.data.avg.index")):
            variable_files = [self.get_path("variables.data.avg")]
        elif file_exists(self.get_path("variables.data.best")):
            # The "best" file is a pointer holding the checkpoint name.
            pointer_file = self.get_path("variables.data.best")
            with open(pointer_file, "r") as f_best:
                relative_name = f_best.read().rstrip()
            variable_files = [os.path.join(self.config.args.output,
                                           relative_name)]
        elif file_exists(self.get_path("variables.data.final.index")):
            variable_files = [self.get_path("variables.data.final")]
        else:
            raise RuntimeError("Cannot infer default variables file")

        log("Default variable file '{}' will be used for loading "
            "variables.".format(variable_files[0]))

    for prefix in variable_files:
        if file_exists("{}.index".format(prefix)):
            continue
        raise RuntimeError(
            "Index file for var prefix {} does not exist".format(prefix))

    self.model.tf_manager.restore(variable_files)
    self._vars_loaded = True
def test_mt_trainer(self):
    """Check that `MultitaskTrainer` cycles through its trainers.

    The trainer is built from three entries (trainer1, trainer2,
    trainer1). The test asserts that `trainer_idx` advances 0 -> 1 -> 2
    -> 0 with consecutive executables, and that the first fetched loss
    alternates between the loss of `mpart` and `mpart_2` accordingly.
    """
    # TODO(tf-data) multitask trainer is likely broken by the changes
    trainer = MultitaskTrainer(
        [self.trainer1, self.trainer2, self.trainer1])

    feedables = {self.mpart, self.mpart_2, self.trainer1, self.trainer2}
    for feedable in feedables:
        feedable.register_input(self.dataset)

    # Formatting the fetches references them before the assertions run.
    log("Blessing trainer fetches: {}".format(trainer.fetches))

    self.assertSetEqual(trainer.feedables, feedables)
    self.assertSetEqual(trainer.parameterizeds, {self.mpart, self.mpart_2})

    self.assertSetEqual(
        set(trainer.var_list), {self.mpart.var, self.mpart_2.var})

    self.assertTrue(trainer.trainer_idx == 0)

    # First executable: expected to use the loss of `mpart`.
    executable = trainer.get_executable()
    # mparts = trainer.feedables
    fetches, feeds = executable.next_to_execute()
    # self.assertSetEqual(mparts, {self.mpart})
    self.assertFalse(feeds)

    self.assertTrue(trainer.trainer_idx == 1)
    self.assertTrue(fetches["losses"][0] == self.mpart.loss)

    # Second executable: expected to use the loss of `mpart_2`.
    executable = trainer.get_executable()
    fetches, feeds = executable.next_to_execute()
    # self.assertSetEqual(mparts, {self.mpart_2})
    self.assertFalse(feeds)

    self.assertTrue(trainer.trainer_idx == 2)
    self.assertTrue(fetches["losses"][0] == self.mpart_2.loss)

    # Third executable: `mpart` again; the index wraps around to 0.
    executable = trainer.get_executable()
    fetches, feeds = executable.next_to_execute()
    # self.assertSetEqual(mparts, {self.mpart})
    self.assertFalse(feeds)

    self.assertTrue(trainer.trainer_idx == 0)
    self.assertTrue(fetches["losses"][0] == self.mpart.loss)
def _main() -> None:
    """Parse the command line and train (or only initialize) an experiment.

    The command line used is stored in the `args` file of the upcoming
    run. With `--init`, only the experiment directory is prepared and the
    process exits (status 1 if the experiment already exists, 0
    otherwise). Otherwise the experiment is trained; any exception other
    than a keyboard interrupt is logged and ends the process with
    status 1.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    add_option = parser.add_argument
    add_option("config", metavar="INI-FILE",
               help="the configuration file for the experiment")
    add_option("-s", "--set", type=str, metavar="SETTING",
               action="append", dest="config_changes", default=[],
               help="override an option in the configuration; the "
               "syntax is [section.]option=value")
    add_option("-v", "--var", type=str, metavar="VAR", default=[],
               action="append", dest="config_vars",
               help="set a variable in the configuration; the syntax "
               "is var=value (shorthand for -s vars.var=value)")
    add_option("-i", "--init", dest="init_only", action="store_true",
               help="initialize the experiment directory and exit "
               "without building the model")
    add_option("-f", "--overwrite", action="store_true",
               help="force overwriting the output directory; can be "
               "used to start an experiment created with --init")
    args = parser.parse_args()

    # Variables are just settings in the `vars` section.
    for var_setting in args.config_vars:
        args.config_changes.append("vars.{}".format(var_setting))

    exp = Experiment(config_path=args.config,
                     config_changes=args.config_changes,
                     train_mode=True,
                     overwrite_output_dir=args.overwrite)

    args_path = exp.get_path("args", exp.cont_index + 1)
    with open(args_path, "w") as file:
        quoted_argv = " ".join(shlex.quote(a) for a in sys.argv)
        print(quoted_argv, file=file)

    if args.init_only:
        if exp.cont_index >= 0:
            log("The experiment directory already exists.", color="red")
            exit(1)

        exp.config.save_file(exp.get_path("experiment.ini", 0))
        copyfile(args.config, exp.get_path("original.ini", 0))

        log("Experiment directory initialized.")

        start_command = [os.path.basename(sys.argv[0]), "-f",
                         exp.get_path("experiment.ini", 0)]
        quoted_command = " ".join(shlex.quote(a) for a in start_command)
        log("To start experiment, run: {}".format(quoted_command))
        exit(0)

    try:
        exp.train()
    except KeyboardInterrupt:  # pylint: disable=try-except-raise
        raise
    except Exception:  # pylint: disable=broad-except
        log(traceback.format_exc(), color="red")
        exit(1)
def from_datasets(datasets, series_ids, max_size, random_seed=None):
    # type: (List[Dataset], List[str], int, int) -> Vocabulary
    """Build a vocabulary from the given series of the given datasets.

    All tokens of every available (truthy) series are added to the
    vocabulary, which is then truncated with `max_size`. The size and
    five randomly drawn words are logged.

    Returns:
        The new vocabulary.
    """
    vocabulary = Vocabulary(random_seed=random_seed)

    for dataset in datasets:
        for series_id in series_ids:
            series = dataset.get_series(series_id, allow_none=True)
            if not series:
                continue

            # Flatten the sentences into a single list of tokens.
            all_tokens = []
            for sentence in series:
                all_tokens.extend(sentence)
            vocabulary.add_tokenized_text(all_tokens)

    vocabulary.trunkate(max_size)

    log("Vocabulary for series {} initialized, containing {} words"
        .format(series_ids, len(vocabulary)))

    sample_ids = np.random.randint(0, len(vocabulary), 5)
    sample_words = [vocabulary.index_to_word[i] for i in sample_ids]
    log("Sample of the vocabulary: {}".format(sample_words))

    return vocabulary
def concat_encoder_projection(
        train_mode: tf.Tensor,
        rnn_size: int = None,
        encoders: List[Stateful] = None) -> tf.Tensor:
    """Concatenate the encoded values of the encoders.

    Arguments:
        train_mode: Not used by this projection.
        rnn_size: Optional expected size; must equal the summed size of
            dimension 1 of the encoder outputs when given.
        encoders: The encoders whose `output` tensors are concatenated
            along axis 1.

    Raises:
        ValueError: If there are no encoders or the sizes do not match.
    """
    if not encoders:
        raise ValueError("There must be at least one encoder for this type "
                         "of encoder projection")

    output_size = 0
    for encoder in encoders:
        output_size += encoder.output.get_shape()[1].value

    size_mismatch = rnn_size is not None and rnn_size != output_size
    if size_mismatch:
        raise ValueError("RNN size supplied for concat projection ({}) does "
                         "not match the size of the concatenated vectors ({})."
                         .format(rnn_size, output_size))

    log("The inferred rnn_size of this encoder projection will be {}"
        .format(output_size))

    encoder_outputs = [encoder.output for encoder in encoders]
    return tf.concat(encoder_outputs, 1)
def main():
    """Preprocess the listed images and pickle them into a gzip file.

    Every line of `--list` is an image path relative to `--img-root`.
    Each image is run through `STRPreprocessor` and its result is
    appended to `--output-file`; the paths of the stored images are
    written to `--output-log`. Images whose processing raises are
    skipped and reported. Padding and shrinkage statistics of the
    preprocessor are logged at the end.
    """
    parser = argparse.ArgumentParser(description="Prepares the STR data.")
    parser.add_argument("--list", type=argparse.FileType('r'),
                        help="File with images.", required=True)
    parser.add_argument("--img-root", type=str, required=True,
                        help="Directory with images.")
    parser.add_argument("--height", type=int, default=32)
    parser.add_argument("--max-width", type=int, default=320)
    parser.add_argument("--output-file", type=str, required=True)
    parser.add_argument("--output-log", type=argparse.FileType('w'),
                        required=True)
    args = parser.parse_args()

    preprocessor = STRPreprocessor(args.height, args.max_width)

    processed = 0
    # The context manager finalizes the gzip stream even when the loop is
    # left by an exception that is not caught below (e.g. Ctrl-C).
    with gzip.open(args.output_file, mode='wb') as f_out:
        for i, line in enumerate(args.list):
            img_path = os.path.join(args.img_root, line.rstrip())
            try:
                img = preprocessor(img_path)
                pickle.dump(img, f_out)
                args.output_log.write("{}\n".format(img_path))
                processed += 1
                if i % 1000 == 999:
                    log("Processed {} images".format(i + 1))
            except Exception as exc:
                # A broken image must not stop the whole run.
                log("Skipped {} (no. {}), expeption {}".format(img_path, i, exc),
                    color='red')
        log("Done, saved {} images to {}".format(processed, args.output_file))

    # `np.mean` of an empty list would be NaN, hence the explicit 0.0.
    log("Padded {} times, on averaged {:.0f} pixels".
        format(len(preprocessor.paddings),
               np.mean(preprocessor.paddings) if preprocessor.paddings else 0.0))
    log("Shrinked {} times, on averaged {:.0f} pixels".
        format(len(preprocessor.shrinkages),
               np.mean(preprocessor.shrinkages) if preprocessor.shrinkages else 0.0))
def initialize_tf(initial_variables, threads):
    """Create a TensorFlow session for the already built graph.

    All variables are initialized first; if a variables file is given,
    the variables are then restored from it.

    Args:
        initial_variables: File with the saved TF variables (a falsy
            value skips the restoring).
        threads: Used as both the inter-op and the intra-op parallelism
            thread count of the session.

    Returns:
        A tuple of the TF session and the TF saver object.
    """
    log("Initializing the TensorFlow session.")

    session_config = tf.ConfigProto(
        inter_op_parallelism_threads=threads,
        intra_op_parallelism_threads=threads)
    session = tf.Session(config=session_config)
    session.run(tf.initialize_all_variables())

    variables_saver = tf.train.Saver()
    if initial_variables:
        log("Loading variables from {}".format(initial_variables))
        variables_saver.restore(session, initial_variables)

    log("Session initialization done.")
    return session, variables_saver
def __init__(self,
             name: str,
             input_sequence: TemporalStateful,
             ff_hidden_size: int,
             depth: int,
             n_heads: int,
             dropout_keep_prob: float = 1.0,
             attention_dropout_keep_prob: float = 1.0,
             target_space_id: int = None,
             use_att_transform_bias: bool = False,
             use_positional_encoding: bool = True,
             input_for_cross_attention: Attendable = None,
             n_cross_att_heads: int = None,
             reuse: ModelPart = None,
             save_checkpoint: str = None,
             load_checkpoint: str = None,
             initializers: InitializerSpecs = None) -> None:
    """Create an encoder of the Transformer model.

    Described in Vaswani et al. (2017), arxiv.org/abs/1706.03762

    Arguments:
        input_sequence: Embedded input sequence. Its `dimension` becomes
            the model dimension.
        name: Name of the encoder. Should be unique accross all Neural
            Monkey objects.
        reuse: Reuse the model variables.
        dropout_keep_prob: Probability of keeping a value during dropout.
        target_space_id: Specifies the modality of the target space.
        use_att_transform_bias: Add bias when transforming qkv vectors
            for attention.
        use_positional_encoding: If True, position encoding signal is
            added to the input.
        save_checkpoint: Passed to `ModelPart.__init__`.
        load_checkpoint: Passed to `ModelPart.__init__`.
        initializers: Passed to `ModelPart.__init__`.

    Keyword arguments:
        ff_hidden_size: Size of the feedforward sublayers.
        n_heads: Number of the self-attention heads.
        depth: Number of sublayers.
        attention_dropout_keep_prob: Probability of keeping a value
            during dropout on the attention output.
        input_for_cross_attention: An attendable model part that is
            attended using cross-attention on every layer of the
            decoder, analogically to how encoder is attended in the
            decoder.
        n_cross_att_heads: Number of heads used in the cross-attention.

    Raises:
        ValueError: If `depth` or `ff_hidden_size` is not positive, if a
            keep probability is outside (0,1], if `target_space_id` is
            given and outside 0..31, if only one of
            `input_for_cross_attention` and `n_cross_att_heads` is given,
            or if the last dimension of the cross-attention states
            differs from the model dimension.
    """
    check_argument_types()
    ModelPart.__init__(self, name, reuse, save_checkpoint, load_checkpoint,
                       initializers)

    self.input_sequence = input_sequence
    self.model_dimension = self.input_sequence.dimension
    self.ff_hidden_size = ff_hidden_size
    self.depth = depth
    self.n_heads = n_heads
    self.dropout_keep_prob = dropout_keep_prob
    self.attention_dropout_keep_prob = attention_dropout_keep_prob
    self.target_space_id = target_space_id
    self.use_att_transform_bias = use_att_transform_bias
    self.use_positional_encoding = use_positional_encoding
    self.input_for_cross_attention = input_for_cross_attention
    self.n_cross_att_heads = n_cross_att_heads

    # Validate the hyperparameters before anything is built.
    if self.depth <= 0:
        raise ValueError("Depth must be a positive integer.")

    if self.ff_hidden_size <= 0:
        raise ValueError("Feed forward hidden size must be a "
                         "positive integer.")

    if self.dropout_keep_prob <= 0.0 or self.dropout_keep_prob > 1.0:
        raise ValueError("Dropout keep prob must be inside (0,1].")

    if (self.attention_dropout_keep_prob <= 0.0
            or self.attention_dropout_keep_prob > 1.0):
        raise ValueError("Dropout keep prob for attn must be in (0,1].")

    if self.target_space_id is not None and (self.target_space_id >= 32
                                             or self.target_space_id < 0):
        raise ValueError(
            "If provided, the target space ID should be between 0 and 31. "
            "Was: {}".format(self.target_space_id))

    # The two cross-attention arguments only make sense together.
    if (input_for_cross_attention is None) != (n_cross_att_heads is None):
        raise ValueError(
            "Either both input_for_cross_attention and n_cross_att_heads "
            "must be provided or none of them.")

    if input_for_cross_attention is not None:
        cross_att_dim = get_attention_states(
            input_for_cross_attention).get_shape()[-1].value

        if cross_att_dim != self.model_dimension:
            raise ValueError(
                "The input for cross-attention must be of the same "
                "dimension as the model, was {}.".format(cross_att_dim))

    self._variable_scope.set_initializer(tf.variance_scaling_initializer(
        mode="fan_avg", distribution="uniform"))

    # Formatting `self.output` references the output op at construction
    # time. NOTE(review): presumably this is what triggers the lazy graph
    # building of this model part — confirm.
    log("Output op: {}".format(self.output))
def main() -> None:
    """Run a trained experiment on test datasets.

    The experiment is built from the `config` INI file and its variables
    are loaded from the files listed in the datasets INI file. Every test
    dataset is either only run (when the experiment has no evaluation
    configured) or evaluated; evaluation results can be dumped as JSON
    with `--json`. With `--grid`, the single test dataset is sliced
    according to the SGE array-job environment variables.

    Raises:
        ValueError: With `--grid` and more than one test dataset.
        EnvironmentError: With `--grid` when an SGE variable is missing.
    """
    # pylint: disable=no-member,broad-except
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("config", metavar="INI-FILE",
                        help="the configuration file of the experiment")
    parser.add_argument("datasets", metavar="INI-TEST-DATASETS",
                        help="the configuration of the test datasets")
    parser.add_argument("--json", type=str, help="write the evaluation "
                        "results to this file in JSON format")
    parser.add_argument("-g", "--grid", dest="grid", action="store_true",
                        help="look at the SGE variables for slicing the data")
    args = parser.parse_args()

    test_datasets = Configuration()
    test_datasets.add_argument("test_datasets")
    test_datasets.add_argument("variables", cond=lambda x: isinstance(x, list))

    test_datasets.load_file(args.datasets)
    test_datasets.build_model()
    datasets_model = test_datasets.model

    exp = Experiment(config_path=args.config)
    exp.build_model()
    exp.load_variables(datasets_model.variables)

    if args.grid and len(datasets_model.test_datasets) > 1:
        raise ValueError("Only one test dataset supported when using --grid")

    results = []
    for dataset in datasets_model.test_datasets:
        if args.grid:
            if ("SGE_TASK_FIRST" not in os.environ
                    or "SGE_TASK_LAST" not in os.environ
                    or "SGE_TASK_STEPSIZE" not in os.environ
                    or "SGE_TASK_ID" not in os.environ):
                raise EnvironmentError(
                    "Some SGE environment variables are missing")

            step = int(os.environ["SGE_TASK_STEPSIZE"])
            # SGE task ids are 1-based, dataset offsets are 0-based.
            start = int(os.environ["SGE_TASK_ID"]) - 1
            end = int(os.environ["SGE_TASK_LAST"]) - 1

            # The task number must be derived from the nominal step size;
            # using the truncated length of the last slice would report a
            # wrong task number.
            task_index = start // step

            # The last slice may be shorter than the step size.
            length = step
            if start + length > end:
                length = end - start + 1

            log("Running grid task {} starting at {} with step {}".format(
                task_index, start, length))

            dataset = dataset.subset(start, length)

        if exp.config.args.evaluation is None:
            exp.run_model(dataset, write_out=True)
        else:
            eval_result = exp.evaluate(dataset, write_out=True)
            results.append(eval_result)

    if args.json:
        with open(args.json, "w") as f_out:
            json.dump(results, f_out)
            f_out.write("\n")

    for session in exp.config.model.tf_manager.sessions:
        session.close()
def main() -> None:
    """Run the configured model on test datasets and print the evaluation.

    The model is built from the `config` INI file via the global `CONFIG`
    and prepared with `initialize_for_running` using the variables listed
    in the datasets INI file. Every test dataset is run with the runners'
    batch size (falling back to the training batch size) and evaluated.
    With `--grid`, the single test dataset is sliced according to the SGE
    array-job environment variables.

    Raises:
        ValueError: With `--grid` and more than one test dataset.
        EnvironmentError: With `--grid` when an SGE variable is missing.
    """
    # pylint: disable=no-member,broad-except
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("config", metavar="INI-FILE",
                        help="the configuration file of the experiment")
    parser.add_argument('datasets', metavar='INI-TEST-DATASETS',
                        help="the configuration of the test datasets")
    parser.add_argument("-g", "--grid", dest="grid", action="store_true",
                        help="look at the SGE variables for slicing the data")
    args = parser.parse_args()

    test_datasets = Configuration()
    test_datasets.add_argument('test_datasets')
    test_datasets.add_argument('variables')

    CONFIG.load_file(args.config)
    CONFIG.build_model()
    test_datasets.load_file(args.datasets)
    test_datasets.build_model()
    datasets_model = test_datasets.model
    initialize_for_running(CONFIG.model.output, CONFIG.model.tf_manager,
                           datasets_model.variables)

    print("")

    # Two-element evaluator specs are expanded to three elements by
    # repeating the first one.
    evaluators = [(e[0], e[0], e[1]) if len(e) == 2 else e
                  for e in CONFIG.model.evaluation]

    if args.grid and len(datasets_model.test_datasets) > 1:
        raise ValueError("Only one test dataset supported when using --grid")

    for dataset in datasets_model.test_datasets:
        if args.grid:
            if ("SGE_TASK_FIRST" not in os.environ
                    or "SGE_TASK_LAST" not in os.environ
                    or "SGE_TASK_STEPSIZE" not in os.environ
                    or "SGE_TASK_ID" not in os.environ):
                raise EnvironmentError(
                    "Some SGE environment variables are missing")

            step = int(os.environ["SGE_TASK_STEPSIZE"])
            # SGE task ids are 1-based, dataset offsets are 0-based.
            start = int(os.environ["SGE_TASK_ID"]) - 1
            end = int(os.environ["SGE_TASK_LAST"]) - 1

            # The task number must be derived from the nominal step size;
            # using the truncated length of the last slice would report a
            # wrong task number.
            task_index = start // step

            # The last slice may be shorter than the step size.
            length = step
            if start + length > end:
                length = end - start + 1

            log("Running grid task {} starting at {} with step {}".format(
                task_index, start, length))

            dataset = dataset.subset(start, length)

        if CONFIG.model.runners_batch_size is None:
            runners_batch_size = CONFIG.model.batch_size
        else:
            runners_batch_size = CONFIG.model.runners_batch_size

        execution_results, output_data = run_on_dataset(
            CONFIG.model.tf_manager, CONFIG.model.runners,
            dataset, CONFIG.model.postprocess, write_out=True,
            batch_size=runners_batch_size, log_progress=60)
        # TODO what if there is no ground truth
        eval_result = evaluation(evaluators, dataset, CONFIG.model.runners,
                                 execution_results, output_data)
        if eval_result:
            print_final_evaluation(dataset.name, eval_result)

    # Drop the references to all sessions, one by one from the front.
    for _ in range(len(CONFIG.model.tf_manager.sessions)):
        del CONFIG.model.tf_manager.sessions[0]
def __init__(self, max_input_len, vocabulary, data_id, embedding_size,
             rnn_size, name, dropout_keep_p=0.5,
             use_noisy_activations=False, use_pervasive_dropout=False,
             attention_type=None, attention_fertility=3,
             parent_encoder=None):
    """Build the graph of a bidirectional GRU sentence encoder.

    Arguments:
        max_input_len: Used to size the lists of input and weight
            placeholders (``max_input_len + 2`` of each are created).
        vocabulary: A `Vocabulary` instance; its length is the number of
            rows of the embedding matrix.
        data_id: Stored on the instance; not otherwise used here.
        embedding_size: Number of columns of the embedding matrix.
        rnn_size: Size passed to the GRU cells.
        name: Name of the encoder, used as the variable scope.
        dropout_keep_p: Stored on the instance; not otherwise used here.
        use_noisy_activations: If true (and no parent encoder is given),
            `NoisyGRUCell` is used instead of the plain GRU cell.
        use_pervasive_dropout: If true, both cells are wrapped in
            `PervasiveDropoutWrapper`.
        attention_type: Callable used to build `self.attention_object`;
            if falsy, `self.attention_object` is None.
        attention_fertility: Passed as `max_fertility` to `attention_type`.
        parent_encoder: Optional `SentenceEncoder` whose word embeddings
            and GRU cells are reused instead of creating new ones.
    """
    self.name = name
    self.max_input_len = max_input_len

    assert_type(self, 'vocabulary', vocabulary, Vocabulary)
    self.vocabulary = vocabulary
    self.data_id = data_id

    self.embedding_size = embedding_size
    self.rnn_size = rnn_size
    # NOTE(review): max_input_len is assigned a second time here with the
    # same value; the repetition looks redundant.
    self.max_input_len = max_input_len
    self.dropout_keep_p = dropout_keep_p
    self.use_noisy_activations = use_noisy_activations
    self.use_pervasive_dropout = use_pervasive_dropout
    self.attention_type = attention_type
    self.attention_fertility = attention_fertility

    assert_type(self, 'parent_encoder', parent_encoder, SentenceEncoder,
                can_be_none=True)
    self.parent_encoder = parent_encoder

    log("Initializing sentence encoder, name: \"{}\"".format(name))
    with tf.variable_scope(name):
        self.dropout_placeholder = tf.placeholder(tf.float32,
                                                  name="dropout")
        self.is_training = tf.placeholder(tf.bool, name="is_training")

        # One int32 placeholder per position, max_input_len + 2 in total.
        self.inputs = [tf.placeholder(tf.int32, shape=[None],
                                      name="input_{}".format(i))
                       for i in range(max_input_len + 2)]

        # NOTE(review): the weight placeholders use the same "input_{}"
        # name pattern as the inputs above -- confirm this is intended.
        self.weight_ins = [tf.placeholder(tf.float32, shape=[None],
                                          name="input_{}".format(i))
                           for i in range(max_input_len + 2)]

        # Per-position weights stacked along axis 1.
        self.weight_tensor = tf.concat(1, [tf.expand_dims(w, 1)
                                           for w in self.weight_ins])

        # Sentence lengths are obtained by summing the weight placeholders.
        self.sentence_lengths = tf.to_int64(sum(self.weight_ins))

        # Reuse the parent's embedding matrix if a parent is given.
        if parent_encoder:
            self.word_embeddings = parent_encoder.word_embeddings
        else:
            self.word_embeddings = tf.Variable(tf.random_uniform(
                [len(vocabulary), embedding_size], -1.0, 1.0))

        embedded_inputs = [tf.nn.embedding_lookup(self.word_embeddings, i)
                           for i in self.inputs]

        dropped_embedded_inputs = [
            tf.nn.dropout(i, self.dropout_placeholder)
            for i in embedded_inputs]

        # Reuse the parent's GRU cells if a parent is given.
        if parent_encoder:
            self.forward_gru = parent_encoder.forward_gru
            self.backward_gru = parent_encoder.backward_gru
        else:
            if use_noisy_activations:
                self.forward_gru = NoisyGRUCell(rnn_size, self.is_training)
                self.backward_gru = NoisyGRUCell(rnn_size,
                                                 self.is_training)
            else:
                self.forward_gru = tf.nn.rnn_cell.GRUCell(rnn_size)
                self.backward_gru = tf.nn.rnn_cell.GRUCell(rnn_size)

        if use_pervasive_dropout:
            # create dropout mask (shape batch x rnn_size)
            # floor (random uniform + dropout_keep)
            shape = tf.concat(0, [tf.shape(self.inputs[0]), [rnn_size]])

            forward_dropout_mask = tf.floor(
                tf.random_uniform(shape, 0.0, 1.0) +
                self.dropout_placeholder)

            backward_dropout_mask = tf.floor(
                tf.random_uniform(shape, 0.0, 1.0) +
                self.dropout_placeholder)

            # Reciprocal of the value fed to the dropout placeholder.
            scale = tf.inv(self.dropout_placeholder)

            self.forward_gru = PervasiveDropoutWrapper(
                self.forward_gru, forward_dropout_mask, scale)
            self.backward_gru = PervasiveDropoutWrapper(
                self.backward_gru, backward_dropout_mask, scale)

        bidi_layer = BidirectionalRNNLayer(self.forward_gru,
                                           self.backward_gru,
                                           dropped_embedded_inputs,
                                           self.sentence_lengths)

        self.outputs_bidi = bidi_layer.outputs_bidi
        self.encoded = bidi_layer.encoded

        # Bidirectional outputs stacked along axis 1.
        self.attention_tensor = tf.concat(1, [tf.expand_dims(o, 1)
                                              for o in self.outputs_bidi])

        # The attention object is only built when an attention type is set.
        self.attention_object = attention_type(
            self.attention_tensor,
            scope="attention_{}".format(name),
            dropout_placeholder=self.dropout_placeholder,
            input_weights=self.weight_tensor,
            max_fertility=attention_fertility) if attention_type else None

        log("Sentence encoder initialized")
def __init__(self, decoder, initial_trainer, xent_calls, moving_calls):
    """Construct the TensorFlow graph for the MIXER code.

    The graph consists of a regressor estimating BLEU from the decoder's
    hidden states and of the gradients from the REINFORCE algorithm, mixed
    per time step with the cross-entropy gradients.

    Args:
        decoder: Decoder.
        initial_trainer: Trainer stored as `self.xent_trainer`.
        xent_calls: The number minibatches for which the standard
            crossentropy learning will be used.
        moving_calls: Number of minibatches after which the algorithm
            will proceed to use the REINFORCE algorithm for a longer
            suffix of the sentences.

    Raises:
        Exception: If a cross-entropy gradient is neither None, a
            `tf.Tensor` nor a `tf.IndexedSlices`.
    """
    # TODO L2 regularization
    # TODO plot gradients

    self.xent_trainer = initial_trainer
    self.decoder = decoder

    self.called = 0
    self.xent_calls = xent_calls
    self.moving_calls = moving_calls

    with tf.variable_scope('mixer'):
        # BLEU score needs to be computed outside the TF
        self.bleu = tf.placeholder(tf.float32, [None])

        hidden_states = decoder.hidden_states

        # a simple regressor that estimates the BLEU score from the
        # network's hidden states
        with tf.variable_scope('exprected_reward_regressor'):
            linear_reg_W = tf.Variable(
                tf.truncated_normal([decoder.rnn_size, 1]))
            linear_reg_b = tf.Variable(tf.zeros([1]))

            expected_rewards = [
                tf.squeeze(tf.matmul(h, linear_reg_W)) + linear_reg_b
                for h in hidden_states]

            regression_loss = sum(
                [(r - self.bleu) ** 2 for r in expected_rewards]) * 0.5
            self.regression_optimizer = \
                tf.train.AdamOptimizer(1e-3).minimize(regression_loss)

        ## decoded_logits: list of [batch x vocabulary] tensors
        ##                 (length max sequence)
        ## decoded_seq: list of [batch x 1] tensors (length sequence) --
        ##              contains vocabulary indices (argmaxs)
        with tf.variable_scope("reinforce_gradients"):
            # this is a dirty trick to get the indices of maxima in the
            # logits
            max_logits = [tf.expand_dims(tf.reduce_max(l, 1), 1)
                          for l in decoder.decoded_logits]  ## batch x 1 x 1

            indicator = [
                tf.to_float(tf.equal(ml, l))
                for ml, l in zip(max_logits, decoder.decoded_logits)]
            ## batch x vocabulary

            log("Forward cmomputation graph ready")

            # this is implementation of equation (11) in the paper
            derivatives = [
                tf.reduce_sum(
                    tf.expand_dims(self.bleu - r, 1)
                    * (tf.nn.softmax(l) - i) * w,
                    0, keep_dims=True)
                for r, l, i, w in zip(
                    expected_rewards, decoder.decoded_logits,
                    indicator, decoder.weights_ins)]
            ## ^^^ list of [1 x vocabulary] tensors

            # these derivatives are constant for us now, we don't really
            # want to propagate the gradient back to this computation
            derivatives_stopped = [tf.stop_gradient(d) for d in derivatives]

            # we must train the regressor independently
            trainable_vars = [v for v in tf.trainable_variables()
                              if not v.name.startswith('mixer')]

            # this is implementation of equation (10) in the paper
            reinforce_gradients = [
                tf.gradients(l * d, trainable_vars)
                for l, d in zip(decoder.decoded_logits,
                                derivatives_stopped)]
            ## ^^^ [vocabulary x shape of variables] (of length max seq)

            log("Reinfoce gradients computed")

        with tf.variable_scope("cross_entropy_gradients"):
            cross_entropies = [
                tf.reduce_sum(
                    tf.nn.sparse_softmax_cross_entropy_with_logits(l, t)
                    * w, 0)
                for l, t, w in zip(decoder.decoded_logits,
                                   decoder.targets,
                                   decoder.weights_ins)]
            ## ^^^ list of scalars in time

            xent_gradients = [tf.gradients(e, trainable_vars)
                              for e in cross_entropies]
            log("Cross-entropy gradients computed")

        # One scalar mixing weight per time step.
        self.mixer_weights_plc = [tf.placeholder(tf.float32, [])
                                  for _ in hidden_states]

        # a list for each of the trainable variables
        mixed_gradients = []

        for i, (rgs, xent_gs, mix_w) in enumerate(
                zip(reinforce_gradients, xent_gradients,
                    self.mixer_weights_plc)):
            for j, (rg, xent_g) in enumerate(zip(rgs, xent_gs)):
                if xent_g is None:
                    # In the first time step every variable must get its
                    # slot so that later steps can index by position.
                    if i == 0:
                        mixed_gradients.append(None)
                    continue

                if not isinstance(xent_g, (tf.Tensor, tf.IndexedSlices)):
                    # The original formatted an undefined name here, which
                    # raised NameError instead of this exception.
                    raise Exception(
                        "Unnkown type of gradients: {}".format(
                            type(xent_g)))

                g = tf.add(tf.scalar_mul(mix_w, xent_g),
                           tf.scalar_mul(1 - mix_w, rg))

                if i == 0:
                    mixed_gradients.append(g)
                elif mixed_gradients[j] is None:
                    mixed_gradients[j] = g
                else:
                    mixed_gradients[j] += g

        self.mixer_optimizer = tf.train.AdamOptimizer().apply_gradients(
            list(zip(mixed_gradients, trainable_vars)))

        self.summary_gradients = tf.merge_summary(
            tf.get_collection("summary_gradients"))
        self.summary_train = tf.merge_summary(
            tf.get_collection("summary_train"))
def load(name: str,
         series: List[str],
         data: List[SourceSpec],
         outputs: List[OutputSpec] = None,
         buffer_size: int = None,
         shuffled: bool = False) -> "Dataset":
    """Create a dataset using specification from the configuration.

    The dataset provides iterators over data series. The dataset has a buffer,
    which pre-fetches a given number of the data series lazily. In case the
    dataset is not lazy (buffer size is `None`), the iterators are built on top
    of in-memory arrays. Otherwise, the iterators operate on the data sources
    directly.

    Arguments:
        name: The name of the dataset.
        series: A list of names of data series the dataset contains.
        data: The specification of the data sources for each series.
        outputs: A list of output specifications.
        buffer_size: The size of the buffer. If set, the dataset will be loaded
            lazily into the buffer (useful for large datasets). The buffer size
            specifies the number of sequences to pre-load. This is useful for
            pseudo-shuffling of large data on-the-fly. Ideally, this should be
            (much) larger than the batch size. Note that the buffer gets
            refilled each time its size is less than half the `buffer_size`.
            When refilling, the buffer gets refilled to the specified size.
        shuffled: Whether to shuffle the dataset buffer (done upon refill).

    Returns:
        The constructed `Dataset`.

    Raises:
        ValueError: If no series is given, if no series is read from a file,
            if `series` and `data` differ in length, if series names or
            output sources are duplicated, or if a series-level preprocessor
            refers to a source series that has no iterator yet.
        FileNotFoundError: If a file of a file-based series does not exist.
    """
    check_argument_types()

    if not series:
        raise ValueError("No dataset series specified.")

    if not any(match_type(s, ReaderDef) for s in data):  # type: ignore
        raise ValueError("At least one data series should be from a file")

    if len(series) != len(data):
        raise ValueError(
            "The 'series' and 'data' lists should have the same number"
            " of elements: {} vs {}.".format(len(series), len(data)))

    if len(series) != len(set(series)):
        raise ValueError("There are duplicate series.")

    if outputs is not None:
        output_sources = [o[0] for o in outputs]
        if len(output_sources) != len(set(output_sources)):
            raise ValueError("Multiple outputs for a single series")

    log("Initializing dataset {}.".format(name))

    iterators = {}  # type: Dict[str, Callable[[], DataSeries]]

    prep_sl = {}  # type: Dict[str, Tuple[Callable, str]]
    prep_dl = {}  # type: Dict[str, DatasetPreprocess]

    # The factories below bind their arguments per call, so every generated
    # closure keeps its own reader/source instead of the loop's last value.
    def _make_iterator(reader, files):
        def itergen():
            return reader(files)
        return itergen

    def _make_sl_iterator(src, prep):
        def itergen():
            return (prep(item) for item in iterators[src]())
        return itergen

    def _make_dl_iterator(func):
        def itergen():
            return func(iterators)
        return itergen

    # First, prepare iterators for series using file readers
    for s_name, source_spec in zip(series, data):
        if match_type(source_spec, ReaderDef):  # type: ignore
            files, reader = _normalize_readerdef(cast(ReaderDef, source_spec))
            for path in files:
                if not os.path.isfile(path):
                    raise FileNotFoundError(
                        "File not found. Series: {}, Path: {}".format(
                            s_name, path))

            iterators[s_name] = _make_iterator(reader, files)

        elif match_type(source_spec, Tuple[Callable, str]):
            prep_sl[s_name] = cast(Tuple[Callable, str], source_spec)

        else:
            assert match_type(source_spec, DatasetPreprocess)  # type: ignore
            prep_dl[s_name] = cast(DatasetPreprocess, source_spec)

    # Second, prepare series-level preprocessors.
    # Note that series-level preprocessors cannot be stacked on the dataset
    # specification level.
    for s_name, (preprocessor, source) in prep_sl.items():
        if source not in iterators:
            # The original message was never formatted, so the two series
            # names were missing from the error.
            raise ValueError(
                "Source series for series-level preprocessor nonexistent: "
                "Preprocessed series '{}', source series '{}'".format(
                    s_name, source))
        iterators[s_name] = _make_sl_iterator(source, preprocessor)

    # Finally, dataset-level preprocessors.
    for s_name, func in prep_dl.items():
        iterators[s_name] = _make_dl_iterator(func)

    output_dict = None
    if outputs is not None:
        output_dict = {
            s_name: (path, writer)
            for s_name, path, writer in (
                _normalize_outputspec(out) for out in outputs)}

    if buffer_size is not None:
        # The first number is the refill threshold, the second the full size.
        return Dataset(name, iterators, output_dict,
                       (buffer_size // 2, buffer_size), shuffled)

    return Dataset(name, iterators, output_dict, None, shuffled)
def __init__(self,
             encoders: List[Stateful],
             vocabulary: Vocabulary,
             data_id: str,
             name: str,
             max_output_len: int,
             dropout_keep_prob: float = 1.0,
             embedding_size: int = None,
             embeddings_source: EmbeddedSequence = None,
             tie_embeddings: bool = False,
             label_smoothing: float = None,
             rnn_size: int = None,
             output_projection: OutputProjectionSpec = None,
             encoder_projection: EncoderProjection = None,
             attentions: List[BaseAttention] = None,
             attention_on_input: bool = False,
             rnn_cell: str = "GRU",
             conditional_gru: bool = False,
             supress_unk: bool = False,
             save_checkpoint: str = None,
             load_checkpoint: str = None,
             initializers: InitializerSpecs = None) -> None:
    """Set up the recurrent decoder.

    Arguments:
        encoders: Input encoders of the decoder.
        vocabulary: Target vocabulary.
        data_id: Target data series.
        name: Name of the decoder. Should be unique across all Neural
            Monkey objects.
        max_output_len: Maximum length of an output sequence.
        dropout_keep_prob: Probability of keeping a value during dropout.
        embedding_size: Size of embedding vectors for target words.
        embeddings_source: Embedded sequence to take embeddings from.
        tie_embeddings: Use decoder.embedding_matrix also in place of the
            output decoding matrix.
        label_smoothing: Forwarded to `AutoregressiveDecoder.__init__`.
        rnn_size: Size of the decoder hidden state; if None and no encoder
            projection is given, it is the sum of the encoder output sizes.
        output_projection: How to generate distribution over vocabulary
            from decoder outputs. Either None (tanh projection), a
            `(projection, size)` tuple, or a bare projection.
        encoder_projection: How to construct initial state from encoders.
        attentions: The attention objects to use. Optional.
        attention_on_input: Flag whether attention from previous decoding
            step should be combined with the input in the next step.
        rnn_cell: RNN cell used by the decoder; must be one of
            `RNN_CELL_TYPES`.
        conditional_gru: Flag whether to use the Conditional GRU
            architecture.
        supress_unk: If true, decoder will not produce symbols for unknown
            tokens.
        save_checkpoint: Forwarded to `AutoregressiveDecoder.__init__`.
        load_checkpoint: Forwarded to `AutoregressiveDecoder.__init__`.
        initializers: Forwarded to `AutoregressiveDecoder.__init__`.

    Raises:
        ValueError: If `rnn_cell` is not among `RNN_CELL_TYPES`.
    """
    check_argument_types()
    AutoregressiveDecoder.__init__(
        self,
        name=name,
        vocabulary=vocabulary,
        data_id=data_id,
        max_output_len=max_output_len,
        dropout_keep_prob=dropout_keep_prob,
        embedding_size=embedding_size,
        embeddings_source=embeddings_source,
        tie_embeddings=tie_embeddings,
        label_smoothing=label_smoothing,
        supress_unk=supress_unk,
        save_checkpoint=save_checkpoint,
        load_checkpoint=load_checkpoint,
        initializers=initializers)

    self.encoders = encoders
    self.output_projection_spec = output_projection
    self._conditional_gru = conditional_gru
    self._attention_on_input = attention_on_input
    self._rnn_cell_str = rnn_cell

    # Fall back to an empty attention list when none was supplied.
    self.attentions = (
        [] if attentions is None else attentions
    )  # type: List[BaseAttention]

    if rnn_size is not None:
        self.rnn_size = rnn_size

    # Choose how the initial decoder state is derived from the encoders.
    if encoder_projection is None:
        if not self.encoders:
            log("No direct encoder input. Using empty initial state")
            self.encoder_projection = empty_initial_state
        elif rnn_size is None:
            log("No rnn_size or encoder_projection: Using concatenation of"
                " encoded states")
            self.encoder_projection = concat_encoder_projection
            self.rnn_size = sum(e.output.get_shape()[1].value
                                for e in encoders)
        else:
            log("Using linear projection of encoders as the initial state")
            self.encoder_projection = linear_encoder_projection(
                self.dropout_keep_prob)
    else:
        self.encoder_projection = encoder_projection

    assert self.rnn_size is not None

    if self._rnn_cell_str not in RNN_CELL_TYPES:
        raise ValueError("RNN cell must be a either 'GRU', 'LSTM', or "
                         "'NematusGRU'. Not {}".format(self._rnn_cell_str))

    # Resolve the output projection and the size of its output.
    projection_spec = self.output_projection_spec
    if projection_spec is None:
        log("No output projection specified - using tanh projection")
        self.output_projection = nonlinear_output(
            self.rnn_size, tf.tanh)[0]
        self.output_projection_size = self.rnn_size
    elif isinstance(projection_spec, tuple):
        self.output_projection_spec = cast(
            Tuple[OutputProjection, int], projection_spec)
        (self.output_projection,
         self.output_projection_size) = self.output_projection_spec
    else:
        self.output_projection = cast(OutputProjection, projection_spec)
        self.output_projection_size = self.rnn_size

    self.input_projection = (
        self.input_plus_attention if self._attention_on_input
        else self.embed_input_symbol)

    # Enter and leave the step scope once so that it is stored on the
    # instance for later use.
    with self.use_scope():
        with tf.variable_scope("attention_decoder") as self.step_scope:
            pass

    self._variable_scope.set_initializer(
        tf.random_normal_initializer(stddev=0.001))

    # TODO when it is possible, remove the printing of the cost var
    log("Decoder initalized. Cost var: {}".format(str(self.cost)))
    log("Runtime logits tensor: {}".format(str(self.runtime_logits)))
def __init__(self,
             encoders: List[Stateful],
             vocabulary: Vocabulary,
             data_id: str,
             name: str,
             max_output_len: int,
             dropout_keep_prob: float = 1.0,
             rnn_size: int = None,
             embedding_size: int = None,
             output_projection: OutputProjectionSpec = None,
             encoder_projection: EncoderProjection = None,
             attentions: List[BaseAttention] = None,
             embeddings_source: EmbeddedSequence = None,
             attention_on_input: bool = True,
             rnn_cell: str = "GRU",
             conditional_gru: bool = False,
             save_checkpoint: str = None,
             load_checkpoint: str = None) -> None:
    """Set up the recurrent decoder.

    Arguments:
        encoders: Input encoders of the decoder.
        vocabulary: Target vocabulary.
        data_id: Target data series.
        name: Name of the decoder. Should be unique across all Neural
            Monkey objects.
        max_output_len: Maximum length of an output sequence.
        dropout_keep_prob: Probability of keeping a value during dropout.
        rnn_size: Size of the decoder hidden state; if None and no encoder
            projection is given, it is the sum of the encoder output sizes.
        embedding_size: Size of embedding vectors for target words.
        output_projection: How to generate distribution over vocabulary
            from decoder outputs. Either None (tanh projection), a
            `(projection, size)` tuple, or a bare projection.
        encoder_projection: How to construct initial state from encoders.
        attentions: The attention objects to use. Optional.
        embeddings_source: Embedded sequence to take embeddings from.
        attention_on_input: Flag whether attention from previous decoding
            step should be combined with the input in the next step.
        rnn_cell: RNN cell used by the decoder; must be one of
            `RNN_CELL_TYPES`.
        conditional_gru: Flag whether to use the Conditional GRU
            architecture.
        save_checkpoint: Forwarded to `AutoregressiveDecoder.__init__`.
        load_checkpoint: Forwarded to `AutoregressiveDecoder.__init__`.

    Raises:
        ValueError: If neither `embedding_size` nor `embeddings_source` is
            given, or if `rnn_cell` is not among `RNN_CELL_TYPES`.
    """
    check_argument_types()
    AutoregressiveDecoder.__init__(
        self,
        name=name,
        vocabulary=vocabulary,
        data_id=data_id,
        max_output_len=max_output_len,
        dropout_keep_prob=dropout_keep_prob,
        save_checkpoint=save_checkpoint,
        load_checkpoint=load_checkpoint)

    self.encoders = encoders
    self.embedding_size = embedding_size
    self.rnn_size = rnn_size
    self.output_projection_spec = output_projection
    self.encoder_projection = encoder_projection
    self.attentions = [] if attentions is None else attentions
    self.embeddings_source = embeddings_source
    self._conditional_gru = conditional_gru
    self._attention_on_input = attention_on_input
    self._rnn_cell_str = rnn_cell

    reuses_embeddings = self.embeddings_source is not None

    if not reuses_embeddings and self.embedding_size is None:
        raise ValueError("You must specify either embedding size or the "
                         "embedded sequence from which to reuse the "
                         "embeddings (e.g. set either 'embedding_size' or "
                         " 'embeddings_source' parameter)")

    if reuses_embeddings:
        if self.embedding_size is not None:
            warn("Overriding the embedding_size parameter with the"
                 " size of the reused embeddings from the encoder.")

        # The size is taken from the second dimension of the reused matrix.
        reused_matrix = self.embeddings_source.embedding_matrix
        self.embedding_size = reused_matrix.get_shape()[1].value

    # Choose how the initial decoder state is derived from the encoders.
    if self.encoder_projection is None:
        if not self.encoders:
            log("No encoder - language model only.")
            self.encoder_projection = empty_initial_state
        elif rnn_size is None:
            log("No rnn_size or encoder_projection: Using concatenation of"
                " encoded states")
            self.encoder_projection = concat_encoder_projection
            self.rnn_size = sum(e.output.get_shape()[1].value
                                for e in encoders)
        else:
            log("Using linear projection of encoders as the initial state")
            self.encoder_projection = linear_encoder_projection(
                self.dropout_keep_prob)

    assert self.rnn_size is not None

    if self._rnn_cell_str not in RNN_CELL_TYPES:
        raise ValueError("RNN cell must be a either 'GRU', 'LSTM', or "
                         "'NematusGRU'. Not {}".format(self._rnn_cell_str))

    # Resolve the output projection and the size of its output.
    projection_spec = self.output_projection_spec
    if projection_spec is None:
        log("No output projection specified - using tanh projection")
        self.output_projection = nonlinear_output(
            self.rnn_size, tf.tanh)[0]
        self.output_projection_size = self.rnn_size
    elif isinstance(projection_spec, tuple):
        (self.output_projection,
         self.output_projection_size) = tuple(projection_spec)
    else:
        self.output_projection = projection_spec
        self.output_projection_size = self.rnn_size

    self.input_projection = (
        self.input_plus_attention if self._attention_on_input
        else self.embed_input_symbol)

    # Enter and leave the step scope once so that it is stored on the
    # instance for later use.
    with self.use_scope():
        with tf.variable_scope("attention_decoder") as self.step_scope:
            pass

    # TODO when it is possible, remove the printing of the cost var
    log("Decoder initalized. Cost var: {}".format(str(self.cost)))
    log("Runtime logits tensor: {}".format(str(self.runtime_logits)))
def __init__(self,
             encoders: List[Any],
             vocabulary: Vocabulary,
             data_id: str,
             name: str,
             max_output_len: int,
             dropout_keep_prob: float,
             rnn_size: Optional[int]=None,
             embedding_size: Optional[int]=None,
             output_projection: Optional[Callable[
                 [tf.Tensor, tf.Tensor, List[tf.Tensor]],
                 tf.Tensor]]=None,
             encoder_projection: Optional[Callable[
                 [tf.Tensor, Optional[int], Optional[List[Any]]],
                 tf.Tensor]]=None,
             use_attention: bool=False,
             embeddings_encoder: Optional[Any]=None,
             rnn_cell: str='GRU',
             attention_on_input: bool=True,
             save_checkpoint: Optional[str]=None,
             load_checkpoint: Optional[str]=None) -> None:
    """Create a refactored version of monster decoder.

    Arguments:
        encoders: Input encoders of the decoder
        vocabulary: Target vocabulary
        data_id: Target data series
        name: Name of the decoder. Should be unique across all Neural
            Monkey objects
        max_output_len: Maximum length of an output sequence
        dropout_keep_prob: Probability of keeping a value during dropout

    Keyword arguments:
        rnn_size: Size of the decoder hidden state, if None set
            according to encoders.
        embedding_size: Size of embedding vectors for target words
        output_projection: How to generate distribution over vocabulary
            from decoder rnn_outputs
        encoder_projection: How to construct initial state from encoders
        use_attention: Flag whether to look at attention vectors of the
            encoders
        embeddings_encoder: Encoder to take embeddings from
        rnn_cell: RNN Cell used by the decoder (GRU or LSTM)
        attention_on_input: Flag whether attention from previous decoding
            step should be combined with the input in the next step.
        save_checkpoint: Passed to `ModelPart.__init__`.
        load_checkpoint: Passed to `ModelPart.__init__`.

    Raises:
        ValueError: If neither `embedding_size` nor `embeddings_encoder`
            is given.
    """
    ModelPart.__init__(self, name, save_checkpoint, load_checkpoint)

    log("Initializing decoder, name: '{}'".format(name))

    self.encoders = encoders
    self.vocabulary = vocabulary
    self.data_id = data_id
    self.max_output_len = max_output_len
    self.dropout_keep_prob = dropout_keep_prob
    self.embedding_size = embedding_size
    self.rnn_size = rnn_size
    self.output_projection = output_projection
    self.encoder_projection = encoder_projection
    self.use_attention = use_attention
    self.embeddings_encoder = embeddings_encoder
    self._rnn_cell = rnn_cell

    if self.embedding_size is None and self.embeddings_encoder is None:
        raise ValueError("You must specify either embedding size or the "
                         "encoder from which to reuse the embeddings ("
                         "e.g. set either 'embedding_size' or "
                         " 'embeddings_encoder' parameter)")

    if self.embeddings_encoder is not None:
        if self.embedding_size is not None:
            log("Warning: Overriding the embedding_size parameter with the"
                " size of the reused embeddings from the encoder.",
                color="red")

        # The size is read from the second dimension of the reused matrix.
        self.embedding_size = (
            self.embeddings_encoder.embedding_matrix.get_shape()[1].value)

    # Pick a default encoder projection when none was given.
    if self.encoder_projection is None:
        if len(self.encoders) == 0:
            log("No encoder - language model only.")
            self.encoder_projection = empty_initial_state
        elif rnn_size is None:
            log("No rnn_size or encoder_projection: Using concatenation of"
                " encoded states")
            self.encoder_projection = concat_encoder_projection
            self.rnn_size = sum(e.encoded.get_shape()[1].value
                                for e in encoders)
        else:
            log("Using linear projection of encoders as the initial state")
            self.encoder_projection = linear_encoder_projection(
                self.dropout_keep_prob)

    assert self.rnn_size is not None

    if self.output_projection is None:
        log("No output projection specified - using simple concatenation")
        self.output_projection = no_deep_output

    with tf.variable_scope(name):
        self._create_input_placeholders()
        self._create_training_placeholders()
        self._create_initial_state()
        self._create_embedding_matrix()

        with tf.name_scope("output_projection"):
            self.decoding_w = tf.get_variable(
                "state_to_word_W", [self.rnn_size, len(self.vocabulary)],
                initializer=tf.random_uniform_initializer(-0.5, 0.5))

            # The bias starts at minus the log of the vocabulary size.
            self.decoding_b = tf.get_variable(
                "state_to_word_b", [len(self.vocabulary)],
                initializer=tf.constant_initializer(
                    - math.log(len(self.vocabulary))))

        # THE LAST TRAIN INPUT IS NOT USED IN THE DECODING FUNCTION
        # (only as a target)
        embedded_train_inputs = self._embed_and_dropout(
            self.train_inputs[:-1])

        # BEWARE: NO DROPOUT IS APPLIED HERE
        embedded_go_symbols = tf.nn.embedding_lookup(self.embedding_matrix,
                                                     self.go_symbols)

        # fetch train attention objects
        self._train_attention_objects = {}
        # type: Dict[Attentive, tf.Tensor]
        if self.use_attention:
            with tf.name_scope("attention_object"):
                self._train_attention_objects = {
                    e: e.create_attention_object()
                    for e in self.encoders
                    if isinstance(e, Attentive)}

        train_rnn_outputs, _ = self._attention_decoder(
            embedded_go_symbols,
            attention_on_input=attention_on_input,
            train_inputs=embedded_train_inputs,
            train_mode=True)

        # From here on the variable scope is switched to reuse mode, so
        # the second decoder call below is built with reuse enabled.
        assert not tf.get_variable_scope().reuse
        tf.get_variable_scope().reuse_variables()

        # fetch runtime attention objects
        self._runtime_attention_objects = {}
        # type: Dict[Attentive, tf.Tensor]
        if self.use_attention:
            self._runtime_attention_objects = {
                e: e.create_attention_object()
                for e in self.encoders
                if isinstance(e, Attentive)}

        (self.runtime_rnn_outputs,
         self.runtime_rnn_states) = self._attention_decoder(
             embedded_go_symbols,
             attention_on_input=attention_on_input,
             train_mode=False)

        self.hidden_states = self.runtime_rnn_outputs

        def decode(rnn_outputs):
            # Maps each RNN output to logits and to the argmax symbol.
            with tf.name_scope("output_projection"):
                logits = []
                decoded = []

                for out in rnn_outputs:
                    out_activation = self._logit_function(out)
                    logits.append(out_activation)
                    # Column 0 is excluded from the argmax; the +1 shifts
                    # the result back to indices into the full logits.
                    decoded.append(tf.argmax(out_activation[:, 1:], 1) + 1)

                return decoded, logits

        _, self.train_logits = decode(train_rnn_outputs)

        train_targets = tf.unpack(self.train_inputs)

        self.train_loss = tf.nn.seq2seq.sequence_loss(
            self.train_logits, train_targets,
            tf.unpack(self.train_padding), len(self.vocabulary))
        self.cost = self.train_loss

        self.train_logprobs = [tf.nn.log_softmax(l)
                               for l in self.train_logits]

        self.decoded, self.runtime_logits = decode(
            self.runtime_rnn_outputs)

        # The runtime loss is computed against the same targets and
        # padding as the train loss.
        self.runtime_loss = tf.nn.seq2seq.sequence_loss(
            self.runtime_logits, train_targets,
            tf.unpack(self.train_padding), len(self.vocabulary))

        self.runtime_logprobs = [tf.nn.log_softmax(l)
                                 for l in self.runtime_logits]

        tf.scalar_summary('train_loss_with_gt_intpus',
                          self.train_loss,
                          collections=["summary_train"])

        tf.scalar_summary('train_loss_with_decoded_inputs',
                          self.runtime_loss,
                          collections=["summary_train"])

        tf.scalar_summary('train_optimization_cost', self.cost,
                          collections=["summary_train"])

        self._visualize_attention()

    log("Decoder initalized.")
def from_dataset(datasets: List[Dataset],
                 series_ids: List[str],
                 max_size: int,
                 save_file: str = None,
                 overwrite: bool = False,
                 min_freq: Optional[int] = None,
                 unk_sample_prob: float = 0.5) -> 'Vocabulary':
    """Load a vocabulary from datasets with an option to save it.

    Arguments:
        datasets: A list of datasets from which to create the vocabulary
        series_ids: A list of ids of series of the datasets that should be
            used producing the vocabulary
        max_size: The maximum size of the vocabulary
        save_file: A file to save the vocabulary to. If None (default),
            the vocabulary will not be saved.
        overwrite: Overwrite existing file.
        min_freq: Do not include words with frequency smaller than this.
            Only applied when greater than 1.
        unk_sample_prob: The probability with which to sample unks out of
            words with frequency 1. Defaults to 0.5.

    Returns:
        The new Vocabulary instance.
    """
    assert check_argument_types()

    vocabulary = Vocabulary(unk_sample_prob=unk_sample_prob)

    for dataset in datasets:
        if isinstance(dataset, LazyDataset):
            warn("Inferring vocabulary from lazy dataset!")

        for series_id in series_ids:
            if not dataset.has_series(series_id):
                warn("Data series '{}' not present in the dataset".format(
                    series_id))

            # A missing series yields a falsy value and is skipped.
            series = dataset.get_series(series_id, allow_none=True)
            if series:
                vocabulary.add_tokenized_text(
                    [token for sent in series for token in sent])

    vocabulary.truncate(max_size)

    if min_freq is not None and min_freq > 1:
        vocabulary.truncate_by_min_freq(min_freq)

    log("Vocabulary for series {} initialized, containing {} words".format(
        series_ids, len(vocabulary)))

    vocabulary.log_sample()

    if save_file is not None:
        directory = os.path.dirname(save_file)
        # A bare file name has an empty dirname, which os.makedirs rejects;
        # exist_ok avoids failing when the directory appears concurrently.
        if directory:
            os.makedirs(directory, exist_ok=True)
        vocabulary.save_to_file(save_file, overwrite)

    return vocabulary
def __init__(self,
             name: str,
             vocabulary: Vocabulary,
             data_id: str,
             embedding_size: int,
             segment_size: int,
             highway_depth: int,
             rnn_size: int,
             filters: List[Tuple[int, int]],
             max_input_len: Optional[int] = None,
             dropout_keep_prob: float = 1.0,
             attention_type: Optional[Any] = None,
             attention_fertility: int = 3,
             use_noisy_activations: bool = False,
             save_checkpoint: Optional[str] = None,
             load_checkpoint: Optional[str] = None) -> None:
    """Create a new instance of the sentence encoder.

    The graph consists of an embedding lookup, a set of 1D convolutions
    followed by max-pooling over segments, a stack of highway layers and a
    bidirectional dynamic RNN.

    Arguments:
        vocabulary: Input vocabulary
        data_id: Identifier of the data series fed to this encoder
        name: A unique identifier for this encoder
        max_input_len: Maximum length of an encoded sequence
        embedding_size: The size of the embedding vector assigned
            to each word
        segment_size: The size of the segments over which we apply
            max-pooling.
        highway_depth: Depth of the highway layer.
        rnn_size: The size of the encoder's hidden state. Note
            that the actual encoder output state size will be
            twice as long because it is the result of
            concatenation of forward and backward hidden states.
        filters: Specification of CNN filters. It is a list of tuples
            specifying the filter size and number of channels.

    Keyword arguments:
        dropout_keep_prob: The dropout keep probability
            (default 1.0)
        attention_type: The class that is used for creating
            attention mechanism (default None)
        attention_fertility: Fertility parameter used with
            CoverageAttention (default 3).
        use_noisy_activations: Stored as `self.use_noisy_activations`
            (default False).
        save_checkpoint: Passed to `ModelPart.__init__`.
        load_checkpoint: Passed to `ModelPart.__init__`.

    Raises:
        ValueError: If `max_input_len` is given and is not positive.
    """
    ModelPart.__init__(self, name, save_checkpoint, load_checkpoint)
    Attentive.__init__(self, attention_type,
                       attention_fertility=attention_fertility)
    assert check_argument_types()

    self.vocabulary = vocabulary
    self.data_id = data_id

    self.max_input_len = max_input_len
    self.embedding_size = embedding_size
    self.segment_size = segment_size
    self.highway_depth = highway_depth
    self.rnn_size = rnn_size
    self.filters = filters
    self.dropout_keep_p = dropout_keep_prob
    self.use_noisy_activations = use_noisy_activations

    if max_input_len is not None and max_input_len <= 0:
        raise ValueError("Input length must be a positive integer.")

    log("Initializing sentence encoder, name: '{}'".format(self.name))

    with self.use_scope():
        self._create_input_placeholders()
        with tf.variable_scope('input_projection'):
            self._create_embedding_matrix()
            embedded_inputs = self._embed(self.inputs)  # type: tf.Tensor
            self.embedded_inputs = embedded_inputs

        # CNN Network
        pooled_outputs = []
        for filter_size, num_filters in self.filters:
            # One variable scope per filter, named after the filter size.
            with tf.variable_scope("conv-maxpool-%s" % filter_size):
                filter_shape = [filter_size, embedding_size, num_filters]
                w_filter = tf.get_variable(
                    "conv_W", filter_shape,
                    initializer=tf.random_uniform_initializer(-0.5, 0.5))
                b_filter = tf.get_variable(
                    "conv_bias", [num_filters],
                    initializer=tf.constant_initializer(0.0))
                conv = tf.nn.conv1d(
                    embedded_inputs,
                    w_filter,
                    stride=1,
                    padding="SAME",
                    name="conv")

                # Apply nonlinearity
                conv_relu = tf.nn.relu(tf.nn.bias_add(conv, b_filter))

                # Max-pooling over the output segments
                # (a trailing axis is added for max_pool and squeezed
                # away again after the concatenation below)
                expanded_conv_relu = tf.expand_dims(conv_relu, -1)
                pooled = tf.nn.max_pool(
                    expanded_conv_relu,
                    ksize=[1, self.segment_size, 1, 1],
                    strides=[1, self.segment_size, 1, 1],
                    padding="SAME",
                    name="maxpool")
                pooled_outputs.append(pooled)

        # Combine all the pooled features
        self.cnn_encoded = tf.concat(pooled_outputs, axis=2)
        self.cnn_encoded = tf.squeeze(self.cnn_encoded, [3])

        # Highway Network
        batch_size = tf.shape(self.cnn_encoded)[0]
        # pylint: disable=no-member
        cnn_out_size = self.cnn_encoded.get_shape().as_list()[-1]
        # The highway layers are applied to a 2D view; the original
        # leading dimension is restored afterwards.
        highway_layer = tf.reshape(self.cnn_encoded, [-1, cnn_out_size])
        for i in range(self.highway_depth):
            highway_layer = highway(
                highway_layer,
                scope=("highway_layer_%s" % i))
        highway_layer = tf.reshape(
            highway_layer,
            [batch_size, -1, cnn_out_size])

        # BiRNN Network
        fw_cell, bw_cell = self.rnn_cells()  # type: RNNCellTuple
        # Sequence lengths after pooling: ceil(length / segment_size).
        seq_lens = tf.ceil(
            tf.divide(self.sentence_lengths, self.segment_size))
        seq_lens = tf.cast(seq_lens, tf.int32)
        outputs_bidi_tup, encoded_tup = tf.nn.bidirectional_dynamic_rnn(
            fw_cell, bw_cell, highway_layer,
            sequence_length=seq_lens,
            dtype=tf.float32)

        # Forward and backward outputs concatenated along the last axis.
        self.hidden_states = tf.concat(outputs_bidi_tup, 2)

        with tf.variable_scope('attention_tensor'):
            self.__attention_tensor = self._dropout(self.hidden_states)

        # Forward and backward final states concatenated along axis 1.
        self.encoded = tf.concat(encoded_tup, 1)

    log("Sentence encoder initialized")