Example No. 1
    def _bless_graph_executors(self) -> None:
        """Pre-compute the tensors referenced by the graph executors.

        Due to the lazy nature of the computational graph related components,
        nothing is actually added to the graph until it is "blessed" (
        referenced, and therefore, executed).

        "Blessing" is usually implemented in the form of a log or a debug call
        with the blessed tensor as parameter. Referencing a `Tensor` causes the
        whole computational graph that is needed to evaluate the tensor to be
        built.

        This function "blesses" all tensors that could be potentially used
        using the `fetches` property of the provided runner objects.

        If the experiment runs in the training mode, this function also
        blesses the tensors fetched by the trainer(s).
        """
        log("Building TF Graph")
        if hasattr(self.model, "trainer"):
            if isinstance(self.model.trainer, List):
                trainers = self.model.trainer
            else:
                trainers = [self.model.trainer]

            for trainer in trainers:
                debug("Trainer fetches: {}".format(trainer.fetches), "bless")

        for runner in self.model.runners:
            debug("Runner fetches: {}".format(runner.fetches), "bless")
        log("TF Graph built")
Example No. 2
def dataset_from_files(**kwargs):
    """
    Creates a dataset from the provided arguments. Paths to the data are
    provided in a form of dictionary.

    Args:

        kwargs: Arguments are treated as a dictionary. Paths to the data
            series are specified here. Series identifiers should not contain
            underscores. You can specify a language for the serie by adding
            a preprocess method you want to apply on the textual data by
            naming the function as <identifier>_preprocess=function
            OR the preprocessor can be specified globally

    """

    random_seed = kwargs.get('random_seed', None)
    preprocess = kwargs.get('preprocessor', lambda x: x)

    series_paths = _get_series_paths(kwargs)

    if len(series_paths) > 0:
        log("Initializing dataset with: {}".format(", ".join(series_paths)))
        series = {s: Dataset.create_series(series_paths[s], preprocess)
                  for s in series_paths}
        name = kwargs.get('name', _get_name_from_paths(series_paths))
    else:
        raise Exception("No data series files were specified.")

    series_outputs = {SERIES_OUTPUT.match(key)[1]: value
                      for key, value in kwargs.items()
                      if SERIES_OUTPUT.match(key)}

    dataset = Dataset(name, series, series_outputs, random_seed)
    log("Dataset length: {}".format(len(dataset)))
    return dataset
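
A hypothetical call illustrating the kwargs convention from the docstring. The exact key format expected by `_get_series_paths` (e.g. whether the path keys need a prefix) is not shown here, so the series keys and file paths below are assumptions:

dataset = dataset_from_files(
    name="train",
    source="data/train.en",                       # assumed path key format
    target="data/train.de",
    target_preprocess=lambda line: line.split(),  # per-series preprocessor
    random_seed=123)
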
Example No. 3
    def create_serie(self, name, args):
        """ Loads a data serie from a file """
        path = args[name]
        log("Loading {}".format(path))
        file_type = magic.from_file(path, mime=True)

        # if the dataset has no name, generate it from files
        if 'name' not in args:
            self.name += "-"+path

        if file_type.startswith('text/'):
            if name+"_preprocess" in args:
                preprocess = args[name+"_preprocess"]
            else:
                preprocess = lambda s: s.split(" ")

            with codecs.open(path, 'r', 'utf-8') as f_data:
                for line in f_data:
                    yield preprocess(line.rstrip())
        elif file_type == 'application/gzip':
            with gzip.open(path, 'rb') as f_data:
                try:
                    while True:
                        yield pickle.load(f_data)
                except EOFError:
                    pass
        else:
            raise Exception("\"{}\" has unsupported data type: {}"
                            .format(path, file_type))
Example No. 4
def main():
    # pylint: disable=no-member,broad-except
    if len(sys.argv) != 3:
        print("Usage: run.py <run_ini_file> <test_datasets>")
        exit(1)

    test_datasets = Configuration()
    test_datasets.add_argument('test_datasets')

    args, sess = initialize_for_running(sys.argv[1])

    datasets_args = test_datasets.load_file(sys.argv[2])
    print("")

    try:
        for dataset in datasets_args.test_datasets:
            check_dataset_and_coders(dataset, args.encoders)
    except Exception as exc:
        log(str(exc), color='red')
        exit(1)

    for dataset in datasets_args.test_datasets:
        _, _, evaluation = run_on_dataset(
            sess, args.runner, args.encoders + [args.decoder], args.decoder,
            dataset, args.evaluation, args.postprocess, write_out=True)
        if evaluation:
            print_dataset_evaluation(dataset.name, evaluation)
Example No. 5
    def save_wordlist(self, path: str, overwrite: bool = False,
                      encoding: str = "utf-8") -> None:
        """Save the vocabulary as a wordlist.

        The file is ordered by the ids of words.
        This function is used mainly for embedding visualization.

        Arguments:
            path: The path to save the file to.
            overwrite: Flag whether to overwrite existing file.
                Defaults to False.

        Raises:
            FileExistsError if the file exists and overwrite flag is
            disabled.
        """
        if os.path.exists(path) and not overwrite:
            raise FileExistsError("Cannot save vocabulary: File exists and "
                                  "overwrite is disabled. {}".format(path))

        with open(path, "w", encoding=encoding) as output_file:
            log("Storing vocabulary without frequencies.")

            for word in self._vocabulary:
                output_file.write("{}\n".format(word))
Example No. 6
def load_config_file(config_file, ignore_names):
    """ Loads and builds the model from the configuration

    Arguments:
        config_file: The configuration file
        ignore_names: A set of names that should be ignored during the loading.
    """
    config_dicts = parsing.parse_file(config_file)
    config_file.close()
    log("INI file is parsed.")

    # first load the configuration into a dictionary

    if "main" not in config_dicts:
        raise Exception("Configuration does not contain the main block.")

    existing_objects = dict()

    main_config = config_dicts['main']

    configuration = dict()
    for key, value in main_config.items():
        if key not in ignore_names:
            try:
                configuration[key] = build_object(
                    value, config_dicts, existing_objects, 0)
            except Exception as exc:
                raise ConfigBuildException(key, exc) from None

    return configuration
Example No. 7
def initialize_for_running(ini_file):
    """Prepares everything that is necessary for running a model.

    Arguments:
        ini_file: Path to the configuration file.

    Returns:
        A tuple of the parsed configuration (including the built computation
        graph)
        and a TensorFlow session with already loaded model variables.
    """
    # pylint: disable=no-member
    args = CONFIG.load_file(ini_file)
    print("")
    variables_file = os.path.join(args.output, "variables.data.best")
    cont_index = 1

    def continuation_file():
        return os.path.join(args.output,
                            "variables.data.cont-{}.best".format(cont_index))
    while os.path.exists(continuation_file()):
        variables_file = continuation_file()
        cont_index += 1

    if not os.path.exists(variables_file):
        log("No variables file is stored in {}".format(args.output),
            color="red")
        exit(1)

    sess, _ = initialize_tf(variables_file, args.threads)
    print("")

    return args, sess
Example No. 8
def main() -> None:
    try:
        _main()
    except KeyboardInterrupt:
        log("Training interrupted by user.")
        debug(traceback.format_exc())
        exit(1)
Example No. 9
def from_t2t_vocabulary(path: str,
                        encoding: str = "utf-8") -> "Vocabulary":
    """Load a vocabulary generated during tensor2tensor training.

    Arguments:
        path: The path to the vocabulary file.
        encoding: The encoding of the vocabulary file (defaults to UTF-8).

    Returns:
        The new Vocabulary instance.
    """
    check_argument_types()
    vocabulary = []  # type: List[str]

    with open(path, encoding=encoding) as wordlist:
        for line in wordlist:
            line = line.strip()

            # T2T vocab tends to wrap words in single quotes
            if ((line.startswith("'") and line.endswith("'"))
                    or (line.startswith('"') and line.endswith('"'))):
                line = line[1:-1]

            if line in ["<pad>", "<EOS>"]:
                continue

            vocabulary.append(line)

    log("Vocabulary form wordlist loaded, containing {} words"
        .format(len(vocabulary)))
    log_sample(vocabulary)

    return Vocabulary(vocabulary)
Example No. 10
    def __init__(self,
                 merge_file: str,
                 separator: str = "@@",
                 encoding: str = "utf-8") -> None:
        log("Initializing BPE preprocessor")

        with open(merge_file, "r", encoding=encoding) as f_data:
            self.bpe = BPE(f_data, separator)
Example No. 11
    def save(self, session: tf.Session) -> None:
        """Save model part to a checkpoint file."""
        if self._save_checkpoint:
            self._init_saver()
            self._saver.save(session, self._save_checkpoint)

            log("Variables of '{}' saved to '{}'".format(
                self.name, self._save_checkpoint))
Example No. 12
    def load(self, session: tf.Session) -> None:
        """Load model part from a checkpoint file."""
        if self._load_checkpoint:
            self._init_saver()
            self._saver.restore(session, self._load_checkpoint)

            log("Variables of '{}' loaded from '{}'".format(
                self.name, self._load_checkpoint))
Example No. 13
    def output_projection_spec(self) -> Tuple[OutputProjection, int]:
        if self._output_projection_spec is None:
            log("No output projection specified - using tanh projection")
            return (nonlinear_output(self.rnn_size, tf.tanh)[0], self.rnn_size)

        if isinstance(self._output_projection_spec, tuple):
            return self._output_projection_spec

        return cast(OutputProjection,
                    self._output_projection_spec), self.rnn_size
Example No. 14
    def initialize_sessions(self) -> None:
        log("Initializing variables")
        init_op = tf.global_variables_initializer()
        init_tables = tf.tables_initializer()
        for sess in self.sessions:
            sess.run([init_op, init_tables])

        log("Initializing tf.train.Saver")
        self.saver = tf.train.Saver(max_to_keep=None,
                                    var_list=[g for g in tf.global_variables()
                                              if "reward_" not in g.name])
Example No. 15
def log_sample(vocabulary: List[str], size: int = 5) -> None:
    """Log a sample of the vocabulary.

    Arguments:
        vocabulary: The vocabulary to sample from.
        size: How many sample words to log.
    """
    if size > len(vocabulary):
        log("Vocabulary: {}".format(vocabulary))
    else:
        sample_ids = np.random.permutation(np.arange(len(vocabulary)))[:size]
        log("Sample of the vocabulary: {}".format(
            [vocabulary[i] for i in sample_ids]))
Example No. 16
    def __init__(self,
                 config_path: str,
                 train_mode: bool = False,
                 overwrite_output_dir: bool = False,
                 config_changes: List[str] = None) -> None:
        """Initialize a Neural Monkey experiment.

        Arguments:
            config_path: The path to the experiment configuration file.
            train_mode: Indicates whether the model should be prepared for
                training.
            overwrite_output_dir: Indicates whether an existing experiment
                should be reused. If `True`, this overrides the setting in
                the configuration file.
            config_changes: A list of modifications that will be made to the
                loaded configuration file before parsing.
        """
        self.train_mode = train_mode
        self._config_path = config_path

        self.graph = tf.Graph()
        self._initializers = {}  # type: Dict[str, Callable]
        self._initialized_variables = set()  # type: Set[str]
        self.cont_index = -1
        self._model_built = False
        self._vars_loaded = False
        self._model = None  # type: Optional[Namespace]

        self.config = create_config(train_mode)
        self.config.load_file(config_path, config_changes)
        args = self.config.args

        if self.train_mode:
            # We may need to create the experiment directory.
            if (os.path.isdir(args.output)
                    and os.path.exists(
                        os.path.join(args.output, "experiment.ini"))):
                if args.overwrite_output_dir or overwrite_output_dir:
                    # we do not want to delete the directory contents
                    log("Directory with experiment.ini '{}' exists, "
                        "overwriting enabled, proceeding.".format(args.output))
                else:
                    raise RuntimeError(
                        "Directory with experiment.ini '{}' exists, "
                        "overwriting disabled.".format(args.output))

            if not os.path.isdir(args.output):
                os.mkdir(args.output)

        # Find how many times the experiment has been continued.
        while any(os.path.exists(self.get_path(f, self.cont_index + 1))
                  for f in _EXPERIMENT_FILES):
            self.cont_index += 1
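
A minimal usage sketch, assuming an existing configuration file (the path and the override below are illustrative only); the override string uses the `[section.]option=value` syntax accepted for configuration changes:

exp = Experiment(config_path="experiments/translation.ini",  # hypothetical path
                 train_mode=True,
                 config_changes=["main.batch_size=32"])
exp.train()
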
Example No. 17
    def __missing__(self, key):
        """Try to fetch and parse the variable value from `os.environ`."""
        if key in os.environ:
            try:
                value = _parse_value(os.environ[key], self)
            except ParseError:
                # If we cannot parse it, use it as a string.
                value = os.environ[key]
            log("Variable {}={!r} taken from the environment."
                .format(key, value))
            return value

        raise ParseError("Undefined variable: {}".format(key))
Example No. 18
    def score_batch(self,
                    hypotheses: List[List[str]],
                    references: List[List[str]]) -> float:

        ref_bytes = self.serialize_to_bytes(references)
        hyp_bytes = self.serialize_to_bytes(hypotheses)

        reffile = tempfile.NamedTemporaryFile()
        reffile.write(ref_bytes)
        reffile.flush()

        output_proc = subprocess.run(["perl", self.wrapper, reffile.name],
                                     input=hyp_bytes,
                                     stderr=subprocess.PIPE,
                                     stdout=subprocess.PIPE)

        proc_stdout = output_proc.stdout.decode("utf-8")  # type: ignore
        lines = proc_stdout.splitlines()

        try:
            bleu_score = float(lines[0])
            return bleu_score
        except IndexError:
            log("Error: Malformed output from BLEU wrapper:", color="red")
            log(proc_stdout, color="red")
            log("=======", color="red")
            return 0.0
        except ValueError:
            log("Value error - bleu '{}' is not a number.".format(lines[0]),
                color="red")
            return 0.0
Example No. 19
    def create_series(path, preprocess=lambda x: x):
        """ Loads a data serie from a file """
        log("Loading {}".format(path))
        file_type = magic.from_file(path, mime=True)

        if file_type.startswith('text/'):
            reader = PlainTextFileReader(path)
            return [preprocess(line) for line in reader.read()]

        elif file_type == 'application/octet-stream':
            return np.load(path)
        else:
            raise Exception("\"{}\" has Unsupported data type: {}"
                            .format(path, file_type))
Example No. 20
    def restore(self, variable_files: Union[str, List[str]]) -> None:
        if self.saver is None:
            raise RuntimeError("Saver uninitialized")

        if isinstance(variable_files, str):
            variable_files = [variable_files]
        if len(variable_files) != len(self.sessions):
            raise Exception(
                "Provided {} files for restoring {} sessions.".format(
                    len(variable_files), len(self.sessions)))

        for sess, file_name in zip(self.sessions, variable_files):
            log("Loading variables from {}".format(file_name))
            self.saver.restore(sess, file_name)
            log("Variables loaded from {}".format(file_name))
Example No. 21
    def log_after_validation(
            self, val_examples: int, train_examples: int) -> None:

        train_duration = self.inter_val_times[-1]
        val_duration = self.validation_times[-1]

        train_speed = train_examples / train_duration
        val_speed = val_examples / val_duration

        log("Validation time: {:.2f}s ({:.1f} instances/sec), "
            "inter-validation: {:.2f}s, ({:.1f} instances/sec)"
            .format(val_duration, val_speed, train_duration, train_speed),
            color="blue")

        if self.inter_val_times[-1] < 2 * self.validation_times[-1]:
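            # Validation takes more than half as long as the training between
            # validations, so a large share of the run is spent validating.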
            notice("Validation period setting is inefficient.")
Example No. 22
def _apply_change(config_dict: Dict[str, Any], setting: str) -> None:
    if "=" not in setting:
        raise ParseError("Invalid setting '{}'".format(setting))
    key, value = (s.strip() for s in setting.split("=", maxsplit=1))

    if "." in key:
        section, option = key.split(".", maxsplit=1)
    else:
        section = "main"
        option = key

    if section not in config_dict:
        log("Creating new section '{}'".format(section))
        config_dict[section] = OrderedDict()

    config_dict[section][option] = -1, value  # no line number
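
For example, a command-line override such as `decoder.rnn_size=512` would be applied like this (the option names and values are illustrative only):

config = {}
_apply_change(config, "decoder.rnn_size=512")
# -> creates section 'decoder'; config['decoder']['rnn_size'] == (-1, '512')
_apply_change(config, "batch_size=64")
# -> no section prefix, so the value is stored under the 'main' section
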
Example No. 23
    def train(self) -> None:
        """Train model specified by this experiment.

        This function is one of the main functions (entrypoints) called on
        the experiment. It builds the model (if needed) and runs the training
        procedure.

        Raises:
            `RuntimeError` when the experiment is not intended for training.
        """
        if not self.train_mode:
            raise RuntimeError("train() was called, but the experiment was "
                               "created with train_mode=False")
        if not self._model_built:
            self.build_model()

        self.cont_index += 1

        # Initialize the experiment directory.
        self.config.save_file(self.get_path("experiment.ini"))
        shutil.copyfile(self._config_path, self.get_path("original.ini"))
        save_git_info(self.get_path("git_commit"), self.get_path("git_diff"))
        Logging.set_log_file(self.get_path("experiment.log"))

        Logging.print_header(self.model.name, self.model.output)

        with self.graph.as_default():
            self.model.tf_manager.init_saving(self.get_path("variables.data"))

            training_loop(cfg=self.model)

            final_variables = self.get_path("variables.data.final")
            log("Saving final variables in {}".format(final_variables))
            self.model.tf_manager.save(final_variables)

            if self.model.test_datasets:
                if self.model.tf_manager.best_score_index is not None:
                    self.model.tf_manager.restore_best_vars()

                for test_id, dataset in enumerate(self.model.test_datasets):
                    self.evaluate(dataset, write_out=True,
                                  name="test_{}".format(test_id))

            log("Finished.")
            self._vars_loaded = True
Example No. 24
    def load_variables(self, variable_files: List[str] = None) -> None:
        """Load variables of the built model from file(s).

        When variable files are not provided, Neural Monkey will try to infer
        the name of a default checkpoint file in the following order:
        1. Look for the averaged checkpoints named `variables.data.avg` or
           `variables.data.avg-0`.
        2. Look for the `variables.data.best` file, which usually contains the
           best scoring checkpoint from the run.
        3. Look for the final checkpoint saved in `variables.data.final`.

        Arguments:
            variable_files: A list of variable files to load. The length of
                this list should match the number of sessions.
        """
        if not self._model_built:
            self.build_model()

        if variable_files is None:
            if os.path.exists(self.get_path("variables.data.avg-0.index")):
                variable_files = [self.get_path("variables.data.avg-0")]
            elif os.path.exists(self.get_path("variables.data.avg.index")):
                variable_files = [self.get_path("variables.data.avg")]
            elif os.path.exists(self.get_path("variables.data.best")):
                best_var_file = self.get_path("variables.data.best")
                with open(best_var_file, "r") as f_best:
                    var_path = f_best.read().rstrip()
                variable_files = [os.path.join(self.config.args.output,
                                               var_path)]
            elif os.path.exists(self.get_path("variables.data.final.index")):
                variable_files = [self.get_path("variables.data.final")]
            else:
                raise RuntimeError("Cannot infer default variables file")

            log("Default variable file '{}' will be used for loading "
                "variables.".format(variable_files[0]))

        for vfile in variable_files:
            if not os.path.exists("{}.index".format(vfile)):
                raise RuntimeError(
                    "Index file for var prefix {} does not exist"
                    .format(vfile))

        self.model.tf_manager.restore(variable_files)
        self._vars_loaded = True
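
A minimal usage sketch, assuming a finished training run in the experiment's output directory (the configuration path is made up):

exp = Experiment(config_path="experiments/translation.ini")
exp.build_model()
exp.load_variables()  # falls back to variables.data.avg* / .best / .final
# or point the sessions at an explicit checkpoint prefix:
exp.load_variables([exp.get_path("variables.data.final")])
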
Example No. 25
    def test_mt_trainer(self):
        # TODO(tf-data) multitask trainer is likely broken by the changes

        trainer = MultitaskTrainer(
            [self.trainer1, self.trainer2, self.trainer1])

        feedables = {self.mpart, self.mpart_2, self.trainer1, self.trainer2}
        for feedable in feedables:
            feedable.register_input(self.dataset)

        log("Blessing trainer fetches: {}".format(trainer.fetches))

        self.assertSetEqual(trainer.feedables, feedables)
        self.assertSetEqual(trainer.parameterizeds, {self.mpart, self.mpart_2})

        self.assertSetEqual(
            set(trainer.var_list), {self.mpart.var, self.mpart_2.var})

        self.assertTrue(trainer.trainer_idx == 0)

        executable = trainer.get_executable()
        # mparts = trainer.feedables
        fetches, feeds = executable.next_to_execute()
        # self.assertSetEqual(mparts, {self.mpart})
        self.assertFalse(feeds)

        self.assertTrue(trainer.trainer_idx == 1)
        self.assertTrue(fetches["losses"][0] == self.mpart.loss)

        executable = trainer.get_executable()
        fetches, feeds = executable.next_to_execute()
        # self.assertSetEqual(mparts, {self.mpart_2})
        self.assertFalse(feeds)

        self.assertTrue(trainer.trainer_idx == 2)
        self.assertTrue(fetches["losses"][0] == self.mpart_2.loss)

        executable = trainer.get_executable()
        fetches, feeds = executable.next_to_execute()
        # self.assertSetEqual(mparts, {self.mpart})
        self.assertFalse(feeds)

        self.assertTrue(trainer.trainer_idx == 0)
        self.assertTrue(fetches["losses"][0] == self.mpart.loss)
Example No. 26
def _main() -> None:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("config", metavar="INI-FILE",
                        help="the configuration file for the experiment")
    parser.add_argument("-s", "--set", type=str, metavar="SETTING",
                        action="append", dest="config_changes", default=[],
                        help="override an option in the configuration; the "
                        "syntax is [section.]option=value")
    parser.add_argument("-v", "--var", type=str, metavar="VAR", default=[],
                        action="append", dest="config_vars",
                        help="set a variable in the configuration; the syntax "
                        "is var=value (shorthand for -s vars.var=value)")
    parser.add_argument("-i", "--init", dest="init_only", action="store_true",
                        help="initialize the experiment directory and exit "
                        "without building the model")
    parser.add_argument("-f", "--overwrite", action="store_true",
                        help="force overwriting the output directory; can be "
                        "used to start an experiment created with --init")
    args = parser.parse_args()

    args.config_changes.extend("vars.{}".format(s) for s in args.config_vars)

    exp = Experiment(config_path=args.config,
                     config_changes=args.config_changes,
                     train_mode=True,
                     overwrite_output_dir=args.overwrite)

    with open(exp.get_path("args", exp.cont_index + 1), "w") as file:
        print(" ".join(shlex.quote(a) for a in sys.argv), file=file)

    if args.init_only:
        if exp.cont_index >= 0:
            log("The experiment directory already exists.", color="red")
            exit(1)

        exp.config.save_file(exp.get_path("experiment.ini", 0))
        copyfile(args.config, exp.get_path("original.ini", 0))

        log("Experiment directory initialized.")

        cmd = [os.path.basename(sys.argv[0]), "-f",
               exp.get_path("experiment.ini", 0)]
        log("To start experiment, run: {}".format(" ".join(shlex.quote(a)
                                                           for a in cmd)))
        exit(0)

    try:
        exp.train()
    except KeyboardInterrupt:  # pylint: disable=try-except-raise
        raise
    except Exception:  # pylint: disable=broad-except
        log(traceback.format_exc(), color="red")
        exit(1)
Example No. 27
    def from_datasets(datasets, series_ids, max_size, random_seed=None):
        # type: (List[Dataset], List[str], int, int) -> Vocabulary
        vocabulary = Vocabulary(random_seed=random_seed)

        for dataset in datasets:
            for series_id in series_ids:
                series = dataset.get_series(series_id, allow_none=True)
                if series:
                    vocabulary.add_tokenized_text([token for sent in series for token in sent])

        vocabulary.trunkate(max_size)

        log("Vocabulary for series {} initialized, containing {} words"
            .format(series_ids, len(vocabulary)))

        log("Sample of the vocabulary: {}"
            .format([vocabulary.index_to_word[i]
                     for i in np.random.randint(0, len(vocabulary), 5)]))
        return vocabulary
Example No. 28
def concat_encoder_projection(
        train_mode: tf.Tensor,
        rnn_size: int = None,
        encoders: List[Stateful] = None) -> tf.Tensor:
    """Concatenate the encoded values of the encoders."""

    if encoders is None or not encoders:
        raise ValueError("There must be at least one encoder for this type "
                         "of encoder projection")

    output_size = sum(e.output.get_shape()[1].value for e in encoders)
    if rnn_size is not None and rnn_size != output_size:
        raise ValueError("RNN size supplied for concat projection ({}) does "
                         "not match the size of the concatenated vectors ({})."
                         .format(rnn_size, output_size))

    log("The inferred rnn_size of this encoder projection will be {}"
        .format(output_size))

    encoded_concat = tf.concat([e.output for e in encoders], 1)
    return encoded_concat
Example No. 29
def main():
    parser = argparse.ArgumentParser(description="Prepares the STR data.")
    parser.add_argument("--list", type=argparse.FileType('r'),
                        help="File with images.", required=True)
    parser.add_argument("--img-root", type=str, required=True,
                        help="Directory with images.")
    parser.add_argument("--height", type=int, default=32)
    parser.add_argument("--max-width", type=int, default=320)
    parser.add_argument("--output-file", type=str, required=True)
    parser.add_argument("--output-log", type=argparse.FileType('w'), required=True)
    args = parser.parse_args()

    preprocessor = STRPreprocessor(args.height, args.max_width)

    f_out = gzip.open(args.output_file, mode='wb')
    processed = 0
    for i, line in enumerate(args.list):
        img_path = os.path.join(args.img_root, line.rstrip())

        try:
            img = preprocessor(img_path)
            pickle.dump(img, f_out)

            args.output_log.write("{}\n".format(img_path))
            processed += 1
            if i % 1000 == 999:
                log("Processed {} images".format(i + 1))
        except Exception as exc:
            log("Skipped {} (no. {}), expeption {}".format(img_path, i, exc), color='red')

    log("Done, saved {} images to {}".format(processed, args.output_file))

    f_out.close()

    log("Padded {} times, on averaged {:.0f} pixels".\
            format(len(preprocessor.paddings),
                   np.mean(preprocessor.paddings) if preprocessor.paddings else 0.0))
    log("Shrinked {} times, on averaged {:.0f} pixels".\
            format(len(preprocessor.shrinkages),
                   np.mean(preprocessor.shrinkages) if preprocessor.shrinkages else 0.0))
Example No. 30
def initialize_tf(initial_variables, threads):
    """
    Initializes the TensorFlow session after the graph is built.

    Args:

        initial_variables: File with the saved TF variables.

    Returns:

        A tuple of the TF session and the the TF saver object.

    """
    log("Initializing the TensorFlow session.")
    sess = tf.Session(config=tf.ConfigProto(inter_op_parallelism_threads=threads,
                                            intra_op_parallelism_threads=threads))
    sess.run(tf.initialize_all_variables())

    saver = tf.train.Saver()
    if initial_variables:
        log("Loading variables from {}".format(initial_variables))
        saver.restore(sess, initial_variables)

    log("Session initialization done.")

    return sess, saver
Example No. 31
    def __init__(self,
                 name: str,
                 input_sequence: TemporalStateful,
                 ff_hidden_size: int,
                 depth: int,
                 n_heads: int,
                 dropout_keep_prob: float = 1.0,
                 attention_dropout_keep_prob: float = 1.0,
                 target_space_id: int = None,
                 use_att_transform_bias: bool = False,
                 use_positional_encoding: bool = True,
                 input_for_cross_attention: Attendable = None,
                 n_cross_att_heads: int = None,
                 reuse: ModelPart = None,
                 save_checkpoint: str = None,
                 load_checkpoint: str = None,
                 initializers: InitializerSpecs = None) -> None:
        """Create an encoder of the Transformer model.

        Described in Vaswani et al. (2017), arxiv.org/abs/1706.03762

        Arguments:
            input_sequence: Embedded input sequence.
            name: Name of the encoder. Should be unique across all Neural
                Monkey objects.
            reuse: Reuse the model variables.
            dropout_keep_prob: Probability of keeping a value during dropout.
            target_space_id: Specifies the modality of the target space.
            use_att_transform_bias: Add bias when transforming qkv vectors
                for attention.
            use_positional_encoding: If True, position encoding signal is added
                to the input.

        Keyword arguments:
            ff_hidden_size: Size of the feedforward sublayers.
            n_heads: Number of the self-attention heads.
            depth: Number of sublayers.
            attention_dropout_keep_prob: Probability of keeping a value
                during dropout on the attention output.
            input_for_cross_attention: An attendable model part that is
                attended using cross-attention on every layer of the encoder,
                analogously to how the encoder is attended in the decoder.
            n_cross_att_heads: Number of heads used in the cross-attention.

        """
        check_argument_types()
        ModelPart.__init__(self, name, reuse, save_checkpoint, load_checkpoint,
                           initializers)

        self.input_sequence = input_sequence
        self.model_dimension = self.input_sequence.dimension
        self.ff_hidden_size = ff_hidden_size
        self.depth = depth
        self.n_heads = n_heads
        self.dropout_keep_prob = dropout_keep_prob
        self.attention_dropout_keep_prob = attention_dropout_keep_prob
        self.target_space_id = target_space_id
        self.use_att_transform_bias = use_att_transform_bias
        self.use_positional_encoding = use_positional_encoding
        self.input_for_cross_attention = input_for_cross_attention
        self.n_cross_att_heads = n_cross_att_heads

        if self.depth <= 0:
            raise ValueError("Depth must be a positive integer.")

        if self.ff_hidden_size <= 0:
            raise ValueError("Feed forward hidden size must be a "
                             "positive integer.")

        if self.dropout_keep_prob <= 0.0 or self.dropout_keep_prob > 1.0:
            raise ValueError("Dropout keep prob must be inside (0,1].")

        if (self.attention_dropout_keep_prob <= 0.0
                or self.attention_dropout_keep_prob > 1.0):
            raise ValueError("Dropout keep prob for attn must be in (0,1].")

        if self.target_space_id is not None and (self.target_space_id >= 32
                                                 or self.target_space_id < 0):
            raise ValueError(
                "If provided, the target space ID should be between 0 and 31. "
                "Was: {}".format(self.target_space_id))

        if (input_for_cross_attention is None) != (n_cross_att_heads is None):
            raise ValueError(
                "Either both input_for_cross_attention and n_cross_att_heads "
                "must be provided or none of them.")

        if input_for_cross_attention is not None:
            cross_att_dim = get_attention_states(
                input_for_cross_attention).get_shape()[-1].value
            if cross_att_dim != self.model_dimension:
                raise ValueError(
                    "The input for cross-attention must be of the same "
                    "dimension as the model, was {}.".format(cross_att_dim))

        self._variable_scope.set_initializer(
            tf.variance_scaling_initializer(mode="fan_avg",
                                            distribution="uniform"))

        log("Output op: {}".format(self.output))
Example No. 32
def main() -> None:
    # pylint: disable=no-member,broad-except
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("config",
                        metavar="INI-FILE",
                        help="the configuration file of the experiment")
    parser.add_argument("datasets",
                        metavar="INI-TEST-DATASETS",
                        help="the configuration of the test datasets")
    parser.add_argument("--json",
                        type=str,
                        help="write the evaluation "
                        "results to this file in JSON format")
    parser.add_argument("-g",
                        "--grid",
                        dest="grid",
                        action="store_true",
                        help="look at the SGE variables for slicing the data")
    args = parser.parse_args()

    test_datasets = Configuration()
    test_datasets.add_argument("test_datasets")
    test_datasets.add_argument("variables", cond=lambda x: isinstance(x, list))

    test_datasets.load_file(args.datasets)
    test_datasets.build_model()
    datasets_model = test_datasets.model

    exp = Experiment(config_path=args.config)
    exp.build_model()
    exp.load_variables(datasets_model.variables)

    if args.grid and len(datasets_model.test_datasets) > 1:
        raise ValueError("Only one test dataset supported when using --grid")

    results = []
    for dataset in datasets_model.test_datasets:
        if args.grid:
            if ("SGE_TASK_FIRST" not in os.environ
                    or "SGE_TASK_LAST" not in os.environ
                    or "SGE_TASK_STEPSIZE" not in os.environ
                    or "SGE_TASK_ID" not in os.environ):
                raise EnvironmentError(
                    "Some SGE environment variables are missing")

            length = int(os.environ["SGE_TASK_STEPSIZE"])
            start = int(os.environ["SGE_TASK_ID"]) - 1
            end = int(os.environ["SGE_TASK_LAST"]) - 1

            if start + length > end:
                length = end - start + 1

            log("Running grid task {} starting at {} with step {}".format(
                start // length, start, length))

            dataset = dataset.subset(start, length)

        if exp.config.args.evaluation is None:
            exp.run_model(dataset, write_out=True)
        else:
            eval_result = exp.evaluate(dataset, write_out=True)
            results.append(eval_result)

    if args.json:
        with open(args.json, "w") as f_out:
            json.dump(results, f_out)
            f_out.write("\n")

    for session in exp.config.model.tf_manager.sessions:
        session.close()
Example No. 33
def main() -> None:
    # pylint: disable=no-member,broad-except
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("config",
                        metavar="INI-FILE",
                        help="the configuration file of the experiment")
    parser.add_argument('datasets',
                        metavar='INI-TEST-DATASETS',
                        help="the configuration of the test datasets")
    parser.add_argument("-g",
                        "--grid",
                        dest="grid",
                        action="store_true",
                        help="look at the SGE variables for slicing the data")
    args = parser.parse_args()

    test_datasets = Configuration()
    test_datasets.add_argument('test_datasets')
    test_datasets.add_argument('variables')

    CONFIG.load_file(args.config)
    CONFIG.build_model()
    test_datasets.load_file(args.datasets)
    test_datasets.build_model()
    datasets_model = test_datasets.model
    initialize_for_running(CONFIG.model.output, CONFIG.model.tf_manager,
                           datasets_model.variables)

    print("")

    evaluators = [(e[0], e[0], e[1]) if len(e) == 2 else e
                  for e in CONFIG.model.evaluation]

    if args.grid and len(datasets_model.test_datasets) > 1:
        raise ValueError("Only one test dataset supported when using --grid")

    for dataset in datasets_model.test_datasets:
        if args.grid:
            if ("SGE_TASK_FIRST" not in os.environ
                    or "SGE_TASK_LAST" not in os.environ
                    or "SGE_TASK_STEPSIZE" not in os.environ
                    or "SGE_TASK_ID" not in os.environ):
                raise EnvironmentError(
                    "Some SGE environment variables are missing")

            length = int(os.environ["SGE_TASK_STEPSIZE"])
            start = int(os.environ["SGE_TASK_ID"]) - 1
            end = int(os.environ["SGE_TASK_LAST"]) - 1

            if start + length > end:
                length = end - start + 1

            log("Running grid task {} starting at {} with step {}".format(
                start // length, start, length))

            dataset = dataset.subset(start, length)

        if CONFIG.model.runners_batch_size is None:
            runners_batch_size = CONFIG.model.batch_size
        else:
            runners_batch_size = CONFIG.model.runners_batch_size

        execution_results, output_data = run_on_dataset(
            CONFIG.model.tf_manager,
            CONFIG.model.runners,
            dataset,
            CONFIG.model.postprocess,
            write_out=True,
            batch_size=runners_batch_size,
            log_progress=60)
        # TODO what if there is no ground truth
        eval_result = evaluation(evaluators, dataset, CONFIG.model.runners,
                                 execution_results, output_data)
        if eval_result:
            print_final_evaluation(dataset.name, eval_result)

    for _ in range(len(CONFIG.model.tf_manager.sessions)):
        del CONFIG.model.tf_manager.sessions[0]
Example No. 34
    def __init__(self, max_input_len, vocabulary, data_id, embedding_size,
                 rnn_size, name, dropout_keep_p=0.5,
                 use_noisy_activations=False, use_pervasive_dropout=False,
                 attention_type=None, attention_fertility=3,
                 parent_encoder=None):

        self.name = name
        self.max_input_len = max_input_len
        assert_type(self, 'vocabulary', vocabulary, Vocabulary)
        self.vocabulary = vocabulary
        self.data_id = data_id
        self.embedding_size = embedding_size
        self.rnn_size = rnn_size
        self.max_input_len = max_input_len
        self.dropout_keep_p = dropout_keep_p
        self.use_noisy_activations = use_noisy_activations
        self.use_pervasive_dropout = use_pervasive_dropout
        self.attention_type = attention_type
        self.attention_fertility = attention_fertility

        assert_type(self, 'parent_encoder', parent_encoder, SentenceEncoder,
                    can_be_none=True)
        self.parent_encoder = parent_encoder

        log("Initializing sentence encoder, name: \"{}\"".format(name))
        with tf.variable_scope(name):
            self.dropout_placeholder = tf.placeholder(tf.float32,
                                                      name="dropout")
            self.is_training = tf.placeholder(tf.bool, name="is_training")

            self.inputs = [tf.placeholder(tf.int32, shape=[None],
                                          name="input_{}".format(i))
                           for i in range(max_input_len + 2)]

            self.weight_ins = [tf.placeholder(tf.float32, shape=[None],
                                              name="input_{}".format(i))
                               for i in range(max_input_len + 2)]

            self.weight_tensor = tf.concat(1, [tf.expand_dims(w, 1)
                                               for w in self.weight_ins])

            self.sentence_lengths = tf.to_int64(sum(self.weight_ins))

            if parent_encoder:
                self.word_embeddings = parent_encoder.word_embeddings
            else:
                self.word_embeddings = tf.Variable(tf.random_uniform(
                    [len(vocabulary), embedding_size], -1.0, 1.0))

            embedded_inputs = [tf.nn.embedding_lookup(self.word_embeddings, i)
                               for i in self.inputs]
            dropped_embedded_inputs = [
                tf.nn.dropout(i, self.dropout_placeholder)
                for i in embedded_inputs]

            if parent_encoder:
                self.forward_gru = parent_encoder.forward_gru
                self.backward_gru = parent_encoder.backward_gru
            else:
                if use_noisy_activations:
                    self.forward_gru = NoisyGRUCell(rnn_size, self.is_training)
                    self.backward_gru = NoisyGRUCell(rnn_size, self.is_training)
                else:
                    self.forward_gru = tf.nn.rnn_cell.GRUCell(rnn_size)
                    self.backward_gru = tf.nn.rnn_cell.GRUCell(rnn_size)

            if use_pervasive_dropout:

                # create dropout mask (shape batch x rnn_size)
                # floor (random uniform + dropout_keep)

                shape = tf.concat(0, [tf.shape(self.inputs[0]), [rnn_size]])

                forward_dropout_mask = tf.floor(
                    tf.random_uniform(shape, 0.0, 1.0) + self.dropout_placeholder)

                backward_dropout_mask = tf.floor(
                    tf.random_uniform(shape, 0.0, 1.0) + self.dropout_placeholder)

                scale = tf.inv(self.dropout_placeholder)

                self.forward_gru = PervasiveDropoutWrapper(
                    self.forward_gru, forward_dropout_mask, scale)
                self.backward_gru = PervasiveDropoutWrapper(
                    self.backward_gru, backward_dropout_mask, scale)

            bidi_layer = BidirectionalRNNLayer(self.forward_gru,
                                               self.backward_gru,
                                               dropped_embedded_inputs,
                                               self.sentence_lengths)

            self.outputs_bidi = bidi_layer.outputs_bidi
            self.encoded = bidi_layer.encoded

            self.attention_tensor = tf.concat(1, [tf.expand_dims(o, 1)
                                                  for o in self.outputs_bidi])

            self.attention_object = attention_type(
                self.attention_tensor, scope="attention_{}".format(name),
                dropout_placeholder=self.dropout_placeholder,
                input_weights=self.weight_tensor,
                max_fertility=attention_fertility) if attention_type else None

            log("Sentence encoder initialized")
Example No. 35
    def __init__(self, decoder, initial_trainer, xent_calls, moving_calls):
        """
        Constructs the TensorFlow graph for the MIXER code - i.e. the regressor
        estimating BLEU from hidden states and the gradients from the REINFORCE
        algorithm.

        Args:

            decoder: Decoder.

            xent_calls: The number minibatches for which the standard
                crossentropy learning will be used.

            moving_calls: Number of minibatches after which the algorithm will
                proceed to use the REINFORCE algorithm for a longer suffix of the
                senntences.

        """
        # TODO L2 regularization
        # TODO plot gradients
        self.xent_trainer = initial_trainer
        self.decoder = decoder
        self.called = 0
        self.xent_calls = xent_calls
        self.moving_calls = moving_calls

        with tf.variable_scope('mixer'):
            # BLEU score needs to be computed outside the TF
            self.bleu = tf.placeholder(tf.float32, [None])

            hidden_states = decoder.hidden_states

            # a simple regressor that estimates the BLEU score from the network's hidden states
            with tf.variable_scope('exprected_reward_regressor'):
                linear_reg_W = tf.Variable(tf.truncated_normal([decoder.rnn_size, 1]))
                linear_reg_b = tf.Variable(tf.zeros([1]))

                expected_rewards = [
                    tf.squeeze(tf.matmul(h, linear_reg_W)) + linear_reg_b for h in hidden_states]

                regression_loss = sum([(r - self.bleu) ** 2 for r in expected_rewards]) * 0.5
                self.regression_optimizer = tf.train.AdamOptimizer(1e-3).minimize(regression_loss)


            ## decoded_logits: list of [batch x vocabulary] tensors (length max sequence)
            ## decoded_seq: list of [batch x 1] tensors (length sequence) --
            ##   contains vocabulary indices (argmaxs)
            with tf.variable_scope("reinforce_gradients"):
                # this is a dirty trick to get the indices of maxima in the logits
                max_logits = \
                    [tf.expand_dims(tf.reduce_max(l, 1), 1) \
                        for l in decoder.decoded_logits] ## batch x 1 x 1
                indicator = \
                    [tf.to_float(tf.equal(ml, l)) \
                        for ml, l in zip(max_logits, decoder.decoded_logits)] ## batch x vocabulary

                log("Forward cmomputation graph ready")

                # this is implementation of equation (11) in the paper
                derivatives = [
                    tf.reduce_sum(
                        tf.expand_dims(self.bleu - r, 1) * (tf.nn.softmax(l) - i) * w,
                        0, keep_dims=True)
                    for r, l, i, w in zip(
                        expected_rewards, decoder.decoded_logits, indicator, decoder.weights_ins)]
                ## ^^^ list of  [1 x vocabulary] tensors

                # these derivatives are constant for us now; we don't really
                # want to propagate the gradient back to this computation
                derivatives_stopped = [tf.stop_gradient(d) for d in derivatives]

                # we must train the regressor independently
                trainable_vars = \
                    [v for v in tf.trainable_variables() if not v.name.startswith('mixer')]

                # this is implementation of equation (10) in the paper
                reinforce_gradients = \
                    [tf.gradients(l * d, trainable_vars) \
                        for l, d in zip(decoder.decoded_logits, derivatives_stopped)]
                ## ^^^ [vocabulary x variable shape] (length max seq)

                log("Reinfoce gradients computed")

            with tf.variable_scope("cross_entropy_gradients"):
                cross_entropies = [
                    tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(l, t) * w, 0)
                    for l, t, w in zip(decoder.decoded_logits, decoder.targets, decoder.weights_ins)
                ]
                    ## ^^^ list of scalars in time

                xent_gradients = [tf.gradients(e, trainable_vars) for e in cross_entropies]
                log("Cross-entropy gradients computed")

            self.mixer_weights_plc = [tf.placeholder(tf.float32, []) for _ in hidden_states]

            mixed_gradients = []  # a list for each of the trainable variables

            for i, (rgs, xent_gs, mix_w) in enumerate(
                    zip(reinforce_gradients, xent_gradients, self.mixer_weights_plc)):
                for j, (rg, xent_g) in enumerate(zip(rgs, xent_gs)):
                    if xent_g is None and i == 0:
                        mixed_gradients.append(None)
                        continue

                    if type(xent_g) == tf.Tensor or type(xent_g) == tf.IndexedSlices:
                        g = tf.add(tf.scalar_mul(mix_w, xent_g), tf.scalar_mul(1 - mix_w, rg))
                    elif xent_g is None:
                        continue
                    else:
                        raise Exception("Unnkown type of gradients: {}".format(type(xg)))

                    if i == 0:
                        mixed_gradients.append(g)
                    else:
                        if mixed_gradients[j] is None:
                            mixed_gradients[j] = g
                        else:
                            mixed_gradients[j] += g

            self.mixer_optimizer = tf.train.AdamOptimizer().apply_gradients(
                list(zip(mixed_gradients, trainable_vars)))

        self.summary_gradients = tf.merge_summary(
            tf.get_collection("summary_gradients"))
        self.summary_train = tf.merge_summary(
            tf.get_collection("summary_train"))
Example No. 36
def load(name: str,
         series: List[str],
         data: List[SourceSpec],
         outputs: List[OutputSpec] = None,
         buffer_size: int = None,
         shuffled: bool = False) -> "Dataset":
    """Create a dataset using specification from the configuration.

    The dataset provides iterators over data series. The dataset has a buffer,
    which pre-fetches a given number of the data series lazily. In case the
    dataset is not lazy (buffer size is `None`), the iterators are built on top
    of in-memory arrays. Otherwise, the iterators operate on the data sources
    directly.

    Arguments:
        name: The name of the dataset.
        series: A list of names of data series the dataset contains.
        data: The specification of the data sources for each series.
        outputs: A list of output specifications.
        buffer_size: The size of the buffer. If set, the dataset will be loaded
            lazily into the buffer (useful for large datasets). The buffer size
            specifies the number of sequences to pre-load. This is useful for
            pseudo-shuffling of large data on-the-fly. Ideally, this should be
            (much) larger than the batch size. Note that the buffer gets
            refilled each time its size is less than half the `buffer_size`.
            When refilling, the buffer gets refilled to the specified size.
        shuffled: Whether to shuffle the dataset buffer (done upon refill).

    """
    check_argument_types()

    if not series:
        raise ValueError("No dataset series specified.")

    if not [s for s in data if match_type(s, ReaderDef)]:  # type: ignore
        raise ValueError("At least one data series should be from a file")

    if len(series) != len(data):
        raise ValueError(
            "The 'series' and 'data' lists should have the same number"
            " of elements: {} vs {}.".format(len(series), len(data)))

    if len(series) != len(set(series)):
        raise ValueError("There are duplicate series.")

    if outputs is not None:
        output_sources = [o[0] for o in outputs]
        if len(output_sources) != len(set(output_sources)):
            raise ValueError("Multiple outputs for a single series")

    log("Initializing dataset {}.".format(name))

    iterators = {}  # type: Dict[str, Callable[[], DataSeries]]

    prep_sl = {}  # type: Dict[str, Tuple[Callable, str]]
    prep_dl = {}  # type: Dict[str, DatasetPreprocess]

    def _make_iterator(reader, files):
        def itergen():
            return reader(files)

        return itergen

    def _make_sl_iterator(src, prep):
        def itergen():
            return (prep(item) for item in iterators[src]())

        return itergen

    def _make_dl_iterator(func):
        def itergen():
            return func(iterators)

        return itergen

    # First, prepare iterators for series using file readers
    for s_name, source_spec in zip(series, data):
        if match_type(source_spec, ReaderDef):  # type: ignore
            files, reader = _normalize_readerdef(cast(ReaderDef, source_spec))
            for path in files:
                if not os.path.isfile(path):
                    raise FileNotFoundError(
                        "File not found. Series: {}, Path: {}".format(
                            s_name, path))

            iterators[s_name] = _make_iterator(reader, files)

        elif match_type(source_spec, Tuple[Callable, str]):
            prep_sl[s_name] = cast(Tuple[Callable, str], source_spec)

        else:
            assert match_type(source_spec, DatasetPreprocess)  # type: ignore
            prep_dl[s_name] = cast(DatasetPreprocess, source_spec)

    # Second, prepare series-level preprocessors.
    # Note that series-level preprocessors cannot be stacked on the dataset
    # specification level.
    for s_name, (preprocessor, source) in prep_sl.items():
        if source not in iterators:
            raise ValueError(
                "Source series for series-level preprocessor nonexistent: "
                "Preprocessed series '{}', source series '{}'"
                .format(s_name, source))
        iterators[s_name] = _make_sl_iterator(source, preprocessor)

    # Finally, dataset-level preprocessors.
    for s_name, func in prep_dl.items():
        iterators[s_name] = _make_dl_iterator(func)

    output_dict = None
    if outputs is not None:
        output_dict = {
            s_name: (path, writer)
            for s_name, path, writer in
            [_normalize_outputspec(out) for out in outputs]
        }

    if buffer_size is not None:
        return Dataset(name, iterators, output_dict,
                       (buffer_size // 2, buffer_size), shuffled)

    return Dataset(name, iterators, output_dict, None, shuffled)
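
A hedged usage sketch; the concrete `SourceSpec` forms (plain file lists as `ReaderDef`s and a `(preprocessor, source_series)` tuple) are assumptions read off the branches above, and the series names, file paths, and the toy lambda preprocessor are made up:

train_data = load(
    name="train",
    series=["source", "target", "target_rev"],
    data=[["data/train.en"],                    # assumed ReaderDef: list of paths
          ["data/train.de"],
          (lambda sent: sent[::-1], "target")], # series-level preprocessor (toy)
    buffer_size=10000,                          # lazy loading with a shuffle buffer
    shuffled=True)
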
Example No. 37
    def __init__(self,
                 encoders: List[Stateful],
                 vocabulary: Vocabulary,
                 data_id: str,
                 name: str,
                 max_output_len: int,
                 dropout_keep_prob: float = 1.0,
                 embedding_size: int = None,
                 embeddings_source: EmbeddedSequence = None,
                 tie_embeddings: bool = False,
                 label_smoothing: float = None,
                 rnn_size: int = None,
                 output_projection: OutputProjectionSpec = None,
                 encoder_projection: EncoderProjection = None,
                 attentions: List[BaseAttention] = None,
                 attention_on_input: bool = False,
                 rnn_cell: str = "GRU",
                 conditional_gru: bool = False,
                 supress_unk: bool = False,
                 save_checkpoint: str = None,
                 load_checkpoint: str = None,
                 initializers: InitializerSpecs = None) -> None:
        """Create a refactored version of monster decoder.

        Arguments:
            encoders: Input encoders of the decoder.
            vocabulary: Target vocabulary.
            data_id: Target data series.
            name: Name of the decoder. Should be unique across all Neural
                Monkey objects.
            max_output_len: Maximum length of an output sequence.
            dropout_keep_prob: Probability of keeping a value during dropout.
            embedding_size: Size of embedding vectors for target words.
            embeddings_source: Embedded sequence to take embeddings from.
            tie_embeddings: Use decoder.embedding_matrix also in place
                of the output decoding matrix.

        Keyword arguments:
            rnn_size: Size of the decoder hidden state, if None set
                according to encoders.
            output_projection: How to generate distribution over vocabulary
                from decoder_outputs.
            encoder_projection: How to construct initial state from encoders.
            attentions: The attention objects to use. Optional.
            rnn_cell: RNN Cell used by the decoder (GRU or LSTM).
            conditional_gru: Flag whether to use the Conditional GRU
                architecture.
            attention_on_input: Flag whether attention from previous decoding
                step should be combined with the input in the next step.
            supress_unk: If True, the decoder will not produce symbols for
                unknown tokens.
        """
        check_argument_types()
        AutoregressiveDecoder.__init__(self,
                                       name=name,
                                       vocabulary=vocabulary,
                                       data_id=data_id,
                                       max_output_len=max_output_len,
                                       dropout_keep_prob=dropout_keep_prob,
                                       embedding_size=embedding_size,
                                       embeddings_source=embeddings_source,
                                       tie_embeddings=tie_embeddings,
                                       label_smoothing=label_smoothing,
                                       supress_unk=supress_unk,
                                       save_checkpoint=save_checkpoint,
                                       load_checkpoint=load_checkpoint,
                                       initializers=initializers)

        self.encoders = encoders
        self.output_projection_spec = output_projection
        self._conditional_gru = conditional_gru
        self._attention_on_input = attention_on_input
        self._rnn_cell_str = rnn_cell

        self.attentions = []  # type: List[BaseAttention]
        if attentions is not None:
            self.attentions = attentions

        if rnn_size is not None:
            self.rnn_size = rnn_size

        if encoder_projection is not None:
            self.encoder_projection = encoder_projection
        elif not self.encoders:
            log("No direct encoder input. Using empty initial state")
            self.encoder_projection = empty_initial_state
        elif rnn_size is None:
            log("No rnn_size or encoder_projection: Using concatenation of"
                " encoded states")
            self.encoder_projection = concat_encoder_projection
            self.rnn_size = sum(e.output.get_shape()[1].value
                                for e in encoders)
        else:
            log("Using linear projection of encoders as the initial state")
            self.encoder_projection = linear_encoder_projection(
                self.dropout_keep_prob)

        assert self.rnn_size is not None

        if self._rnn_cell_str not in RNN_CELL_TYPES:
            raise ValueError("RNN cell must be a either 'GRU', 'LSTM', or "
                             "'NematusGRU'. Not {}".format(self._rnn_cell_str))

        if self.output_projection_spec is None:
            log("No output projection specified - using tanh projection")
            self.output_projection = nonlinear_output(self.rnn_size,
                                                      tf.tanh)[0]
            self.output_projection_size = self.rnn_size
        elif isinstance(self.output_projection_spec, tuple):
            self.output_projection_spec = cast(Tuple[OutputProjection, int],
                                               self.output_projection_spec)
            (self.output_projection,
             self.output_projection_size) = self.output_projection_spec
        else:
            self.output_projection = cast(OutputProjection,
                                          self.output_projection_spec)
            self.output_projection_size = self.rnn_size

        if self._attention_on_input:
            self.input_projection = self.input_plus_attention
        else:
            self.input_projection = self.embed_input_symbol

        with self.use_scope():
            with tf.variable_scope("attention_decoder") as self.step_scope:
                pass

        self._variable_scope.set_initializer(
            tf.random_normal_initializer(stddev=0.001))

        # TODO when it is possible, remove the printing of the cost var
        log("Decoder initalized. Cost var: {}".format(str(self.cost)))
        log("Runtime logits tensor: {}".format(str(self.runtime_logits)))
Exemplo n.º 38
0
    def __init__(self,
                 encoders: List[Stateful],
                 vocabulary: Vocabulary,
                 data_id: str,
                 name: str,
                 max_output_len: int,
                 dropout_keep_prob: float = 1.0,
                 rnn_size: int = None,
                 embedding_size: int = None,
                 output_projection: OutputProjectionSpec = None,
                 encoder_projection: EncoderProjection = None,
                 attentions: List[BaseAttention] = None,
                 embeddings_source: EmbeddedSequence = None,
                 attention_on_input: bool = True,
                 rnn_cell: str = "GRU",
                 conditional_gru: bool = False,
                 save_checkpoint: str = None,
                 load_checkpoint: str = None) -> None:
        """Create a refactored version of monster decoder.

        Arguments:
            encoders: Input encoders of the decoder
            vocabulary: Target vocabulary
            data_id: Target data series
            name: Name of the decoder. Should be unique across all Neural
                Monkey objects
            max_output_len: Maximum length of an output sequence
            dropout_keep_prob: Probability of keeping a value during dropout

        Keyword arguments:
            rnn_size: Size of the decoder hidden state, if None set
                according to encoders.
            embedding_size: Size of embedding vectors for target words
            output_projection: How to generate distribution over vocabulary
                from decoder_outputs
            encoder_projection: How to construct initial state from encoders
            attentions: The attention objects to use. Optional.
            embeddings_source: Embedded sequence to take embeddings from
            rnn_cell: RNN Cell used by the decoder (GRU or LSTM)
            conditional_gru: Flag whether to use the Conditional GRU
                architecture
            attention_on_input: Flag whether attention from previous decoding
                step should be combined with the input in the next step.
        """
        check_argument_types()
        AutoregressiveDecoder.__init__(
            self,
            name=name,
            vocabulary=vocabulary,
            data_id=data_id,
            max_output_len=max_output_len,
            dropout_keep_prob=dropout_keep_prob,
            save_checkpoint=save_checkpoint,
            load_checkpoint=load_checkpoint)

        self.encoders = encoders
        self.embedding_size = embedding_size
        self.rnn_size = rnn_size
        self.output_projection_spec = output_projection
        self.encoder_projection = encoder_projection
        self.attentions = attentions
        self.embeddings_source = embeddings_source
        self._conditional_gru = conditional_gru
        self._attention_on_input = attention_on_input
        self._rnn_cell_str = rnn_cell

        if self.attentions is None:
            self.attentions = []

        if self.embedding_size is None and self.embeddings_source is None:
            raise ValueError("You must specify either embedding size or the "
                             "embedded sequence from which to reuse the "
                             "embeddings (e.g. set either 'embedding_size' or "
                             " 'embeddings_source' parameter)")

        if self.embeddings_source is not None:
            if self.embedding_size is not None:
                warn("Overriding the embedding_size parameter with the"
                     " size of the reused embeddings from the encoder.")

            self.embedding_size = (
                self.embeddings_source.embedding_matrix.get_shape()[1].value)

        if self.encoder_projection is None:
            if not self.encoders:
                log("No encoder - language model only.")
                self.encoder_projection = empty_initial_state
            elif rnn_size is None:
                log("No rnn_size or encoder_projection: Using concatenation of"
                    " encoded states")
                self.encoder_projection = concat_encoder_projection
                self.rnn_size = sum(e.output.get_shape()[1].value
                                    for e in encoders)
            else:
                log("Using linear projection of encoders as the initial state")
                self.encoder_projection = linear_encoder_projection(
                    self.dropout_keep_prob)

        assert self.rnn_size is not None

        if self._rnn_cell_str not in RNN_CELL_TYPES:
            raise ValueError("RNN cell must be a either 'GRU', 'LSTM', or "
                             "'NematusGRU'. Not {}".format(self._rnn_cell_str))

        if self.output_projection_spec is None:
            log("No output projection specified - using tanh projection")
            self.output_projection = nonlinear_output(
                self.rnn_size, tf.tanh)[0]
            self.output_projection_size = self.rnn_size
        elif isinstance(self.output_projection_spec, tuple):
            (self.output_projection,
             self.output_projection_size) = tuple(self.output_projection_spec)
        else:
            self.output_projection = self.output_projection_spec
            self.output_projection_size = self.rnn_size

        if self._attention_on_input:
            self.input_projection = self.input_plus_attention
        else:
            self.input_projection = self.embed_input_symbol

        with self.use_scope():
            with tf.variable_scope("attention_decoder") as self.step_scope:
                pass

        # TODO when it is possible, remove the printing of the cost var
        log("Decoder initalized. Cost var: {}".format(str(self.cost)))
        log("Runtime logits tensor: {}".format(str(self.runtime_logits)))
Exemplo n.º 39
0
    def __init__(self,
                 encoders: List[Any],
                 vocabulary: Vocabulary,
                 data_id: str,
                 name: str,
                 max_output_len: int,
                 dropout_keep_prob: float,
                 rnn_size: Optional[int]=None,
                 embedding_size: Optional[int]=None,
                 output_projection: Optional[Callable[
                     [tf.Tensor, tf.Tensor, List[tf.Tensor]], tf.Tensor]]=None,
                 encoder_projection: Optional[Callable[
                     [tf.Tensor, Optional[int], Optional[List[Any]]],
                     tf.Tensor]]=None,
                 use_attention: bool=False,
                 embeddings_encoder: Optional[Any]=None,
                 rnn_cell: str='GRU',
                 attention_on_input: bool=True,
                 save_checkpoint: Optional[str]=None,
                 load_checkpoint: Optional[str]=None) -> None:
        """Create a refactored version of monster decoder.

        Arguments:
            encoders: Input encoders of the decoder
            vocabulary: Target vocabulary
            data_id: Target data series
            name: Name of the decoder. Should be unique across all Neural
                Monkey objects
            max_output_len: Maximum length of an output sequence
            dropout_keep_prob: Probability of keeping a value during dropout

        Keyword arguments:
            rnn_size: Size of the decoder hidden state, if None set
                according to encoders.
            embedding_size: Size of embedding vectors for target words
            output_projection: How to generate distribution over vocabulary
                from decoder rnn_outputs
            encoder_projection: How to construct initial state from encoders
            use_attention: Flag whether to look at attention vectors of the
                encoders
            embeddings_encoder: Encoder to take embeddings from
            rnn_cell: RNN Cell used by the decoder (GRU or LSTM)
            attention_on_input: Flag whether attention from previous decoding
                step should be combined with the input in the next step.
        """
        ModelPart.__init__(self, name, save_checkpoint, load_checkpoint)
        log("Initializing decoder, name: '{}'".format(name))

        self.encoders = encoders
        self.vocabulary = vocabulary
        self.data_id = data_id
        self.max_output_len = max_output_len
        self.dropout_keep_prob = dropout_keep_prob
        self.embedding_size = embedding_size
        self.rnn_size = rnn_size
        self.output_projection = output_projection
        self.encoder_projection = encoder_projection
        self.use_attention = use_attention
        self.embeddings_encoder = embeddings_encoder
        self._rnn_cell = rnn_cell

        if self.embedding_size is None and self.embeddings_encoder is None:
            raise ValueError("You must specify either embedding size or the "
                             "encoder from which to reuse the embeddings ("
                             "e.g. set either 'embedding_size' or "
                             " 'embeddings_encoder' parameter)")

        if self.embeddings_encoder is not None:
            if self.embedding_size is not None:
                log("Warning: Overriding the embedding_size parameter with the"
                    " size of the reused embeddings from the encoder.",
                    color="red")

            self.embedding_size = (
                self.embeddings_encoder.embedding_matrix.get_shape()[1].value)

        if self.encoder_projection is None:
            if len(self.encoders) == 0:
                log("No encoder - language model only.")
                self.encoder_projection = empty_initial_state
            elif rnn_size is None:
                log("No rnn_size or encoder_projection: Using concatenation of"
                    " encoded states")
                self.encoder_projection = concat_encoder_projection
                self.rnn_size = sum(e.encoded.get_shape()[1].value
                                    for e in encoders)
            else:
                log("Using linear projection of encoders as the initial state")
                self.encoder_projection = linear_encoder_projection(
                    self.dropout_keep_prob)

        assert self.rnn_size is not None

        if self.output_projection is None:
            log("No output projection specified - using simple concatenation")
            self.output_projection = no_deep_output

        with tf.variable_scope(name):
            self._create_input_placeholders()
            self._create_training_placeholders()
            self._create_initial_state()
            self._create_embedding_matrix()

            with tf.name_scope("output_projection"):
                self.decoding_w = tf.get_variable(
                    "state_to_word_W", [self.rnn_size, len(self.vocabulary)],
                    initializer=tf.random_uniform_initializer(-0.5, 0.5))

                self.decoding_b = tf.get_variable(
                    "state_to_word_b", [len(self.vocabulary)],
                    initializer=tf.constant_initializer(
                        - math.log(len(self.vocabulary))))

            # The last training input is not used by the decoding function
            # (it serves only as a target).
            embedded_train_inputs = self._embed_and_dropout(
                self.train_inputs[:-1])

            # NOTE: no dropout is applied here.
            embedded_go_symbols = tf.nn.embedding_lookup(self.embedding_matrix,
                                                         self.go_symbols)

            # fetch train attention objects
            self._train_attention_objects = {}  # type: Dict[Attentive, tf.Tensor]
            if self.use_attention:
                with tf.name_scope("attention_object"):
                    self._train_attention_objects = {
                        e: e.create_attention_object()
                        for e in self.encoders
                        if isinstance(e, Attentive)}

            train_rnn_outputs, _ = self._attention_decoder(
                embedded_go_symbols,
                attention_on_input=attention_on_input,
                train_inputs=embedded_train_inputs,
                train_mode=True)

            assert not tf.get_variable_scope().reuse
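            # Reuse the variables built during the training pass for the
            # runtime (greedy decoding) graph constructed below.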
            tf.get_variable_scope().reuse_variables()

            # fetch runtime attention objects
            self._runtime_attention_objects = {}  # type: Dict[Attentive, tf.Tensor]
            if self.use_attention:
                self._runtime_attention_objects = {
                    e: e.create_attention_object()
                    for e in self.encoders
                    if isinstance(e, Attentive)}

            (self.runtime_rnn_outputs,
             self.runtime_rnn_states) = self._attention_decoder(
                 embedded_go_symbols,
                 attention_on_input=attention_on_input,
                 train_mode=False)

            self.hidden_states = self.runtime_rnn_outputs

            def decode(rnn_outputs):
                with tf.name_scope("output_projection"):
                    logits = []
                    decoded = []

                    for out in rnn_outputs:
                        out_activation = self._logit_function(out)
                        logits.append(out_activation)
                        decoded.append(tf.argmax(out_activation[:, 1:], 1) + 1)

                    return decoded, logits

            _, self.train_logits = decode(train_rnn_outputs)

            train_targets = tf.unpack(self.train_inputs)

            self.train_loss = tf.nn.seq2seq.sequence_loss(
                self.train_logits, train_targets,
                tf.unpack(self.train_padding), len(self.vocabulary))
            self.cost = self.train_loss

            self.train_logprobs = [tf.nn.log_softmax(l)
                                   for l in self.train_logits]

            self.decoded, self.runtime_logits = decode(
                self.runtime_rnn_outputs)

            self.runtime_loss = tf.nn.seq2seq.sequence_loss(
                self.runtime_logits, train_targets,
                tf.unpack(self.train_padding), len(self.vocabulary))

            self.runtime_logprobs = [tf.nn.log_softmax(l)
                                     for l in self.runtime_logits]

            tf.scalar_summary('train_loss_with_gt_inputs',
                              self.train_loss,
                              collections=["summary_train"])

            tf.scalar_summary('train_loss_with_decoded_inputs',
                              self.runtime_loss,
                              collections=["summary_train"])

            tf.scalar_summary('train_optimization_cost', self.cost,
                              collections=["summary_train"])

            self._visualize_attention()

            log("Decoder initalized.")
Exemplo n.º 40
0
def from_dataset(datasets: List[Dataset],
                 series_ids: List[str],
                 max_size: int,
                 save_file: str = None,
                 overwrite: bool = False,
                 min_freq: Optional[int] = None,
                 unk_sample_prob: float = 0.5) -> 'Vocabulary':
    """Loads vocabulary from a dataset with an option to save it.

    Arguments:
        datasets: A list of datasets from which to create the vocabulary
        series_ids: A list of IDs of series of the datasets that should be
                    used for producing the vocabulary
        max_size: The maximum size of the vocabulary
        save_file: A file to save the vocabulary to. If None (default),
                   the vocabulary will not be saved.
        overwrite: Overwrite existing file.
        min_freq: Do not include words with frequency smaller than this.
        unk_sample_prob: The probability with which to sample unks out of
                         words with frequency 1. Defaults to 0.5.

    Returns:
        The new Vocabulary instance.
    """

    assert check_argument_types()

    vocabulary = Vocabulary(unk_sample_prob=unk_sample_prob)

    for dataset in datasets:
        if isinstance(dataset, LazyDataset):
            warn("Inferring vocabulary from lazy dataset!")

        for series_id in series_ids:
            if not dataset.has_series(series_id):
                warn("Data series '{}' not present in the dataset".format(
                    series_id))

            series = dataset.get_series(series_id, allow_none=True)
            if series:
                vocabulary.add_tokenized_text(
                    [token for sent in series for token in sent])

    vocabulary.truncate(max_size)

    if min_freq is not None and min_freq > 1:
        vocabulary.truncate_by_min_freq(min_freq)

    log("Vocabulary for series {} initialized, containing {} words".format(
        series_ids, len(vocabulary)))

    vocabulary.log_sample()

    if save_file is not None:
        directory = os.path.dirname(save_file)
        if directory and not os.path.exists(directory):
            os.makedirs(directory)
        vocabulary.save_to_file(save_file, overwrite)

    return vocabulary
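
Note that the order of the two truncations above matters: the vocabulary is
first cut down to max_size by frequency and only then filtered by min_freq. A
plain-Python approximation of that behaviour (the real Vocabulary class keeps
its own counters; the helper below is only illustrative):

from collections import Counter

def build_word_list(tokens, max_size, min_freq=None):
    counts = Counter(tokens)
    # Keep the max_size most frequent words first...
    words = [w for w, _ in counts.most_common(max_size)]
    # ...then drop those below the frequency threshold.
    if min_freq is not None and min_freq > 1:
        words = [w for w in words if counts[w] >= min_freq]
    return words

tokens = "a a a b b c d".split()
print(build_word_list(tokens, max_size=3, min_freq=2))  # ['a', 'b']
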
Exemplo n.º 41
0
    def __init__(self,
                 name: str,
                 vocabulary: Vocabulary,
                 data_id: str,
                 embedding_size: int,
                 segment_size: int,
                 highway_depth: int,
                 rnn_size: int,
                 filters: List[Tuple[int, int]],
                 max_input_len: Optional[int] = None,
                 dropout_keep_prob: float = 1.0,
                 attention_type: Optional[Any] = None,
                 attention_fertility: int = 3,
                 use_noisy_activations: bool = False,
                 save_checkpoint: Optional[str] = None,
                 load_checkpoint: Optional[str] = None) -> None:
        """Create a new instance of the sentence encoder.

        Arguments:
            vocabulary: Input vocabulary
            data_id: Identifier of the data series fed to this encoder
            name: A unique identifier for this encoder
            max_input_len: Maximum length of an encoded sequence
            embedding_size: The size of the embedding vector assigned
                to each word
            segment_size: The size of the segments over which we apply
                max-pooling.
            highway_depth: Depth of the highway layer.
            rnn_size: The size of the encoder's hidden state. Note
                that the actual encoder output state size will be
                twice this value because it is the concatenation of
                the forward and backward hidden states.
            filters: Specification of CNN filters. It is a list of tuples
                specifying the filter size and number of channels.

        Keyword arguments:
            dropout_keep_prob: The dropout keep probability
                (default 1.0)
            attention_type: The class that is used for creating
                attention mechanism (default None)
            attention_fertility: Fertility parameter used with
                CoverageAttention (default 3).
        """
        ModelPart.__init__(self, name, save_checkpoint, load_checkpoint)
        Attentive.__init__(self,
                           attention_type,
                           attention_fertility=attention_fertility)

        assert check_argument_types()

        self.vocabulary = vocabulary
        self.data_id = data_id

        self.max_input_len = max_input_len
        self.embedding_size = embedding_size
        self.segment_size = segment_size
        self.highway_depth = highway_depth
        self.rnn_size = rnn_size
        self.filters = filters
        self.dropout_keep_p = dropout_keep_prob
        self.use_noisy_activations = use_noisy_activations

        if max_input_len is not None and max_input_len <= 0:
            raise ValueError("Input length must be a positive integer.")

        log("Initializing sentence encoder, name: '{}'".format(self.name))

        with self.use_scope():
            self._create_input_placeholders()
            with tf.variable_scope('input_projection'):
                self._create_embedding_matrix()
                embedded_inputs = self._embed(self.inputs)  # type: tf.Tensor
                self.embedded_inputs = embedded_inputs

            # CNN Network
            pooled_outputs = []
            for filter_size, num_filters in self.filters:
                with tf.variable_scope("conv-maxpool-%s" % filter_size):
                    filter_shape = [filter_size, embedding_size, num_filters]
                    w_filter = tf.get_variable(
                        "conv_W",
                        filter_shape,
                        initializer=tf.random_uniform_initializer(-0.5, 0.5))
                    b_filter = tf.get_variable(
                        "conv_bias", [num_filters],
                        initializer=tf.constant_initializer(0.0))
                    conv = tf.nn.conv1d(embedded_inputs,
                                        w_filter,
                                        stride=1,
                                        padding="SAME",
                                        name="conv")

                    # Apply nonlinearity
                    conv_relu = tf.nn.relu(tf.nn.bias_add(conv, b_filter))

                    # Max-pooling over the output segments
                    expanded_conv_relu = tf.expand_dims(conv_relu, -1)
                    pooled = tf.nn.max_pool(
                        expanded_conv_relu,
                        ksize=[1, self.segment_size, 1, 1],
                        strides=[1, self.segment_size, 1, 1],
                        padding="SAME",
                        name="maxpool")
                    pooled_outputs.append(pooled)

            # Combine all the pooled features
            self.cnn_encoded = tf.concat(pooled_outputs, axis=2)
            self.cnn_encoded = tf.squeeze(self.cnn_encoded, [3])

            # Highway Network
            batch_size = tf.shape(self.cnn_encoded)[0]
            # pylint: disable=no-member
            cnn_out_size = self.cnn_encoded.get_shape().as_list()[-1]
            highway_layer = tf.reshape(self.cnn_encoded, [-1, cnn_out_size])
            for i in range(self.highway_depth):
                highway_layer = highway(highway_layer,
                                        scope=("highway_layer_%s" % i))
            highway_layer = tf.reshape(highway_layer,
                                       [batch_size, -1, cnn_out_size])

            # BiRNN Network
            fw_cell, bw_cell = self.rnn_cells()  # type: RNNCellTuple
            seq_lens = tf.ceil(
                tf.divide(self.sentence_lengths, self.segment_size))
            seq_lens = tf.cast(seq_lens, tf.int32)
            outputs_bidi_tup, encoded_tup = tf.nn.bidirectional_dynamic_rnn(
                fw_cell,
                bw_cell,
                highway_layer,
                sequence_length=seq_lens,
                dtype=tf.float32)

            self.hidden_states = tf.concat(outputs_bidi_tup, 2)

            with tf.variable_scope('attention_tensor'):
                self.__attention_tensor = self._dropout(self.hidden_states)

            self.encoded = tf.concat(encoded_tup, 1)

        log("Sentence encoder initialized")