Exemplo n.º 1
0
def ResolvePreprocessedId(content_id: str, config: corpus_pb2.Corpus) -> str:
  """Compute the hash of a corpus of preprocessed contentfiles.

  The hash is computed from the ID of the input files and the serialized
  representation of the preprocessor pipeline.

  Args:
    content_id: The unique ID of the raw content files.
    config: The Corpus config proto whose preprocessor list forms part of
      the hash.

  Returns:
    A sha1 hex digest identifying the preprocessed corpus.
  """
  # The identity of a preprocessed corpus is fully determined by what went
  # in (content_id) and what was done to it (the preprocessor names).
  preprocessor_names = list(config.preprocessor)
  return crypto.sha1_list(content_id, *preprocessor_names)
Exemplo n.º 2
0
def ResolvePreprocessedId(content_id: str, config: corpus_pb2.Corpus) -> str:
  """Compute the hash of a corpus of preprocessed contentfiles.

  The hash is computed from the ID of the input files and the serialized
  representation of the preprocessor pipeline.

  Args:
    content_id: The unique ID of the raw content files.
    config: The Corpus config proto.

  Returns:
    A sha1 hex digest, or the literal string 'null' for pre-encoded corpora.
  """
  # TODO(github.com/ChrisCummins/phd/issues/46): Refactor this after splitting
  # out Corpus class.
  if not config.pre_encoded_corpus_url:
    return crypto.sha1_list(content_id, *config.preprocessor)
  # A pre-encoded corpus skips preprocessing entirely, so there is no
  # meaningful preprocessed ID to derive.
  return 'null'
Exemplo n.º 3
0
def ResolveEncodedId(content_id: str, config: corpus_pb2.Corpus) -> str:
  """Compute the hash of a corpus of preprocessed and encoded contentfiles.

  The hash is computed from the ID of the input files and the serialized
  representation of the config proto.

  Args:
    content_id: The unique ID of the input content files.
    config: The Corpus config proto.

  Returns:
    A sha1 hex digest identifying the encoded corpus.
  """
  hash_config = corpus_pb2.Corpus()
  hash_config.CopyFrom(config)
  # The content_id already uniquely identifies the input files, so the
  # contentfiles field is excluded from the hash. Corpora with identical
  # content delivered through different means (e.g. two separate but
  # identical directories) therefore hash the same.
  hash_config.ClearField('contentfiles')
  serialized = hash_config.SerializeToString()
  return crypto.sha1_list(content_id, serialized)
Exemplo n.º 4
0
    def _ComputeHash(corpus_: corpuses.Corpus, config: model_pb2.Model) -> str:
        """Compute model hash.

        The hash is computed from the ID of the corpus and the serialized
        representation of the config proto. The number of epochs that the
        model is trained for does not affect the hash, since we can share
        checkpoints between different models if the only variable is the
        epoch count. E.g. we have a model trained for 10 epochs, we can use
        the checkpoint as the starting point for training a model for 20
        epochs.

        Args:
          corpus_: A corpus instance.
          config: A Model config proto.

        Returns:
          The unique model ID.
        """
        hashable_config = model_pb2.Model()
        hashable_config.CopyFrom(config)
        # The corpus is represented by its own hash rather than its config,
        # and num_epochs is excluded so checkpoints remain shareable.
        hashable_config.ClearField('corpus')
        hashable_config.training.ClearField('num_epochs')
        serialized = hashable_config.SerializeToString()
        return crypto.sha1_list(corpus_.hash, serialized)
Exemplo n.º 5
0
 def _hash(self, contentid: str, opts: dict) -> str:
     """Compute the corpus hash from its content ID and options."""
     hashable_opts = deepcopy(opts)
     # The "created" timestamp varies between runs and must not influence
     # the hash, so it is removed from the copied options.
     del hashable_opts["created"]
     return crypto.sha1_list(contentid, *types.dict_values(hashable_opts))
Exemplo n.º 6
0
 def _hash(corpus: clgen.Corpus, opts: dict) -> str:
     """Compute the model hash from its corpus hash and options."""
     hashable_opts = deepcopy(opts)
     # Exclude fields that must not affect the hash: the "created"
     # timestamp varies per run, and the epoch count is excluded so that
     # checkpoints can be shared between models that differ only in it.
     del hashable_opts["created"]
     del hashable_opts["train_opts"]["epochs"]
     return crypto.sha1_list(corpus.hash, *types.dict_values(hashable_opts))