示例#1
0
def models() -> Iterator[Model]:
    """
    Iterate over all cached models.

    Every entry of the "model" directory below the cache path is read
    through its "META" JSON file and turned into a Model. Nothing is
    yielded when that directory does not exist.

    Returns
    -------
    Iterator[Model]
        An iterable over all cached models.
    """
    # No model cache directory: nothing to iterate over.
    if not fs.isdir(clgen.cachepath(), "model"):
        return

    model_root = fs.path(clgen.cachepath(), "model")
    for entry in fs.ls(model_root, abspaths=True):
        meta_path = fs.path(entry, "META")
        yield Model.from_json(jsonutil.read_file(meta_path))
示例#2
0
    def from_json(corpus_json: dict) -> 'Corpus':
        """
        Instantiate Corpus from JSON.

        The specification must name the corpus content either by "path" (a
        directory, which is hashed to obtain the content ID) or by "id" (the
        ID of content which is already cached). "path" takes precedence when
        both are given. The "stats" and "contentfiles" entries are ignored;
        all remaining entries are forwarded to the constructor as options.

        The dictionary passed by the caller is not modified.

        Parameters
        ----------
        corpus_json : dict
            Specification.

        Returns
        -------
        Corpus
            Instantiated corpus.

        Raises
        ------
        clgen.UserError
            If the path is not a directory, if no cached content exists for
            the given ID, or if neither a path nor an ID is provided.
        """
        # Work on a shallow copy: only top-level keys are removed below, so
        # this is enough to leave the caller's dictionary untouched.
        spec = dict(corpus_json)

        path = spec.pop("path", None)
        uid = spec.pop("id", None)
        # "language" is deliberately left in the spec: it is also a
        # constructor option.
        language = clgen.Language.from_str(spec.get("language"))

        if path:
            path = unpack_directory_if_needed(fs.abspath(path))
            if not fs.isdir(path):
                raise clgen.UserError(
                    "Corpus path '{}' is not a directory".format(path))

            # The content ID of a path-based corpus is the directory hash,
            # overriding any "id" entry in the specification.
            dirhashcache = DirHashCache(clgen.cachepath("dirhash.db"), 'sha1')
            uid = prof.profile(dirhashcache.dirhash, path)
        elif uid:
            cache_path = clgen.mkcache("contentfiles",
                                       f"{language}-{uid}").path
            if not fs.isdir(cache_path):
                raise clgen.UserError(
                    "Corpus content {} not found".format(uid))
        else:
            raise clgen.UserError("No corpus path or ID provided")

        # These entries are not constructor options, so drop them if present.
        spec.pop("stats", None)
        spec.pop("contentfiles", None)

        return prof.profile(Corpus, uid, path=path, **spec)
示例#3
0
文件: cli.py 项目: yasutakawada/clgen
        def _main() -> None:
            """
            Refresh the model cache.

            For every directory in the "model" cache, the model is rebuilt
            from its "META" file. When the directory name differs from the
            hash of the rebuilt model, the directory is moved to the model's
            cache path. Refreshing corpuses and samplers is not implemented;
            a warning is logged for each.
            """
            cache_root = clgen.cachepath()

            log.warning("Not Implemented: refresh corpuses")

            if fs.isdir(cache_root, "model"):
                model_root = fs.path(cache_root, "model")
                for old_dir in fs.ls(model_root, abspaths=True):
                    old_id = fs.basename(old_dir)
                    meta_path = fs.path(old_dir, "META")
                    model = clgen.Model.from_json(jsonutil.read_file(meta_path))

                    # Directory name already matches the hash: nothing to do.
                    if old_id == model.hash:
                        continue

                    log.info(old_id, '->', model.hash)

                    # NOTE(review): presumably log.fatal() aborts, so the move
                    # below is never reached on a conflict - confirm.
                    if fs.isdir(model.cache.path):
                        log.fatal("cache conflict", file=sys.stderr)

                    fs.mv(old_dir, model.cache.path)

            log.warning("Not Implemented: refresh samplers")
示例#4
0
 def shorthash(self):
     """
     Short form of this object's hash.

     Delegates to clgen._shorthash(), passing the hash and the "corpus"
     cache path.
     """
     digest = self.hash
     cache_dir = clgen.cachepath("corpus")
     return clgen._shorthash(digest, cache_dir)
示例#5
0
    def __init__(self, contentid: str, path: str=None, **opts):
        """
        Instantiate a corpus.

        If this is a new corpus, a number of files will be created, which may
        take some time.

        Parameters
        ----------
        contentid : str
            ID of corpus content.
        path : str, optional
            Path to corpus.
        **opts
            Keyword options. Every key must be a key of DEFAULT_CORPUS_OPTS.

        Raises
        ------
        clgen.UserError
            If an unsupported option is given, or if no path is given and
            there is no cached content directory for this language and
            content ID.
        clgen.InternalError
            If the cached corpus metadata does not match the metadata of
            this instance.
        """
        # Validate options
        for key in opts:
            if key not in DEFAULT_CORPUS_OPTS:
                valid_keys = ','.join(sorted(DEFAULT_CORPUS_OPTS.keys()))
                raise clgen.UserError(
                    f"Unsupported corpus option '{key}'. "
                    f"Valid keys: {valid_keys}")

        self.opts = deepcopy(DEFAULT_CORPUS_OPTS)
        types.update(self.opts, opts)
        self.opts["id"] = contentid

        # check that contentid exists
        # NOTE: the language is read from the options given by the caller,
        # not from the options merged with the defaults.
        self.language = clgen.Language.from_str(opts.get("language"))
        content_key = f"{self.language}-{contentid}"
        if (path is None and
                not fs.isdir(clgen.cachepath("contentfiles", content_key))):
            raise clgen.UserError(f"corpus {content_key} not found")

        self.contentid = contentid
        self.contentcache = clgen.mkcache("contentfiles", content_key)
        self.kernels_db = self.contentcache.keypath('kernels.db')

        self.hash = self._hash(contentid, self.opts)
        self.cache = clgen.mkcache("corpus", f"{self.language}-{self.hash}")

        log.debug(f"contentfiles {self.contentid}")
        log.debug(f"corpus {self.hash}")

        # validate metadata against cache
        self.stats = {
            "preprocess_time": 0
        }
        meta = deepcopy(self.to_json())
        if self.cache.get("META"):
            cached_meta = jsonutil.read_file(self.cache["META"])

            # Restore the cached stats. If the cached metadata has none, keep
            # the defaults set above instead of failing with a KeyError.
            self.stats = cached_meta.pop("stats", self.stats)

            # "created" and "stats" differ between runs, so they are excluded
            # from the comparison on both sides.
            cached_meta.pop("created", None)
            del meta["created"]
            del meta["stats"]

            if meta != cached_meta:
                raise clgen.InternalError("corpus metadata mismatch")
        else:
            self._flush_meta()

        with self.lock.acquire(replace_stale=True):
            self._create_files(path)
示例#6
0
 def shorthash(self) -> str:
     """
     Short form of this object's hash.

     Delegates to clgen._shorthash(), passing the hash and the "sampler"
     cache path.
     """
     digest = self.hash
     cache_dir = clgen.cachepath("sampler")
     return clgen._shorthash(digest, cache_dir)
示例#7
0
 def shorthash(self):
     """
     Short form of this object's hash.

     Delegates to clgen._shorthash(), passing the hash and the "model"
     cache path.
     """
     digest = self.hash
     cache_dir = clgen.cachepath("model")
     return clgen._shorthash(digest, cache_dir)