def __init__(self, sampler_opts: dict, kernel_opts: dict):
    """
    Instantiate a sampler.

    Parameters
    ----------
    sampler_opts : dict
        Sampler options.
    kernel_opts : dict
        Kernel options.

    Raises
    ------
    clgen.UserError
        If an unrecognized sampler or kernel option key is passed.
    """
    def _hash(sampler_opts: dict, kernel_opts: dict) -> str:
        """Derive a deterministic sampler ID from the option values."""
        # we don't consider the number of samples in the ID, nor the
        # creation timestamp — neither changes what is sampled
        sampler_opts = deepcopy(sampler_opts)
        del sampler_opts["min_samples"]
        del sampler_opts["min_kernels"]
        del sampler_opts["created"]

        # sort so the checksum is independent of dict iteration order
        checksum_data = sorted(
            [str(x) for x in sampler_opts.values()] +
            [str(x) for x in kernel_opts.values()])
        string = "".join([str(x) for x in checksum_data])
        return crypto.sha1_str(string)

    def _start_text(args):
        # no argspec → fall back to a generic OpenCL kernel signature seed
        if args is None:
            return "__kernel void A("
        else:
            return serialize_argspec(args)

    # isinstance() rather than `type(x) is dict` so that dict subclasses
    # (e.g. OrderedDict) are accepted too
    assert isinstance(sampler_opts, dict)
    assert isinstance(kernel_opts, dict)

    # Validate options
    for key in sampler_opts.keys():
        if key not in DEFAULT_SAMPLER_OPTS:
            raise clgen.UserError(
                "Unsupported sampler option '{}'. Valid keys: {}".format(
                    key, ','.join(sorted(DEFAULT_SAMPLER_OPTS.keys()))))
    for key in kernel_opts.keys():
        if key not in DEFAULT_KERNELS_OPTS:
            raise clgen.UserError(
                "Unsupported kernels option '{}'. Valid keys: {}".format(
                    key, ','.join(sorted(DEFAULT_KERNELS_OPTS.keys()))))

    # set properties: user-supplied options layered over the defaults
    self.sampler_opts = types.update(deepcopy(DEFAULT_SAMPLER_OPTS),
                                     sampler_opts)
    self.kernel_opts = types.update(deepcopy(DEFAULT_KERNELS_OPTS),
                                    kernel_opts)

    self.hash = _hash(self.sampler_opts, self.kernel_opts)

    self.start_text = _start_text(self.kernel_opts["args"])

    # options to pass to preprocess_db()
    self.preprocess_opts = {
        "use_gpuverify": self.sampler_opts["gpuverify"]
    }
def __init__(self, contentid: str, path: str=None, **opts):
    """
    Instantiate a corpus.

    If this is a new corpus, a number of files will be created, which may
    take some time.

    Parameters
    ----------
    contentid : str
        ID of corpus content.
    path : str, optional
        Path to corpus.
    **opts
        Keyword options.

    Raises
    ------
    clgen.UserError
        If an unrecognized option key is passed, or no content files
        exist for `contentid` and no `path` was given.
    clgen.InternalError
        If the cached corpus metadata does not match the freshly
        computed metadata.
    """
    # Validate options
    for key in opts.keys():
        if key not in DEFAULT_CORPUS_OPTS:
            raise clgen.UserError(
                "Unsupported corpus option '{}'. Valid keys: {}".format(
                    key, ','.join(sorted(DEFAULT_CORPUS_OPTS.keys()))))

    self.opts = deepcopy(DEFAULT_CORPUS_OPTS)
    types.update(self.opts, opts)
    self.opts["id"] = contentid

    # Read the language from the merged options so the default from
    # DEFAULT_CORPUS_OPTS applies when the caller omits it.
    # (Previously read from the raw `opts`, which yields None when the
    # caller does not pass a "language" keyword.)
    self.language = clgen.Language.from_str(self.opts.get("language"))

    # check that contentid exists
    if (path is None and
        not fs.isdir(clgen.cachepath(
            "contentfiles", f"{self.language}-{contentid}"))):
        raise clgen.UserError("corpus {self.language}-{contentid} not found"
                              .format(**vars()))

    self.contentid = contentid
    self.contentcache = clgen.mkcache("contentfiles",
                                      f"{self.language}-{contentid}")
    self.kernels_db = self.contentcache.keypath('kernels.db')

    self.hash = self._hash(contentid, self.opts)
    self.cache = clgen.mkcache("corpus", f"{self.language}-{self.hash}")

    log.debug("contentfiles {self.contentid}".format(**vars()))
    log.debug("corpus {hash}".format(hash=self.hash))

    # validate metadata against cache
    self.stats = {
        "preprocess_time": 0
    }
    meta = deepcopy(self.to_json())
    if self.cache.get("META"):
        cached_meta = jsonutil.read_file(self.cache["META"])
        self.stats = cached_meta["stats"]  # restore stats

        # strip fields which legitimately differ between runs before
        # comparing cached metadata against the freshly computed copy
        if "created" in cached_meta:
            del cached_meta["created"]
            del meta["created"]

        if "stats" in cached_meta:
            del cached_meta["stats"]
            del meta["stats"]

        if meta != cached_meta:
            raise clgen.InternalError("corpus metadata mismatch")
    else:
        self._flush_meta()

    # file creation is guarded by a lock so concurrent processes do not
    # race on the shared cache directory
    with self.lock.acquire(replace_stale=True):
        self._create_files(path)
def __init__(self, sampler_opts: dict, kernel_opts: dict):
    """
    Instantiate a sampler.

    Parameters
    ----------
    sampler_opts : dict
        Sampler options.
    kernel_opts : dict
        Kernel options.

    Raises
    ------
    clgen.UserError
        If an unrecognized sampler or kernel option key is passed.
    """
    def _hash(sampler_opts: dict, kernel_opts: dict) -> str:
        """Derive a deterministic sampler ID from the option values."""
        # we don't consider the number of samples in the ID, nor the
        # creation timestamp — neither changes what is sampled
        sampler_opts = deepcopy(sampler_opts)
        del sampler_opts["min_samples"]
        del sampler_opts["min_kernels"]
        del sampler_opts["created"]

        # sort so the checksum is independent of dict iteration order
        checksum_data = sorted([str(x) for x in sampler_opts.values()] +
                               [str(x) for x in kernel_opts.values()])
        string = "".join([str(x) for x in checksum_data])
        return crypto.sha1_str(string)

    # FIXME(polyglot):
    def _start_text(lang: clgen.Language, args: Union[List[str], None],
                    start_text: str):
        # OpenCL models seed either from a serialized argument spec or a
        # generic kernel signature; other languages use the raw start_text
        if lang == clgen.Language.OPENCL:
            if args is None:
                return "__kernel void A("
            else:
                return serialize_opencl_argspec(args)
        else:
            return start_text or ""

    # isinstance() rather than `type(x) is dict` so that dict subclasses
    # (e.g. OrderedDict) are accepted too
    assert isinstance(sampler_opts, dict)
    assert isinstance(kernel_opts, dict)

    # Validate options
    for key in sampler_opts.keys():
        if key not in DEFAULT_SAMPLER_OPTS:
            raise clgen.UserError(
                "Unsupported sampler option '{}'. Valid keys: {}".format(
                    key, ','.join(sorted(DEFAULT_SAMPLER_OPTS.keys()))))
    for key in kernel_opts.keys():
        if key not in DEFAULT_KERNELS_OPTS:
            raise clgen.UserError(
                "Unsupported kernels option '{}'. Valid keys: {}".format(
                    key, ','.join(sorted(DEFAULT_KERNELS_OPTS.keys()))))

    # set properties: user-supplied options layered over the defaults
    self.sampler_opts = types.update(deepcopy(DEFAULT_SAMPLER_OPTS),
                                     sampler_opts)
    self.kernel_opts = types.update(deepcopy(DEFAULT_KERNELS_OPTS),
                                    kernel_opts)

    self.hash = _hash(self.sampler_opts, self.kernel_opts)

    self.language = clgen.Language.from_str(kernel_opts.get("language"))
    # NOTE(review): the `[]` fallback means a missing "args" key would
    # serialize an empty argspec rather than hit the `args is None`
    # default branch — confirm DEFAULT_KERNELS_OPTS always defines "args"
    self.start_text = _start_text(self.language,
                                  self.kernel_opts.get("args", []),
                                  self.kernel_opts.get("start_text", ""))

    # pop "start_text" option: it is folded into self.start_text and must
    # not participate in the kernel options dict from here on
    del self.kernel_opts["start_text"]

    # options to pass to preprocess_db()
    self.preprocess_opts = {
        "use_gpuverify": self.sampler_opts["gpuverify"]
    }
def __init__(self, corpus: clgen.Corpus, **opts):
    """
    Instantiate model.

    Parameters
    ----------
    corpus : clgen.Corpus
        Corpus instance.
    **opts
        Training options.

    Raises
    ------
    clgen.UserError
        If an unrecognized training option key is passed.
    clgen.InternalError
        If the cached model metadata does not match the freshly
        computed metadata.
    """
    assert(isinstance(corpus, clgen.Corpus))

    def _hash(corpus: clgen.Corpus, opts: dict) -> str:
        """ compute model hash """
        # neither the epoch count nor the creation timestamp changes a
        # model's identity, so exclude both from the hash
        hashopts = deepcopy(opts)
        del hashopts["created"]
        del hashopts["train_opts"]["epochs"]
        return crypto.sha1_list(corpus.hash, *types.dict_values(hashopts))

    # Validate options
    for key in opts:
        if key in DEFAULT_MODEL_OPTS:
            continue
        raise clgen.UserError(
            "Unsupported model option '{}'. Valid keys: {}".format(
                key, ','.join(sorted(DEFAULT_MODEL_OPTS.keys()))))

    # set properties: user-supplied options layered over the defaults
    self.opts = types.update(deepcopy(DEFAULT_MODEL_OPTS), opts)
    self.corpus = corpus
    self.hash = _hash(self.corpus, self.opts)
    self.cache = clgen.mkcache("model", f"{corpus.language}-{self.hash}")

    log.debug("model", self.hash)

    # fresh (empty) training statistics; replaced below if a cached
    # metadata file exists
    self.stats = {
        "epoch_times": [],
        "epoch_costs": [],
        "epoch_batches": []
    }

    meta = deepcopy(self.to_json())

    # no cached metadata yet: persist ours and we are done
    if not self.cache.get("META"):
        self._flush_meta()
        return

    # validate metadata against cache, and restore stats
    cached_meta = jsonutil.read_file(self.cache["META"])
    self.stats = cached_meta["stats"]  # restore stats

    # strip fields which legitimately differ between runs before
    # comparing cached metadata against the freshly computed copy
    if "created" in cached_meta:
        del cached_meta["created"]
        del meta["created"]

    if "created" in cached_meta["corpus"]:
        del cached_meta["corpus"]["created"]
        del meta["corpus"]["created"]

    if "stats" in cached_meta:
        del cached_meta["stats"]
        del meta["stats"]

    if "epochs" in cached_meta["train_opts"]:
        del cached_meta["train_opts"]["epochs"]
        del meta["train_opts"]["epochs"]

    if meta != cached_meta:
        log.error("Computed META:", jsonutil.format_json(meta))
        raise clgen.InternalError(
            "metadata mismatch in model %s" % self.cache["META"])