def process_cl_file(db_path: str, path: str) -> None:
    """
    Process OpenCL file.

    Arguments:
        db_path (str): Path to output database.
        path (str): Path to input file.

    Raises:
        FetchError: In case of IO error.
    """
    db = dbutil.connect(db_path)
    c = db.cursor()

    log.debug("fetch {path}".format(path=fs.abspath(path)))
    try:
        contents = inline_fs_headers(path, [])
    except IOError:
        raise FetchError(
            "cannot read file '{path}'".format(path=fs.abspath(path)))

    c.execute('INSERT OR IGNORE INTO ContentFiles VALUES(?,?)',
              (path, contents))
    db.commit()
    c.close()

def create_batches(self) -> None:
    """
    Create batches for training.
    """
    log.debug("creating batches")
    self.reset_batch_pointer()

    # generate a kernel corpus
    data = self._generate_kernel_corpus()

    # encode corpus into vocab indices
    self._tensor = self.atomizer.atomize(data)

    batch_size = self.batch_size
    seq_length = self.seq_length

    # set corpus size and number of batches
    self._size = len(self._tensor)
    self._num_batches = int(self.size / (batch_size * seq_length))
    if self.num_batches == 0:
        raise clgen.UserError(
            "Not enough data. Use a smaller seq_length and batch_size")

    # split into batches
    self._tensor = self._tensor[:self.num_batches * batch_size * seq_length]
    xdata = self._tensor
    ydata = np.copy(self._tensor)
    ydata[:-1] = xdata[1:]
    ydata[-1] = xdata[0]
    self._x_batches = np.split(xdata.reshape(batch_size, -1),
                               self.num_batches, 1)
    self._y_batches = np.split(ydata.reshape(batch_size, -1),
                               self.num_batches, 1)

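# --- Illustrative aside (not CLgen code) -----------------------------------
# A minimal, self-contained sketch of the input/target construction used in
# create_batches() above: the training targets are the inputs shifted left by
# one position, with the first token wrapped around to the end, so the network
# always learns to predict the next token. The array values are made up.
import numpy as np

xdata = np.array([5, 1, 8, 3, 9, 2])  # hypothetical encoded corpus
ydata = np.copy(xdata)
ydata[:-1] = xdata[1:]
ydata[-1] = xdata[0]
# xdata -> [5 1 8 3 9 2]
# ydata -> [1 8 3 9 2 5]   (each target is the token following its input)
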
def fetch_fs(db_path: str, paths: list = []) -> None:
    """
    Fetch from a list of files.

    Arguments:
        db_path (str): Output dataset.
        paths (str[]): List of file paths.
    """
    paths = clgen.files_from_list(paths)  # expand directories

    db = dbutil.connect(db_path)
    c = db.cursor()

    for path in paths:
        log.debug("fetch", path)
        try:
            contents = inline_fs_headers(path, [])
        except IOError:
            db.commit()
            raise FetchError(
                "cannot read file '{path}'".format(path=fs.abspath(path)))
        c.execute('INSERT OR IGNORE INTO ContentFiles VALUES(?,?)',
                  (path, contents))

    db.commit()

def _create_kernels_db(self, path: str, encoding: str = "default") -> None:
    """creates and caches kernels.db"""
    log.debug("creating database")

    # create a database and put it in the cache
    tmppath = fs.path(self.contentcache.path, "kernels.db.tmp")
    dbutil.create_db(tmppath)
    self.contentcache["kernels.db"] = tmppath

    # get a list of files in the corpus
    filelist = [f for f in fs.ls(path, abspaths=True, recursive=True)
                if fs.isfile(f)]

    # import files into database
    fetch.fetch_fs(self.contentcache["kernels.db"], filelist)

    # preprocess files
    preprocess.preprocess_db(self.contentcache["kernels.db"])

    # encode kernel db
    encode(self.contentcache["kernels.db"], encoding)

    # print database stats
    explore.explore(self.contentcache["kernels.db"])

def _create_txt(self) -> None:
    """creates and caches corpus.txt"""
    log.debug("creating corpus")

    # TODO: additional options in corpus JSON to accommodate EOF,
    # different encodings etc.
    tmppath = self.cache.keypath("corpus.txt.tmp")
    dbutil.dump_db(self.contentcache["kernels.db"], tmppath)
    self.cache["corpus.txt"] = tmppath

def _create_kernels_db(self, path: str) -> None:
    """creates and caches kernels.db"""
    log.debug("creating database")

    # create a database and put it in the cache
    tmppath = self.contentcache.keypath("kernels.db.tmp")
    dbutil.create_db(tmppath)
    self.contentcache["kernels.db"] = tmppath

    # get a list of files in the corpus
    filelist = [f for f in fs.ls(path, abspaths=True, recursive=True)
                if fs.isfile(f)]

    # import files into database
    clgen.fetch(self.contentcache["kernels.db"], filelist)

def _create_atomizer(self, vocab: str = "char") -> None:
    """creates and caches atomizer.pkl"""

    def _get_atomizer(corpus_txt: str, vocab: str = "char") -> 'clgen.Atomizer':
        """
        Get atomizer for a corpus.

        Parameters
        ----------
        corpus_txt : str
            Corpus.
        vocab : str, optional
            Vocabulary type.

        Returns
        -------
        clgen.Atomizer
            Atomizer.
        """
        atomizers = {
            "char": clgen.CharacterAtomizer,
            "greedy": clgen.GreedyAtomizer,
        }
        self.vocab_type = vocab
        atomizerclass = atomizers.get(vocab, None)
        if atomizerclass is None:
            raise clgen.UserError(
                "Unknown vocabulary type '{bad}'. "
                "Supported values: {good}".format(
                    bad=vocab, good=", ".join(sorted(atomizers.keys()))))
        else:
            return atomizerclass.from_text(corpus_txt)

    log.debug("creating vocab file")
    data = self._read_txt()

    self.atomizer = _get_atomizer(data, vocab)

    self.atoms = self.atomizer.atoms
    self.vocab_size = self.atomizer.vocab_size
    self.vocab = self.atomizer.vocab

    tmp_vocab_file = self.cache.keypath("atomizer.tmp.pkl")
    with open(tmp_vocab_file, 'wb') as f:
        pickle.dump(self.atomizer, f)

    self.cache["atomizer.pkl"] = tmp_vocab_file

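# --- Illustrative aside (not CLgen code) -----------------------------------
# A toy character-level "atomizer" showing the kind of text -> vocab-index
# mapping that clgen.CharacterAtomizer.from_text() provides to the corpus.
# The class ToyCharAtomizer and its internals are assumptions for
# illustration only, not CLgen's actual implementation.
class ToyCharAtomizer:
    def __init__(self, vocab: dict):
        self.vocab = vocab                    # char -> integer index
        self.atoms = sorted(vocab.keys())     # the vocabulary itself
        self.vocab_size = len(vocab)

    @classmethod
    def from_text(cls, text: str) -> 'ToyCharAtomizer':
        # one index per distinct character in the corpus
        return cls({c: i for i, c in enumerate(sorted(set(text)))})

    def atomize(self, text: str) -> list:
        return [self.vocab[c] for c in text]

# toy = ToyCharAtomizer.from_text("kernel void A() {}")
# toy.atomize("void")  -> list of integer indices, one per character
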
def to_dist(self, distpath: str, author: str = None) -> str:
    """
    Create a dist file.

    Arguments:
        distpath (str): Path to dist file.
        author (str, optional): Author name.

    Returns:
        str: Path to generated distfile.
    """
    outpath = fs.abspath(distpath) + ".tar.bz2"
    if fs.exists(outpath):
        raise DistError("file {} exists".format(outpath))

    meta = self.meta
    if author is not None:
        meta["author"] = author
    log.debug(clgen.format_json(meta))

    try:
        tar = tarfile.open(outpath, 'w:bz2')

        # write meta
        metapath = mktemp(prefix="clgen-", suffix=".json")
        clgen.write_file(metapath, clgen.format_json(meta))
        log.debug("metafile:", metapath)

        # create tarball
        tar.add(metapath, arcname="meta.json")

        # pack contents:
        for path in meta["contents"]:
            abspath = fs.path(cache.ROOT, path)
            log.verbose("packing", abspath)
            tar.add(abspath, arcname=fs.path("contents", path))

        # tidy up
        fs.rm(metapath)
        tar.close()
    except Exception as e:
        tar.close()
        fs.rm(metapath)
        fs.rm(outpath)
        raise e

    return outpath

def _create_atomizer(self, vocab: str = "char") -> None:
    """creates and caches atomizer.pkl"""
    log.debug("creating vocab file")

    data = self._read_txt()

    self.atomizer = get_atomizer(data, vocab)

    self.atoms = self.atomizer.atoms
    self.vocab_size = self.atomizer.vocab_size
    self.vocab = self.atomizer.vocab

    tmp_vocab_file = fs.path(self.cache.path, "atomizer.tmp.pkl")
    with open(tmp_vocab_file, 'wb') as f:
        cPickle.dump(self.atomizer, f)

    self.cache["atomizer.pkl"] = tmp_vocab_file

def _preprocess_db_worker(job: dict) -> None:
    """Database worker thread"""
    db_path = job["db_in"]
    db_index_range = job["db_index_range"]
    outpath = job["json_out"]
    log.debug("worker", os.getpid(), outpath)

    db = dbutil.connect(db_path)
    c = db.cursor()
    split_start, split_end = db_index_range
    split_size = split_end - split_start

    # get the files to preprocess
    c.execute('SELECT id,contents FROM ContentFiles LIMIT {} OFFSET {}'.format(
        split_size, split_start))

    with open(outpath, 'wb') as outfile:
        for row in c.fetchall():
            id, contents = row

            # Get checksum of cached file:
            c.execute('SELECT id FROM PreprocessedFiles WHERE id=?', (id,))
            result = c.fetchone()
            cached_id = result[0] if result else None

            # Check that file is modified:
            if id != cached_id:
                try:
                    # Try and preprocess it:
                    contents = preprocess(contents, id)
                    status = 0
                except BadCodeException as e:
                    contents = str(e)
                    status = 1
                except UglyCodeException as e:
                    contents = str(e)
                    status = 2

                # write result to json
                line = json.dumps([id, status, contents]).encode('utf-8')
                outfile.write(line)
                outfile.write('\n'.encode('utf-8'))

    c.close()
    db.close()

def stats_worker(db_path: str) -> list:
    """
    Generate dataset stats.
    """
    log.debug("stats worker ...")
    db = dbutil.connect(db_path)
    c = db.cursor()
    stats = []

    # ContentFiles
    c.execute("SELECT Count(DISTINCT id) from ContentFiles")
    nb_uniq_ocl_files = c.fetchone()[0]
    stats.append(('Number of content files', bigint(nb_uniq_ocl_files)))

    c.execute("SELECT contents FROM ContentFiles")
    code = c.fetchall()
    code_lcs = [len(x[0].split('\n')) for x in code]
    code_lcs.sort()
    code_lc = sum(code_lcs)
    stats.append(('Total content line count', bigint(code_lc)))

    stats.append(('Content file line counts', seq_stats(code_lcs)))
    stats.append(('', ''))

    # Preprocessed
    c.execute("SELECT Count(*) FROM PreprocessedFiles WHERE status=0")
    nb_pp_files = c.fetchone()[0]
    ratio_pp_files = div(nb_pp_files, nb_uniq_ocl_files)
    stats.append(('Number of good preprocessed files',
                  bigint(nb_pp_files) +
                  ' ({:.0f}%)'.format(ratio_pp_files * 100)))

    c.execute('SELECT contents FROM PreprocessedFiles WHERE status=0')
    bc = c.fetchall()
    pp_lcs = [len(x[0].split('\n')) for x in bc]
    pp_lcs.sort()
    pp_lc = sum(pp_lcs)
    ratio_pp_lcs = div(pp_lc, code_lc)
    stats.append(('Lines of good preprocessed code',
                  bigint(pp_lc) + ' ({:.0f}%)'.format(ratio_pp_lcs * 100)))

    stats.append(('Good preprocessed line counts', seq_stats(pp_lcs)))
    stats.append(('', ''))

    return stats

def _finalize(db_path, cache):
    """Tidy up after worker threads finish"""
    log.debug("worker finalize")

    db = dbutil.connect(db_path)
    c = db.cursor()

    # import results from worker threads
    for outpath in fs.ls(cache.path, abspaths=True):
        with open(outpath) as infile:
            for line in infile:
                c.execute('INSERT OR REPLACE INTO PreprocessedFiles '
                          'VALUES(?,?,?)', json.loads(line))

    # write changes to database and remove cache
    db.commit()
    db.close()
    cache.empty()

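# --- Illustrative aside (not CLgen code) -----------------------------------
# Self-contained sketch of the JSON-lines handoff between
# _preprocess_db_worker() and _finalize() above: each worker appends one JSON
# array [id, status, contents] per line, and the finalizer loads each line
# back as the parameter tuple for the 3-column INSERT. The file name and
# record values below are illustrative only.
import json

records = [["kernel-1", 0, "kernel void A() {}"],
           ["kernel-2", 1, "error: compilation terminated"]]

with open("worker.json", "w") as outfile:
    for record in records:
        outfile.write(json.dumps(record) + "\n")

with open("worker.json") as infile:
    rows = [json.loads(line) for line in infile]

assert rows == records
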
def __setitem__(self, key: str, value: str) -> None:
    """
    Emplace file in cache.

    Arguments:
        key (str): Key.
        value (str): Path of file to insert in cache.

    Raises:
        clgen.File404: If "value" does not exist.
    """
    assert isinstance(key, string_types)
    assert isinstance(value, string_types)

    clgen.must_exist(value, error=clgen.File404)
    path = self.keypath(key)
    move(value, path)
    log.debug("cached {path}".format(path=path))

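# --- Illustrative aside (not CLgen code) -----------------------------------
# A toy cache showing the move-into-cache pattern that __setitem__ above
# implements, and that the corpus code relies on when it assigns a temporary
# path to cache["kernels.db"], cache["corpus.txt"], etc.: a key maps to a
# fixed path inside the cache directory, and setting the key moves the given
# file to that path. ToyCache is an assumption, not clgen's Cache class.
import os
import shutil

class ToyCache:
    def __init__(self, root: str):
        self.path = root
        os.makedirs(root, exist_ok=True)

    def keypath(self, key: str) -> str:
        return os.path.join(self.path, key)

    def __setitem__(self, key: str, value: str) -> None:
        if not os.path.exists(value):
            raise FileNotFoundError(value)
        shutil.move(value, self.keypath(key))

# cache = ToyCache("/tmp/toy-cache")
# cache["corpus.txt"] = "/tmp/corpus.txt.tmp"   # hypothetical temp file
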
def __init__(self, corpus: Corpus, **opts):
    """
    Instantiate model.

    Arguments:
        corpus (Corpus): Corpus instance.
        opts (dict): Training options.
    """
    assert isinstance(corpus, Corpus)

    # Validate options
    for key in opts.keys():
        if key not in DEFAULT_MODEL_OPTS:
            raise clgen.UserError(
                "Unsupported model option '{}'. Valid keys: {}".format(
                    key, ','.join(sorted(DEFAULT_MODEL_OPTS.keys()))))

    # set properties
    self.opts = clgen.update(deepcopy(DEFAULT_MODEL_OPTS), opts)
    self.corpus = corpus
    self.hash = self._hash(self.corpus, self.opts)
    self.cache = Cache(fs.path("model", self.hash))

    log.debug("model", self.hash)

def __init__(self, contentid: str, path: str = None, **opts):
    """
    Instantiate a corpus.

    If this is a new corpus, a number of files will be created, which may
    take some time.

    Parameters
    ----------
    contentid : str
        ID of corpus content.
    path : str, optional
        Path to corpus.
    **opts
        Keyword options.
    """
    # Validate options
    for key in opts.keys():
        if key not in DEFAULT_CORPUS_OPTS:
            raise clgen.UserError(
                "Unsupported corpus option '{}'. Valid keys: {}".format(
                    key, ','.join(sorted(DEFAULT_CORPUS_OPTS.keys()))))

    self.opts = deepcopy(DEFAULT_CORPUS_OPTS)
    types.update(self.opts, opts)
    self.opts["id"] = contentid

    # check that contentid exists
    self.language = clgen.Language.from_str(opts.get("language"))
    if (path is None and
        not fs.isdir(clgen.cachepath("contentfiles",
                                     f"{self.language}-{contentid}"))):
        raise clgen.UserError(
            "corpus {self.language}-{contentid} not found".format(**vars()))

    self.contentid = contentid
    self.contentcache = clgen.mkcache("contentfiles",
                                      f"{self.language}-{contentid}")
    self.kernels_db = self.contentcache.keypath('kernels.db')

    self.hash = self._hash(contentid, self.opts)
    self.cache = clgen.mkcache("corpus", f"{self.language}-{self.hash}")

    log.debug("contentfiles {self.contentid}".format(**vars()))
    log.debug("corpus {hash}".format(hash=self.hash))

    # validate metadata against cache
    self.stats = {
        "preprocess_time": 0
    }
    meta = deepcopy(self.to_json())
    if self.cache.get("META"):
        cached_meta = jsonutil.read_file(self.cache["META"])
        self.stats = cached_meta["stats"]  # restore stats

        if "created" in cached_meta:
            del cached_meta["created"]
        del meta["created"]

        if "stats" in cached_meta:
            del cached_meta["stats"]
        del meta["stats"]

        if meta != cached_meta:
            raise clgen.InternalError("corpus metadata mismatch")
    else:
        self._flush_meta()

    with self.lock.acquire(replace_stale=True):
        self._create_files(path)

def train(self, quiet: bool = False) -> None:
    """
    Train model.
    """
    tf = self._init_tensorflow(infer=False)

    # training options
    learning_rate = self.train_opts["learning_rate"]
    decay_rate = self.train_opts["lr_decay_rate"]
    checkpoint_path = fs.path(self.cache.path, "model.ckpt")

    # resume from prior checkpoint
    ckpt_path, ckpt_paths = None, None
    if self.checkpoint_path:
        # check if all necessary files exist
        assert fs.isdir(self.checkpoint_path)
        ckpt = tf.train.get_checkpoint_state(self.checkpoint_path)
        assert ckpt
        assert ckpt.model_checkpoint_path
        ckpt_path, ckpt_paths = self._get_params_path(ckpt)

    with tf.Session() as sess:
        tf.global_variables_initializer().run()

        # keep all checkpoints
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)

        # restore model from closest checkpoint
        if ckpt_path:
            log.debug("restoring", ckpt_path)
            saver.restore(sess, ckpt_path)
            log.info("restored checkpoint {}".format(ckpt_path))

        # make sure we don't lose track of other checkpoints
        if ckpt_paths:
            saver.recover_last_checkpoints(ckpt_paths)

        start_batch = sess.run(self.epoch) * self.corpus.num_batches
        batch_count = 0
        total_elapsed = 0
        total_atomize = 0
        total_checkpoint, avg_checkpoint = 0, 0
        eta_d, eta_h, eta_m = 0, 0, 0

        for e in range(sess.run(self.epoch) + 1, self.epochs + 1):
            if quiet:
                log.info("epoch", e, "of", self.epochs + 1)

            # decay and set learning rate
            new_learning_rate = learning_rate * (
                (float(100 - decay_rate) / 100.0) ** (e - 1))
            sess.run(tf.assign(self.learning_rate, new_learning_rate))
            sess.run(tf.assign(self.epoch, e))

            time_start = time.time()
            self.corpus.create_batches()
            total_atomize += time.time() - time_start
            avg_atomize = total_atomize / e

            state = sess.run(self.initial_state)

            for b in range(self.corpus.num_batches):
                time_start = time.time()
                batch_count += 1
                x, y = self.corpus.next_batch()
                feed = {self.input_data: x, self.targets: y}
                for i, (c, h) in enumerate(self.initial_state):
                    feed[c] = state[i].c
                    feed[h] = state[i].h
                train_loss, state, _ = sess.run(
                    [self.cost, self.final_state, self.train_op], feed)

                batch_num = (e - 1) * self.corpus.num_batches + b
                max_batch = self.epochs * self.corpus.num_batches
                progress = float((batch_num + 1 - start_batch) /
                                 (max_batch - start_batch))

                time_end = time.time()
                elapsed = time_end - time_start
                if not quiet:
                    total_elapsed += elapsed
                    avg_elapsed = total_elapsed / batch_count
                    remaining_time = (
                        (max_batch - batch_count) * avg_elapsed +    # batches
                        (self.epochs - e) * avg_atomize +            # atomizings
                        (self.epochs - e) * avg_checkpoint)          # checkpoints
                    eta_h, eta_m = divmod(remaining_time / 60, 60)
                    eta_d, eta_h = divmod(eta_h, 24)

                    print("\r\033[K"
                          "{progress:3.1f}% | "
                          "{size}x{layers}x{max_epoch} {model} | "
                          "epoch={epoch_num}/{max_epoch} | "
                          "batch={batch_num}/{max_batch} | "
                          "lr={lr:.5f} | "
                          "loss={tloss:.3f} | "
                          "t1={time_atomize:.3f}s "
                          "t2={time_batch:.3f}s "
                          "t3={time_checkpoint:.3f}s | "
                          "eta={eta_d}d{eta_h}h{eta_m:02d}m".format(
                              size=self.rnn_size,
                              layers=self.num_layers,
                              model=self.model_type.upper(),
                              progress=progress * 100,
                              epoch_num=e,
                              max_epoch=self.epochs,
                              batch_num=b + 1,
                              max_batch=self.corpus.num_batches,
                              lr=new_learning_rate,
                              tloss=train_loss,
                              time_atomize=avg_atomize,
                              time_batch=avg_elapsed,
                              time_checkpoint=avg_checkpoint,
                              eta_d=int(eta_d),
                              eta_h=int(eta_h),
                              eta_m=int(eta_m)),
                          end="")

            save = self.opts["train_opts"]["intermediate_checkpoints"]
            save |= e == self.epochs  # last epoch
            if save:
                if not quiet:
                    print()
                time_start = time.time()
                saver.save(sess, checkpoint_path, global_step=batch_num)
                total_checkpoint += time.time() - time_start
                avg_checkpoint = total_checkpoint / e
                log.info("model saved to {}".format(checkpoint_path))

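# --- Illustrative aside (not CLgen code) -----------------------------------
# Self-contained sketch of the percentage-based learning rate decay used in
# train() above: each epoch multiplies the base rate by (100 - decay_rate)%.
# The numbers below are illustrative, not CLgen's defaults.
learning_rate = 0.002
decay_rate = 5  # percent per epoch

for e in range(1, 6):
    new_learning_rate = learning_rate * ((float(100 - decay_rate) / 100.0) ** (e - 1))
    print("epoch {}: lr = {:.6f}".format(e, new_learning_rate))
# epoch 1: lr = 0.002000
# epoch 2: lr = 0.001900
# epoch 3: lr = 0.001805
# ...
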
def empty(self) -> None:
    """
    Empty the filesystem cache.
    """
    log.debug("empty cache {path}".format(path=self.path))
    fs.rm(self.path)

def __init__(self, contentid: str, path: str = None, **opts):
    """
    Instantiate a corpus.

    If this is a new corpus, a number of files will be created, which may
    take some time.

    Arguments:
        contentid (str): ID of corpus content.
        path (str, optional): Path to corpus.
        **opts: Keyword options.
    """

    def _init_error(err: Exception) -> None:
        """ tidy up in case of error """
        log.error("corpus creation failed. Deleting corpus files")
        paths = [
            fs.path(self.contentcache.path, "kernels.db"),
            fs.path(self.cache.path, "corpus.txt"),
            fs.path(self.cache.path, "tensor.npy"),
            fs.path(self.cache.path, "atomizer.pkl")
        ]
        for path in paths:
            if fs.exists(path):
                log.info("removing", path)
                fs.rm(path)
        raise err

    # Validate options
    for key in opts.keys():
        if key not in DEFAULT_CORPUS_OPTS:
            raise clgen.UserError(
                "Unsupported corpus option '{}'. Valid keys: {}".format(
                    key, ','.join(sorted(DEFAULT_CORPUS_OPTS.keys()))))

    self.opts = deepcopy(DEFAULT_CORPUS_OPTS)
    clgen.update(self.opts, opts)
    self.contentid = contentid
    self.hash = self._hash(contentid, self.opts)
    self.cache = Cache(fs.path("corpus", self.hash))
    self.contentcache = Cache(fs.path("contentfiles", contentid))
    self.kernels_db = self.contentcache['kernels.db']

    log.debug("corpus {hash}".format(hash=self.hash))

    try:
        if path is not None:
            if not fs.isdir(path):
                raise clgen.UserError(
                    "Corpus path '{}' is not a directory".format(path))
            # create kernels database if necessary
            if not self.contentcache["kernels.db"]:
                self._create_kernels_db(path, self.opts["encoding"])
                assert self.contentcache["kernels.db"]

        # create corpus text if not exists
        if not self.cache["corpus.txt"]:
            self._create_txt()
            assert self.cache["corpus.txt"]

        # create atomizer if needed
        if self.cache["atomizer.pkl"]:
            self._load_atomizer()
            assert self.cache["atomizer.pkl"]
        else:
            self._create_atomizer(self.opts["vocabulary"])
    except Exception as e:
        _init_error(e)

def inline_fs_headers(path: Path, stack: List[str],
                      lang: clgen.Language = clgen.Language.OPENCL,
                      topdir: Path = None) -> str:
    """
    Recursively inline headers in file.

    Parameters
    ----------
    path : Path
        File.
    stack : List[str]
        File stack.
    lang : clgen.Language
        Source language, used to select the include regexp and comment style.
    topdir : Path
        The top level directory to stop searching for includes in.

    Returns
    -------
    str
        Inlined file.
    """
    stack.append(path)

    if topdir is None:
        topdir = fs.dirname(path)
    # shell escaped top directory
    escp_topdir = topdir.replace('"', '\\"')

    include_re = clgen.include_regexp(lang)

    with open(path, encoding="utf-8") as infile:
        src = infile.read()

    outlines = []
    for line in src.split('\n'):
        match = re.match(include_re, line)
        if match:
            # We have an import to inline!
            include = match.group("path")

            # Search for files with that name in the repository
            include_basename = fs.basename(include)
            esc_basename = include_basename.replace('"', '\\"')
            candidates = [x for x in subprocess.check_output(
                f'find "{escp_topdir}" -type f -name {esc_basename}',
                shell=True, universal_newlines=True).split('\n') if x]

            # Select which file to inline:
            if len(candidates) == 1:
                # If there's exactly one match, then we're done:
                file_to_inline = candidates[0]
            elif len(candidates) > 1:
                # We have multiple candidates to inline, so we'll compare the
                # full paths (relative to the top directory) to select the one
                # whose name is the closest match:
                rel_matches = [match[len(topdir) + 1:] for match in candidates]
                distances = [editdistance.eval(include, path)
                             for path in rel_matches]
                min_distance = min(distances)
                file_to_inline = candidates[distances.index(min_distance)]
                log.debug(f"Inferred include '{file_to_inline}' from '{line}' "
                          f"with distance {min_distance}")
            else:
                # We didn't find anything suitable:
                file_to_inline = None

            # Process the inline file:
            if file_to_inline in stack:
                # We've already inlined this file, so ignore it:
                outlines.append(clgen.format_as_comment(
                    lang, f'[FETCH] ignored_include({line})'))
            elif file_to_inline:
                # Inline the file by recursively expanding its contents:
                outlines.append(clgen.format_as_comment(
                    lang, f'[FETCH] begin_include({line})'))
                inline_src = inline_fs_headers(file_to_inline, stack)
                outlines.append(inline_src)
                outlines.append(clgen.format_as_comment(
                    lang, f'[FETCH] end_include({line})'))
            else:
                # We didn't find anything suitable, so keep the original
                # include:
                outlines.append(clgen.format_as_comment(
                    lang, f'[FETCH] not_found({line})'))
                outlines.append(line)
        else:
            outlines.append(line)

    return '\n'.join(outlines)

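# --- Illustrative aside (not CLgen code) -----------------------------------
# Self-contained sketch of the closest-match selection used above: when
# several files share the included header's basename, the candidate whose
# repository-relative path has the smallest edit distance to the include
# string wins. The paths below are made up for illustration.
import editdistance

include = "util/common.h"
candidates = ["src/util/common.h", "third_party/common.h", "docs/common.h"]
rel_matches = [c.split("/", 1)[1] for c in candidates]  # strip leading dir

distances = [editdistance.eval(include, rel) for rel in rel_matches]
best = candidates[distances.index(min(distances))]
# best == "src/util/common.h", since "util/common.h" matches exactly
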
def _locked_train(self) -> 'Model':
    tf = self._init_tensorflow(infer=False)

    # training options
    learning_rate = self.train_opts["learning_rate"]
    decay_rate = self.train_opts["lr_decay_rate"]

    # resume from prior checkpoint
    ckpt_path, ckpt_paths = None, None
    if self.checkpoint_path:
        # check that all necessary files exist
        assert fs.isdir(self.checkpoint_path)
        ckpt = tf.train.get_checkpoint_state(self.checkpoint_path)
        assert ckpt
        assert ckpt.model_checkpoint_path
        ckpt_path, ckpt_paths = self._get_params_path(ckpt)

    with tf.Session() as sess:
        tf.global_variables_initializer().run()

        # keep all checkpoints
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)

        # restore model from closest checkpoint
        if ckpt_path:
            log.debug("restoring", ckpt_path)
            saver.restore(sess, ckpt_path)
            log.verbose("restored checkpoint {}".format(ckpt_path))

        # make sure we don't lose track of other checkpoints
        if ckpt_paths:
            saver.recover_last_checkpoints(ckpt_paths)

        coord = tf.train.Coordinator()
        self.corpus.create_batches()
        threading.Thread(target=self.enqueue_x, args=(coord, sess)).start()

        max_batch = self.epochs * self.corpus.num_batches

        # progress bar
        bar = progressbar.ProgressBar(max_value=max_batch)

        if sess.run(self.epoch) != self.epochs:
            log.info("training", self)

        for e in range(sess.run(self.epoch) + 1, self.epochs + 1):
            epoch_start = time()

            # decay and set learning rate
            new_learning_rate = learning_rate * (
                (float(100 - decay_rate) / 100.0) ** (e - 1))
            sess.run(tf.assign(self.learning_rate, new_learning_rate))
            sess.run(tf.assign(self.epoch, e))

            for b in range(self.corpus.num_batches):
                train_cost, _, state, _ = sess.run(
                    [self.cost, self.KL_cost, self.final_state, self.train_op])
                # update progress bar
                batch_num = (e - 1) * self.corpus.num_batches + b
                bar.update(batch_num)

            save = self.opts["train_opts"]["intermediate_checkpoints"]
            save |= e == self.epochs  # always save on last epoch
            if save:
                saver.save(sess, self.cache.keypath("model.ckpt"),
                           global_step=batch_num)

                next_checkpoint = e * self.corpus.num_batches + b
                max_epoch = self.epochs
                log.verbose("\n{self} epoch {e} / {max_epoch}. "
                            "next checkpoint at batch {next_checkpoint}"
                            .format(**vars()))

            # update training time
            epoch_duration = time() - epoch_start
            self.stats["epoch_costs"].append(float(train_cost))
            self.stats["epoch_times"].append(epoch_duration)
            self.stats["epoch_batches"].append(batch_num + 1)
            self._flush_meta()

        coord.request_stop()

    return self

def __init__(self, corpus: clgen.Corpus, **opts):
    """
    Instantiate model.

    Parameters
    ----------
    corpus : clgen.Corpus
        Corpus instance.
    **opts
        Training options.
    """
    assert isinstance(corpus, clgen.Corpus)

    def _hash(corpus: clgen.Corpus, opts: dict) -> str:
        """ compute model hash """
        hashopts = deepcopy(opts)
        del hashopts["created"]
        del hashopts["train_opts"]["epochs"]
        return crypto.sha1_list(corpus.hash, *types.dict_values(hashopts))

    # Validate options
    for key in opts:
        if key not in DEFAULT_MODEL_OPTS:
            raise clgen.UserError(
                "Unsupported model option '{}'. Valid keys: {}".format(
                    key, ','.join(sorted(DEFAULT_MODEL_OPTS.keys()))))

    # set properties
    self.opts = types.update(deepcopy(DEFAULT_MODEL_OPTS), opts)
    self.corpus = corpus
    self.hash = _hash(self.corpus, self.opts)
    self.cache = clgen.mkcache("model", f"{corpus.language}-{self.hash}")

    log.debug("model", self.hash)

    # validate metadata against cache, and restore stats
    self.stats = {
        "epoch_times": [],
        "epoch_costs": [],
        "epoch_batches": []
    }
    meta = deepcopy(self.to_json())
    if self.cache.get("META"):
        cached_meta = jsonutil.read_file(self.cache["META"])
        self.stats = cached_meta["stats"]  # restore stats

        if "created" in cached_meta:
            del cached_meta["created"]
        del meta["created"]

        if "created" in cached_meta["corpus"]:
            del cached_meta["corpus"]["created"]
        del meta["corpus"]["created"]

        if "stats" in cached_meta:
            del cached_meta["stats"]
        del meta["stats"]

        if "epochs" in cached_meta["train_opts"]:
            del cached_meta["train_opts"]["epochs"]
        del meta["train_opts"]["epochs"]

        if meta != cached_meta:
            log.error("Computed META:", jsonutil.format_json(meta))
            raise clgen.InternalError(
                "metadata mismatch in model %s" % self.cache["META"])
    else:
        self._flush_meta()

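# --- Illustrative aside (not CLgen code) -----------------------------------
# Minimal sketch of the content-addressed cache key idea behind _hash() above:
# hash the corpus identifier together with every option value that affects
# training, so any change in configuration yields a new model cache entry.
# The helper sha1_of() is an illustrative stand-in, not clgen.crypto.sha1_list.
import hashlib
import json

def sha1_of(*values) -> str:
    h = hashlib.sha1()
    for value in values:
        h.update(json.dumps(value, sort_keys=True).encode("utf-8"))
    return h.hexdigest()

opts = {"rnn_size": 512, "num_layers": 2,
        "train_opts": {"learning_rate": 0.002}}
model_key = sha1_of("corpus-hash-placeholder", opts)
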