def main():
    """Train a model from a JSON spec and sample the 20 most common prototypes."""
    log.init(verbose=True)

    m = model.from_json(clgen.load_json_file(sys.argv[1]))
    c = corpus.Corpus.from_json({"path": "~/data/github"})

    print("CLgen: ", clgen.version())
    print("Corpus size:", c.size)
    print("Vocab size: ", c.vocab_size)

    m.train()

    prototypes, _ = corpus.most_common_prototypes(c, 20)
    for index, row in enumerate(prototypes):
        outpath = "./inference-p" + str(index + 1) + "-" + fs.basename(sys.argv[1])
        if fs.exists(outpath):
            # Result already on disk; resume-friendly.
            continue

        _, prototype = row
        # Drop the trailing identifier from each argument, keeping qualifiers + type.
        argspec = [' '.join(component.split()[:-1])
                   for component in prototype.split(',')]
        print("argspec", ','.join(str(a) for a in argspec))

        s = sampler.from_json({
            "kernels": {
                "args": argspec,
                "max_length": 5000
            },
            "sampler": {
                "batch_size": 2000,
                "max_batches": 1,
                "static_checker": False,
                "dynamic_checker": False
            }
        })

        info = evaluate(m, s)
        clgen.write_file(outpath, clgen.format_json(info))
def meta(self) -> dict:
    """
    Get trained model metadata.

    Format spec: https://github.com/ChrisCummins/clgen/issues/25

    Returns:
        dict: Metadata.
    """
    # checksum corpus and model cache files. Paths are relative to cache
    # root. Escape ROOT so regex metacharacters in the path (e.g. '.')
    # cannot mangle the substitution.
    cache_root_re = r'^' + re.escape(cache.ROOT) + '/'
    corpus_files = dict(
        (re.sub(cache_root_re, "", x), clgen.checksum_file(x))
        for x in fs.ls(self.corpus.cache.path, abspaths=True))
    model_files = dict(
        (re.sub(cache_root_re, "", x), clgen.checksum_file(x))
        for x in fs.ls(self.cache.path, abspaths=True))

    contents = corpus_files.copy()
    contents.update(model_files)

    _meta = deepcopy(self.opts)
    _meta["version"] = clgen.version()
    _meta["date_packaged"] = labtime.nowstr()
    # BUG FIX: a stray trailing comma previously made this a 1-tuple
    # containing the metadata dict, breaking the format spec above.
    _meta["corpus"] = self.corpus.meta
    _meta["contents"] = contents

    return _meta
def get_all_sampler_datasets(all_clgen_versions: bool=True) -> list:
    """Collect paths to every sampler kernels.db found in the cache."""
    if all_clgen_versions:
        candidates = fs.ls(fs.path("~/.cache/clgen"), abspaths=True)
    else:
        candidates = [fs.path("~/.cache/clgen", clgen.version())]

    # Only version directories that actually contain a sampler cache.
    sampler_roots = [d for d in candidates if fs.isdir(d, "sampler")]

    datasets = []
    for versiondir in sampler_roots:
        for samplerdir in fs.ls(fs.path(versiondir, "sampler"), abspaths=True):
            dbpath = fs.path(samplerdir, "kernels.db")
            if fs.isfile(dbpath):
                datasets.append(dbpath)
    return datasets
def set_version_meta(path: str, version: str=clgen.version()) -> None:
    """
    Store a "version" entry in a database's Meta table.

    Useful for stamping datasets with version requirements, e.g. a database
    schema tied to a particular CLgen release, or one expected to change in
    the future.

    Parameters
    ----------
    path : str
        Path to database.
    version : str, optional
        Version value to record (defaults to the running CLgen version,
        captured at import time).
    """
    set_meta(path, "version", version)
def version_meta_matches(path: str, version: str=clgen.version()) -> bool:
    """
    Check whether a database's stored "version" matches an expected value.

    A database with no "version" key in its Meta table never matches.

    Parameters
    ----------
    path : str
        Path to database.
    version : str, optional
        Expected version (defaults to the running CLgen version).

    Returns
    -------
    bool
        True on an exact match, otherwise False.
    """
    stored = get_meta(path, "version")
    return stored == version
def main():
    """Train a model from a JSON spec and sample common prototypes, with progress output."""
    log.init(verbose=True)

    m = model.from_json(clgen.load_json_file(sys.argv[1]))
    c = corpus.Corpus.from_json({"path": "~/data/github"})

    print("CLgen: ", clgen.version())
    print("Corpus size:", c.size)
    print("Vocab size: ", c.vocab_size)

    m.train()

    prototypes, _ = corpus.most_common_prototypes(c, 20)
    for index, row in enumerate(prototypes):
        outpath = "./inference-p" + str(index + 1) + "-" + fs.basename(sys.argv[1])
        if fs.exists(outpath):
            # Result already on disk; resume-friendly.
            print("skipped result for", outpath)
            continue
        print("starting result for", outpath)

        _, prototype = row
        # Drop the trailing identifier from each argument, keeping qualifiers + type.
        argspec = [' '.join(component.split()[:-1])
                   for component in prototype.split(',')]
        print("argspec", ','.join(str(a) for a in argspec))

        s = sampler.from_json({
            "kernels": {
                "args": argspec,
                "max_length": 5000
            },
            "sampler": {
                "batch_size": 2000,
                "max_batches": 1,
                "static_checker": False,
                "dynamic_checker": False
            }
        })

        info = evaluate(m, s)
        clgen.write_file(outpath, clgen.format_json(info))
from time import time from typing import Iterable, List, Tuple import clgen from clgen import dbutil from clgen import features from clgen import log # Default options used for corpus. Any values provided by the user will override # these defaults. DEFAULT_CORPUS_OPTS = { "created": { "author": clgen.get_default_author(), "date": str(datetime.now()), "version": clgen.version(), }, "eof": False, "batch_size": 50, "seq_length": 50, "vocabulary": "char", "encoding": "default", "preprocess": True, "preserve_order": False, "language": None, # Note no explicit default language. } class FeaturesError(clgen.CLgenError): """ Thrown in case of error during features encoding.
def print_version_and_exit():
    """
    Print the clgen version. This function does not return.
    """
    # Local import keeps this fix self-contained within the block.
    import sys

    print("clgen ", clgen.version())
    # BUG FIX: use sys.exit() rather than the site-injected exit() builtin,
    # which is not guaranteed to exist (e.g. under `python -S`).
    sys.exit(0)
# You should have received a copy of the GNU General Public License # along with CLgen. If not, see <http://www.gnu.org/licenses/>. # """ CLgen persistent cache mechanism. """ import re from labm8 import fs from shutil import move from six import string_types import clgen from clgen import log ROOT = fs.path("~", ".cache", "clgen", clgen.version()) class Cache404(clgen.File404): """ Error thrown for cache misses. """ pass class Cache(clgen.CLgenObject): """ Persistent filesystem cache. """ def __init__(self, name: str): """
def main(self, args: List[str]=None):
    """
    A deep learning program generator for the OpenCL programming language.

    The core operations of CLgen are:

       1. OpenCL files are collected from a model specification file.
       2. These files are preprocessed into an OpenCL kernel database.
       3. A training corpus is generated from the input files.
       4. A machine learning model is trained on the corpus of files.
       5. The trained model is sampled for new kernels.
       6. The samples are tested for compilability.

    This program automates the execution of all six stages of the pipeline.
    The pipeline can be interrupted and resumed at any time. Results are
    cached across runs. If installed with CUDA support, NVIDIA GPUs will be
    used to improve performance where possible.
    """
    # BUG FIX: the default was previously `sys.argv[1:]`, evaluated once at
    # class-definition time — a stale snapshot if argv is modified later.
    # A None sentinel reads argv at call time; explicit callers see no change.
    if args is None:
        args = sys.argv[1:]

    parser = ArgumentParser(
        prog="clgen",
        description=inspect.getdoc(self),
        epilog="""
For information about a specific command, run `clgen <command> --help`.
""" + __help_epilog__,
        formatter_class=RawDescriptionHelpFormatter)

    # TODO:
    # parser.add_argument(
    #     "-l", "--lang", metavar="<language>",
    #     help="programming language (default: OpenCL)")
    parser.add_argument(
        "-v", "--verbose", action="store_true",
        help="increase output verbosity")
    parser.add_argument(
        "--version", action="store_true",
        help="show version information and exit")
    parser.add_argument(
        "--debug", action="store_true",
        help="in case of error, print debugging information")
    parser.add_argument(
        "--profile", action="store_true",
        help=("enable internal API profiling. When combined with --verbose, "
              "prints a complete profiling trace"))
    parser.add_argument(
        "--corpus-dir", metavar="<corpus>", type=FileType("r"),
        help="print path to corpus cache")
    parser.add_argument(
        "--model-dir", metavar="<model>", type=FileType("r"),
        help="print path to model cache")
    parser.add_argument(
        "--sampler-dir", metavar=("<model>", "<sampler>"),
        type=FileType("r"), nargs=2,
        help="print path to sampler cache")

    subparser = parser.add_subparsers(title="available commands")

    subparsers = [
        _register_test_parser,
        _register_train_parser,
        _register_sample_parser,
        _register_db_parser,
        _register_fetch_parser,
        _register_ls_parser,
        _register_preprocess_parser,
        _register_features_parser,
        _register_atomize_parser,
        _register_cache_parser,
    ]
    for register_fn in subparsers:
        register_fn(subparser)

    args = parser.parse_args(args)

    # set log level
    log.init(args.verbose)

    # set debug option
    if args.debug:
        os.environ["DEBUG"] = "1"

    # set profile option
    if args.profile:
        prof.enable()

    # options which override the normal argument parsing process.
    if args.version:
        version = clgen.version()
        print(f"clgen {version} made with \033[1;31m♥\033[0;0m by "
              "Chris Cummins <*****@*****.**>.")
    elif args.corpus_dir:
        model = clgen.Model.from_json(jsonutil.loads(args.corpus_dir.read()))
        print(model.corpus.cache.path)
    elif args.model_dir:
        model = clgen.Model.from_json(jsonutil.loads(args.model_dir.read()))
        print(model.cache.path)
    elif args.sampler_dir:
        model = clgen.Model.from_json(jsonutil.loads(args.sampler_dir[0].read()))
        sampler = clgen.Sampler.from_json(jsonutil.loads(args.sampler_dir[1].read()))
        print(sampler.cache(model).path)
    else:
        # strip the arguments from the top-level parser
        dispatch_func = args.dispatch_func
        opts = vars(args)
        del opts["version"]
        del opts["verbose"]
        del opts["debug"]
        del opts["profile"]
        del opts["corpus_dir"]
        del opts["model_dir"]
        del opts["sampler_dir"]
        del opts["dispatch_func"]
        run(dispatch_func, **opts)