def from_json(corpus_json: dict):
    """
    Instantiate Corpus from JSON.

    Arguments:
        corpus_json (dict): Specification.

    Returns:
        Corpus: Instantiated corpus.
    """
    path = corpus_json.pop("path", None)
    uid = corpus_json.pop("id", None)

    if path:
        path = unpack_directory_if_needed(fs.abspath(path))
        if not fs.isdir(path):
            raise clgen.UserError(
                "Corpus path '{}' is not a directory".format(path))
        uid = dirhash(path, 'sha1')
    elif uid:
        cache_path = fs.path(cache.ROOT, "corpus", uid)
        if not fs.isdir(cache_path):
            raise clgen.UserError("Corpus {} not found".format(uid))
    else:
        raise clgen.UserError("No corpus path or ID provided")

    return Corpus(uid, path=path, **corpus_json)
def __init__(self, sampler_opts: dict, kernel_opts: dict):
    """
    Instantiate a sampler.

    Parameters
    ----------
    sampler_opts : dict
        Sampler options.
    kernel_opts : dict
        Kernel options.
    """
    def _hash(sampler_opts: dict, kernel_opts: dict) -> str:
        # we don't consider the number of samples in the ID
        sampler_opts = deepcopy(sampler_opts)
        del sampler_opts["min_samples"]
        del sampler_opts["min_kernels"]
        del sampler_opts["created"]

        checksum_data = sorted(
            [str(x) for x in sampler_opts.values()] +
            [str(x) for x in kernel_opts.values()])
        string = "".join([str(x) for x in checksum_data])
        return crypto.sha1_str(string)

    def _start_text(args):
        if args is None:
            return "__kernel void A("
        else:
            return serialize_argspec(args)

    assert(type(sampler_opts) is dict)
    assert(type(kernel_opts) is dict)

    # Validate options
    for key in sampler_opts.keys():
        if key not in DEFAULT_SAMPLER_OPTS:
            raise clgen.UserError(
                "Unsupported sampler option '{}'. Valid keys: {}".format(
                    key, ','.join(sorted(DEFAULT_SAMPLER_OPTS.keys()))))
    for key in kernel_opts.keys():
        if key not in DEFAULT_KERNELS_OPTS:
            raise clgen.UserError(
                "Unsupported kernels option '{}'. Valid keys: {}".format(
                    key, ','.join(sorted(DEFAULT_KERNELS_OPTS.keys()))))

    # set properties
    self.sampler_opts = types.update(deepcopy(DEFAULT_SAMPLER_OPTS),
                                     sampler_opts)
    self.kernel_opts = types.update(deepcopy(DEFAULT_KERNELS_OPTS),
                                    kernel_opts)

    self.hash = _hash(self.sampler_opts, self.kernel_opts)

    self.start_text = _start_text(self.kernel_opts["args"])

    # options to pass to preprocess_db()
    self.preprocess_opts = {
        "use_gpuverify": self.sampler_opts["gpuverify"]
    }
def from_str(string: str) -> 'Language':
    if not string:
        raise clgen.UserError("no language specified!")

    lang = {
        "opencl": Language.OPENCL,
        "sol": Language.SOLIDITY,
        "solidity": Language.SOLIDITY,
        "glsl": Language.GLSL,
    }.get(string.lower(), None)
    if not lang:
        raise clgen.UserError(f"unknown language '{string}'")

    return lang
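# A minimal usage sketch for from_str (added for illustration; assumes the
# Language members named in the table above):
def _example_language_from_str() -> None:
    assert Language.from_str("OpenCL") is Language.OPENCL  # case-insensitive
    assert Language.from_str("sol") is Language.SOLIDITY   # alias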
def create_db(path: str, github: bool = False) -> None:
    """
    Create an empty OpenCL kernel database.

    Parameters
    ----------
    path : str
        Path to database to create.
    github : bool, optional
        Add tables for GitHub metadata.
    """
    path = os.path.expanduser(path)

    if os.path.exists(path):
        raise clgen.UserError("'{}' already exists".format(path))

    db = sqlite3.connect(path)
    c = db.cursor()
    if github:
        script = clgen.sql_script('create-gh-samples-db')
    else:
        script = clgen.sql_script('create-samples-db')
    c.executescript(script)
    c.close()
    db.commit()
    db.close()
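# Usage sketch for create_db (the database path below is hypothetical):
def _example_create_db() -> None:
    # Creates an empty database with the extra GitHub metadata tables;
    # raises clgen.UserError if the file already exists.
    create_db("~/kernels.db", github=True)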
def from_json(model_json: dict) -> 'Model':
    """
    Load model from JSON.

    Parameters
    ----------
    model_json : dict
        JSON specification.

    Returns
    -------
    Model
        Model instance.
    """
    assert(isinstance(model_json, dict))

    if "corpus" not in model_json:
        raise clgen.UserError("model JSON has no corpus entry")

    # create corpus and remove from JSON
    corpus = clgen.Corpus.from_json(model_json.pop("corpus"))

    if "stats" in model_json:  # ignore stats
        del model_json["stats"]

    return Model(corpus, **model_json)
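# Usage sketch for Model.from_json (the corpus path and option values are
# hypothetical; the accepted keys are defined by DEFAULT_MODEL_OPTS):
def _example_model_from_json() -> None:
    model = Model.from_json({
        "corpus": {"path": "~/kernels"},
        "train_opts": {"epochs": 10},
    })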
def from_json(sampler_json: dict) -> 'Sampler':
    """
    Instantiate sampler from JSON.

    Parameters
    ----------
    sampler_json : dict
        JSON data.

    Returns
    -------
    Sampler
        Instantiated sampler.
    """
    unrecognized_keys = (set(sampler_json.keys()) -
                         set(["sampler", "kernels"]))
    if unrecognized_keys:
        raise clgen.UserError(
            "unrecognized sampler JSON options {}".format(
                ", ".join("'{}'".format(key)
                          for key in unrecognized_keys)))

    sampler_opts = sampler_json.get("sampler", {})
    kernel_opts = sampler_json.get("kernels", {})

    return Sampler(sampler_opts, kernel_opts)
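# Usage sketch for Sampler.from_json (option values are hypothetical; the
# accepted keys are defined by DEFAULT_SAMPLER_OPTS and
# DEFAULT_KERNELS_OPTS):
def _example_sampler_from_json() -> None:
    sampler = Sampler.from_json({
        "sampler": {"min_samples": 1000},
        "kernels": {"args": None},
    })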
def _create_files(self, path):
    def _init_error(err: Exception, files_to_rm: List[str] = []) -> None:
        """ tidy up in case of error """
        log.error("corpus creation failed. Deleting corpus files")
        for path in files_to_rm:
            if fs.exists(path):
                log.info("removing", path)
                fs.rm(path)
        raise err

    # create kernels database if necessary
    try:
        if path is not None:
            if not fs.isdir(path):
                raise clgen.UserError(
                    "Corpus path '{}' is not a directory".format(path))
            try:
                self.contentcache["kernels.db"]
            except KeyError:
                self._create_kernels_db(path)
    except Exception as e:
        _init_error(e, [self.contentcache.keypath("kernels.db")])

    # preprocess and encode kernel db
    try:
        modified = False
        preprocess_time = time()
        encoding = self.opts["encoding"]
        if clgen.preprocess_db(self.contentcache["kernels.db"],
                               lang=self.language):
            modified = True
            encode_kernels_db(self.contentcache["kernels.db"], encoding)
    except Exception as e:
        _init_error(e)

    if modified:
        preprocess_time = time() - preprocess_time
        self.stats["preprocess_time"] += preprocess_time
        self._flush_meta()

    # create corpus text if not exists
    try:
        try:
            self.cache["corpus.txt"]
        except KeyError:
            self._create_txt()
            assert(self.cache["corpus.txt"])
    except Exception as e:
        _init_error(e, [self.cache.keypath("corpus.txt")])

    # create atomizer if needed
    try:
        try:
            self.cache["atomizer.pkl"]
            self._load_atomizer()
        except KeyError:
            self._create_atomizer(self.opts["vocabulary"])
            assert(self.cache["atomizer.pkl"])
    except Exception as e:
        _init_error(e, [self.cache.keypath("atomizer.pkl")])
def _get_atomizer(self, corpus_txt: str, vocab: str = "char") -> 'clgen.Atomizer':
    """
    Get atomizer for a corpus.

    Parameters
    ----------
    corpus_txt : str
        Corpus text.
    vocab : str, optional
        Vocabulary type.

    Returns
    -------
    clgen.Atomizer
        Atomizer.
    """
    atomizers = {
        "char": clgen.CharacterAtomizer,
        "greedy": clgen.GreedyAtomizer,
    }
    atomizerclass = atomizers.get(vocab, None)
    if atomizerclass is None:
        raise clgen.UserError(
            "Unknown vocabulary type '{bad}'. "
            "Supported values: {good}".format(
                bad=vocab, good=", ".join(sorted(atomizers.keys()))))
    else:
        return atomizerclass.from_text(self.language, corpus_txt)
def create_batches(self) -> None:
    """
    Create batches for training.
    """
    self.reset_batch_pointer()

    # generate a kernel corpus
    data = self._generate_kernel_corpus()

    # encode corpus into vocab indices
    self._tensor = self.atomizer.atomize(data)

    batch_size = self.batch_size
    seq_length = self.seq_length

    # set corpus size and number of batches
    self._size = len(self._tensor)
    self._num_batches = int(self.size / (batch_size * seq_length))
    if self.num_batches == 0:
        raise clgen.UserError(
            "Not enough data. Use a smaller seq_length and batch_size")

    # split into batches
    self._tensor = self._tensor[:self.num_batches * batch_size * seq_length]
    xdata = self._tensor
    ydata = np.copy(self._tensor)
    ydata[:-1] = xdata[1:]
    ydata[-1] = xdata[0]
    self._x_batches = np.split(xdata.reshape(batch_size, -1),
                               self.num_batches, 1)
    self._y_batches = np.split(ydata.reshape(batch_size, -1),
                               self.num_batches, 1)
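# Illustration of the target construction above (added, not part of the
# original source): ydata is xdata shifted left by one token, with
# wrap-around, so each position is trained to predict the next token.
def _example_next_token_targets() -> None:
    xdata = np.array([5, 2, 7, 1])
    ydata = np.copy(xdata)
    ydata[:-1] = xdata[1:]
    ydata[-1] = xdata[0]
    assert ydata.tolist() == [2, 7, 1, 5]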
def encode_kernels_db(kernels_db: str, encoding: str) -> None:
    """
    Encode a kernels database.

    Parameters
    ----------
    kernels_db : str
        Path to kernels database.
    encoding : str
        Encoding type.
    """
    def _default(kernels_db: str) -> None:
        pass

    def _static_features(kernels_db: str) -> None:
        log.verbose("Static feature encoding")
        db = dbutil.connect(kernels_db)
        c = db.cursor()
        c.execute("SELECT id,contents FROM PreprocessedFiles WHERE status=0")
        for row in list(c.fetchall()):
            id, contents = row
            c.execute("DELETE FROM PreprocessedFiles WHERE id=?", (id,))
            for i, kernel in enumerate(get_cl_kernels(contents)):
                features = get_kernel_features(kernel)
                kid = "{}-{}".format(id, i)
                if len(features) == 8:
                    log.verbose("features", kid)
                    feature_str = ("/* {:10} {:10} {:10} {:10} {:10} {:10} "
                                   "{:10.3f} {:10.3f} */".format(
                                       int(features[0]), int(features[1]),
                                       int(features[2]), int(features[3]),
                                       int(features[4]), int(features[5]),
                                       features[6], features[7]))
                    newsource = feature_str + '\n' + kernel
                    c.execute("""
                        INSERT INTO PreprocessedFiles (id,contents,status)
                        VALUES (?,?,?)
                    """, (kid, newsource, 0))
                else:
                    log.verbose("ignored", kid)
        c.close()
        db.commit()

    # dispatch encoder based on encoding
    encoders = {
        "default": _default,
        "static_features": _static_features,
    }
    encoder = encoders.get(encoding, None)
    if encoder is None:
        raise clgen.UserError(
            "Unknown encoding type '{bad}'. Supported values: {good}".format(
                bad=encoding, good=", ".join(sorted(encoders.keys()))))
    else:
        encoder(kernels_db)
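# Usage sketch for encode_kernels_db (the path is hypothetical):
def _example_encode_kernels_db() -> None:
    # "static_features" prefixes each preprocessed kernel with a comment
    # listing its static features; "default" is a no-op.
    encode_kernels_db("~/kernels.db", "static_features")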
def get_cell(model_type):
    cell_fn = {
        "lstm": rnn.BasicLSTMCell,
        "gru": rnn.GRUCell,
        "rnn": rnn.BasicRNNCell
    }.get(model_type, None)
    if cell_fn is None:
        raise clgen.UserError("Unrecognized model type")
    return cell_fn
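# Usage sketch for get_cell (assumes rnn is the imported TensorFlow RNN
# module used in the table above; the cell size is illustrative):
def _example_get_cell():
    cell_fn = get_cell("gru")  # -> rnn.GRUCell
    return cell_fn(512)        # a 512-unit GRU cell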
def from_json(corpus_json: dict) -> 'Corpus':
    """
    Instantiate Corpus from JSON.

    Parameters
    ----------
    corpus_json : dict
        Specification.

    Returns
    -------
    Corpus
        Instantiated corpus.
    """
    path = corpus_json.pop("path", None)
    uid = corpus_json.pop("id", None)
    language = clgen.Language.from_str(corpus_json.get("language"))

    if path:
        path = unpack_directory_if_needed(fs.abspath(path))
        if not fs.isdir(path):
            raise clgen.UserError(
                "Corpus path '{}' is not a directory".format(path))
        dirhashcache = DirHashCache(clgen.cachepath("dirhash.db"), 'sha1')
        uid = prof.profile(dirhashcache.dirhash, path)
    elif uid:
        cache_path = clgen.mkcache("contentfiles", f"{language}-{uid}").path
        if not fs.isdir(cache_path):
            raise clgen.UserError(
                "Corpus content {} not found".format(uid))
    else:
        raise clgen.UserError("No corpus path or ID provided")

    if "stats" in corpus_json:  # ignore stats
        del corpus_json["stats"]

    if "contentfiles" in corpus_json:
        del corpus_json["contentfiles"]

    return prof.profile(Corpus, uid, path=path, **corpus_json)
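# Usage sketch for Corpus.from_json (the directory path is hypothetical;
# "language" must be a name accepted by clgen.Language.from_str):
def _example_corpus_from_json() -> None:
    corpus = Corpus.from_json({
        "language": "opencl",
        "path": "~/kernels",
    })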
def create_data(self) -> None:
    """create a numpy array with all the training data"""
    data = self._generate_kernel_corpus()

    seq_length = self.seq_length
    batch_size = self.batch_size
    pad = self.atomizer.vocab['__PAD__']

    lst_x = []
    lst_w = []
    lst_l = []

    inps = [self.atomizer.atomize(kernel.strip()) for kernel in data]

    def next_sequence(inp, length):
        """ produce the next sequence out of the input array """
        x = np.full((seq_length, ), pad, dtype=np.int32)
        weights = np.ones((seq_length, ), dtype=np.int32)

        actual_length = 0
        if length >= seq_length:
            x[:seq_length] = inp[:seq_length]
            actual_length = seq_length
        else:
            x[:length] = inp
            actual_length = length + 1
            if length <= seq_length - 2:
                weights[length + 1:] = 0

        return x, weights, actual_length

    for inp in inps:
        length = np.shape(inp)[0]
        while length > 16:
            x, weights, actual_length = next_sequence(inp, length)
            lst_x.append(x)
            lst_w.append(weights)
            lst_l.append(actual_length)

            inp = inp[actual_length:]
            length = length - actual_length

    num_examples = len(lst_x)

    # set corpus size and number of batches
    self._size = num_examples * seq_length
    self._num_batches = int(num_examples / batch_size)
    if self.num_batches == 0:
        raise clgen.UserError(
            "Not enough data. Use a smaller seq_length and batch_size")

    self.tensor_x = np.array(lst_x)
    self.tensor_w = np.array(lst_w)
    self.tensor_l = np.array(lst_l)
def from_json(sampler_json: dict) -> Sampler:
    """
    Instantiate sampler from JSON.

    Arguments:
        sampler_json (dict): JSON data.

    Returns:
        Sampler: Instantiated sampler.
    """
    sampler_opts = sampler_json.get("sampler", {})

    kernel_opts = sampler_json.get("kernels", {})
    if not kernel_opts:
        raise clgen.UserError("no kernels section in sampler specification")

    return Sampler(sampler_opts, kernel_opts)
def _scrape_github_for_files(db_path: str, github_username: str,
                             github_pw: str, github_token: str,
                             query_terms: List[str], file_is_interesting,
                             download_file_cb):
    global errors_counter

    g = Github(github_username, github_pw)
    db = dbutil.connect(db_path)

    if not dbutil.is_github(db):
        raise clgen.UserError("not a GitHub database")

    # fetch the repositories to iterate over
    for query in query_terms:
        # forks are okay - we use checksums to ensure uniqueness in
        # final dataset
        repos = g.search_repositories(query + ' fork:true sort:stars')

        for repo in repos:
            # do nothing unless the repo is new or modified
            if not _process_repo(g, db, repo):
                continue

            # iterate over the entire git tree of the repo's default branch
            # (usually 'master'). If a file ends with the .cl extension, check
            # to see if we already have it, else download it
            try:
                branch = repo.default_branch
                tree_iterator = repo.get_git_tree(branch, recursive=True).tree
                for f in tree_iterator:
                    if file_is_interesting(f):
                        try:
                            _process_file(g, github_token, db, repo, f,
                                          download_file_cb)
                        except Exception as e:
                            print(e)
                            errors_counter += 1
            except GithubException:
                # do nothing in case of error (such as an empty repo)
                pass

    _print_counters()
    print("\n\ndone.")
    db.close()
def from_json(model_json: dict) -> Model:
    """
    Load model from JSON.

    Arguments:
        model_json (dict): JSON specification.

    Returns:
        Model: Model instance.
    """
    assert(type(model_json) is dict)

    if "corpus" not in model_json:
        raise clgen.UserError("model JSON has no corpus entry")

    # create corpus and remove from JSON
    corpus = Corpus.from_json(model_json.pop("corpus"))

    return Model(corpus, **model_json)
def get_atomizer(corpus: str, vocab: str = "char") -> atomizer.Atomizer:
    """
    Get atomizer for a corpus.

    Arguments:
        corpus (str): Corpus.
        vocab (str, optional): Vocabulary type.

    Returns:
        atomizer.Atomizer: Atomizer.
    """
    atomizers = {
        "char": atomizer.CharacterAtomizer,
        "greedy": atomizer.GreedyAtomizer,
    }
    atomizerclass = atomizers.get(vocab, None)
    if atomizerclass is None:
        raise clgen.UserError(
            "Unknown vocabulary type '{bad}'. Supported values: {good}".format(
                bad=vocab, good=", ".join(sorted(atomizers.keys()))))
    else:
        return atomizerclass.from_text(corpus)
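# Usage sketch for get_atomizer (the corpus text is illustrative):
def _example_get_atomizer() -> None:
    corpus = "__kernel void A(__global float* a) { a[0] *= 2; }"
    a = get_atomizer(corpus, vocab="greedy")
    indices = a.atomize(corpus)  # encode text as vocabulary indices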
def __init__(self, corpus: Corpus, **opts):
    """
    Instantiate model.

    Arguments:
        corpus (Corpus): Corpus instance.
        opts (dict): Training options.
    """
    assert(isinstance(corpus, Corpus))

    # Validate options
    for key in opts.keys():
        if key not in DEFAULT_MODEL_OPTS:
            raise clgen.UserError(
                "Unsupported model option '{}'. Valid keys: {}".format(
                    key, ','.join(sorted(DEFAULT_MODEL_OPTS.keys()))))

    # set properties
    self.opts = clgen.update(deepcopy(DEFAULT_MODEL_OPTS), opts)
    self.corpus = corpus
    self.hash = self._hash(self.corpus, self.opts)

    self.cache = Cache(fs.path("model", self.hash))

    log.debug("model", self.hash)
def _init_tensorflow(self, infer: bool = False) -> 'tf':
    """
    Deferred importing of tensorflow and initializing model for training
    or sampling.

    This is necessary for two reasons: first, the tensorflow graph is
    different for training and inference, so must be reset when switching
    between modes. Second, importing tensorflow takes a long time, so
    we only want to do it if we actually need to.

    Parameters
    ----------
    infer : bool
        If True, initialize model for inference. If False, initialize
        model for training.

    Returns
    -------
    module
        TensorFlow module.
    """
    # quiet tensorflow. See: https://github.com/tensorflow/tensorflow/issues/1258
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    import tensorflow as tf
    import tensorflow.contrib.legacy_seq2seq as seq2seq
    from tensorflow.contrib import rnn

    self.cell_fn = {
        "lstm": rnn.BasicLSTMCell,
        "gru": rnn.GRUCell,
        "rnn": rnn.BasicRNNCell
    }.get(self.model_type, None)
    if self.cell_fn is None:
        raise clgen.UserError("Unrecognized model type")

    # reset the graph when switching between training and inference
    tf.reset_default_graph()

    # corpus info:
    batch_size = 1 if infer else self.corpus.batch_size
    seq_length = 1 if infer else self.corpus.seq_length
    vocab_size = self.corpus.vocab_size

    cell = self.cell_fn(self.rnn_size, state_is_tuple=True)
    self.cell = cell = rnn.MultiRNNCell([cell] * self.num_layers,
                                        state_is_tuple=True)
    self.input_data = tf.placeholder(tf.int32, [batch_size, seq_length])
    self.targets = tf.placeholder(tf.int32, [batch_size, seq_length])
    self.initial_state = self.cell.zero_state(batch_size, tf.float32)

    scope_name = 'rnnlm'
    with tf.variable_scope(scope_name):
        softmax_w = tf.get_variable("softmax_w",
                                    [self.rnn_size, vocab_size])
        softmax_b = tf.get_variable("softmax_b", [vocab_size])

        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding",
                                        [vocab_size, self.rnn_size])
            inputs = tf.split(
                axis=1, num_or_size_splits=seq_length,
                value=tf.nn.embedding_lookup(embedding, self.input_data))
            inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    def loop(prev, _):
        prev = tf.matmul(prev, softmax_w) + softmax_b
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(embedding, prev_symbol)

    outputs, last_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, cell,
        loop_function=loop if infer else None, scope=scope_name)
    output = tf.reshape(tf.concat(axis=1, values=outputs),
                        [-1, self.rnn_size])
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)
    loss = seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([batch_size * seq_length])],
        vocab_size)
    self.cost = tf.reduce_sum(loss) / batch_size / seq_length
    self.final_state = last_state
    self.learning_rate = tf.Variable(0.0, trainable=False)
    self.epoch = tf.Variable(0, trainable=False)

    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(
        # Argument of potential interest:
        #   aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE
        #
        # See:
        #   https://www.tensorflow.org/api_docs/python/tf/gradients
        #   https://www.tensorflow.org/api_docs/python/tf/AggregationMethod
        tf.gradients(self.cost, tvars), self.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.learning_rate)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    return tf
def _dump_db(db, out_path: str, gh: bool = False, fileid: bool = False,
             reverse: bool = False, input_samples: bool = False,
             status: int = 0, eof: bool = False, dir: bool = False) -> None:
    """
    Dump database contents.

    Parameters
    ----------
    db : sqlite3.Connection
        Dataset.
    out_path : str
        Path to output.
    gh : bool, optional
        Dataset is GitHub.
    fileid : bool, optional
        Include file IDs.
    reverse : bool, optional
        Reverse ordering of output.
    input_samples : bool, optional
        If True, use un-preprocessed files.
    status : int, optional
        Filter preprocess status.
    eof : bool, optional
        Include EOF separators.
    dir : bool, optional
        Write output to directory.
    """
    log.info('writing corpus', out_path, '...')

    order = 'ASC' if reverse else 'DESC'

    c = db.cursor()

    # Query components
    table = 'ContentFiles' if input_samples else 'PreprocessedFiles'
    select = 'SELECT {}.id,{}.contents'.format(table, table)

    if input_samples:
        qualifier = ''
    else:
        qualifier = 'WHERE {}.status={}'.format(table, status)

    if gh:
        table += (' LEFT JOIN ContentMeta ON {}.id=ContentMeta.id'
                  ' LEFT JOIN Repositories ON '
                  'ContentMeta.repo_url=Repositories.url'.format(table))
        orderby = 'Repositories.stars'
    else:
        orderby = 'LC_col(contents)'

    query = (
        '{select} FROM {table} {qualifier} ORDER BY {orderby} {order}'.format(
            select=select, table=table, qualifier=qualifier,
            orderby=orderby, order=order))

    c.execute(query)
    rows = c.fetchall()

    if dir:
        log.info('writing to directory ', out_path, '/', sep='')
        if os.path.exists(out_path):
            if len(fs.ls(out_path)):
                raise clgen.UserError('directory already exists!')
        else:
            os.makedirs(out_path)
        for row in rows:
            id, contents = row
            path = os.path.join(out_path, kid_to_path(id) + '.cl')
            with open(path, 'w') as out:
                out.write(contents)
    else:
        log.info('writing file', out_path)
        with open(out_path, 'wb') as out:
            for row in rows:
                id, contents = row
                if fileid:  # Print file ID
                    out.write('/* ID: {} */\n\n'.format(id).encode('utf-8'))
                out.write(contents.encode('utf-8'))
                if eof:  # Print EOF token
                    out.write('\n/* EOF */\n\n'.encode('utf-8'))
                else:
                    out.write('\n\n'.encode('utf-8'))
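# Usage sketch for _dump_db (paths are hypothetical; db is an open
# sqlite3 connection to a kernels database):
def _example_dump_db() -> None:
    db = sqlite3.connect(os.path.expanduser("~/kernels.db"))
    _dump_db(db, os.path.expanduser("~/corpus.txt"), fileid=True, eof=True)
    db.close()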
def __init__(self, contentid: str, path: str = None, **opts):
    """
    Instantiate a corpus.

    If this is a new corpus, a number of files will be created, which may
    take some time.

    Parameters
    ----------
    contentid : str
        ID of corpus content.
    path : str, optional
        Path to corpus.
    **opts
        Keyword options.
    """
    # Validate options
    for key in opts.keys():
        if key not in DEFAULT_CORPUS_OPTS:
            raise clgen.UserError(
                "Unsupported corpus option '{}'. Valid keys: {}".format(
                    key, ','.join(sorted(DEFAULT_CORPUS_OPTS.keys()))))

    self.opts = deepcopy(DEFAULT_CORPUS_OPTS)
    types.update(self.opts, opts)
    self.opts["id"] = contentid

    # check that contentid exists
    self.language = clgen.Language.from_str(opts.get("language"))
    if (path is None and
        not fs.isdir(clgen.cachepath("contentfiles",
                                     f"{self.language}-{contentid}"))):
        raise clgen.UserError("corpus {self.language}-{contentid} not found"
                              .format(**vars()))

    self.contentid = contentid
    self.contentcache = clgen.mkcache("contentfiles",
                                      f"{self.language}-{contentid}")
    self.kernels_db = self.contentcache.keypath('kernels.db')

    self.hash = self._hash(contentid, self.opts)
    self.cache = clgen.mkcache("corpus", f"{self.language}-{self.hash}")

    log.debug("contentfiles {self.contentid}".format(**vars()))
    log.debug("corpus {hash}".format(hash=self.hash))

    # validate metadata against cache
    self.stats = {
        "preprocess_time": 0
    }
    meta = deepcopy(self.to_json())
    if self.cache.get("META"):
        cached_meta = jsonutil.read_file(self.cache["META"])
        self.stats = cached_meta["stats"]  # restore stats

        if "created" in cached_meta:
            del cached_meta["created"]
        del meta["created"]

        if "stats" in cached_meta:
            del cached_meta["stats"]
        del meta["stats"]

        if meta != cached_meta:
            raise clgen.InternalError("corpus metadata mismatch")
    else:
        self._flush_meta()

    with self.lock.acquire(replace_stale=True):
        self._create_files(path)
def _init_tensorflow(self, infer: bool = False):
    """
    Deferred importing of tensorflow and initializing model for training
    or sampling.

    This is necessary for two reasons: first, the tensorflow graph is
    different for training and inference, so must be reset when switching
    between modes. Second, importing tensorflow takes a long time, so
    we only want to do it if we actually need to.

    Arguments:
        infer (bool): If True, initialize model for inference. If False,
            initialize model for training.

    Returns:
        module: imported TensorFlow module
    """
    import tensorflow as tf
    from tensorflow.python.ops import rnn_cell
    from tensorflow.python.ops import seq2seq

    # Use self.tensorflow_state to mark whether or not model is configured
    # for training or inference.
    try:
        if self.tensorflow_state == infer:
            return tf
    except AttributeError:
        pass

    self.cell_fn = {
        "lstm": rnn_cell.BasicLSTMCell,
        "gru": rnn_cell.GRUCell,
        "rnn": rnn_cell.BasicRNNCell
    }.get(self.model_type, None)
    if self.cell_fn is None:
        raise clgen.UserError("Unrecognized model type")

    # reset the graph when switching between training and inference
    tf.reset_default_graph()

    # corpus info:
    batch_size = 1 if infer else self.corpus.batch_size
    seq_length = 1 if infer else self.corpus.seq_length
    vocab_size = self.corpus.vocab_size

    fs.mkdir(self.cache.path)

    cell = self.cell_fn(self.rnn_size, state_is_tuple=True)
    self.cell = cell = rnn_cell.MultiRNNCell([cell] * self.num_layers,
                                             state_is_tuple=True)
    self.input_data = tf.placeholder(tf.int32, [batch_size, seq_length])
    self.targets = tf.placeholder(tf.int32, [batch_size, seq_length])
    self.initial_state = self.cell.zero_state(batch_size, tf.float32)

    scope_name = 'rnnlm'
    with tf.variable_scope(scope_name):
        softmax_w = tf.get_variable("softmax_w",
                                    [self.rnn_size, vocab_size])
        softmax_b = tf.get_variable("softmax_b", [vocab_size])

        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding",
                                        [vocab_size, self.rnn_size])
            inputs = tf.split(
                1, seq_length,
                tf.nn.embedding_lookup(embedding, self.input_data))
            inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    def loop(prev, _):
        prev = tf.matmul(prev, softmax_w) + softmax_b
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(embedding, prev_symbol)

    outputs, last_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, cell,
        loop_function=loop if infer else None, scope=scope_name)
    output = tf.reshape(tf.concat(1, outputs), [-1, self.rnn_size])
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)
    loss = seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([batch_size * seq_length])],
        vocab_size)
    self.cost = tf.reduce_sum(loss) / batch_size / seq_length
    self.final_state = last_state
    self.learning_rate = tf.Variable(0.0, trainable=False)
    self.epoch = tf.Variable(0, trainable=False)

    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                      self.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.learning_rate)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    # set model status
    self.tensorflow_state = infer

    return tf
def __init__(self, corpus: clgen.Corpus, **opts):
    """
    Instantiate model.

    Parameters
    ----------
    corpus : clgen.Corpus
        Corpus instance.
    **opts
        Training options.
    """
    assert(isinstance(corpus, clgen.Corpus))

    def _hash(corpus: clgen.Corpus, opts: dict) -> str:
        """ compute model hash """
        hashopts = deepcopy(opts)
        del hashopts["created"]
        del hashopts["train_opts"]["epochs"]
        return crypto.sha1_list(corpus.hash, *types.dict_values(hashopts))

    # Validate options
    for key in opts:
        if key not in DEFAULT_MODEL_OPTS:
            raise clgen.UserError(
                "Unsupported model option '{}'. Valid keys: {}".format(
                    key, ','.join(sorted(DEFAULT_MODEL_OPTS.keys()))))

    # set properties
    self.opts = types.update(deepcopy(DEFAULT_MODEL_OPTS), opts)
    self.corpus = corpus
    self.hash = _hash(self.corpus, self.opts)
    self.cache = clgen.mkcache("model", f"{corpus.language}-{self.hash}")

    log.debug("model", self.hash)

    # validate metadata against cache, and restore stats
    self.stats = {
        "epoch_times": [],
        "epoch_costs": [],
        "epoch_batches": []
    }
    meta = deepcopy(self.to_json())
    if self.cache.get("META"):
        cached_meta = jsonutil.read_file(self.cache["META"])
        self.stats = cached_meta["stats"]  # restore stats

        if "created" in cached_meta:
            del cached_meta["created"]
        del meta["created"]

        if "created" in cached_meta["corpus"]:
            del cached_meta["corpus"]["created"]
        del meta["corpus"]["created"]

        if "stats" in cached_meta:
            del cached_meta["stats"]
        del meta["stats"]

        if "epochs" in cached_meta["train_opts"]:
            del cached_meta["train_opts"]["epochs"]
        del meta["train_opts"]["epochs"]

        if meta != cached_meta:
            log.error("Computed META:", jsonutil.format_json(meta))
            raise clgen.InternalError(
                "metadata mismatch in model %s" % self.cache["META"])
    else:
        self._flush_meta()
def __init__(self, sampler_opts: dict, kernel_opts: dict):
    """
    Instantiate a sampler.

    Parameters
    ----------
    sampler_opts : dict
        Sampler options.
    kernel_opts : dict
        Kernel options.
    """
    def _hash(sampler_opts: dict, kernel_opts: dict) -> str:
        # we don't consider the number of samples in the ID
        sampler_opts = deepcopy(sampler_opts)
        del sampler_opts["min_samples"]
        del sampler_opts["min_kernels"]
        del sampler_opts["created"]

        checksum_data = sorted([str(x) for x in sampler_opts.values()] +
                               [str(x) for x in kernel_opts.values()])
        string = "".join([str(x) for x in checksum_data])
        return crypto.sha1_str(string)

    # FIXME(polyglot):
    def _start_text(lang: clgen.Language, args: Union[List[str], None],
                    start_text: str):
        if lang == clgen.Language.OPENCL:
            if args is None:
                return "__kernel void A("
            else:
                return serialize_opencl_argspec(args)
        else:
            return start_text or ""

    assert(type(sampler_opts) is dict)
    assert(type(kernel_opts) is dict)

    # Validate options
    for key in sampler_opts.keys():
        if key not in DEFAULT_SAMPLER_OPTS:
            raise clgen.UserError(
                "Unsupported sampler option '{}'. Valid keys: {}".format(
                    key, ','.join(sorted(DEFAULT_SAMPLER_OPTS.keys()))))
    for key in kernel_opts.keys():
        if key not in DEFAULT_KERNELS_OPTS:
            raise clgen.UserError(
                "Unsupported kernels option '{}'. Valid keys: {}".format(
                    key, ','.join(sorted(DEFAULT_KERNELS_OPTS.keys()))))

    # set properties
    self.sampler_opts = types.update(deepcopy(DEFAULT_SAMPLER_OPTS),
                                     sampler_opts)
    self.kernel_opts = types.update(deepcopy(DEFAULT_KERNELS_OPTS),
                                    kernel_opts)

    self.hash = _hash(self.sampler_opts, self.kernel_opts)

    self.language = clgen.Language.from_str(kernel_opts.get("language"))
    self.start_text = _start_text(self.language,
                                  self.kernel_opts.get("args", []),
                                  self.kernel_opts.get("start_text", ""))

    # pop "start_text" option
    del self.kernel_opts["start_text"]

    # options to pass to preprocess_db()
    self.preprocess_opts = {
        "use_gpuverify": self.sampler_opts["gpuverify"]
    }
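# Sketch of the seed-text rule in _start_text above (assumes "args"
# defaults to None in DEFAULT_KERNELS_OPTS): an OpenCL sampler with no
# argspec seeds generation with the generic kernel prototype prefix.
def _example_start_text() -> None:
    sampler = Sampler({}, {"language": "opencl"})
    print(sampler.start_text)  # expected: "__kernel void A("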
def github(db_path: str, github_username: str, github_pw: str,
           github_token: str) -> None:
    """
    Download all of the OpenCL on GitHub (!)

    Shortcomings of this approach:
        * Only includes exclusively OpenCL files, no inline strings.
        * Occasionally (< 1%) can't find headers to include.

    Arguments:
        db_path (str): Dataset path.
        github_username (str): Authorization.
        github_pw (str): Authorization.
        github_token (str): Authorization.
    """
    global errors_counter

    g = Github(github_username, github_pw)
    db = dbutil.connect(db_path)

    if not dbutil.is_github(db):
        raise clgen.UserError("not a GitHub database")

    handle_repo = partial(process_repo, g, db)

    # fetch the repositories to iterate over. Since opencl isn't
    # treated as a first-class language by GitHub, we can't use the
    # 'language=' keyword for queries, so instead we cast a much
    # wider net and filter the results afterwards.
    query_terms = [
        'opencl', 'cl', 'khronos', 'gpu', 'gpgpu', 'cuda', 'amd',
        'nvidia', 'heterogeneous'
    ]
    for query in query_terms:
        # forks are okay - we use checksums to ensure uniqueness in
        # final dataset
        repos = g.search_repositories(query + ' fork:true sort:stars')

        for repo in repos:
            repo_modified = handle_repo(repo)

            # do nothing unless the repo is new or modified
            if not repo_modified:
                continue

            handle_file = partial(process_file, g, github_token, db, repo)

            # iterate over the entire git tree of the repo's default
            # branch (usually 'master'). If a file ends with the .cl
            # extension, check to see if we already have it, else download
            # it
            try:
                branch = repo.default_branch
                tree_iterator = repo.get_git_tree(branch,
                                                  recursive=True).tree
                for f in tree_iterator:
                    try:
                        handle_file(f)
                    except Exception:
                        errors_counter += 1
            except GithubException:
                # do nothing in case of error (such as an empty repo)
                pass

    print_counters()
    print("\n\ndone.")
    db.close()
def _init_tensorflow(self, infer: bool = False) -> 'tf':
    """
    Deferred importing of tensorflow and initializing model for training
    or sampling.

    This is necessary for two reasons: first, the tensorflow graph is
    different for training and inference, so must be reset when switching
    between modes. Second, importing tensorflow takes a long time, so
    we only want to do it if we actually need to.

    Parameters
    ----------
    infer : bool
        If True, initialize model for inference. If False, initialize
        model for training.

    Returns
    -------
    module
        TensorFlow module.
    """
    # quiet tensorflow. See: https://github.com/tensorflow/tensorflow/issues/1258
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    self.cell_fn = {
        "lstm": rnn.BasicLSTMCell,
        "gru": rnn.GRUCell,
        "rnn": rnn.BasicRNNCell
    }.get(self.model_type, None)
    if self.cell_fn is None:
        raise clgen.UserError("Unrecognized model type")

    # reset the graph when switching between training and inference
    tf.reset_default_graph()

    # corpus info:
    batch_size = 1 if infer else self.corpus.batch_size
    seq_length = 1 if infer else self.corpus.seq_length
    vocab_size = self.corpus.vocab_size

    cells_lst = [self.cell_fn(self.rnn_size, state_is_tuple=True)
                 for _ in range(self.num_layers)]
    self.cell = rnn.MultiRNNCell(cells_lst, state_is_tuple=True)

    with tf.device("/cpu:0"):
        # Inputs
        self.encoder_input = tf.placeholder(tf.int32,
                                            [batch_size, seq_length])
        self.decoder_input = tf.placeholder(tf.int32,
                                            [batch_size, seq_length])
        self.target_weights = tf.placeholder(tf.int32,
                                             [batch_size, seq_length])
        self.lengths = tf.placeholder(tf.int32, [batch_size])

        self.q = tf.FIFOQueue(
            capacity=4,
            dtypes=[tf.int32, tf.int32, tf.int32, tf.int32],
            shapes=[tf.TensorShape([batch_size, seq_length]),
                    tf.TensorShape([batch_size, seq_length]),
                    tf.TensorShape([batch_size, seq_length]),
                    tf.TensorShape([batch_size])])
        self.enqueue_op = self.q.enqueue((self.encoder_input,
                                          self.decoder_input,
                                          self.target_weights,
                                          self.lengths))

        next_example = self.q.dequeue()
        self.inputs = next_example[0]
        self.dec_inp = next_example[1]
        self.tweights = tf.to_float(next_example[2])
        self.lens = next_example[3]

    scope_name = 'rnnlm'
    with tf.variable_scope(scope_name):
        softmax_w = tf.get_variable("softmax_w",
                                    [self.rnn_size, vocab_size])
        softmax_b = tf.get_variable("softmax_b", [vocab_size])

        with tf.device("/cpu:0"):
            embedding_dec = tf.get_variable("embedding_dec",
                                            [vocab_size, self.rnn_size])
            dec_inp2 = tf.nn.embedding_lookup(embedding_dec, self.dec_inp)

    encoder = SeqEncoder(self.model_type, self.rnn_size, self.num_layers,
                         batch_size, vocab_size)
    encoder_state = encoder.encode(self.inputs, self.lens)

    self.mean_latent, self.logvar_latent = encoder_to_latent(
        encoder_state, self.rnn_size, 32, self.num_layers, tf.float32)
    self.latent, self.KL_obj, self.KL_cost = sample(
        self.mean_latent, self.logvar_latent, 32)
    self.decoder_initial_state = latent_to_decoder(
        self.latent, self.rnn_size, 32, self.num_layers, tf.float32)

    decoder_initial_state2 = tuple(
        [rnn.LSTMStateTuple(*single_layer_state)
         for single_layer_state in self.decoder_initial_state])

    helper = seq2seq.TrainingHelper(dec_inp2, self.lens, time_major=False)
    decoder = seq2seq.BasicDecoder(self.cell, helper,
                                   decoder_initial_state2,
                                   Dense(vocab_size))
    self.final_outputs, self.final_state = seq2seq.dynamic_decode(
        decoder, output_time_major=False, impute_finished=True,
        swap_memory=True, scope='rnnlm')
    self.final_out = self.final_outputs.rnn_output

    self.probs = tf.nn.softmax(self.final_out)
    self.cost = seq2seq.sequence_loss(self.final_out, self.inputs,
                                      self.tweights)
    self.learning_rate = tf.Variable(0.0, trainable=False)
    self.epoch = tf.Variable(0, trainable=False)

    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(self.cost + self.KL_obj, tvars, aggregation_method=2),
        self.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.learning_rate)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    return tf
def fetch_repos(db_path: Path, indir: Path, lang: clgen.Language) -> None:
    db = dbutil.connect(db_path)

    if not dbutil.is_github(db):
        raise clgen.UserError("not a GitHub database")

    c = db.cursor()

    for directory in fs.ls(indir, abspaths=True):
        # hacky hardcoded interpretation of `git remote -v`
        gitdir = fs.path(directory, ".git")
        output = subprocess.check_output(
            ["git", "--git-dir", gitdir, "remote", "-v"],
            universal_newlines=True)
        url = output.split("\n")[0].split("\t")[1].split(" ")[0]
        name = fs.basename(directory)

        output = subprocess.check_output(
            f"git --git-dir {gitdir} rev-list --format=format:'%ai' " +
            f"--max-count=1 $(git --git-dir {gitdir} rev-parse HEAD) | tail -n1",
            shell=True, universal_newlines=True)
        try:
            updated_at = dateutil.parser.parse(output)
        except ValueError:
            log.error(f"failed to process {name} {url}")
            continue

        c.execute("SELECT updated_at FROM Repositories WHERE url=?",
                  (url, ))
        cached_updated_at = c.fetchone()

        # Do nothing unless updated timestamps don't match
        # if cached_updated_at and cached_updated_at[0] >= updated_at:
        #     log.verbose(name, "already in database")
        #     continue

        c.execute("DELETE FROM Repositories WHERE url=?", (url, ))
        c.execute("INSERT INTO Repositories VALUES(?,?,?,?,?,?,?,?,?)",
                  (url, "<unknown>", name, 0, 0, 0, 0, updated_at,
                   updated_at))

        name_str = " -o ".join(
            [f"-name '*{ext}'" for ext in clgen.file_extensions(lang)])
        output = subprocess.check_output(
            f"find {directory} -type f {name_str} | grep -v '.git/' || true",
            shell=True, universal_newlines=True)
        files = [x.strip() for x in output.split("\n") if x.strip()]

        # nothing to import
        if not len(files):
            # log.verbose("no files in", name)
            continue

        log.verbose("processing", len(files), "files in", name)
        for path in files:
            relpath = path[len(directory) + 1:]
            try:
                contents = inline_fs_headers(path, [], lang=lang)
                sha = crypto.sha1_str(contents)
                c.execute('INSERT OR IGNORE INTO ContentFiles VALUES(?,?)',
                          (sha, contents))
                c.execute(
                    "INSERT OR IGNORE INTO ContentMeta VALUES(?,?,?,?,?)",
                    (sha, relpath, url, sha, len(contents)))
            except UnicodeDecodeError:
                log.warning("non UTF-8 file", path)

        db.commit()
        c = db.cursor()
def __init__(self, contentid: str, path: str = None, **opts):
    """
    Instantiate a corpus.

    If this is a new corpus, a number of files will be created, which may
    take some time.

    Arguments:
        contentid (str): ID of corpus content.
        path (str, optional): Path to corpus.
        **opts: Keyword options.
    """
    def _init_error(err: Exception) -> None:
        """ tidy up in case of error """
        log.error("corpus creation failed. Deleting corpus files")
        paths = [
            fs.path(self.contentcache.path, "kernels.db"),
            fs.path(self.cache.path, "corpus.txt"),
            fs.path(self.cache.path, "tensor.npy"),
            fs.path(self.cache.path, "atomizer.pkl")
        ]
        for path in paths:
            if fs.exists(path):
                log.info("removing", path)
                fs.rm(path)
        raise err

    # Validate options
    for key in opts.keys():
        if key not in DEFAULT_CORPUS_OPTS:
            raise clgen.UserError(
                "Unsupported corpus option '{}'. Valid keys: {}".format(
                    key, ','.join(sorted(DEFAULT_CORPUS_OPTS.keys()))))

    self.opts = deepcopy(DEFAULT_CORPUS_OPTS)
    clgen.update(self.opts, opts)
    self.contentid = contentid
    self.hash = self._hash(contentid, self.opts)

    self.cache = Cache(fs.path("corpus", self.hash))
    self.contentcache = Cache(fs.path("contentfiles", contentid))
    self.kernels_db = self.contentcache['kernels.db']

    log.debug("corpus {hash}".format(hash=self.hash))

    try:
        if path is not None:
            if not fs.isdir(path):
                raise clgen.UserError(
                    "Corpus path '{}' is not a directory".format(path))
            # create kernels database if necessary
            if not self.contentcache["kernels.db"]:
                self._create_kernels_db(path, self.opts["encoding"])
                assert(self.contentcache["kernels.db"])

        # create corpus text if not exists
        if not self.cache["corpus.txt"]:
            self._create_txt()
            assert(self.cache["corpus.txt"])

        # create atomizer if needed
        if self.cache["atomizer.pkl"]:
            self._load_atomizer()
            assert(self.cache["atomizer.pkl"])
        else:
            self._create_atomizer(self.opts["vocabulary"])
    except Exception as e:
        _init_error(e)