Пример #1
0
    def from_json(corpus_json: dict):
        """
        Build a Corpus from its JSON specification.

        The "path" and "id" entries are popped from corpus_json (the dict is
        modified in place); all remaining entries are forwarded to the Corpus
        constructor as keyword options.

        Arguments:
            corpus_json (dict): Specification. Must provide either a "path"
                to a corpus directory or the "id" of a cached corpus.

        Returns:
            Corpus: Instantiated corpus.

        Raises:
            clgen.UserError: If the path is not a directory, if the ID has no
                cache directory, or if neither a path nor an ID is given.
        """
        corpus_path = corpus_json.pop("path", None)
        corpus_id = corpus_json.pop("id", None)

        if not corpus_path and not corpus_id:
            raise clgen.UserError("No corpus path or ID provided")

        if corpus_path:
            # a path wins over an ID: the ID is recomputed from the directory
            corpus_path = unpack_directory_if_needed(fs.abspath(corpus_path))
            if not fs.isdir(corpus_path):
                raise clgen.UserError(
                    "Corpus path '{}' is not a directory".format(corpus_path))
            corpus_id = dirhash(corpus_path, 'sha1')
        else:
            # ID only: the corpus must already be in the cache
            cached_corpus = fs.path(cache.ROOT, "corpus", corpus_id)
            if not fs.isdir(cached_corpus):
                raise clgen.UserError(
                    "Corpus {} not found".format(corpus_id))

        return Corpus(corpus_id, path=corpus_path, **corpus_json)
Пример #2
0
    def __init__(self, sampler_opts: dict, kernel_opts: dict):
        """
        Instantiate a sampler.

        User options are validated against the default option tables and
        then layered on top of a copy of those defaults.

        Parameters
        ----------
        sampler_opts : dict
            Sampler options.
        kernel_opts : dict
            Kernel options.

        Raises
        ------
        clgen.UserError
            If either dict contains a key that is not in its defaults table.
        """
        def _reject_unknown(given: dict, defaults: dict, template: str):
            # fail on the first key that has no default
            for name in given:
                if name not in defaults:
                    raise clgen.UserError(
                        template.format(name, ','.join(sorted(defaults))))

        def _hash(sampler_opts: dict, kernel_opts: dict) -> str:
            # we don't consider the number of samples in the ID
            hashed_opts = deepcopy(sampler_opts)
            for ignored in ("min_samples", "min_kernels", "created"):
                del hashed_opts[ignored]

            values = [str(value) for value in hashed_opts.values()]
            values.extend(str(value) for value in kernel_opts.values())
            values.sort()
            return crypto.sha1_str("".join(values))

        assert(type(sampler_opts) is dict)
        assert(type(kernel_opts) is dict)

        # Validate options
        _reject_unknown(sampler_opts, DEFAULT_SAMPLER_OPTS,
                        "Unsupported sampler option '{}'. Valid keys: {}")
        _reject_unknown(kernel_opts, DEFAULT_KERNELS_OPTS,
                        "Unsupported kernels option '{}'. Valid keys: {}")

        # set properties
        sampler_defaults = deepcopy(DEFAULT_SAMPLER_OPTS)
        self.sampler_opts = types.update(sampler_defaults, sampler_opts)
        kernel_defaults = deepcopy(DEFAULT_KERNELS_OPTS)
        self.kernel_opts = types.update(kernel_defaults, kernel_opts)

        self.hash = _hash(self.sampler_opts, self.kernel_opts)

        # without an argument spec, sampling starts from a bare kernel header
        argspec = self.kernel_opts["args"]
        if argspec is None:
            self.start_text = "__kernel void A("
        else:
            self.start_text = serialize_argspec(argspec)

        # options to pass to preprocess_db()
        self.preprocess_opts = {
            "use_gpuverify": self.sampler_opts["gpuverify"]
        }
Пример #3
0
 def from_str(string: str) -> 'Language':
     """
     Look up a Language member by name (case-insensitive).

     Parameters
     ----------
     string : str
         Language name: "opencl", "sol", "solidity" or "glsl".

     Returns
     -------
     Language
         The matching member.

     Raises
     ------
     clgen.UserError
         If string is empty/None or does not name a known language.
     """
     if not string:
         raise clgen.UserError("no language specified!")
     lang = {
         "opencl": Language.OPENCL,
         "sol": Language.SOLIDITY,
         "solidity": Language.SOLIDITY,
         "glsl": Language.GLSL,
     }.get(string.lower())
     # compare against None so that a member which happens to be falsy
     # is not reported as unknown
     if lang is None:
         raise clgen.UserError(f"unknown language '{string}'")
     return lang
Пример #4
0
def create_db(path: str, github: bool=False) -> None:
    """
    Create an empty OpenCL kernel database.

    Parameters
    ----------
    path : str
        Path to database to create. A leading "~" is expanded.
    github : bool, optional
        Add tables for GitHub metadata.

    Raises
    ------
    clgen.UserError
        If path already exists.
    """
    path = os.path.expanduser(path)

    if os.path.exists(path):
        raise clgen.UserError("'{}' already exists".format(path))

    # load the schema before touching the filesystem, so that a failure here
    # does not leave an empty database file behind
    script_name = 'create-gh-samples-db' if github else 'create-samples-db'
    script = clgen.sql_script(script_name)

    db = sqlite3.connect(path)
    try:
        c = db.cursor()
        try:
            c.executescript(script)
        finally:
            c.close()
        db.commit()
    finally:
        # always release the connection, even if the script fails
        db.close()
Пример #5
0
    def from_json(model_json: dict) -> 'Model':
        """
        Build a Model from its JSON specification.

        The "corpus" and "stats" entries are removed from model_json (the
        dict is modified in place); the remaining entries are passed to the
        Model constructor as keyword options.

        Parameters
        ----------
        model_json : dict
            JSON specification. Must contain a "corpus" entry.

        Returns
        -------
        Model
            Model instance.

        Raises
        ------
        clgen.UserError
            If model_json has no "corpus" entry.
        """
        assert(isinstance(model_json, dict))

        if "corpus" not in model_json:
            raise clgen.UserError("model JSON has no corpus entry")

        # the corpus specification is consumed here, not passed to Model
        corpus_spec = model_json.pop("corpus")
        corpus = clgen.Corpus.from_json(corpus_spec)

        # stats are informational only
        model_json.pop("stats", None)

        return Model(corpus, **model_json)
Пример #6
0
    def from_json(sampler_json: dict) -> 'Sampler':
        """
        Instantiate sampler from JSON.

        Parameters
        ----------
        sampler_json : dict
            JSON data. Only the "sampler" and "kernels" sections are allowed;
            a missing section defaults to an empty dict.

        Returns
        -------
        Sampler
            Instantiated sampler.

        Raises
        ------
        clgen.UserError
            If sampler_json contains any other top-level key.
        """
        unrecognized_keys = set(sampler_json) - {"sampler", "kernels"}
        if unrecognized_keys:
            # each key is quoted once; sorting keeps the message stable
            # regardless of set iteration order
            quoted_keys = sorted(
                "'{}'".format(key) for key in unrecognized_keys)
            raise clgen.UserError(
                "unrecognized sampler JSON options {}".format(
                    ",".join(quoted_keys)))

        sampler_opts = sampler_json.get("sampler", {})
        kernel_opts = sampler_json.get("kernels", {})

        return Sampler(sampler_opts, kernel_opts)
Пример #7
0
    def _create_files(self, path):
        """
        Create whichever corpus files are missing.

        Runs four stages in order: create the kernels database (only when a
        path is given and the content cache has no "kernels.db"), preprocess
        and encode the database, create "corpus.txt", and create or load the
        atomizer. If a stage fails, the files listed for that stage are
        deleted and the original exception is re-raised.

        Parameters
        ----------
        path : str or None
            Path to the corpus directory. If None, the kernels database is
            not created.

        Raises
        ------
        clgen.UserError
            If path is given but is not a directory.
        Exception
            Any error raised by a stage, re-raised after clean-up.
        """
        def _init_error(err: Exception, files_to_rm: List[str]=None) -> None:
            """ tidy up in case of error """
            log.error("corpus creation failed. Deleting corpus files")
            # None (the default) means there is nothing to delete
            for stale_path in files_to_rm or []:
                if fs.exists(stale_path):
                    log.info("removing", stale_path)
                    fs.rm(stale_path)
            raise err

        # create kernels database if necessary
        try:
            if path is not None:
                if not fs.isdir(path):
                    raise clgen.UserError(
                        "Corpus path '{}' is not a directory".format(path))
                try:
                    self.contentcache["kernels.db"]
                except KeyError:
                    self._create_kernels_db(path)
        except Exception as e:
            _init_error(e, [self.contentcache.keypath("kernels.db")])

        # preprocess and encode kernel db
        try:
            modified = False
            preprocess_time = time()
            encoding = self.opts["encoding"]
            if clgen.preprocess_db(self.contentcache["kernels.db"],
                                   lang=self.language):
                modified = True
                encode_kernels_db(self.contentcache["kernels.db"], encoding)
        except Exception as e:
            _init_error(e)

        # only account for the time spent if preprocessing changed the db
        if modified:
            preprocess_time = time() - preprocess_time
            self.stats["preprocess_time"] += preprocess_time
            self._flush_meta()

        # create corpus text if not exists
        try:
            try:
                self.cache["corpus.txt"]
            except KeyError:
                self._create_txt()
                assert(self.cache["corpus.txt"])
        except Exception as e:
            _init_error(e, [self.cache.keypath("corpus.txt")])

        # create atomizer if needed
        try:
            try:
                self.cache["atomizer.pkl"]
                self._load_atomizer()
            except KeyError:
                self._create_atomizer(self.opts["vocabulary"])
                assert(self.cache["atomizer.pkl"])
        except Exception as e:
            _init_error(e, [self.cache.keypath("atomizer.pkl")])
Пример #8
0
        def _get_atomizer(corpus_txt: str,
                          vocab: str="char") -> 'clgen.Atomizer':
            """
            Get atomizer for a corpus.

            Parameters
            ----------
            corpus_txt : str
                Corpus text.
            vocab : str, optional
                Vocabulary type: "char" or "greedy".

            Returns
            -------
            clgen.Atomizer
                Atomizer built from corpus_txt for the enclosing object's
                language.

            Raises
            ------
            clgen.UserError
                If vocab is not a supported vocabulary type.
            """
            atomizers = {
                "char": clgen.CharacterAtomizer,
                "greedy": clgen.GreedyAtomizer,
            }
            atomizerclass = atomizers.get(vocab)
            if atomizerclass is None:
                raise clgen.UserError(
                    "Unknown vocabulary type '{bad}'. "
                    "Supported values: {good}".format(
                        bad=vocab, good=", ".join(sorted(atomizers))))
            return atomizerclass.from_text(self.language, corpus_txt)
Пример #9
0
    def create_batches(self) -> None:
        """
        Create batches for training.

        The generated kernel corpus is atomized, truncated to a whole number
        of batches, and split into input batches and target batches. The
        targets are the inputs shifted left by one position, with the first
        element wrapped around to the end.

        Raises
        ------
        clgen.UserError
            If the corpus is too small to fill a single batch.
        """
        self.reset_batch_pointer()

        # generate the kernel corpus and encode it into vocab indices
        corpus_text = self._generate_kernel_corpus()
        self._tensor = self.atomizer.atomize(corpus_text)

        batch_size = self.batch_size
        seq_length = self.seq_length
        batch_elements = batch_size * seq_length

        # set corpus size and number of batches
        self._size = len(self._tensor)
        self._num_batches = int(self.size / batch_elements)
        if self.num_batches == 0:
            raise clgen.UserError(
                "Not enough data. Use a smaller seq_length and batch_size")

        # drop the tail that does not fill a complete batch
        self._tensor = self._tensor[:self.num_batches * batch_elements]
        inputs = self._tensor
        # targets: next element for every position, wrapping at the end
        targets = np.roll(inputs, -1, axis=0)

        self._x_batches = np.split(
            inputs.reshape(batch_size, -1), self.num_batches, 1)
        self._y_batches = np.split(
            targets.reshape(batch_size, -1), self.num_batches, 1)
Пример #10
0
def encode_kernels_db(kernels_db: str, encoding: str) -> None:
    """
    Encode a kernels database.

    Parameters
    ----------
    kernels_db : str
        Path to kernels database.
    encoding : str
        Encoding type: "default" (no-op) or "static_features" (replace every
        status-0 preprocessed file with one row per kernel, each prefixed by
        a comment holding its eight features).

    Raises
    ------
    clgen.UserError
        If encoding is not a supported encoding type.
    """
    def _default(kernels_db: str) -> None:
        pass

    def _static_features(kernels_db: str) -> None:
        log.verbose("Static feature encoding")
        db = dbutil.connect(kernels_db)
        try:
            c = db.cursor()
            c.execute(
                "SELECT id,contents FROM PreprocessedFiles WHERE status=0")
            # fetchall() materialises the rows, so the cursor can be reused
            # for the DELETE/INSERT statements inside the loop
            for file_id, contents in c.fetchall():
                c.execute("DELETE FROM PreprocessedFiles WHERE id=?",
                          (file_id,))
                for i, kernel in enumerate(get_cl_kernels(contents)):
                    features = get_kernel_features(kernel)
                    kid = "{}-{}".format(file_id, i)
                    if len(features) != 8:
                        # kernels without a full feature vector are dropped
                        log.verbose("ignored", kid)
                        continue

                    log.verbose("features", kid)
                    # NOTE(review): there is no space between the sixth and
                    # seventh fields of this template; left untouched because
                    # it defines the encoded data format - confirm intent
                    feature_str = ("/* {:10} {:10} {:10} {:10} {:10} {:10}"
                                   "{:10.3f} {:10.3f} */".format(
                                       *(int(features[j]) for j in range(6)),
                                       features[6],
                                       features[7]))
                    newsource = feature_str + '\n' + kernel
                    c.execute("""
                        INSERT INTO PreprocessedFiles (id,contents,status)
                        VALUES (?,?,?)
                    """, (kid, newsource, 0))
            c.close()
            db.commit()
        finally:
            # release the connection whether or not encoding succeeded
            db.close()

    # dispatch encoder based on encoding
    encoders = {
        "default": _default,
        "static_features": _static_features,
    }
    encoder = encoders.get(encoding)
    if encoder is None:
        raise clgen.UserError(
            "Unknown encoding type '{bad}'. Supported values: {good}".format(
                bad=encoding, good=", ".join(sorted(encoders))))
    encoder(kernels_db)
Пример #11
0
def get_cell(model_type):
    """
    Return the RNN cell class for a model type.

    Parameters
    ----------
    model_type : str
        One of "lstm", "gru" or "rnn".

    Returns
    -------
    The matching cell class from the rnn module.

    Raises
    ------
    clgen.UserError
        If model_type is not recognized.
    """
    cells_by_type = {
        "lstm": rnn.BasicLSTMCell,
        "gru": rnn.GRUCell,
        "rnn": rnn.BasicRNNCell
    }
    if model_type not in cells_by_type:
        raise clgen.UserError("Unrecognized model type")
    return cells_by_type[model_type]
Пример #12
0
    def from_json(corpus_json: dict) -> 'Corpus':
        """
        Build a Corpus from its JSON specification.

        The "path", "id", "stats" and "contentfiles" entries are removed from
        corpus_json (the dict is modified in place); all remaining entries,
        including "language", are passed to the Corpus constructor.

        Parameters
        ----------
        corpus_json : dict
            Specification. Must provide a "language" and either a "path" to a
            corpus directory or the "id" of cached content files.

        Returns
        -------
        Corpus
            Instantiated corpus.

        Raises
        ------
        clgen.UserError
            If the path is not a directory, if the ID has no content cache,
            or if neither a path nor an ID is given.
        """
        corpus_path = corpus_json.pop("path", None)
        content_id = corpus_json.pop("id", None)
        language = clgen.Language.from_str(corpus_json.get("language"))

        if not corpus_path and not content_id:
            raise clgen.UserError("No corpus path or ID provided")

        if corpus_path:
            # a path wins over an ID: the ID is recomputed from the directory
            corpus_path = unpack_directory_if_needed(fs.abspath(corpus_path))
            if not fs.isdir(corpus_path):
                raise clgen.UserError(
                    "Corpus path '{}' is not a directory".format(corpus_path))

            hash_cache = DirHashCache(clgen.cachepath("dirhash.db"), 'sha1')
            content_id = prof.profile(hash_cache.dirhash, corpus_path)
        else:
            # ID only: the content files must already be cached
            content_cache = clgen.mkcache("contentfiles",
                                          f"{language}-{content_id}")
            if not fs.isdir(content_cache.path):
                raise clgen.UserError(
                    "Corpus content {} not found".format(content_id))

        # informational entries are not constructor options
        corpus_json.pop("stats", None)
        corpus_json.pop("contentfiles", None)

        return prof.profile(Corpus, content_id, path=corpus_path,
                            **corpus_json)
Пример #13
0
    def create_data(self) -> None:
        """
        Create numpy arrays holding all of the training data.

        Every kernel of the generated corpus is atomized and cut into
        sequences of seq_length entries; a final short sequence is padded
        with the '__PAD__' vocabulary index. Cutting stops once 16 or fewer
        entries of a kernel remain. The results are stored in tensor_x
        (sequences), tensor_w (weights) and tensor_l (lengths).

        Raises
        ------
        clgen.UserError
            If there are fewer sequences than batch_size.
        """
        data = self._generate_kernel_corpus()
        seq_length = self.seq_length
        batch_size = self.batch_size
        pad = self.atomizer.vocab['__PAD__']

        sequences = []
        sequence_weights = []
        sequence_lengths = []

        encoded_kernels = [
            self.atomizer.atomize(kernel.strip()) for kernel in data]

        def next_sequence(inp, length):
            """ produce the next sequence out of the input array """
            x = np.full((seq_length, ), pad, dtype=np.int32)
            weights = np.ones((seq_length, ), dtype=np.int32)

            if length >= seq_length:
                # enough input left to fill the whole sequence
                x[:seq_length] = inp[:seq_length]
                return x, weights, seq_length

            # short tail: copy what is left, the rest stays padding; weights
            # are zeroed from the second padding position onwards
            x[:length] = inp
            if length <= seq_length - 2:
                weights[length + 1:] = 0
            return x, weights, length + 1

        for tokens in encoded_kernels:
            remaining = np.shape(tokens)[0]
            while remaining > 16:
                x, weights, consumed = next_sequence(tokens, remaining)
                sequences.append(x)
                sequence_weights.append(weights)
                sequence_lengths.append(consumed)

                tokens = tokens[consumed:]
                remaining -= consumed

        num_examples = len(sequences)
        # set corpus size and number of batches
        self._size = num_examples * seq_length
        self._num_batches = int(num_examples / batch_size)
        if self.num_batches == 0:
            raise clgen.UserError(
                "Not enough data. Use a smaller seq_length and batch_size")

        self.tensor_x = np.array(sequences)
        self.tensor_w = np.array(sequence_weights)
        self.tensor_l = np.array(sequence_lengths)
Пример #14
0
def from_json(sampler_json: dict) -> Sampler:
    """
    Build a Sampler from its JSON specification.

    Arguments:
        sampler_json (dict): JSON data. The "kernels" section is required
            and must be non-empty; the "sampler" section defaults to an
            empty dict.

    Returns:
        Sampler: Instantiated sampler.

    Raises:
        clgen.UserError: If the "kernels" section is missing or empty.
    """
    sections = {
        name: sampler_json.get(name, {}) for name in ("sampler", "kernels")}

    if not sections["kernels"]:
        raise clgen.UserError("no kernels section in sampler specification")

    return Sampler(sections["sampler"], sections["kernels"])
Пример #15
0
def _scrape_github_for_files(db_path: str, github_username: str,
                             github_pw: str, github_token: str,
                             query_terms: List[str], file_is_intetesting,
                             download_file_cb):
    """
    Search GitHub repositories and process their interesting files.

    For every query term, the matching repositories (forks included) are
    fetched. For each repository accepted by _process_repo, the git tree of
    its default branch is walked and every entry accepted by
    file_is_intetesting is handed to _process_file.

    Arguments:
        db_path (str): Path to the database.
        github_username (str): GitHub username.
        github_pw (str): GitHub password.
        github_token (str): Token forwarded to _process_file.
        query_terms (List[str]): Repository search terms.
        file_is_intetesting: Callable taking a tree entry; a truthy result
            selects the entry for processing.
        download_file_cb: Callback forwarded to _process_file.

    Raises:
        clgen.UserError: If the database is not a GitHub database.
    """
    global errors_counter

    g = Github(github_username, github_pw)
    db = dbutil.connect(db_path)

    try:
        # The original tested the function object itself (always truthy), so
        # this check could never fail.
        # NOTE(review): assumes dbutil.is_github takes the open connection -
        # confirm against dbutil.
        if not dbutil.is_github(db):
            raise clgen.UserError("not a GitHub database")

        # fetch the repositories to iterate over
        for query in query_terms:
            # forks are okay - we use checksums to ensure uniqueness in
            # final dataset
            repos = g.search_repositories(query + ' fork:true sort:stars')

            for repo in repos:
                # do nothing unless the repo is new or modified
                if not _process_repo(g, db, repo):
                    continue

                # iterate over the entire git tree of the repo's default
                # branch (usually 'master'), processing the interesting files
                try:
                    branch = repo.default_branch
                    tree_iterator = repo.get_git_tree(
                        branch, recursive=True).tree
                    for f in tree_iterator:
                        if not file_is_intetesting(f):
                            continue
                        try:
                            _process_file(g, github_token, db, repo, f,
                                          download_file_cb)
                        except Exception as e:
                            # count the failure before aborting; the counter
                            # update used to sit after sys.exit() and was
                            # unreachable
                            # NOTE(review): aborting on the first file error
                            # looks like debugging residue - confirm
                            errors_counter += 1
                            print(e)
                            sys.exit(1)
                except GithubException:
                    # do nothing in case of error (such as an empty repo)
                    pass

        _print_counters()
        print("\n\ndone.")
    finally:
        # close the connection on every exit path
        db.close()
Пример #16
0
def from_json(model_json: dict) -> Model:
    """
    Build a Model from its JSON specification.

    The "corpus" entry is removed from model_json (the dict is modified in
    place); the remaining entries are passed to Model as keyword options.

    Arguments:
        model_json (dict): JSON specification. Must contain "corpus".

    Returns:
        Model: Model instance.

    Raises:
        clgen.UserError: If model_json has no "corpus" entry.
    """
    assert (type(model_json) is dict)

    has_corpus = "corpus" in model_json
    if not has_corpus:
        raise clgen.UserError("model JSON has no corpus entry")

    # the corpus specification is consumed here, not passed on to Model
    corpus_spec = model_json.pop("corpus")
    model_corpus = Corpus.from_json(corpus_spec)

    return Model(model_corpus, **model_json)
Пример #17
0
def get_atomizer(corpus: str, vocab: str = "char") -> 'atomizer.Atomizer':
    """
    Get atomizer for a corpus.

    Arguments:
        corpus (str): Corpus.
        vocab (str, optional): Vocabulary type: "char" or "greedy".

    Returns:
        atomizer.Atomizer: Atomizer built from the corpus text.

    Raises:
        clgen.UserError: If vocab is not a supported vocabulary type.
    """
    atomizers = {
        "char": atomizer.CharacterAtomizer,
        "greedy": atomizer.GreedyAtomizer,
    }
    atomizerclass = atomizers.get(vocab)
    if atomizerclass is None:
        raise clgen.UserError(
            "Unknown vocabulary type '{bad}'. Supported values: {good}".format(
                bad=vocab, good=", ".join(sorted(atomizers))))
    return atomizerclass.from_text(corpus)
Пример #18
0
    def __init__(self, corpus: Corpus, **opts):
        """
        Instantiate model.

        User options are validated against DEFAULT_MODEL_OPTS and layered on
        top of a copy of those defaults. The model hash, computed from the
        corpus and the merged options, names the model's cache.

        Arguments:
            corpus (Corpus): Corpus instance.
            opts (dict): Training options.

        Raises:
            clgen.UserError: If opts contains a key that is not in
                DEFAULT_MODEL_OPTS.
        """
        assert (isinstance(corpus, Corpus))

        # Validate options: fail on the first key that has no default
        for option in opts:
            if option in DEFAULT_MODEL_OPTS:
                continue
            valid_keys = ','.join(sorted(DEFAULT_MODEL_OPTS))
            raise clgen.UserError(
                "Unsupported model option '{}'. Valid keys: {}".format(
                    option, valid_keys))

        # set properties
        merged_opts = deepcopy(DEFAULT_MODEL_OPTS)
        self.opts = clgen.update(merged_opts, opts)
        self.corpus = corpus
        self.hash = self._hash(self.corpus, self.opts)
        cache_name = fs.path("model", self.hash)
        self.cache = Cache(cache_name)

        log.debug("model", self.hash)
Пример #19
0
    def _init_tensorflow(self, infer: bool = False) -> 'tf':
        """
        Deferred importing of tensorflow and initializing model for training
        or sampling.

        This is necessary for two reasons: first, the tensorflow graph is
        different for training and inference, so must be reset when switching
        between modes. Second, importing tensorflow takes a long time, so
        we only want to do it if we actually need to.

        The graph pieces are stored on the instance: cell_fn, cell,
        input_data, targets, initial_state, logits, probs, cost, final_state,
        learning_rate, epoch and train_op.

        Parameters
        ----------
        infer : bool
            If True, initialize model for inference. If False, initialize
            model for training.

        Returns
        -------
        module
            TensorFlow module.

        Raises
        ------
        clgen.UserError
            If self.model_type is not one of "lstm", "gru" or "rnn".
        """
        # quiet tensorflow. See: https://github.com/tensorflow/tensorflow/issues/1258
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

        import tensorflow as tf
        import tensorflow.contrib.legacy_seq2seq as seq2seq
        from tensorflow.contrib import rnn

        # map the configured model type onto a cell class
        self.cell_fn = {
            "lstm": rnn.BasicLSTMCell,
            "gru": rnn.GRUCell,
            "rnn": rnn.BasicRNNCell
        }.get(self.model_type, None)
        if self.cell_fn is None:
            raise clgen.UserError("Unrecognized model type")

        # reset the graph when switching between training and inference
        tf.reset_default_graph()

        # corpus info: inference uses a 1x1 input, training uses the corpus
        # batch size and sequence length
        batch_size = 1 if infer else self.corpus.batch_size
        seq_length = 1 if infer else self.corpus.seq_length
        vocab_size = self.corpus.vocab_size

        # NOTE(review): the same cell object is repeated for every layer, and
        # state_is_tuple is passed to whichever cell class was selected -
        # confirm both are supported by the TensorFlow version in use
        cell = self.cell_fn(self.rnn_size, state_is_tuple=True)
        self.cell = cell = rnn.MultiRNNCell([cell] * self.num_layers,
                                            state_is_tuple=True)
        # placeholders for the input token indices and their targets
        self.input_data = tf.placeholder(tf.int32, [batch_size, seq_length])
        self.targets = tf.placeholder(tf.int32, [batch_size, seq_length])
        self.initial_state = self.cell.zero_state(batch_size, tf.float32)

        scope_name = 'rnnlm'
        with tf.variable_scope(scope_name):
            # output projection from rnn_size to the vocabulary
            softmax_w = tf.get_variable("softmax_w",
                                        [self.rnn_size, vocab_size])
            softmax_b = tf.get_variable("softmax_b", [vocab_size])

            with tf.device("/cpu:0"):
                embedding = tf.get_variable("embedding",
                                            [vocab_size, self.rnn_size])
                # look up the embeddings, then split along the sequence axis
                # into seq_length separate inputs
                inputs = tf.split(axis=1,
                                  num_or_size_splits=seq_length,
                                  value=tf.nn.embedding_lookup(
                                      embedding, self.input_data))
                inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

        def loop(prev, _):
            # project the previous output onto the vocabulary, pick the
            # argmax symbol and return its embedding as the next input
            prev = tf.matmul(prev, softmax_w) + softmax_b
            prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
            return tf.nn.embedding_lookup(embedding, prev_symbol)

        # the loop function is only installed for inference
        outputs, last_state = seq2seq.rnn_decoder(
            inputs,
            self.initial_state,
            cell,
            loop_function=loop if infer else None,
            scope=scope_name)
        output = tf.reshape(tf.concat(axis=1, values=outputs),
                            [-1, self.rnn_size])
        self.logits = tf.matmul(output, softmax_w) + softmax_b
        self.probs = tf.nn.softmax(self.logits)
        # per-example loss with a weight of one for every position
        loss = seq2seq.sequence_loss_by_example(
            [self.logits], [tf.reshape(self.targets, [-1])],
            [tf.ones([batch_size * seq_length])], vocab_size)
        # summed loss divided by batch size and sequence length
        self.cost = tf.reduce_sum(loss) / batch_size / seq_length
        self.final_state = last_state
        # non-trainable variables for the learning rate and epoch counter
        self.learning_rate = tf.Variable(0.0, trainable=False)
        self.epoch = tf.Variable(0, trainable=False)
        tvars = tf.trainable_variables()
        # clip the gradients by global norm before applying them
        grads, _ = tf.clip_by_global_norm(
            # Argument of potential interest:
            #   aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE
            #
            # See:
            #   https://www.tensorflow.org/api_docs/python/tf/gradients
            #   https://www.tensorflow.org/api_docs/python/tf/AggregationMethod
            tf.gradients(self.cost, tvars),
            self.grad_clip)
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))

        return tf
Пример #20
0
def _dump_db(db,
             out_path: str,
             gh: bool = False,
             fileid: bool = False,
             reverse: bool = False,
             input_samples: bool = False,
             status: int = 0,
             eof: bool = False,
             dir: bool = False) -> None:
    """
    Dump database contents.

    Rows are read from ContentFiles (input_samples) or from PreprocessedFiles
    filtered by status, and written either to one file or to one file per
    row inside a directory. All output is encoded as UTF-8.

    Parameters
    ----------
    db : sqlite3.Connection
        Dataset.
    out_path : str
        Path to output.
    gh : bool, optional
        Dataset is GitHub. Rows are then ordered by repository stars instead
        of by the LC_col collation of their contents.
    fileid : bool, optional
        Include file IDs (single-file output only).
    reverse : bool, optional
        Reverse ordering of output (ascending instead of descending).
    input_samples : bool, optional
        If True, use un-preprocessed files.
    status : int, optional
        Filter preprocess status. Ignored if input_samples is True.
    eof : bool, optional
        Include EOF separators (single-file output only).
    dir : bool, optional
        Write output to directory.

    Raises
    ------
    clgen.UserError
        If dir is True and out_path exists and is not empty.
    """
    log.info('writing corpus', out_path, '...')

    order = 'ASC' if reverse else 'DESC'

    # Query components
    table = 'ContentFiles' if input_samples else 'PreprocessedFiles'
    select = 'SELECT {table}.id,{table}.contents'.format(table=table)

    # the status value is bound as a query parameter, not spliced into SQL
    if input_samples:
        qualifier = ''
        params = ()
    else:
        qualifier = 'WHERE {}.status=?'.format(table)
        params = (status,)

    # the qualifier above must be built before the joins are appended to
    # the table expression
    if gh:
        table += (' LEFT JOIN ContentMeta ON {}.id=ContentMeta.id'
                  ' LEFT JOIN Repositories ON '
                  'ContentMeta.repo_url=Repositories.url'.format(table))
        orderby = 'Repositories.stars'
    else:
        orderby = 'LC_col(contents)'

    query = (
        '{select} FROM {table} {qualifier} ORDER BY {orderby} {order}'.format(
            select=select,
            table=table,
            qualifier=qualifier,
            orderby=orderby,
            order=order))

    c = db.cursor()
    try:
        c.execute(query, params)
        rows = c.fetchall()
    finally:
        c.close()

    if dir:
        log.info('writing to directory ', out_path, '/', sep='')
        if os.path.exists(out_path):
            if len(fs.ls(out_path)):
                raise clgen.UserError('directory already exists!')
        else:
            os.makedirs(out_path)
        for file_id, contents in rows:
            path = os.path.join(out_path, kid_to_path(file_id) + '.cl')
            # explicit UTF-8, matching the single-file branch, instead of
            # the platform default encoding
            with open(path, 'w', encoding='utf-8') as out:
                out.write(contents)
    else:
        log.info('writing file', out_path)
        with open(out_path, 'wb') as out:
            for file_id, contents in rows:
                if fileid:  # Print file ID
                    out.write(
                        '/* ID: {} */\n\n'.format(file_id).encode('utf-8'))
                out.write(contents.encode('utf-8'))
                if eof:  # Print EOF token
                    out.write('\n/* EOF */\n\n'.encode('utf-8'))
                else:
                    out.write('\n\n'.encode('utf-8'))
Пример #21
0
    def __init__(self, contentid: str, path: str=None, **opts):
        """
        Instantiate a corpus.

        If this is a new corpus, a number of files will be created, which may
        take some time.

        Parameters
        ----------
        contentid : str
            ID of corpus content.
        path : str, optional
            Path to corpus.
        **opts
            Keyword options. Each key must be present in DEFAULT_CORPUS_OPTS.

        Raises
        ------
        clgen.UserError
            If an option is unsupported, or if no path is given and the
            content files for contentid are not cached.
        clgen.InternalError
            If the cached metadata does not match this corpus.
        """
        # Validate options: fail on the first key that has no default
        for option in opts:
            if option in DEFAULT_CORPUS_OPTS:
                continue
            valid_keys = ','.join(sorted(DEFAULT_CORPUS_OPTS))
            raise clgen.UserError(
                "Unsupported corpus option '{}'. Valid keys: {}".format(
                    option, valid_keys))

        self.opts = deepcopy(DEFAULT_CORPUS_OPTS)
        types.update(self.opts, opts)
        self.opts["id"] = contentid

        # the language is read from the user options, not the merged ones
        self.language = clgen.Language.from_str(opts.get("language"))

        # check that contentid exists
        if path is None:
            content_dir = clgen.cachepath(
                "contentfiles", f"{self.language}-{contentid}")
            if not fs.isdir(content_dir):
                raise clgen.UserError(
                    f"corpus {self.language}-{contentid} not found")

        self.contentid = contentid
        self.contentcache = clgen.mkcache(
            "contentfiles", f"{self.language}-{contentid}")
        self.kernels_db = self.contentcache.keypath('kernels.db')

        self.hash = self._hash(contentid, self.opts)
        self.cache = clgen.mkcache("corpus", f"{self.language}-{self.hash}")

        log.debug(f"contentfiles {self.contentid}")
        log.debug(f"corpus {self.hash}")

        # validate metadata against cache
        self.stats = {
            "preprocess_time": 0
        }
        meta = deepcopy(self.to_json())
        if not self.cache.get("META"):
            # first use of this corpus: record its metadata
            self._flush_meta()
        else:
            cached_meta = jsonutil.read_file(self.cache["META"])
            self.stats = cached_meta["stats"]  # restore stats

            # creation time and stats are excluded from the comparison
            cached_meta.pop("created", None)
            del meta["created"]
            cached_meta.pop("stats", None)
            del meta["stats"]

            if meta != cached_meta:
                raise clgen.InternalError("corpus metadata mismatch")

        with self.lock.acquire(replace_stale=True):
            self._create_files(path)
Пример #22
0
    def _init_tensorflow(self, infer: bool = False):
        """
        Deferred importing of tensorflow and initializing model for training
        or sampling.

        This is necessary for two reasons: first, the tensorflow graph is
        different for training and inference, so must be reset when switching
        between modes. Second, importing tensorflow takes a long time, so
        we only want to do it if we actually need to.

        If the model is already configured for the requested mode, the graph
        is left untouched and the module is returned immediately.

        Arguments:
            infer (bool): If True, initialize model for inference. If False,
                initialize model for training.

        Returns:
            module: imported TensorFlow module

        Raises:
            clgen.UserError: If self.model_type is not one of "lstm", "gru"
                or "rnn".
        """
        import tensorflow as tf
        from tensorflow.python.ops import rnn_cell
        from tensorflow.python.ops import seq2seq

        # Use self.tensorflow_state to mark whether or not model is configured
        # for training or inference.
        try:
            if self.tensorflow_state == infer:
                return tf
        except AttributeError:
            # first call: tensorflow_state has not been set yet
            pass

        # map the configured model type onto a cell class
        self.cell_fn = {
            "lstm": rnn_cell.BasicLSTMCell,
            "gru": rnn_cell.GRUCell,
            "rnn": rnn_cell.BasicRNNCell
        }.get(self.model_type, None)
        if self.cell_fn is None:
            raise clgen.UserError("Unrecognized model type")

        # reset the graph when switching between training and inference
        tf.reset_default_graph()

        # corpus info: inference uses a 1x1 input, training uses the corpus
        # batch size and sequence length
        batch_size = 1 if infer else self.corpus.batch_size
        seq_length = 1 if infer else self.corpus.seq_length
        vocab_size = self.corpus.vocab_size

        fs.mkdir(self.cache.path)

        # NOTE(review): the same cell object is repeated for every layer, and
        # state_is_tuple is passed to whichever cell class was selected -
        # confirm both are supported by the TensorFlow version in use
        cell = self.cell_fn(self.rnn_size, state_is_tuple=True)
        self.cell = cell = rnn_cell.MultiRNNCell([cell] * self.num_layers,
                                                 state_is_tuple=True)
        # placeholders for the input token indices and their targets
        self.input_data = tf.placeholder(tf.int32, [batch_size, seq_length])
        self.targets = tf.placeholder(tf.int32, [batch_size, seq_length])
        self.initial_state = self.cell.zero_state(batch_size, tf.float32)

        scope_name = 'rnnlm'
        with tf.variable_scope(scope_name):
            # output projection from rnn_size to the vocabulary
            softmax_w = tf.get_variable("softmax_w",
                                        [self.rnn_size, vocab_size])
            softmax_b = tf.get_variable("softmax_b", [vocab_size])

            with tf.device("/cpu:0"):
                embedding = tf.get_variable("embedding",
                                            [vocab_size, self.rnn_size])
                # look up the embeddings, then split into seq_length inputs
                # (positional tf.split arguments: dimension, count, value)
                inputs = tf.split(
                    1, seq_length,
                    tf.nn.embedding_lookup(embedding, self.input_data))
                inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

        def loop(prev, _):
            # project the previous output onto the vocabulary, pick the
            # argmax symbol and return its embedding as the next input
            prev = tf.matmul(prev, softmax_w) + softmax_b
            prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
            return tf.nn.embedding_lookup(embedding, prev_symbol)

        # the loop function is only installed for inference
        outputs, last_state = seq2seq.rnn_decoder(
            inputs,
            self.initial_state,
            cell,
            loop_function=loop if infer else None,
            scope=scope_name)
        output = tf.reshape(tf.concat(1, outputs), [-1, self.rnn_size])
        self.logits = tf.matmul(output, softmax_w) + softmax_b
        self.probs = tf.nn.softmax(self.logits)
        # per-example loss with a weight of one for every position
        loss = seq2seq.sequence_loss_by_example(
            [self.logits], [tf.reshape(self.targets, [-1])],
            [tf.ones([batch_size * seq_length])], vocab_size)
        # summed loss divided by batch size and sequence length
        self.cost = tf.reduce_sum(loss) / batch_size / seq_length
        self.final_state = last_state
        # non-trainable variables for the learning rate and epoch counter
        self.learning_rate = tf.Variable(0.0, trainable=False)
        self.epoch = tf.Variable(0, trainable=False)
        tvars = tf.trainable_variables()
        # clip the gradients by global norm before applying them
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                          self.grad_clip)
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))

        # set model status
        # NOTE(review): no return statement is visible after this line even
        # though the docstring promises the module - confirm one follows
        self.tensorflow_state = infer

        return tf
Пример #23
0
    def __init__(self, corpus: clgen.Corpus, **opts):
        """
        Instantiate model.

        The supplied options are checked against DEFAULT_MODEL_OPTS and
        merged over a copy of those defaults. The model hash and cache are
        derived from the corpus and the merged options. If the cache already
        holds a META file, the stored stats are restored and the remaining
        metadata must match the freshly computed metadata; otherwise the
        metadata is flushed.

        Parameters
        ----------
        corpus : clgen.Corpus
            Corpus instance.
        **opts
            Training options.

        Raises
        ------
        clgen.UserError
            If an option key is not present in DEFAULT_MODEL_OPTS.
        clgen.InternalError
            If the computed metadata does not match the cached META file.
        """
        assert(isinstance(corpus, clgen.Corpus))

        def _hash(corpus: clgen.Corpus, opts: dict) -> str:
            """ compute model hash """
            relevant = deepcopy(opts)
            # creation time and number of epochs do not contribute to the hash
            del relevant["created"]
            del relevant["train_opts"]["epochs"]
            return crypto.sha1_list(corpus.hash, *types.dict_values(relevant))

        # Validate options: complain about the first unsupported key
        unsupported = [key for key in opts if key not in DEFAULT_MODEL_OPTS]
        if unsupported:
            raise clgen.UserError(
                "Unsupported model option '{}'. Valid keys: {}".format(
                    unsupported[0],
                    ','.join(sorted(DEFAULT_MODEL_OPTS.keys()))))

        # set properties
        self.opts = types.update(deepcopy(DEFAULT_MODEL_OPTS), opts)
        self.corpus = corpus
        self.hash = _hash(self.corpus, self.opts)
        self.cache = clgen.mkcache("model", f"{corpus.language}-{self.hash}")

        log.debug("model", self.hash)

        # fresh stats; replaced below when a cached META file exists
        self.stats = {
            "epoch_times": [],
            "epoch_costs": [],
            "epoch_batches": []
        }
        meta = deepcopy(self.to_json())

        if not self.cache.get("META"):
            # nothing cached yet: write out the metadata
            self._flush_meta()
        else:
            cached_meta = jsonutil.read_file(self.cache["META"])
            self.stats = cached_meta["stats"]  # restore stats

            # Strip the fields which are not compared. The cached copy may
            # lack them; the computed copy must contain them.
            cached_meta.pop("created", None)
            del meta["created"]

            cached_meta["corpus"].pop("created", None)
            del meta["corpus"]["created"]

            cached_meta.pop("stats", None)
            del meta["stats"]

            cached_meta["train_opts"].pop("epochs", None)
            del meta["train_opts"]["epochs"]

            if meta != cached_meta:
                log.error("Computed META:", jsonutil.format_json(meta))
                raise clgen.InternalError(
                    "metadata mismatch in model %s" % self.cache["META"])
Пример #24
0
    def __init__(self, sampler_opts: dict, kernel_opts: dict):
        """
        Instantiate a sampler.

        Both option dictionaries are validated against their DEFAULT_*
        counterparts and merged over copies of those defaults. The sampler
        hash, language, start text and preprocess options are then derived
        from the merged options.

        Parameters
        ----------
        sampler_opts : dict
            Sampler options.
        kernel_opts : dict
            Kernel options.

        Raises
        ------
        clgen.UserError
            If either dictionary contains a key that is missing from its
            defaults.
        """
        def _hash(sampler_opts: dict, kernel_opts: dict) -> str:
            # we don't consider the number of samples in the ID
            hashed_opts = deepcopy(sampler_opts)
            for ignored in ("min_samples", "min_kernels", "created"):
                del hashed_opts[ignored]

            # checksum over the sorted, stringified option values
            values = [str(value) for value in hashed_opts.values()]
            values.extend(str(value) for value in kernel_opts.values())
            values.sort()
            return crypto.sha1_str("".join(values))

        # FIXME(polyglot):
        def _start_text(lang: clgen.Language, args: Union[List[str], None],
                        start_text: str):
            # only OpenCL derives its start text from an argument spec
            if lang != clgen.Language.OPENCL:
                return start_text or ""
            if args is None:
                return "__kernel void A("
            return serialize_opencl_argspec(args)

        assert (type(sampler_opts) is dict)
        assert (type(kernel_opts) is dict)

        # Validate options: complain about the first unsupported key
        bad_sampler_keys = [key for key in sampler_opts.keys()
                            if key not in DEFAULT_SAMPLER_OPTS]
        if bad_sampler_keys:
            raise clgen.UserError(
                "Unsupported sampler option '{}'. Valid keys: {}".format(
                    bad_sampler_keys[0],
                    ','.join(sorted(DEFAULT_SAMPLER_OPTS.keys()))))

        bad_kernel_keys = [key for key in kernel_opts.keys()
                           if key not in DEFAULT_KERNELS_OPTS]
        if bad_kernel_keys:
            raise clgen.UserError(
                "Unsupported kernels option '{}'. Valid keys: {}".format(
                    bad_kernel_keys[0],
                    ','.join(sorted(DEFAULT_KERNELS_OPTS.keys()))))

        # set properties: user options layered over copies of the defaults
        merged_sampler = deepcopy(DEFAULT_SAMPLER_OPTS)
        self.sampler_opts = types.update(merged_sampler, sampler_opts)
        merged_kernels = deepcopy(DEFAULT_KERNELS_OPTS)
        self.kernel_opts = types.update(merged_kernels, kernel_opts)

        self.hash = _hash(self.sampler_opts, self.kernel_opts)

        # NOTE(review): this reads the caller's kernel_opts, not the merged
        # self.kernel_opts, so any default language is not applied here -
        # confirm that this is intended.
        self.language = clgen.Language.from_str(kernel_opts.get("language"))

        argspec = self.kernel_opts.get("args", [])
        seed_text = self.kernel_opts.get("start_text", "")
        self.start_text = _start_text(self.language, argspec, seed_text)
        # pop "start_text" option (the hash above was computed with it)
        del self.kernel_opts["start_text"]

        # options to pass to preprocess_db()
        self.preprocess_opts = {
            "use_gpuverify": self.sampler_opts["gpuverify"]
        }
Пример #25
0
def github(db_path: str, github_username: str, github_pw: str,
           github_token: str) -> None:
    """
    Download all of the OpenCL on GitHub (!)

    Searches GitHub once per query term, processes every repository in the
    results, and for each new or modified repository processes every entry
    of the git tree of its default branch. Errors raised while processing an
    individual file are counted in the global ``errors_counter`` and do not
    abort the run.

    Shortcomings of this approach:
        * Only includes exclusively OpenCL files, no inline strings.
        * Occasionally (< 1%) can't find headers to include.

    Arguments:
        db_path (str): Dataset path.
        github_username (str): Authorization.
        github_pw (str): Authorization.
        github_token (str): Authorization.

    Raises:
        clgen.UserError: If the database is not a GitHub database.
    """
    global errors_counter

    g = Github(github_username, github_pw)
    db = dbutil.connect(db_path)

    try:
        # is_github must be *called* with the connection; testing the bare
        # function object is always truthy and would disable this check.
        if not dbutil.is_github(db):
            raise clgen.UserError("not a GitHub database")

        handle_repo = partial(process_repo, g, db)

        # fetch the repositories to iterate over. Since opencl isn't
        # treated as a first-class language by GitHub, we can't use the
        # 'language=' keyword for queries, so instead we cast a much
        # wider net and filter the results afterwards.
        query_terms = [
            'opencl',
            'cl',
            'khronos',
            'gpu',
            'gpgpu',
            'cuda',
            'amd',
            'nvidia',
            'heterogeneous'
        ]
        for query in query_terms:
            # forks are okay - we use checksums to ensure uniqueness in
            # final dataset
            repos = g.search_repositories(query + ' fork:true sort:stars')

            for repo in repos:
                repo_modified = handle_repo(repo)

                # do nothing unless the repo is new or modified
                if not repo_modified:
                    continue

                handle_file = partial(process_file, g, github_token, db, repo)

                # iterate over the entire git tree of the repo's default
                # branch (usually 'master'). If a file ends with the .cl
                # extension, check to see if we already have it, else
                # download it
                try:
                    branch = repo.default_branch
                    tree_iterator = repo.get_git_tree(
                        branch, recursive=True).tree
                    for f in tree_iterator:
                        try:
                            handle_file(f)
                        except Exception:
                            # best-effort: count the failure and carry on
                            # with the next file
                            errors_counter += 1
                except GithubException:
                    # do nothing in case of error (such as an empty repo)
                    pass

        print_counters()
        print("\n\ndone.")
    finally:
        # release the connection on failure as well as on success
        db.close()
Пример #26
0
    def _init_tensorflow(self, infer: bool=False) -> 'tf':
        """
        Build the TensorFlow graph of the model for training or sampling.

        The tensorflow graph is different for training and inference, so the
        default graph is reset and rebuilt on every call.

        Despite the method name, nothing is imported here: `tf`, `rnn`,
        `seq2seq`, `Dense`, `SeqEncoder`, `encoder_to_latent`, `sample` and
        `latent_to_decoder` are all taken from the enclosing scope.

        The graph is wired as: feed placeholders -> FIFO queue -> sequence
        encoder -> latent mean / log-variance -> sampled latent -> decoder
        initial state -> decoder with a Dense(vocab_size) output layer. The
        resulting tensors and ops (e.g. `self.enqueue_op`, `self.probs`,
        `self.cost`, `self.KL_obj`, `self.train_op`) are stored on `self`.

        Parameters
        ----------
        infer : bool
            If True, initialize model for inference: batch size and sequence
            length are both 1. If False, initialize model for training: both
            are taken from the corpus. Within this method `infer` affects
            nothing else.

        Returns
        -------
        module
            TensorFlow module.

        Raises
        ------
        clgen.UserError
            If `self.model_type` is not one of "lstm", "gru" or "rnn".
        """
        # quiet tensorflow. See: https://github.com/tensorflow/tensorflow/issues/1258
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

        # map the configured model type to its cell constructor
        self.cell_fn = {
            "lstm": rnn.BasicLSTMCell,
            "gru": rnn.GRUCell,
            "rnn": rnn.BasicRNNCell
        }.get(self.model_type, None)
        if self.cell_fn is None:
            raise clgen.UserError("Unrecognized model type")

        # reset the graph when switching between training and inference
        tf.reset_default_graph()

        # corpus info:
        batch_size = 1 if infer else self.corpus.batch_size
        seq_length = 1 if infer else self.corpus.seq_length
        vocab_size = self.corpus.vocab_size

        # one separate cell instance per layer, stacked into a multi-layer cell
        cells_lst = [self.cell_fn(self.rnn_size, state_is_tuple=True) for _ in range(self.num_layers)]
        self.cell = rnn.MultiRNNCell(cells_lst, state_is_tuple=True)

        with tf.device("/cpu:0"):
            # Inputs: feed placeholders which are pushed onto the queue below
            self.encoder_input = tf.placeholder(tf.int32, [batch_size, seq_length])
            self.decoder_input = tf.placeholder(tf.int32, [batch_size, seq_length])
            self.target_weights = tf.placeholder(tf.int32, [batch_size, seq_length])
            self.lengths = tf.placeholder(tf.int32, [batch_size])

            # queue of at most 4 (encoder input, decoder input, target
            # weights, lengths) examples; filled by running self.enqueue_op
            self.q = tf.FIFOQueue(capacity=4,
                dtypes=[tf.int32, tf.int32, tf.int32, tf.int32],
                shapes=[tf.TensorShape([batch_size, seq_length]), 
                    tf.TensorShape([batch_size, seq_length]),
                    tf.TensorShape([batch_size, seq_length]),
                    tf.TensorShape([batch_size])])
            self.enqueue_op = self.q.enqueue((self.encoder_input, self.decoder_input, self.target_weights, self.lengths))

            # the rest of the graph reads its inputs from the queue, not
            # directly from the placeholders
            next_example = self.q.dequeue()

            self.inputs = next_example[0]
            self.dec_inp = next_example[1]
            # weights are fed as int32 and converted to float here
            self.tweights = tf.to_float(next_example[2])
            self.lens = next_example[3]
        

        scope_name = 'rnnlm'
        with tf.variable_scope(scope_name):
            # NOTE(review): softmax_w and softmax_b are created here but are
            # not referenced again in this method; the decoder below uses
            # Dense(vocab_size) as its output layer.
            softmax_w = tf.get_variable("softmax_w", [self.rnn_size, vocab_size])
            softmax_b = tf.get_variable("softmax_b", [vocab_size])

            with tf.device("/cpu:0"):
                # embed the decoder input tokens
                embedding_dec = tf.get_variable("embedding_dec", [vocab_size, self.rnn_size])
                dec_inp2 = tf.nn.embedding_lookup(embedding_dec, self.dec_inp)

        # encode the input sequences into an encoder state
        encoder = SeqEncoder(self.model_type, self.rnn_size, self.num_layers, batch_size, vocab_size)
        encoder_state = encoder.encode(self.inputs, self.lens)

        # encoder state -> latent mean / log-variance -> sampled latent ->
        # decoder initial state. The literal 32 is passed to all three
        # helpers - presumably the latent dimension, TODO confirm.
        self.mean_latent, self.logvar_latent = encoder_to_latent(encoder_state, self.rnn_size, 32, self.num_layers, tf.float32)
        self.latent, self.KL_obj, self.KL_cost = sample(self.mean_latent, self.logvar_latent, 32)
        self.decoder_initial_state = latent_to_decoder(self.latent, self.rnn_size, 32, self.num_layers, tf.float32)


        # NOTE(review): every layer state is wrapped in an LSTMStateTuple
        # regardless of self.model_type - confirm this is valid for the
        # "gru" and "rnn" cell types.
        decoder_initial_state2 = tuple([rnn.LSTMStateTuple(*single_layer_state) for single_layer_state in self.decoder_initial_state])

        # a TrainingHelper is used in both training and inference mode
        helper = seq2seq.TrainingHelper(dec_inp2, self.lens, time_major=False)
        decoder = seq2seq.BasicDecoder(self.cell, helper, decoder_initial_state2, Dense(vocab_size))
        self.final_outputs, self.final_state = seq2seq.dynamic_decode(decoder, output_time_major=False, impute_finished=True, swap_memory=True, scope='rnnlm')

        self.final_out = self.final_outputs.rnn_output

        self.probs = tf.nn.softmax(self.final_out)
        # the loss targets are the dequeued encoder inputs, weighted by
        # the dequeued target weights
        self.cost = seq2seq.sequence_loss(self.final_out, self.inputs, self.tweights)

        # non-trainable variables holding the learning rate and the epoch
        self.learning_rate = tf.Variable(0.0, trainable=False)
        self.epoch = tf.Variable(0, trainable=False)
        tvars = tf.trainable_variables()
        # the optimized objective is cost + KL_obj, with gradients clipped by
        # global norm. aggregation_method = 2 is presumably
        # tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N - TODO confirm.
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.cost + self.KL_obj, tvars, aggregation_method = 2), self.grad_clip)
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))

        return tf
Пример #27
0
def fetch_repos(db_path: Path, indir: Path, lang: clgen.Language) -> None:
    """
    Import source files from local git checkouts into a GitHub database.

    Every entry of `indir` is treated as a git checkout. For each checkout
    the remote URL and the date of the HEAD commit are read with git, the
    matching row of the Repositories table is replaced, and every file whose
    name ends with one of the file extensions of `lang` (excluding paths
    containing '.git/') is inserted into the ContentFiles and ContentMeta
    tables, keyed by the SHA1 of its contents as returned by
    inline_fs_headers(). Checkouts whose commit date cannot be parsed are
    logged and skipped, as are files which are not valid UTF-8.

    Arguments:
        db_path (Path): Dataset path.
        indir (Path): Directory containing the git checkouts.
        lang (clgen.Language): Language of the files to import.

    Raises:
        clgen.UserError: If the database is not a GitHub database.
    """
    from shlex import quote

    db = dbutil.connect(db_path)
    try:
        if not dbutil.is_github(db):
            raise clgen.UserError("not a GitHub database")

        c = db.cursor()

        # find(1) name filter, built once. The alternatives are grouped in
        # parentheses so that '-type f' applies to every extension and not
        # just the first one.
        name_str = " -o ".join(
            [f"-name '*{ext}'" for ext in clgen.file_extensions(lang)])
        if name_str:
            name_str = "\\( " + name_str + " \\)"

        for directory in fs.ls(indir, abspaths=True):
            # hacky hardcoded interpretation of `git remote -v`: the URL is
            # the second tab-separated field of the first output line
            gitdir = fs.path(directory, ".git")
            output = subprocess.check_output(
                ["git", "--git-dir", gitdir, "remote", "-v"],
                universal_newlines=True)
            url = output.split("\n")[0].split("\t")[1].split(" ")[0]
            name = fs.basename(directory)

            # date of the HEAD commit. Paths are shell-quoted so that
            # directories containing spaces or metacharacters work.
            quoted_gitdir = quote(str(gitdir))
            output = subprocess.check_output(
                f"git --git-dir {quoted_gitdir} rev-list "
                "--format=format:'%ai' --max-count=1 "
                f"$(git --git-dir {quoted_gitdir} rev-parse HEAD) "
                "| tail -n1",
                shell=True,
                universal_newlines=True)
            try:
                updated_at = dateutil.parser.parse(output)
            except ValueError:
                log.error(f"failed to process {name} {url}")
                continue

            # The repository row is always replaced.
            # TODO: skip repositories whose stored updated_at timestamp is
            # not older than the HEAD commit.
            c.execute("DELETE FROM Repositories WHERE url=?", (url, ))
            c.execute("INSERT INTO Repositories VALUES(?,?,?,?,?,?,?,?,?)",
                      (url, "<unknown>", name, 0, 0, 0, 0,
                       updated_at, updated_at))

            # list candidate files; '-F' makes '.git/' a literal string so
            # that paths such as 'digit/' are not filtered out by accident
            quoted_directory = quote(str(directory))
            output = subprocess.check_output(
                f"find {quoted_directory} -type f {name_str} "
                "| grep -vF '.git/' || true",
                shell=True,
                universal_newlines=True)
            files = [x.strip() for x in output.split("\n") if x.strip()]

            # nothing to import
            if not files:
                continue

            log.verbose("processing", len(files), "files in", name)
            for path in files:
                relpath = path[len(directory) + 1:]
                try:
                    contents = inline_fs_headers(path, [], lang=lang)
                    sha = crypto.sha1_str(contents)
                    c.execute(
                        'INSERT OR IGNORE INTO ContentFiles VALUES(?,?)',
                        (sha, contents))
                    c.execute(
                        "INSERT OR IGNORE INTO ContentMeta VALUES(?,?,?,?,?)",
                        (sha, relpath, url, sha, len(contents)))
                except UnicodeDecodeError:
                    log.warning("non UTF-8 file", path)

            db.commit()
            c = db.cursor()

        # Repositories without matching files skip the per-directory commit
        # above; commit once more so that their rows are not lost.
        db.commit()
    finally:
        db.close()
Пример #28
0
    def __init__(self, contentid: str, path: str = None, **opts):
        """
        Instantiate a corpus.

        Any corpus files missing from the cache (kernels database, corpus
        text, atomizer) are created, so instantiating a new corpus may take
        some time. If anything fails during that step, the corpus files are
        removed again and the error is re-raised.

        Arguments:
            contentid (str): ID of corpus content.
            path (str, optional): Path to corpus.
            **opts: Keyword options.

        Raises:
            clgen.UserError: If an option key is not in DEFAULT_CORPUS_OPTS,
                or if `path` is given but is not a directory.
        """
        def _init_error(err: Exception) -> None:
            """ tidy up in case of error """
            log.error("corpus creation failed. Deleting corpus files")
            candidates = (
                fs.path(self.contentcache.path, "kernels.db"),
                fs.path(self.cache.path, "corpus.txt"),
                fs.path(self.cache.path, "tensor.npy"),
                fs.path(self.cache.path, "atomizer.pkl"),
            )
            for stale in candidates:
                if not fs.exists(stale):
                    continue
                log.info("removing", stale)
                fs.rm(stale)
            raise err

        # Validate options: complain about the first unsupported key
        unsupported = [key for key in opts.keys()
                       if key not in DEFAULT_CORPUS_OPTS]
        if unsupported:
            raise clgen.UserError(
                "Unsupported corpus option '{}'. Valid keys: {}".format(
                    unsupported[0],
                    ','.join(sorted(DEFAULT_CORPUS_OPTS.keys()))))

        # user options layered over a copy of the defaults
        self.opts = deepcopy(DEFAULT_CORPUS_OPTS)
        clgen.update(self.opts, opts)
        self.contentid = contentid
        self.hash = self._hash(contentid, self.opts)
        self.cache = Cache(fs.path("corpus", self.hash))
        self.contentcache = Cache(fs.path("contentfiles", contentid))
        self.kernels_db = self.contentcache['kernels.db']

        log.debug("corpus {hash}".format(hash=self.hash))

        try:
            if path is not None and not fs.isdir(path):
                raise clgen.UserError(
                    "Corpus path '{}' is not a directory".format(path))

            # create kernels database if necessary
            if path is not None and not self.contentcache["kernels.db"]:
                self._create_kernels_db(path, self.opts["encoding"])
                assert (self.contentcache["kernels.db"])

            # create corpus text if not exists
            if not self.cache["corpus.txt"]:
                self._create_txt()
                assert (self.cache["corpus.txt"])

            # create atomizer if needed, else load the cached one
            if not self.cache["atomizer.pkl"]:
                self._create_atomizer(self.opts["vocabulary"])
            else:
                self._load_atomizer()
                assert (self.cache["atomizer.pkl"])
        except Exception as e:
            _init_error(e)