Example #1
def clangformat(src: str, id: str = 'anon', timeout: int = 60) -> str:
    """
    Enforce code style on source file.

    Parameters
    ----------
    src : str
        Source code.
    id : str, optional
        Name of source file.
    timeout : int, optional
        Seconds to wait before the clang-format process is killed.

    Returns
    -------
    str
        Styled source.

    Raises
    ------
    ClangFormatException
        If clang-format exits with an error.
    """
    cmd = [
        "timeout", "-s9",
        str(timeout), native.CLANG_FORMAT,
        '-style={}'.format(json.dumps(clangformat_config))
    ]
    process = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
    stdout, stderr = process.communicate(src.encode('utf-8'))

    if stderr:
        log.error(stderr.decode('utf-8'))
    if process.returncode != 0:
        raise ClangFormatException(stderr.decode('utf-8'))

    return stdout.decode('utf-8')
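The timeout -s9 prefix and the native.CLANG_FORMAT path are clgen-specific. For comparison, a minimal standalone sketch of the same technique (piping source through clang-format), assuming a clang-format binary on PATH and a hypothetical STYLE dict in place of clangformat_config:

import json
import subprocess

# Hypothetical style options; any clang-format settings could go here.
STYLE = {"BasedOnStyle": "Google", "ColumnLimit": 80}


def format_source(src: str, timeout: int = 60) -> str:
    """Pipe src through clang-format and return the styled text."""
    proc = subprocess.run(
        ["clang-format", "-style={}".format(json.dumps(STYLE))],
        input=src.encode("utf-8"),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        timeout=timeout)  # raises subprocess.TimeoutExpired on a hang
    if proc.returncode != 0:
        raise RuntimeError(proc.stderr.decode("utf-8"))
    return proc.stdout.decode("utf-8")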
Example #2
def preprocess_inplace(paths: List[str],
                       max_num_workers: int = cpu_count(),
                       attempt: int = 1) -> None:
    """
    Preprocess a list of files in place.

    Arguments:
        paths (List[str]): List of paths.
        max_num_workers (int, optional): Number of processes to spawn.
    """
    if attempt >= MAX_OS_RETRIES:
        raise clgen.InternalError("Failed to process files")

    num_workers = min(len(paths), max_num_workers)

    try:
        log.info('spawned', num_workers, 'worker threads to process',
                 len(paths), 'files ...')
        with clgen.terminating(Pool(num_workers)) as pool:
            pool.map(_preprocess_inplace_worker, paths)
    except OSError as e:
        log.error(e)

        # Try again with fewer threads.
        # See: https://github.com/ChrisCummins/clgen/issues/64
        max_num_workers = max(int(max_num_workers / 2), 1)
        preprocess_inplace(paths,
                           max_num_workers=max_num_workers,
                           attempt=attempt + 1)
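The recursion above halves the worker count whenever the OS refuses to spawn more processes (clgen issue #64). A minimal iterative sketch of the same back-off pattern, with a hypothetical work() function standing in for _preprocess_inplace_worker:

from multiprocessing import Pool, cpu_count
from typing import List


def work(path: str) -> None:
    """Hypothetical stand-in for _preprocess_inplace_worker."""
    pass


def map_with_backoff(paths: List[str], max_num_workers: int = cpu_count(),
                     max_attempts: int = 5) -> None:
    num_workers = max(min(len(paths), max_num_workers), 1)
    for _ in range(max_attempts):
        try:
            with Pool(num_workers) as pool:
                pool.map(work, paths)
            return
        except OSError:
            # too many processes for the OS; retry with half as many
            num_workers = max(num_workers // 2, 1)
    raise RuntimeError(f"failed to process files after {max_attempts} attempts")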
Example #3
def clangformat_ocl(src: str, id: str = 'anon') -> str:
    """
    Enforce code style on OpenCL file.

    Arguments:
        src (str): OpenCL source.
        id (str, optional): Name of OpenCL source.

    Returns:
        str: Styled source.

    Raises:
        ClangFormatException: If clang-format exits with an error.
    """
    cmd = [
        native.CLANG_FORMAT, '-style={}'.format(json.dumps(clangformat_config))
    ]
    process = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
    stdout, stderr = process.communicate(src.encode('utf-8'))

    if stderr:
        log.error(stderr.decode('utf-8'))
    if process.returncode != 0:
        raise ClangFormatException(stderr.decode('utf-8'))

    return stdout.decode('utf-8')
Example #4
def _init_error(err: Exception, files_to_rm: List[str] = []) -> None:
    """ tidy up in case of error """
    log.error("corpus creation failed. Deleting corpus files")
    for path in files_to_rm:
        if fs.exists(path):
            log.info("removing", path)
            fs.rm(path)
    raise err
Example #5
def _process_file(path: str, **kwargs):
    buf = StringIO()
    features(path=path, file=buf, **kwargs)
    ret = buf.getvalue()
    try:
        # last line is empty:
        lines = ret.split('\n')[:-1]
        # first two cols are ignored (path and kernel names):
        parse = lambda l: np.array([float(x) for x in l.split(',')[2:]])
        return [parse(line) for line in lines]
    except IndexError:
        log.error("lines:", lines)
        raise FeatureExtractionError
Example #6
def _init_error(err: Exception) -> None:
    """ tidy up in case of error """
    log.error("corpus creation failed. Deleting corpus files")
    paths = [
        fs.path(self.contentcache.path, "kernels.db"),
        fs.path(self.cache.path, "corpus.txt"),
        fs.path(self.cache.path, "tensor.npy"),
        fs.path(self.cache.path, "atomizer.pkl")
    ]
    for path in paths:
        if fs.exists(path):
            log.info("removing", path)
            fs.rm(path)
    raise err
Example #7
def get_features(code: str) -> np.array:
    """
    Get features for code.

    Arguments:
        code (str): Source code.

    Returns:
        np.array: Feature values.
    """
    with NamedTemporaryFile() as outfile:
        outfile.write(code.encode("utf-8"))
        outfile.seek(0)
        f = features.to_np_arrays([outfile.name])
    if len(f) != 1:
        log.error("features:", f)
        raise FeaturesError("code contains more than one kernel")
    return f[0]
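The round trip through NamedTemporaryFile works because seek(0) flushes Python's write buffer, so the external feature extractor sees the bytes when it reopens the file by name (POSIX only; Windows does not allow the second open). A sketch of the same pattern with an explicit flush(), using a hypothetical extract() in place of features.to_np_arrays:

from tempfile import NamedTemporaryFile
from typing import List


def extract(path: str) -> List[str]:
    """Hypothetical stand-in for features.to_np_arrays([path])."""
    with open(path) as infile:
        return [infile.read()]


def features_for(code: str) -> List[str]:
    with NamedTemporaryFile(suffix=".cl") as outfile:
        outfile.write(code.encode("utf-8"))
        outfile.flush()  # make the bytes visible to readers opening the file by name
        return extract(outfile.name)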
Example #8
def preprocess_inplace(paths: List[str],
                       max_num_workers: int = cpu_count(),
                       max_attempts: int = 100,
                       attempt: int = 1) -> None:
    """
    Preprocess a list of files in place.

    Parameters
    ----------
    paths : List[str]
        List of paths.
    max_num_workers : int, optional
        Number of processes to spawn.
    max_attempts : int, optional
        In case of an OSError or TimeoutError, this number of attempts will be
        made.
    """
    if attempt > max_attempts:
        raise clgen.InternalError(
            f"Failed to process files after {max_attempts} attempts")
    elif attempt > 1:
        log.warning("preprocess attempt #.", attempt)

    num_workers = min(len(paths), max_num_workers)

    try:
        log.info('spawned', num_workers, 'worker threads to process',
                 len(paths), 'files ...')
        with clgen.terminating(Pool(num_workers)) as pool:
            pool.map(_preprocess_inplace_worker, paths)
    except (OSError, TimeoutError) as e:
        log.error(e)

        # Try again with fewer threads.
        # See: https://github.com/ChrisCummins/clgen/issues/64
        max_num_workers = max(int(max_num_workers / 2), 1)
        preprocess_inplace(paths,
                           max_num_workers=max_num_workers,
                           attempt=attempt + 1,
                           max_attempts=max_attempts)
Example #9
def get_kernel_features(code: str, **kwargs) -> np.array:
    """
    Get features for code.

    Parameters
    ----------
    code : str
        Source code.
    **kwargs
        Arguments to features.features()

    Returns
    -------
    np.array
        Feature values.
    """
    with NamedTemporaryFile() as outfile:
        outfile.write(code.encode("utf-8"))
        outfile.seek(0)
        f = features.to_np_arrays([outfile.name], **kwargs)
    if len(f) != 1:
        log.error("features:", f)
        raise FeaturesError("code contains more than one kernel")
    return f[0]
Example #10
def _preprocess_db(db_path: str,
                   max_num_workers: int = cpu_count(),
                   max_attempts: int = 100,
                   attempt: int = 1,
                   **preprocess_opts) -> None:
    """
    Preprocess OpenCL dataset.

    Parameters
    ----------
    db_path : str
        OpenCL kernels dataset.
    max_num_workers : int, optional
        Number of processes to spawn.
    max_attempts : int, optional
        In case of an OSError or TimeoutError, this number of attempts will be
        made.
    """
    if attempt > max_attempts:
        raise clgen.InternalError(
            f"failed to preprocess files after {max_attempts} attempts")

    log.verbose("determining jobs")

    contentfiles = set(dbutil.kernel_ids(db_path, "ContentFiles"))
    preprocessedfiles = set(dbutil.kernel_ids(db_path, "PreprocessedFiles"))

    ncontentfiles = len(contentfiles)
    npreprocessedfiles = len(preprocessedfiles)

    todo = contentfiles - preprocessedfiles
    ntodo = len(todo)

    # check we have something to do
    if not ntodo:
        return

    todo_ratio = ntodo / ncontentfiles

    log.info("{ntodo} ({todo_ratio:.1%}) samples need preprocessing".format(
        **vars()))

    log.verbose("creating jobs")

    # Determine if we need to inline kernels when creating jobs
    db = sqlite3.connect(db_path)
    c = db.cursor()
    c.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name='ContentMeta';"
    )
    meta_table = c.fetchone()
    c.close()
    db.close()
    if meta_table:
        get_kernel = lambda kid: dbutil.get_inlined_kernel(
            db_path, kid, lang=preprocess_opts["lang"])
    else:
        get_kernel = lambda kid: dbutil.get_kernel(
            db_path, kid, table="ContentFiles")

    # create jobs
    jobs = [{
        "id": kid,
        "src": get_kernel(kid),
        "preprocess_opts": preprocess_opts,
    } for kid in todo]

    random.shuffle(jobs)

    # split size
    worker_njobs = math.ceil(ntodo / max_num_workers)

    # producer-consumer queue
    queue = Queue(maxsize=128)

    log.verbose(f"assigning {ntodo} jobs to {max_num_workers} threads")

    try:
        # our worker threads. these busy little bees will do the heavy lifting
        # of preprocessing the contentfiles, pushing their results onto
        # the queue
        producers = [
            PreprocessWorker(jobs[i:i + worker_njobs], queue)
            for i in range(0, ntodo, worker_njobs)
        ]

        # fly, my pretties, fly!
        for producer in producers:
            producer.start()

        # consume the results from the worker threads from the main thread
        for i in progressbar.ProgressBar()(range(ntodo)):
            # pull a fresh result from the queue (block if necessary)
            try:
                result = queue.get(timeout=90)
            except QueueEmpty as e:
                raise TimeoutError('failed to fetch result after 90 seconds. '
                                   'something went wrong') from e

            # insert result into database
            db = dbutil.connect(db_path)
            c = db.cursor()
            c.execute("INSERT INTO PreprocessedFiles VALUES(?,?,?)",
                      (result["id"], result["status"], result["contents"]))
            c.close()
            db.commit()
            db.close()

        for producer in producers:
            producer.join()

    except (OSError, TimeoutError) as e:
        log.error(e)

        if attempt > 2 and not i:
            log.warning("no progress has been made since previous attempt. "
                        "I'm not going to try another attempt.")
            return

        # Try again with fewer threads.
        # See: https://github.com/ChrisCummins/clgen/issues/64
        max_num_workers = max(int(max_num_workers / 2), 1)
        _preprocess_db(db_path,
                       max_num_workers=max_num_workers,
                       attempt=attempt + 1,
                       max_attempts=max_attempts,
                       **preprocess_opts)
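Stripped of the database plumbing, _preprocess_db is a producer-consumer pipeline: worker threads push id/status/contents results onto a bounded queue and the main thread drains it, treating a 90-second stall as a timeout. A minimal sketch of that pattern, with a hypothetical process() doing the per-kernel work:

import math
import queue
import threading


def process(job):
    """Hypothetical per-job work; the real worker preprocesses one kernel."""
    return {"id": job, "status": 0, "contents": str(job)}


def run_jobs(jobs, num_workers: int = 4, timeout: int = 90):
    if not jobs:
        return []
    q = queue.Queue(maxsize=128)  # bounded, so producers block if the consumer lags

    def producer(chunk):
        for job in chunk:
            q.put(process(job))

    per_worker = math.ceil(len(jobs) / num_workers)
    producers = [threading.Thread(target=producer, args=(jobs[i:i + per_worker],))
                 for i in range(0, len(jobs), per_worker)]
    for p in producers:
        p.start()
    results = [q.get(timeout=timeout) for _ in range(len(jobs))]  # queue.Empty on stall
    for p in producers:
        p.join()
    return results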
Example #11
File: _fetch.py  Project: DhashS/clgen
def fetch_repos(db_path: Path, indir: Path, lang: clgen.Language) -> None:
    db = dbutil.connect(db_path)

    if not dbutil.is_github(db):
        raise clgen.UserError("not a GitHub database")

    c = db.cursor()

    for directory in fs.ls(indir, abspaths=True):
        # hacky hardcoded interpretation of `git remote -v`
        gitdir = fs.path(directory, ".git")
        output = subprocess.check_output(
            ["git", "--git-dir", gitdir, "remote", "-v"],
            universal_newlines=True)
        url = output.split("\n")[0].split("\t")[1].split(" ")[0]
        name = fs.basename(directory)

        output = subprocess.check_output(
            f"git --git-dir {gitdir} rev-list --format=format:'%ai' " +
            f"--max-count=1 $(git --git-dir {gitdir} rev-parse HEAD) | tail -n1",
            shell=True,
            universal_newlines=True)
        try:
            updated_at = dateutil.parser.parse(output)
        except ValueError:
            log.error(f"failed to process {name} {url}")
            continue

        c.execute("SELECT updated_at FROM Repositories WHERE url=?", (url, ))
        cached_updated_at = c.fetchone()

        # Do nothing unless updated timestamps don't match
        # if cached_updated_at and cached_updated_at[0] >= updated_at:
        #     log.verbose(name, "already in database")
        #     continue

        c.execute("DELETE FROM Repositories WHERE url=?", (url, ))
        c.execute("INSERT INTO Repositories VALUES(?,?,?,?,?,?,?,?,?)",
                  (url, "<unknown>", name, 0, 0, 0, 0, updated_at, updated_at))

        name_str = " -o ".join(
            [f"-name '*{ext}'" for ext in clgen.file_extensions(lang)])
        output = subprocess.check_output(
            f"find {directory} -type f {name_str} | grep -v '.git/' || true",
            shell=True,
            universal_newlines=True)
        files = [x.strip() for x in output.split("\n") if x.strip()]

        # nothing to import
        if not len(files):
            # log.verbose("no files in", name)
            continue

        log.verbose("processing", len(files), "files in", name)
        for path in files:
            relpath = path[len(directory) + 1:]
            try:
                contents = inline_fs_headers(path, [], lang=lang)
                sha = crypto.sha1_str(contents)
                c.execute('INSERT OR IGNORE INTO ContentFiles VALUES(?,?)',
                          (sha, contents))
                c.execute(
                    "INSERT OR IGNORE INTO ContentMeta VALUES(?,?,?,?,?)",
                    (sha, relpath, url, sha, len(contents)))
            except UnicodeDecodeError:
                log.warning("non UTF-8 file", path)

        db.commit()
        c = db.cursor()
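The shell pipeline above (rev-list --format='%ai' ... | tail -n1) recovers the timestamp of the last commit. As an alternative sketch without the shell, git log -1 --format=%aI prints the author date of HEAD in strict ISO 8601, which datetime.fromisoformat (Python 3.7+) parses directly:

import subprocess
from datetime import datetime


def last_commit_date(gitdir: str) -> datetime:
    """Return the author date of HEAD for the repository at gitdir."""
    out = subprocess.check_output(
        ["git", "--git-dir", gitdir, "log", "-1", "--format=%aI"],
        universal_newlines=True)
    return datetime.fromisoformat(out.strip())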
Example #12
    def __init__(self, corpus: clgen.Corpus, **opts):
        """
        Instantiate model.

        Parameters
        ----------
        corpus : clgen.Corpus
            Corpus instance.
        **opts
            Training options.
        """
        assert(isinstance(corpus, clgen.Corpus))

        def _hash(corpus: clgen.Corpus, opts: dict) -> str:
            """ compute model hash """
            hashopts = deepcopy(opts)
            del hashopts["created"]
            del hashopts["train_opts"]["epochs"]
            return crypto.sha1_list(corpus.hash, *types.dict_values(hashopts))

        # Validate options
        for key in opts:
            if key not in DEFAULT_MODEL_OPTS:
                raise clgen.UserError(
                    "Unsupported model option '{}'. Valid keys: {}".format(
                        key, ','.join(sorted(DEFAULT_MODEL_OPTS.keys()))))

        # set properties
        self.opts = types.update(deepcopy(DEFAULT_MODEL_OPTS), opts)
        self.corpus = corpus
        self.hash = _hash(self.corpus, self.opts)
        self.cache = clgen.mkcache("model", f"{corpus.language}-{self.hash}")

        log.debug("model", self.hash)

        # validate metadata against cache, and restore stats
        self.stats = {
            "epoch_times": [],
            "epoch_costs": [],
            "epoch_batches": []
        }
        meta = deepcopy(self.to_json())
        if self.cache.get("META"):
            cached_meta = jsonutil.read_file(self.cache["META"])
            self.stats = cached_meta["stats"]  # restore stats

            if "created" in cached_meta:
                del cached_meta["created"]
            del meta["created"]

            if "created" in cached_meta["corpus"]:
                del cached_meta["corpus"]["created"]
            del meta["corpus"]["created"]

            if "stats" in cached_meta:
                del cached_meta["stats"]
            del meta["stats"]

            if "epochs" in cached_meta["train_opts"]:
                del cached_meta["train_opts"]["epochs"]
            del meta["train_opts"]["epochs"]

            if meta != cached_meta:
                log.error("Computed META:", jsonutil.format_json(meta))
                raise clgen.InternalError(
                    "metadata mismatch in model %s" % self.cache["META"])
        else:
            self._flush_meta()
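_hash derives a content-addressable cache key from the corpus hash plus every option except the volatile fields (created and train_opts['epochs']), so training the same model for more epochs reuses the cache entry. crypto.sha1_list and types.dict_values are clgen helpers; a standalone sketch of the same idea, using a canonical JSON encoding rather than clgen's exact byte layout:

import hashlib
import json
from copy import deepcopy


def model_key(corpus_hash: str, opts: dict) -> str:
    """Deterministic key over corpus hash + options, minus volatile fields."""
    hashopts = deepcopy(opts)
    hashopts.pop("created", None)
    hashopts.get("train_opts", {}).pop("epochs", None)
    canonical = json.dumps(hashopts, sort_keys=True)  # stable key ordering
    return hashlib.sha1((corpus_hash + canonical).encode("utf-8")).hexdigest()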
Example #13
def preprocess_contentfiles(db_path: str,
                            max_num_workers: int = cpu_count(),
                            attempt: int = 1) -> None:
    """
    Preprocess OpenCL dataset.

    Arguments:
        db_path (str): OpenCL kernels dataset.
        max_num_workers (int, optional): Number of processes to spawn.
    """
    def _finalize(db_path, cache):
        """Tidy up after worker threads finish"""
        log.debug("worker finalize")

        db = dbutil.connect(db_path)
        c = db.cursor()

        # import results from worker threads
        for outpath in fs.ls(cache.path, abspaths=True):
            with open(outpath) as infile:
                for line in infile:
                    c.execute(
                        'INSERT OR REPLACE INTO PreprocessedFiles '
                        'VALUES(?,?,?)', json.loads(line))

        # write changes to database and remove cache
        db.commit()
        db.close()
        cache.empty()

    if attempt >= MAX_OS_RETRIES:
        raise clgen.InternalError("failed to preprocess files")

    num_contentfiles = dbutil.num_rows_in(db_path, 'ContentFiles')
    num_preprocessedfiles = dbutil.num_rows_in(db_path, 'PreprocessedFiles')
    log.info("{n} ({r:.1%}) files need preprocessing".format(
        n=num_contentfiles - num_preprocessedfiles,
        r=(num_contentfiles - num_preprocessedfiles) / num_contentfiles))

    # split into multiple jobs of a maximum size
    jobsize = min(512, num_contentfiles)
    numjobs = math.ceil(num_contentfiles / jobsize)
    for j, offset in enumerate(range(0, num_contentfiles, jobsize)):
        num_preprocessedfiles = dbutil.num_rows_in(db_path,
                                                   'PreprocessedFiles')
        num_workers = min(num_contentfiles, max_num_workers)
        files_per_worker = math.ceil(jobsize / num_workers)

        # temporary cache used for worker thread results
        cache = Cache("{pid}.preprocess".format(pid=os.getpid()))
        # each worker thread receives a range of database indices to preprocess,
        # and a JSON file to write results into
        jobs = [{
            "db_in": db_path,
            "db_index_range": (offset + i * files_per_worker,
                               offset + i * files_per_worker + files_per_worker),
            "json_out": fs.path(cache.path, "{i}.json".format(i=i))
        } for i in range(num_workers)]

        # spool up worker threads then finalize
        log.info('job {j} of {numjobs}: spawning {num_workers} worker threads '
                 'to process {jobsize} files ...'.format(**vars()))
        try:
            with clgen.terminating(Pool(num_workers)) as pool:
                pool.map(_preprocess_db_worker, jobs)
        except OSError as e:
            _finalize(db_path, cache)
            log.error(e)

            # Try again with fewer threads.
            # See: https://github.com/ChrisCummins/clgen/issues/64
            max_num_workers = max(int(max_num_workers / 2), 1)
            preprocess_contentfiles(db_path,
                                    max_num_workers=max_num_workers,
                                    attempt=attempt + 1)
        except Exception as e:
            _finalize(db_path, cache)
            raise e
        _finalize(db_path, cache)
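Each worker above receives a half-open range of database indices files_per_worker wide. A tiny sketch of that slicing arithmetic; as in the snippet, the last range may overshoot the end of the job, where workers simply find nothing to do:

import math
from typing import Iterator, Tuple


def index_ranges(offset: int, jobsize: int,
                 num_workers: int) -> Iterator[Tuple[int, int]]:
    """Yield (start, end) ranges that tile [offset, offset + jobsize)."""
    per_worker = math.ceil(jobsize / num_workers)
    for i in range(num_workers):
        yield (offset + i * per_worker, offset + (i + 1) * per_worker)


# e.g. list(index_ranges(0, 512, 3)) == [(0, 171), (171, 342), (342, 513)]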