Example #1
  def _get_assertion(session: session_t, lines: Iterable[str]) -> Union[
    None, Assertion]:
    clang_assertion = False
    strip = False

    for line in lines:
      if "assertion" in line.lower():
        if strip:
          if line.startswith("cldrive-harness"):
            msg = ":".join(line.split(":")[1:])
          else:
            msg = line
          msg = re.sub(r"^ *:[0-9]+: ", "", msg)
          if "Assertion `(null)' failed." in msg:
            msg = "Assertion `(null)' failed."
          elif "Assertion `' failed." in msg:
            msg = "Assertion `' failed."
          elif "Assertion `' failed." in msg:
            msg = "Assertion `' failed."
        elif clang_assertion:
          msg = ":".join(line.split(":")[3:])
        else:
          msg = line

        assertion = get_or_add(
            session, Assertion,
            sha1=crypto.sha1_str(msg), assertion=msg)
        return assertion
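Most of these examples funnel through a get_or_add helper that deduplicates rows on their sha1 column. A minimal sketch of that pattern, assuming a SQLAlchemy session (the project's actual helper may differ in detail):

def get_or_add(session, model, **kwargs):
    # Return the first row matching kwargs; create and add one if absent.
    instance = session.query(model).filter_by(**kwargs).first()
    if not instance:
        instance = model(**kwargs)
        session.add(instance)
    return instance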
Example #2
    def run(self) -> None:
        i = dbutil.num_rows_in(self.db_path, "ContentFiles")

        if not log.is_verbose():
            bar = progressbar.ProgressBar(max_value=self.max_i)
            bar.update(self.progress())

        try:
            while True:
                sample_time = time()
                sample = self.queue.get(timeout=60)

                kernels = clutil.get_cl_kernels(sample)
                ids = [crypto.sha1_str(k) for k in kernels]

                if self.sampler_opts["static_checker"]:
                    preprocess_opts = {
                        "use_shim": False,
                        "use_gpuverify": self.sampler_opts["gpuverify"]
                    }
                    pp = [clgen.preprocess_for_db(k, **preprocess_opts)
                          for k in kernels]

                db = dbutil.connect(self.db_path)
                c = db.cursor()

                # insert raw samples
                for kid, src in zip(ids, kernels):
                    dbutil.sql_insert_dict(c, "ContentFiles",
                                           {"id": kid, "contents": src},
                                           ignore_existing=True)

                # insert preprocessed samples
                if self.sampler_opts["static_checker"]:
                    for kid, (status, src) in zip(ids, pp):
                        dbutil.sql_insert_dict(c, "PreprocessedFiles", {
                            "id": kid, "status": status, "contents": src
                        }, ignore_existing=True)

                c.close()
                db.commit()
                db.close()

                # update progress bar
                progress = self.progress()
                if not log.is_verbose():
                    bar.update(progress)

                sample_time = time() - sample_time
                self.sampler.stats["progress"] = progress
                self.sampler.stats["time"] += sample_time
                self.sampler._flush_meta(self.cache)

                # determine if we are done sampling
                if self.term_condition():
                    self.producer.stop()
                    return
        finally:  # always kill the sampler thread
            print()
            self.producer.stop()
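Because each kernel's id is its content hash, re-inserting a previously seen kernel is a no-op. A hypothetical sketch of what ignore_existing=True plausibly expands to in SQLite (the real dbutil.sql_insert_dict presumably builds the statement from the dict's keys):

def sql_insert_dict(cursor, table, data, ignore_existing=False):
    verb = "INSERT OR IGNORE" if ignore_existing else "INSERT"
    columns = ", ".join(data.keys())
    placeholders = ", ".join("?" for _ in data)
    cursor.execute(f"{verb} INTO {table} ({columns}) VALUES ({placeholders})",
                   tuple(data.values()))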
Example #3
 def _get_unreachable(session: session_t, lines: Iterable[str]) -> Union[
   None, Unreachable]:
   for line in lines:
     if "unreachable" in line.lower():
       unreachable = get_or_add(
           session, Unreachable,
           sha1=crypto.sha1_str(line), unreachable=line)
       return unreachable
Example #4
File: db.py Project: SpringRi/phd
 def __init__(self, generator: Generators.column_t, generation_time: float,
              src: str):
     self.generator = generator
     self.sha1 = crypto.sha1_str(src)
     self.date = datetime.datetime.utcnow()
     self.generation_time = generation_time
     self.linecount = len(src.split("\n"))
     self.charcount = len(src)
     self.src = src
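crypto.sha1_str itself is, in essence, a thin wrapper over hashlib. A minimal stand-in, assuming UTF-8 encoding (consult labm8's source for the authoritative version):

import hashlib

def sha1_str(string: str) -> str:
    # Hex-encoded SHA-1 digest of a string: a 40-character hex id.
    return hashlib.sha1(string.encode("utf-8")).hexdigest()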
Example #5
        def _hash(sampler_opts: dict, kernel_opts: dict) -> str:
            # we don't consider the number of samples in the ID
            sampler_opts = deepcopy(sampler_opts)
            del sampler_opts["min_samples"]
            del sampler_opts["min_kernels"]
            del sampler_opts["created"]

            checksum_data = sorted([str(x) for x in sampler_opts.values()] +
                                   [str(x) for x in kernel_opts.values()])
            string = "".join([str(x) for x in checksum_data])
            return crypto.sha1_str(string)
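Sorting the stringified option values makes the checksum independent of dict ordering, though note that sorting values alone (rather than key=value pairs) means two configs that swap values between keys would collide. An illustration reusing the _hash above, with hypothetical option names:

a = _hash({"min_samples": 1000, "min_kernels": 0, "created": "2018-01-01",
           "seed": 204}, {"temperature": 1.0})
b = _hash({"created": "2018-01-01", "seed": 204, "min_samples": 1000,
           "min_kernels": 0}, {"temperature": 1.0})
assert a == b  # same options in a different order, same ID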
Example #6
File: db.py Project: SpringRi/phd
    def from_str(session: session_t, string: str) -> 'Stdout':
        """
        Instantiate a Stdout object.
        """
        # Strip the noise
        string = Stdout._escape(string)

        stdout = get_or_add(session,
                            Stdout,
                            sha1=crypto.sha1_str(string),
                            stdout=string)
        return stdout
Example #7
File: db.py Project: SpringRi/phd
    def from_str(session: session_t, string: str) -> 'Stderr':
        string = Stderr._escape(string)
        sha1 = crypto.sha1_str(string)

        stderr = get_or_add(session,
                            Stderr,
                            sha1=sha1,
                            linecount=len(string.split("\n")),
                            charcount=len(string),
                            truncated=len(string) > Stderr.max_chars,
                            stderr=string[:Stderr.max_chars])
        return stderr
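Note the design: the sha1 is computed over the full, untruncated string, while only the first max_chars of text are stored. Deduplication therefore stays exact even when the stored text is clipped; for instance:

long_a = "x" * (Stderr.max_chars + 10) + "A"
long_b = "x" * (Stderr.max_chars + 10) + "B"
assert crypto.sha1_str(long_a) != crypto.sha1_str(long_b)      # distinct rows
assert long_a[:Stderr.max_chars] == long_b[:Stderr.max_chars]  # same stored text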
Example #8
    def cache(self, model: clgen.Model):
        """
        Return sampler cache.

        Parameters
        ----------
        model : clgen.Model
            CLgen model.

        Returns
        -------
        labm8.FSCache
            Cache.
        """
        sampler_model_hash = crypto.sha1_str(self.hash + model.hash)

        cache = clgen.mkcache("sampler", sampler_model_hash)

        # validate metadata against cache
        self.stats = {
            "time": 0,
            "progress": 0
        }
        meta = deepcopy(self.to_json())
        if cache.get("META"):
            cached_meta = jsonutil.read_file(cache["META"])

            if "stats" in cached_meta:
                self.stats = cached_meta["stats"]
                del cached_meta["stats"]

            if "created" in cached_meta["sampler"]:
                del cached_meta["sampler"]["created"]
            del meta["sampler"]["created"]

            if "min_samples" in cached_meta["sampler"]:
                del cached_meta["sampler"]["min_samples"]
            del meta["sampler"]["min_samples"]

            if "min_kernels" in cached_meta["sampler"]:
                del cached_meta["sampler"]["min_kernels"]
            del meta["sampler"]["min_kernels"]

            if meta != cached_meta:
                raise clgen.InternalError("sampler metadata mismatch")
        else:
            self._flush_meta(cache)

        return cache
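Hashing the concatenation of the sampler hash and the model hash yields one cache per (sampler, model) pair. A sketch with hypothetical objects:

key_a = crypto.sha1_str(sampler.hash + model_a.hash)
key_b = crypto.sha1_str(sampler.hash + model_b.hash)
assert key_a != key_b  # different models, independent caches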
Example #9
 def _get_stackdump(session: session_t, lines: Iterable[str]) -> Union[
   None, StackDump]:
   in_stackdump = False
   stackdump = []
   # Accumulate the numbered frame lines that follow a "Stack dump:" marker;
   # the first line that is empty or does not start with a digit ends the
   # dump (a dump that runs to the end of input is discarded).
   for line in lines:
     if in_stackdump:
       if line and line[0].isdigit():
         stackdump.append(line)
       else:
         stackdump_ = "\n".join(stackdump)
         stackdump = get_or_add(
             session, StackDump,
             sha1=crypto.sha1_str(stackdump_), stackdump=stackdump_)
         return stackdump
     elif "stack dump:" in line.lower():
       in_stackdump = True
Example #10
    def run(self) -> None:
        i = dbutil.num_rows_in(self.db_path, "ContentFiles")

        if not log.is_verbose():
            bar = progressbar.ProgressBar(max_value=self.max_i)
            bar.update(self.progress())

        try:
            while True:
                sample_time = time()

                # Block while waiting for a new sample to come in:
                sample = self.queue.get(timeout=120).strip()

                # Compute the sample ID:
                kid = crypto.sha1_str(sample)

                # Add the new sample to the database:
                db = dbutil.connect(self.db_path)
                c = db.cursor()
                dbutil.sql_insert_dict(c,
                                       "ContentFiles", {
                                           "id": kid,
                                           "contents": sample
                                       },
                                       ignore_existing=True)
                c.close()
                db.commit()
                db.close()

                # update progress bar
                progress = self.progress()
                if not log.is_verbose():
                    bar.update(progress)

                sample_time = time() - sample_time
                self.sampler.stats["progress"] = progress
                self.sampler.stats["time"] += sample_time
                self.sampler._flush_meta(self.cache)

                # determine if we are done sampling
                if self.term_condition():
                    self.producer.stop()
                    return
        finally:  # always kill the sampler thread
            print()
            self.producer.stop()
Example #11
  def from_str(session: session_t, string: str) -> 'Stderr':
    # Strip the noise:
    string = Stderr._escape(string)

    # Get metadata:
    lines = string.split('\n')
    assertion = Stderr._get_assertion(session, lines)
    if assertion:
      unreachable = None
      stackdump = None
    else:
      unreachable = Stderr._get_unreachable(session, lines)
      if unreachable:
        stackdump = None
      else:
        stackdump = Stderr._get_stackdump(session, lines)
    session.flush()

    # Sanity check:
    errs = sum(1 if x else 0 for x in [assertion, unreachable, stackdump])
    if errs > 1:
      logging.error("Stderr: " + string)
      if assertion:
        logging.error("Assertion: " + assertion.assertion)
      if unreachable:
        logging.error("Assertion: " + unreachable.unreachable)
      if stackdump:
        logging.error("Stackdump: " + stackdump.stackdump)
      raise LookupError(f"Multiple errors types found in stderr:\n\n" +
                        f"Assertion: {assertion}\n" +
                        f"Unreachable: {unreachable}\n" +
                        f"Stackdump: {stackdump}")

    stderr = get_or_add(
        session, Stderr,
        sha1=crypto.sha1_str(string),
        assertion=assertion,
        unreachable=unreachable,
        stackdump=stackdump,
        linecount=len(lines),
        charcount=len(string),
        truncated=len(string) > Stderr.max_chars,
        stderr=string[:Stderr.max_chars])
    return stderr
Example #12
def import_clgen_sample(session: session_t,
                        path: Path,
                        cl_launchable: bool = False,
                        harnesses: List[cldriveParams] = [],
                        delete: bool = False) -> None:
    src = fs.read_file(path)
    hash_ = crypto.sha1_str(src)

    dupe = session.query(CLgenProgram).filter(CLgenProgram.hash == hash_).first()

    if dupe:
        print(f"warning: ignoring duplicate file {path}")
    elif not len(src):
        print(f"warning: ignoring empty file {path}")
    else:
        program = CLgenProgram(hash=hash_,
                               runtime=len(src) / CLGEN_INFERENCE_CPS,
                               src=src,
                               linecount=len(src.split('\n')),
                               cl_launchable=cl_launchable)
        session.add(program)
        session.commit()

        # Make test harnesses, if required
        if harnesses:
            env = cldrive.make_env()
            for params in harnesses:
                testcase = get_or_create(session,
                                         CLgenTestCase,
                                         program_id=program.id,
                                         params_id=params.id)
                session.flush()
                clgen_mkharness.mkharness(session, env, testcase)

        if delete:
            fs.rm(path)
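The runtime field here is an estimate, not a measurement: the program is charged a cost proportional to its length, divided by an assumed constant characters-per-second inference rate. For illustration (the constant's value below is made up):

CLGEN_INFERENCE_CPS = 465  # hypothetical characters per second
src = "x" * 930
runtime = len(src) / CLGEN_INFERENCE_CPS  # == 2.0 seconds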
Example #13
def ResolveContentId(config: corpus_pb2.Corpus,
                     hc: typing.Optional[hashcache.HashCache] = None) -> str:
  """Compute the hash of the input contentfiles.

  This function resolves the unique sha1 checksum of a set of content files.

  Args:
    config: The corpus config proto.
    hc: A hashcache database instance, used for resolving directory hashes. If
      the corpus has pre_encoded_corpus_url field set, this may be omitted.

  Returns:
    A hex encoded sha1 string.
  """
  # We can take a massive shortcut if the content ID is already set in the
  # config proto.
  if config.HasField('content_id'):
    # TODO(github.com/ChrisCummins/phd/issues/46): Refactor this after splitting
    # out Corpus class.
    return config.content_id
  elif config.HasField('pre_encoded_corpus_url'):
    # TODO(github.com/ChrisCummins/phd/issues/46): Refactor this after splitting
    # out Corpus class.
    return crypto.sha1_str(config.pre_encoded_corpus_url)

  start_time = time.time()
  if config.HasField('local_directory'):
    local_directory = ExpandConfigPath(
        config.local_directory, path_prefix=FLAGS.clgen_local_path_prefix)

    # After the first time we compute the hash of a directory, we write it into
    # a file. This is a shortcut to work around the fact that computing the
    # directory checksum is O(n) with respect to the number of files in the
    # directory (even if the directory is already cached by the hash cache).
    # This means that it is the responsibility of the user to delete this cached
    # file if the directory is changed.
    hash_file_path = pathlib.Path(str(local_directory) + '.sha1.txt')
    if hash_file_path.is_file():
      app.Log(1, "Reading directory hash: '%s'.", hash_file_path)
      with open(hash_file_path) as f:
        content_id = f.read().rstrip()
    else:
      # No hash file, so compute the directory hash and create it.
      try:
        content_id = hc.GetHash(local_directory)
      except FileNotFoundError as e:
        raise errors.UserError(e)
      # Create the hash file in the directory so that next time we don't need
      # to reference the hash cache.
      with open(hash_file_path, 'w') as f:
        print(content_id, file=f)
      app.Log(1, "Wrote directory hash: '%s'.", hash_file_path)
  elif config.HasField('local_tar_archive'):
    # This is not an efficient means of getting the hash, as it requires always
    # unpacking the archive and reading the entire contents. It would be nicer
    # to maintain a cache which maps the mtime of tarballs to their content ID,
    # similar to how local_directory is implemented.
    content_id = GetHashOfArchiveContents(
        ExpandConfigPath(
            config.local_tar_archive,
            path_prefix=FLAGS.clgen_local_path_prefix))
  else:
    raise NotImplementedError('Unsupported Corpus.contentfiles field value')
  app.Log(2, 'Resolved Content ID %s in %s ms.', content_id,
          humanize.Commas(int((time.time() - start_time) * 1000)))
  return content_id
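For local_tar_archive, a hypothetical sketch of hashing an archive's contents deterministically (the project's GetHashOfArchiveContents is not shown here; this only illustrates the idea of folding member names and file bytes into one digest):

import hashlib
import tarfile

def get_hash_of_archive_contents(path: str) -> str:
    sha1 = hashlib.sha1()
    with tarfile.open(path) as tar:
        # Sort members by name so the digest is order-independent.
        for member in sorted(tar.getmembers(), key=lambda m: m.name):
            sha1.update(member.name.encode("utf-8"))
            if member.isfile():
                sha1.update(tar.extractfile(member).read())
    return sha1.hexdigest()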
Example #14
          runtime, status, stdout, stderr = drive_testcase(
              session, testcase, env, platform_id, device_id)

          # assert that executed params match expected
          if stderr != '<-- UTF-ERROR -->':
            verify_params(platform=args.platform, device=args.device,
                          optimizations=testcase.params.optimizations,
                          global_size=testcase.params.gsize,
                          local_size=testcase.params.lsize,
                          stderr=stderr)

          # create new result
          stdout_ = util.escape_stdout(stdout)
          stdout = get_or_create(
              session, CLgenStdout,
              hash=crypto.sha1_str(stdout_), stdout=stdout_)

          stderr_ = util.escape_stderr(stderr)
          stderr = get_or_create(
              session, CLgenStderr,
              hash=crypto.sha1_str(stderr_), stderr=stderr_)
          session.flush()

          result = CLgenResult(
              testbed_id=testbed.id,
              testcase_id=testcase.id,
              status=status,
              runtime=runtime,
              stdout_id=stdout.id,
              stderr_id=stderr.id,
              outcome=analyze.get_cldrive_outcome(status, runtime, stderr_))
Example #15
#!/usr/bin/env python3.6

import sys

from progressbar import ProgressBar

from labm8 import crypto
from labm8 import fs

if __name__ == "__main__":
    inpath = sys.argv[1]
    outdir = sys.argv[2]
    print(f"reading from {inpath} into {outdir}")

    assert fs.isfile(inpath)
    assert not fs.exists(outdir) or fs.isdir(outdir)
    fs.mkdir(outdir)

    with open(inpath) as infile:
        text = infile.read()

    kernels = text.split("// ==== START SAMPLE ====")
    kernels = [kernel.strip() for kernel in kernels if kernel.strip()]
    print(len(kernels), "kernels")

    sha1s = [crypto.sha1_str(kernel) for kernel in kernels]
    for kernel, sha1 in ProgressBar()(list(zip(kernels, sha1s))):
        with open(f"{outdir}/{sha1}.txt", "w") as outfile:
            print(kernel, file=outfile)
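Naming each output file after its content hash makes the output directory content-addressed: re-running the script over the same input rewrites the same files, and duplicate kernels collapse into one file. A hypothetical invocation, assuming the script is saved as split_samples.py:

$ python3 split_samples.py samples.txt kernels/
reading from samples.txt into kernels/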
Example #16
def fetch_repos(db_path: Path, indir: Path, lang: clgen.Language) -> None:
    db = dbutil.connect(db_path)

    if not dbutil.is_github(db):
        raise clgen.UserError("not a GitHub database")

    c = db.cursor()

    for directory in fs.ls(indir, abspaths=True):
        # hacky hardcoded interpretation of `git remote -v`
        gitdir = fs.path(directory, ".git")
        output = subprocess.check_output(
            ["git", "--git-dir", gitdir, "remote", "-v"],
            universal_newlines=True)
        url = output.split("\n")[0].split("\t")[1].split(" ")[0]
        name = fs.basename(directory)

        output = subprocess.check_output(
            f"git --git-dir {gitdir} rev-list --format=format:'%ai' " +
            f"--max-count=1 $(git --git-dir {gitdir} rev-parse HEAD) | tail -n1",
            shell=True,
            universal_newlines=True)
        try:
            updated_at = dateutil.parser.parse(output)
        except ValueError:
            log.error(f"failed to process {name} {url}")
            continue

        c.execute("SELECT updated_at FROM Repositories WHERE url=?", (url, ))
        cached_updated_at = c.fetchone()

        # Do nothing unless updated timestamps don't match
        # if cached_updated_at and cached_updated_at[0] >= updated_at:
        #     log.verbose(name, "already in database")
        #     continue

        c.execute("DELETE FROM Repositories WHERE url=?", (url, ))
        c.execute("INSERT INTO Repositories VALUES(?,?,?,?,?,?,?,?,?)",
                  (url, "<unknown>", name, 0, 0, 0, 0, updated_at, updated_at))

        name_str = " -o ".join(
            [f"-name '*{ext}'" for ext in clgen.file_extensions(lang)])
        output = subprocess.check_output(
            f"find {directory} -type f {name_str} | grep -v '.git/' || true",
            shell=True,
            universal_newlines=True)
        files = [x.strip() for x in output.split("\n") if x.strip()]

        # nothing to import
        if not len(files):
            # log.verbose("no files in", name)
            continue

        log.verbose("processing", len(files), "files in", name)
        for path in files:
            relpath = path[len(directory) + 1:]
            try:
                contents = inline_fs_headers(path, [], lang=lang)
                sha = crypto.sha1_str(contents)
                c.execute('INSERT OR IGNORE INTO ContentFiles VALUES(?,?)',
                          (sha, contents))
                c.execute(
                    "INSERT OR IGNORE INTO ContentMeta VALUES(?,?,?,?,?)",
                    (sha, relpath, url, sha, len(contents)))
            except UnicodeDecodeError:
                log.warning("non UTF-8 file", path)

        db.commit()
        c = db.cursor()
Example #17
def hash_key(key):
    """
    Convert a key to a filename by hashing its value.
    """
    return crypto.sha1_str(json.dumps(key, sort_keys=True))
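json.dumps(key, sort_keys=True) canonicalizes dict ordering before hashing, so logically-equal keys map to the same filename:

assert hash_key({"a": 1, "b": 2}) == hash_key({"b": 2, "a": 1})
# Caveat: keys must be JSON-serializable, and tuples serialize as lists,
# so hash_key((1, 2)) == hash_key([1, 2]).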
Example #18
                while True:
                    # get the next batch of programs to run
                    if not len(inbox):
                        next_batch()
                    # we have no programs to run
                    if not len(inbox):
                        break

                    # get next program to run
                    program = inbox.popleft()

                    status, runtime, stderr_ = build_with_clang(program, clang)

                    # create new result
                    hash_ = crypto.sha1_str(stderr_)
                    q = s.query(tables.clang_stderrs.id) \
                      .filter(tables.clang_stderrs.hash == hash_) \
                      .first()

                    if q:
                        stderr_id = q[0]
                    else:
                        stderr_id = create_stderr(s, tables, stderr_).id

                    result = tables.clangs(program_id=program.id,
                                           clang=args.clang,
                                           status=status,
                                           runtime=runtime,
                                           stderr_id=stderr_id)
Example #19
def hash_key(key):
    """
  Convert a key to a filename by hashing its value.
  """
    return crypto.sha1_str(json.dumps(key, sort_keys=True))
Example #20
                runtime, status, stdout, stderr = cl_launcher(
                    program.src, platform_id, device_id, *flags)

                # assert that executed params match expected
                verify_params(platform=platform_name,
                              device=device_name,
                              optimizations=params.optimizations,
                              global_size=params.gsize,
                              local_size=params.lsize,
                              stderr=stderr)

                # create new result
                stdout_ = util.escape_stdout(stdout)
                stdout = get_or_create(session,
                                       CLSmithStdout,
                                       hash=crypto.sha1_str(stdout_),
                                       stdout=stdout_)

                stderr_ = util.escape_stderr(stderr)
                stderr = get_or_create(session,
                                       CLSmithStderr,
                                       hash=crypto.sha1_str(stderr_),
                                       stderr=stderr_)
                session.flush()

                result = CLSmithResult(testbed_id=testbed.id,
                                       testcase_id=testcase.id,
                                       status=status,
                                       runtime=runtime,
                                       stdout_id=stdout.id,
                                       stderr_id=stderr.id,
Example #21
    with Session(commit=False) as s:

        def flush():
            if args.commit:
                s.commit()
                while len(to_del):
                    fs.rm(to_del.popleft())

        print("Importing CLgen programs ...")
        paths = [p for p in Path("export/clgen/program").iterdir()]
        for i, path in enumerate(ProgressBar()(paths)):
            with open(path) as infile:
                data = json.loads(infile.read())

            new_id = s.query(CLgenProgram.id) \
              .filter(CLgenProgram.hash == crypto.sha1_str(data["src"])).scalar()

            idx = CLgenProgramTranslation(old_id=data["id"], new_id=new_id)
            s.add(idx)

            to_del.append(path)
            if i and not i % 1000:
                flush()
        flush()

        PROGRAMS = dict((old_id, new_id) for old_id, new_id in s.query(
            CLgenProgramTranslation.old_id,
            CLgenProgramTranslation.new_id).all())

        print("Import CLgen results ...")
        paths = [p for p in Path("export/clgen/result").iterdir()]