Example #1
def _ProcessInputs(
  bytecode_db: bytecode_database.Database, bytecode_ids: typing.List[int]
) -> typing.List[graph_database.GraphMeta]:
  """Process a set of bytecodes.

  Returns:
    A list of analysis-annotated graphs.
  """
  with bytecode_db.Session() as session:
    jobs = (
      session.query(
        bytecode_database.LlvmBytecode.id,
        bytecode_database.LlvmBytecode.bytecode,
        bytecode_database.LlvmBytecode.source_name,
        bytecode_database.LlvmBytecode.relpath,
        bytecode_database.LlvmBytecode.language,
      )
      .filter(bytecode_database.LlvmBytecode.id.in_(bytecode_ids))
      .all()
    )
  bytecode_db.Close()  # Don't leave the database connection lying around.

  builder = graph_builder.ProGraMLGraphBuilder()

  graph_metas = []

  for bytecode_id, bytecode, source_name, relpath, language in jobs:
    # Haskell uses an older version of LLVM which emits incompatible bytecode,
    # so Haskell inputs must be processed with the matching older `opt`;
    # for everything else the default version is fine.
    opt = "opt-3.5" if language == "haskell" else None

    try:
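      # NOTE: `graph` is assigned inside the context, but the lambda is only
      # evaluated when the profiling context exits, by which point it is bound.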
      with prof.Profile(
        lambda t: f"Constructed {graph.number_of_nodes()}-node CDFG"
      ):
        graph = builder.Build(bytecode, opt=opt)
      graph.bytecode_id = bytecode_id
      graph.source_name = source_name
      graph.relpath = relpath
      graph.language = language
      graph_metas.append(
        graph_database.GraphMeta.CreateWithNetworkXGraph(graph)
      )
    except Exception as e:
      _, _, tb = sys.exc_info()
      tb = traceback.extract_tb(tb, 2)
      filename, line_number, function_name, *_ = tb[-1]
      filename = pathlib.Path(filename).name
      app.Error(
        "Failed to annotate bytecode with id " "%d: %s (%s:%s:%s() -> %s)",
        bytecode_id,
        e,
        filename,
        line_number,
        function_name,
        type(e).__name__,
      )
  return graph_metas
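
One plausible driver for _ProcessInputs is sketched below, under stated
assumptions: the ID list is split into fixed-size chunks which are farmed out
to worker processes. The AnnotateAll name, the chunk size, and the premise
that the Database handle survives pickling into workers are all illustrative;
a real driver might instead pass a database URL and reconnect in each worker.

import functools
import multiprocessing
import typing


def AnnotateAll(
  bytecode_db: bytecode_database.Database,
  bytecode_ids: typing.List[int],
  chunk_size: int = 32,
) -> typing.Iterator[graph_database.GraphMeta]:
  """Hypothetical driver: annotate all bytecodes using a pool of workers."""
  chunks = [
    bytecode_ids[i : i + chunk_size]
    for i in range(0, len(bytecode_ids), chunk_size)
  ]
  # Assumes bytecode_db is picklable; otherwise pass its URL and reconnect.
  worker = functools.partial(_ProcessInputs, bytecode_db)
  with multiprocessing.Pool() as pool:
    for graph_metas in pool.imap_unordered(worker, chunks):
      yield from graph_metas
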
Example #2
def ImportProtos(
    db: database.Database,
    bytecode_protos: typing.Iterable[ml4pl_pb2.LlvmBytecode],
) -> None:
    """Import bytecode protobufs to the database."""
    for chunk in labtypes.Chunkify(bytecode_protos, 256):
        with db.Session(commit=True) as s:
            bytecodes = [
                database.LlvmBytecode(**database.LlvmBytecode.FromProto(proto))
                for proto in chunk
            ]
            s.add_all(bytecodes)
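
labtypes.Chunkify is an internal helper that is not shown here. Judging from
its use above, it is assumed to yield successive lists of at most chunk_size
items from an iterable; a minimal sketch of that behavior:

import typing

T = typing.TypeVar("T")


def Chunkify(
    iterable: typing.Iterable[T], chunk_size: int
) -> typing.Iterator[typing.List[T]]:
    """Yield successive lists of at most `chunk_size` items from `iterable`."""
    chunk = []
    for item in iterable:
        chunk.append(item)
        if len(chunk) == chunk_size:
            yield chunk
            chunk = []
    if chunk:  # Emit any final partial chunk.
        yield chunk
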
Example #3
  def PopulateBytecodeTable(
    self, db: bytecode_database.Database, commit_every: int = 1000
  ):
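    """Process self.all_srcs into bytecodes and add them to `db`."""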
    bar = progressbar.ProgressBar()
    bar.max_value = len(self.all_srcs)

    # Process each row of the table in parallel.
    pool = multiprocessing.Pool()
    with db.Session(commit=True) as s:
      for i, proto in enumerate(
        pool.imap_unordered(ProcessLinuxSrcToBytecode, self.all_srcs)
      ):
        bar.update(i)
        s.add(
          bytecode_database.LlvmBytecode(
            **bytecode_database.LlvmBytecode.FromProto(proto)
          )
        )
        if not (i % commit_every):
          s.commit()
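
The shape of this method is: compute in worker processes via imap_unordered,
commit on the main process every commit_every results, and let the
commit=True session flush whatever remains on exit. This bounds both the work
lost to a crash and the size of any one transaction. A self-contained sketch
of the same pattern, with a toy worker standing in for
ProcessLinuxSrcToBytecode:

import multiprocessing


def _Square(x: int) -> int:
  """Toy stand-in for the real per-source worker."""
  return x * x


def Demo(commit_every: int = 100) -> list:
  committed, pending = [], []
  with multiprocessing.Pool() as pool:
    for i, result in enumerate(pool.imap_unordered(_Square, range(1000))):
      pending.append(result)
      if (i + 1) % commit_every == 0:
        committed.extend(pending)  # Stands in for session.commit().
        pending.clear()
  committed.extend(pending)  # Flush the final partial batch.
  return committed


if __name__ == "__main__":
  assert len(Demo()) == 1000
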
Example #4
def PopulateBytecodeTable(
  cf: contentfiles.ContentFiles,
  language: str,
  db: bytecode_database.Database,
  pool: typing.Optional[multiprocessing.pool.Pool] = None,
):
  # Only one process at a time can run this method.
  mutex = lockfile.AutoLockFile(granularity="function")

  # We use the database URL as the name of the source.
  source_name = cf.url

  # Read source files from the contentfiles database, process them into
  # bytecodes and, if successful, write them into the bytecode database. Files
  # are processed in order of their numeric ID in the contentfiles database so
  # that, if interrupted, the job can resume from the last-processed file.
  with db.Session() as s:
    # Get the ID of the last-processed bytecode file to resume from.
    resume_from = int(
      (
        s.query(bytecode_database.LlvmBytecode.relpath)
        .filter(bytecode_database.LlvmBytecode.source_name == cf.url)
        .filter(bytecode_database.LlvmBytecode.language == language)
        # Note the cast to integer: relpath is a string column, so ordering by
        # it natively would compare lexicographically (e.g. '9' > '10').
        .order_by(
          sql.cast(bytecode_database.LlvmBytecode.relpath, sql.Integer).desc()
        )
        .limit(1)
        .first()
        or (0,)
      )[0]
    )

  with mutex, cf.Session() as cf_s, sqlutil.BufferedDatabaseWriter(
    db, max_buffer_length=10
  ) as writer:
    # Get the ID of the last contentfile to process.
    n = (
      cf_s.query(contentfiles.ContentFile.id)
      .join(contentfiles.GitHubRepository)
      .filter(contentfiles.GitHubRepository.language == language)
      .order_by(contentfiles.ContentFile.id.desc())
      .limit(1)
      .one_or_none()
      or (0,)
    )[0]
    app.Log(
      1,
      "Starting at row %s / %s",
      humanize.Commas(resume_from),
      humanize.Commas(n),
    )

    # A query to return the <id,text> tuples of files to process.
    q = (
      cf_s.query(contentfiles.ContentFile.id, contentfiles.ContentFile.text)
      .filter(contentfiles.ContentFile.id > resume_from)
      .join(contentfiles.GitHubRepository)
      .filter(contentfiles.GitHubRepository.language == language)
      .order_by(contentfiles.ContentFile.id)
    )

    row_batches = sqlutil.OffsetLimitBatchedQuery(
      q, batch_size=FLAGS.batch_size
    )

    for i, batch in zip(range(resume_from, n + 1), row_batches):
      app.Log(
        1,
        "Processing batch of %d contentfiles -> bytecodes, %s / %s (%.1f%%)",
        FLAGS.batch_size,
        humanize.Commas(i),
        humanize.Commas(n),
        (i / n) * 100,
      )
      protos = GetBytecodesFromContentFiles(source_name, language, batch.rows)
      writer.AddMany(
        [
          bytecode_database.LlvmBytecode(
            **bytecode_database.LlvmBytecode.FromProto(proto)
          )
          for proto in protos
        ]
      )
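
sqlutil.OffsetLimitBatchedQuery is another internal helper. So far as this
snippet relies on it, it is assumed to wrap a SQLAlchemy query in successive
OFFSET / LIMIT windows and yield batch objects exposing a .rows attribute. A
minimal sketch of that assumed behavior:

import typing


class Batch(typing.NamedTuple):
  offset: int
  rows: typing.List[typing.Any]


def OffsetLimitBatchedQuery(
  query, batch_size: int = 1000
) -> typing.Iterator[Batch]:
  """Yield successive batches of rows from `query` via OFFSET / LIMIT.

  Assumes `query` has a stable ORDER BY, as the query above does.
  """
  offset = 0
  while True:
    rows = query.offset(offset).limit(batch_size).all()
    if not rows:
      break
    yield Batch(offset=offset, rows=rows)
    offset += len(rows)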