def _ProcessInputs(
    bytecode_db: bytecode_database.Database, bytecode_ids: typing.List[int]
) -> typing.List[graph_database.GraphMeta]:
  """Process a set of bytecodes.

  Returns:
    A list of analysis-annotated graphs.
  """
  with bytecode_db.Session() as session:
    jobs = (
        session.query(
            bytecode_database.LlvmBytecode.id,
            bytecode_database.LlvmBytecode.bytecode,
            bytecode_database.LlvmBytecode.source_name,
            bytecode_database.LlvmBytecode.relpath,
            bytecode_database.LlvmBytecode.language,
        )
        .filter(bytecode_database.LlvmBytecode.id.in_(bytecode_ids))
        .all()
    )
  bytecode_db.Close()  # Don't leave the database connection lying around.

  builder = graph_builder.ProGraMLGraphBuilder()

  graph_metas = []
  for bytecode_id, bytecode, source_name, relpath, language in jobs:
    # Haskell uses an older version of LLVM which emits incompatible bytecode.
    # When processing Haskell code we must use the older version of opt. Else,
    # the default version is fine.
    opt = "opt-3.5" if language == "haskell" else None
    try:
      # NOTE: The lambda is only evaluated when the profiled block exits, by
      # which point `graph` has been bound by the Build() call inside it.
      with prof.Profile(
          lambda t: f"Constructed {graph.number_of_nodes()}-node CDFG"
      ):
        graph = builder.Build(bytecode, opt=opt)
      graph.bytecode_id = bytecode_id
      graph.source_name = source_name
      graph.relpath = relpath
      graph.language = language
      graph_metas.append(graph_database.GraphMeta.CreateWithNetworkXGraph(graph))
    except Exception as e:
      _, _, tb = sys.exc_info()
      tb = traceback.extract_tb(tb, 2)
      filename, line_number, function_name, *_ = tb[-1]
      filename = pathlib.Path(filename).name
      app.Error(
          "Failed to annotate bytecode with id %d: %s (%s:%s:%s() -> %s)",
          bytecode_id,
          e,
          filename,
          line_number,
          function_name,
          type(e).__name__,
      )
  return graph_metas
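# A minimal, runnable sketch of how a driver loop might shard bytecode IDs
# across worker processes, in the spirit of _ProcessInputs. Everything here
# (Chunkify, ProcessChunk) is an illustrative stand-in, not the project's
# actual driver or database plumbing.
import multiprocessing
import typing


def Chunkify(
    items: typing.List[int], size: int
) -> typing.List[typing.List[int]]:
  """Split a list into consecutive chunks of at most `size` elements."""
  return [items[i : i + size] for i in range(0, len(items), size)]


def ProcessChunk(bytecode_ids: typing.List[int]) -> typing.List[str]:
  """Stand-in for a _ProcessInputs-style worker: one result per input ID."""
  return [f"graph-for-bytecode-{i}" for i in bytecode_ids]


if __name__ == "__main__":
  ids = list(range(100))
  with multiprocessing.Pool() as pool:
    # Results arrive in completion order; that is fine here because each
    # result already records which bytecode it came from.
    graphs = []
    for chunk_result in pool.imap_unordered(ProcessChunk, Chunkify(ids, 16)):
      graphs.extend(chunk_result)
  print(len(graphs))  # -> 100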
def ImportProtos(
    db: database.Database,
    bytecode_protos: typing.Iterable[ml4pl_pb2.LlvmBytecode],
) -> None:
  """Import bytecode protobufs to the database."""
  for chunk in labtypes.Chunkify(bytecode_protos, 256):
    with db.Session(commit=True) as s:
      bytecodes = [
          database.LlvmBytecode(**database.LlvmBytecode.FromProto(proto))
          for proto in chunk
      ]
      s.add_all(bytecodes)
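# The chunked-session pattern above bounds transaction size: each chunk of
# 256 protos is committed in its own transaction, so a mid-import failure
# loses at most one chunk. A self-contained sketch of the same pattern using
# sqlite3 instead of the project's SQLAlchemy wrappers; the table and helper
# names here are assumptions for illustration only.
import sqlite3
import typing


def ChunkifyIterable(rows: typing.Iterable[tuple], size: int):
  """Yield consecutive chunks of at most `size` items from an iterable."""
  chunk = []
  for row in rows:
    chunk.append(row)
    if len(chunk) == size:
      yield chunk
      chunk = []
  if chunk:
    yield chunk


def ImportRows(conn: sqlite3.Connection, rows: typing.Iterable[tuple]) -> None:
  conn.execute("CREATE TABLE IF NOT EXISTS bytecodes (id INTEGER, text TEXT)")
  for chunk in ChunkifyIterable(rows, 256):
    with conn:  # One transaction per chunk, committed on scope exit.
      conn.executemany("INSERT INTO bytecodes VALUES (?, ?)", chunk)


if __name__ == "__main__":
  conn = sqlite3.connect(":memory:")
  ImportRows(conn, ((i, f"bytecode {i}") for i in range(1000)))
  print(conn.execute("SELECT COUNT(*) FROM bytecodes").fetchone()[0])  # 1000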
def PopulateBytecodeTable(
    self, db: bytecode_database.Database, commit_every: int = 1000
):
  """Process source files into bytecodes in parallel and populate the table."""
  bar = progressbar.ProgressBar()
  bar.max_value = len(self.all_srcs)

  # Process each row of the table in parallel.
  pool = multiprocessing.Pool()
  with db.Session(commit=True) as s:
    for i, proto in enumerate(
        pool.imap_unordered(ProcessLinuxSrcToBytecode, self.all_srcs)
    ):
      bar.update(i)
      s.add(
          bytecode_database.LlvmBytecode(
              **bytecode_database.LlvmBytecode.FromProto(proto)
          )
      )
      # Commit periodically so that a crash loses at most commit_every rows.
      if not (i % commit_every):
        s.commit()
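# A runnable sketch of the periodic-commit loop above, with the database
# replaced by plain lists so the control flow can be tested in isolation.
# Square and ConsumeWithPeriodicCommit are hypothetical names; only the
# imap_unordered + "commit every N results" shape mirrors the code above.
import multiprocessing


def Square(x: int) -> int:
  return x * x


def ConsumeWithPeriodicCommit(srcs, commit_every: int = 1000):
  committed, pending = [], []
  with multiprocessing.Pool() as pool:
    for i, result in enumerate(pool.imap_unordered(Square, srcs)):
      pending.append(result)
      # As in the loop above, `not (i % commit_every)` fires at i == 0 and
      # at every commit_every-th result thereafter, bounding the work lost
      # if the process dies between commits.
      if not (i % commit_every):
        committed.extend(pending)
        pending.clear()
  committed.extend(pending)  # Flush the trailing partial batch.
  return committed


if __name__ == "__main__":
  print(len(ConsumeWithPeriodicCommit(range(2500))))  # -> 2500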
def PopulateBytecodeTable(
    cf: contentfiles.ContentFiles,
    language: str,
    db: bytecode_database.Database,
    pool: typing.Optional[multiprocessing.Pool] = None,
):
  """Process contentfiles into bytecodes and write them to the database."""
  # Only one process at a time can run this method.
  mutex = lockfile.AutoLockFile(granularity="function")

  # We use the database URL as the name of the source.
  source_name = cf.url

  # Read source files from the contentfiles database, process them into
  # bytecodes, and, if successful, write them into the database. We process
  # files sorted by their numeric ID in the contentfiles database, so that if
  # interrupted, we can resume from the last-processed file.
  with db.Session() as s:
    # Get the ID of the last-processed bytecode file to resume from.
    resume_from = int(
        (
            s.query(bytecode_database.LlvmBytecode.relpath)
            .filter(bytecode_database.LlvmBytecode.source_name == cf.url)
            .filter(bytecode_database.LlvmBytecode.language == language)
            # Note the cast to integer: relpath is a string column, and
            # sorting it in its native type would sort lexicographically
            # (e.g. '9' > '10').
            .order_by(
                sql.cast(
                    bytecode_database.LlvmBytecode.relpath, sql.Integer
                ).desc()
            )
            .limit(1)
            .first()
            or (0,)
        )[0]
    )

  with mutex, cf.Session() as cf_s, sqlutil.BufferedDatabaseWriter(
      db, max_buffer_length=10
  ) as writer:
    # Get the ID of the last contentfile to process.
    n = (
        cf_s.query(contentfiles.ContentFile.id)
        .join(contentfiles.GitHubRepository)
        .filter(contentfiles.GitHubRepository.language == language)
        .order_by(contentfiles.ContentFile.id.desc())
        .limit(1)
        .one_or_none()
        or (0,)
    )[0]
    app.Log(
        1,
        "Starting at row %s / %s",
        humanize.Commas(resume_from),
        humanize.Commas(n),
    )

    # A query to return the <id, text> tuples of files to process.
    q = (
        cf_s.query(contentfiles.ContentFile.id, contentfiles.ContentFile.text)
        .filter(contentfiles.ContentFile.id > resume_from)
        .join(contentfiles.GitHubRepository)
        .filter(contentfiles.GitHubRepository.language == language)
        .order_by(contentfiles.ContentFile.id)
    )

    row_batches = sqlutil.OffsetLimitBatchedQuery(q, batch_size=FLAGS.batch_size)

    for i, batch in zip(range(resume_from, n + 1), row_batches):
      app.Log(
          1,
          "Processing batch of %d contentfiles -> bytecodes, %s / %s (%.1f%%)",
          FLAGS.batch_size,
          humanize.Commas(i),
          humanize.Commas(n),
          (i / n) * 100,
      )
      protos = GetBytecodesFromContentFiles(source_name, language, batch.rows)
      writer.AddMany(
          [
              bytecode_database.LlvmBytecode(
                  **bytecode_database.LlvmBytecode.FromProto(proto)
              )
              for proto in protos
          ]
      )
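# Why the resume query casts relpath to an integer: relpath is a string
# column, and string ordering is lexicographic, so '9' sorts after '10'. A
# self-contained demonstration with plain SQLAlchemy (1.4+) and an in-memory
# SQLite database; the Bytecode table here is invented for illustration and
# is not the project's schema.
import sqlalchemy as sql
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()


class Bytecode(Base):
  __tablename__ = "bytecodes"
  id = sql.Column(sql.Integer, primary_key=True)
  relpath = sql.Column(sql.String(64))


if __name__ == "__main__":
  engine = sql.create_engine("sqlite://")
  Base.metadata.create_all(engine)
  with Session(engine) as s:
    s.add_all(Bytecode(relpath=p) for p in ["2", "9", "10"])
    s.commit()
    lexicographic = (
        s.query(Bytecode.relpath).order_by(Bytecode.relpath.desc()).first()
    )
    numeric = (
        s.query(Bytecode.relpath)
        .order_by(sql.cast(Bytecode.relpath, sql.Integer).desc())
        .first()
    )
    print(lexicographic[0], numeric[0])  # -> 9 10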