def test_empty_analysis(
    proto_db: unlabelled_graph_database.Database,
    graph_db: graph_tuple_database.Database,
    order_by: str,
    n: int,
):
    """Check the "test_empty" analysis: one output graph per input, no nodes."""
    FLAGS.n = n

    generator = make_data_flow_analysis_dataset.DatasetGenerator(
        input_db=proto_db,
        analysis="test_empty",
        output_db=graph_db,
        order_by=order_by,
    )
    progress.Run(generator)

    # Aggregate expressions evaluated against the two databases below.
    output_rows = sql.func.count(graph_tuple_database.GraphTuple.id)
    input_rows = sql.func.count(unlabelled_graph_database.ProgramGraph.ir_id)
    total_nodes = sql.func.sum(graph_tuple_database.GraphTuple.node_count)

    with graph_db.Session() as session, proto_db.Session() as proto_session:
        output_graph_count = session.query(output_rows).scalar()
        input_graph_count = proto_session.query(input_rows).scalar()
        assert output_graph_count == input_graph_count

        # Every generated graph must be empty, so the node counts sum to zero.
        node_total = session.query(total_nodes).scalar()
        assert node_total == 0
def test_timeout_analysis(
    proto_db_10: unlabelled_graph_database.Database,
    graph_db: graph_tuple_database.Database,
    order_by: str,
    n: int,
):
    """Check the "test_timeout" analysis: one node-less graph for each input."""
    FLAGS.n = n
    FLAGS.annotator_timeout = 1

    generator = make_data_flow_analysis_dataset.DatasetGenerator(
        input_db=proto_db_10,
        analysis="test_timeout",
        output_db=graph_db,
        order_by=order_by,
    )
    progress.Run(generator)

    output_rows = sql.func.count(graph_tuple_database.GraphTuple.id)
    input_rows = sql.func.count(unlabelled_graph_database.ProgramGraph.ir_id)
    total_nodes = sql.func.sum(graph_tuple_database.GraphTuple.node_count)

    with graph_db.Session() as session, proto_db_10.Session() as proto_session:
        # The output database holds exactly as many graphs as the input.
        produced = session.query(output_rows).scalar()
        expected = proto_session.query(input_rows).scalar()
        assert produced == expected

        # Every generated graph must be empty, so the node counts sum to zero.
        node_total = session.query(total_nodes).scalar()
        assert node_total == 0
def test_flaky_analysis(
    proto_db: unlabelled_graph_database.Database,
    graph_db: graph_tuple_database.Database,
    order_by: str,
    n: int,
):
    """Check the "test_flaky" analysis still yields some non-empty graphs."""
    FLAGS.n = n

    generator = make_data_flow_analysis_dataset.DatasetGenerator(
        input_db=proto_db,
        analysis="test_flaky",
        output_db=graph_db,
        order_by=order_by,
    )
    progress.Run(generator)

    output_rows = sql.func.count(graph_tuple_database.GraphTuple.id)
    input_rows = sql.func.count(unlabelled_graph_database.ProgramGraph.ir_id)
    total_nodes = sql.func.sum(graph_tuple_database.GraphTuple.node_count)

    with graph_db.Session() as session, proto_db.Session() as proto_session:
        # At least one output graph exists per input graph.
        produced = session.query(output_rows).scalar()
        available = proto_session.query(input_rows).scalar()
        assert produced >= available

        # A truthy node total means at least one graph is not empty.
        node_total = session.query(total_nodes).scalar()
        assert node_total
def two_graph_db_session(
    db: unlabelled_graph_database.Database,
) -> unlabelled_graph_database.Database.SessionType:
    """Yield a session on `db` holding two random program graphs.

    The graphs are created with ir_id 1 and 2, added and committed, and the
    row counts of both tables are verified before the session is yielded.
    """
    graphs = [
        unlabelled_graph_database.ProgramGraph.Create(
            proto=random_programl_generator.CreateRandomProto(), ir_id=graph_id
        )
        for graph_id in (1, 2)
    ]

    with db.Session() as session:
        session.add_all(graphs)
        session.commit()

        # Sanity check that both the graph rows and their data rows exist.
        for table in (
            unlabelled_graph_database.ProgramGraph,
            unlabelled_graph_database.ProgramGraphData,
        ):
            row_count = session.query(sql.func.count(table.ir_id)).scalar()
            assert row_count == 2

        yield session
def AnnotateGraphMetas(
    ir_db: ir_database.Database,
    proto_db: unlabelled_graph_database.Database,
    df: pd.DataFrame,
    ctx: progress.ProgressContext = progress.NullContext,
) -> Iterable[graph_tuple_database.GraphTuple]:
    """Yield one labelled graph tuple for every row of `df`.

    For each row, the IR with the row's relpath is looked up in `ir_db`, the
    matching program graph is loaded from `proto_db`, and the row's features
    and labels are written into the proto before it is converted to a tuple.

    Raises:
        ValueError: If no IR, or no program graph, is found for a row.
    """
    ir_table = ir_database.IntermediateRepresentation
    graph_table = unlabelled_graph_database.ProgramGraph

    with ir_db.Session() as ir_session, proto_db.Session() as proto_session:
        for _, row in df.iterrows():
            relpath = row["relpath"]
            with ctx.Profile(
                2, f"Processed graph {row['relpath']}:{row['data:dataset_name']}"
            ):
                # Find the IR that this row of the devmap dataset maps to.
                ir_query = ir_session.query(ir_table.id).filter(
                    ir_table.source == "pact17_opencl_devmap",
                    ir_table.relpath == relpath,
                )
                ir_id = ir_query.scalar()
                # The devmap dataset must map 1:1 onto the IR table.
                if ir_id is None:
                    raise ValueError(f"Expected one IR with relpath {relpath}")

                # Fetch the program graph together with its data row.
                graph_query = (
                    proto_session.query(graph_table)
                    .filter(graph_table.ir_id == ir_id)
                    .options(sql.orm.joinedload(graph_table.data))
                )
                proto_row = graph_query.scalar()
                if proto_row is None:
                    raise ValueError(
                        f"Expected one proto for relpath {relpath} with ID {ir_id}"
                    )
                proto: programl_pb2.ProgramGraph = proto_row.proto

                # Give every node the null "selector vector" value.
                for node in proto.node:
                    node.x.append(0)

                # Graph-level features, then the 'y' target.
                proto.x[:] = [row["wgsize"], row["transfer"]]
                proto.y[:] = row["y"].tolist()

                # There is no direct proto -> graph_tuple conversion, so go
                # proto -> networkx -> graph tuple.
                nx_graph = programl.ProgramGraphToNetworkX(proto)
                graph_tuple = graph_tuple_database.GraphTuple.CreateFromGraphTuple(
                    graph_tuple=graph_tuples.GraphTuple.CreateFromNetworkX(
                        nx_graph
                    ),
                    ir_id=ir_id,
                )
            yield graph_tuple
def BatchedProtoReader(
    proto_db: unlabelled_graph_database.Database,
    ids_and_sizes_to_do: List[Tuple[int, int]],
    batch_size_in_bytes: int,
    order_by: str,
    ctx: progress.ProgressBarContext,
) -> Iterable[List[ProgramGraphProto]]:
    """Read the requested program graphs from the database in batches.

    The (ir_id, size) pairs are sorted by ID and greedily grouped: graphs are
    added to a batch until the sum of their sizes reaches
    `batch_size_in_bytes` or the list is exhausted. Every batch contains at
    least one graph, so a zero or negative byte budget degrades to one graph
    per batch instead of producing empty batches forever.

    Args:
        proto_db: The database to read program graphs from.
        ids_and_sizes_to_do: (ir_id, serialized size) pairs to read.
        batch_size_in_bytes: The size budget of a single batch.
        order_by: Either "in_order" (range query on the IR id) or "random"
            (set-membership query on the batch's IDs).
        ctx: Progress context used to profile each database read.

    Yields:
        One list of ProgramGraphProto per batch.

    Raises:
        app.UsageError: If `order_by` is not a known order. This is raised
            when the first batch is read, so never for an empty input.
    """
    ids_and_sizes_to_do = sorted(ids_and_sizes_to_do, key=lambda x: x[0])
    total = len(ids_and_sizes_to_do)

    i = 0
    while i < total:
        end_i = i
        batch_size = 0
        # Always consume at least one graph before testing the budget,
        # otherwise a non-positive budget would leave end_i == i and the outer
        # loop would never advance.
        while True:
            batch_size += ids_and_sizes_to_do[end_i][1]
            end_i += 1
            if end_i >= total or batch_size >= batch_size_in_bytes:
                # Either we ran out of graphs or the batch is full.
                break

        with proto_db.Session() as session:
            with ctx.Profile(
                2,
                f"[reader] Read {humanize.BinaryPrefix(batch_size, 'B')} "
                f"batch of {end_i - i} unlabelled graphs",
            ):
                graphs = session.query(
                    unlabelled_graph_database.ProgramGraph
                ).options(
                    sql.orm.joinedload(unlabelled_graph_database.ProgramGraph.data)
                )
                if order_by == "in_order":
                    # For in-order reading, do fast range checks on the IR id.
                    start_id = ids_and_sizes_to_do[i][0]
                    end_id = ids_and_sizes_to_do[end_i - 1][0]
                    graphs = graphs.filter(
                        unlabelled_graph_database.ProgramGraph.ir_id >= start_id,
                        unlabelled_graph_database.ProgramGraph.ir_id <= end_id,
                    )
                elif order_by == "random":
                    # For random order, do set lookups on each ID in the batch.
                    batch_ids = [x[0] for x in ids_and_sizes_to_do[i:end_i]]
                    graphs = graphs.filter(
                        unlabelled_graph_database.ProgramGraph.ir_id.in_(
                            batch_ids
                        ),
                    )
                else:
                    raise app.UsageError(f"Unknown order: {order_by}")

                graphs = graphs.all()
            yield [
                ProgramGraphProto(
                    ir_id=graph.ir_id,
                    serialized_proto=graph.data.serialized_proto,
                )
                for graph in graphs
            ]

        i = end_i
def test_fuzz_ProgramGraph_Create(db: unlabelled_graph_database.Database):
    """Fuzz ProgramGraph.Create() with a random proto and an optional split.

    Each call bumps the module-level `ir_id` counter so that the row added to
    the database gets a fresh key.
    """
    global ir_id
    ir_id += 1

    with db.Session(commit=True) as session:
        proto = random_programl_generator.CreateRandomProto()
        # About half of the rows get a split in [0, 10]; the rest get none.
        if random.random() < 0.5:
            split = random.randint(0, 10)
        else:
            split = None
        graph = unlabelled_graph_database.ProgramGraph.Create(
            proto=proto,
            ir_id=ir_id,
            split=split,
        )
        session.add(graph)
def PopulateDatabaseWithTestSet(
    db: unlabelled_graph_database.Database, graph_count: Optional[int] = None
):
    """Populate a database with "real" programs.

    Args:
        db: The database to add program graphs to.
        graph_count: The number of graphs to add. If this exceeds the number
            of protos produced by EnumerateTestSet(), the protos are repeated
            cyclically. If None, each proto produced by
            EnumerateTestSet(n=None) is added exactly once (presumably the
            whole test set; confirm against EnumerateTestSet).

    Returns:
        The `db` that was passed in.
    """
    inputs = random_programl_generator.EnumerateTestSet(n=graph_count)
    if graph_count is not None:
        # Only cycle when there is a finite target: islice(..., None) over a
        # cycle() of a non-empty test set would never terminate.
        inputs = itertools.islice(itertools.cycle(inputs), graph_count)

    with db.Session(commit=True) as session:
        session.add_all(
            [
                unlabelled_graph_database.ProgramGraph.Create(proto, ir_id=i + 1)
                for i, proto in enumerate(inputs)
            ]
        )
    return db
def test_pass_thru_analysis(
    proto_db: unlabelled_graph_database.Database,
    graph_db: graph_tuple_database.Database,
    order_by: str,
    n: int,
):
    """Check the "test_pass_thru" analysis emits n graphs for each input proto."""
    FLAGS.n = n

    generator = make_data_flow_analysis_dataset.DatasetGenerator(
        input_db=proto_db,
        analysis="test_pass_thru",
        output_db=graph_db,
        order_by=order_by,
    )
    progress.Run(generator)

    tuple_table = graph_tuple_database.GraphTuple
    proto_table = unlabelled_graph_database.ProgramGraph

    with graph_db.Session() as session, proto_db.Session() as proto_session:
        # The output holds n graphs for every input proto.
        generated = session.query(sql.func.count(tuple_table.id)).scalar()
        proto_count = proto_session.query(
            sql.func.count(proto_table.ir_id)
        ).scalar()
        assert generated == n * proto_count

        # Every unique proto is represented in the graph database.
        output_ids = {row.ir_id for row in session.query(tuple_table.ir_id).all()}
        input_ids = {row.ir_id for row in proto_session.query(proto_table.ir_id)}
        assert output_ids == input_ids

        # The total node count is n times that of the input protos.
        output_nodes = session.query(sql.func.sum(tuple_table.node_count)).scalar()
        input_nodes = proto_session.query(
            sql.func.sum(proto_table.node_count)
        ).scalar()
        assert output_nodes == n * input_nodes
def test_PopulateDatabaseWithRandomProgramGraphs(
    db: unlabelled_graph_database.Database,
    proto_count: int,
    node_x_dimensionality: int,
    node_y_dimensionality: int,
    graph_x_dimensionality: int,
    graph_y_dimensionality: int,
    split_count: int,
):
    """Test populating databases.

    Checks that the requested number of graphs is stored and that the minimum
    of each of the four dimensionality columns equals the requested value.
    """
    random_unlabelled_graph_database_generator.PopulateDatabaseWithRandomProgramGraphs(
        db=db,
        proto_count=proto_count,
        node_x_dimensionality=node_x_dimensionality,
        node_y_dimensionality=node_y_dimensionality,
        graph_x_dimensionality=graph_x_dimensionality,
        graph_y_dimensionality=graph_y_dimensionality,
        split_count=split_count,
    )

    graph_table = unlabelled_graph_database.ProgramGraph
    # One (column, expected value) pair per dimensionality. Driving the checks
    # from this table keeps all four columns covered; previously the graph 'y'
    # column was checked twice and the graph 'x' column not at all.
    expected_dimensionalities = [
        (graph_table.node_x_dimensionality, node_x_dimensionality),
        (graph_table.node_y_dimensionality, node_y_dimensionality),
        (graph_table.graph_x_dimensionality, graph_x_dimensionality),
        (graph_table.graph_y_dimensionality, graph_y_dimensionality),
    ]

    with db.Session() as session:
        assert (
            session.query(sql.func.count(graph_table.ir_id)).scalar()
            == proto_count
        )
        for column, expected in expected_dimensionalities:
            assert session.query(sql.func.min(column)).scalar() == expected
def PopulateDatabaseWithRandomProgramGraphs(
    db: unlabelled_graph_database.Database,
    proto_count: int,
    node_x_dimensionality: int = 1,
    node_y_dimensionality: int = 0,
    graph_x_dimensionality: int = 0,
    graph_y_dimensionality: int = 0,
    split_count: int = 0,
    random_proto_pool_size: int = 0,
) -> DatabaseAndRows:
    """Fill `db` with `proto_count` randomly generated program graphs.

    A pool of random graphs is generated first; the rows are deep copies of
    randomly chosen pool members. A falsy `random_proto_pool_size` falls back
    to FLAGS.random_proto_pool_size, capped at 128.

    Returns:
        A DatabaseAndRows of `db` and the generated rows.
    """
    if not random_proto_pool_size:
        random_proto_pool_size = min(FLAGS.random_proto_pool_size, 128)

    graph_pool = []
    for _ in range(random_proto_pool_size):
        graph_pool.append(
            CreateRandomProgramGraph(
                node_x_dimensionality=node_x_dimensionality,
                node_y_dimensionality=node_y_dimensionality,
                graph_x_dimensionality=graph_x_dimensionality,
                graph_y_dimensionality=graph_y_dimensionality,
                split_count=split_count,
            )
        )

    # Draw every row from the pool, copying so that rows never share state.
    rows = []
    for _ in range(proto_count):
        rows.append(copy.deepcopy(random.choice(graph_pool)))

    # Give each row a unique key and checksum, numbering from one.
    for key, row in enumerate(rows, start=1):
        row.ir_id = key
        row.data.sha1 = str(key) * 40

    # The session receives its own copies; the returned rows are separate
    # objects from the ones added to the session.
    with db.Session(commit=True) as session:
        session.add_all([copy.deepcopy(row) for row in rows])

    return DatabaseAndRows(db, rows)
def __init__(
    self,
    input_db: unlabelled_graph_database.Database,
    analysis: str,
    output_db: graph_tuple_database.Database,
    order_by: str = "in_order",
    max_instances: int = 0,
):
    """Select the graphs still to be processed and start the batched reader.

    Args:
        input_db: The database of unlabelled program graphs to read from.
        analysis: The name of the analysis to run. Must be in
            annotate.ANALYSES.
        output_db: The database of graph tuples that results are written to.
            Graphs whose IR id is already present there are not reprocessed.
        order_by: Either "in_order" or "random".
        max_instances: If non-zero, limit the number of graphs to process.

    Raises:
        app.UsageError: If the analysis or the order is unknown.
        OSError: If, with no instance limit, the number of graphs to do plus
            the number already done does not equal the total graph count.
    """
    self.analysis = analysis
    self.output_db = output_db

    # Check that the requested analysis exists.
    if analysis not in annotate.ANALYSES:
        raise app.UsageError(
            f"Unknown analysis: {analysis}. "
            f"Available analyses: {annotate.AVAILABLE_ANALYSES}",
        )

    with input_db.Session() as in_session, output_db.Session() as out_session:
        # Get the graphs that have already been processed.
        already_done_max, already_done_count = out_session.query(
            sql.func.max(graph_tuple_database.GraphTuple.ir_id),
            sql.func.count(
                sql.func.distinct(graph_tuple_database.GraphTuple.ir_id)
            ),
        ).one()
        # max() is None for an empty table; -1 selects every ID in the
        # "ir_id > already_done_max" filter below.
        already_done_max = already_done_max or -1

        # Get the total number of graphs, including those that have already
        # been processed.
        total_graph_count = in_session.query(
            sql.func.count(unlabelled_graph_database.ProgramGraph.ir_id)
        ).scalar()

        # Get the IDs and serialized sizes of the graphs to process.
        ids_and_sizes_to_do = in_session.query(
            unlabelled_graph_database.ProgramGraph.ir_id,
            unlabelled_graph_database.ProgramGraph.serialized_proto_size,
        )
        if order_by == "in_order":
            ids_and_sizes_to_do = ids_and_sizes_to_do.filter(
                unlabelled_graph_database.ProgramGraph.ir_id > already_done_max
            ).order_by(unlabelled_graph_database.ProgramGraph.ir_id)
        elif order_by == "random":
            # Filter out the graphs that have already been processed.
            if already_done_count:
                already_done_ids = {
                    row.ir_id
                    for row in out_session.query(
                        graph_tuple_database.GraphTuple.ir_id
                    )
                }
                # The set of distinct IDs must agree with the COUNT(DISTINCT)
                # read above. (This used to compare the set itself to the
                # integer with !=, which is always true.)
                assert len(already_done_ids) == already_done_count
                ids_and_sizes_to_do = ids_and_sizes_to_do.filter(
                    ~unlabelled_graph_database.ProgramGraph.ir_id.in_(
                        already_done_ids
                    )
                )
            # Order the graphs to do randomly.
            ids_and_sizes_to_do = ids_and_sizes_to_do.order_by(input_db.Random())
        else:
            raise app.UsageError(f"Unknown order: {order_by}")

        # Optionally limit the number of IDs to process.
        if max_instances:
            ids_and_sizes_to_do = ids_and_sizes_to_do.limit(max_instances)

        ids_and_sizes_to_do = [
            (row.ir_id, row.serialized_proto_size) for row in ids_and_sizes_to_do
        ]

    # Sanity check. Without an instance limit, the work to do and the work
    # already done must add up to the whole input.
    if not max_instances:
        if len(ids_and_sizes_to_do) + already_done_count != total_graph_count:
            # Format the message here: exceptions do not interpolate
            # logging-style "%s" arguments.
            raise OSError(
                f"ids_to_do({len(ids_and_sizes_to_do)}) + "
                f"already_done({already_done_count}) != "
                f"total_rows({total_graph_count})"
            )

    # NOTE(review): this adds an unlabelled_graph_database.Meta row through a
    # session on output_db, which is annotated as a graph tuple database.
    # Confirm that this is the intended Meta class.
    with output_db.Session(commit=True) as out_session:
        out_session.add(
            unlabelled_graph_database.Meta.Create(
                key="Graph counts", value=(already_done_count, total_graph_count)
            )
        )
    app.Log(
        1,
        "Selected %s of %s to process",
        humanize.Commas(len(ids_and_sizes_to_do)),
        humanize.Plural(total_graph_count, "unlabelled graph"),
    )

    super(DatasetGenerator, self).__init__(
        name=analysis, i=already_done_count, n=total_graph_count, unit="protos"
    )

    # Read batches of protos through a ThreadedIterator with a bounded queue.
    # self.ctx is presumably provided by the base class initializer above.
    self.graph_reader = ppar.ThreadedIterator(
        BatchedProtoReader(
            input_db,
            ids_and_sizes_to_do,
            FLAGS.proto_batch_mb * 1024 * 1024,
            order_by,
            self.ctx.ToProgressContext(),
        ),
        max_queue_size=FLAGS.max_reader_queue_size,
    )