def SelectRandomGraphs(graph_db: graph_tuple_database.Database):
  """Return between 1 and graph_db.graph_count graphs in a random order."""
  with graph_db.Session() as session:
    # Load a random collection of graphs.
    graphs = (
      session.query(graph_tuple_database.GraphTuple)
      .order_by(graph_db.Random())
      .limit(random.randint(1, graph_db.graph_count))
      .all()
    )
    # Sanity check that graphs are returned.
    assert graphs
    return graphs
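# A minimal usage sketch for SelectRandomGraphs. The sqlite URL below is a
# placeholder, not part of the original code; any populated
# graph_tuple_database.Database works.
def _example_select_random_graphs():
  db = graph_tuple_database.Database("sqlite:////tmp/graphs.db")
  graphs = SelectRandomGraphs(db)
  # SelectRandomGraphs returns between 1 and graph_count graphs.
  assert 1 <= len(graphs) <= db.graph_count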
def db_10000(
  empty_graph_db: graph_tuple_database.Database,
) -> graph_tuple_database.Database:
  """Fixture which returns a database with 10000 + 2 graph tuples, where 2 of
  the graph tuples are empty.

  For the current implementation of CreateRandomGraphTuple(), a database of
  5000 graphs is ~14MB of data.
  """
  # Generate some random graph tuples.
  graph_pool = [
    random_graph_tuple_database_generator.CreateRandomGraphTuple()
    for _ in range(128)
  ]
  # Generate a full list of graphs by randomly selecting from the graph pool.
  random_graph_tuples: List[graph_tuple_database.GraphTuple] = [
    copy.deepcopy(random.choice(graph_pool)) for _ in range(10000)
  ]
  # Index the random graphs by ir_id.
  for i, t in enumerate(random_graph_tuples):
    t.ir_id = i
    t.data_flow_steps = i

  with empty_graph_db.Session(commit=True) as s:
    s.add_all(random_graph_tuples)
    # Create the empty graph tuples. These should be ignored by the graph
    # reader.
    s.add_all(
      [
        graph_tuple_database.GraphTuple.CreateEmpty(0),
        graph_tuple_database.GraphTuple.CreateEmpty(0),
      ]
    )
  return empty_graph_db
def test_empty_analysis(
  proto_db: unlabelled_graph_database.Database,
  graph_db: graph_tuple_database.Database,
  order_by: str,
  n: int,
):
  """Test that 'empty' graphs are produced when analysis returns no results."""
  FLAGS.n = n
  progress.Run(
    make_data_flow_analysis_dataset.DatasetGenerator(
      input_db=proto_db,
      analysis="test_empty",
      output_db=graph_db,
      order_by=order_by,
    )
  )
  with graph_db.Session() as session, proto_db.Session() as proto_session:
    output_graph_count = session.query(
      sql.func.count(graph_tuple_database.GraphTuple.id)
    ).scalar()
    input_graph_count = proto_session.query(
      sql.func.count(unlabelled_graph_database.ProgramGraph.ir_id)
    ).scalar()
    assert output_graph_count == input_graph_count
    # All graphs are empty.
    assert (
      session.query(
        sql.func.sum(graph_tuple_database.GraphTuple.node_count)
      ).scalar()
      == 0
    )
def test_timeout_analysis(
  proto_db_10: unlabelled_graph_database.Database,
  graph_db: graph_tuple_database.Database,
  order_by: str,
  n: int,
):
  """Test that the timeout annotator produces one 'empty' graph per input."""
  FLAGS.n = n
  FLAGS.annotator_timeout = 1
  progress.Run(
    make_data_flow_analysis_dataset.DatasetGenerator(
      input_db=proto_db_10,
      analysis="test_timeout",
      output_db=graph_db,
      order_by=order_by,
    )
  )
  with graph_db.Session() as session, proto_db_10.Session() as proto_session:
    assert (
      session.query(
        sql.func.count(graph_tuple_database.GraphTuple.id)
      ).scalar()
      == proto_session.query(
        sql.func.count(unlabelled_graph_database.ProgramGraph.ir_id)
      ).scalar()
    )
    # All graphs are empty.
    assert (
      session.query(
        sql.func.sum(graph_tuple_database.GraphTuple.node_count)
      ).scalar()
      == 0
    )
def test_flaky_analysis(
  proto_db: unlabelled_graph_database.Database,
  graph_db: graph_tuple_database.Database,
  order_by: str,
  n: int,
):
  """Test that the flaky annotator produces at least as many graphs as inputs,
  and that not all of them are empty."""
  FLAGS.n = n
  progress.Run(
    make_data_flow_analysis_dataset.DatasetGenerator(
      input_db=proto_db,
      analysis="test_flaky",
      output_db=graph_db,
      order_by=order_by,
    )
  )
  with graph_db.Session() as session, proto_db.Session() as proto_session:
    assert (
      session.query(
        sql.func.count(graph_tuple_database.GraphTuple.id)
      ).scalar()
      >= proto_session.query(
        sql.func.count(unlabelled_graph_database.ProgramGraph.ir_id)
      ).scalar()
    )
    # Not all graphs are empty.
    assert session.query(
      sql.func.sum(graph_tuple_database.GraphTuple.node_count)
    ).scalar()
def CopySplits(
  input_db: graph_tuple_database.Database,
  output_db: graph_tuple_database.Database,
):
  """Propagate the `split` column from one database to another."""
  # Unset splits on output database.
  with prof.Profile(f"Unset splits on {output_db.graph_count} graphs"):
    update = sql.update(graph_tuple_database.GraphTuple).values(split=None)
    output_db.engine.execute(update)

  # Copy each split one at a time.
  for split in input_db.splits:
    with prof.Profile(f"Copied split {split}"):
      with input_db.Session() as in_session:
        ids_to_set = [
          row.id
          for row in in_session.query(
            graph_tuple_database.GraphTuple.id
          ).filter(graph_tuple_database.GraphTuple.split == split)
        ]
      update = (
        sql.update(graph_tuple_database.GraphTuple)
        .where(graph_tuple_database.GraphTuple.id.in_(ids_to_set))
        .values(split=split)
      )
      output_db.engine.execute(update)
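# A minimal sketch of using CopySplits to mirror train/val/test assignments
# between two databases. The URLs are placeholders, and the databases are
# assumed to contain the same graphs, matched by id.
def _example_copy_splits():
  input_db = graph_tuple_database.Database("sqlite:////tmp/in.db")
  output_db = graph_tuple_database.Database("sqlite:////tmp/out.db")
  # Splits in output_db are first unset, then each split from input_db is
  # applied to the output rows with matching ids.
  CopySplits(input_db, output_db)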
def PopulateWithTestSet(
  db: graph_tuple_database.Database,
  graph_count: int,
  node_x_dimensionality: int = 2,
  node_y_dimensionality: int = 0,
  graph_x_dimensionality: int = 0,
  graph_y_dimensionality: int = 0,
  with_data_flow: bool = False,
  split_count: int = 0,
):
  """Populate a database with "real" programs."""
  rows = []
  graph_tuples = itertools.islice(
    itertools.cycle(
      random_graph_tuple_generator.EnumerateTestSet(n=graph_count)
    ),
    graph_count,
  )
  for i, graph_tuple in enumerate(graph_tuples):
    # Set the graph labels.
    node_x = (
      np.random.randint(
        low=0, high=2, size=(graph_tuple.node_count, node_x_dimensionality)
      )
      if node_x_dimensionality
      else None
    )
    node_y = (
      np.random.rand(graph_tuple.node_count, node_y_dimensionality)
      if node_y_dimensionality
      else None
    )
    graph_x = (
      np.random.randint(low=0, high=51, size=graph_x_dimensionality)
      if graph_x_dimensionality
      else None
    )
    graph_y = (
      np.random.rand(graph_tuple.graph_count, graph_y_dimensionality)
      if graph_y_dimensionality
      else None
    )
    graph_tuple = graph_tuple.SetFeaturesAndLabels(
      node_x=node_x, node_y=node_y, graph_x=graph_x, graph_y=graph_y, copy=False
    )
    mapped = graph_tuple_database.GraphTuple.CreateFromGraphTuple(
      graph_tuple,
      ir_id=i + 1,
      split=random.randint(0, split_count) if split_count else None,
    )
    if with_data_flow:
      mapped.data_flow_steps = random.randint(1, 50)
      mapped.data_flow_root_node = random.randint(0, mapped.node_count - 1)
      mapped.data_flow_positive_node_count = random.randint(
        1, mapped.node_count - 1
      )
    rows.append(mapped)

  with db.Session(commit=True) as session:
    session.add_all(rows)
  return DatabaseAndRows(db, rows)
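# A sketch of populating a database with labelled "real" programs from the
# test set. The in-memory sqlite URL and the .rows attribute on the returned
# DatabaseAndRows tuple are assumptions for illustration.
def _example_populate_with_test_set():
  db = graph_tuple_database.Database("sqlite://")
  db_and_rows = PopulateWithTestSet(db, graph_count=10, node_y_dimensionality=2)
  assert len(db_and_rows.rows) == 10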
def test_PopulateDatabaseWithRandomGraphTuples(
  db: graph_tuple_database.Database,
  graph_count: int,
  node_x_dimensionality: int,
  node_y_dimensionality: int,
  graph_x_dimensionality: int,
  graph_y_dimensionality: int,
  with_data_flow: bool,
  split_count: int,
):
  """Test populating databases."""
  random_graph_tuple_database_generator.PopulateDatabaseWithRandomGraphTuples(
    db=db,
    graph_count=graph_count,
    node_x_dimensionality=node_x_dimensionality,
    node_y_dimensionality=node_y_dimensionality,
    graph_x_dimensionality=graph_x_dimensionality,
    graph_y_dimensionality=graph_y_dimensionality,
    with_data_flow=with_data_flow,
    split_count=split_count,
  )
  with db.Session() as session:
    assert (
      session.query(
        sql.func.count(graph_tuple_database.GraphTuple.id)
      ).scalar()
      == graph_count
    )
    assert (
      session.query(
        sql.func.min(graph_tuple_database.GraphTuple.node_x_dimensionality)
      ).scalar()
      == node_x_dimensionality
    )
    assert (
      session.query(
        sql.func.min(graph_tuple_database.GraphTuple.node_y_dimensionality)
      ).scalar()
      == node_y_dimensionality
    )
    assert (
      session.query(
        sql.func.min(graph_tuple_database.GraphTuple.graph_x_dimensionality)
      ).scalar()
      == graph_x_dimensionality
    )
    assert (
      session.query(
        sql.func.min(graph_tuple_database.GraphTuple.graph_y_dimensionality)
      ).scalar()
      == graph_y_dimensionality
    )
def PopulateDatabaseWithRandomGraphTuples(
  db: graph_tuple_database.Database,
  graph_count: int,
  node_x_dimensionality: int = 1,
  node_y_dimensionality: int = 0,
  graph_x_dimensionality: int = 0,
  graph_y_dimensionality: int = 0,
  with_data_flow: bool = False,
  split_count: int = 0,
  random_graph_pool_size: int = 0,
) -> DatabaseAndRows:
  """Populate a database with random graph tuples."""
  random_graph_pool_size = random_graph_pool_size or min(
    FLAGS.random_graph_pool_size, 128
  )
  graph_pool = [
    CreateRandomGraphTuple(
      node_x_dimensionality=node_x_dimensionality,
      node_y_dimensionality=node_y_dimensionality,
      graph_x_dimensionality=graph_x_dimensionality,
      graph_y_dimensionality=graph_y_dimensionality,
      with_data_flow=with_data_flow,
      split_count=split_count,
    )
    for _ in range(random_graph_pool_size)
  ]

  # Generate a full list of graph rows by randomly selecting from the graph
  # pool.
  rows = [copy.deepcopy(random.choice(graph_pool)) for _ in range(graph_count)]

  with db.Session(commit=True) as session:
    session.add_all([copy.deepcopy(t) for t in rows])

  db.RefreshStats()
  return DatabaseAndRows(db, rows)
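# A sketch of generating a random database with train/val/test splits. The
# URL is a placeholder; the returned rows mirror what was committed, so they
# can be used for assertions without re-querying the database (the .rows
# attribute on DatabaseAndRows is an assumption).
def _example_populate_random_graph_tuples():
  db = graph_tuple_database.Database("sqlite:////tmp/random.db")
  db_and_rows = PopulateDatabaseWithRandomGraphTuples(
    db, graph_count=100, node_y_dimensionality=1, split_count=3
  )
  assert len(db_and_rows.rows) == 100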
def populated_proto_db(
  proto_db: unlabelled_graph_database.Database, opencl_relpaths: Set[str]
) -> unlabelled_graph_database.Database:
  """A test fixture which yields a proto database with 256 OpenCL IR entries."""
  rows = []
  # Create random rows using OpenCL relpaths.
  for i, relpath in enumerate(opencl_relpaths):
    proto = unlabelled_graph_database.ProgramGraph.Create(
      proto=random_programl_generator.CreateRandomProto(), ir_id=i + 1
    )
    proto.id = len(opencl_relpaths) - i
    rows.append(proto)

  with proto_db.Session(commit=True) as session:
    session.add_all(rows)

  return proto_db
def test_pass_thru_analysis(
  proto_db: unlabelled_graph_database.Database,
  graph_db: graph_tuple_database.Database,
  order_by: str,
  n: int,
):
  """Test that the pass-thru annotator produces n graphs per input proto."""
  FLAGS.n = n
  progress.Run(
    make_data_flow_analysis_dataset.DatasetGenerator(
      input_db=proto_db,
      analysis="test_pass_thru",
      output_db=graph_db,
      order_by=order_by,
    )
  )
  with graph_db.Session() as session, proto_db.Session() as proto_session:
    # Check that n * proto_count graphs were generated.
    assert (
      session.query(
        sql.func.count(graph_tuple_database.GraphTuple.id)
      ).scalar()
      == n
      * proto_session.query(
        sql.func.count(unlabelled_graph_database.ProgramGraph.ir_id)
      ).scalar()
    )
    # Check that every unique proto appears in the graph database.
    assert set(
      row.ir_id
      for row in session.query(graph_tuple_database.GraphTuple.ir_id).all()
    ) == set(
      row.ir_id
      for row in proto_session.query(
        unlabelled_graph_database.ProgramGraph.ir_id
      )
    )
    # Check the node counts of the generated graphs.
    assert (
      session.query(
        sql.func.sum(graph_tuple_database.GraphTuple.node_count)
      ).scalar()
      == n
      * proto_session.query(
        sql.func.sum(unlabelled_graph_database.ProgramGraph.node_count)
      ).scalar()
    )
def test_MakeOpenClDevmapDataset(
  populated_ir_db: ir_database.Database,
  populated_proto_db: unlabelled_graph_database.Database,
  graph_db: graph_tuple_database.Database,
  gpu: str,
):
  """Test that the expected number of graph tuples are generated."""
  job = make_devmap_dataset.MakeOpenClDevmapDataset(
    ir_db=populated_ir_db,
    proto_db=populated_proto_db,
    graph_db=graph_db,
    gpu=gpu,
  )
  progress.Run(job)
  with graph_db.Session() as session:
    assert (
      session.query(
        sql.func.count(graph_tuple_database.GraphTuple.id)
      ).scalar()
      >= 256
    )
    # Check that there are 2-D node features.
    assert (
      session.query(graph_tuple_database.GraphTuple.node_x_dimensionality)
      .first()
      .node_x_dimensionality
      == 2
    )
def __init__(
  self,
  input_db: unlabelled_graph_database.Database,
  analysis: str,
  output_db: graph_tuple_database.Database,
  order_by: str = "in_order",
  max_instances: int = 0,
):
  self.analysis = analysis
  self.output_db = output_db

  # Check that the requested analysis exists.
  if analysis not in annotate.ANALYSES:
    raise app.UsageError(
      f"Unknown analysis: {analysis}. "
      f"Available analyses: {annotate.AVAILABLE_ANALYSES}",
    )

  with input_db.Session() as in_session, output_db.Session() as out_session:
    # Get the graphs that have already been processed.
    already_done_max, already_done_count = out_session.query(
      sql.func.max(graph_tuple_database.GraphTuple.ir_id),
      sql.func.count(
        sql.func.distinct(graph_tuple_database.GraphTuple.ir_id)
      ),
    ).one()
    already_done_max = already_done_max or -1

    # Get the total number of graphs, including those that have already been
    # processed.
    total_graph_count = in_session.query(
      sql.func.count(unlabelled_graph_database.ProgramGraph.ir_id)
    ).scalar()

    # Get the IDs and sizes of the graphs to process.
    ids_and_sizes_to_do = in_session.query(
      unlabelled_graph_database.ProgramGraph.ir_id,
      unlabelled_graph_database.ProgramGraph.serialized_proto_size,
    )
    if order_by == "in_order":
      ids_and_sizes_to_do = ids_and_sizes_to_do.filter(
        unlabelled_graph_database.ProgramGraph.ir_id > already_done_max
      ).order_by(unlabelled_graph_database.ProgramGraph.ir_id)
    elif order_by == "random":
      # Filter out the graphs that have already been processed.
      if already_done_count:
        already_done_ids = {
          row.ir_id
          for row in out_session.query(graph_tuple_database.GraphTuple.ir_id)
        }
        assert len(already_done_ids) == already_done_count
        ids_and_sizes_to_do = ids_and_sizes_to_do.filter(
          ~unlabelled_graph_database.ProgramGraph.ir_id.in_(already_done_ids)
        )
      # Order the graphs to do randomly.
      ids_and_sizes_to_do = ids_and_sizes_to_do.order_by(input_db.Random())
    else:
      raise app.UsageError(f"Unknown order: {order_by}")

    # Optionally limit the number of IDs to process.
    if max_instances:
      ids_and_sizes_to_do = ids_and_sizes_to_do.limit(max_instances)
    ids_and_sizes_to_do = [
      (row.ir_id, row.serialized_proto_size) for row in ids_and_sizes_to_do
    ]

  # Sanity check.
  if not max_instances:
    if len(ids_and_sizes_to_do) + already_done_count != total_graph_count:
      raise OSError(
        f"ids_to_do({len(ids_and_sizes_to_do)}) + "
        f"already_done({already_done_count}) != "
        f"total_rows({total_graph_count})"
      )

  with output_db.Session(commit=True) as out_session:
    out_session.add(
      unlabelled_graph_database.Meta.Create(
        key="Graph counts", value=(already_done_count, total_graph_count)
      )
    )

  app.Log(
    1,
    "Selected %s of %s to process",
    humanize.Commas(len(ids_and_sizes_to_do)),
    humanize.Plural(total_graph_count, "unlabelled graph"),
  )

  super(DatasetGenerator, self).__init__(
    name=analysis, i=already_done_count, n=total_graph_count, unit="protos"
  )

  self.graph_reader = ppar.ThreadedIterator(
    BatchedProtoReader(
      input_db,
      ids_and_sizes_to_do,
      FLAGS.proto_batch_mb * 1024 * 1024,
      order_by,
      self.ctx.ToProgressContext(),
    ),
    max_queue_size=FLAGS.max_reader_queue_size,
  )
def __init__(
  self,
  db: graph_tuple_database.Database,
  buffer_size_mb: int = 16,
  filters: Optional[List[Callable[[], bool]]] = None,
  order: BufferedGraphReaderOrder = BufferedGraphReaderOrder.IN_ORDER,
  eager_graph_loading: bool = True,
  limit: Optional[int] = None,
  ctx: progress.ProgressContext = progress.NullContext,
):
  """Constructor.

  Args:
    db: The database to iterate over.
    filters: An optional list of callbacks, where each callback returns a
      filter condition on the GraphTuple table.
    order: Determine the order to read graphs. See BufferedGraphReaderOrder.
    eager_graph_loading: If true, load the contents of the Graph table
      eagerly, preventing the need for subsequent SQL queries to access the
      graph data.
    buffer_size_mb: The maximum size of the in-memory graph buffer, in
      megabytes. A larger buffer reduces the number of queries, but increases
      the memory requirement.
    limit: Limit the total number of rows returned to this value.

  Raises:
    ValueError: If the query with the given filters returns no results.
  """
  self.db = db
  self.order = order
  self.max_buffer_size = buffer_size_mb * 1024 * 1024
  self.eager_graph_loading = eager_graph_loading
  self.filters = filters or []
  self.ctx = ctx

  # Graphs that fail during dataset generation are inserted as zero-node
  # entries. Ignore those.
  self.filters.append(lambda: graph_tuple_database.GraphTuple.node_count > 1)

  if not self.db.graph_count:
    raise ValueError(f"Database contains no graphs: {self.db.url}")

  with ctx.Profile(
    3,
    lambda _: (
      f"Selected {humanize.Commas(self.n)} of "
      f"{humanize.Commas(self.db.graph_count)} graphs from database"
    ),
  ):
    with db.Session() as session:
      # Random ordering means that we can't use
      # labm8.py.sqlutil.OffsetLimitBatchedQuery() to read results as each
      # query will produce a different random order. Instead, first run a
      # query to read all of the IDs and the corresponding tuple sizes that
      # match the query, then iterate through the list of IDs.
      query = session.query(
        graph_tuple_database.GraphTuple.id,
        graph_tuple_database.GraphTuple.pickled_graph_tuple_size.label(
          "size"
        ),
      )

      # Apply the requested filters.
      for filter_cb in self.filters:
        query = query.filter(filter_cb())

      # Unless we are using a random order, we can scan through the graph
      # table using index range checks, so we need the IDs sorted.
      if order == BufferedGraphReaderOrder.DATA_FLOW_STEPS:
        self.ordered_ids = False
        query = query.order_by(
          graph_tuple_database.GraphTuple.data_flow_steps
        )
      elif order == BufferedGraphReaderOrder.GLOBAL_RANDOM:
        self.ordered_ids = False
        query = query.order_by(db.Random())
      else:
        self.ordered_ids = True
        query = query.order_by(graph_tuple_database.GraphTuple.id)

      # Read the full set of graph IDs and sizes.
      self.ids_and_sizes = [(row.id, row.size) for row in query.all()]

    if not self.ids_and_sizes:
      raise ValueError(
        f"Query on database `{db.url}` returned no results: "
        f"`{sqlutil.QueryToString(query)}`"
      )

    # When we are limiting the number of rows and not reading the table in
    # order, pick a random starting point in the list of IDs.
    if limit and order != BufferedGraphReaderOrder.IN_ORDER:
      batch_start = random.randint(
        0, max(len(self.ids_and_sizes) - limit - 1, 0)
      )
      self.ids_and_sizes = self.ids_and_sizes[
        batch_start : batch_start + limit
      ]
    elif limit:
      # If we are reading the table in order, we must still respect the
      # limit argument.
      self.ids_and_sizes = self.ids_and_sizes[:limit]

    self.i = 0
    self.n = len(self.ids_and_sizes)

  # The local buffer of graphs, and an index into that buffer.
  self.buffer: List[graph_tuple_database.GraphTuple] = []
  self.buffer_i = 0
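# A sketch of consuming this reader, assuming the enclosing class is named
# BufferedGraphReader and that instances are iterable, yielding GraphTuple
# rows (only the constructor is shown above, so both are assumptions).
def _example_read_graphs(db: graph_tuple_database.Database):
  reader = BufferedGraphReader(
    db, order=BufferedGraphReaderOrder.GLOBAL_RANDOM, limit=512
  )
  for graph_tuple in reader:
    # Zero-node "failure" rows were filtered out by the constructor.
    assert graph_tuple.node_count > 1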