Example No. 1
def SelectRandomGraphs(graph_db: graph_tuple_database.Database):
    """Return [1, graph_db.graph_count] graphs in a random order."""
    with graph_db.Session() as session:
        # Load a random collection of graphs.
        graphs = (session.query(
            graph_tuple_database.GraphTuple).order_by(graph_db.Random()).limit(
                random.randint(1, graph_db.graph_count)).all())
        # Sanity check that graphs are returned.
        assert graphs

    return graphs
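A minimal usage sketch for the helper above, assuming a pytest fixture named `graph_db` that yields a populated database (the fixture name is an assumption, not part of the example):

def test_SelectRandomGraphs_count(graph_db: graph_tuple_database.Database):
    # Sketch only: the helper promises between 1 and graph_db.graph_count
    # graphs, so the length check is the strongest session-free assertion.
    graphs = SelectRandomGraphs(graph_db)
    assert 1 <= len(graphs) <= graph_db.graph_count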
Example No. 2
def db_10000(
    empty_graph_db: graph_tuple_database.Database,
) -> graph_tuple_database.Database:
    """Fixture which returns a database with 5000 + 2 graph tuples, where 2 of the
  graph tuples are empty.

  For the current implementation of CreateRandomGraphTuple(), a database of
  5000 graphs is ~14MB of data.
  """
    # Generate some random graph tuples.
    graph_pool = [
        random_graph_tuple_database_generator.CreateRandomGraphTuple()
        for _ in range(128)
    ]

    # Generate a full list of graphs by randomly selecting from the graph pool.
    random_graph_tuples: List[graph_tuple_database.GraphTuple] = [
        copy.deepcopy(random.choice(graph_pool)) for _ in range(10000)
    ]
    # Index the random graphs by ir_id.
    for i, t in enumerate(random_graph_tuples):
        t.ir_id = i
        t.data_flow_steps = i

    with empty_graph_db.Session(commit=True) as s:
        s.add_all(random_graph_tuples)
        # Create the empty graph tuples. These should be ignored by the graph
        # reader.
        s.add_all([
            graph_tuple_database.GraphTuple.CreateEmpty(0),
            graph_tuple_database.GraphTuple.CreateEmpty(0),
        ])

    return empty_graph_db
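A hedged sketch of how such a fixture might be consumed, assuming the `BufferedGraphReader` from Example No. 14 is importable, implements the iterator protocol, and that `CreateRandomGraphTuple()` always yields graphs with more than one node:

def test_db_10000_reader_skips_empty_rows(
        db_10000: graph_tuple_database.Database):
    # The reader appends a node_count > 1 filter, so the two CreateEmpty(0)
    # rows should be excluded and only the 10000 random graphs returned.
    graphs = list(BufferedGraphReader(db_10000))
    assert len(graphs) == 10000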
Example No. 3
def test_empty_analysis(
    proto_db: unlabelled_graph_database.Database,
    graph_db: graph_tuple_database.Database,
    order_by: str,
    n: int,
):
    """Test that 'empty' graphs are produced when analysis returns no results."""
    FLAGS.n = n
    progress.Run(
        make_data_flow_analysis_dataset.DatasetGenerator(
            input_db=proto_db,
            analysis="test_empty",
            output_db=graph_db,
            order_by=order_by,
        ))
    with graph_db.Session() as session, proto_db.Session() as proto_session:
        output_graph_count = session.query(
            sql.func.count(graph_tuple_database.GraphTuple.id)).scalar()

        input_graph_count = proto_session.query(
            sql.func.count(
                unlabelled_graph_database.ProgramGraph.ir_id)).scalar()

        assert output_graph_count == input_graph_count

        # All graphs are empty.
        assert (session.query(
            sql.func.sum(
                graph_tuple_database.GraphTuple.node_count)).scalar() == 0)
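The `order_by` and `n` arguments to this and the neighboring tests are parametrized fixtures. A hedged sketch of what that parametrization might look like with stock pytest (the project may use its own fixture wrapper, and the param values here are illustrative):

import pytest

@pytest.fixture(scope="function", params=["in_order", "random"])
def order_by(request) -> str:
    return request.param

@pytest.fixture(scope="function", params=[1, 3])
def n(request) -> int:
    return request.param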
Example No. 4
def test_timeout_analysis(
    proto_db_10: unlabelled_graph_database.Database,
    graph_db: graph_tuple_database.Database,
    order_by: str,
    n: int,
):
    """Test that timeout annotator produces one 'empty' graph for each input."""
    FLAGS.n = n
    FLAGS.annotator_timeout = 1
    progress.Run(
        make_data_flow_analysis_dataset.DatasetGenerator(
            input_db=proto_db_10,
            analysis="test_timeout",
            output_db=graph_db,
            order_by=order_by,
        ))
    with graph_db.Session() as session, proto_db_10.Session() as proto_session:
        assert (session.query(
            sql.func.count(graph_tuple_database.GraphTuple.id)).scalar(
            ) == proto_session.query(
                sql.func.count(
                    unlabelled_graph_database.ProgramGraph.ir_id)).scalar())

        # All graphs are empty.
        assert (session.query(
            sql.func.sum(
                graph_tuple_database.GraphTuple.node_count)).scalar() == 0)
Example No. 5
def test_flaky_analysis(
    proto_db: unlabelled_graph_database.Database,
    graph_db: graph_tuple_database.Database,
    order_by: str,
    n: int,
):
    """Test that flaky annotator produces "some" graphs."""
    FLAGS.n = n
    progress.Run(
        make_data_flow_analysis_dataset.DatasetGenerator(
            input_db=proto_db,
            analysis="test_flaky",
            output_db=graph_db,
            order_by=order_by,
        ))
    with graph_db.Session() as session, proto_db.Session() as proto_session:
        assert (
            session.query(sql.func.count(
                graph_tuple_database.GraphTuple.id)).scalar() >=
            proto_session.query(
                sql.func.count(
                    unlabelled_graph_database.ProgramGraph.ir_id)).scalar())

        # Not all graphs are empty.
        assert session.query(
            sql.func.sum(graph_tuple_database.GraphTuple.node_count)).scalar()
Example No. 6
def CopySplits(
  input_db: graph_tuple_database.Database,
  output_db: graph_tuple_database.Database,
):
  """Propagate the `split` column from one database to another."""
  # Unset splits on output database.
  with prof.Profile(f"Unset splits on {output_db.graph_count} graphs"):
    update = sql.update(graph_tuple_database.GraphTuple).values(split=None)
    output_db.engine.execute(update)

  # Copy each split one at a time.
  for split in input_db.splits:
    with prof.Profile(f"Copied split {split}"):
      with input_db.Session() as in_session:
        ids_to_set = [
          row.id
          for row in in_session.query(
            graph_tuple_database.GraphTuple.id
          ).filter(graph_tuple_database.GraphTuple.split == split)
        ]

      update = (
        sql.update(graph_tuple_database.GraphTuple)
        .where(graph_tuple_database.GraphTuple.id.in_(ids_to_set))
        .values(split=split)
      )
      output_db.engine.execute(update)
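For large splits, the single `IN (...)` clause above may exceed the database's statement or parameter limits. A hedged variant that issues the update in fixed-size chunks (`CopySplitsChunked` and `chunk_size` are illustrative additions, not part of the project):

def CopySplitsChunked(
  input_db: graph_tuple_database.Database,
  output_db: graph_tuple_database.Database,
  chunk_size: int = 10000,
):
  """Sketch: as CopySplits(), but bounding the size of each UPDATE statement."""
  output_db.engine.execute(
    sql.update(graph_tuple_database.GraphTuple).values(split=None))
  for split in input_db.splits:
    with input_db.Session() as in_session:
      ids_to_set = [
        row.id
        for row in in_session.query(
          graph_tuple_database.GraphTuple.id
        ).filter(graph_tuple_database.GraphTuple.split == split)
      ]
    # One UPDATE per chunk keeps every IN (...) list at most chunk_size long.
    for start in range(0, len(ids_to_set), chunk_size):
      chunk = ids_to_set[start:start + chunk_size]
      output_db.engine.execute(
        sql.update(graph_tuple_database.GraphTuple)
        .where(graph_tuple_database.GraphTuple.id.in_(chunk))
        .values(split=split))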
Example No. 7
def PopulateWithTestSet(
    db: graph_tuple_database.Database,
    graph_count: int,
    node_x_dimensionality: int = 2,
    node_y_dimensionality: int = 0,
    graph_x_dimensionality: int = 0,
    graph_y_dimensionality: int = 0,
    with_data_flow: bool = False,
    split_count: int = 0,
):
    """Populate a database with "real" programs."""
    rows = []
    graph_tuples = itertools.islice(
        itertools.cycle(
            random_graph_tuple_generator.EnumerateTestSet(n=graph_count)),
        graph_count,
    )
    for i, graph_tuple in enumerate(graph_tuples):
        # Set the graph labels.
        node_x = (np.random.randint(low=0,
                                    high=2,
                                    size=(graph_tuple.node_count,
                                          node_x_dimensionality))
                  if node_x_dimensionality else None)
        node_y = (np.random.rand(graph_tuple.node_count, node_y_dimensionality)
                  if node_y_dimensionality else None)
        graph_x = (np.random.randint(
            low=0, high=51, size=graph_x_dimensionality)
                   if graph_x_dimensionality else None)
        graph_y = (np.random.rand(graph_tuple.graph_count,
                                  graph_y_dimensionality)
                   if graph_y_dimensionality else None)
        graph_tuple = graph_tuple.SetFeaturesAndLabels(node_x=node_x,
                                                       node_y=node_y,
                                                       graph_x=graph_x,
                                                       graph_y=graph_y,
                                                       copy=False)

        mapped = graph_tuple_database.GraphTuple.CreateFromGraphTuple(
            graph_tuple,
            ir_id=i + 1,
            split=random.randint(0, split_count) if split_count else None,
        )

        if with_data_flow:
            mapped.data_flow_steps = random.randint(1, 50)
            mapped.data_flow_root_node = random.randint(
                0, mapped.node_count - 1)
            mapped.data_flow_positive_node_count = random.randint(
                1, mapped.node_count - 1)

        rows.append(mapped)

    with db.Session(commit=True) as session:
        session.add_all(rows)

    return DatabaseAndRows(db, rows)
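A brief usage sketch, assuming `DatabaseAndRows` unpacks as a `(database, rows)` pair and `db` is an otherwise empty database:

database, rows = PopulateWithTestSet(db, graph_count=100)
assert len(rows) == 100
# ir_id values were assigned sequentially from 1 in the loop above.
assert [r.ir_id for r in rows] == list(range(1, 101))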
Example No. 8
def test_PopulateDatabaseWithRandomGraphTuples(
  db: graph_tuple_database.Database,
  graph_count: int,
  node_x_dimensionality: int,
  node_y_dimensionality: int,
  graph_x_dimensionality: int,
  graph_y_dimensionality: int,
  with_data_flow: bool,
  split_count: int,
):
  """Test populating databases."""
  random_graph_tuple_database_generator.PopulateDatabaseWithRandomGraphTuples(
    db=db,
    graph_count=graph_count,
    node_x_dimensionality=node_x_dimensionality,
    node_y_dimensionality=node_y_dimensionality,
    graph_x_dimensionality=graph_x_dimensionality,
    graph_y_dimensionality=graph_y_dimensionality,
    with_data_flow=with_data_flow,
    split_count=split_count,
  )
  with db.Session() as session:
    assert (
      session.query(sql.func.count(graph_tuple_database.GraphTuple.id)).scalar()
      == graph_count
    )

    assert (
      session.query(
        sql.func.min(graph_tuple_database.GraphTuple.node_x_dimensionality)
      ).scalar()
      == node_x_dimensionality
    )

    assert (
      session.query(
        sql.func.min(graph_tuple_database.GraphTuple.node_y_dimensionality)
      ).scalar()
      == node_y_dimensionality
    )

  assert (
    session.query(
      sql.func.min(graph_tuple_database.GraphTuple.graph_x_dimensionality)
    ).scalar()
    == graph_x_dimensionality
  )

    assert (
      session.query(
        sql.func.min(graph_tuple_database.GraphTuple.graph_y_dimensionality)
      ).scalar()
      == graph_y_dimensionality
    )
Example No. 9
def PopulateDatabaseWithRandomGraphTuples(
    db: graph_tuple_database.Database,
    graph_count: int,
    node_x_dimensionality: int = 1,
    node_y_dimensionality: int = 0,
    graph_x_dimensionality: int = 0,
    graph_y_dimensionality: int = 0,
    with_data_flow: bool = False,
    split_count: int = 0,
    random_graph_pool_size: int = 0,
) -> DatabaseAndRows:
    """Populate a database of random graph tuples."""
    random_graph_pool_size = random_graph_pool_size or min(
        FLAGS.random_graph_pool_size, 128)

    graph_pool = [
        CreateRandomGraphTuple(
            node_x_dimensionality=node_x_dimensionality,
            node_y_dimensionality=node_y_dimensionality,
            graph_x_dimensionality=graph_x_dimensionality,
            graph_y_dimensionality=graph_y_dimensionality,
            with_data_flow=with_data_flow,
            split_count=split_count,
        ) for _ in range(random_graph_pool_size)
    ]

    # Generate a full list of graph rows by randomly selecting from the graph
    # pool.
    rows = [
        copy.deepcopy(random.choice(graph_pool)) for _ in range(graph_count)
    ]

    with db.Session(commit=True) as session:
        session.add_all([copy.deepcopy(t) for t in rows])

    db.RefreshStats()

    return DatabaseAndRows(db, rows)
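Note that the rows handed to `session.add_all()` are themselves deep copies, which leaves the returned `rows` list detached from the session and safe to inspect after commit. A short usage sketch under that assumption:

database, rows = PopulateDatabaseWithRandomGraphTuples(db, graph_count=50)
# The returned rows were never attached to a session, so reading their
# attributes requires no lazy loads.
assert len(rows) == 50
assert database.graph_count == 50  # Refreshed by RefreshStats() above.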
Example No. 10
def populated_proto_db(
        proto_db: unlabelled_graph_database.Database,
        opencl_relpaths: Set[str]) -> unlabelled_graph_database.Database:
    """A test fixture which yields a graph database with 256 OpenCL IR entries."""
    rows = []
    # Create random rows using OpenCL relpaths.
    for i, relpath in enumerate(opencl_relpaths):
        proto = unlabelled_graph_database.ProgramGraph.Create(
            proto=random_programl_generator.CreateRandomProto(), ir_id=i + 1)
        proto.id = len(opencl_relpaths) - i
        rows.append(proto)

    with proto_db.Session(commit=True) as session:
        session.add_all(rows)

    return proto_db
Example No. 11
def test_pass_thru_analysis(
  proto_db: unlabelled_graph_database.Database,
  graph_db: graph_tuple_database.Database,
  order_by: str,
  n: int,
):
  """Test that pass-thru annotator produces n * protos graphs."""
  FLAGS.n = n
  progress.Run(
    make_data_flow_analysis_dataset.DatasetGenerator(
      input_db=proto_db,
      analysis="test_pass_thru",
      output_db=graph_db,
      order_by=order_by,
    )
  )
  with graph_db.Session() as session, proto_db.Session() as proto_session:
    # Check that n * proto_count graphs were generated.
    assert (
      session.query(sql.func.count(graph_tuple_database.GraphTuple.id)).scalar()
      == n
      * proto_session.query(
        sql.func.count(unlabelled_graph_database.ProgramGraph.ir_id)
      ).scalar()
    )

    # Check that every unique proto appears in the graph database.
    assert set(
      row.ir_id
      for row in session.query(graph_tuple_database.GraphTuple.ir_id).all()
    ) == set(
      row.ir_id
      for row in proto_session.query(
        unlabelled_graph_database.ProgramGraph.ir_id
      )
    )

    # Check the node counts of the generated graphs.
    assert (
      session.query(
        sql.func.sum(graph_tuple_database.GraphTuple.node_count)
      ).scalar()
      == n
      * proto_session.query(
        sql.func.sum(unlabelled_graph_database.ProgramGraph.node_count)
      ).scalar()
    )
Example No. 12
def test_MakeOpenClDevmapDataset(
    populated_ir_db: ir_database.Database,
    populated_proto_db: unlabelled_graph_database.Database,
    graph_db: graph_tuple_database.Database,
    gpu: str,
):
    """Test that the expected number of graph tuples are generated."""
    job = make_devmap_dataset.MakeOpenClDevmapDataset(
        ir_db=populated_ir_db,
        proto_db=populated_proto_db,
        graph_db=graph_db,
        gpu=gpu,
    )
    progress.Run(job)
    with graph_db.Session() as session:
        assert (session.query(
            sql.func.count(graph_tuple_database.GraphTuple.id)).scalar() >=
                256)
        # Check that there are 2-D node features.
        assert (session.query(
            graph_tuple_database.GraphTuple.node_x_dimensionality).first().
                node_x_dimensionality == 2)
Example No. 13
    def __init__(
        self,
        input_db: unlabelled_graph_database.Database,
        analysis: str,
        output_db: graph_tuple_database.Database,
        order_by: str = "in_order",
        max_instances: int = 0,
    ):
        self.analysis = analysis
        self.output_db = output_db

        # Check that the requested analysis exists.
        if analysis not in annotate.ANALYSES:
            raise app.UsageError(
                f"Unknown analysis: {analysis}. "
                f"Available analyses: {annotate.AVAILABLE_ANALYSES}", )

        with input_db.Session() as in_session, output_db.Session(
        ) as out_session:
            # Get the graphs that have already been processed.
            already_done_max, already_done_count = out_session.query(
                sql.func.max(graph_tuple_database.GraphTuple.ir_id),
                sql.func.count(
                    sql.func.distinct(graph_tuple_database.GraphTuple.ir_id)),
            ).one()
            already_done_max = already_done_max or -1

            # Get the total number of graphs, including those that have already been
            # processed.
            total_graph_count = in_session.query(
                sql.func.count(
                    unlabelled_graph_database.ProgramGraph.ir_id)).scalar()

            # Get the total number of graphs to process, and the IDs of the graphs to
            # process.
            ids_and_sizes_to_do = in_session.query(
                unlabelled_graph_database.ProgramGraph.ir_id,
                unlabelled_graph_database.ProgramGraph.serialized_proto_size,
            )
            if order_by == "in_order":
                ids_and_sizes_to_do = ids_and_sizes_to_do.filter(
                    unlabelled_graph_database.ProgramGraph.ir_id >
                    already_done_max).order_by(
                        unlabelled_graph_database.ProgramGraph.ir_id)
            elif order_by == "random":
                # Filter out the graphs that have already been processed.
                if already_done_count:
                    already_done_ids = {
                        row.ir_id
                        for row in out_session.query(
                            graph_tuple_database.GraphTuple.ir_id)
                    }
                    assert len(already_done_ids) == already_done_count
                    ids_and_sizes_to_do = ids_and_sizes_to_do.filter(
                        ~unlabelled_graph_database.ProgramGraph.ir_id.in_(
                            already_done_ids))
                # Order the graphs to do randomly.
                ids_and_sizes_to_do = ids_and_sizes_to_do.order_by(
                    input_db.Random())
            else:
                raise app.UsageError(f"Unknown order: {order_by}")

            # Optionally limit the number of IDs to process.
            if max_instances:
                ids_and_sizes_to_do = ids_and_sizes_to_do.limit(max_instances)
            ids_and_sizes_to_do = [(row.ir_id, row.serialized_proto_size)
                                   for row in ids_and_sizes_to_do]

        # Sanity check.
        if not max_instances:
            if len(ids_and_sizes_to_do
                   ) + already_done_count != total_graph_count:
                raise OSError(
                    f"ids_to_do({len(ids_and_sizes_to_do)}) + "
                    f"already_done({already_done_count}) != "
                    f"total_rows({total_graph_count})")

        with output_db.Session(commit=True) as out_session:
            out_session.add(
                unlabelled_graph_database.Meta.Create(
                    key="Graph counts",
                    value=(already_done_count, total_graph_count)))
        app.Log(
            1,
            "Selected %s of %s to process",
            humanize.Commas(len(ids_and_sizes_to_do)),
            humanize.Plural(total_graph_count, "unlabelled graph"),
        )

        super(DatasetGenerator, self).__init__(name=analysis,
                                               i=already_done_count,
                                               n=total_graph_count,
                                               unit="protos")

        self.graph_reader = ppar.ThreadedIterator(
            BatchedProtoReader(
                input_db,
                ids_and_sizes_to_do,
                FLAGS.proto_batch_mb * 1024 * 1024,
                order_by,
                self.ctx.ToProgressContext(),
            ),
            max_queue_size=FLAGS.max_reader_queue_size,
        )
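Stripped of project specifics, the resume logic in this constructor is a small pattern: record the highest processed key in the output table, then select only input keys above it. A minimal sketch of that pattern in plain SQLAlchemy (the helper name, table classes, and `ir_id` column here are illustrative):

import sqlalchemy as sql

def SelectUnprocessedIds(in_session, out_session, InputRow, OutputRow):
    """Sketch: resume an in-order pass by skipping already-processed IDs."""
    already_done_max = (
        out_session.query(sql.func.max(OutputRow.ir_id)).scalar() or -1)
    return [
        row.ir_id for row in in_session.query(InputRow.ir_id).filter(
            InputRow.ir_id > already_done_max).order_by(InputRow.ir_id)
    ]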
Example No. 14
  def __init__(
    self,
    db: graph_tuple_database.Database,
    buffer_size_mb: int = 16,
    filters: Optional[List[Callable[[], bool]]] = None,
    order: BufferedGraphReaderOrder = BufferedGraphReaderOrder.IN_ORDER,
    eager_graph_loading: bool = True,
    limit: Optional[int] = None,
    ctx: progress.ProgressContext = progress.NullContext,
  ):
    """Constructor.

    Args:
      db: The database to iterate over.
      filters: An optional list of callbacks, where each callback returns a
        filter condition on the GraphTuple table.
      order: Determine the order to read graphs. See BufferedGraphReaderOrder.
      eager_graph_loading: If true, load the contents of the Graph table eagerly,
        preventing the need for subsequent SQL queries to access the graph data.
      buffer_size_mb: The size of the read buffer, in megabytes. A larger
        buffer reduces the number of queries, but increases the memory
        requirement.
      limit: Limit the total number of rows returned to this value.

    Raises:
      ValueError: If the query with the given filters returns no results.
    """
    self.db = db
    self.order = order
    self.max_buffer_size = buffer_size_mb * 1024 * 1024
    self.eager_graph_loading = eager_graph_loading
    self.filters = filters or []
    self.ctx = ctx

    # Graphs that fail during dataset generation are inserted as zero-node
    # entries. Ignore those.
    self.filters.append(lambda: graph_tuple_database.GraphTuple.node_count > 1)

    if not self.db.graph_count:
      raise ValueError(f"Database contains no graphs: {self.db.url}")

    with ctx.Profile(
      3,
      lambda _: (
        f"Selected {humanize.Commas(self.n)} of "
        f"{humanize.Commas(self.db.graph_count)} graphs from database"
      ),
    ):
      with db.Session() as session:
        # Random ordering means that we can't use
        # labm8.py.sqlutil.OffsetLimitBatchedQuery() to read results as each
        # query will produce a different random order. Instead, first run a
        # query to read all of the IDs and the corresponding tuple sizes that
        # match the query, then iterate through the list of IDs.
        query = session.query(
          graph_tuple_database.GraphTuple.id,
          graph_tuple_database.GraphTuple.pickled_graph_tuple_size.label(
            "size"
          ),
        )

        # Apply the requested filters.
        for filter_cb in self.filters:
          query = query.filter(filter_cb())

        # Randomized and data-flow-step orderings do not produce sorted IDs,
        # so buffered reads cannot scan the graph table using index range
        # checks. In all other cases, sort by ID so that they can.
        if order == BufferedGraphReaderOrder.DATA_FLOW_STEPS:
          self.ordered_ids = False
          query = query.order_by(
            graph_tuple_database.GraphTuple.data_flow_steps
          )
        elif order == BufferedGraphReaderOrder.GLOBAL_RANDOM:
          self.ordered_ids = False
          query = query.order_by(db.Random())
        else:
          self.ordered_ids = True
          query = query.order_by(graph_tuple_database.GraphTuple.id)

        # Read the full set of graph IDs and sizes.
        self.ids_and_sizes = [(row.id, row.size) for row in query.all()]

      if not self.ids_and_sizes:
        raise ValueError(
          f"Query on database `{db.url}` returned no results: "
          f"`{sqlutil.QueryToString(query)}`"
        )

      # When we are limiting the number of rows and not reading the table in
      # order, pick a random starting point in the list of IDs.
      if limit and order != BufferedGraphReaderOrder.IN_ORDER:
        batch_start = random.randint(
          0, max(len(self.ids_and_sizes) - limit - 1, 0)
        )
        self.ids_and_sizes = self.ids_and_sizes[
          batch_start : batch_start + limit
        ]
      elif limit:
        # If we are reading the table in order, we must still respect the limit
        # argument.
        self.ids_and_sizes = self.ids_and_sizes[:limit]

      self.i = 0
      self.n = len(self.ids_and_sizes)

      # The local buffer of graphs, and an index into that buffer.
      self.buffer: List[graph_tuple_database.GraphTuple] = []
      self.buffer_i = 0
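A short usage sketch for the reader, assuming the class above is named `BufferedGraphReader` and implements the iterator protocol over `graph_tuple_database.GraphTuple` rows (the iteration methods are not shown in this excerpt):

reader = BufferedGraphReader(
  db, order=BufferedGraphReaderOrder.GLOBAL_RANDOM, limit=1000
)
for graph_tuple in reader:
  # The constructor appended a node_count > 1 filter, so the zero-node rows
  # inserted for failed dataset-generation runs never appear here.
  assert graph_tuple.node_count > 1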