Example #1
def TestDataflowGGNN(
    path: pathlib.Path,
    log_dir: pathlib.Path,
    analysis: str,
    vocab: Dict[str, int],
    limit_max_data_flow_steps: bool,
    batch_size: int,
    use_cdfg: bool,
):
    dataflow.PatchWarnings()
    dataflow.RecordExperimentalSetup(log_dir)

    # Create the logging directories.
    assert (log_dir / "epochs").is_dir()
    assert (log_dir / "checkpoints").is_dir()
    (log_dir / "graph_loader").mkdir(exist_ok=True)

    # Create the model, defining the shape of the graphs that it will process.
    #
    # For these data flow experiments, our graphs contain per-node binary
    # classification targets (e.g. reachable / not-reachable).
    model = Ggnn(
        vocabulary=vocab,
        test_only=True,
        node_y_dimensionality=2,
        graph_y_dimensionality=0,
        graph_x_dimensionality=0,
        use_selector_embeddings=True,
    )
    restored_epoch, checkpoint = dataflow.SelectTestCheckpoint(log_dir)
    model.RestoreCheckpoint(checkpoint)

    batches = MakeBatchBuilder(
        dataset_root=path,
        log_dir=log_dir,
        epoch_type=epoch_pb2.TEST,
        analysis=analysis,
        model=model,
        batch_size=batch_size,
        use_cdfg=use_cdfg,
        # Specify that we require at least one graph, as the default (no min) will
        # loop forever.
        min_graph_count=1,
        limit_max_data_flow_steps=limit_max_data_flow_steps,
    )

    start_time = time.time()
    test_results = model.RunBatches(epoch_pb2.TEST, batches, log_prefix="Test")
    epoch = epoch_pb2.EpochList(epoch=[
        epoch_pb2.Epoch(
            walltime_seconds=time.time() - start_time,
            epoch_num=restored_epoch.epoch_num,
            test_results=test_results,
        )
    ])
    print(epoch, end="")

    epoch_path = log_dir / "epochs" / "TEST.EpochList.pbtxt"
    pbutil.ToFile(epoch, epoch_path)
    logging.info("Wrote %s", epoch_path)
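For reference, a minimal driver for TestDataflowGGNN might look like the sketch below. The dataset layout, the run id, and the LoadVocabulary call are assumptions modelled on the other examples on this page, not the exact upstream entry point.

# Hypothetical invocation: evaluate a finished training run on the test set.
# log_dir must be an existing run directory, since the function asserts that
# its "epochs" and "checkpoints" subdirectories already exist.
import pathlib

path = pathlib.Path("~/programl/dataflow").expanduser()
log_dir = path / "logs" / "programl" / "reachability" / "my_run_id"
vocab = vocabulary.LoadVocabulary(path, model_name="programl")

TestDataflowGGNN(
    path=path,
    log_dir=log_dir,
    analysis="reachability",
    vocab=vocab,
    limit_max_data_flow_steps=True,
    batch_size=10000,
    use_cdfg=False,
)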
Example #2
def TestOne(
    features_list_path: pathlib.Path,
    features_list_index: int,
    checkpoint_path: pathlib.Path,
) -> BatchResults:
    path = pathlib.Path(pathflag.path())

    features_list = pbutil.FromFile(
        features_list_path,
        program_graph_features_pb2.ProgramGraphFeaturesList(),
    )
    features = features_list.graph[features_list_index]

    graph_name = features_list_path.name[: -len(".ProgramGraphFeaturesList.pb")]
    graph = pbutil.FromFile(
        path / "graphs" / f"{graph_name}.ProgramGraph.pb",
        program_graph_pb2.ProgramGraph(),
    )

    # Instantiate and restore the model.
    vocab = vocabulary.LoadVocabulary(
        path,
        model_name="cdfg" if FLAGS.cdfg else "programl",
        max_items=FLAGS.max_vocab_size,
        target_cumfreq=FLAGS.target_vocab_cumfreq,
    )

    if FLAGS.cdfg:
        FLAGS.use_position_embeddings = False

    model = Ggnn(
        vocabulary=vocab,
        test_only=True,
        node_y_dimensionality=2,
        graph_y_dimensionality=0,
        graph_x_dimensionality=0,
        use_selector_embeddings=True,
    )
    checkpoint = pbutil.FromFile(checkpoint_path, checkpoint_pb2.Checkpoint())
    model.RestoreCheckpoint(checkpoint)

    batch = list(
        DataflowGgnnBatchBuilder(
            graph_loader=SingleGraphLoader(graph=graph, features=features),
            vocabulary=vocab,
            max_node_size=int(1e9),
            use_cdfg=FLAGS.cdfg,
            max_batch_count=1,
        )
    )[0]

    results = model.RunBatch(epoch_pb2.TEST, batch)

    return AnnotateGraphWithBatchResults(graph, features, results)
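TestOne runs the restored model on a single graph and returns the graph annotated with per-node predictions. A hedged usage sketch, assuming the labels/graphs naming convention that the function body itself relies on (the concrete paths are hypothetical):

# Hypothetical invocation: score the third graph in one features list.
results = TestOne(
    features_list_path=pathlib.Path(
        "dataflow/labels/reachability/foo.ProgramGraphFeaturesList.pb"
    ),
    features_list_index=2,
    checkpoint_path=pathlib.Path(
        "dataflow/logs/programl/reachability/my_run_id/checkpoints/020.Checkpoint.pb"
    ),
)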
Example #3
def TrainDataflowGGNN(
  path: pathlib.Path,
  analysis: str,
  vocab: Dict[str, int],
  limit_max_data_flow_steps: bool,
  train_graph_counts: List[int],
  val_graph_count: int,
  val_seed: int,
  batch_size: int,
  use_cdfg: bool,
  run_id: Optional[str] = None,
  restore_from: Optional[pathlib.Path] = None,
) -> pathlib.Path:
  if not path.is_dir():
    raise FileNotFoundError(path)

  if restore_from:
    log_dir = restore_from
  else:
    # Create the logging directories.
    log_dir = dataflow.CreateLoggingDirectories(
      dataset_root=path,
      model_name="cdfg" if use_cdfg else "programl",
      analysis=analysis,
      run_id=run_id,
    )

  dataflow.PatchWarnings()
  dataflow.RecordExperimentalSetup(log_dir)

  # Cumulative totals for training graph counts at each "epoch".
  train_graph_cumsums = np.array(train_graph_counts, dtype=np.int32)
  # The number of training graphs in each "epoch".
  train_graph_counts = train_graph_cumsums - np.concatenate(
    ([0], train_graph_counts[:-1])
  )
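  # Worked example: train_graph_counts=[10000, 30000, 60000] is interpreted
  # as cumulative totals, so the per-"epoch" counts computed here are
  # [10000, 20000, 30000].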

  # Create the model, defining the shape of the graphs that it will process.
  #
  # For these data flow experiments, our graphs contain per-node binary
  # classification targets (e.g. reachable / not-reachable).
  model = Ggnn(
    vocabulary=vocab,
    test_only=False,
    node_y_dimensionality=2,
    graph_y_dimensionality=0,
    graph_x_dimensionality=0,
    use_selector_embeddings=True,
  )

  if restore_from:
    # Pick up training where we left off.
    restored_epoch, checkpoint = dataflow.SelectTrainingCheckpoint(log_dir)
    # Skip the epochs that we have already done.
    # This requires that --train_graph_counts is the same as it was in the
    # run that we are resuming!
    start_epoch_step = restored_epoch.epoch_num
    start_graph_cumsum = sum(train_graph_counts[:start_epoch_step])
    train_graph_counts = train_graph_counts[start_epoch_step:]
    model.RestoreCheckpoint(checkpoint)
  else:
    # Else initialize a new model.
    model.Initialize()
    start_epoch_step, start_graph_cumsum = 1, 0

  app.Log(
    1,
    "GGNN has %s training params",
    humanize.Commas(model.trainable_parameter_count),
  )

  # Create training batches and split into epochs.
  epochs = EpochBatchIterator(
    MakeBatchBuilder(
      dataset_root=path,
      log_dir=log_dir,
      epoch_type=epoch_pb2.TRAIN,
      analysis=analysis,
      model=model,
      batch_size=batch_size,
      use_cdfg=use_cdfg,
      limit_max_data_flow_steps=limit_max_data_flow_steps,
    ),
    train_graph_counts,
    start_graph_count=start_graph_cumsum,
  )

  # Read val batches asynchronously.
  val_batches = AsyncBatchBuilder(
    MakeBatchBuilder(
      dataset_root=path,
      log_dir=log_dir,
      epoch_type=epoch_pb2.VAL,
      analysis=analysis,
      model=model,
      batch_size=batch_size,
      use_cdfg=use_cdfg,
      limit_max_data_flow_steps=limit_max_data_flow_steps,
      min_graph_count=val_graph_count,
      max_graph_count=val_graph_count,
      seed=val_seed,
    )
  )

  for (
    epoch_step,
    (train_graph_count, train_graph_cumsum, train_batches),
  ) in enumerate(epochs, start=start_epoch_step):
    start_time = time.time()
    hr_graph_cumsum = f"{humanize.Commas(train_graph_cumsum)} graphs"

    train_results = model.RunBatches(
      epoch_pb2.TRAIN,
      train_batches,
      log_prefix=f"Train to {hr_graph_cumsum}",
      total_graph_count=train_graph_count,
    )
    val_results = model.RunBatches(
      epoch_pb2.VAL,
      val_batches.batches,
      log_prefix=f"Val at {hr_graph_cumsum}",
      total_graph_count=val_graph_count,
    )

    # Write the epoch to file as an epoch list. This may seem redundant since
    # epoch list contains a single item, but it means that we can easily
    # concatenate a sequence of these epoch protos to produce a valid epoch
    # list using: `cat *.EpochList.pbtxt > epochs.pbtxt`
    epoch = epoch_pb2.EpochList(
      epoch=[
        epoch_pb2.Epoch(
          walltime_seconds=time.time() - start_time,
          epoch_num=epoch_step,
          train_results=train_results,
          val_results=val_results,
        )
      ]
    )
    print(epoch, end="")

    epoch_path = log_dir / "epochs" / f"{epoch_step:03d}.EpochList.pbtxt"
    pbutil.ToFile(epoch, epoch_path)
    app.Log(1, "Wrote %s", epoch_path)

    checkpoint_path = (
      log_dir / "checkpoints" / f"{epoch_step:03d}.Checkpoint.pb"
    )
    pbutil.ToFile(model.SaveCheckpoint(), checkpoint_path)

  return log_dir
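A hedged sketch of launching training with TrainDataflowGGNN. The key subtlety is that train_graph_counts is cumulative: each entry is the total number of training graphs seen when that "epoch" ends, and the function differences the list internally. The paths and counts below are illustrative assumptions.

# Hypothetical training invocation: validate and checkpoint after 10k, 100k,
# and 1M total training graphs, then resume the same run later.
log_dir = TrainDataflowGGNN(
    path=pathlib.Path("~/programl/dataflow").expanduser(),
    analysis="reachability",
    vocab=vocab,  # e.g. from vocabulary.LoadVocabulary(), as in Example #2.
    limit_max_data_flow_steps=True,
    train_graph_counts=[10000, 100000, 1000000],
    val_graph_count=10000,
    val_seed=123,
    batch_size=10000,
    use_cdfg=False,
)
# Resuming requires the same train_graph_counts as the original run:
# TrainDataflowGGNN(..., restore_from=log_dir)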
Example #4
def Main():
    # NOTE(github.com/ChrisCummins/ProGraML/issues/13): F1 score computation
    # warns that it is undefined when there are missing instances from a class,
    # which is fine for our usage.
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

    with data_directory() as path:
        Print("=== BENCHMARK 1: Loading graphs from filesystem ===")
        graph_loader = GraphLoader(path)
        graphs = ppar.ThreadedIterator(graph_loader, max_queue_size=100)
        with prof.Profile("Benchmark graph loader"):
            for _ in tqdm(graphs, unit=" graphs"):
                pass
        app.Log(1, "Skip count: %s", graph_loader.skip_count)

        Print(
            "=== BENCHMARK 1: Loading graphs from filesystem and converting to CDFG ==="
        )
        graph_loader = GraphLoader(path, use_cdfg=True)
        graphs = ppar.ThreadedIterator(graph_loader, max_queue_size=100)
        with prof.Profile("Benchmark CDFG graph loader"):
            for _ in tqdm(graphs, unit=" graphs"):
                pass
        app.Log(1, "Skip count: %s", graph_loader.skip_count)

        Print("=== BENCHMARK 2: Batch construction ===")
        batches = BatchBuilder(GraphLoader(path), Vocab())
        batches = ppar.ThreadedIterator(batches, max_queue_size=100)
        cached_batches = []
        with prof.Profile("Benchmark batch construction"):
            for batch in tqdm(batches, unit=" batches"):
                cached_batches.append(batch)

        Print("=== BENCHMARK 2: CDFG batch construction ===")
        batches = BatchBuilder(GraphLoader(path, use_cdfg=True),
                               Vocab(),
                               use_cdfg=True)
        batches = ppar.ThreadedIterator(batches, max_queue_size=100)
        cached_cdfg_batches = []  # Don't clobber the non-CDFG batches used below.
        with prof.Profile("Benchmark batch construction"):
            for batch in tqdm(batches, unit=" batches"):
                cached_cdfg_batches.append(batch)

        Print("=== BENCHMARK 3: Model training ===")
        model = Ggnn(
            vocabulary=Vocab(),
            node_y_dimensionality=2,
            graph_y_dimensionality=0,
            graph_x_dimensionality=0,
            use_selector_embeddings=True,
        )

        with prof.Profile("Benchmark training (prebuilt batches)"):
            model.RunBatches(
                epoch_pb2.TRAIN,
                cached_batches[:FLAGS.train_batch_count],
                log_prefix="Train",
                total_graph_count=sum(
                    b.graph_count
                    for b in cached_batches[:FLAGS.train_batch_count]),
            )
        with prof.Profile("Benchmark training"):
            model.RunBatches(
                epoch_pb2.TRAIN,
                BatchBuilder(GraphLoader(path), Vocab(),
                             FLAGS.train_batch_count),
                log_prefix="Train",
            )

        Print("=== BENCHMARK 4: Model inference ===")
        model = Ggnn(
            vocabulary=Vocab(),
            test_only=True,
            node_y_dimensionality=2,
            graph_y_dimensionality=0,
            graph_x_dimensionality=0,
            use_selector_embeddings=True,
        )

        with prof.Profile("Benchmark inference (prebuilt batches)"):
            model.RunBatches(
                epoch_pb2.TEST,
                cached_batches[:FLAGS.test_batch_count],
                log_prefix="Val",
                total_graph_count=sum(
                    b.graph_count
                    for b in cached_batches[:FLAGS.test_batch_count]),
            )
        with prof.Profile("Benchmark inference"):
            model.RunBatches(
                epoch_pb2.TEST,
                BatchBuilder(GraphLoader(path), Vocab(),
                             FLAGS.test_batch_count),
                log_prefix="Val",
            )
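Both benchmark scripts rely on the same producer/consumer pattern: a slow graph-loading iterator is wrapped in a ThreadedIterator so that I/O overlaps with whatever consumes the graphs. A minimal sketch of that pattern using only the standard library (the real ppar.ThreadedIterator may differ in details):

import queue
import threading

class ThreadedIterator:
    """Run an iterable on a background thread, buffering items in a queue."""

    _SENTINEL = object()

    def __init__(self, iterable, max_queue_size=100):
        self._queue = queue.Queue(maxsize=max_queue_size)
        self._thread = threading.Thread(
            target=self._produce, args=(iterable,), daemon=True
        )
        self._thread.start()

    def _produce(self, iterable):
        for item in iterable:
            self._queue.put(item)
        self._queue.put(self._SENTINEL)  # Signal exhaustion to the consumer.

    def __iter__(self):
        while True:
            item = self._queue.get()
            if item is self._SENTINEL:
                return
            yield item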
Example #5
def main(argv):
    if len(argv) != 1:
        raise app.UsageError(f"Unrecognized arguments: {argv[1:]}")
    # NOTE(github.com/ChrisCummins/ProGraML/issues/13): F1 score computation
    # warns that it is undefined when there are missing instances from a class,
    # which is fine for our usage.
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

    with data_directory() as path:
        with benchmark("Loading graphs from filesystem"):
            graph_loader = make_graph_loader(path)
            graphs = ThreadedIterator(graph_loader, max_queue_size=100)
            with progress.Profile("Benchmark graph loader"):
                for _ in tqdm(graphs, unit=" graphs"):
                    pass
            logging.info("Skip count: %s", graph_loader.skip_count)

        with benchmark(
                "Loading graphs from filesystem and converting to CDFG"):
            graph_loader = make_graph_loader(path, use_cdfg=True)
            graphs = ThreadedIterator(graph_loader, max_queue_size=100)
            with progress.Profile("Benchmark CDFG graph loader"):
                for _ in tqdm(graphs, unit=" graphs"):
                    pass
            logging.info("Skip count: %s", graph_loader.skip_count)

        with benchmark("Batch construction"):
            batches = make_batch_builder(make_graph_loader(path), Vocab())
            batches = ThreadedIterator(batches, max_queue_size=100)
            cached_batches = []
            for batch in tqdm(batches, unit=" batches"):
                cached_batches.append(batch)

        with benchmark("CDFG batch construction"):
            batches = make_batch_builder(make_graph_loader(path,
                                                           use_cdfg=True),
                                         Vocab(),
                                         use_cdfg=True)
            batches = ThreadedIterator(batches, max_queue_size=100)
            cached_cdfg_batches = []
            with progress.Profile("Benchmark batch construction"):
                for batch in tqdm(batches, unit=" batches"):
                    cached_cdfg_batches.append(batch)

        model = Ggnn(
            vocabulary=Vocab(),
            node_y_dimensionality=2,
            graph_y_dimensionality=0,
            graph_x_dimensionality=0,
            use_selector_embeddings=True,
        )

        with benchmark("Training (prebuilt batches)"):
            model.RunBatches(
                epoch_pb2.TRAIN,
                cached_batches[:FLAGS.train_batch_count],
                log_prefix="Train",
                total_graph_count=sum(
                    b.graph_count
                    for b in cached_batches[:FLAGS.train_batch_count]),
            )

        with benchmark("Training"):
            model.RunBatches(
                epoch_pb2.TRAIN,
                make_batch_builder(make_graph_loader(path), Vocab(),
                                   FLAGS.train_batch_count),
                log_prefix="Train",
            )

        model = Ggnn(
            vocabulary=Vocab(),
            test_only=True,
            node_y_dimensionality=2,
            graph_y_dimensionality=0,
            graph_x_dimensionality=0,
            use_selector_embeddings=True,
        )

        with benchmark("Inference (prebuilt batches)"):
            model.RunBatches(
                epoch_pb2.TEST,
                cached_batches[:FLAGS.test_batch_count],
                log_prefix="Val",
                total_graph_count=sum(
                    b.graph_count
                    for b in cached_batches[:FLAGS.test_batch_count]),
            )
        with benchmark("Inference"):
            model.RunBatches(
                epoch_pb2.TEST,
                make_batch_builder(make_graph_loader(path), Vocab(),
                                   FLAGS.test_batch_count),
                log_prefix="Val",
            )
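The benchmark helper used throughout this example is not shown above. A plausible minimal implementation, assuming it only prints a banner and reports elapsed wall time (the real helper may also log through the app's logging framework):

import contextlib
import time

@contextlib.contextmanager
def benchmark(name):
    # Print a banner, run the wrapped block, then report wall time.
    print(f"=== BENCHMARK: {name} ===")
    start = time.time()
    try:
        yield
    finally:
        print(f"{name}: {time.time() - start:.1f}s")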