def main(unused_argv):
    """Builds NSL training/test TFRecord files from the Cora dataset."""
    start_time = time.time()

    # Split the Cora content into labeled train and held-out test Examples.
    train_examples, test_examples = parse_cora_content(
        FLAGS.input_cora_content, FLAGS.train_percentage)

    # Load the citation graph and mirror every edge so it is undirected.
    graph = graph_utils.read_tsv_graph(FLAGS.input_cora_graph)
    graph_utils.add_undirected_edges(graph)

    # Merge neighbor features into each training example. The test examples
    # participate only as *unlabeled* neighbors (transductive setting): their
    # labels are never consulted here.
    with tf.io.TFRecordWriter(FLAGS.output_train_data) as writer:
        merged_examples = _join_examples(train_examples, test_examples, graph,
                                         FLAGS.max_nbrs)
        for merged_example in merged_examples:
            writer.write(merged_example.SerializeToString())

    logging.info('Output training data written to TFRecord file: %s.',
                 FLAGS.output_train_data)

    # Test examples are written out unmodified: graph regularization applies
    # only at training time, so no neighbor features are joined in.
    with tf.io.TFRecordWriter(FLAGS.output_test_data) as writer:
        for example in six.itervalues(test_examples):
            writer.write(example.SerializeToString())

    logging.info('Output test data written to TFRecord file: %s.',
                 FLAGS.output_test_data)
    logging.info('Total running time: %.2f minutes.',
                 (time.time() - start_time) / 60.0)
# Example #2
def _main(argv):
    """Main function for running the pack_nbrs program."""
    flag = flags.FLAGS
    flag.showprefixforinfo = False
    start_time = time.time()

    # The program name plus exactly four positional arguments are required.
    if len(argv) != 5:
        raise app.UsageError(
            'Invalid number of arguments; expected 4, got %d' %
            (len(argv) - 1))

    # Labeled seed Examples are mandatory. Unlabeled neighbor Examples are
    # optional; when the path is empty, all neighbors come from the seed set.
    seed_exs = _read_tfrecord_examples(argv[1], flag.id_feature_name)
    if argv[2]:
        nbr_exs = _read_tfrecord_examples(argv[2], flag.id_feature_name)
    else:
        nbr_exs = {}

    # Load the TSV graph and, if requested, mirror every edge.
    graph = graph_utils.read_tsv_graph(argv[3])
    if flag.add_undirected_edges:
        graph_utils.add_undirected_edges(graph)

    # Augment each seed Example with its neighbors' features and persist the
    # result as a TFRecord file.
    output_tfr = argv[4]
    with tf.io.TFRecordWriter(output_tfr) as writer:
        for merged_ex in _join_examples(seed_exs, nbr_exs, graph,
                                        flag.max_nbrs):
            writer.write(merged_ex.SerializeToString())
    logging.info('Output written to TFRecord file: %s.', output_tfr)
    logging.info('Total running time: %.2f minutes.',
                 (time.time() - start_time) / 60.0)
 def testBuildGraphNoThresholdingNoLSH(self):
     """All edges whose weight is greater than 0 are retained."""
     embeddings = r3_embeddings
     embedding_path = self._create_embedding_file()
     write_embeddings(embeddings, embedding_path)
     graph_path = self._create_graph_file()
     build_graph_lib.build_graph([embedding_path],
                                 graph_path,
                                 similarity_threshold=0)
     g_actual = graph_utils.read_tsv_graph(graph_path)
     # Each pair of the three points has similarity 0.5, so with a zero
     # threshold the result is the complete directed graph on {A, B, C}.
     g_expected = {
         'A': {'B': 0.5, 'C': 0.5},
         'B': {'A': 0.5, 'C': 0.5},
         'C': {'A': 0.5, 'B': 0.5},
     }
     self.assertDictEqual(g_actual, g_expected)
    def testBuildGraphWithThresholdWithLSHSufficientLSHRounds(self):
        """Tests the case where we use (multiple rounds of) LSH bucketing."""
        # Write the test embeddings to a file.
        num_points = 20
        embeddings, adjacent_similarity = self._build_test_embeddings(
            num_points)
        embedding_path = self._create_embedding_file()
        write_embeddings(embeddings, embedding_path)

        # Build the graph with LSH bucketing enabled, then read it back.
        graph_path = self._create_graph_file()
        build_graph_lib.build_graph([embedding_path],
                                    graph_path,
                                    similarity_threshold=0.9,
                                    lsh_splits=2,
                                    lsh_rounds=4,
                                    random_seed=12345)
        g_actual = graph_utils.read_tsv_graph(graph_path)

        # Expected graph: each point neighbors exactly its predecessor and
        # successor in the 'embeddings' sequence. Adjacent points have cosine
        # similarity ~0.951057, while points two apart have ~0.809017, which
        # falls below the 0.9 similarity threshold.
        g_expected = {}
        for idx in range(num_points):
            nbr_dict = g_expected.setdefault('id_{}'.format(idx), {})
            nbr_dict['id_{}'.format(
                (idx - 1) % num_points)] = adjacent_similarity
            nbr_dict['id_{}'.format(
                (idx + 1) % num_points)] = adjacent_similarity
        self.assertDictEqual(g_actual, g_expected)
# Example #5
    def testBuildGraphWithThresholdWithLSHInsufficientLSHRounds(self):
        """Tests that some edges are lost with insufficient LSH rounds."""
        # Write the test embeddings to a file.
        num_points = 20
        embeddings, _ = self._build_test_embeddings(num_points)
        embedding_path = self._create_embedding_file()
        write_embeddings(embeddings, embedding_path)

        # Build the graph with only a single LSH round, then read it back.
        graph_path = self._create_graph_file()
        build_graph_lib.build_graph([embedding_path],
                                    graph_path,
                                    similarity_threshold=0.9,
                                    lsh_splits=2,
                                    lsh_rounds=1,
                                    random_seed=12345)
        self.assertEqual(self._num_file_lines(graph_path), num_points * 2 - 8)
        g_actual = graph_utils.read_tsv_graph(graph_path)

        # One LSH round misses 8 of the 2 * N edges a full run would find.
        actual_edge_cnt = sum(
            len(tgt_dict) for tgt_dict in six.itervalues(g_actual))
        self.assertEqual(actual_edge_cnt, 2 * len(embeddings) - 8,
                         'Expected some edges not to have been found.')
# Example #6
 def testGraphBuildingWithThresholding(self):
   """Edges below the similarity threshold are not part of the graph."""
   embedding_path = self._create_embedding_file()
   self._write_embeddings(embedding_path)
   graph_path = self._create_graph_file()
   # With the threshold just above 0.5, every candidate edge is rejected,
   # so the parsed graph must come back empty.
   build_graph_lib.build_graph([embedding_path],
                               graph_path,
                               similarity_threshold=0.51)
   self.assertDictEqual(graph_utils.read_tsv_graph(graph_path), {})
# Example #7
 def testBuildGraphWithThresholdingNoLSH(self):
     """Edges below the similarity threshold are not part of the graph."""
     embeddings = r3_embeddings
     embedding_path = self._create_embedding_file()
     write_embeddings(embeddings, embedding_path)
     graph_path = self._create_graph_file()
     # All pairwise similarities are 0.5, so a 0.51 threshold drops every
     # edge: the output file has no lines and the parsed graph is empty.
     build_graph_lib.build_graph([embedding_path],
                                 graph_path,
                                 similarity_threshold=0.51)
     self.assertEqual(self._num_file_lines(graph_path), 0)
     self.assertDictEqual(graph_utils.read_tsv_graph(graph_path), {})
# Example #8
def pack_nbrs(labeled_examples_path,
              unlabeled_examples_path,
              graph_path,
              output_training_data_path,
              add_undirected_edges=False,
              max_nbrs=None,
              id_feature_name='id'):
  """Prepares input for graph-based Neural Structured Learning and persists it.

  In particular, this function merges into each labeled training example the
  features from its out-edge neighbor examples according to a supplied
  similarity graph, and persists the resulting (augmented) training data.

  Each `tf.train.Example` read from the files identified by
  `labeled_examples_path` and `unlabeled_examples_path` is expected to have a
  feature that contains its ID (represented as a singleton `bytes_list` value);
  the name of this feature is specified by the value of `id_feature_name`.

  Each edge in the graph specified by `graph_path` is identified by a source
  instance ID, a target instance ID, and an optional edge weight. These edges
  are specified by TSV lines of the following form:

  ```
  source_id<TAB>target_id[<TAB>edge_weight]
  ```

  If no `edge_weight` is specified, it defaults to 1.0. If the input graph is
  not symmetric and if `add_undirected_edges` is `True`, then all edges will be
  treated as bi-directional. To build a graph based on the similarity of
  instances' dense embeddings, see `nsl.tools.build_graph`.

  This function merges into each labeled example the features of that example's
  out-edge neighbors according to that instance's in-edges in the graph. If a
  value is specified for `max_nbrs`, then at most that many neighbors' features
  are merged into each labeled instance (based on which neighbors have the
  largest edge weights, with ties broken using instance IDs).

  Here's how the merging process works. For each labeled example, the features
  of its `i`'th out-edge neighbor will be prefixed by `NL_nbr_<i>_`, with
  indexes `i` in the half-open interval `[0, K)`, where K is the minimum of
  `max_nbrs` and the number of the labeled example's out-edges in the graph. A
  feature named `NL_nbr_<i>_weight` will also be merged into the labeled example
  whose value will be the neighbor's corresponding edge weight. The top
  neighbors to use in this process are selected by consulting the input graph
  and selecting the labeled example's out-edge neighbors with the largest edge
  weight; ties are broken by preferring neighbor IDs with larger lexicographic
  order. A feature named `NL_num_nbrs` is also set on the result (a singleton
  `int64_list`) denoting the number of neighbors `K` merged into the labeled
  example.

  Finally, the merged examples are written to a TFRecord file named by
  `output_training_data_path`.

  Args:
    labeled_examples_path: Names a TFRecord file containing labeled
      `tf.train.Example` instances.
    unlabeled_examples_path: Names a TFRecord file containing unlabeled
      `tf.train.Example` instances. This can be an empty string if there are no
      unlabeled examples.
    graph_path: Names a TSV file that specifies a graph as a set of edges
      representing similarity relationships.
    output_training_data_path: Path to a file where the resulting augmented
      training data in the form of `tf.train.Example` instances will be
      persisted in the TFRecord format.
    add_undirected_edges: `Boolean` indicating whether or not to treat edges as
      bi-directional.
    max_nbrs: The maximum number of neighbors to use to generate the augmented
      training data for downstream training.
    id_feature_name: The name of the feature in the input labeled and unlabeled
      `tf.train.Example` objects representing the ID of examples.
  """
  start_time = time.time()

  # Read seed and neighbor TFRecord input files.
  seed_exs = _read_tfrecord_examples(labeled_examples_path, id_feature_name)
  # Unlabeled neighbor input instances are optional. If not provided, all
  # neighbors used will be labeled instances.
  nbr_exs = _read_tfrecord_examples(
      unlabeled_examples_path,
      id_feature_name) if unlabeled_examples_path else {}

  # Read the input graph in TSV format, and conditionally reverse all its edges.
  graph = graph_utils.read_tsv_graph(graph_path)
  if add_undirected_edges:
    graph_utils.add_undirected_edges(graph)

  # Join the edges with the seed and neighbor Examples, and write out the
  # results to the output TFRecord file.
  with tf.io.TFRecordWriter(output_training_data_path) as writer:
    for merged_ex in _join_examples(seed_exs, nbr_exs, graph, max_nbrs):
      writer.write(merged_ex.SerializeToString())
  logging.info('Output written to TFRecord file: %s.',
               output_training_data_path)
  logging.info('Total running time: %.2f minutes.',
               (time.time() - start_time) / 60.0)
# Example #9
 def testReadAndWriteTsvGraph(self):
     """Round-tripping a graph through the TSV format preserves it."""
     tsv_path = self.create_tempfile('graph.tsv').full_path
     graph_utils.write_tsv_graph(tsv_path, GRAPH)
     self.assertDictEqual(graph_utils.read_tsv_graph(tsv_path), GRAPH)