def test_make_edges_pos_train_test_valid_edges_distinct(
            self, train, test, valid):
        """Check that the train/test/validation edge files do not overlap.

        Runs ``make_holdouts`` with ``validation=True`` into a fresh
        temporary directory, then compares the (subject, object) pairs found
        in each output file.

        Args:
            train: File name of the training edges inside the output dir.
            test: File name of the test edges inside the output dir.
            valid: File name of the validation edges inside the output dir.

        NOTE(review): the extra arguments are presumably supplied by a
        parameterizing decorator that is not visible here -- confirm.
        """
        def edge_pairs(path):
            """Return the (subject, object) rows of a TSV as a set of tuples.

            Iterating a DataFrame directly (e.g. ``set(df)``) yields its
            column labels, not its rows, so rows are extracted explicitly.
            """
            frame = tsv_to_df(path)[['subject', 'object']]
            return set(frame.itertuples(index=False, name=None))

        output_dir = tempfile.mkdtemp()
        make_holdouts(nodes=self.nodes_file,
                      edges=self.edges_file,
                      output_dir=output_dir,
                      train_fraction=0.8,
                      validation=True)
        input_edges = edge_pairs(self.edges_file)
        train_edges = edge_pairs(os.path.join(output_dir, train))
        test_edges = edge_pairs(os.path.join(output_dir, test))
        valid_edges = edge_pairs(os.path.join(output_dir, valid))

        # NOTE(review): edges are compared on (subject, object) only; if the
        # input holds several edges between the same pair of nodes these
        # checks can fail even when the split itself is a clean partition.

        # train should not share any members with test
        self.assertTrue(train_edges.isdisjoint(test_edges))
        # train should not share any members with valid
        self.assertTrue(train_edges.isdisjoint(valid_edges))
        # test should not share any members with valid
        self.assertTrue(test_edges.isdisjoint(valid_edges))

        # train should be a subset of input_edges
        self.assertTrue(train_edges.issubset(input_edges))
        # test should be a subset of input_edges
        self.assertTrue(test_edges.issubset(input_edges))
        # valid should be a subset of input_edges
        self.assertTrue(valid_edges.issubset(input_edges))
 def test_df_to_tsv(self):
     """Write the edges fixture with df_to_tsv and read it back unchanged.

     The round trip is only checked for the file existing and for the
     reloaded frame having the same shape as the one written.
     """
     scratch_dir = tempfile.mkdtemp()
     target = os.path.join(scratch_dir, 'some.tsv')
     original = tsv_to_df(self.edges_file)

     df_to_tsv(original, target)
     self.assertTrue(os.path.isfile(target))

     reloaded = tsv_to_df(target)
     self.assertEqual(original.shape, reloaded.shape)
    @classmethod
    def setUpClass(cls) -> None:
        """Load the shared graph fixtures and precompute edge splits once.

        unittest calls this hook on the class itself, so it must be a
        classmethod. It stores on the class:

        * ``nodes_file`` / ``edges_file``: paths to the TSV fixtures.
        * ``nodes`` / ``edges``: those fixtures loaded via ``tsv_to_df``.
        * ``ne``: the result of ``make_negative_edges`` for the fixture graph.
        * ``train_fraction``: the fraction passed to ``make_positive_edges``.
        * ``train_edges`` / ``test_edges``: the pair returned by
          ``make_positive_edges``.
        """
        cls.nodes_file = 'tests/resources/holdouts/bigger_graph_nodes.tsv'
        cls.edges_file = 'tests/resources/holdouts/bigger_graph_edges.tsv'
        cls.edges = tsv_to_df(cls.edges_file)
        cls.nodes = tsv_to_df(cls.nodes_file)

        # make negative edges for the fixture graph
        cls.ne = make_negative_edges(nodes_df=cls.nodes, edges_df=cls.edges)

        # make positive edges for the fixture graph
        cls.train_fraction = 0.8
        (cls.train_edges, cls.test_edges) = make_positive_edges(
            nodes_df=cls.nodes,
            edges_df=cls.edges,
            train_fraction=cls.train_fraction)
 def test_make_edges_check_node_output_file(self):
     """Check the node file written by make_holdouts without validation.

     ``pos_train_nodes.tsv`` must exist in the output directory, contain as
     many rows as the input node file, and carry ``id`` and ``category``
     columns.
     """
     holdout_dir = tempfile.mkdtemp()
     nodes_out = os.path.join(holdout_dir, 'pos_train_nodes.tsv')
     expected_nodes = tsv_to_df(self.nodes_file)

     make_holdouts(
         nodes=self.nodes_file,
         edges=self.edges_file,
         output_dir=holdout_dir,
         train_fraction=0.8,
         validation=False,
     )

     self.assertTrue(os.path.isfile(nodes_out))
     written_nodes = tsv_to_df(nodes_out)

     # every input node should be present in the written node file
     expected_rows = expected_nodes.shape[0]
     written_rows = written_nodes.shape[0]
     self.assertAlmostEqual(written_rows, expected_rows)

     # the node file should keep its id and category columns
     for column in ('id', 'category'):
         self.assertIn(column, written_nodes)
 def test_make_edges_check_edge_output_files(self, output_file: str,
                                             make_validation: bool,
                                             file_should_exist: bool,
                                             expected_fract: float):
     """Check presence, size and columns of one edge file from make_holdouts.

     Args:
         output_file: Name of the file to look for in the output directory.
         make_validation: Value passed as ``validation`` to make_holdouts.
         file_should_exist: Whether ``output_file`` is expected to be written.
         expected_fract: Fraction of the input edges the file should hold
             (only checked when the file is expected to exist).

     NOTE(review): the arguments are presumably supplied by a
     parameterizing decorator that is not visible here -- confirm.
     """
     holdout_dir = tempfile.mkdtemp()
     candidate = os.path.join(holdout_dir, output_file)
     total_edges = tsv_to_df(self.edges_file).shape[0]

     make_holdouts(
         nodes=self.nodes_file,
         edges=self.edges_file,
         output_dir=holdout_dir,
         train_fraction=0.8,
         validation=make_validation,
     )

     if not file_should_exist:
         self.assertFalse(os.path.isfile(candidate))
         return

     self.assertTrue(os.path.isfile(candidate))
     written_edges = tsv_to_df(candidate)

     # row count must match the expected share of the input (1 decimal place)
     expected_rows = total_edges * expected_fract
     self.assertAlmostEqual(written_edges.shape[0], expected_rows, 1)

     # the edge file should keep its subject and object columns
     for column in ('subject', 'object'):
         self.assertIn(column, written_edges)
 def test_tsv_to_df(self):
     """Load the edges fixture and check its type, shape and first subject."""
     edges = tsv_to_df(self.edges_file)
     self.assertIsInstance(edges, pd.DataFrame)

     expected_shape = (150, 5)
     self.assertEqual(expected_shape, edges.shape)

     first_subject = edges['subject'][0]
     self.assertEqual(first_subject, 'g1')