def test_make_edges_pos_train_test_valid_edges_distinct(
        self, train, test, valid):
    """Check that train/test/validation edge holdouts do not overlap.

    Runs ``make_holdouts`` with ``validation=True`` into a temporary
    directory, then verifies that the three edge files are pairwise
    disjoint and that each one only contains edges found in the input
    edge file.

    Args:
        train: File name, relative to the output directory, of the
            training edges.
        test: File name, relative to the output directory, of the test
            edges.
        valid: File name, relative to the output directory, of the
            validation edges.

    NOTE(review): the subset checks assume the supplied files hold
    positive edges, as the test name suggests -- confirm against the
    parameterization of this test.
    """
    def edge_pairs(path):
        # Iterating a DataFrame (or calling set() on it) yields its column
        # labels, not its rows, so rows are converted to explicit
        # (subject, object) tuples before any set comparison.
        frame = tsv_to_df(path)
        return set(zip(frame['subject'], frame['object']))

    # TemporaryDirectory removes the generated files once the test is done.
    with tempfile.TemporaryDirectory() as output_dir:
        make_holdouts(nodes=self.nodes_file,
                      edges=self.edges_file,
                      output_dir=output_dir,
                      train_fraction=0.8,
                      validation=True)

        input_edges = edge_pairs(self.edges_file)
        train_edges = edge_pairs(os.path.join(output_dir, train))
        test_edges = edge_pairs(os.path.join(output_dir, test))
        valid_edges = edge_pairs(os.path.join(output_dir, valid))

    # train should not share any members with test
    self.assertTrue(train_edges.isdisjoint(test_edges),
                    "train and test edges overlap")
    # train should not share any members with valid
    self.assertTrue(train_edges.isdisjoint(valid_edges),
                    "train and validation edges overlap")
    # test should not share any members with valid
    self.assertTrue(test_edges.isdisjoint(valid_edges),
                    "test and validation edges overlap")

    # train should be a subset of input_edges
    self.assertTrue(train_edges.issubset(input_edges),
                    "train contains edges missing from the input")
    # test should be a subset of input_edges
    self.assertTrue(test_edges.issubset(input_edges),
                    "test contains edges missing from the input")
    # valid should be a subset of input_edges
    self.assertTrue(valid_edges.issubset(input_edges),
                    "validation contains edges missing from the input")
def test_df_to_tsv(self):
    """Check that ``df_to_tsv`` writes a file that round-trips by shape.

    Loads the edge fixture, writes it with ``df_to_tsv``, confirms the
    file exists, reads it back with ``tsv_to_df`` and compares the shape
    of the two frames.
    """
    df = tsv_to_df(self.edges_file)

    # Use a self-cleaning directory so the written TSV is not left behind.
    with tempfile.TemporaryDirectory() as tmp_dir:
        path = os.path.join(tmp_dir, 'some.tsv')
        df_to_tsv(df, path)
        self.assertTrue(os.path.isfile(path))
        df_roundtrip = tsv_to_df(path)

    self.assertEqual(df.shape, df_roundtrip.shape)
@classmethod
def setUpClass(cls) -> None:
    """Load the shared graph fixtures and pre-compute edge holdouts.

    Stores on the class: the node/edge fixture paths, the frames loaded
    from them, the result of ``make_negative_edges`` (``cls.ne``), the
    train fraction, and the ``(train_edges, test_edges)`` pair returned
    by ``make_positive_edges``.
    """
    # unittest invokes setUpClass on the class itself, so it must be a
    # classmethod to receive ``cls``.
    cls.nodes_file = 'tests/resources/holdouts/bigger_graph_nodes.tsv'
    cls.edges_file = 'tests/resources/holdouts/bigger_graph_edges.tsv'
    cls.edges = tsv_to_df(cls.edges_file)
    cls.nodes = tsv_to_df(cls.nodes_file)

    # make negative edges for the fixture graph
    cls.ne = make_negative_edges(nodes_df=cls.nodes, edges_df=cls.edges)

    # make positive edges for the fixture graph
    cls.train_fraction = 0.8
    (cls.train_edges, cls.test_edges) = make_positive_edges(
        nodes_df=cls.nodes,
        edges_df=cls.edges,
        train_fraction=cls.train_fraction)
def test_make_edges_check_node_output_file(self):
    """Check the node file written by ``make_holdouts``.

    Runs ``make_holdouts`` with ``validation=False`` and verifies that
    ``pos_train_nodes.tsv`` exists in the output directory, has as many
    rows as the input node file, and has ``id`` and ``category`` columns.
    """
    input_nodes = tsv_to_df(self.nodes_file)

    # Use a self-cleaning directory so holdout files are not left behind.
    with tempfile.TemporaryDirectory() as output_dir:
        output_file_with_path = os.path.join(output_dir,
                                             'pos_train_nodes.tsv')
        make_holdouts(nodes=self.nodes_file,
                      edges=self.edges_file,
                      output_dir=output_dir,
                      train_fraction=0.8,
                      validation=False)
        self.assertTrue(os.path.isfile(output_file_with_path))
        new_nodes_df = tsv_to_df(output_file_with_path)

    # row counts are integers, so compare them exactly
    self.assertEqual(new_nodes_df.shape[0], input_nodes.shape[0])
    # should also have id and category columns
    self.assertIn('id', new_nodes_df.columns)
    self.assertIn('category', new_nodes_df.columns)
def test_make_edges_check_edge_output_files(self,
                                            output_file: str,
                                            make_validation: bool,
                                            file_should_exist: bool,
                                            expected_fract: float):
    """Check existence and size of one edge file from ``make_holdouts``.

    Args:
        output_file: Name of the edge file expected in the output
            directory.
        make_validation: Value passed to ``make_holdouts`` as
            ``validation``.
        file_should_exist: Whether ``output_file`` is expected to be
            written for this configuration.
        expected_fract: Fraction of the input edge count that
            ``output_file`` is expected to contain.
    """
    num_input_edges = tsv_to_df(self.edges_file).shape[0]

    # Use a self-cleaning directory so holdout files are not left behind.
    with tempfile.TemporaryDirectory() as me_output_dir:
        output_file_with_path = os.path.join(me_output_dir, output_file)
        make_holdouts(nodes=self.nodes_file,
                      edges=self.edges_file,
                      output_dir=me_output_dir,
                      train_fraction=0.8,
                      validation=make_validation)

        if not file_should_exist:
            self.assertFalse(os.path.isfile(output_file_with_path))
            return

        self.assertTrue(os.path.isfile(output_file_with_path))
        new_edges_df = tsv_to_df(output_file_with_path)

    # make sure we get the expected number of edges (to one decimal place)
    self.assertAlmostEqual(new_edges_df.shape[0],
                           num_input_edges * expected_fract,
                           places=1)
    # should also have subject and object columns
    self.assertIn('subject', new_edges_df.columns)
    self.assertIn('object', new_edges_df.columns)
def test_tsv_to_df(self):
    """Check that ``tsv_to_df`` loads the edge fixture as a DataFrame.

    Expects a ``pandas.DataFrame`` of shape ``(150, 5)`` whose
    ``subject`` column holds ``'g1'`` at label ``0``.
    """
    df = tsv_to_df(self.edges_file)
    # assertIsInstance reports the actual type when the check fails.
    self.assertIsInstance(df, pd.DataFrame)
    self.assertEqual((150, 5), df.shape)
    self.assertEqual(df['subject'][0], 'g1')