def test_size_of_chunks_is_same_as_original_list(self):
    """The lengths of all chunks must add up to the input list's length."""
    list_size = 100
    values = [random.random() for _ in range(list_size)]
    chunks = gt.random_chunks(values, (0.2, 0.5, 0.3))
    # Accumulate the chunk lengths one by one.
    total_size = 0
    for chunk in chunks:
        total_size += len(chunk)
    self.assertEqual(total_size, list_size)
def test_correct_chunk_sizes(self):
    """Each chunk must hold its proportional share of the input list."""
    list_size = 100
    list_ = [random.random() for _ in range(list_size)]
    proportions = (0.2, 0.5, 0.3)
    # round() turns the float products into whole numbers so they compare
    # cleanly against the integer values returned by len().
    expected_sizes = [round(list_size * i) for i in proportions]
    results = list(gt.random_chunks(list_, proportions))
    # zip() stops at the shorter input, so without this check a result with
    # missing chunks (even an empty result) would pass the loop below.
    self.assertEqual(len(results), len(expected_sizes))
    for chunk, expected in zip(results, expected_sizes):
        self.assertEqual(len(chunk), expected)
def test_all_elements_from_chunk_are_in_original_list(self):
    """Every one-element chunk must contain a value of the input list."""
    list_size = 10
    list_ = [random.random() for _ in range(list_size)]
    # The float literal keeps this a true division even where `/` on two
    # ints would truncate to 0 (Python 2 semantics).
    proportions = [1.0 / list_size for _ in range(list_size)]
    results = gt.random_chunks(list_, proportions)
    for x in results:
        self.assertEqual(len(x), 1)
        # assertIn reports the offending value on failure, unlike
        # assertTrue(x[0] in list_), which only says "False is not true".
        self.assertIn(x[0], list_)
def make_subsets(path_data, path_subsets, proportions=None, copy=False):
    """Distribute each chromosome's data files over Test/Train/Valid subsets.

    The directories found in ``path_data`` (via ``gt.list_elements``) are
    treated as one chromosome each; the chromosome name is the directory's
    base name up to its first dot. For every chromosome the ``.txt.gz``
    files (``_meta.txt.gz`` excluded) are split with ``gt.random_chunks``
    and transferred to ``<path_subsets>/<Test|Train|Valid>/<chrom_name>``.

    Args:
        path_data: Directory holding the chromosome directories.
        path_subsets: Root directory that receives the subsets.
        proportions: Mapping with the keys ``"test"``, ``"train"`` and
            ``"valid"``. When ``None``, ``settings.PROPTEST``,
            ``settings.PROPTRAIN`` and ``settings.PROPVALID`` are used.
        copy: When true the data files are transferred with
            ``shutil.copy``, otherwise with ``shutil.move``.

    Raises:
        KeyError: If ``proportions`` lacks one of the three expected keys.
    """
    if proportions is None:
        proportions = {
            "test": settings.PROPTEST,
            "train": settings.PROPTRAIN,
            "valid": settings.PROPVALID}
    transfer = shutil.copy if copy else shutil.move

    list_chroms = gt.list_elements(path_data, type_="dir")
    list_chroms_names = [
        os.path.basename(i).split(".")[0] for i in list_chroms]
    # Presumably creates the Test/Train/Valid/<chrom_name> directories used
    # as destinations below -- defined elsewhere, confirm there.
    create_subsets_dirs(path_subsets, list_chroms_names)

    for chrom, chrom_name in zip(list_chroms, list_chroms_names):
        meta_file = os.path.join(chrom, "_meta.txt.gz")
        files = gt.list_elements(
            chrom, type_="file", extension=".txt.gz", exception=[meta_file])
        test_files, train_files, valid_files = gt.random_chunks(
            files,
            (proportions["test"],
             proportions["train"],
             proportions["valid"]))

        # Same transfer order as before: test, then train, then valid.
        for subset_dir, subset_files in (
                ("Test", test_files),
                ("Train", train_files),
                ("Valid", valid_files)):
            destination = os.path.join(path_subsets, subset_dir, chrom_name)
            for file_ in subset_files:
                transfer(file_, destination)

        # NOTE(review): the metadata and comments files are always *moved*,
        # even when copy=True, so the source directory does not stay intact
        # in copy mode. Kept as-is; confirm whether this is intended.
        shutil.move(
            meta_file,
            os.path.join(path_subsets, "_meta_" + chrom_name + ".txt.gz"))
        shutil.move(
            os.path.join(chrom, "_comments.txt"),
            os.path.join(path_subsets, "_comments_" + chrom_name + ".txt"))
def test_force_with_incorrect_proportions(self):
    """With force=True, proportions (0.4, 0.4, 0.4) must not raise."""
    values = [random.random() for _ in range(10)]
    oversized = (0.4, 0.4, 0.4)
    try:
        _, _, _ = gt.random_chunks(values, oversized, force=True)
    except ValueError:
        # Name the function under test in the failure message.
        template = "{0} raised ValueError while force was set to True!"
        self.fail(template.format(gt.random_chunks.__name__))
def test_raise_error_when_proportions_greater_than_one(self):
    """Without force, proportions (0.4, 0.4, 0.4) must raise ValueError."""
    list_ = [random.random() for _ in range(10)]
    proportions = (0.4, 0.4, 0.4)
    with self.assertRaises(ValueError):
        # Deliberately no tuple unpacking here: a wrong number of chunks
        # would raise its own ValueError and satisfy assertRaises even if
        # random_chunks never validated the proportions. list() still
        # consumes the result in case it is produced lazily.
        list(gt.random_chunks(list_, proportions))