def test_size_of_chunks_is_same_as_original_list(self):
     """The lengths of all returned chunks add up to the input length."""
     expected_total = 100
     values = [random.random() for _ in range(expected_total)]
     chunks = gt.random_chunks(values, (0.2, 0.5, 0.3))
     actual_total = sum(map(len, chunks))
     self.assertEqual(actual_total, expected_total)
 def test_correct_chunk_sizes(self):
     """Each chunk's length equals its proportion times the input length."""
     total = 100
     values = [random.random() for _ in range(total)]
     fractions = (0.2, 0.5, 0.3)
     chunks = gt.random_chunks(values, fractions)
     # zip stops at the shorter input, so only chunks that have a matching
     # proportion are checked.
     for chunk, fraction in zip(chunks, fractions):
         # The expected size is a float product compared against an int
         # length.
         self.assertEqual(len(chunk), total * fraction)
 def test_all_elements_from_chunk_are_in_original_list(self):
     """Every chunk holds exactly one element, and it is from the input."""
     list_size = 10
     list_ = [random.random() for _ in range(list_size)]
     # One proportion of 1/list_size per input element, so each chunk is
     # expected to contain a single element.
     proportions = [1 / list_size for _ in range(list_size)]
     results = gt.random_chunks(list_, proportions)
     for chunk in results:
         self.assertEqual(len(chunk), 1)
         # assertIn shows the missing element and the container on failure,
         # where assertTrue(... in ...) would only report "False is not true".
         self.assertIn(chunk[0], list_)
def make_subsets(path_data, path_subsets, proportions=None, copy=False):
    """Split each chromosome's data files into Test, Train and Valid subsets.

    Every entry returned by ``gt.list_elements(path_data, type_="dir")`` is
    handled as one chromosome; its name is the part of the entry's base name
    before the first dot.  For each chromosome, the ``.txt.gz`` files listed
    by ``gt.list_elements`` (with ``_meta.txt.gz`` passed as an exception)
    are partitioned by ``gt.random_chunks`` and transferred to
    ``<path_subsets>/Test/<name>``, ``<path_subsets>/Train/<name>`` and
    ``<path_subsets>/Valid/<name>``.  The chromosome's ``_meta.txt.gz`` and
    ``_comments.txt`` are then transferred to ``path_subsets`` as
    ``_meta_<name>.txt.gz`` and ``_comments_<name>.txt``.

    Args:
        path_data: Directory that is scanned for chromosome entries.
        path_subsets: Root directory that receives the subsets.
        proportions: Mapping with the keys ``"test"``, ``"train"`` and
            ``"valid"``.  When ``None``, ``settings.PROPTEST``,
            ``settings.PROPTRAIN`` and ``settings.PROPVALID`` are used.
        copy: When true, every file (data, metadata and comments) is copied
            with ``shutil.copy``; otherwise it is moved with ``shutil.move``.
    """
    if proportions is None:
        proportions = {
            "test": settings.PROPTEST,
            "train": settings.PROPTRAIN,
            "valid": settings.PROPVALID}

    # A single transfer function is used for data files and for the
    # metadata/comments files, so copy=True never removes anything from
    # path_data.
    transfer = shutil.copy if copy else shutil.move

    list_chroms = gt.list_elements(path_data, type_="dir")
    list_chroms_names = [
        os.path.basename(chrom).split(".")[0] for chrom in list_chroms]

    # Presumably creates the <path_subsets>/<Subset>/<name> directories the
    # transfers below target -- confirm against create_subsets_dirs.
    create_subsets_dirs(path_subsets, list_chroms_names)

    for chrom, chrom_name in zip(list_chroms, list_chroms_names):
        meta_path = os.path.join(chrom, "_meta.txt.gz")

        files = gt.list_elements(
            chrom,
            type_="file",
            extension=".txt.gz",
            exception=[meta_path])

        subsets = gt.random_chunks(files, (
            proportions["test"],
            proportions["train"],
            proportions["valid"]))

        # Unpacking insists on exactly three chunks, in test/train/valid
        # order.
        test_files, train_files, valid_files = subsets

        for subset_dir, subset_files in (
                ("Test", test_files),
                ("Train", train_files),
                ("Valid", valid_files)):
            destination = os.path.join(path_subsets, subset_dir, chrom_name)
            for file_path in subset_files:
                transfer(file_path, destination)

        transfer(
            meta_path,
            os.path.join(path_subsets, "_meta_"+chrom_name+".txt.gz"))
        transfer(
            os.path.join(chrom, "_comments.txt"),
            os.path.join(path_subsets, "_comments_"+chrom_name+".txt"))
    def test_force_with_incorrect_proportions(self):
        """With ``force=True``, proportions above one raise no ValueError."""
        values = [random.random() for _ in range(10)]
        oversized = (0.4, 0.4, 0.4)

        try:
            # The three-way unpacking also raises ValueError when the call
            # does not yield exactly three chunks; that case is reported
            # through the same failure message below.
            _, _, _ = gt.random_chunks(values, oversized, force=True)
        except ValueError:
            template = "{0} raised ValueError while force was set to True!"
            self.fail(template.format(gt.random_chunks.__name__))
 def test_raise_error_when_proportions_greater_than_one(self):
     """Proportions summing above one raise ValueError without ``force``."""
     list_ = [random.random() for _ in range(10)]
     proportions = (0.4, 0.4, 0.4)
     with self.assertRaises(ValueError):
         # Consume the result with list() rather than a three-way unpacking:
         # an unpacking mismatch raises ValueError too and would let this
         # test pass without random_chunks rejecting the proportions.
         # list() still forces evaluation if the result is lazy.
         list(gt.random_chunks(list_, proportions))