def test_file_with_only_line_breaks_compressed(self):
     """Check the line counts reported for the two gzipped
     ``empty_encoded_*`` playground files against the expected values
     stored on the test case."""
     archive_names = ("empty_encoded_1.txt.gz", "empty_encoded_2.txt.gz")
     # Both files are counted before anything is asserted.
     counted_first, counted_second = [
         gt.get_nb_lines_file(os.path.join(self.path_to_playground, name))
         for name in archive_names]
     self.assertEqual(self.nb_lines_empty_1, counted_first)
     self.assertEqual(self.nb_lines_empty_2, counted_second)
    def test_file_with_text_not_compressed(self):
        """Check the line counts reported for the two plain-text
        ``not_empty_*`` playground files against the expected values
        stored on the test case."""
        text_names = ("not_empty_1.txt", "not_empty_2.txt")
        # Both files are counted before anything is asserted.
        counted_first, counted_second = [
            gt.get_nb_lines_file(os.path.join(self.path_to_playground, name))
            for name in text_names]
        self.assertEqual(self.nb_lines_not_empty_1, counted_first)
        self.assertEqual(self.nb_lines_not_empty_2, counted_second)
示例#3
0
def cut_file(file, size_of_output_files, output_path, copy=True):
    """Split a gzipped text file into gzipped chunks of a fixed line count.

    The chunks are written to ``output_path`` as ``<name>_<k>.txt.gz`` where
    ``<name>`` is the part of the input basename before its first dot and
    ``<k>`` starts at 1. When the number of lines is not a multiple of
    ``size_of_output_files``, consecutive chunks overlap so that every chunk
    (the last one included) holds ``size_of_output_files`` lines. A file
    shorter than one chunk yields a single chunk with all of its lines, and
    an empty file yields no chunk at all.

    Args:
        file: Path to the gzip-compressed text file to split.
        size_of_output_files: Number of lines per output chunk.
        output_path: Existing directory receiving the chunks.
        copy: When False, the input file is deleted once it has been split.
    """
    filename = os.path.basename(file).split(".")[0]

    # Read the input once. The line count is taken from the very list that
    # is sliced below, so indices can never run past the available lines.
    with gzip.open(file, "rt") as infile:
        lines = infile.readlines()
    nb_lines = len(lines)

    if nb_lines:
        nb_of_files = int(math.ceil(nb_lines / size_of_output_files))
        remainder = nb_lines % size_of_output_files
        # Lines missing to fill the last chunk; none when the split is exact
        # (otherwise a spurious overlap would push the tail out of range).
        missing = size_of_output_files - remainder if remainder else 0
        overlapping = int(math.floor(missing / nb_of_files))
    else:
        nb_of_files = 0
        overlapping = 0

    begin = 0
    for i in range(nb_of_files):
        end = int(min(begin + size_of_output_files, nb_lines))
        if end - begin < size_of_output_files:
            # Shift the last chunk backwards so it is full, but never before
            # the first line (a negative index would wrap around).
            begin = max(0, int(end - size_of_output_files))

        chunk_path = os.path.join(
            output_path, filename + "_" + str(i + 1) + ".txt.gz")
        # Compress while writing instead of shelling out to `gzip`.
        with gzip.open(chunk_path, "wt") as outfile:
            outfile.writelines(lines[begin:end])

        begin += int(size_of_output_files - overlapping)

    if not copy:
        os.remove(file)
示例#4
0
    # NOTE(review): the statement enclosing this indented fragment is not
    # visible here.
    # Take the part of this file's absolute directory that precedes the first
    # "alltests" and add it to sys.path when it is not already listed;
    # presumably this is what makes the pydeepgenomics import below work.
    cmd_dir = os.path.abspath(os.path.dirname(__file__)).split("alltests")[0]
    if cmd_dir not in sys.path:
        sys.path.append(cmd_dir)
    from pydeepgenomics.tools import generaltools as gt

# Script entry point: for every ".vcf" file listed in the current directory,
# read it line by line while a gzip file "<file>.gz" is opened for writing.
# NOTE(review): the body of the inner loop is truncated in this view; only
# the "rich region" branch of the position update is visible.
if __name__ == "__main__":

    # Per-line probabilities of switching between the two region states.
    prob_rich_region_to_normal = 0.03
    prob_normal_to_rich_region = 0.03

    files = gt.list_elements(os.path.abspath("."), extension=".vcf")
    # Subtracted from each file's line count below; presumably the number of
    # header lines of the .vcf files -- TODO confirm.
    header_size = 6

    for index_f, file in enumerate(files):

        # Number of lines left once header_size is subtracted.
        size = gt.get_nb_lines_file(file) - header_size
        # A priori size: 200000 divided by 1.1 ** index_f, so it shrinks
        # with the index of the file in the list.
        size_chromosome_a_priori = 200000 / math.pow(1.1, index_f)
        # Average step between two consecutive positions.
        mean_distance = size_chromosome_a_priori / size

        with open(file, "r") as in_file, gzip.open(file + ".gz",
                                                   "wb") as out_file:

            # State of the walk: current region type and running position.
            in_rich = False
            position = 0
            for index_l, line in enumerate(in_file):

                if in_rich:
                    # In a rich region the step is drawn from a Gaussian with
                    # mean mean_distance / 3 and standard deviation
                    # mean_distance / 10, and is never smaller than 1.
                    position += max(
                        1, random.gauss(mean_distance / 3, mean_distance / 10))
                    # Leave the rich region with a fixed probability.
                    if random.random() < prob_rich_region_to_normal:
                        in_rich = False
示例#5
0
def concatenate_split_files(
        path_to_input,
        tree_structure="by_sample",
        make_copy=False,
        force=False):
    """Merge the split files of every sub-directory into one gzipped file.

    For each sub-directory of ``path_to_input`` (``_meta`` and ``_comments``
    excluded), all its ``.txt.gz`` files are concatenated into a single
    gzipped file written to a ``split_and_concat_data`` directory created
    next to ``path_to_input``. An existing ``split_and_concat_data``
    directory is wiped first. The ``_meta`` and ``_comments`` directories
    are handed to ``_concat_meta`` and ``_copy_comment`` respectively.

    Args:
        path_to_input: Directory holding one sub-directory per group of
            split files. A trailing path separator is tolerated.
        tree_structure: ``"by_sample"`` or ``"by_chr"``; unless ``force`` is
            set it must agree with the name of ``path_to_input``
            (``split_by_sample`` / ``split_by_chr``).
        make_copy: When False (the default) the input sub-directories and
            ``path_to_input`` itself are deleted after concatenation.
        force: Skip the consistency checks on path and tree structure.

    Raises:
        ValueError: If ``force`` is False and the path does not match the
            tree structure, or the tree structure is not supported.
    """
    if not force:
        if (
            (
                not path_to_input.endswith("split_by_chr") and
                not path_to_input[:-1].endswith("split_by_chr") and
                tree_structure == "by_chr") or
            (
                not path_to_input.endswith("split_by_sample") and
                not path_to_input[:-1].endswith("split_by_sample") and
                tree_structure == "by_sample")):
            raise ValueError(
                "Inconsistent combination of path and tree structure.\n" +
                "Received {0} and {1}".format(path_to_input, tree_structure))
        elif (tree_structure != "by_sample") and (tree_structure != "by_chr"):
            raise ValueError(
                "{0} is not managed by this function".format(tree_structure))

    # os.path.dirname("a/b/") is "a/b": without stripping the trailing
    # separator the output directory would be created inside the input
    # directory and removed together with it at the end.
    input_root = path_to_input.rstrip(os.sep) or path_to_input
    path_to_concat_data = os.path.join(
        os.path.dirname(input_root), "split_and_concat_data")
    # Always start from an empty output directory.
    if os.path.isdir(path_to_concat_data):
        shutil.rmtree(path_to_concat_data)
    os.mkdir(path_to_concat_data)

    list_users = gt.list_elements(
        path_to_input,
        type_="dir",
        exception=[os.path.join(
            path_to_input, "_meta"),
            os.path.join(path_to_input, "_comments")])

    _concat_meta(
        os.path.join(path_to_input, "_meta"),
        path_to_concat_data)

    _copy_comment(
        os.path.join(path_to_input, "_comments"),
        path_to_concat_data)

    for user in list_users:

        split_files = gt.list_elements(user, type_="file", extension=".txt.gz")
        if not split_files:
            # Nothing to concatenate; the output name cannot be derived.
            sys.stderr.write(
                "No .txt.gz file found in {0}; skipped.\n".format(user))
            continue

        # Output name: name of the first split file where the lowercase
        # letters and digits of its leading "_"-delimited token are replaced
        # by "allchr".
        # NOTE(review): str.replace acts on every occurrence of that token in
        # the name, and on every position if the token is empty -- confirm
        # the file naming scheme rules this out.
        name_user = os.path.basename(split_files[0])
        name_user = name_user.replace(
            re.sub(r"[^a-z\d]", "", re.search("^[^_]*", name_user).group(0)),
            "allchr")
        path_concat_file = os.path.join(path_to_concat_data, name_user)

        with gzip.open(path_concat_file, 'wb') as outfile:
            for file in split_files:
                with gzip.open(file, "rb") as infile:
                    # Stream the decompressed content instead of loading
                    # each part fully in memory.
                    shutil.copyfileobj(infile, outfile)

        # Sanity check: the concatenated file must hold as many lines as the
        # parts it was built from.
        nb_lines_out = gt.get_nb_lines_file(path_concat_file)
        nb_lines_in = sum(gt.get_nb_lines_file(file) for file in split_files)
        if nb_lines_in != nb_lines_out:
            # NOTE(review): the mismatch is only reported; the originals are
            # still deleted below when make_copy is False.
            sys.stderr.write(
                "Number of lines between original files and the\n" +
                "concatenated file does not match.\n" +
                "Origin: {0}, Concat: {1}\n".format(nb_lines_in, nb_lines_out))
        if not make_copy:
            shutil.rmtree(user)
    if not make_copy:
        shutil.rmtree(path_to_input)
 def test_empty_file(self):
     """The ``really_empty.txt`` playground file must be reported as
     having zero lines."""
     empty_path = os.path.join(self.path_to_playground, "really_empty.txt")
     self.assertEqual(0, gt.get_nb_lines_file(empty_path))
 def test_incorrect_name(self):
     """Counting the lines of a path that does not exist must raise
     FileNotFoundError."""
     missing_path = os.path.join(
         self.path_to_playground, "this_file_does_not_exist.txt")
     with self.assertRaises(FileNotFoundError):
         gt.get_nb_lines_file(missing_path)
示例#8
0
def mask_data(path_data,
              fraction_pass,
              path_output=None,
              prefix_subset=None,
              verbose=False,
              logging=False):
    """Write a random subset of the lines of every sample file to a new tree.

    Every ``.txt.gz`` file found in the sub-directories of ``path_data``
    (``floatfiles``, ``encodeddata`` and ``Subsets`` excluded) is read, a
    random sample of its lines is drawn without replacement, and the sample
    is written gzipped to
    ``<path_output>/<sub-directory>/<prefix_subset><sample name>.txt.gz``.
    The output tree is prepared by ``copy_output_tree_struct``.

    NOTE(review): the selected lines are written in the order returned by
    ``random.sample``, i.e. not in their original order -- confirm this is
    intended.

    Args:
        path_data: Root directory of the input data.
        fraction_pass: Number between 0 and 1, fraction of the lines of each
            file that is kept (rounded down).
        path_output: Root of the output tree. Defaults to a sibling of
            ``path_data`` named ``prefix_subset`` + basename of ``path_data``.
        prefix_subset: Prefix of the output names. Defaults to
            ``"<int(100 * fraction_pass)>PER_"``.
        verbose: With ``logging`` set, print one dated line per file.
        logging: When False a progress bar is drawn with
            ``gt.print_progress``; when True the progress bar is disabled.
    """
    print("Starting to filter data from {0} at {1}. ({2} pass)".format(
        path_data, datetime.datetime.now(), fraction_pass))

    if prefix_subset is None:
        prefix_subset = str(int(100 * fraction_pass)) + "PER_"
    out_dir_name = prefix_subset + os.path.basename(path_data)
    if path_output is None:
        path_output = os.path.join(os.path.dirname(path_data), out_dir_name)
    copy_output_tree_struct(path_data, path_output)

    # Number of files processed so far, over all sub-directories.
    i = 0

    chromosomes = gt.list_elements(path_data,
                                   type_='dir',
                                   exception=[
                                       os.path.join(path_data, "floatfiles"),
                                       os.path.join(path_data, "encodeddata"),
                                       os.path.join(path_data, "Subsets")
                                   ])

    for chrom in chromosomes:

        chrom_name = os.path.basename(chrom)
        files = gt.list_elements(chrom, extension='.txt.gz')

        for sample in files:

            # Sample name: last "_"-separated token of the file name taken
            # before its first dot.
            name_sample = os.path.basename(sample).split(".")[0].split("_")[-1]

            # Read the file once; sampling indices from the list that is
            # indexed below guarantees they are always valid.
            with gzip.open(sample, "rt") as infile:
                lines = infile.readlines()
            nb_lines = len(lines)
            subset_of_lines = random.sample(
                range(nb_lines), int(math.floor(nb_lines * fraction_pass)))

            path_subset = os.path.join(
                path_output,
                chrom_name,
                prefix_subset + name_sample + ".txt.gz")
            # Compress while writing instead of shelling out to `gzip`.
            with gzip.open(path_subset, "wt") as outfile:
                outfile.writelines(lines[index] for index in subset_of_lines)

            # Count the file whatever the reporting mode, so the verbose
            # log does not stay stuck at 0.
            i += 1
            if not logging:
                # NOTE(review): the total assumes every sub-directory holds
                # as many files as the current one, and is one less than the
                # final value of i -- check against gt.print_progress.
                gt.print_progress(i,
                                  len(chromosomes) * len(files) - 1,
                                  decimals=3)
            elif verbose:
                print("{0}/{1} files tested. Date : {2}".format(
                    i,
                    len(chromosomes) * len(files),
                    str(datetime.datetime.now())))

    print("\nData from {0} filtered at {1}. ({2} pass)".format(
        path_data, datetime.datetime.now(), fraction_pass))