def test_file_with_only_line_breaks_compressed(self):
    """Gzipped files containing only newlines report the expected line counts."""
    cases = [
        ("empty_encoded_1.txt.gz", self.nb_lines_empty_1),
        ("empty_encoded_2.txt.gz", self.nb_lines_empty_2),
    ]
    for name, expected in cases:
        counted = gt.get_nb_lines_file(
            os.path.join(self.path_to_playground, name))
        self.assertEqual(expected, counted)
def test_file_with_text_not_compressed(self):
    """Plain-text files with content report the expected line counts."""
    cases = [
        ("not_empty_1.txt", self.nb_lines_not_empty_1),
        ("not_empty_2.txt", self.nb_lines_not_empty_2),
    ]
    for name, expected in cases:
        counted = gt.get_nb_lines_file(
            os.path.join(self.path_to_playground, name))
        self.assertEqual(expected, counted)
def cut_file(file, size_of_output_files, output_path, copy=True):
    """Split a gzip-compressed text file into equal-sized gzipped chunks.

    Each chunk holds ``size_of_output_files`` lines; the shortfall of the
    last chunk is redistributed as overlap between consecutive chunks so
    that every output file is full-sized.  Outputs are written to
    ``output_path`` as ``<basename>_<i>.txt.gz``.

    :param file: path to the gzip-compressed input file.
    :param size_of_output_files: number of lines per output chunk.
    :param output_path: directory receiving the chunk files.
    :param copy: when False, the input file is deleted afterwards.
    """
    filename = os.path.basename(file).split(".")[0]
    nb_lines = gt.get_nb_lines_file(file)
    nb_of_files = math.ceil(nb_lines / size_of_output_files)
    # Lines missing from the last chunk, spread evenly as overlap.
    overlapping = math.floor(
        (size_of_output_files - nb_lines % size_of_output_files)
        / nb_of_files)
    # Read the source ONCE; the original re-read it for every chunk.
    with gzip.open(file, "rt") as infile:
        lines = infile.readlines()
    begin = 0
    for i in range(int(nb_of_files)):
        end = int(min(begin + size_of_output_files, nb_lines))
        if end - begin < size_of_output_files:
            # Shift the window back so the chunk is full-sized; clamp at 0
            # so inputs shorter than one chunk cannot yield negative indices.
            begin = max(0, int(end - size_of_output_files))
        subset_of_lines = range(begin, end)
        begin += int(size_of_output_files - overlapping)
        out_txt = os.path.join(
            output_path, filename + "_" + str(i + 1) + ".txt")
        with open(out_txt, "w") as outfile:
            for index in subset_of_lines:
                outfile.write(lines[index])
        # Argument list instead of a shell string: safe for paths
        # containing spaces or shell metacharacters.
        subprocess.call(["gzip", out_txt])
    if not copy:
        # os.remove instead of shelling out to "rm".
        os.remove(file)
# Make the package root importable when this file is run directly:
# everything above the "alltests" directory is prepended to sys.path.
cmd_dir = os.path.abspath(os.path.dirname(__file__)).split("alltests")[0]
if cmd_dir not in sys.path:
    sys.path.append(cmd_dir)
from pydeepgenomics.tools import generaltools as gt

if __name__ == "__main__":
    # Two-state model of variant density along a chromosome:
    # transition probabilities between "rich" and "normal" regions.
    prob_rich_region_to_normal = 0.03
    prob_normal_to_rich_region = 0.03
    files = gt.list_elements(os.path.abspath("."), extension=".vcf")
    # Number of header lines to exclude from the line count
    # — presumably matches the .vcf fixtures used here; verify.
    header_size = 6
    for index_f, file in enumerate(files):
        size = gt.get_nb_lines_file(file) - header_size
        # A-priori chromosome length shrinks geometrically (×1/1.1) with
        # each successive file — TODO confirm this mirrors real data.
        size_chromosome_a_priori = 200000 / math.pow(1.1, index_f)
        # Mean spacing between consecutive positions.
        mean_distance = size_chromosome_a_priori / size
        with open(file, "r") as in_file, \
                gzip.open(file + ".gz", "wb") as out_file:
            in_rich = False
            position = 0
            for index_l, line in enumerate(in_file):
                if in_rich:
                    # Rich region: small Gaussian steps, at least 1 apart.
                    position += max(
                        1,
                        random.gauss(mean_distance / 3, mean_distance / 10))
                    if random.random() < prob_rich_region_to_normal:
                        in_rich = False
                    # NOTE(review): this script appears truncated in this
                    # view (no "normal region" branch, out_file never
                    # written) — the remainder likely follows elsewhere.
def concatenate_split_files(
        path_to_input, tree_structure="by_sample", make_copy=False,
        force=False):
    """Concatenate per-chromosome split files back into one file per sample.

    Results are written to a sibling directory ``split_and_concat_data``
    (recreated from scratch on every call).  The ``_meta`` and
    ``_comments`` subdirectories are handled by the ``_concat_meta`` and
    ``_copy_comment`` helpers.  Line counts of each concatenated file are
    checked against the sum of its parts; a mismatch is reported on
    stderr but does not abort.

    :param path_to_input: directory containing one subdirectory per sample.
    :param tree_structure: "by_sample" or "by_chr"; must match the
        trailing component of ``path_to_input`` unless ``force`` is set.
    :param make_copy: when False, inputs are deleted after concatenation.
    :param force: skip the path/tree-structure consistency check.
    :raises ValueError: on an inconsistent path/structure combination or
        an unknown ``tree_structure``.
    """
    if not force:
        # The [:-1] variants accept a single trailing path separator.
        if (
                (not path_to_input.endswith("split_by_chr")
                 and not path_to_input[:-1].endswith("split_by_chr")
                 and tree_structure == "by_chr")
                or (not path_to_input.endswith("split_by_sample")
                    and not path_to_input[:-1].endswith("split_by_sample")
                    and tree_structure == "by_sample")):
            raise ValueError(
                "Inconsistent combination of path and tree structure.\n" +
                "Received {0} and {1}".format(path_to_input, tree_structure))
        elif (tree_structure != "by_sample") and (tree_structure != "by_chr"):
            raise ValueError(
                "{0} is not managed by this function".format(tree_structure))
    path_to_concat_data = os.path.join(
        os.path.dirname(path_to_input), "split_and_concat_data")
    # Always start from an empty output directory.
    if not os.path.isdir(path_to_concat_data):
        os.mkdir(path_to_concat_data)
    else:
        shutil.rmtree(path_to_concat_data)
        os.mkdir(path_to_concat_data)
    list_users = gt.list_elements(
        path_to_input,
        type_="dir",
        exception=[
            os.path.join(path_to_input, "_meta"),
            os.path.join(path_to_input, "_comments")])
    _concat_meta(
        os.path.join(path_to_input, "_meta"), path_to_concat_data)
    _copy_comment(
        os.path.join(path_to_input, "_comments"), path_to_concat_data)
    for user in list_users:
        split_files = gt.list_elements(
            user, type_="file", extension=".txt.gz")
        name_user = os.path.basename(split_files[0])
        # Replace the leading per-chromosome tag (text before the first
        # "_", stripped to [a-z0-9]) with "allchr".  Raw strings avoid
        # the invalid "\d" escape warning of the original literals.
        name_user = name_user.replace(
            re.sub(
                r"[^a-z\d]", "",
                re.search(r"^[^_]*", name_user).group(0)),
            "allchr")
        with gzip.open(
                os.path.join(path_to_concat_data, name_user),
                'wb') as outfile:
            for file in split_files:
                with gzip.open(file, "rb") as infile:
                    outfile.write(infile.read())
        # Sanity check: concatenated line count must equal the sum of parts.
        nb_lines_out = gt.get_nb_lines_file(
            os.path.join(path_to_concat_data, name_user))
        nb_lines_in = 0
        for file in split_files:
            nb_lines_in += gt.get_nb_lines_file(file)
        if nb_lines_in != nb_lines_out:
            sys.stderr.write(
                "Number of lines between original files and the\n" +
                "concatenated file does not match.\n" +
                "Origin: {0}, Concat: {1}\n".format(
                    nb_lines_in, nb_lines_out))
        if not make_copy:
            shutil.rmtree(user)
    if not make_copy:
        shutil.rmtree(path_to_input)
def test_empty_file(self):
    """A file with no content at all must count as zero lines."""
    target = os.path.join(self.path_to_playground, "really_empty.txt")
    self.assertEqual(0, gt.get_nb_lines_file(target))
def test_incorrect_name(self):
    """Counting lines of a nonexistent path must raise FileNotFoundError."""
    missing = os.path.join(
        self.path_to_playground, "this_file_does_not_exist.txt")
    with self.assertRaises(FileNotFoundError):
        gt.get_nb_lines_file(missing)
def mask_data(path_data, fraction_pass, path_output=None, prefix_subset=None,
              verbose=False, logging=False):
    """Randomly keep a fraction of the lines of every sample file.

    fraction_pass = nb between 0 and 1
    This function builds the output directory based on the name of the
    input dir and the prefix.

    :param path_data: input directory with one subdirectory per chromosome.
    :param fraction_pass: fraction of lines to keep, between 0 and 1.
    :param path_output: destination directory; derived from ``path_data``
        and ``prefix_subset`` when None.
    :param prefix_subset: prefix for output names; defaults to e.g. "30PER_".
    :param verbose: print a dated progress line per file when ``logging``.
    :param logging: suppress the in-place progress bar.
    """
    print("Starting to filter data from {0} at {1}. ({2} pass)".format(
        path_data, datetime.datetime.now(), fraction_pass))
    if prefix_subset is None:
        prefix_subset = str(int(100 * fraction_pass)) + "PER_"
    out_dir_name = prefix_subset + os.path.basename(path_data)
    if path_output is None:
        path_output = os.path.join(os.path.dirname(path_data), out_dir_name)
    copy_output_tree_struct(path_data, path_output)
    i = 0
    chromosomes = gt.list_elements(
        path_data, type_='dir',
        exception=[
            os.path.join(path_data, "floatfiles"),
            os.path.join(path_data, "encodeddata"),
            os.path.join(path_data, "Subsets")])
    for chrom in chromosomes:
        chrom_name = os.path.basename(chrom)
        files = gt.list_elements(chrom, extension='.txt.gz')
        for sample in files:
            # os.path.basename instead of splitting on "/": portable
            # across platforms.
            name_sample = os.path.basename(
                sample).split(".")[0].split("_")[-1]
            nb_lines = gt.get_nb_lines_file(sample)
            # Random subset of line indices, in random order.
            subset_of_lines = random.sample(
                range(nb_lines), int(math.floor(nb_lines * fraction_pass)))
            out_path = os.path.join(
                path_output, chrom_name,
                prefix_subset + name_sample + ".txt.gz")
            # Write the gzipped output directly instead of writing a
            # plain .txt and shelling out to "gzip" with shell=True on
            # an interpolated path (unsafe and non-portable).
            with gzip.open(sample, "rt") as infile, \
                    gzip.open(out_path, "wt") as outfile:
                lines = infile.readlines()
                for index in subset_of_lines:
                    outfile.write(lines[index])
            # Count every processed file regardless of mode: previously
            # the counter only advanced when logging was off, so the
            # verbose report below always printed 0.
            i += 1
            if not logging:
                gt.print_progress(
                    i, len(chromosomes) * len(files) - 1, decimals=3)
            elif verbose:
                print("{0}/{1} files tested. Date : {2}".format(
                    i, len(chromosomes) * len(files),
                    str(datetime.datetime.now())))
    print("\nData from {0} filtered at {1}. ({2} pass)".format(
        path_data, datetime.datetime.now(), fraction_pass))