def make_subsets(path_data, path_subsets, proportions=None, copy=False):
    """Distribute the sample files of every chromosome into three subsets.

    For each chromosome directory listed in ``path_data`` the ``.txt.gz``
    files (``_meta.txt.gz`` excluded) are split by ``gt.random_chunks`` and
    transferred to ``<path_subsets>/<Test|Train|Valid>/<chromosome name>``.
    The chromosome's ``_meta.txt.gz`` and ``_comments.txt`` are transferred
    to ``path_subsets`` as ``_meta_<chromosome name>.txt.gz`` and
    ``_comments_<chromosome name>.txt``.

    Args:
        path_data: Directory containing one sub-directory per chromosome.
        path_subsets: Root directory of the subsets; it is handed to
            ``create_subsets_dirs`` together with the chromosome names
            before any file is transferred.
        proportions: Mapping with the keys ``"test"``, ``"train"`` and
            ``"valid"``. Defaults to ``settings.PROPTEST``,
            ``settings.PROPTRAIN`` and ``settings.PROPVALID``.
        copy: If true the files are copied, otherwise they are moved.
    """
    if proportions is None:
        proportions = {
            "test": settings.PROPTEST,
            "train": settings.PROPTRAIN,
            "valid": settings.PROPVALID}

    # One transfer function for every file, meta and comment files included,
    # so that copy=True never removes anything from path_data.
    transfer = shutil.copy if copy else shutil.move

    list_chroms = gt.list_elements(path_data, type_="dir")
    list_chroms_names = [
        os.path.basename(chrom).split(".")[0] for chrom in list_chroms]
    create_subsets_dirs(path_subsets, list_chroms_names)

    for chrom, chrom_name in zip(list_chroms, list_chroms_names):
        meta_file = os.path.join(chrom, "_meta.txt.gz")
        files = gt.list_elements(
            chrom,
            type_="file",
            extension=".txt.gz",
            exception=[meta_file])
        test_files, train_files, valid_files = gt.random_chunks(files, (
            proportions["test"], proportions["train"], proportions["valid"]))

        # Same transfer order as before: test, then train, then valid.
        for subset_dir, subset_files in (
                ("Test", test_files),
                ("Train", train_files),
                ("Valid", valid_files)):
            destination = os.path.join(path_subsets, subset_dir, chrom_name)
            for sample_file in subset_files:
                transfer(sample_file, destination)

        transfer(
            meta_file,
            os.path.join(path_subsets, "_meta_" + chrom_name + ".txt.gz"))
        transfer(
            os.path.join(chrom, "_comments.txt"),
            os.path.join(path_subsets, "_comments_" + chrom_name + ".txt"))
def test_simple_ls(self):
    """An alphanumeric listing of the playground equals the sorted fixture."""
    # sorted() builds a new list, so the class-level fixture is not mutated.
    expected = sorted(self.total_elements)
    listed = gt.list_elements(self.path_to_playground, sort="alphanumeric")
    self.assertEqual(listed, expected)
def test_ls_with_type_file(self):
    """A natural-sorted listing with type_="file" equals the file fixture."""
    # Sort a copy of the fixture so the shared list is left as it is.
    expected = gt._natural_sort(self.files["total"][:])
    listed = gt.list_elements(
        self.path_to_playground,
        type_="file",
        sort="natural")
    self.assertEqual(listed, expected)
def example_2(VERBOSE=True):
    """Split, encode and verify every ``.vcf.gz`` file of the playground.

    The environment is first (re)built by running ``setup_ex_env.py`` located
    next to this file. The ``.vcf.gz`` files of ``PATH_TO_PLAYGROUND`` are
    then split with ``vcf.split_vcf_files`` and, for each of them, the
    positions are encoded and the decoding is verified.

    Args:
        VERBOSE: Forwarded as ``verbose`` to ``encoding.encode_file_positions``
            and ``encoding.verify_decoding``.
    """
    setup_script = os.path.join(os.path.dirname(__file__), "setup_ex_env.py")
    # An argument list (no shell) keeps the call working when the path of the
    # script contains spaces or shell metacharacters.
    subprocess.call(["python", setup_script])

    list_chrs = gt.list_elements(
        PATH_TO_PLAYGROUND, type_="file", extension=".vcf.gz")
    list_chrs = [os.path.basename(path).split(".")[0] for path in list_chrs]

    vcf.split_vcf_files(PATH_TO_PLAYGROUND, verbose=False)

    split_dir = os.path.join(PATH_TO_PLAYGROUND, "split_by_chr")
    encoded_dir = os.path.join(PATH_TO_PLAYGROUND, "encoded_files")
    for chr_to_be_processed in list_chrs:
        print("###########################################\n" +
              "Processing chr {}".format(chr_to_be_processed))
        path_to_data = os.path.join(split_dir, str(chr_to_be_processed))
        encoding.encode_file_positions(
            chr_to_be_processed,
            path_to_data,
            PATH_TO_PLAYGROUND,
            verbose=VERBOSE)
        encoding.verify_decoding(
            split_dir,
            encoded_dir,
            str(chr_to_be_processed),
            nb_of_tests_per_file=10,
            max_nb_of_files_to_test=100,
            verbose=VERBOSE)
def test_ls_with_extensions(self):
    """A natural-sorted listing filtered on ".py" equals the ".py" fixture."""
    expected = gt._natural_sort(self.files[".py"])
    listed = gt.list_elements(
        self.path_to_playground,
        sort="natural",
        extension=".py")
    self.assertEqual(listed, expected)
def test_ls_with_exception(self):
    """Elements passed as ``exception`` are absent from the listing."""
    everything = gt._natural_sort(self.total_elements[:])
    # Exclude a random fifth of the elements.
    excluded = random.sample(everything, len(everything) // 5)
    expected = [
        element for element in everything if element not in excluded]
    listed = gt.list_elements(
        self.path_to_playground,
        sort="natural",
        exception=excluded)
    self.assertEqual(listed, expected)
def _concat_meta(path_in, path_out):
    """Concatenate the ``.txt.gz`` files of ``path_in`` into one meta file.

    The result is written to ``<path_out>/_meta.txt.gz``. The very first line
    read is always written; after that, every line starting with ``#`` is
    skipped so that the header is not repeated.
    """
    target = os.path.join(path_out, "_meta.txt.gz")
    nothing_written_yet = True
    with gzip.open(target, 'wb') as out_file:
        for meta_path in gt.list_elements(
                path_in, type_="file", extension=".txt.gz"):
            with gzip.open(meta_path, "rb") as in_file:
                for line in in_file:
                    # Lines may be str or bytes depending on the interpreter.
                    try:
                        is_comment = line.startswith('#')
                    except TypeError:
                        is_comment = line.startswith(b'#')
                    if nothing_written_yet or not is_comment:
                        out_file.write(line)
                        nothing_written_yet = False
def sort_chr_files_by_sample(
        path_to_data_by_chr,
        make_copy=False,
        force=False):
    """Reorganise files split by chromosome into one directory per sample.

    A ``split_by_sample`` directory is (re)created next to
    ``path_to_data_by_chr``. Every file ``<name>`` found in a chromosome
    directory ``<chr>`` is transferred to
    ``split_by_sample/<part of name before the first dot>/chr<chr>_<name>``.

    Args:
        path_to_data_by_chr: Directory holding one sub-directory per
            chromosome. Its name must end with ``split_by_chr`` (one trailing
            character such as a path separator is tolerated) unless ``force``
            is set.
        make_copy: If true the files are copied and the input directory is
            kept; otherwise the files are moved and the input directory is
            removed at the end.
        force: Skip the check on the directory name.

    Raises:
        ValueError: If the directory name is not recognized and ``force`` is
            false.
    """
    name_recognized = (
        path_to_data_by_chr.endswith("split_by_chr")
        or path_to_data_by_chr[:-1].endswith("split_by_chr"))
    if not name_recognized and not force:
        raise ValueError(
            "Directory name not recognized, directory name should be " +
            "split_by_chr")

    # normpath drops a trailing separator. Without it, dirname() of
    # ".../split_by_chr/" is the input directory itself: the output would be
    # created inside the input, listed as a chromosome and finally deleted.
    parent_dir = os.path.dirname(os.path.normpath(path_to_data_by_chr))
    path_to_data_by_sample = os.path.join(parent_dir, "split_by_sample")

    # Always start from an empty output directory.
    if os.path.isdir(path_to_data_by_sample):
        shutil.rmtree(path_to_data_by_sample)
    os.mkdir(path_to_data_by_sample)

    transfer = shutil.copy if make_copy else shutil.move
    for folder in gt.list_elements(path_to_data_by_chr, type_="dir"):
        dir_name = os.path.basename(folder)
        for file_path in gt.list_elements(folder, type_="file"):
            file_name = os.path.basename(file_path)
            destination = os.path.join(
                path_to_data_by_sample, file_name.split(".")[0])
            if not os.path.isdir(destination):
                os.mkdir(destination)
            transfer(
                file_path,
                os.path.join(destination, "chr" + dir_name + "_" + file_name))

    if not make_copy:
        shutil.rmtree(path_to_data_by_chr)
def concatenate_split_files(
        path_to_input,
        tree_structure="by_sample",
        make_copy=False,
        force=False):
    """Concatenate the split files of every sub-directory into a single file.

    A ``split_and_concat_data`` directory is (re)created next to
    ``path_to_input``. The meta files of ``<path_to_input>/_meta`` are merged
    by ``_concat_meta`` and a comment file of ``<path_to_input>/_comments`` is
    copied by ``_copy_comment``. For every other sub-directory, its
    ``.txt.gz`` files are written one after the other into one gzip file
    named after the first file, where the part before the first underscore is
    replaced by ``allchr``.

    Args:
        path_to_input: Directory whose name must end with ``split_by_chr`` or
            ``split_by_sample`` in agreement with ``tree_structure`` (one
            trailing character is tolerated) unless ``force`` is set.
        tree_structure: ``"by_sample"`` or ``"by_chr"``; only used for the
            consistency check.
        make_copy: If false, each processed sub-directory and finally
            ``path_to_input`` itself are removed.
        force: Skip the checks on ``path_to_input`` and ``tree_structure``.

    Raises:
        ValueError: If ``force`` is false and ``tree_structure`` is unknown or
            inconsistent with ``path_to_input``.
    """
    if not force:
        expected_names = {
            "by_chr": "split_by_chr",
            "by_sample": "split_by_sample"}
        if tree_structure not in expected_names:
            raise ValueError(
                "{0} is not managed by this function".format(tree_structure))
        expected_name = expected_names[tree_structure]
        if (
                not path_to_input.endswith(expected_name)
                and not path_to_input[:-1].endswith(expected_name)):
            raise ValueError(
                "Inconsistent combination of path and tree structure.\n" +
                "Received {0} and {1}".format(path_to_input, tree_structure))

    # normpath drops a trailing separator. Without it the output directory
    # would be created inside path_to_input and be deleted with it at the end.
    path_to_concat_data = os.path.join(
        os.path.dirname(os.path.normpath(path_to_input)),
        "split_and_concat_data")
    # Always start from an empty output directory.
    if os.path.isdir(path_to_concat_data):
        shutil.rmtree(path_to_concat_data)
    os.mkdir(path_to_concat_data)

    meta_dir = os.path.join(path_to_input, "_meta")
    comments_dir = os.path.join(path_to_input, "_comments")
    list_users = gt.list_elements(
        path_to_input,
        type_="dir",
        exception=[meta_dir, comments_dir])

    _concat_meta(meta_dir, path_to_concat_data)
    _copy_comment(comments_dir, path_to_concat_data)

    for user in list_users:
        split_files = gt.list_elements(user, type_="file", extension=".txt.gz")
        if not split_files:
            # Nothing to concatenate; avoid an IndexError on split_files[0].
            sys.stderr.write(
                "No .txt.gz file found in {0}, skipped.\n".format(user))
            continue

        # Replace the part of the name before the first underscore (reduced
        # to its lowercase letters and digits) by "allchr".
        name_user = os.path.basename(split_files[0])
        prefix = re.search(r"^[^_]*", name_user).group(0)
        name_user = name_user.replace(
            re.sub(r"[^a-z\d]", "", prefix), "allchr")
        concat_file = os.path.join(path_to_concat_data, name_user)

        with gzip.open(concat_file, 'wb') as outfile:
            for split_file in split_files:
                with gzip.open(split_file, "rb") as infile:
                    outfile.write(infile.read())

        # Sanity check, done once the output file is closed.
        nb_lines_out = gt.get_nb_lines_file(concat_file)
        nb_lines_in = sum(
            gt.get_nb_lines_file(split_file) for split_file in split_files)
        if nb_lines_in != nb_lines_out:
            sys.stderr.write(
                "Number of lines between original files and the\n" +
                "concatenated file does not match.\n" +
                "Origin: {0}, Concat: {1}\n".format(nb_lines_in, nb_lines_out))

        if not make_copy:
            shutil.rmtree(user)

    if not make_copy:
        shutil.rmtree(path_to_input)
def _copy_comment(path_in, path_out):
    """Copy the first ``.txt`` element of ``path_in`` to ``path_out``.

    The copy is named ``_comments.txt``. An IndexError is raised when
    ``gt.list_elements`` returns nothing.
    """
    first_comment = gt.list_elements(path_in, extension=".txt")[0]
    source = os.path.join(path_in, first_comment)
    target = os.path.join(path_out, "_comments.txt")
    shutil.copy(source, target)
def verify_decoding(
        path_to_original_data,
        path_to_encoded_data,
        chromosome_verified,
        max_nb_of_files_to_test=100,
        nb_of_tests_per_file=100,
        verbose=True,
        printing=True,
        logging=False):
    """Spot-check encoded files of one chromosome against the original data.

    Encoded files are drawn at random (``random.choice``, so a file can be
    drawn more than once). For each of them, encoded values are drawn at
    random, decoded with ``decode_position_int`` and compared with the
    position, reference and alternative alleles of the meta file and with the
    original genotype. Every mismatch is recorded; if any is found, a summary
    is reported and the details are written to
    ``Errors_found_in<chromosome_verified>.csv`` (tab separated) in the
    current working directory.

    Args:
        path_to_original_data: Directory containing
            ``<chromosome_verified>/_meta.txt.gz`` and the original files.
        path_to_encoded_data: Directory containing the encoded ``.txt.gz``
            files in ``<chromosome_verified>``.
        chromosome_verified: Name of the chromosome sub-directory.
        max_nb_of_files_to_test: Upper bound on the number of files drawn.
        nb_of_tests_per_file: Number of values drawn per file.
        verbose: Forwarded to ``gt.custom_output``.
        printing: Forwarded to ``gt.custom_output``.
        logging: Forwarded to ``gt.custom_output``.
    """
    print_parameters = {
        "verbose": verbose,
        "printing": printing,
        "logging": logging,
        "in_loop": False
    }
    errors_file = []
    errors_sup_pos = []
    errors_real_pos = []
    errors_type = []
    errors_prev_pos = []
    errors_next_pos = []

    def record_error(error_type, testfile, to_test, position):
        """Store one error with the first-column values of the rows around."""
        index = _meta.loc[(_meta.totest == to_test), :].index.tolist()[0]
        # Clamp to the last valid row; shape[0] itself is out of bounds.
        last_row = _meta.shape[0] - 1
        errors_file.append(testfile)
        errors_sup_pos.append(position)
        errors_real_pos.append(_meta.iloc[max(index, 0), 0])
        errors_type.append(error_type)
        errors_prev_pos.append(_meta.iloc[max(index - 1, 0), 0])
        errors_next_pos.append(_meta.iloc[min(index + 1, last_row), 0])

    gt.custom_output(
        "Function {0} started at {1}".format(
            verify_decoding.__name__,
            str(datetime.datetime.now())) +
        "\nTesting files in {0}:".format(
            path_to_encoded_data),
        **print_parameters)
    timer = gt.time_since_first_call()
    next(timer)

    # axis must be given by keyword: the positional form was removed from
    # pandas 2.
    _meta = pd.read_csv(
        os.path.join(
            path_to_original_data,
            chromosome_verified,
            "_meta.txt.gz"),
        sep="\t",
        index_col=False).drop(
            ["#CHROM", "ID", "QUAL", "FILTER", "INFO", "FORMAT"], axis=1)

    files = gt.list_elements(
        os.path.join(
            path_to_encoded_data,
            chromosome_verified),
        extension=".txt.gz")
    nb_files_to_test = min(max_nb_of_files_to_test, len(files))

    print_parameters["in_loop"] = True
    for j in range(nb_files_to_test):
        testfile = random.choice(files)
        name = os.path.basename(testfile).split(".")[0]
        _meta["originaldata"] = pd.read_csv(
            os.path.join(
                path_to_original_data,
                chromosome_verified,
                name + "_" + name + ".txt.gz"),
            index_col=None,
            header=None)
        _meta["totest"] = pd.read_csv(testfile, index_col=None, header=None)
        encoded_values = _meta.totest.tolist()

        for _ in range(nb_of_tests_per_file):
            to_test = random.choice(encoded_values)
            allele_1, allele_2, position = decode_position_int(to_test)

            if position == -1:
                record_error(
                    "Impossible to decode", testfile, to_test, position)
                gt.custom_output("{0}/{1} files tested. Date : {2}".format(
                    j + 1,
                    nb_files_to_test,
                    str(datetime.datetime.now())),
                    **print_parameters)
                continue

            tested_row = _meta.loc[(_meta.totest == to_test), :]
            original_alleles = \
                tested_row["originaldata"].tolist()[0].split("/")
            original_pos = tested_row["POS"].tolist()[0]
            ref = tested_row["REF"].tolist()[0]
            alt = tested_row["ALT"].tolist()[0]

            if position != original_pos:
                record_error("Position", testfile, to_test, position)

            # The genotype comes from str.split, so its items are compared
            # with "0" and "1": the comparison with the integers 0 and 1 was
            # never true and silently disabled both allele checks.
            # NOTE(review): confirm that decode_position_int returns alleles
            # directly comparable with the REF/ALT columns.
            first_genotype = original_alleles[0]
            second_genotype = original_alleles[-1]
            if ((first_genotype == "0") and (allele_1 != ref)) or \
                    ((first_genotype == "1") and (allele_1 != alt)):
                record_error("Allele 1", testfile, to_test, position)
            # The second allele is checked on allele_2 (it used to re-test
            # allele_1 against alt in both cases).
            if ((second_genotype == "0") and (allele_2 != ref)) or \
                    ((second_genotype == "1") and (allele_2 != alt)):
                record_error("Allele 2", testfile, to_test, position)

        h, m, s = next(timer)
        gt.custom_output(
            "{0}/{1} files tested ".format(j + 1, nb_files_to_test) +
            "after {0}h{1}m{2}s. ".format(h, m, s) +
            "Date : {0}".format(str(datetime.datetime.now())),
            **print_parameters)
    print_parameters["in_loop"] = False

    errors = pd.DataFrame({"File": errors_file,
                           "Supposed_position": errors_sup_pos,
                           "Real_position": errors_real_pos,
                           "Error_type": errors_type,
                           "Previous_positions": errors_prev_pos,
                           "Next_position": errors_next_pos})

    if errors.empty:
        gt.custom_output("\nNo error found !", **print_parameters)
        return

    def count_errors(error_type):
        """Return the number of recorded errors of the given type."""
        return errors.loc[(errors.Error_type == error_type), :].shape[0]

    total_error = errors.shape[0]
    gt.custom_output(
        "\nAllele 1 errors: {}".format(count_errors("Allele 1")) +
        "\nAllele 2 errors: {}".format(count_errors("Allele 2")) +
        "\nPosition errors: {}".format(count_errors("Position")) +
        "\nImpossible to decode: {}".format(
            count_errors("Impossible to decode")) +
        "\nTotal errors: {}".format(total_error) +
        "\nIn total: {}% errors !\n".format(
            100 * total_error / (nb_of_tests_per_file * nb_files_to_test)),
        **print_parameters
    )
    print("Date : {}".format(str(datetime.datetime.now())))
    errors.to_csv(
        "Errors_found_in{}.csv".format(chromosome_verified),
        sep="\t")
def test_sorted_ls(self):
    """A natural-sorted listing equals the naturally sorted fixture."""
    # Sort a copy of the fixture so the shared list is left as it is.
    expected = gt._natural_sort(self.total_elements[:])
    listed = gt.list_elements(self.path_to_playground, sort="natural")
    self.assertEqual(listed, expected)
def mask_data(path_data, fraction_pass, path_output=None, prefix_subset=None,
              verbose=False, logging=False):
    """Write a random subset of the lines of every sample file.

    This function builds the output directory based on the name of the input
    directory and the prefix. For each ``.txt.gz`` file of each chromosome
    directory, ``floor(nb_lines * fraction_pass)`` distinct lines are drawn
    and written to
    ``<path_output>/<chromosome>/<prefix_subset><sample name>.txt.gz``, in the
    order in which they were drawn.

    Args:
        path_data: Directory containing one sub-directory per chromosome. The
            ``floatfiles``, ``encodeddata`` and ``Subsets`` sub-directories
            are ignored.
        fraction_pass: Number between 0 and 1, fraction of lines kept.
        path_output: Output directory. Defaults to a sibling of ``path_data``
            named ``<prefix_subset><name of path_data>``.
        prefix_subset: Prefix of the output names. Defaults to
            ``<int(100 * fraction_pass)>PER_``.
        verbose: In logging mode, print one progress line per file.
        logging: If false, a progress bar is displayed with
            ``gt.print_progress``.
    """
    print("Starting to filter data from {0} at {1}. ({2} pass)".format(
        path_data,
        datetime.datetime.now(),
        fraction_pass))

    if prefix_subset is None:
        prefix_subset = str(int(100 * fraction_pass)) + "PER_"
    out_dir_name = prefix_subset + os.path.basename(path_data)
    if path_output is None:
        path_output = os.path.join(os.path.dirname(path_data), out_dir_name)

    copy_output_tree_struct(path_data, path_output)

    i = 0
    chromosomes = gt.list_elements(path_data, type_='dir', exception=[
        os.path.join(path_data, "floatfiles"),
        os.path.join(path_data, "encodeddata"),
        os.path.join(path_data, "Subsets")
    ])
    for chrom in chromosomes:
        chrom_name = os.path.basename(chrom)
        files = gt.list_elements(chrom, extension='.txt.gz')
        for sample in files:
            name_sample = \
                os.path.basename(sample).split(".")[0].split("_")[-1]
            nb_lines = gt.get_nb_lines_file(sample)
            subset_of_lines = random.sample(
                range(nb_lines),
                int(math.floor(nb_lines * fraction_pass)))

            with gzip.open(sample, "rt") as infile:
                lines = infile.readlines()
            # The compressed file is written directly instead of writing a
            # .txt file and compressing it with an external `gzip` run
            # through the shell, which broke on paths containing spaces.
            output_file = os.path.join(
                path_output,
                chrom_name,
                prefix_subset + name_sample + ".txt.gz")
            with gzip.open(output_file, "wt") as outfile:
                outfile.writelines(lines[index] for index in subset_of_lines)

            # Counted in every mode so that the verbose message below does
            # not report 0 files forever.
            i += 1
            if not logging:
                gt.print_progress(
                    i, len(chromosomes) * len(files) - 1, decimals=3)
            elif verbose:
                print("{0}/{1} files tested. Date : {2}".format(
                    i,
                    len(chromosomes) * len(files),
                    str(datetime.datetime.now())))

    print("\nData from {0} filtered at {1}. ({2} pass)".format(
        path_data,
        datetime.datetime.now(),
        fraction_pass))
# Go one directory up from the current value of cmd_subfolder (defined above).
cmd_subfolder = os.path.dirname(cmd_subfolder)

# Import the package normally; if that fails, add cmd_subfolder to sys.path
# and try again.
try:
    from pydeepgenomics.tools import generaltools as gt
except ImportError:
    if cmd_subfolder not in sys.path:
        sys.path.append(cmd_subfolder)
    from pydeepgenomics.tools import generaltools as gt

if __name__ == "__main__":

    print("Initializating the playground ...")

    # Source of the simulated VCF files and destination playground (created
    # next to this script).
    PATH_TO_VCF = os.path.join(
        cmd_subfolder, "alltests", "sim_data", "vcf_files")
    PATH_TO_OUTPUT = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "playground")

    # Always start from an empty playground.
    if not os.path.isdir(PATH_TO_OUTPUT):
        os.mkdir(PATH_TO_OUTPUT)
    else:
        shutil.rmtree(PATH_TO_OUTPUT)
        os.mkdir(PATH_TO_OUTPUT)

    # Copy every .vcf.gz file into the playground.
    vcf_files = gt.list_elements(PATH_TO_VCF, type_="file", extension=".vcf.gz")
    for file in vcf_files:
        shutil.copy(file, PATH_TO_OUTPUT)
        name_file = os.path.basename(file)
        # Path of the copy inside the playground.
        copied_file = os.path.join(PATH_TO_OUTPUT, name_file)
def extract_snps_from_file(
        path_to_vcf_files,
        ref_snps_file,
        sep=",",
        chr_nb_prefix="",
        chr_nb_suffix=".vcf.gz"):
    """Extract from the VCF files the lines matching a list of reference SNPs.

    Load list of snp in a dataframe (column1 = snp ref name, column2 = chr,
    column3 = position), look for them in the ``.vcf.gz`` files with
    ``find_matches`` and write the results in
    ``<path_to_vcf_files>/SelectionofSNPs``:

    * ``MatchingPositions.csv`` and ``MatchingReferences.csv`` (tab
      separated);
    * ``ID/<chrnb>_subset_ID.vcf.gz`` and ``POS/<chrnb>_subset_POS.vcf.gz``,
      each made of the header of the original file followed by the matching
      lines.

    Args:
        path_to_vcf_files: Directory containing the ``.vcf.gz`` files.
        ref_snps_file: CSV file listing the reference SNPs.
        sep: Separator used in ``ref_snps_file``.
        chr_nb_prefix: Regular expression removed from the file name to get
            the chromosome number.
        chr_nb_suffix: Regular expression removed from the file name to get
            the chromosome number.
    """
    selection_dir = os.path.join(path_to_vcf_files, "SelectionofSNPs")
    id_dir = os.path.join(selection_dir, "ID")
    pos_dir = os.path.join(selection_dir, "POS")

    # Clean previous results. External commands are given as argument lists
    # so that paths with spaces or shell metacharacters are handled.
    subprocess.call(["rm", "-rf", selection_dir])

    ref_snps = pd.read_csv(ref_snps_file, sep=sep)

    # Looking at the files present in the directory before working on them
    vcf_files = gt.list_elements(
        path_to_vcf_files,
        type_="file",
        extension=".vcf.gz")

    # Find the reference SNPs in the actual vcf files
    matching_references, matching_positions = find_matches(vcf_files, ref_snps)

    # Save the list of SNPs found
    if not os.path.isdir(selection_dir):
        os.mkdir(selection_dir)
    matching_positions.to_csv(
        path_or_buf=os.path.join(selection_dir, "MatchingPositions.csv"),
        sep="\t",
        index=False)
    matching_references.to_csv(
        path_or_buf=os.path.join(selection_dir, "MatchingReferences.csv"),
        sep="\t",
        index=False)

    # The "POS" test used to join "/SelectionofSNPs", an absolute component
    # that discards path_to_vcf_files, so mkdir failed on the second file.
    for output_dir in (id_dir, pos_dir):
        if not os.path.isdir(output_dir):
            os.mkdir(output_dir)

    for files in vcf_files:

        print("Starting to process file : {}".format(files))
        # NOTE(review): the substitutions are applied to the element exactly
        # as returned by gt.list_elements; if it is a full path, the prefix
        # has to account for the directory part -- confirm against callers.
        chrnb = int(re.sub(
            chr_nb_prefix,
            '',
            re.sub(chr_nb_suffix, '', files)))

        output_file_id = os.path.join(
            id_dir, str(chrnb) + "_subset_ID.vcf")
        output_file_pos = os.path.join(
            pos_dir, str(chrnb) + "_subset_POS.vcf")

        # Read the file once, in text mode so that lines can be compared with
        # "#" and written to the text outputs.
        with gzip.open(files, "rt") as fi:
            lines_from_origin_vcf = fi.readlines()

        # Copy header and column labels in a subset file
        # (<chrnb>_subset_<type of selection criteria>.vcf) in
        # PATH/SelectionofSNPs
        with open(output_file_id, "w") as fo:
            for line in lines_from_origin_vcf:
                if not line.startswith("#"):
                    break
                fo.write(line)
        subprocess.call(["cp", output_file_id, output_file_pos])

        # Filter dataframe to have only snps corresponding to the file
        filtered_match_pos = matching_positions[
            matching_positions[VCF_HEADER[0]] == chrnb]
        filtered_match_ref = matching_references[
            matching_references[VCF_HEADER[0]] == chrnb]

        it_positions = 0
        it_ids = 0

        with open(output_file_pos, 'a') as out_pos, \
                open(output_file_id, 'a') as out_id:
            print("Extract corresponding positions")
            # The stored row numbers start at 1, hence the "- 1".
            for line_nb in filtered_match_pos["Corresponding row in vcf file"]:
                out_pos.write(lines_from_origin_vcf[line_nb - 1])
                it_positions += 1
                print(
                    "Information on {0}/{1} matching positions copied".format(
                        it_positions,
                        matching_positions.shape[0]))

            print("Extract corresponding references")
            for line_nb in filtered_match_ref["Corresponding row in vcf file"]:
                out_id.write(lines_from_origin_vcf[line_nb - 1])
                it_ids += 1
                print(
                    "Information on {0}/{1} matching references copied".format(
                        it_ids,
                        matching_references.shape[0]))

        # compress the output file to .gz
        subprocess.call(["gzip", output_file_pos])
        subprocess.call(["gzip", output_file_id])
def test_ls_with_type_dir(self):
    """A listing with type_="dir" equals the naturally sorted dir fixture."""
    expected = gt._natural_sort(self.dirs)
    listed = gt.list_elements(self.path_to_playground, type_="dir")
    self.assertEqual(listed, expected)
import sys try: from pydeepgenomics.tools import generaltools as gt except ImportError: cmd_dir = os.path.abspath(os.path.dirname(__file__)).split("alltests")[0] if cmd_dir not in sys.path: sys.path.append(cmd_dir) from pydeepgenomics.tools import generaltools as gt if __name__ == "__main__": prob_rich_region_to_normal = 0.03 prob_normal_to_rich_region = 0.03 files = gt.list_elements(os.path.abspath("."), extension=".vcf") header_size = 6 for index_f, file in enumerate(files): size = gt.get_nb_lines_file(file) - header_size size_chromosome_a_priori = 200000 / math.pow(1.1, index_f) mean_distance = size_chromosome_a_priori / size with open(file, "r") as in_file, gzip.open(file + ".gz", "wb") as out_file: in_rich = False position = 0 for index_l, line in enumerate(in_file):
def encode_file_positions(
        chr_to_be_processed,
        path_to_data,
        path_to_output,
        name_output_dir="encoded_files",
        verbose=True,
        printing=True,
        logging=False):
    """Encode the sample files of one chromosome, batch by batch.

    The meta file ``<path_to_data>/_meta.txt.gz`` is loaded once. The
    ``.txt.gz`` sample files of ``path_to_data`` are then added as columns,
    ``settings.FILEBATCHSIZE`` files at a time, converted with
    ``do_conversion(..., output_conversion="to_int")`` and written with
    ``write_encoded_output``.

    Args:
        chr_to_be_processed: Chromosome identifier; its string form names the
            output sub-directory.
        path_to_data: Directory containing the meta file and the sample files
            of the chromosome.
        path_to_output: Root of the output tree, prepared by
            ``_build_output_tree_structure``.
        name_output_dir: Name of the output directory inside
            ``path_to_output``.
        verbose: Forwarded to ``gt.custom_output``.
        printing: Forwarded to ``gt.custom_output``.
        logging: Forwarded to ``gt.custom_output``.
    """
    # Start timer
    timer = gt.time_since_first_call()
    next(timer)

    print_parameters = {
        "verbose": verbose,
        "printing": printing,
        "logging": logging,
        "in_loop": False
    }

    gt.custom_output(
        "Function {0} started at {1}".format(
            encode_file_positions.__name__,
            str(datetime.datetime.now())) +
        "\nProcessing files in {0}:".format(path_to_data),
        **print_parameters)

    chromosome_name = str(chr_to_be_processed)
    _build_output_tree_structure(
        path_to_output,
        name_output_dir,
        chromosome_name)

    # load the meta data in a pandas data frame
    _meta = pd.read_csv(
        os.path.join(path_to_data, "_meta.txt.gz"),
        sep="\t",
        index_col=False)
    list_files = gt.list_elements(
        path_to_data,
        extension=".txt.gz",
        exception=[
            os.path.join(path_to_data, "_meta.txt.gz"),
            os.path.join(path_to_data, "_comments.txt.gz")])

    # axis must be given by keyword: the positional form was removed from
    # pandas 2.
    unused_columns = ["#CHROM", "ID", "QUAL", "FILTER", "INFO", "FORMAT"]
    nb_processed_files = 0
    list_ = []
    df = _meta.drop(unused_columns, axis=1)
    last_index = len(list_files) - 1

    for file_index, files in enumerate(list_files):
        print_parameters["in_loop"] = True
        sample_name = os.path.basename(files).split(".")[0].split("_")[-1]
        list_.append(sample_name)
        df[sample_name] = pd.read_csv(files, index_col=None, header=None)

        # Keep filling the batch until it is full or the last file is read.
        if (len(list_) < settings.FILEBATCHSIZE) and \
                (file_index != last_index):
            continue

        encoded_data = do_conversion(df, list_, output_conversion="to_int")
        write_encoded_output(
            path_to_output,
            chromosome_name,
            encoded_data,
            list_,
            namedir=name_output_dir)
        # Count the files really written: the last batch may be partial.
        nb_processed_files += len(list_)

        # Reinitialize stuff
        list_ = []
        df = _meta.drop(unused_columns, axis=1)

        h, m, s = next(timer)
        gt.custom_output(
            "\r{0}/{1}".format(nb_processed_files, len(list_files)) +
            " files processed after {0}h{1}m{2}s.".format(h, m, s) +
            " Date: {}".format(str(datetime.datetime.now())),
            **print_parameters)

    print_parameters["in_loop"] = False
    sys.stdout.write("\n")
    h, m, s = next(timer)
    gt.custom_output(
        "Finished after {0}h{1}m{2}s.\n".format(h, m, s),
        **print_parameters)