def main():
    # record the starting time of the script
    start = time()
    # check the current working directory and print it
    cwd = os.getcwd()
    print(colored(f'\nThe working directory: {cwd}\n', 'green', attrs=['bold']))
    # parse the command-line arguments
    args = parse_arguments()
    # name of the input directory, e.g., Data/Genomes_splitted
    dir_in = args.dir_in
    # name of the sub directory to save the final result
    # (Chromosomes/Plasmids)
    sub_dir = args.sub_dir
    # sub-sub directory name, e.g., kmers/palindromes
    sub_sub_dir = args.sub_sub_dir
    # name of the root directory to save the final result
    dir_out = args.dir_out
    # minimum k-mer length
    kmin = args.kmin
    # maximum k-mer length
    kmax = args.kmax
    # csv file with the genome lengths
    csv_filename = args.csv_filename
    # e-value cut-off
    eval_cutoff = args.eval_cutoff
    # check if the output directory exists, otherwise create it
    if os.path.exists(dir_out):
        print(colored('The directory to save the files already exists!',
                      'red', attrs=['bold']))
    else:
        make_me_a_folder(dir_out)
    # e.g., Results/Lengths/chr_lengths.csv
    seq_len_dict = get_len_csv(csv_filename)
    # get the genus/species names
    names = seq_len_dict.keys()
    # get the csv files with the k-mer counts
    csv_files = get_paths_to_csv_counts(dir_in, sub_dir, sub_sub_dir, names)
    # get the k-mer list
    kmer_list = get_all_possible_kmers(iupac_dna, kmin, kmax)
    # compute all the statistics and save them
    get_kmer_stats(seq_len_dict, csv_files, kmer_list, dir_out, sub_dir,
                   sub_sub_dir, kmax, eval_cutoff)
    # the final time
    end = time()
    # print some info
    print(colored(f'Total time for the script: {round(end - start, 2)}.',
                  'red', attrs=['bold']))
    print(colored('Done!', 'green', attrs=['bold']))
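# make_me_a_folder is called by all the scripts below but is defined elsewhere in
# the project. A minimal sketch of the behaviour assumed here (it simply wraps
# os.makedirs); the name suffix marks it as a hypothetical illustration, not the
# project's actual implementation.
import os


def make_me_a_folder_sketch(folder_name):
    """Hypothetical sketch: create the output directory if it does not exist."""
    try:
        os.makedirs(folder_name, exist_ok=True)
        print(f'Created directory: {folder_name}')
    except OSError as err:
        print(f'Could not create {folder_name}: {err}')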
def main():
    start = time.time()
    cwd = os.getcwd()
    print(colored(f'\nThe working directory: {cwd}\n', attrs=['bold']))
    opt = parse_arguments()
    dir_name = opt.path
    filenames = get_fasta_files(dir_name)
    outfile = opt.outfile.split('/')
    dir_out = opt.dir_out
    if not os.path.exists(dir_out):
        make_me_a_folder(dir_out)
    cnt_files = 0
    for filename in filenames:
        print(colored(f"Results for file in: {filename}", attrs=['bold']))
        # name of the taxon directory, e.g., Acidisarcina
        genus = filename.split('/')[2]
        print(colored(f"Results for file: {genus}", attrs=['bold']))
        plasmids, chromosome = split_sequences_from_fasta_file(filename)
        # check the data obtained
        le_pl, le_ch = len(list(plasmids.values())), len(list(chromosome.values()))
        print(colored(f"Results for plasmids data: {le_pl}", attrs=['bold']))
        print(colored(f"Results for chromosome data: {le_ch}", attrs=['bold']))
        plasm_names = plasmids.keys()
        full_path_plasmids = os.path.join(dir_out, genus, outfile[0])
        # create the output path for the plasmid data if needed
        if not os.path.exists(full_path_plasmids):
            os.makedirs(full_path_plasmids)
        # save the plasmid sequences
        for name in plasm_names:
            write_fasta_file(plasmids, f'{full_path_plasmids}/{name}.fna')
        chromosome_names = chromosome.keys()
        full_path_chromosome = os.path.join(dir_out, genus, outfile[1])
        # create the output path for the chromosome data if needed
        if not os.path.exists(full_path_chromosome):
            os.makedirs(full_path_chromosome)
        # save the chromosome sequences
        for name in chromosome_names:
            write_fasta_file(chromosome, f'{full_path_chromosome}/{name}.fna')
        cnt_files += 1
    end = time.time()
    print(colored(f"Total number of files: {cnt_files}", attrs=['bold']))
    print(colored(f'Total time for the script: {end - start}', attrs=['bold']))
    print(colored('Done', attrs=['bold']))
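# split_sequences_from_fasta_file is defined elsewhere in the project; a minimal,
# hypothetical sketch of the behaviour assumed above: read a (multi-)fasta file and
# return two dicts of {record_id: sequence}, separating records whose headers
# mention "plasmid" from the remaining (chromosome) records. The keyword test and
# the use of the first header token as the record id are assumptions, not the
# project's actual rule.
def split_sequences_from_fasta_file_sketch(filename):
    plasmids, chromosome = {}, {}

    def _store(header, chunks):
        record_id = header.split()[0]
        target = plasmids if 'plasmid' in header.lower() else chromosome
        target[record_id] = ''.join(chunks)

    header, chunks = None, []
    with open(filename) as handle:
        for line in handle:
            line = line.strip()
            if line.startswith('>'):
                if header is not None:
                    _store(header, chunks)
                header, chunks = line[1:], []
            elif line:
                chunks.append(line)
    if header is not None:
        _store(header, chunks)
    return plasmids, chromosome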
def main():
    # record the starting time of the script
    start = time()
    # check the current working directory and print it
    cwd = os.getcwd()
    print(colored(f'\nThe working directory: {cwd}\n', 'green', attrs=['bold']))
    # parse the command-line arguments
    args = parse_arguments()
    # name of the input directory, e.g., Data/Genomes_splitted
    dir_in = args.dir_in
    # name of the sub directory to save the final result
    # (Chromosomes/Plasmids)
    sub_dir = args.sub_dir
    # name of the root directory to save the final result
    dir_out = args.dir_out
    # extension of the fasta files
    extension = args.extension
    # get the fasta files
    filenames = glob.glob(f'{dir_in}/*/{sub_dir}/*.{extension}')
    print(f"The number of files is {len(filenames)}")
    print(f'{filenames[0]}')
    # check if the output directory exists, otherwise create it
    if os.path.exists(dir_out):
        print(colored('The directory to save the files already exists!',
                      'red', attrs=['bold']))
    else:
        make_me_a_folder(dir_out)
    data_len = get_genome_length(filenames)
    print('Calculating the genome mean')
    data = get_mean_genome_lengths(data_len)
    df = pd.DataFrame(data.items(), columns=['Name', 'Length'])
    file_name = f'All_{sub_dir}_length'
    full_path = os.path.join('Results', 'Length')
    if not os.path.exists(full_path):
        os.makedirs(full_path)
    df.to_csv(f'{full_path}/{file_name}.csv', index=False)
    # the final time
    end = time()
    # print some info
    print(colored(f"Total number of genus/species analyzed: {len(data)}.\n",
                  attrs=['bold']))
    print(colored(f'Total time for the script: {round(end - start, 2)}.',
                  'red', attrs=['bold']))
    print(colored('Done!', 'green', attrs=['bold']))
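# get_genome_length and get_mean_genome_lengths come from the project's utilities;
# a minimal sketch of the averaging step assumed above, where data_len is taken to
# map a genus/species name to a list of genome lengths (one per fasta file). The
# shape of data_len is an assumption.
def get_mean_genome_lengths_sketch(data_len):
    """Hypothetical sketch: mean genome length per genus/species name."""
    return {name: sum(lengths) / len(lengths)
            for name, lengths in data_len.items() if lengths}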
def main():
    # record the starting time of the script
    start = time()
    # check the current working directory and print it
    cwd = os.getcwd()
    print(colored(f'\nThe working directory: {cwd}\n', 'green', attrs=['bold']))
    # parse the command-line arguments
    args = parse_arguments()
    # name of the input directory, e.g., Data/Genomes_splitted
    dir_in = args.dir_in
    # name of the sub directory to save the final result
    # (Chromosomes/Plasmids)
    sub_dir = args.sub_dir
    # sub-sub directory name, e.g., kmers/palindromes
    sub_sub_dir = args.sub_sub_dir
    # name of the root directory to save the final result
    dir_out = args.dir_out
    # minimum k-mer length
    kmin = args.kmin
    # maximum k-mer length
    kmax = args.kmax
    # extension of the fasta files
    extension = args.extension
    # alphabet
    alphabet = iupac_dna
    # get the paths to all files in the input directory,
    # e.g., Data/Genomes_splitted, Chromosomes, gz
    fasta_dict = get_all_fasta(dir_in, sub_dir, extension)
    # check if the output directory exists, otherwise create it
    if os.path.exists(dir_out):
        print(colored('The directory to save the files already exists!',
                      'red', attrs=['bold']))
    else:
        make_me_a_folder(dir_out)
    # initialize the file counter
    cnt_files = 0
    # iterate through the file paths and report progress
    for name in fasta_dict.keys():
        print(colored(f"Start working with genus {name}\n", attrs=['bold']))
        # count the k-mers and get the sequence length
        cnt, seq_len = count_k_mers_fasta(fasta_dict, name, alphabet, kmax - 2,
                                          kmax, overlap=kmax, nprocs=4)
        # get the k-mer list for the analysis, e.g., k = 6
        kmer_list = get_all_possible_kmers(alphabet, kmin, kmax)
        # calculate the expected count for all k-mers
        expected = get_expected_higher_markov(kmer_list, cnt)
        # get the variance of the expected counts
        variance = get_variance(kmer_list, seq_len, expected)
        # get the standard deviation
        std = get_standard_deviation(variance)
        # get the z-scores
        z_scrs = z_scores(expected, cnt, std)
        # get the p-values for the k-mers
        pvals = get_p_values(z_scrs)
        # get the k-mer e-values
        evals = get_e_values(kmer_list, pvals)
        # save the final results as a csv file
        kmers = get_kmer_data(kmer_list, cnt, expected, z_scrs, evals, pvals)
        save_data_frame_kmers(dir_out, sub_dir, sub_sub_dir, name, kmax, kmers)
        print(f'Number of kmers (kmin-{kmax-2}/kmax-{kmax}) from {name}: {len(cnt)}\n')
        # k = kmax
        # k_mers = len(expected)
        # pos = seq_len - k + 1
        # all_mers = 4 ** k
        # mis = (4 ** k) - k_mers
        # rep = ((seq_len - 1) - k + 1) - k_mers
        # with open(f'Results/{name}_kmers{kmax}.txt', 'w') as fh:
        #     fh.write('k\tkmers\t4^k\tpositions\tmissing\trepeated\n')
        #     fh.write(f'{k}\t{k_mers}\t{all_mers}\t{pos}\t{mis}\t{rep}\n')
        # increment the file counter
        cnt_files += 1
    # the final time
    end = time()
    # print some info
    print(colored(f"Total number of files: {cnt_files}.\n", attrs=['bold']))
    print(colored(f'Total time for the script: {round(end - start, 2)}.',
                  'red', attrs=['bold']))
    print(colored('Done!', 'green', attrs=['bold']))
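# The statistical helpers (get_expected_higher_markov, get_variance,
# get_standard_deviation, z_scores, ...) are imported from elsewhere in the
# project. A minimal sketch of the z-score step only, assuming expected counts and
# standard deviations are dicts keyed by k-mer and cnt holds the observed counts;
# the exact data structures are assumptions.
def z_scores_sketch(expected, observed, std):
    """Hypothetical sketch: z = (observed - expected) / std for each k-mer."""
    return {kmer: (observed.get(kmer, 0) - exp) / std[kmer] if std[kmer] else 0.0
            for kmer, exp in expected.items()}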
def main():
    # record the starting time of the script
    start = time()
    # check the current working directory and print it
    cwd = os.getcwd()
    print(colored(f'\nThe working directory: {cwd}\n', 'green', attrs=['bold']))
    # parse the command-line arguments
    args = parse_arguments()
    # name of the input directory, e.g., Data/Genomes_splitted
    dir_in = args.dir_in
    # name of the root directory to save the final result
    dir_out = args.dir_out
    # path and name of the text file with the patterns
    pattern_file = args.pattern_file
    # get the paths to all files in the input directory,
    # e.g., Data/Genomes_splitted
    all_files = get_files(dir_in)
    # get all patterns
    all_patterns = read_patterns(pattern_file)
    # check if the output directory exists, otherwise create it
    if os.path.exists(dir_out):
        print(colored('The directory to save the files already exists!',
                      'red', attrs=['bold']))
    else:
        make_me_a_folder(dir_out)
    # initialize the file counter
    num_files = 0
    # iterate through the file paths and report progress
    for filen in all_files:
        name = filen.split('/')[2]
        data = filen.split('/')[3]
        print(colored(f"Working with {data} from genus/species {name}",
                      attrs=['bold']))
        # run the search
        for n, seq in parse_fasta(filen):
            print(f'Start counting the restriction enzyme cut sites in the sequence {n}')
            cut_sites = all_re_cut_sites(seq, all_patterns)
            df = pd.DataFrame(cut_sites, columns=['site', 'positions'])
            full_path = os.path.join(dir_out, name, 'RE_cuts')
            file_name = f'{n}_{data}_re_cuts.csv'
            if not os.path.exists(full_path):
                os.makedirs(full_path)
            print(f'Saving the files in {full_path}\n')
            df.to_csv(f'{full_path}/{file_name}', index=False)
        # count the files analyzed
        num_files += 1
    # the final time
    end = time()
    # print some info
    print(colored(f"Total number of files analyzed: {num_files}.\n",
                  attrs=['bold']))
    print(colored(f'Total time for the script: {round(end - start, 2)}.',
                  'red', attrs=['bold']))
    print(colored('Done!', 'green', attrs=['bold']))
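# all_re_cut_sites is part of the project's pattern-matching utilities; a minimal
# sketch of the search assumed above, returning (site, positions) pairs for each
# recognition pattern found in the sequence. This sketch uses plain regex matching
# and does not handle IUPAC ambiguity codes or overlapping matches, which the real
# helper may do.
import re


def all_re_cut_sites_sketch(sequence, patterns):
    """Hypothetical sketch: start positions of each recognition pattern."""
    results = []
    for pattern in patterns:
        positions = [match.start() for match in re.finditer(pattern, sequence)]
        results.append((pattern, positions))
    return results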
def main(): """Parses options from the command line. Computes the k-mers to test (either palindromes or all k-mers). Computes the counts of k-mers in fasta files, and add the reverse complements of the sequence data to the counts. Computes the k-mers/palindromes statistics (expected value, z-scores and e-values), And if definide by user prints the results to stdout, else save to a csv file. """ cwd = os.getcwd() print(f'The working directory: {cwd}\n') start_time = time.process_time() opt = parse_arguments() dir_name = opt.path filenames = get_files(dir_name) outfile = opt.output dir_out = opt.dir_out if os.path.exists(dir_out): pass else: make_me_a_folder(dir_out) cnt_files = 0 for filename in filenames: for name, seq in fasta_parser.parse_fasta(filename): name = fasta_parser.str_punctuation_strip(name) n_name = '_'.join(name[0:3] + name[-3:]) seq = seq len_seq = len(seq) - count_umbiguous_bases(seq) if opt.kmer: kmer_counts = count_kmers(seq, opt.alphabet, opt.min_k - 2, opt.max_k) kmer_list = get_all_possible_kmers(opt.alphabet, opt.min_k, opt.max_k) kmer_freqs = kmers_frequencies(kmer_counts) kmer_expected = get_expected_values(kmer_list, kmer_counts) kmer_zscores = get_z_scores(kmer_list, kmer_counts, kmer_expected, len_seq) kmer_pvalues = get_pvalues(kmer_list, kmer_zscores) kmer_evalues = get_evalues(kmer_list, kmer_pvalues) kmer_scores = get_scores(kmer_list, kmer_counts, kmer_expected) kmer_nscores = get_new_scores(kmer_list, kmer_counts, kmer_expected) kmer_odds_ratio = get_odds_ratio(kmer_list, kmer_freqs) kmer_diff = get_difference(kmer_list, kmer_counts, kmer_expected) kmer_lod = get_log_odds(kmer_list, kmer_counts, kmer_expected) kmer_data = get_kmer_statistics(kmer_list, kmer_counts, kmer_expected, kmer_zscores, kmer_evalues, kmer_odds_ratio, kmer_diff, kmer_scores, kmer_nscores, kmer_lod) print_results_stats(n_name, kmer_list, len_seq, opt.min_k, opt.max_k, opt.max_e, kmer_data) df = pd.DataFrame(kmer_data, columns=[ "kmer", "Observed", "Expected", "Z_score", "Evalues", "Odds", "Diff", "Scores", "NScores", "Log_odds" ]) df.to_csv(f"{dir_out}/{n_name}_{opt.max_k}_all_kmer_stats.csv") with open(f"{dir_out}/{n_name}_{opt.max_k}_kmer_counts.csv", 'w') as fout: fout.write('Kmer,Counts\n') for kmer, count in kmer_counts.items(): fout.write(kmer + "," + str(count) + "\n") if opt.pal: n = len_seq pal_list = list( get_palindromes(opt.alphabet, opt.min_k, opt.max_k)) # counts = counts of the kmers/palindromes with min_k-2 <= k <= max_k pal_counts = count_kmers(seq, opt.alphabet, opt.min_k - 2, opt.max_k) # as palindromes are the need to count both strands rev_strand_cnt = dict((get_reverse_complement(kmer), cnt) for kmer, cnt in pal_counts.items()) for kmer, cnt in rev_strand_cnt.items(): pal_counts[kmer] += cnt n *= 2 pal_freqs = kmers_frequencies(pal_counts) pal_expected = get_expected_values(pal_list, pal_counts) pal_zscores = get_z_scores(pal_list, pal_counts, pal_expected, len_seq) pal_pvalues = get_pvalues(pal_list, pal_zscores) pal_evalues = get_evalues(pal_list, pal_pvalues) pal_scores = get_scores(pal_list, pal_counts, pal_expected) pal_nscores = get_new_scores(pal_list, pal_counts, pal_expected) pal_odds_ratio = get_odds_ratio(pal_list, pal_freqs) pal_diff = get_difference(pal_list, pal_counts, pal_expected) pal_lod = get_log_odds(pal_list, pal_counts, pal_expected) pal_data = get_kmer_statistics(pal_list, pal_counts, pal_expected, pal_zscores, pal_evalues, pal_odds_ratio, pal_diff, pal_scores, pal_nscores, pal_lod) print_results_stats(n_name, pal_list, len_seq, opt.min_k, 
opt.max_k, opt.max_e, pal_data) df = pd.DataFrame(pal_data, columns=[ "pal", "Observed", "Expected", "Z_score", "Evalues", "Odds", "Diff", "Scores", "NScores", "Log_odds" ]) df.to_csv(f"{dir_out}/{n_name}_{opt.max_k}_all_pal_stats.csv") with open( f"{dir_out}/{n_name}_{opt.max_k}_palindrome_counts.csv", 'w') as fout: fout.write('Palindrome,Counts\n') for pal, count in pal_counts.items(): fout.write(pal + "," + str(count) + "\n") if opt.all: kmer_counts = count_kmers(seq, opt.alphabet, opt.min_k - 2, opt.max_k) kmer_list = get_all_possible_kmers(opt.alphabet, opt.min_k, opt.max_k) kmer_freqs = kmers_frequencies(kmer_counts) kmer_expected = get_expected_values(kmer_list, kmer_counts) kmer_zscores = get_z_scores(kmer_list, kmer_counts, kmer_expected, len_seq) kmer_pvalues = get_pvalues(kmer_list, kmer_zscores) kmer_evalues = get_evalues(kmer_list, kmer_pvalues) kmer_scores = get_scores(kmer_list, kmer_counts, kmer_expected) kmer_nscores = get_new_scores(kmer_list, kmer_counts, kmer_expected) kmer_odds_ratio = get_odds_ratio(kmer_list, kmer_freqs) kmer_diff = get_difference(kmer_list, kmer_counts, kmer_expected) kmer_lod = get_log_odds(kmer_list, kmer_counts, kmer_expected) kmer_data = get_kmer_statistics(kmer_list, kmer_counts, kmer_expected, kmer_zscores, kmer_evalues, kmer_odds_ratio, kmer_diff, kmer_scores, kmer_nscores, kmer_lod) get_dataframe_from_kmer_data(dir_out, outfile, opt.max_k, kmer_data) data_dict = defaultdict(list) for data in kmer_data: kmer = data[0] obs = data[1] exp = data[2] zscr = data[3] eval = data[4] data_dict[kmer] = data_dict.get( kmer, []) + [obs, exp, zscr, eval] with open(f'{dir_out}/{outfile}_all_kmers_z_scores.csv', 'w') as fout: fout.write('kmer, data\n') for kmer, data in data_dict.items(): fout.write(kmer + ',' + str(data) + '\n') if opt.slide: kmer_slide = get_kmer_count_slide_window( seq, opt.alphabet, opt.window, opt.step, opt.min_k, opt.max_k) df = pd.DataFrame.from_dict(kmer_slide).fillna(0.0) df.to_csv(f"{dir_out}/{n_name}_slide_window.csv") cnt_files += 1 end = time.process_time() total_time = end - start_time print(f'The script takes {total_time} to finish!') print(f'Where read and manipulated {cnt_files} files') print('Done!')
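# get_reverse_complement is imported from the project's sequence utilities; a
# minimal sketch of the behaviour assumed in the palindrome branch above
# (complement each base, then reverse the string). Handling of ambiguous bases is
# an assumption.
def get_reverse_complement_sketch(kmer):
    """Hypothetical sketch: reverse complement of a DNA k-mer."""
    complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    return ''.join(complement.get(base, 'N') for base in reversed(kmer.upper()))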
def main():
    # record the starting time of the script
    start = time()
    # check the current working directory and print it
    cwd = os.getcwd()
    print(colored(f'\nThe working directory: {cwd}\n', 'green', attrs=['bold']))
    # parse the command-line arguments
    args = parse_arguments()
    # name of the input directory, e.g., Data/Genomes_splitted
    dir_in = args.dir_in
    # name of the sub directory to save the final result
    # (Chromosomes/Plasmids)
    sub_dir = args.sub_dir
    # sub-sub directory name, e.g., kmers/palindromes
    sub_sub_dir = args.sub_sub_dir
    # name of the root directory to save the final result
    dir_out = args.dir_out
    # minimum k-mer length
    kmin = args.kmin
    # maximum k-mer length
    kmax = args.kmax
    # extension of the fasta files
    extention = args.extention
    # alphabet
    alphabet = iupac_dna
    # get the paths to all files in the input directory,
    # e.g., Data/Genomes_splitted, Chromosomes, gz
    fasta_dict = get_all_fasta(dir_in, sub_dir, extention)
    # check if the output directory exists, otherwise create it
    if os.path.exists(dir_out):
        print(colored('The directory to save the files already exists!',
                      'red', attrs=['bold']))
    else:
        make_me_a_folder(dir_out)
    # initialize the file counter
    cnt_files = 0
    # iterate through the file paths and report progress
    for name in fasta_dict.keys():
        print(colored(f"Start working with genus {name}\n", attrs=['bold']))
        # count the k-mers
        cnt = count_n_grams_fasta(fasta_dict, name, alphabet, kmin, kmax)
        print(f'Number of kmers (kmin-{kmin}/kmax-{kmax}) from {name}: {len(cnt)}.\n')
        # build the full path to save the final csv file,
        # e.g., Results/genus/Chromosomes/kmers{k}/ids{k}.csv
        full_path = os.path.join(dir_out, name, sub_dir, sub_sub_dir)
        # create the output path if it does not exist
        if not os.path.exists(full_path):
            os.makedirs(full_path)
        # name of the file to be saved
        csv_name = f'{name}_{sub_dir}{kmax}.csv'
        # write the k-mer counts to a compressed csv file
        df = pd.DataFrame(cnt.items(), columns=['kmer', 'count'])
        df.to_csv(f'{full_path}/{csv_name}.gz', index=False, compression='gzip')
        # increment the file counter
        cnt_files += 1
    # the final time
    end = time()
    # print some info
    print(colored(f"Total number of files: {cnt_files}.\n", attrs=['bold']))
    print(colored(f'Total time for the script: {round(end - start, 2)}.',
                  'red', attrs=['bold']))
    print(colored('Done!', 'green', attrs=['bold']))
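# count_n_grams_fasta is defined elsewhere in the project; a minimal sketch of the
# counting step it is assumed to perform for a single sequence, tallying every
# overlapping k-mer for all k in [kmin, kmax] (the real helper also reads the fasta
# files for a genus and may filter by alphabet).
from collections import Counter


def count_kmers_in_sequence_sketch(sequence, kmin, kmax):
    """Hypothetical sketch: counts of all overlapping k-mers, kmin <= k <= kmax."""
    counts = Counter()
    for k in range(kmin, kmax + 1):
        for i in range(len(sequence) - k + 1):
            counts[sequence[i:i + k]] += 1
    return counts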
def main():
    # record the starting time of the script
    start = time()
    # check the current working directory and print it
    cwd = os.getcwd()
    print(colored(f'\nThe working directory: {cwd}\n', 'green', attrs=['bold']))
    # parse the command-line arguments
    args = parse_arguments()
    # name of the input directory, e.g., Data/Genomes_splitted
    dir_in = args.dir_in
    # name of the sub directory to save the final result
    # (Chromosomes/Plasmids)
    sub_dir = args.sub_dir
    # name of the root directory to save the final result
    dir_out = args.dir_out
    # sub-sub directory name
    sub_sub_dir = args.sub_sub_dir
    # extension of the fasta files
    extension = args.extension
    # get the paths to all files in the input directory,
    # e.g., Data/Genomes_splitted, Chromosomes, gz
    fasta_dict = get_all_fasta(dir_in, sub_dir, extension)
    # check if the output directory exists, otherwise create it
    if os.path.exists(dir_out):
        print(colored('The directory to save the files already exists!',
                      'red', attrs=['bold']))
    else:
        make_me_a_folder(dir_out)
    # initialize the genus/species counter
    cnt_spc = 0
    # iterate through the file paths and report progress
    for name in fasta_dict.keys():
        print(colored(f"Start working with genus {name}\n", attrs=['bold']))
        # get the GC count
        gc = gc_count_fasta(fasta_dict, name)
        # make a series
        gc_series = pd.Series(gc, index=["GC"]).reset_index()
        # create a data frame
        df_gc = pd.DataFrame(gc_series).rename(columns={'index': 'bases', 0: 'counts'})
        # count the base composition
        bases = count_bases_fasta(fasta_dict, name)
        # create a data frame
        df = pd.DataFrame(bases.items(), columns=['bases', 'counts'])
        # concatenate the two data frames
        df_final = pd.concat([df, df_gc])
        # get the GC content in a sliding window
        window = genome_stats_in_windows(fasta_dict, name, as_overlap=False, k=3000)
        # create a data frame
        dfw = pd.DataFrame(window, columns=['GC_window', 'GC_content'])
        # save the final results as csv files
        full_path = os.path.join(dir_out, name, 'BasicStats')
        file_name = f'{name}'
        if not os.path.exists(full_path):
            os.makedirs(full_path)
        df_final.to_csv(f'{full_path}/{file_name}_{sub_sub_dir}_basic_stats.csv', index=False)
        dfw.to_csv(f'{full_path}/{file_name}_{sub_sub_dir}_gc_window.csv', index=False)
        # count the genus/species analyzed
        cnt_spc += 1
    # the final time
    end = time()
    # print some info
    print(colored(f"Total number of genus/species analyzed: {cnt_spc}.\n",
                  attrs=['bold']))
    print(colored(f'Total time for the script: {round(end - start, 2)}.',
                  'red', attrs=['bold']))
    print(colored('Done!', 'green', attrs=['bold']))
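# genome_stats_in_windows is imported from the project; a minimal sketch of the
# non-overlapping GC-content-per-window calculation assumed above, returning
# (window_start, gc_percent) pairs. The return shape and the handling of the final
# partial window are assumptions.
def gc_content_in_windows_sketch(sequence, window=3000):
    """Hypothetical sketch: GC percentage in consecutive non-overlapping windows."""
    results = []
    for start in range(0, len(sequence), window):
        chunk = sequence[start:start + window].upper()
        if not chunk:
            continue
        gc = chunk.count('G') + chunk.count('C')
        results.append((start, 100 * gc / len(chunk)))
    return results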
def main():
    # record the starting time of the script
    start = time()
    # check the current working directory and print it
    cwd = os.getcwd()
    print(colored(f'\nThe working directory: {cwd}\n', 'green', attrs=['bold']))
    # parse the command-line arguments
    args = parse_arguments()
    # name of the input directory, e.g., Data/Genomes_splitted
    dir_in = args.dir_in
    # name of the sub directory to save the final result
    # (Chromosomes/Plasmids)
    sub_dir = args.sub_dir
    # sub-sub directory name, e.g., kmers/palindromes
    sub_sub_dir = args.sub_sub_dir
    # name of the root directory to save the final result
    dir_out = args.dir_out
    # alphabet
    alphabet = iupac_dna
    # get the paths to all files in the input directory
    filenames = get_fasta_files(dir_in)
    # check if the output directory exists, otherwise create it
    if os.path.exists(dir_out):
        print(colored('The directory to save the files already exists!',
                      'red', attrs=['bold']))
    else:
        make_me_a_folder(dir_out)
    # initialize the file counter
    cnt_files = 0
    # iterate through the file paths and report progress
    for filename in filenames:
        print(colored(f"File: {filename}", attrs=['bold']))
        # Data/Genomes_splitted/Genus
        # name of the taxon directory, e.g., Acidisarcina
        genus = filename.split('/')[2]
        # read in the sequences and ids
        for seq_id, sequence in parse_fasta(filename):
            # get the sequence length
            seq_len = len(sequence)
            print(f'Sequence length {seq_len}.')
            bases = count_all_bases(sequence)
            # Results/Genus/Bases
            path = os.path.join(dir_out, genus, sub_dir, sub_sub_dir)
            if not os.path.exists(path):
                os.makedirs(path)
            print(f'Saving the results in {path}\n')
            base_content_slide_window(sequence, path, seq_id, alphabet, 5000, 500,
                                      plot=True)
            with open(f'{path}/{seq_id}_bases.csv', 'w') as fout:
                fout.write('base,count\n')
                for base, cnt in bases.items():
                    fout.write(base + ',' + str(cnt) + '\n')
        cnt_files += 1
    # the final time
    end = time()
    # print some info
    print(colored(f"Total number of files: {cnt_files}.\n", attrs=['bold']))
    print(colored(f'Total time for the script: {round(end - start, 2)}.',
                  'red', attrs=['bold']))
    print(colored('Done!', 'green', attrs=['bold']))
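# count_all_bases is imported from the project's sequence utilities; a minimal
# sketch of the behaviour assumed above (a simple per-base tally over the
# sequence, ambiguous bases included).
from collections import Counter


def count_all_bases_sketch(sequence):
    """Hypothetical sketch: counts of each base symbol in the sequence."""
    return Counter(sequence.upper())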