def main():
    # record the start time of the script
    start = time()
    # checking the current directory and printing it
    cwd = os.getcwd()
    print(colored(f'\nThe working directory: {cwd}\n', 'green',
                  attrs=['bold']))
    # parse the command-line arguments
    args = parse_arguments()
    # name of the input directory, ex. Data/Genomes_splitted
    dir_in = args.dir_in
    # name of the sub directory to save the final result
    # Chromosomes/Plasmids
    sub_dir = args.sub_dir
    # sub_sub dir name, ex., kmers/palindromes
    sub_sub_dir = args.sub_sub_dir
    # name of the root directory to save the final result
    dir_out = args.dir_out
    # minimum kmer length
    kmin = args.kmin
    # maximum kmer length
    kmax = args.kmax
    # get the csv file with genome lengths
    csv_filename = args.csv_filename
    # e-value cutoff
    eval_cutoff = args.eval_cutoff
    # check if the output directory exists, otherwise create it
    if os.path.exists(dir_out):
        print(
            colored('The directory to save the files already exists!',
                    'red',
                    attrs=['bold']))
        pass
    else:
        make_me_a_folder(dir_out)

    # Results/Lengths/chr_lengths.csv
    seq_len_dict = get_len_csv(csv_filename)
    # get the genus/species names
    names = seq_len_dict.keys()
    # get the csv files
    csv_files = get_paths_to_csv_counts(dir_in, sub_dir, sub_sub_dir, names)
    # get the kmer list
    kmer_list = get_all_possible_kmers(iupac_dna, kmin, kmax)
    # get all the stats and save it
    get_kmer_stats(seq_len_dict, csv_files, kmer_list, dir_out, sub_dir,
                   sub_sub_dir, kmax, eval_cutoff)
    # the final time
    end = time()
    # print some info
    print(
        colored(f'Total time for the script: {round(end - start, 2)}.',
                'red',
                attrs=['bold']))
    print(colored('Done!', 'green', attrs=['bold']))
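The helpers called above (parse_arguments, make_me_a_folder, get_all_possible_kmers, get_kmer_stats, ...) live elsewhere in the project and are not shown on this page. As a rough reference, here is a minimal sketch of what get_all_possible_kmers could look like, assuming iupac_dna is the plain 'ACGT' alphabet; the project's actual implementation may differ.

# Hypothetical sketch (not the project's actual code):
# enumerate every k-mer over the alphabet for kmin <= k <= kmax.
from itertools import product

iupac_dna = 'ACGT'  # assumption: unambiguous DNA alphabet

def get_all_possible_kmers(alphabet, kmin, kmax):
    """Return a list of all k-mers of length kmin..kmax over alphabet."""
    kmers = []
    for k in range(kmin, kmax + 1):
        kmers.extend(''.join(p) for p in product(alphabet, repeat=k))
    return kmers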
Example #2
def main():
    start = time.time()
    cwd = os.getcwd()

    print(colored(f'\nThe working directory: {cwd}\n', attrs=['bold']))

    opt = parse_arguments()
    dir_name = opt.path
    filenames = get_fasta_files(dir_name)
    outfile = opt.outfile.split('/')
    dir_out = opt.dir_out

    if os.path.exists(dir_out):
        pass
    else:
        make_me_a_folder(dir_out)

    cnt_files = 0
    for filename in filenames:
        print(colored(f"Results for file in: {filename}", attrs=['bold']))
        # name of the taxon directory, i.e., Acidisarcina
        genus = filename.split('/')[2]
        print(colored(f"Results for file: {genus}", attrs=['bold']))
        plasmids, chromosome = split_sequences_from_fasta_file(filename)
        # checking the data obtained
        le_pl, le_ch = len(list(plasmids.values())), len(
            list(chromosome.values()))
        print(colored(f"Results for plasmids data: {le_pl}", attrs=['bold']))
        print(colored(f"Results for chromosome data: {le_ch}", attrs=['bold']))
        plasm_names = plasmids.keys()
        full_path_plasmids = os.path.join(dir_out, genus, outfile[0])
        # create the output path for the plasmids if it does not exist
        if not os.path.exists(full_path_plasmids):
            os.makedirs(full_path_plasmids)
        # saving the data
        for name in plasm_names:
            write_fasta_file(plasmids, f'{full_path_plasmids}/{name}.fna')

        chromosome_names = chromosome.keys()
        full_path_chromosome = os.path.join(dir_out, genus, outfile[1])
        # create the output path for the chromosomes if it does not exist
        if not os.path.exists(full_path_chromosome):
            os.makedirs(full_path_chromosome)
        # saving the data
        for name in chromosome_names:
            write_fasta_file(chromosome, f'{full_path_chromosome}/{name}.fna')
            cnt_files += 1

    end = time.time()

    print(colored(f"Total number of files: {cnt_files}", attrs=['bold']))
    print(colored(f'Total time for the script: {end - start}', attrs=['bold']))
    print(colored('Done', attrs=['bold']))
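split_sequences_from_fasta_file is not shown here. A minimal sketch of one possible implementation follows, assuming records are classified as plasmids when the word 'plasmid' appears in the FASTA header and as chromosome otherwise; the real helper may use different criteria or return types.

# Hypothetical sketch (classification rule is an assumption):
def split_sequences_from_fasta_file(filename):
    """Return (plasmids, chromosome) dicts mapping record id -> sequence."""
    plasmids, chromosome = {}, {}
    header, seq = None, []
    with open(filename) as handle:
        for line in handle:
            line = line.strip()
            if line.startswith('>'):
                if header is not None:
                    target = plasmids if 'plasmid' in header.lower() else chromosome
                    target[header.split()[0].lstrip('>')] = ''.join(seq)
                header, seq = line, []
            else:
                seq.append(line)
        if header is not None:
            target = plasmids if 'plasmid' in header.lower() else chromosome
            target[header.split()[0].lstrip('>')] = ''.join(seq)
    return plasmids, chromosome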
Example #3
def main():
    # record the start time of the script
    start = time()
    # checking the current directory and printing it
    cwd = os.getcwd()
    print(colored(f'\nThe working directory: {cwd}\n', 'green',
                  attrs=['bold']))
    # parse the command-line arguments
    args = parse_arguments()
    # name of the input directory, ex. Data/Genomes_splitted
    dir_in = args.dir_in
    # name of the sub directory to save the final result
    # Chromosomes/Plasmids
    sub_dir = args.sub_dir
    # name of the root directory to save the final result
    dir_out = args.dir_out
    # extension type for fasta
    extension = args.extension
    # get the fasta files
    filenames = glob.glob(f'{dir_in}/*/{sub_dir}/*.{extension}')
    print(f"The number of files is {len(filenames)}")
    print(f'{filenames[0]}')
    # check if the output directory exists, otherwise create it
    if os.path.exists(dir_out):
        print(
            colored('The directory to save the files already exists!',
                    'red',
                    attrs=['bold']))
        pass
    else:
        make_me_a_folder(dir_out)
    data_len = get_genome_length(filenames)
    print('Calculating the genome mean')
    data = get_mean_genome_lengths(data_len)
    df = pd.DataFrame(data.items(), columns=['Name', 'Length'])
    file_name = f'All_{sub_dir}_length'
    full_path = os.path.join('Results', 'Length')
    if not os.path.exists(full_path):
        os.makedirs(full_path)
    df.to_csv(f'{full_path}/{file_name}.csv', index=False)
    # the final time
    end = time()
    # print some info
    print(
        colored(f"Total number of genus/species analyzed: {len(data)}\n.",
                attrs=['bold']))
    print(
        colored(
            f'Total time for the script: {round(end - start, 2)}.',
            'red',
            attrs=['bold']))
    print(colored('Done!', 'green', attrs=['bold']))
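get_genome_length and get_mean_genome_lengths are project helpers not shown on this page. A minimal sketch under stated assumptions (the genus name is the third path component, e.g. Data/Genomes_splitted/<genus>/..., and a file's length is its total number of sequence characters):

# Hypothetical sketch (path layout and length definition are assumptions):
import gzip
from collections import defaultdict
from statistics import mean

def get_genome_length(filenames):
    """Map each genus to a list of genome lengths (one per file)."""
    lengths = defaultdict(list)
    for filename in filenames:
        genus = filename.split('/')[2]
        opener = gzip.open if filename.endswith('.gz') else open
        with opener(filename, 'rt') as handle:
            total = sum(len(line.strip()) for line in handle
                        if not line.startswith('>'))
        lengths[genus].append(total)
    return lengths

def get_mean_genome_lengths(data_len):
    """Average the per-genus lengths."""
    return {genus: mean(vals) for genus, vals in data_len.items()}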
Example #4
def main():
    # record the start time of the script
    start = time()
    # checking the current directory and printing it
    cwd = os.getcwd()
    print(colored(f'\nThe working directory: {cwd}\n', 'green',
                  attrs=['bold']))
    # parse the command-line arguments
    args = parse_arguments()
    # name of the input directory, ex. Data/Genomes_splitted
    dir_in = args.dir_in
    # name of the sub directory to save the final result
    # Chromosomes/Plasmids
    sub_dir = args.sub_dir
    # sub_sub dir name, ex., kmers/palindromes
    sub_sub_dir = args.sub_sub_dir
    # name of the root directory to save the final result
    dir_out = args.dir_out
    # minimum kmer length
    kmin = args.kmin
    # maximum kmer length
    kmax = args.kmax
    # extension type for fasta
    extension = args.extension
    # alphabet
    alphabet = iupac_dna
    # get the list of all paths to the files in the input directory
    # ex., Data/Genomes_splitted, Chromosomes, gz
    fasta_dict = get_all_fasta(dir_in, sub_dir, extension)
    # check if the output directory exists, otherwise create it
    if os.path.exists(dir_out):
        print(
            colored('The directory to save the files already exists!',
                    'red',
                    attrs=['bold']))
        pass
    else:
        make_me_a_folder(dir_out)

    # initialize the file counter
    cnt_files = 0
    # iterate over the genera and report which one is being processed
    for name in fasta_dict.keys():
        print(colored(f"Start working with genus {name}\n", attrs=['bold']))
        # initialize the kmers counts
        cnt, seq_len = count_k_mers_fasta(fasta_dict,
                                          name,
                                          alphabet,
                                          kmax - 2,
                                          kmax,
                                          overlap=kmax,
                                          nprocs=4)
        # get the k-mer list for the analysis, k = 6
        kmer_list = get_all_possible_kmers(alphabet, kmin, kmax)
        # calculating the expected number for all k-mers
        expected = get_expected_higher_markov(kmer_list, cnt)
        # get the expected count variance
        variance = get_variance(kmer_list, seq_len, expected)
        # get the standard deviation
        std = get_standard_deviation(variance)
        # getting the z-scores
        z_scrs = z_scores(expected, cnt, std)
        # get the p-values from k-mers
        pvals = get_p_values(z_scrs)
        # get the k-mers e-values
        evals = get_e_values(kmer_list, pvals)
        # saving the final results as a csv file
        kmers = get_kmer_data(kmer_list, cnt, expected, z_scrs, evals, pvals)
        save_data_frame_kmers(dir_out, sub_dir, sub_sub_dir, name, kmax, kmers)
        print(
            f'Number of kmer (kmin-{kmax-2}/kmax-{kmax}) from {name}: {len(cnt)}\n'
        )
        # k = kmax
        # k_mers = len(expected)
        # pos = seq_len - k + 1
        # all_mers = 4 ** k
        # mis = (4 ** k) - k_mers
        # rep = ((seq_len - 1) - k + 1) - k_mers
        # with open(f'Results/{name}_kmers{kmax}.txt', 'w') as fh:
        #     fh.write('k\tkmers\t4^k\tpositions\tmissing\trepeated\n')
        #     # (k, len(kmers), 4**k, (len(seq[0])-1)-k+1, 4**k-len(kmers), (len(seq[0])-1)-k+1-len(kmers))
        #     fh.write(f'{k}\t{k_mers}\t{all_mers}\t{pos}\t{mis}\t{rep}\n')
        # increment the file counter
        cnt_files += 1
    # the final time
    end = time()
    # print some info
    print(colored(f"Total number of files: {cnt_files}\n.", attrs=['bold']))
    print(
        colored(f'Total time for the script: {round(end - start, 2)}.',
                'red',
                attrs=['bold']))
    print(colored('Done!', 'green', attrs=['bold']))
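The statistics helpers (z_scores, get_p_values, get_e_values) are not shown. Assuming the conventional definitions, z = (observed - expected) / standard deviation, a two-tailed normal p-value, and an e-value equal to the p-value multiplied by the number of k-mers tested, a sketch could look like this; the project's own functions may differ in detail.

# Hypothetical sketch (per-k-mer dicts and two-tailed test are assumptions):
from scipy.stats import norm

def z_scores(expected, observed, std):
    """std is assumed to be a per-k-mer dict of standard deviations."""
    return {kmer: (observed[kmer] - exp) / std[kmer]
            for kmer, exp in expected.items()}

def get_p_values(z_scrs):
    """Two-tailed p-value under a standard normal model."""
    return {kmer: 2 * norm.sf(abs(z)) for kmer, z in z_scrs.items()}

def get_e_values(kmer_list, pvals):
    """Bonferroni-style e-value: p-value times the number of k-mers tested."""
    n_tests = len(kmer_list)
    return {kmer: pvals[kmer] * n_tests for kmer in kmer_list}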
def main():
    # record the start time of the script
    start = time()
    # checking the current directory and printing it
    cwd = os.getcwd()
    print(colored(f'\nThe working directory: {cwd}\n', 'green',
                  attrs=['bold']))
    # parse the command-line arguments
    args = parse_arguments()
    # name of the input directory, ex. Data/Genomes_splitted
    dir_in = args.dir_in
    # name of the root directory to save the final result
    dir_out = args.dir_out
    # path and name of the text file with the patterns
    pattern_file = args.pattern_file
    # get the list of all paths to the files in the input directory
    # ex., Data/Genomes_splitted
    all_files = get_files(dir_in)
    # get all patterns
    all_patterns = read_patterns(pattern_file)
    # check if the output directory exists, otherwise create it
    if os.path.exists(dir_out):
        print(
            colored('The directory to save the files already exists!',
                    'red',
                    attrs=['bold']))
        pass
    else:
        make_me_a_folder(dir_out)

    # initialize the file counter
    num_files = 0
    # iterate over the files and report which one is being processed
    for filen in all_files:
        name = filen.split('/')[2]
        data = filen.split('/')[3]
        print(
            colored(f"Working with {data} from genus/species {name}",
                    attrs=['bold']))
        # run the pattern search on each sequence
        for n, seq in parse_fasta(filen):
            print(
                f'Start counting the restriction enzymes cut sites in the sequence {n}'
            )
            cut_sites = all_re_cut_sites(seq, all_patterns)
            df = pd.DataFrame(cut_sites, columns=['site', 'positions'])
            full_path = os.path.join(dir_out, name, 'RE_cuts')
            file_name = f'{n}_{data}_re_cuts.csv'
            if not os.path.exists(full_path):
                os.makedirs(full_path)
            print(f'Saving the files in {full_path}\n')
            df.to_csv(f'{full_path}/{file_name}', index=False)
        # the number of files analyzed
        num_files += 1
    # the final time
    end = time()
    # print some info
    print(
        colored(f"Total number of files analyzed: {num_files}\n.",
                attrs=['bold']))
    print(
        colored(
            f'Total time for the script: {round(end - start, 2)}.',
            'red',
            attrs=['bold']))
    print(colored('Done!', 'green', attrs=['bold']))
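all_re_cut_sites is not shown here. Given that its output feeds a DataFrame with 'site' and 'positions' columns, a plausible sketch is a regex scan per pattern; this assumes each restriction site is a literal string, which may not match how the project handles ambiguous bases.

# Hypothetical sketch (literal site strings are an assumption):
import re

def all_re_cut_sites(seq, patterns):
    """Return [(site, [start positions]), ...] for every pattern found in seq."""
    results = []
    for site in patterns:
        # lookahead so overlapping occurrences are also reported
        positions = [m.start() for m in re.finditer(f'(?={re.escape(site)})', seq)]
        results.append((site, positions))
    return results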
def main():
    """Parses options from the command line.
    Computes the k-mers to test (either palindromes or all k-mers).
    Computes the counts of k-mers in fasta files, and add the reverse complements
    of the sequence data to the counts.
    Computes the k-mers/palindromes statistics (expected value, z-scores and e-values),
    And if definide by user prints the results to stdout, else save to a csv file.
    """
    cwd = os.getcwd()
    print(f'The working directory: {cwd}\n')
    start_time = time.process_time()
    opt = parse_arguments()
    dir_name = opt.path
    filenames = get_files(dir_name)
    outfile = opt.output
    dir_out = opt.dir_out
    if os.path.exists(dir_out):
        pass
    else:
        make_me_a_folder(dir_out)
    cnt_files = 0
    for filename in filenames:
        for name, seq in fasta_parser.parse_fasta(filename):
            name = fasta_parser.str_punctuation_strip(name)
            n_name = '_'.join(name[0:3] + name[-3:])
            len_seq = len(seq) - count_umbiguous_bases(seq)
            if opt.kmer:
                kmer_counts = count_kmers(seq, opt.alphabet, opt.min_k - 2,
                                          opt.max_k)
                kmer_list = get_all_possible_kmers(opt.alphabet, opt.min_k,
                                                   opt.max_k)
                kmer_freqs = kmers_frequencies(kmer_counts)
                kmer_expected = get_expected_values(kmer_list, kmer_counts)
                kmer_zscores = get_z_scores(kmer_list, kmer_counts,
                                            kmer_expected, len_seq)
                kmer_pvalues = get_pvalues(kmer_list, kmer_zscores)
                kmer_evalues = get_evalues(kmer_list, kmer_pvalues)
                kmer_scores = get_scores(kmer_list, kmer_counts, kmer_expected)
                kmer_nscores = get_new_scores(kmer_list, kmer_counts,
                                              kmer_expected)
                kmer_odds_ratio = get_odds_ratio(kmer_list, kmer_freqs)
                kmer_diff = get_difference(kmer_list, kmer_counts,
                                           kmer_expected)
                kmer_lod = get_log_odds(kmer_list, kmer_counts, kmer_expected)
                kmer_data = get_kmer_statistics(kmer_list, kmer_counts,
                                                kmer_expected, kmer_zscores,
                                                kmer_evalues, kmer_odds_ratio,
                                                kmer_diff, kmer_scores,
                                                kmer_nscores, kmer_lod)
                print_results_stats(n_name, kmer_list, len_seq, opt.min_k,
                                    opt.max_k, opt.max_e, kmer_data)
                df = pd.DataFrame(kmer_data,
                                  columns=[
                                      "kmer", "Observed", "Expected",
                                      "Z_score", "Evalues", "Odds", "Diff",
                                      "Scores", "NScores", "Log_odds"
                                  ])
                df.to_csv(f"{dir_out}/{n_name}_{opt.max_k}_all_kmer_stats.csv")
                with open(f"{dir_out}/{n_name}_{opt.max_k}_kmer_counts.csv",
                          'w') as fout:
                    fout.write('Kmer,Counts\n')
                    for kmer, count in kmer_counts.items():
                        fout.write(kmer + "," + str(count) + "\n")

            if opt.pal:
                n = len_seq
                pal_list = list(
                    get_palindromes(opt.alphabet, opt.min_k, opt.max_k))
                # counts = counts of the kmers/palindromes with min_k-2 <= k <= max_k
                pal_counts = count_kmers(seq, opt.alphabet, opt.min_k - 2,
                                         opt.max_k)
                # since these are palindromes, count hits on both strands
                rev_strand_cnt = dict((get_reverse_complement(kmer), cnt)
                                      for kmer, cnt in pal_counts.items())
                for kmer, cnt in rev_strand_cnt.items():
                    pal_counts[kmer] += cnt
                n *= 2
                pal_freqs = kmers_frequencies(pal_counts)
                pal_expected = get_expected_values(pal_list, pal_counts)
                pal_zscores = get_z_scores(pal_list, pal_counts, pal_expected,
                                           len_seq)
                pal_pvalues = get_pvalues(pal_list, pal_zscores)
                pal_evalues = get_evalues(pal_list, pal_pvalues)
                pal_scores = get_scores(pal_list, pal_counts, pal_expected)
                pal_nscores = get_new_scores(pal_list, pal_counts,
                                             pal_expected)
                pal_odds_ratio = get_odds_ratio(pal_list, pal_freqs)
                pal_diff = get_difference(pal_list, pal_counts, pal_expected)
                pal_lod = get_log_odds(pal_list, pal_counts, pal_expected)
                pal_data = get_kmer_statistics(pal_list, pal_counts,
                                               pal_expected, pal_zscores,
                                               pal_evalues, pal_odds_ratio,
                                               pal_diff, pal_scores,
                                               pal_nscores, pal_lod)
                print_results_stats(n_name, pal_list, len_seq, opt.min_k,
                                    opt.max_k, opt.max_e, pal_data)
                df = pd.DataFrame(pal_data,
                                  columns=[
                                      "pal", "Observed", "Expected", "Z_score",
                                      "Evalues", "Odds", "Diff", "Scores",
                                      "NScores", "Log_odds"
                                  ])
                df.to_csv(f"{dir_out}/{n_name}_{opt.max_k}_all_pal_stats.csv")
                with open(
                        f"{dir_out}/{n_name}_{opt.max_k}_palindrome_counts.csv",
                        'w') as fout:
                    fout.write('Palindrome,Counts\n')
                    for pal, count in pal_counts.items():
                        fout.write(pal + "," + str(count) + "\n")
            if opt.all:
                kmer_counts = count_kmers(seq, opt.alphabet, opt.min_k - 2,
                                          opt.max_k)
                kmer_list = get_all_possible_kmers(opt.alphabet, opt.min_k,
                                                   opt.max_k)
                kmer_freqs = kmers_frequencies(kmer_counts)
                kmer_expected = get_expected_values(kmer_list, kmer_counts)
                kmer_zscores = get_z_scores(kmer_list, kmer_counts,
                                            kmer_expected, len_seq)
                kmer_pvalues = get_pvalues(kmer_list, kmer_zscores)
                kmer_evalues = get_evalues(kmer_list, kmer_pvalues)
                kmer_scores = get_scores(kmer_list, kmer_counts, kmer_expected)
                kmer_nscores = get_new_scores(kmer_list, kmer_counts,
                                              kmer_expected)
                kmer_odds_ratio = get_odds_ratio(kmer_list, kmer_freqs)
                kmer_diff = get_difference(kmer_list, kmer_counts,
                                           kmer_expected)
                kmer_lod = get_log_odds(kmer_list, kmer_counts, kmer_expected)
                kmer_data = get_kmer_statistics(kmer_list, kmer_counts,
                                                kmer_expected, kmer_zscores,
                                                kmer_evalues, kmer_odds_ratio,
                                                kmer_diff, kmer_scores,
                                                kmer_nscores, kmer_lod)
                get_dataframe_from_kmer_data(dir_out, outfile, opt.max_k,
                                             kmer_data)
                data_dict = defaultdict(list)
                for data in kmer_data:
                    kmer = data[0]
                    obs = data[1]
                    exp = data[2]
                    zscr = data[3]
                    e_val = data[4]
                    data_dict[kmer] = data_dict.get(
                        kmer, []) + [obs, exp, zscr, e_val]
                with open(f'{dir_out}/{outfile}_all_kmers_z_scores.csv',
                          'w') as fout:
                    fout.write('kmer, data\n')
                    for kmer, data in data_dict.items():
                        fout.write(kmer + ',' + str(data) + '\n')

            if opt.slide:
                kmer_slide = get_kmer_count_slide_window(
                    seq, opt.alphabet, opt.window, opt.step, opt.min_k,
                    opt.max_k)
                df = pd.DataFrame.from_dict(kmer_slide).fillna(0.0)
                df.to_csv(f"{dir_out}/{n_name}_slide_window.csv")
        cnt_files += 1
    end = time.process_time()
    total_time = end - start_time
    print(f'The script took {total_time} seconds to finish!')
    print(f'Read and processed {cnt_files} files')
    print('Done!')
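get_palindromes and get_reverse_complement are not shown. A minimal sketch, assuming the alphabet is plain 'ACGT' and that a palindrome here means a k-mer equal to its own reverse complement (so only even k can qualify):

# Hypothetical sketch (alphabet and palindrome definition are assumptions):
from itertools import product

def get_reverse_complement(kmer):
    complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    return ''.join(complement[base] for base in reversed(kmer))

def get_palindromes(alphabet, min_k, max_k):
    """Yield every k-mer (min_k <= k <= max_k) equal to its reverse complement."""
    for k in range(min_k, max_k + 1):
        for mer in (''.join(p) for p in product(alphabet, repeat=k)):
            if mer == get_reverse_complement(mer):
                yield mer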
def main():
    # record the start time of the script
    start = time()
    # checking the current directory and printing it
    cwd = os.getcwd()
    print(colored(f'\nThe working directory: {cwd}\n', 'green',
                  attrs=['bold']))
    # parse the command-line arguments
    args = parse_arguments()
    # name of the input directory, ex. Data/Genomes_splitted
    dir_in = args.dir_in
    # name of the sub directory to save the final result
    # Chromosomes/Plasmids
    sub_dir = args.sub_dir
    # sub_sub dir name, ex., kmers/palindromes
    sub_sub_dir = args.sub_sub_dir
    # name of the root directory to save the final result
    dir_out = args.dir_out
    # minimum kmer length
    kmin = args.kmin
    # maximum kmer length
    kmax = args.kmax
    # extension type for fasta
    extention = args.extention
    # alphabet
    alphabet = iupac_dna
    # get the list of all paths to the files in the input directory
    # ex., Data/Genomes_splitted, Chromosomes, gz
    fasta_dict = get_all_fasta(dir_in, sub_dir, extention)
    # check if the output directory exists, otherwise create it
    if os.path.exists(dir_out):
        print(
            colored('The directory to save the files already exists!',
                    'red',
                    attrs=['bold']))
        pass
    else:
        make_me_a_folder(dir_out)

    # initialize the file counter
    cnt_files = 0
    # iterate over the genera and report which one is being processed
    for name in fasta_dict.keys():
        print(colored(f"Start working with genus {name}\n", attrs=['bold']))
        # initialize the k-mer counts
        cnt = count_n_grams_fasta(fasta_dict, name, alphabet, kmin, kmax)
        print(
            f'Number of kmer (kmin-{kmin}/kmax-{kmax}) from {name}: {len(cnt)}.\n'
        )
        # get the name of the full path to save the final csv file
        # Results/genus/Chromosomes/kmers{k}/ids{k}.csv
        full_path = os.path.join(dir_out, name, sub_dir, sub_sub_dir)
        # check whether the path to save the data exists;
        # if not, create it
        if not os.path.exists(full_path):
            os.makedirs(full_path)
        # name of the file to be saved
        csv_name = f'{name}_{sub_dir}{kmax}.csv'
        # build a data frame with the k-mer counts and save it as csv
        df = pd.DataFrame(cnt.items(), columns=['kmer', 'count'])
        df.to_csv(f'{full_path}/{csv_name}.gz',
                  index=False,
                  compression='gzip')
        # increment the file counter
        cnt_files += 1
    # the final time
    end = time()
    # print some info
    print(colored(f"Total number of files: {cnt_files}\n.", attrs=['bold']))
    print(
        colored(f'Total time for the script: {round(end - start, 2)}.',
                'red',
                attrs=['bold']))
    print(colored('Done!', 'green', attrs=['bold']))
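count_n_grams_fasta wraps a k-mer counting routine over the sequences in fasta_dict. The core counting step could be sketched as a simple sliding window like the one below; this is an assumption about the approach, not the project's actual code.

# Hypothetical sketch of the core counting step:
from collections import defaultdict

def count_kmers(seq, alphabet, kmin, kmax):
    """Count every k-mer with kmin <= k <= kmax, skipping non-alphabet bases."""
    allowed = set(alphabet)
    counts = defaultdict(int)
    for k in range(kmin, kmax + 1):
        for i in range(len(seq) - k + 1):
            kmer = seq[i:i + k]
            if set(kmer) <= allowed:
                counts[kmer] += 1
    return counts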
Example #8
def main():
    # record the start time of the script
    start = time()
    # checking the current directory and printing it
    cwd = os.getcwd()
    print(colored(f'\nThe working directory: {cwd}\n',
                  'green',
                  attrs=['bold']))
    # parse the command-line arguments
    args = parse_arguments()
    # name of the input directory, ex. Data/Genomes_splitted
    dir_in = args.dir_in
    # name of the sub directory to save the final result
    # Chromosomes/Plasmids
    sub_dir = args.sub_dir
    # name of the root directory to save the final result
    dir_out = args.dir_out
    # sub_sub dir name, ex., kmers/palindromes
    sub_sub_dir = args.sub_sub_dir
    # extension type for fasta
    extension = args.extension
    # get the list of all paths to the files in the input directory
    # ex., Data/Genomes_splitted, Chromosomes, gz
    fasta_dict = get_all_fasta(dir_in, sub_dir, extension)
    # check if the output directory exists, otherwise create it
    if os.path.exists(dir_out):
        print(colored('The directory to save the files already exists!',
                      'red',
                      attrs=['bold']))
        pass
    else:
        make_me_a_folder(dir_out)

    # initialize the file counter
    cnt_spc = 0
    # iterate over the genera and report which one is being processed
    for name in fasta_dict.keys():
        print(colored(f"Start working with genus {name}\n", attrs=['bold']))
        # initialize the gc counts
        gc = gc_count_fasta(fasta_dict, name)
        # make a series
        gc_series = pd.Series(gc, index=["GC"]).reset_index()
        # create a data frame
        df_gc = pd.DataFrame(gc_series).rename(columns={'index': 'bases', 0: 'counts'})
        # count the base composition
        bases = count_bases_fasta(fasta_dict, name)
        # create a data frame
        df = pd.DataFrame(bases.items(), columns=['bases', 'counts'])
        # concatenate the two data frames
        df_final = pd.concat([df, df_gc])
        # compute the gc content in a sliding window
        window = genome_stats_in_windows(fasta_dict, name, as_overlap=False, k=3000)
        # create a data frame
        dfw = pd.DataFrame(window, columns=['GC_window', 'GC_content'])
        # saving the final results as a csv file
        full_path = os.path.join(dir_out, name, 'BasicStats')
        file_name = name
        if not os.path.exists(full_path):
            os.makedirs(full_path)
        df_final.to_csv(f'{full_path}/{file_name}_{sub_sub_dir}_basic_stats.csv', index=False)
        dfw.to_csv(f'{full_path}/{file_name}_{sub_sub_dir}_gc_window.csv', index=False)
        # the number of genus/species analyzed
        cnt_spc += 1
    # the final time
    end = time()
    # print some info
    print(colored(f"Total number of genus/species analyzed: {cnt_spc}\n.",
                  attrs=['bold']))
    print(colored(f'Total time for the script: {round(end - start, 2)}.',
                  'red',
                  attrs=['bold']))
    print(colored('Done!',
                  'green',
                  attrs=['bold']))
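genome_stats_in_windows is not shown; its output is consumed above as (GC_window, GC_content) rows. A hypothetical sketch named gc_in_windows, assuming non-overlapping windows of length k and a GC percentage per window:

# Hypothetical sketch (window layout and percentage output are assumptions):
def gc_in_windows(sequence, k=3000, as_overlap=False, step=None):
    """Return (window_start, gc_percent) rows for windows of length k."""
    step = (step or k // 2) if as_overlap else k
    rows = []
    for start in range(0, max(len(sequence) - k + 1, 1), step):
        window = sequence[start:start + k]
        if not window:
            continue
        gc = window.count('G') + window.count('C')
        rows.append((start, 100 * gc / len(window)))
    return rows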
Example #9
def main():
    # record the start time of the script
    start = time()
    # checking the current directory and printing it
    cwd = os.getcwd()
    print(colored(f'\nThe working directory: {cwd}\n',
                  'green',
                  attrs=['bold']))
    # parse the command-line arguments
    args = parse_arguments()
    # name of the input directory, ex. Data/Genomes_splitted
    dir_in = args.dir_in
    # name of the sub directory to save the final result
    # Chromosomes/Plasmids
    sub_dir = args.sub_dir
    # sub_sub dir name, ex., kmers/palindromes
    sub_sub_dir = args.sub_sub_dir
    # name of the root directory to save the final result
    dir_out = args.dir_out
    # alphabet
    alphabet = iupac_dna
    # get the list of all paths to the files in the input directory
    filenames = get_fasta_files(dir_in)
    # check if the output directory exists, otherwise create it
    if os.path.exists(dir_out):
        print(colored('The directory to save the files already exists!',
                      'red',
                      attrs=['bold']))
        pass
    else:
        make_me_a_folder(dir_out)

    # initialize the file counter
    cnt_files = 0
    # iterate over the files and report which one is being processed
    for filename in filenames:
        print(colored(f"File: {filename}",
                      attrs=['bold']))
        # Data/Genomes_splitted/Genus
        # name of the taxon directory, i.e., Acidisarcina
        # and get sub sub directory name
        genus = filename.split('/')[2]
        # read in the sequences and ids
        for seq_id, sequence in parse_fasta(filename):
            # get sequence length
            seq_len = len(sequence)
            print(f'Sequence length {seq_len}.')
            bases = count_all_bases(sequence)
            # Results/Genus/Bases
            path = os.path.join(dir_out, genus, sub_dir, sub_sub_dir)
            if not os.path.exists(path):
                os.makedirs(path)
            print(f'Saving the results in {path}\n')
            base_content_slide_window(sequence, path, seq_id, alphabet, 5000, 500, plot=True)
            with open(f'{path}/{seq_id}_bases.csv', 'w') as fout:
                fout.write('base,count\n')
                for base, cnt in bases.items():
                    fout.write(base + ',' + str(cnt) + '\n')
            cnt_files += 1
    # the final time
    end = time()
    # print some info
    print(colored(f"Total number of files: {cnt_files}\n.",
                  attrs=['bold']))
    print(colored(f'Total time for the script: {round(end - start, 2)}.',
                  'red',
                  attrs=['bold']))
    print(colored('Done!',
                  'green',
                  attrs=['bold']))
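count_all_bases is not shown; given how its result is written to the per-sequence bases.csv above, a minimal sketch is a plain character count:

# Hypothetical sketch (uppercasing is an assumption):
from collections import Counter

def count_all_bases(sequence):
    """Return a dict mapping each base in the sequence to its count."""
    return dict(Counter(sequence.upper()))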