Exemplo n.º 1
0
def gc_count_fasta(fasta_dict, name):
    """
    Function to compute the mean GC content, as a percentage, of the fasta
    files registered under a key.

    Inputs:
        fasta_dict - a dictionary-like object that maps a name to a list of
                     full paths to the fasta files to be analyzed.
        name - a string representing the key in fasta_dict whose files are
               analyzed.

    Outputs:
        gc content - a float: the sum of gc_cython(sequence) over every
                     sequence of every file, divided by the number of files
                     and multiplied by 100. Returns 0.0 when the key has no
                     files (instead of dividing by zero).
    """
    # look the list of paths up once; it is used for the count and the loop
    filenames = fasta_dict[name]
    # get the number of files registered under the name
    num_fastas = len(filenames)
    # nothing to average: avoid a ZeroDivisionError
    if num_fastas == 0:
        return 0.0
    # initialize the accumulator
    gc_tot = 0
    # iterates through the list of paths
    for filename in filenames:
        # reads the file and parse the content
        print(f'Reading and parsing the filename {filename}')
        # the record header is not needed; do not rebind the `name` parameter
        for _, sequence in parse_fasta(filename):
            # add the gc content of the current sequence
            gc_tot += gc_cython(sequence)
    # NOTE(review): the sum runs over sequences but the divisor is the number
    # of files, so a multi-sequence file weighs more than once - confirm that
    # each file is expected to hold a single sequence.
    return (gc_tot / num_fastas) * 100
Exemplo n.º 2
0
def get_genome_length(filenames):
    """
    Function to collect the length of every sequence in a list of fasta files.

    Inputs:
        filenames - an iterable of '/'-separated paths to fasta files. The
                    third path component (index 2) is used as the genus key.

    Outputs:
        gen_len - a defaultdict(dict) mapping genus -> {sequence name: length}.
                  A sequence name seen again under the same genus overwrites
                  the previous entry.
    """
    gen_len = defaultdict(dict)

    for filename in filenames:
        # raises IndexError when the path has fewer than three components
        genus = filename.split('/')[2]
        for name, seq in parse_fasta(filename):
            gen_len[genus][name] = len(seq)

    return gen_len
Exemplo n.º 3
0
def genome_stats_in_windows(fasta_dict, name, as_overlap=False, k=20):
    """GC content of a DNA/RNA sequence computed in windows of length k.

    Every sequence of every fasta file registered under `name` is upper-cased
    and concatenated, in file order, into one sequence that is then scanned
    in windows.

    Inputs:

        fasta_dict - a dictionary-like object that maps a name to a list of
                     paths to fasta files.
        name - the key in fasta_dict whose files are analyzed.
        as_overlap - boolean; when True the windows advance one base at a
                     time (overlapping), otherwise they advance k bases
                     (non overlapping).
        k - a positive integer representing the window length.
            Default is 20.

    Outputs:

        gc_content - a list of tuples (start, value) where start is the
                     0-based start of the window and value is
                     round(gc_cython(window), 4) * 100. Empty when the
                     concatenated sequence is shorter than k.

    Raises:

        ValueError - if k is not a positive integer.
    """
    if k < 1:
        raise ValueError('k must be a positive integer')
    # collect the pieces and join once; a separate loop variable is used so
    # the accumulated sequence is not overwritten by each parsed record
    pieces = []
    for file in fasta_dict[name]:
        for _, sequence in parse_fasta(file):
            pieces.append(sequence.upper())
    seq = ''.join(pieces)
    # overlapping windows slide by one base, non overlapping ones by k
    step = 1 if as_overlap else k
    # the array-like object to collect the data
    gc_content = []
    for start in range(0, len(seq) - k + 1, step):
        # gc content of the current window
        g_c = gc_cython(seq[start:start + k])
        gc_content.append((start, round(g_c, 4) * 100))
    return gc_content
def count_n_grams_fasta(fasta_dict, name, alphabet, kmin, kmax):
    """
    Function to count the n-grams/k-mers found in all fasta files registered
    under a key and to average the counts over the number of files.

    Inputs:
        fasta_dict - a dictionary-like object that maps a name to a list of
                     full paths to the files to be analyzed.
        name - a string representing the key in fasta_dict to analyze.
        alphabet - an iterable with the allowed characters; k-mers holding
                   any other character are left out of the result.
        kmin - an integer passed to count_kmers as the lower k-mer length.
        kmax - an integer passed to count_kmers as the upper k-mer length.

    Outputs:
        final_counter - a dictionary mapping each retained k-mer to its
                        summed count integer-divided by the number of files.
    """
    # characters a k-mer may contain
    allowed = set(alphabet)
    # paths registered under the key and how many there are
    paths = fasta_dict[name]
    num_fastas = len(paths)
    print(f'The number of fasta files for this genus is {num_fastas}.')
    # running total over all sequences of all files
    totals = Counter()
    for path in paths:
        print(f'Reading and parsing the file {path}')
        for _header, sequence in parse_fasta(path):
            print(f'Sequence length {len(sequence)}')
            # merge the counts of the current sequence into the total
            totals.update(count_kmers(sequence, kmin, kmax, counter=None))
    # floor-divided mean per file, keeping only k-mers over the alphabet
    final_counter = {}
    for kmer, total in totals.items():
        if set(kmer) <= allowed:
            final_counter[kmer] = total // num_fastas
    return final_counter
Exemplo n.º 5
0
def count_bases_fasta(fasta_dict, name):
    """
    Function to count the bases in all fasta files registered under a key
    and to average the counts over the number of files.

    Inputs:
        fasta_dict - a dictionary-like object that maps a name to a list of
                     full paths to the files to be analyzed.
        name - a string representing the key in fasta_dict to analyze.

    Outputs:
        final_counter - a dictionary mapping each key returned by
                        count_bases_cython to its summed count
                        integer-divided by the number of files. Only this
                        dictionary is returned; no sequence length is
                        reported.
    """
    # look the list of paths up once; it is used for the count and the loop
    filenames = fasta_dict[name]
    # get the number of files registered under the name
    num_fastas = len(filenames)
    print(f'The number of fasta files for this genus is {num_fastas}.')
    # initialize the counter
    counter = Counter()
    # iterates through the list of paths
    for filename in filenames:
        # reads the file and parse the content
        print(f'Reading and parsing the file {filename}')
        # the record header is not needed; do not rebind the `name` parameter
        for _, sequence in parse_fasta(filename):
            print(f'Sequence length {len(sequence)}')
            # add the count of the current sequence to the counter
            counter.update(count_bases_cython(sequence))
    # floor-divided mean of the counts over the number of files
    return {base: total // num_fastas for base, total in counter.items()}
def main():
    """Search every sequence of the input fasta files for the patterns read
    from the pattern file and save one CSV of cut sites per sequence.

    The input directory, output directory and pattern file come from
    parse_arguments(). Results are written to
    <dir_out>/<name>/RE_cuts/<sequence id>_<data>_re_cuts.csv, where <name>
    and <data> are the third and fourth '/'-separated components of each
    input path.
    """
    # wall-clock reference used for the final report
    start = time()
    # show where the script is running from
    cwd = os.getcwd()
    print(colored(f'\nThe working directory: {cwd}\n', 'green',
                  attrs=['bold']))
    # command line options
    args = parse_arguments()
    dir_in = args.dir_in
    dir_out = args.dir_out
    pattern_file = args.pattern_file
    # every path found in the input directory and every pattern to look for
    all_files = get_files(dir_in)
    all_patterns = read_patterns(pattern_file)
    # create the output root only when it is missing
    if not os.path.exists(dir_out):
        make_me_a_folder(dir_out)
    else:
        print(
            colored('The directory to save the files already exists!',
                    'red',
                    attrs=['bold']))

    # number of input files processed
    num_files = 0
    for filen in all_files:
        parts = filen.split('/')
        name = parts[2]
        data = parts[3]
        print(
            colored(f"Working with {data} from genus/species {name}",
                    attrs=['bold']))
        for n, seq in parse_fasta(filen):
            print(
                f'Start counting the restriction enzymes cut sites in the sequence {n}'
            )
            # table with one row per (site, positions) pair
            df = pd.DataFrame(all_re_cut_sites(seq, all_patterns),
                              columns=['site', 'positions'])
            full_path = os.path.join(dir_out, name, 'RE_cuts')
            file_name = f'{n}_{data}_re_cuts.csv'
            if not os.path.exists(full_path):
                os.makedirs(full_path)
            print(f'Saving the files in {full_path}\n')
            df.to_csv(f'{full_path}/{file_name}', index=False)
        num_files += 1
    # elapsed time and summary
    end = time()
    print(
        colored(f"Total number of files analyzed: {num_files}\n.",
                attrs=['bold']))
    print(
        colored(
            f'Total time for the script finishes: {round(end - start, 2)}.',
            'red',
            attrs=['bold']))
    print(colored('Done!', 'green', attrs=['bold']))
def _compute_kmer_stats(kmer_list, counts, len_seq):
    """Run the statistics pipeline for kmer_list and return the rows built
    by get_kmer_statistics."""
    freqs = kmers_frequencies(counts)
    expected = get_expected_values(kmer_list, counts)
    zscores = get_z_scores(kmer_list, counts, expected, len_seq)
    pvalues = get_pvalues(kmer_list, zscores)
    evalues = get_evalues(kmer_list, pvalues)
    scores = get_scores(kmer_list, counts, expected)
    nscores = get_new_scores(kmer_list, counts, expected)
    odds_ratio = get_odds_ratio(kmer_list, freqs)
    diff = get_difference(kmer_list, counts, expected)
    lod = get_log_odds(kmer_list, counts, expected)
    return get_kmer_statistics(kmer_list, counts, expected, zscores, evalues,
                               odds_ratio, diff, scores, nscores, lod)


def _write_counts_csv(path, header, counts):
    """Write a header line followed by one 'key,count' line per item."""
    with open(path, 'w') as fout:
        fout.write(header + '\n')
        for kmer, count in counts.items():
            fout.write(kmer + "," + str(count) + "\n")


def main():
    """Parses options from the command line.
    For every sequence of every fasta file found in the input path, and
    depending on the selected options (kmer, pal, all, slide):
    computes the k-mers to test (either palindromes or all k-mers),
    computes the counts of k-mers (for palindromes the counts of the reverse
    complements are added), computes the k-mers/palindromes statistics
    (expected values, z-scores, e-values, scores, odds ratios, differences
    and log odds), prints the results and saves them to csv files in the
    output directory.
    """
    cwd = os.getcwd()
    print(f'The working directory: {cwd}\n')
    start_time = time.process_time()
    opt = parse_arguments()
    dir_name = opt.path
    filenames = get_files(dir_name)
    outfile = opt.output
    dir_out = opt.dir_out
    if not os.path.exists(dir_out):
        make_me_a_folder(dir_out)
    # columns shared by the k-mer and palindrome statistics tables
    stat_columns = [
        "Observed", "Expected", "Z_score", "Evalues", "Odds", "Diff",
        "Scores", "NScores", "Log_odds"
    ]
    cnt_files = 0
    for filename in filenames:
        for name, seq in fasta_parser.parse_fasta(filename):
            name = fasta_parser.str_punctuation_strip(name)
            # short label built from the first and last three items of name
            n_name = '_'.join(name[0:3] + name[-3:])
            len_seq = len(seq) - count_umbiguous_bases(seq)
            if opt.kmer:
                # counts = counts of the kmers with min_k-2 <= k <= max_k
                kmer_counts = count_kmers(seq, opt.alphabet, opt.min_k - 2,
                                          opt.max_k)
                kmer_list = get_all_possible_kmers(opt.alphabet, opt.min_k,
                                                   opt.max_k)
                kmer_data = _compute_kmer_stats(kmer_list, kmer_counts,
                                                len_seq)
                print_results_stats(n_name, kmer_list, len_seq, opt.min_k,
                                    opt.max_k, opt.max_e, kmer_data)
                df = pd.DataFrame(kmer_data, columns=["kmer"] + stat_columns)
                df.to_csv(f"{dir_out}/{n_name}_{opt.max_k}_all_kmer_stats.csv")
                _write_counts_csv(
                    f"{dir_out}/{n_name}_{opt.max_k}_kmer_counts.csv",
                    'Kmer,Counts', kmer_counts)

            if opt.pal:
                pal_list = list(
                    get_palindromes(opt.alphabet, opt.min_k, opt.max_k))
                # counts = counts of the kmers/palindromes with min_k-2 <= k <= max_k
                pal_counts = count_kmers(seq, opt.alphabet, opt.min_k - 2,
                                         opt.max_k)
                # both strands are counted: snapshot the reverse-complement
                # counts first, then add them to the forward counts
                rev_strand_cnt = {
                    get_reverse_complement(kmer): cnt
                    for kmer, cnt in pal_counts.items()
                }
                for kmer, cnt in rev_strand_cnt.items():
                    pal_counts[kmer] += cnt
                # NOTE(review): a doubled length (2 * len_seq) used to be
                # computed here but was never used; the z-scores are computed
                # with the single-strand len_seq - confirm which is intended.
                pal_data = _compute_kmer_stats(pal_list, pal_counts, len_seq)
                print_results_stats(n_name, pal_list, len_seq, opt.min_k,
                                    opt.max_k, opt.max_e, pal_data)
                df = pd.DataFrame(pal_data, columns=["pal"] + stat_columns)
                df.to_csv(f"{dir_out}/{n_name}_{opt.max_k}_all_pal_stats.csv")
                _write_counts_csv(
                    f"{dir_out}/{n_name}_{opt.max_k}_palindrome_counts.csv",
                    'Palindrome,Counts', pal_counts)

            if opt.all:
                kmer_counts = count_kmers(seq, opt.alphabet, opt.min_k - 2,
                                          opt.max_k)
                kmer_list = get_all_possible_kmers(opt.alphabet, opt.min_k,
                                                   opt.max_k)
                kmer_data = _compute_kmer_stats(kmer_list, kmer_counts,
                                                len_seq)
                get_dataframe_from_kmer_data(dir_out, outfile, opt.max_k,
                                             kmer_data)
                # kmer -> [observed, expected, z-score, e-value, ...]
                data_dict = defaultdict(list)
                for row in kmer_data:
                    kmer = row[0]
                    obs = row[1]
                    exp = row[2]
                    zscr = row[3]
                    evalue = row[4]
                    data_dict[kmer] += [obs, exp, zscr, evalue]
                # NOTE(review): this path depends only on `outfile`, so each
                # sequence overwrites the file of the previous one - confirm.
                with open(f'{dir_out}/{outfile}_all_kmers_z_scores.csv',
                          'w') as fout:
                    fout.write('kmer, data\n')
                    for kmer, values in data_dict.items():
                        fout.write(kmer + ',' + str(values) + '\n')

            if opt.slide:
                kmer_slide = get_kmer_count_slide_window(
                    seq, opt.alphabet, opt.window, opt.step, opt.min_k,
                    opt.max_k)
                df = pd.DataFrame.from_dict(kmer_slide).fillna(0.0)
                df.to_csv(f"{dir_out}/{n_name}_slide_window.csv")
        cnt_files += 1
    end = time.process_time()
    total_time = end - start_time
    print(f'The script takes {total_time} to finish!')
    print(f'Where read and manipulated {cnt_files} files')
    print('Done!')
#!/usr/bin/env python
import sys
from fasta_parser import fasta_item_counter, parse_fasta
from system_utils import get_fasta_files

# Count the fasta files (assemblies) of a directory that hold at least one
# record whose header mentions 'plasmid', and list them in a text file.
if len(sys.argv) < 2:
    print('USAGE: < count_assemblies_with_plasmids.py > < directory name > ')
    sys.exit(1)

path = sys.argv[1]

# one (file name, sorted plasmid headers) entry per assembly with plasmids
assemblies_plasmids = []
# number of assemblies with at least one plasmid record
cnt = 0
for filename in get_fasta_files(path):
    name = filename.split('/')[-1]
    # Every other caller of parse_fasta in this file unpacks
    # (header, sequence) pairs, so the header is unpacked before testing it;
    # testing the pair itself would never match a substring.
    headers = sorted({
        header for header, _ in parse_fasta(filename) if 'plasmid' in header
    })
    # only assemblies that really have plasmids are counted and recorded
    if headers:
        cnt += 1
        assemblies_plasmids.append((name, headers))

print(f'The number of assemblies with plasmids are {cnt}')

with open('assemblies_with_plasmids.txt', 'w') as fo:
    for entry in assemblies_plasmids:
        fo.write(f'{entry}\n')
Exemplo n.º 9
0
def main():
    """Count the bases of every sequence in the input fasta files and save
    the counts, together with the sliding-window base content, per sequence.

    The input directory, output root and the two sub directory names come
    from parse_arguments(). For each sequence the results are written under
    <dir_out>/<genus>/<sub_dir>/<sub_sub_dir>, where <genus> is the third
    '/'-separated component of the input path.
    """
    # starting count the staring time of the script
    start = time()
    # checking the current directory and printing it
    cwd = os.getcwd()
    print(colored(f'\nThe working directory: {cwd}\n',
                  'green',
                  attrs=['bold']))
    # passing the arguments to the script
    args = parse_arguments()
    # name of the input diretory, ex. Data/Genomes_splitted
    dir_in = args.dir_in
    # name of the sub directory to save the final result
    # Chromosomes/Plasmids
    sub_dir = args.sub_dir
    # sub_sub dir name, ex., kmers/palindromes
    sub_sub_dir = args.sub_sub_dir
    # name of the root directory to save the final result
    dir_out = args.dir_out
    # alphabet
    alphabet = iupac_dna
    # get the list of all paths to the files in the input directory
    filenames = get_fasta_files(dir_in)
    # check if the output directory exists, otherwise create it
    if os.path.exists(dir_out):
        print(colored('The directory to save the files already exists!',
                      'red',
                      attrs=['bold']))
    else:
        make_me_a_folder(dir_out)

    # initialize the file counter
    cnt_files = 0
    # input the file paths and print it to show where the script is doing
    for filename in filenames:
        print(colored(f"File: {filename}",
                      attrs=['bold']))
        # Data/Genomes_splitted/Genus
        # name of the taxon directory, ie. Acidisarcina
        genus = filename.split('/')[2]
        # Results/Genus/<sub_dir>/<sub_sub_dir>; the same for all sequences
        # of the file
        path = os.path.join(dir_out, genus, sub_dir, sub_sub_dir)
        # read in the sequences and ids
        for seq_id, sequence in parse_fasta(filename):
            # get sequence length
            seq_len = len(sequence)
            print(f'Sequence length {seq_len}.')
            bases = count_all_bases(sequence)
            # make sure the destination exists before anything is written
            os.makedirs(path, exist_ok=True)
            print(f'Saving the results in {path}\n')
            base_content_slide_window(sequence, path, seq_id, alphabet, 5000, 500, plot=True)
            with open(f'{path}/{seq_id}_bases.csv', 'w') as fout:
                fout.write('base,count\n')
                for base, count in bases.items():
                    fout.write(base + ',' + str(count) + '\n')
        # count files, not sequences, to match the summary printed below
        cnt_files += 1
    # the final time
    end = time()
    # print some info
    print(colored(f"Total number of files: {cnt_files}\n.",
                  attrs=['bold']))
    print(colored(f'Total time for the script: {round(end - start, 2)}.',
                  'red',
                  attrs=['bold']))
    print(colored('Done!',
                  'green',
                  attrs=['bold']))
Exemplo n.º 10
0
from dollarsign import dollarsign_matches
from fasta_parser import parse_fasta
from fastq_parser import parse_fastq

reads_file = 'data/reads.fastq'
refs_file = 'data/ref.fa'

# reads (fastq) in outer loop
for read in parse_fastq(reads_file):
    print(read)

    for reference in parse_fasta()