Exemplo n.º 1
0
def main(target, query, k):
    """Print every k-mer of the query sequences that also occurs in a target.

    Args:
        target: Path of the FASTA file with the target sequences.
        query: Path of the FASTA file with the query sequence(s).
        k: Length of the k-mers to compare.

    One tab-separated line is printed per hit: target id, position in the
    target, position in the query, and the k-mer itself.
    """
    # Context managers close both FASTA files even if parsing fails midway.
    with open(target) as target_handle, open(query) as query_handle:
        target_seqs = FASTAReader(target_handle)
        droyak_seqs = FASTAReader(query_handle)

        # Used below as a mapping: target id -> {k-mer: positions}.
        target_positions = track_kmer(target_seqs, k)

        for seq_id, sequence in droyak_seqs:
            # Slide a window of width k over the query sequence.
            for i in range(len(sequence) - k + 1):
                query_kmer = sequence[i:i + k]

                # Report every target position at which this k-mer occurs.
                for target_ID, target_kmers in target_positions.items():
                    for position in target_kmers.get(query_kmer, ()):
                        print(target_ID + '\t' + str(position) + '\t' +
                              str(i) + '\t' + query_kmer)
Exemplo n.º 2
0
def aa_conversion(aa_file, nuc_file):
    """Build gapped nucleotide sequences that follow an amino-acid alignment.

    Args:
        aa_file: Path of the FASTA file with the aligned amino-acid sequences.
        nuc_file: Path of the FASTA file with the nucleotide sequences.

    Returns:
        dict mapping a sequence id to the list
        ``[nucleotide sequence, aligned amino-acid sequence,
        gapped nucleotide sequence]``.

    Raises:
        KeyError: An amino-acid record has no nucleotide record with the
            matching id.
        IndexError: A nucleotide record has no amino-acid record.
    """
    seq_dict = {}
    # Context managers close both input files once they have been parsed.
    with open(aa_file) as aa_handle, open(nuc_file) as nuc_handle:
        aa_seq = FASTAReader(aa_handle)
        nuc_seq = FASTAReader(nuc_handle)

        # Key on the first word of the header; the first record per id wins.
        for seq_id, sequence in nuc_seq:
            seq_dict.setdefault(seq_id.split(" ")[0], [sequence])

        # Amino-acid ids carry two extra trailing characters that are dropped
        # to recover the nucleotide id.
        # NOTE(review): presumably a frame/strand suffix - confirm.
        for seq_id, sequence in aa_seq:
            seq_dict[seq_id.split(" ")[0][:-2]].append(sequence)

    for item in seq_dict.values():
        nuc_og = item[0]
        aa = item[1]
        codons = []
        counter_og = 0
        for char in aa:
            if char == '-':
                # An alignment gap becomes a three-base gap.
                codons.append('---')
            else:
                # Any other residue consumes the next codon.
                codons.append(nuc_og[counter_og:counter_og + 3])
                counter_og += 3
        # Join once instead of growing a string inside the loop.
        item.append(''.join(codons))
    return seq_dict
Exemplo n.º 3
0
def kmer_matcher(target, query, k):
    """Write all k-mer matches between target and query to 'kmer_matcher.out'.

    Args:
        target: Path of the FASTA file with the target sequences.
        query: Path of the FASTA file with the query sequence(s).
        k: Length of the k-mers to compare.

    Every output line holds, tab-separated: target sequence id, target start,
    query start and the (upper-cased) k-mer.
    """
    # Index: upper-cased k-mer -> list of (target id, start) tuples.
    kmers = {}
    with open(target) as target_handle:
        for seq_id, sequence in FASTAReader(target_handle):
            for i in range(len(sequence) - k + 1):
                kmer = sequence[i:i + k].upper()
                kmers.setdefault(kmer, []).append((seq_id, i))

    # Both files are closed on exit, even if writing fails midway.
    with open('kmer_matcher.out', 'w') as f, open(query) as query_handle:
        for seq_idq, sequenceq in FASTAReader(query_handle):
            for itera in range(len(sequenceq) - k + 1):
                kmerq = sequenceq[itera:itera + k].upper()
                # k-mers absent from the target index yield no lines.
                for seq_id, start in kmers.get(kmerq, ()):
                    f.write('{}\t{}\t{}\t{}\n'.format(seq_id, start, itera,
                                                      kmerq))
Exemplo n.º 4
0
def main():
    """Print the query k-mers that also occur in the target sequences.

    Command-line arguments:
        sys.argv[1]: path of the target FASTA file.
        sys.argv[2]: path of the query FASTA file.
        sys.argv[3]: k-mer size.

    For every matching query k-mer one line is printed with the target hits,
    the query position and the k-mer. The hits are a flat tuple of the form
    ``(seq_id, pos, seq_id, pos, ...)``.
    """
    k = int(sys.argv[3])
    kmers = {}

    with open(sys.argv[1]) as target_handle:
        for seq_id, sequence in FASTAReader(target_handle):
            # The upper bound must be len - k + 1 so that the final windows
            # are indexed too; the previous bound, len - (k + 1), silently
            # dropped the last two k-mers of every target sequence.
            for i in range(len(sequence) - k + 1):
                kmer = sequence[i:i + k]
                # Hits are accumulated as one flat tuple per k-mer.
                kmers[kmer] = kmers.get(kmer, ()) + (seq_id, i)

    with open(sys.argv[2]) as query_handle:
        for seq_id, sequence in FASTAReader(query_handle):
            for i in range(len(sequence) - k + 1):
                kmer_2 = sequence[i:i + k]
                hits = kmers.get(kmer_2)
                if hits is not None:
                    print(hits, i, kmer_2)
def kmer_matcher(target, query, match_file):
    """Extend seed matches between target and query sequences and save them.

    Args:
        target: Path of the FASTA file with the target sequences.
        query: Path of the FASTA file with the query sequence(s).
        match_file: Path of a whitespace-separated file whose first three
            columns are target sequence id, target start and query start.

    Writes 'kmer_match_extender.out' with one entry per line: for every
    (target, query) sequence pair whose target id appears in the match file,
    the target id followed by the extended matches, longest first.
    """
    # target id -> list of (target start, query start) seeds
    match_log = {}
    with open(match_file, 'r') as match:
        for line in match:
            line_split = line.split()
            if not line_split:
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            match_log.setdefault(line_split[0], []).append(
                (int(line_split[1]), int(line_split[2])))

    arr_seq = []
    with open(target) as target_handle:
        for seq_idt, sequencet in FASTAReader(target_handle):
            seeds = match_log.get(seq_idt)
            if seeds is None:
                # No seeds for this target: nothing to extend, so the query
                # file does not need to be read at all.
                continue
            with open(query) as query_handle:
                for seq_idq, sequenceq in FASTAReader(query_handle):
                    arr_seq.append(seq_idt)
                    arr_id_seq = [
                        _extend_seed(sequencet, sequenceq, tar_i, quer_i)
                        for tar_i, quer_i in seeds
                    ]
                    # Longest extended matches first.
                    arr_id_seq.sort(key=len, reverse=True)
                    arr_seq.extend(arr_id_seq)

    # One entry per line.
    np.savetxt('kmer_match_extender.out', arr_seq, fmt="%s")


def _extend_seed(target_seq, query_seq, tar_i, quer_i, seed_len=11):
    """Return the upper-cased target match grown rightwards from a seed.

    The window starts at ``seed_len`` bases and grows one base at a time
    while the target and query windows are equal (case-insensitively). If the
    initial windows already differ, the first ``seed_len - 1`` target bases
    are returned, as the original implementation did.
    """
    len_seq = seed_len
    tar_seq = target_seq[tar_i:tar_i + len_seq].upper()
    quer_seq = query_seq[quer_i:quer_i + len_seq].upper()
    while tar_seq == quer_seq:
        len_seq += 1
        # Stop once the matching window has reached the end of either
        # sequence. Without this guard a match running to the end of both
        # sequences keeps comparing the same two slices forever.
        if (tar_i + len_seq - 1 >= len(target_seq)
                or quer_i + len_seq - 1 >= len(query_seq)):
            break
        tar_seq = target_seq[tar_i:tar_i + len_seq].upper()
        quer_seq = query_seq[quer_i:quer_i + len_seq].upper()
    # len_seq is one past the last matching window length.
    return target_seq[tar_i:tar_i + len_seq - 1].upper()
Exemplo n.º 6
0
# note that shebang is omitted because I'm running this with python kmer_matcher.py
import sys
from fasta_iterator_class import FASTAReader

# Usage: python kmer_matcher.py <target.fa> <query.fa> <k>
t_name = sys.argv[1]
q_name = sys.argv[2]
k = int(sys.argv[3])

kmers = {}  # target index: k-mer -> list of (target id, start)

# Context managers close the FASTA files once they have been consumed.
with open(t_name) as t_file:
    dbseq = FASTAReader(t_file)
    for seq_id, sequence in dbseq:
        for i in range(len(sequence) - k + 1):
            kmer = sequence[i:i + k]
            kloc = (seq_id, i)
            kmers.setdefault(kmer, []).append(kloc)

with open(q_name) as q_file:
    qseq = FASTAReader(q_file)
    for seq_name, sequence in qseq:
        # Index the query once instead of scanning the whole sequence for
        # every target k-mer.
        q_positions = {}
        for i in range(len(sequence) - k + 1):
            q_positions.setdefault(sequence[i:i + k], []).append(i)

        # Report every query position of a shared k-mer. The previous
        # str.find() call only reported the first occurrence.
        for key, hits in kmers.items():
            for pos in q_positions.get(key, ()):
                print(key, hits, pos)
Exemplo n.º 7
0
#!/usr/bin/env python3
import sys
from fasta_iterator_class import FASTAReader
# get arguments from the command line

# Command-line arguments: target FASTA, query FASTA, k-mer size.
target_fname = sys.argv[1]
droyak_fname = sys.argv[2]
k = int(sys.argv[3])

# Readers over the target and query sequences.
target_seqs = FASTAReader(open(target_fname))
droyak_seqs = FASTAReader(open(droyak_fname))

# Index of the target file: k-mer -> flat list [seq_id, start, seq_id, ...].
kmers = {}

for seq_id, sequence in target_seqs:
    window_count = len(sequence) - k + 1
    for i in range(window_count):
        kmer = sequence[i:i + k]
        # Create the list on first sight, then add the id and the start
        # position as two consecutive entries.
        kmers.setdefault(kmer, []).extend([seq_id, i])
Exemplo n.º 8
0
# real version with subdictionaries
def make_kmer_dict2(k, seqs):
    """Index the k-mers of several sequences, one sub-dictionary per sequence.

    Args:
        k: Length of the k-mers.
        seqs: Iterable of ``(seq_id, sequence)`` pairs.

    Returns:
        dict mapping each ``seq_id`` to ``{kmer: [start positions]}``.
        Sequences shorter than ``k`` map to an empty dictionary, and a
        repeated ``seq_id`` keeps only the index of its last sequence.
    """
    kmer_dict = {}
    for seq_id, sequence in seqs:
        seq_id_dict = {}
        for i in range(len(sequence) - k + 1):
            # A single setdefault replaces the three redundant membership
            # checks the loop used to perform per k-mer.
            seq_id_dict.setdefault(sequence[i:i + k], []).append(i)
        kmer_dict[seq_id] = seq_id_dict
    return kmer_dict

# Target side: one k-mer sub-dictionary per sequence of subset.fa.
target_seqs = FASTAReader(open('subset.fa'))
target_kmers = {}
target_dicts = make_kmer_dict2(11, target_seqs)

# Query side: the FASTA file holds a single sequence, so the flat index
# produced by make_kmer_dict is used.
# NOTE(review): make_kmer_dict is not defined in this excerpt - it is used
# below as a mapping k-mer -> query positions; confirm against its definition.
query_seqs = FASTAReader(open('droYak2_seq.fa'))
query_kmers = {}
query_dict = make_kmer_dict(11, query_seqs)

# For every query k-mer, report each target sequence that contains it:
# target id, target positions, query positions, k-mer (tab-separated).
for query_key, query_hits in query_dict.items():
    for target_dict, target_index in target_dicts.items():
        if query_key not in target_index:
            continue
        print(target_dict, target_index[query_key], query_hits, query_key, sep = '\t')
Exemplo n.º 9
0
##fs3 = open('output.txt', 'w')

##def kcount(subset, dict_name, k =11):

##reader = FASTAReader(open(subset))
##dict_name = {}

##for seq_id, sequence in reader:
##for i in range(0, len(sequence) - k +1):
##kmer = sequence[i:i +k]
##dict_name.setdefault(kmer, 0)
##dict_name[kmer] += 1

# Index of the query read from `fs`: k-mer -> list of start positions.
query_ks = {}

for seq_id, sequence in FASTAReader(fs):
    n_windows = len(sequence) - k + 1
    for i in range(n_windows):
        kmer = sequence[i:i + k]
        query_ks.setdefault(kmer, []).append(i)

# Container for the k-mers of the sequences read from `fs2`.
target_ks = {}

for seq_id2, sequence2 in FASTAReader(fs2):
    n_windows2 = len(sequence2) - k + 1
    for i in range(n_windows2):
        kmer2 = sequence2[i:i + k]
Exemplo n.º 10
0
#! /usr/bin/env python3

import sys

from fasta_iterator_class import FASTAReader
#from file name pulling from, import class

#read in query sequence
# Command-line arguments: query FASTA, target FASTA, k-mer size.
query = sys.argv[1]
target = sys.argv[2]
k = int(sys.argv[3])

# From here on `query` is the reader over the query file, not its path.
query = FASTAReader(open(query))

# Index of the query: k-mer -> list of start positions.
kmers_query = {}

for seq_id, sequence in query:
    last_start = len(sequence) - k
    for i in range(last_start + 1):
        kmer = sequence[i:i + k]
        kmers_query.setdefault(kmer, []).append(i)

# Likewise, `target` now becomes the reader over the target file.
target = FASTAReader(open(target))

#make nested dictionary for target sequences
Exemplo n.º 11
0
#! /usr/bin/env python3

from fasta_iterator_class import FASTAReader

k = 11

# Query index: k-mer -> list of start positions in droYak2_seq.fa.
kmers = {}
# The context manager closes the file once the index has been built.
with open('droYak2_seq.fa') as query_handle:
    reader = FASTAReader(query_handle)
    for seq_id, sequence in reader:
        for i in range(len(sequence) - k + 1):
            kmer = sequence[i:i + k]
            kmers.setdefault(kmer, []).append(i)

# Target index: k-mer -> {target sequence id: list of start positions}.
kmers2 = {}
with open('subset.fa') as target_handle:
    reader2 = FASTAReader(target_handle)
    for seq_id2, sequence2 in reader2:
        for i in range(len(sequence2) - k + 1):
            kmer2 = sequence2[i:i + k]
            kmers2.setdefault(kmer2, {}).setdefault(seq_id2, []).append(i)

# For every k-mer shared by query and target, print one line per target
# sequence: target id, target positions, query positions, k-mer.
for key in kmers:
    if key in kmers2:
        for seq_id in kmers2[key]:
            print(seq_id, kmers2[key][seq_id], kmers[key], key)
Exemplo n.º 12
0
#!/usr/bin/env python3
#Part 3
#converts aligned AA sequence into nucleotides, with "---" inserted for alignment gaps
import sys
from fasta_iterator_class import FASTAReader

# Input alignment (amino acids), input nucleotide sequences, output path.
AA_file = "/Users/cmdb/qbb2020-answers/assignment4/alignAA.fa"
DNA_file = "/Users/cmdb/qbb2020-answers/assignment4/query_and_blast.fa"
outputfile = "/Users/cmdb/qbb2020-answers/assignment4/conv_DNA_seqs.fa"

# Small test inputs, kept for reference:
#AA_file = "/Users/cmdb/qbb2020-answers/assignment4/test_alignAA.fa"
#DNA_file = "/Users/cmdb/qbb2020-answers/assignment4/test_alignDNA.fa"

# From here on both names refer to readers rather than to paths.
AA_file = FASTAReader(open(AA_file))
DNA_file = FASTAReader(open(DNA_file))

# Collect the aligned amino-acid records and the distinct sequence lengths;
# an alignment is expected to show a single length.
AA_seqs = []
DNA_seqs = []
AA_lengths = set()
for seq_id, sequence in AA_file:
    record = (seq_id, sequence)
    AA_seqs.append(record)
    AA_lengths.add(len(sequence))

print("length of aligned sequences")
print(AA_lengths)

# Collect the nucleotide records in file order.
for seq_id, sequence in DNA_file:
    record = (seq_id, sequence)
    DNA_seqs.append(record)
#for every gap in the AA sequence, add a gap in the DNA sequence
Exemplo n.º 13
0
#!/usr/bin/env python3

import sys

from fasta_iterator_class import FASTAReader

# Command-line arguments: target FASTA, query FASTA, k-mer size.
target_file = sys.argv[1]
query_file = sys.argv[2]
k = int(sys.argv[3])

# Query index: k-mer -> list of start positions.
kmers = {}
# Target index: k-mer -> {target sequence id: list of start positions}.
our_seq = {}

for seq_id, sequence in FASTAReader(open(query_file)):
    n_windows = len(sequence) - k + 1
    for i in range(n_windows):
        kmer = sequence[i:i + k]
        kmers.setdefault(kmer, []).append(i)

for seq_id, sequence in FASTAReader(open(target_file)):
    n_windows = len(sequence) - k + 1
    for i in range(n_windows):
        t_kmer = sequence[i:i + k]
        per_sequence = our_seq.setdefault(t_kmer, {})
        per_sequence.setdefault(seq_id, []).append(i)

# Walk the query k-mers and keep those that are also keys of the target index.
for q_seq in kmers.keys():
    if q_seq in our_seq.keys():
        # our_seq[q_seq] is keyed by target sequence id, so `nucs` is an id.
        # NOTE(review): the body of this loop is missing from this excerpt.
        for nucs in our_seq[q_seq]:
Exemplo n.º 14
0
#!/usr/bin/env python3

import sys
from fasta_iterator_class import FASTAReader

# Hard-coded inputs: target sequences and the single query sequence.
target_fname = "subset.fa"
droyak_fname = "droYak2_seq.fa"

target_seqs = FASTAReader(open(target_fname))
droyak_seqs = FASTAReader(open(droyak_fname))

# Dump every query record: its id on one line, its sequence on the next.
for seq_id, sequence in droyak_seqs:
    print(seq_id, sequence, sep="\n")

################################################################

# Fresh readers for the matching step.
target = FASTAReader(open('subset.fa'))
query = FASTAReader(open('droYak2_seq.fa'))

# Target index: k-mer -> list of [seq_id, start] pairs.
kmersT = {}

k = 11

for seq_id, sequence in target:
    n_windows = len(sequence) - k + 1
    for i in range(n_windows):
        kmer = sequence[i:i + k]
        kmersT.setdefault(kmer, []).append([seq_id, i])

reader = FASTAReader(open('droYak2_seq.fa'))
Exemplo n.º 15
0
#!/usr/bin/env python3

from fasta_iterator_class import FASTAReader

import sys
#gest user import
# Command-line arguments: target FASTA, query FASTA, k-mer size.
target_name = sys.argv[1]
query_name = sys.argv[2]
k = int(sys.argv[3])

# Readers over the two files named on the command line.
target = FASTAReader(open(target_name))
query = FASTAReader(open(query_name))

# Target index: upper-cased k-mer -> list of (start, seq_id) tuples.
target_kmers = {}
for seq_id, sequence in target:
    n_windows = len(sequence) - k + 1
    for i in range(n_windows):
        kmer = sequence[i:i + k].upper()
        target_kmers.setdefault(kmer, []).append((i, seq_id))

# Output fragments; the first entry is the header line.
output = ["target_sequence_name \t target_start \t query_start \t k-mer"]

# Every match of a query k-mer adds one fragment that starts on a new line.
for seq_id, sequence in query:
    n_windows = len(sequence) - k + 1
    for start in range(n_windows):
        kmer = sequence[start:start + k].upper()
        if kmer not in target_kmers:
            continue
        for entry in target_kmers[kmer]:
            fields = [entry[1], str(entry[0]), str(start), kmer]
            output.append("\n" + "\t".join(fields))
Exemplo n.º 16
0
#!/usr/bin/env python3
"""
Usage: ./kmer_matcher.py <target.fa> <query.fa> <k>
"""

# Load libraries
import sys
from fasta_iterator_class import FASTAReader

# Open files and get k-mer size
# Readers over the target and query files, plus the k-mer size.
target = FASTAReader(open(sys.argv[1], "r"))
query = FASTAReader(open(sys.argv[2], "r"))
k = int(sys.argv[3])

# Target index: k-mer -> list of (target sequence id, start) tuples.
# A k-mer that occurs several times gets one tuple per occurrence.
kmers = {}

for tseq_id, tsequence in target:
    window_count = len(tsequence) - k + 1
    for i in range(window_count):
        kmer = tsequence[i:i + k]
        kmers.setdefault(kmer, []).append((tseq_id, i))

# Iterate through query sequence
for qseq_id, qsequence in query:

    for j in range(0, len(qsequence) - k + 1):