예제 #1
0
#!/usr/bin/env python3

from fasta import FASTAReader
import sys

# Find every k-mer of the query FASTA (argv[2]) that also occurs in the
# target FASTA (argv[1]); k is given as argv[3].
#
# Both inputs are opened in one context manager so the handles are closed
# when the script finishes or fails part-way through.
with open(sys.argv[1]) as target_handle, open(sys.argv[2]) as query_handle:
    target = FASTAReader(target_handle)
    query = FASTAReader(query_handle)
    k = int(sys.argv[3])  # k-mer length

    # Index: upper-cased k-mer -> list of (target record id, 0-based start).
    kmers = {}
    for ident1, sequence1 in target:
        sequence1 = sequence1.upper()
        # A sequence shorter than k yields an empty range and is skipped.
        for i in range(len(sequence1) - k + 1):
            kmer1 = sequence1[i:i + k]
            kmers.setdefault(kmer1, []).append((ident1, i))

    # For each query window found in the index, print the list of target
    # hits, the query start offset, and the k-mer itself.
    for ident2, sequence2 in query:
        sequence2 = sequence2.upper()
        for a in range(len(sequence2) - k + 1):
            kmer2 = sequence2[a:a + k]
            if kmer2 in kmers:
                print(kmers[kmer2], str(a), kmer2)
예제 #2
0
#!/usr/bin/env python3

import sys
from fasta import FASTAReader

# Read FASTA records from standard input and print summary statistics about
# the sequence lengths: record count, shortest, longest, mean, and half of
# the total length.
f = sys.stdin
reader = FASTAReader(f)

# Only the sequences are kept; the record identifiers are not needed.
contigs = []
for indent, sequence in reader:
    contigs.append(sequence)
contigs = sorted(contigs, reverse=True)

print(len(contigs))

sequence_length = [len(sequence) for sequence in contigs]

# Computed once: the original re-summed the whole list on every iteration
# of the cumulative loop below.
total_length = sum(sequence_length)
half_length = total_length / 2

print(min(sequence_length))
print(max(sequence_length))
print(total_length / len(contigs))
print(half_length)

sorted_length = sorted(sequence_length, reverse=True)

# Walk the lengths from longest to shortest until the running total reaches
# half of the overall length. After the loop, `i` and `item` identify the
# length at which that happened (presumably the N50 value; the code that
# consumes it is not visible here).
count = 0
for i, item in enumerate(sorted_length):
    count += item
    if count >= half_length:
        break
#!/usr/bin/env python3
"""match extender"""

from fasta import FASTAReader
import sys

# Command-line inputs: target FASTA, query FASTA, and the k-mer length.
target = FASTAReader(open(sys.argv[1]))  # e.g. subset.fa
query = FASTAReader(open(sys.argv[2]))  # e.g. droYak2_seq.fa
k = int(sys.argv[3])

# Upper-cased k-mer -> list of (target record id, 0-based start offset).
kmers_from_target = {}
# Target record id -> full upper-cased sequence.
target_sequence = {}

for ident, sequence in target:
    sequence = sequence.upper()
    target_sequence[ident] = sequence
    window_count = len(sequence) - k + 1
    for start in range(window_count):
        kmer = sequence[start:start + k]
        kmers_from_target.setdefault(kmer, []).append((ident, start))

elongated_seq = []

for ident, sequence1 in query:
    sequence1 = sequence1.upper()
    for i in range(0, len(sequence) - k + 1):
        kmer = sequence1[i:i + k]
        if kmer in kmers_from_target:
            for ident, j in kmers_from_target[kmer]:
예제 #4
0
#!/usr/bin/env python3

import sys
from fasta import FASTAReader
import math
import matplotlib.pyplot as plt
import statistics as stats

# Sequences from the first input, in file order; record identifiers are
# discarded. (Original note on this argument: translated.out)
reader = FASTAReader(open(sys.argv[1]))
protein_seq = [record_seq for _, record_seq in reader]

# Sequences from the second input, in file order.
# (Original note on this argument: 3_new_blast_output)
reader2 = FASTAReader(open(sys.argv[2]))
nt_seq = [record_seq for _, record_seq in reader2]

#print (nt_seq)

list1 = []
for sequence, protein in zip(nt_seq, protein_seq):
    dna1 = ""
    nt_pos = 0
    for num, j in enumerate(protein):
        # print(a)
        if j == "-":
            dna1 = dna1 + "---"
예제 #5
0
#!/usr/bin/env python3

import sys

from fasta import FASTAReader
import math
import matplotlib.pyplot as plt
import statistics as stats

reader = FASTAReader(open(sys.argv[1]))
reader2 = FASTAReader(open(sys.argv[2]))

# Sequences from the first input, in file order (protein, per the original
# note); record identifiers are discarded.
my_prot_sequence = [record_seq for _, record_seq in reader]

# Sequences from the second input, in file order (nucleotide, per the
# original note).
my_sequence_nt = [record_seq for _, record_seq in reader2]

newlist = []
#for m in my_sequence_nt:
#dna = my_sequence_nt[m]#you only need to do it in the query sequence
#store by sequence identity in dictionary
# for i in range(len(my_prot_sequence)):
#dna= str(dna)
for sequence, protein in zip(my_sequence_nt, my_prot_sequence):
    gap_dna = ""
    nucl_pos = 0
    for num, a in enumerate(protein):
예제 #6
0
#!/usr/bin/env python3
"""
Usage:
    ./<script>.py subset.fa droYak2_seq.fa 11

Arguments, in order: target FASTA file, query FASTA file, and k-mer length.
"""

#use previously developed FASTAReader to make the work below easier.
from fasta import FASTAReader
import sys

#add our files
# Command-line inputs.
target = FASTAReader(open(sys.argv[1]))  # target FASTA (e.g. subset.fa)
query = FASTAReader(open(sys.argv[2]))  # query FASTA
k = int(sys.argv[3])  # k-mer length

# Maps each upper-cased k-mer to a list of (record name, start offset)
# tuples, one tuple per occurrence in the target file.
target_dictionary = {}

for name_t, sequence_t in target:
    window_count = len(sequence_t) - k + 1
    for i in range(window_count):
        # Upper-case each window so matching ignores letter case.
        kmer_t = sequence_t[i:i + k].upper()
        target_dictionary.setdefault(kmer_t, []).append((name_t, i))
예제 #7
0
#!/usr/bin/env python3
"""
Usage: ./02_nt.py new_blast_output.fa aa_alignment.out week5_query.fa
"""

import sys
from fasta import FASTAReader
import numpy as np
import matplotlib.pyplot as plt
import math

# Records from the first argument: sequences and their identifiers are kept
# in two parallel lists, in file order.
dna_reader = FASTAReader(open(sys.argv[1]))
blast_dnaid = []
dna_sequence = []
for record_id, record_seq in dna_reader:
    blast_dnaid.append(record_id)
    dna_sequence.append(record_seq)

# Sequences from the second argument, in file order; identifiers discarded.
protein_reader = FASTAReader(open(sys.argv[2]))
protein_sequence = [record_seq for _, record_seq in protein_reader]

l = len(protein_sequence)  # number of records read from the second file
aligned_dna = {}
예제 #8
0
Commands:
	k		The length of the seeds used to find exact matches
"""

# ______________________________________________________________________________
# This section is copied from the kmer_matcher. Finds all matches

import sys
from fasta import FASTAReader

k = int(sys.argv[3])  # seed (k-mer) length

# target record id -> {upper-cased k-mer -> set of start offsets, as strings}
target_kmers = {}

# The context manager closes the target file once it has been indexed.
with open(sys.argv[1]) as target_file:
    for ident, sequence in FASTAReader(target_file):
        sequence = sequence.upper()
        positions_by_kmer = target_kmers[ident] = {}
        for i in range(len(sequence) - k + 1):
            kmer = sequence[i:i + k]
            positions_by_kmer.setdefault(kmer, set()).add(str(i))

# Concatenate every non-header line of the query file into one sequence.
# str.join builds the result in a single pass instead of repeatedly
# re-allocating the string with `+=`. The query is left in its original
# letter case, as before.
with open(sys.argv[2]) as query_file:
    query_seq = ''.join(
        line.strip() for line in query_file if not line.startswith('>')
    )

query_kmers = {}
예제 #9
0
#!/usr/bin/env python3

import sys
from fasta import FASTAReader
import numpy as np
import matplotlib.pyplot as plt

f = open(sys.argv[1])
f2 = open(sys.argv[2])

reader = FASTAReader(f)
reader2 = FASTAReader(f2)

# Each sequence is keyed by its 1-based position in its file rather than by
# its FASTA identifier. `count` / `count2` end up holding the number of
# records read (0 for an empty file).
count = 0
protein_seq = {}
for count, (_, seq) in enumerate(reader, start=1):
    protein_seq[count] = seq

count2 = 0
nt_seq = {}
for count2, (_, seq) in enumerate(reader2, start=1):
    nt_seq[count2] = seq

new_seq = {}
for ident in protein_seq:
    pseq = protein_seq[ident]
    nseq = nt_seq[ident]
예제 #10
0
#!/usr/bin/env python3

"""
Index the k-mers of one FASTA file and report every k-mer of a second FASTA file that occurs in that index
"""

from fasta import FASTAReader
import sys

# FASTAReader is this project's own reader class (imported from fasta).
reader1 = FASTAReader(open(sys.argv[1]))  # file whose k-mers are indexed
k = int(sys.argv[2])  # k-mer length
reader2 = FASTAReader(open(sys.argv[3]))  # file scanned against the index

# NOTE(review): the original note here read "query to 1 k to 2 and target
# to 3"; confirm which of the two files is meant to be the query.

# k-mer -> list of (start offset, record id), one entry per occurrence in
# reader1. K-mers are compared exactly as read (no case normalisation).
kmers = {}

for ident, sequence in reader1:
    for start in range(len(sequence) - k + 1):
        kmer = sequence[start:start + k]
        kmers.setdefault(kmer, []).append((start, ident))

extk = {}
for ident1, sequence1 in reader2:
    for i in range(len(sequence1) - k + 1):
        kmerQ = sequence1[i:i + k]
        if kmerQ not in kmers:
            continue
        # Offset in the scanned sequence, the k-mer, and all indexed hits.
        print(i, kmerQ, kmers[kmerQ])