#!/usr/bin/env python3 from fasta import FASTAReader import sys target = FASTAReader(open(sys.argv[1])) query = FASTAReader(open(sys.argv[2])) k = int(sys.argv[3]) kmers = {} for ident1, sequence1 in target: sequence1 = sequence1.upper() for i in range(0, len(sequence1) - k + 1): kmer1 = sequence1[i:i + k] if kmer1 in kmers: kmers[kmer1].append((ident1, i)) else: kmers[kmer1] = [(ident1, i)] for ident2, sequence2 in query: sequence2 = sequence2.upper() for a in range(0, len(sequence2) - k + 1): kmer2 = sequence2[a:a + k] if kmer2 in kmers: print(kmers[kmer2], str(a), kmer2)
#!/usr/bin/env python3
"""Print contig-length statistics for FASTA records read from stdin.

One value is printed per line, in this order: number of contigs, shortest
length, longest length, mean length, half of the total length.
"""
import sys


def length_summary(lengths):
    """Return ``(count, shortest, longest, mean, half_total)`` for *lengths*.

    lengths -- non-empty list of contig lengths

    Raises ValueError when *lengths* is empty, because none of the
    statistics is defined in that case.
    """
    if not lengths:
        raise ValueError("length_summary() needs at least one contig length")
    total = sum(lengths)
    return len(lengths), min(lengths), max(lengths), total / len(lengths), total / 2


def n50(lengths):
    """Return the N50 of *lengths*, or None when *lengths* is empty.

    Lengths are visited from longest to shortest; the result is the length
    at which the running total first reaches half of the summed length.
    """
    half_total = sum(lengths) / 2  # computed once rather than per iteration
    running = 0
    for length in sorted(lengths, reverse=True):
        running += length
        if running >= half_total:
            return length
    return None


if __name__ == "__main__":
    # fasta is not a standard-library module; importing it here keeps the
    # helpers above importable on their own.
    from fasta import FASTAReader

    # Only the lengths are needed, so the sequences themselves are not kept
    # (and not sorted: no statistic below depends on their order).
    sequence_length = [len(sequence) for _ident, sequence in FASTAReader(sys.stdin)]

    if not sequence_length:
        # Still report the contig count, then stop with a readable message
        # instead of a traceback from min() on an empty list.
        print(0)
        sys.exit("no FASTA records on stdin; nothing to summarise")

    for value in length_summary(sequence_length):
        print(value)

    # NOTE(review): the visible script locates the N50 contig but never
    # reports it - presumably a print followed in a part that is not shown.
    # The value is computed here and stdout is deliberately left unchanged.
    n50_length = n50(sequence_length)
#!/usr/bin/env python3 """match extender""" from fasta import FASTAReader import sys target = FASTAReader(open(sys.argv[1])) #subset.fa query = FASTAReader(open(sys.argv[2])) #droYak2_seq.fa k = int(sys.argv[3]) kmers_from_target = {} target_sequence = {} for ident, sequence in target: sequence = sequence.upper() target_sequence[ident] = sequence for i in range(0, len(sequence) - k + 1): kmer = sequence[i:i + k] if kmer in kmers_from_target: kmers_from_target[kmer].append((ident, i)) else: kmers_from_target[kmer] = [(ident, i)] elongated_seq = [] for ident, sequence1 in query: sequence1 = sequence1.upper() for i in range(0, len(sequence) - k + 1): kmer = sequence1[i:i + k] if kmer in kmers_from_target: for ident, j in kmers_from_target[kmer]:
#!/usr/bin/env python3 import sys from fasta import FASTAReader import math import matplotlib.pyplot as plt import statistics as stats reader = FASTAReader(open(sys.argv[1])) #translated.out protein_seq = [] for ident, sequence1 in reader: protein_seq.append(sequence1) #print (protein_seq) reader2 = FASTAReader(open(sys.argv[2])) #3_new_blast_output nt_seq = [] for ident, sequence2 in reader2: nt_seq.append(sequence2) #print (nt_seq) list1 = [] for sequence, protein in zip(nt_seq, protein_seq): dna1 = "" nt_pos = 0 for num, j in enumerate(protein): # print(a) if j == "-": dna1 = dna1 + "---"
#!/usr/bin/env python3 import sys from fasta import FASTAReader import math import matplotlib.pyplot as plt import statistics as stats reader = FASTAReader(open(sys.argv[1])) reader2 = FASTAReader(open(sys.argv[2])) my_prot_sequence = [] #protein for ident, sequence in reader: my_prot_sequence.append(sequence) #print(my_sequence) #quit() my_sequence_nt = [] #nucleotide for ident2, sequence2 in reader2: my_sequence_nt.append(sequence2) #print(my_sequence_nt) newlist = [] #for m in my_sequence_nt: #dna = my_sequence_nt[m]#you only need to do it in the query sequence #store by sequence identity in dictionary # for i in range(len(my_prot_sequence)): #dna= str(dna) for sequence, protein in zip(my_sequence_nt, my_prot_sequence): gap_dna = "" nucl_pos = 0 for num, a in enumerate(protein):
#!/usr/bin/env python3
"""
Build a k-mer index of the target FASTA file.

Run as: <script> subset.fa droYak2_seq.fa 11
Arguments, in order: target file, query file, k-mer length.
"""
# FASTAReader is the FASTA parser written earlier; it yields (name, sequence).
from fasta import FASTAReader
import sys

# Command-line inputs.
target = FASTAReader(open(sys.argv[1]))  # target records (e.g. subset.fa)
query = FASTAReader(open(sys.argv[2]))   # query records
k = int(sys.argv[3])                     # k-mer length

# Upper-cased k-mer -> list of (target sequence name, start offset) tuples.
target_dictionary = {}

for name_t, sequence_t in target:
    last_start = len(sequence_t) - k
    for i in range(last_start + 1):
        # Upper-case the window so the index keys are all capitals.
        kmer_t = sequence_t[i:i + k].upper()
        # First sighting creates the list; later sightings extend it.
        target_dictionary.setdefault(kmer_t, []).append((name_t, i))
#!/usr/bin/env python3
"""
Usage: ./02_nt.py new_blast_output.fa aa_alignment.out week5_query.fa

Reads the DNA records named by the first argument and the protein records
named by the second argument into parallel lists.
"""
import sys
from fasta import FASTAReader
import numpy as np
import matplotlib.pyplot as plt
import math

# DNA records: identifiers and sequences are kept in two lists that share
# the same order as the input file.
dna_reader = FASTAReader(open(sys.argv[1]))
dna_records = list(dna_reader)
blast_dnaid = [ident for ident, _ in dna_records]
dna_sequence = [sequence for _, sequence in dna_records]

# Protein records: only the sequences are kept.
protein_reader = FASTAReader(open(sys.argv[2]))
protein_sequence = [sequence for _, sequence in protein_reader]

l = len(protein_sequence)  # number of protein records read
aligned_dna = {}           # starts empty; nothing is stored in it here
Commands: k The length of the seeds used to find exact matches """ # ______________________________________________________________________________ # This section is copied from the kmer_matcher. Finds all matches import sys from fasta import FASTAReader k = int(sys.argv[3]) target_kmers = {} for ident, sequence in FASTAReader(open(sys.argv[1])): target_kmers[ident] = {} sequence = sequence.upper() for i in range(0, len(sequence) - k + 1): kmer = sequence[i:i + k] target_kmers[ident].setdefault(kmer, set()) target_kmers[ident][kmer].add(str(i)) query_file = open(sys.argv[2]) query_seq = '' for line in query_file: if line.startswith('>'): continue query_seq += line.strip() query_kmers = {}
#!/usr/bin/env python3 import sys from fasta import FASTAReader import numpy as np import matplotlib.pyplot as plt f = open(sys.argv[1]) f2 = open(sys.argv[2]) reader = FASTAReader(f) reader2 = FASTAReader(f2) count = 0 protein_seq = {} for ident, seq in reader: count += 1 ident = count protein_seq[ident] = seq count2 = 0 nt_seq = {} for ident, seq in reader2: count2 += 1 ident = count2 nt_seq[ident] = seq new_seq = {} for ident in protein_seq: pseq = protein_seq[ident] nseq = nt_seq[ident]
#!/usr/bin/env python3 """ Count all kmers in a FASTA file """ from fasta import FASTAReader import sys reader1 = FASTAReader( open(sys.argv[1]) ) #This is a function that returns an object AND was generated by us k = int(sys.argv[2]) reader2 = FASTAReader( open(sys.argv[3]) ) #query to 1 k to 2 and target to 3 kmers = {} for ident, sequence in reader1: for i in range( 0, len(sequence) - k + 1 ): kmer = sequence[i:i+k] if kmer in kmers: kmers[kmer].append((i, ident)) else: kmers[kmer] = [(i, ident)] extk = {} for ident1, sequence1 in reader2: for i in range( 0, len(sequence1) - k + 1 ): kmerQ = sequence1[i:i+k] if kmerQ in kmers: print(i, kmerQ, kmers[kmerQ])