def phageSequences(self, fafile=datadir + 'phage_with_host.fna'):
    '''
    Get the DNA sequences of the phages that we are interested in.

    Reads `fafile` with readFasta() and keeps every record whose id
    contains an NC_ accession that is also present in self.host. If
    self.host is empty, self.phageHost() is called first (presumably
    that is what fills self.host - confirm against its definition).

    :param fafile: path of the fasta file to read
    :return: self.fasta, with one entry per retained accession
             (accession -> whatever readFasta stored for that record)
    '''

    if not self.host:
        self.phageHost()

    fa = readFasta(fafile)
    for seqid in fa:
        # The first NC_ accession in the header identifies the phage.
        # Headers with no accession are skipped; indexing the findall()
        # result directly used to raise IndexError on them.
        m = re.search(r'NC_\d+', seqid)
        if m is None:
            continue
        accession = m.group(0)
        if accession in self.host:
            self.fasta[accession] = fa[seqid]
    return self.fasta
import os
import re

# Split a fasta file of ORFs into one file per genome inside an output
# directory. Records are appended, so re-running the script against the
# same directory adds to the existing files.

# Genome-style headers are parsed as "<gene> [<genome>] [<locus>]".
# Compiled once (raw string) instead of being rebuilt for every record.
genomeHeaderRe = re.compile(r'([\w\.\-]+)\s+\[(\w+)\]\s+\[(.*)\]')

try:
    orfF = sys.argv[1]
    dir = sys.argv[2]
    type = sys.argv[3]
except IndexError:
    # Only a missing argument should produce the usage message; a bare
    # except would also hide unrelated errors.
    sys.exit(sys.argv[0] + " <orfs file> <dir to put them> <type must be one of phage or genome>")

if type not in ['phage', 'genome']:
    sys.exit(type + " is not valid: must be either phage or genome\n")

fa = rob.readFasta(orfF)

# makedirs also creates missing parent directories; mkdir would fail on
# a nested output path.
if not os.path.exists(dir):
    os.makedirs(dir)

for i in fa:
    if type == 'genome':
        m = genomeHeaderRe.search(i)
        if m is None:
            # not a header we understand
            continue
        gene, genome, locus = m.groups()
    else:
        # Phage headers are three space-separated fields, optionally
        # followed by " COMPLEMENT".
        x = i.replace(' COMPLEMENT', '')
        m = x.split(' ')
        if len(m) != 3:
            # Skip malformed headers (as the genome branch does) instead
            # of dying with ValueError on the unpacking below.
            sys.stderr.write("Skipped, can't parse the header: " + i + "\n")
            continue
        gene, genome, locus = m

    # one output file per genome, named after the genome field
    with open(os.path.sep.join([dir, genome]), 'a') as out:
        out.write('>' + i + "\n" + fa[i] + "\n")
# Codon-counting script (fragment: the body of the final `while` loop is not
# visible in this view). What the visible statements do:
#   1. append a site-specific module directory to sys.path, then import rob
#      and robseq;
#   2. store the sorted keys of robseq.geneticCode() in `codons`;
#   3. read the fasta file named by sys.argv[1] with rob.readFasta(), exiting
#      with a usage message if the argument cannot be read;
#   4. for every record, warn on stderr and skip it when its length does not
#      look like a multiple of 3;
#   5. use the second space-separated field of the id as `locus`, and on the
#      first sighting of a locus set count[locus] to 0 and create a
#      cds[locus] table holding 0 for every codon;
#   6. start walking the sequence from position p = 0.
# NOTE(review): `.keys()` followed by `.sort()` and the `len(...)/3`
# comparison both depend on Python 2 semantics (list-returning keys(),
# integer division) - confirm the target interpreter before porting.
# NOTE(review): the bare `except:` traps every exception, not only a missing
# command-line argument.
sys.path.append('/home3/redwards/bioinformatics/Modules') import rob import robseq # this is a hash of all possible codons codons = robseq.geneticCode().keys() codons.sort() try: file = sys.argv[1] except: sys.exit(sys.argv[0] + " <fasta file of coding regions>") fa = rob.readFasta(file) count={} cds={} for id in fa: if ((1.0 * len(fa[id])) / 3) != len(fa[id])/3: sys.stderr.write("Sequence " + id + " does not appear to be a multiple of 3 nucleotides. Skipped\n") continue pieces = id.split(" ") locus = pieces[1] if locus not in count: count[locus]=0 cds[locus]={} for codon in codons: cds[locus][codon]=0 p=0 while p<len(fa[id]):
# ORF extraction script (fragment: it starts inside an enclosing block and
# ends on a dangling `else:`; `line`, `locationre`, `locationrerc`,
# `locations` and `fnaf` are defined outside this view).
# What the visible statements do:
#   - try `locationre` on `line`; if it does not match, try `locationrerc`
#     and set `complement` to True; groups 1 and 2 become `start` and `end`.
#     If neither pattern matches, a warning is written to stderr;
#   - read `fnaf` with rob.readFasta() and take the word characters that
#     follow "ref|" in each sequence id as `locus`; ids without that pattern
#     are reported on stderr and skipped;
#   - for each entry `l` in locations[locus], unpack [start, end, complement];
#     complement entries are printed with a header of the form
#     ">l locus end_start COMPLEMENT" followed by rob.rc() of the slice
#     fa[id][int(start)-1:int(end)].
# NOTE(review): in the visible code nothing assigns `start`/`end` when both
# patterns fail - check what the surrounding code does in that case.
# NOTE(review): locations[locus] is indexed without a membership check; a
# fasta id whose locus has no recorded locations would presumably raise
# KeyError - verify against how `locations` is built.
complement = False m = locationre.match(line) if m: start = m.group(1) end = m.group(2) else: m = locationrerc.match(line) if m: complement = True start = m.group(1) end = m.group(2) else: sys.stderr.write("Can't parse an apparent location at : " + line + "\n") fa = rob.readFasta(fnaf) ncre = re.compile('.*ref\|(\w+)') for id in fa: m = ncre.match(id) if not m: sys.stderr.write("No apparent NC_ idenitifer in this sequence id: " + id + "\n") continue locus = m.group(1) for l in locations[locus]: [start, end, complement] = locations[locus][l] if complement: print ">" + l + " " + locus + " " + end + "_" + start + " COMPLEMENT" print rob.rc(fa[id][int(start)-1:int(end)]) else:
# Cross-check script (fragment: the body of the final `with` block is not
# visible in this view). What the visible statements do:
#   - take the fasta file name from sys.argv[1]; when it is missing, write
#     a message to stderr and call sys.exit(0);
#   - read the fasta file with rob.readFasta();
#   - read analyzed_sequences.txt from the current directory, appending each
#     right-stripped line to `analyzed` and reporting on stderr every line
#     that is not a key of the fasta data;
#   - report on stderr every fasta id that is not in `analyzed`;
#   - create an empty `annotated` list and open annotations.txt for reading.
# NOTE(review): the missing-argument path exits with status 0, so a calling
# shell cannot tell it from success - confirm whether that is intended.
# NOTE(review): `f not in analyzed` scans a list for every fasta id; a set
# would avoid the quadratic cost if the inputs are large.
import rob import sys # 1404927386.fasta analyzed_sequences.txt annotations.txt # faf = None try: faf = sys.argv[1] except IndexError: sys.stderr.write("Please provide a fasta file\n") sys.exit(0) fa = rob.readFasta(faf) analyzed = [] with open('analyzed_sequences.txt', 'r') as asf: for line in asf: pieces = line.rstrip() analyzed.append(pieces) if pieces not in fa: sys.stderr.write(pieces + " has been analyzed but is not in " + faf + "\n") for f in fa: if f not in analyzed: sys.stderr.write("NOT ANALYZED: " + f + "\n") annotated = [] with open('annotations.txt', 'r') as asf:
import os
import re
import sys

import rob

# Write every ORF of a fasta file into a per-genome file inside the given
# directory. Output files are opened in append mode, so running the script
# twice against the same directory duplicates the records.

# Header layout expected for type 'genome': "<gene> [<genome>] [<locus>]".
# Raw string, compiled once rather than on every iteration.
genomeHeaderRe = re.compile(r'([\w\.\-]+)\s+\[(\w+)\]\s+\[(.*)\]')

try:
    orfF = sys.argv[1]
    dir = sys.argv[2]
    type = sys.argv[3]
except IndexError:
    # A missing argument is the only failure expected here, so catch just
    # that instead of using a bare except.
    sys.exit(sys.argv[0] + " <orfs file> <dir to put them> <type must be one of phage or genome>")

if type not in ['phage', 'genome']:
    sys.exit(type + " is not valid: must be either phage or genome\n")

fa = rob.readFasta(orfF)

# makedirs copes with an output path whose parents do not exist yet.
if not os.path.exists(dir):
    os.makedirs(dir)

for i in fa:
    if type == 'genome':
        m = genomeHeaderRe.search(i)
        if m is None:
            # header does not follow the genome layout: ignore the record
            continue
        gene, genome, locus = m.groups()
    else:
        # type 'phage': three space-separated fields, with an optional
        # trailing " COMPLEMENT" marker that is dropped before splitting.
        x = i.replace(' COMPLEMENT', '')
        m = x.split(' ')
        if len(m) != 3:
            # A header with any other number of fields used to abort the
            # whole run with ValueError; report it and carry on instead.
            sys.stderr.write("Skipped, can't parse the header: " + i + "\n")
            continue
        gene, genome, locus = m

    # the genome field names the output file
    with open(os.path.sep.join([dir, genome]), 'a') as out:
        out.write('>' + i + "\n" + fa[i] + "\n")
# ORF extraction script (fragment: it starts inside an enclosing block and
# stops after the complement header is printed; `line`, `locationre`,
# `locationrerc`, `locations` and `fnaf` are defined outside this view).
# What the visible statements do:
#   - try `locationre` on `line`; if it does not match, try `locationrerc`
#     and set `complement` to True; groups 1 and 2 become `start` and `end`.
#     If neither pattern matches, a warning is written to stderr;
#   - read `fnaf` with rob.readFasta() and extract an "NC_<digits>"
#     accession from each sequence id as `locus`; ids that do not match are
#     reported on stderr and skipped;
#   - for each entry `l` in locations[locus], unpack [start, end, complement]
#     and, for complement entries, print a header of the form
#     ">l locus end_start COMPLEMENT".
# NOTE(review): `ncre.match()` only matches at the very start of the id. The
# commented-out pattern began with `.*`, so it tolerated a prefix; if the ids
# can carry anything before the accession, `ncre.search()` is probably what
# is wanted - confirm against the real fasta headers.
# NOTE(review): in the visible code nothing assigns `start`/`end` when both
# location patterns fail - check what the surrounding code does in that case.
m = locationre.match(line) if m: start = m.group(1) end = m.group(2) else: m = locationrerc.match(line) if m: complement = True start = m.group(1) end = m.group(2) else: sys.stderr.write("Can't parse an apparent location at : " + line + "\n") fa = rob.readFasta(fnaf) #ncre = re.compile('.*ref\|(\w+)') ncre = re.compile('(NC_\d+)') for id in fa: m = ncre.match(id) if not m: sys.stderr.write("No apparent NC_ idenitifer in this sequence id: " + id + "\n") continue locus = m.group(1) for l in locations[locus]: [start, end, complement] = locations[locus][l] if complement: print ">" + l + " " + locus + " " + end + "_" + start + " COMPLEMENT"
def longestCommonSubstring(s1, s2):
    '''
    Return the longest substring shared by s1 and s2.

    Dynamic programming, after the wikibooks implementation: cell (x, y)
    holds the length of the common run that ends at s1[x-1] and s2[y-1].
    Only the previous row is ever consulted, so two rows are kept instead
    of the full len(s1) x len(s2) matrix.

    :param s1: first sequence (a string or any sliceable sequence)
    :param s2: second sequence
    :return: the matching slice of s1; the empty slice when nothing is
             shared. Among equally long candidates the one that ends
             first in s1 wins.
    '''

    width = 1 + len(s2)
    previous = [0] * width
    longest, x_longest = 0, 0
    for x, c1 in enumerate(s1, 1):
        current = [0] * width
        for y, c2 in enumerate(s2, 1):
            if c1 == c2:
                # extend the run that ended on the previous diagonal cell
                current[y] = previous[y - 1] + 1
                if current[y] > longest:
                    longest = current[y]
                    x_longest = x
        previous = current
    return s1[x_longest - longest: x_longest]


# Driver: compare every sequence of file1 with every sequence of file2, in
# both orientations, and report the longest shared substring. file1 and
# file2 are set earlier in the script. The guard changes nothing when the
# file is run as a script; it only keeps the comparison from starting when
# the function is imported.
if __name__ == "__main__":
    fa1 = rob.readFasta(file1)
    fa2 = rob.readFasta(file2)

    longest = ""
    for id1 in fa1.keys():
        # the reverse complement depends on id1 only: compute it once
        rc1 = rob.rc(fa1[id1])
        for id2 in fa2.keys():
            sys.stderr.write("Comparing " + id1 + " to " + id2 + "\n")
            test = longestCommonSubstring(fa1[id1], fa2[id2])
            if len(test) > len(longest):
                longest = test
            sys.stderr.write("Comparing rc " + id1 + " to " + id2 + "\n")
            test = longestCommonSubstring(rc1, fa2[id2])
            if len(test) > len(longest):
                longest = test

    # str.join() accepts strings only; the bare len() used to raise TypeError
    sys.stdout.write("\t".join([file1, file2, str(len(longest)), longest]) + "\n")
import sys

import rob

# Cross-check a fasta file against analyzed_sequences.txt (read from the
# current directory): report ids that were analyzed but are missing from the
# fasta file, and fasta ids that were never analyzed.
# File names this was written around:
# 1404927386.fasta  analyzed_sequences.txt  annotations.txt

faf = None
try:
    faf = sys.argv[1]
except IndexError:
    # sys.exit(message) prints the message on stderr and exits with status 1,
    # so a calling shell can tell the usage error from a successful run.
    sys.exit("Please provide a fasta file")

fa = rob.readFasta(faf)

# ids listed in analyzed_sequences.txt, one per line, in file order
analyzed = []
with open('analyzed_sequences.txt', 'r') as asf:
    for line in asf:
        pieces = line.rstrip()
        analyzed.append(pieces)
        if pieces not in fa:
            sys.stderr.write(pieces + " has been analyzed but is not in " + faf + "\n")

# Membership is tested once per fasta record: use a set so the check stays
# linear. The list is kept as well, in its original order.
analyzedIds = set(analyzed)
for f in fa:
    if f not in analyzedIds:
        sys.stderr.write("NOT ANALYZED: " + f + "\n")