Exemplo n.º 1
0
    def phageSequences(self, fafile=datadir + 'phage_with_host.fna'):
        '''Return the DNA sequences of the phages that have an entry in self.host.

        The fasta file ``fafile`` is read with ``readFasta``. Every sequence
        whose header contains an ``NC_<digits>`` accession that is present in
        ``self.host`` is stored in ``self.fasta`` under that accession.
        ``self.phageHost()`` is called first when ``self.host`` is empty.

        :param fafile: path of the fasta file to read.
        :return: ``self.fasta`` (accession -> sequence).
        '''
        if not self.host:
            self.phageHost()

        accession = re.compile(r'(NC_\d+)')
        fa = readFasta(fafile)
        for seqid in fa:
            m = accession.search(seqid)
            # A header without an NC_ accession cannot be looked up in
            # self.host, so skip it instead of failing on an empty match.
            if m is None or m.group(1) not in self.host:
                continue
            self.fasta[m.group(1)] = fa[seqid]
        return self.fasta
Exemplo n.º 2
0
import os
import re

# Split a fasta file of ORFs into one file per genome inside an output
# directory. Usage: <orfs file> <dir to put them> <phage|genome>
try:
    orfF = sys.argv[1]
    dir = sys.argv[2]
    type = sys.argv[3]
except IndexError:
    # indexing sys.argv can only fail with IndexError (too few arguments)
    sys.exit(
        sys.argv[0] +
        " <orfs file> <dir to put them> <type must be one of phage or genome>")

if type not in ['phage', 'genome']:
    sys.exit(type + " is not valid: must be either phage or genome\n")

fa = rob.readFasta(orfF)
if not os.path.exists(dir):
    os.mkdir(dir)

# genome headers look like: <gene> [<genome>] [<locus>]
genomere = re.compile(r'([\w\.\-]+)\s+\[(\w+)\]\s+\[(.*)\]')

for i in fa:
    if type == 'genome':
        m = genomere.findall(i)
        if m == []:
            continue
        gene, genome, locus = m[0]
    else:
        # phage headers are "<gene> <genome> <locus>" with an optional
        # trailing " COMPLEMENT" marker
        m = i.replace(' COMPLEMENT', '').split(' ')
        if len(m) != 3:
            sys.stderr.write("Can't parse gene, genome and locus from: " +
                             i + "\n")
            continue
        gene, genome, locus = m
    # Written for both types: previously this sat inside the else branch, so
    # sequences parsed in 'genome' mode were never written out.
    with open(os.path.sep.join([dir, genome]), 'a') as out:
        out.write('>' + i + "\n" + fa[i] + "\n")
Exemplo n.º 3
0
sys.path.append('/home3/redwards/bioinformatics/Modules')
import rob
import robseq

# Sorted list of the keys returned by robseq.geneticCode() (presumably every
# possible codon -- confirm in robseq).
# NOTE(review): calling .sort() on the result of .keys() only works where
# keys() returns a list (Python 2).
codons = robseq.geneticCode().keys()
codons.sort()

# The only command line argument is the fasta file of coding regions.
try:
    file = sys.argv[1]
except:
    sys.exit(sys.argv[0] + " <fasta file of coding regions>")



fa = rob.readFasta(file)
# count and cds are both keyed by locus; cds[locus] maps each codon to a tally
count={}
cds={}
for id in fa:
    # Skip sequences whose length is not a multiple of three.
    # NOTE(review): the comparison relies on `/` being integer division on the
    # right-hand side (Python 2); under true division it is never unequal.
    if ((1.0 * len(fa[id])) / 3) != len(fa[id])/3:
        sys.stderr.write("Sequence " + id + " does not appear to be a multiple of 3 nucleotides. Skipped\n")
        continue
    # the second space-separated field of the sequence id is used as the locus
    pieces = id.split(" ")
    locus = pieces[1]
    # first time this locus is seen: start its counter and every codon at zero
    if locus not in count:
        count[locus]=0
        cds[locus]={}
        for codon in codons:
            cds[locus][codon]=0
    # walk along the sequence starting from position 0
    p=0
    while p<len(fa[id]):
Exemplo n.º 4
0
        # assume the forward strand until the complement pattern matches
        complement = False

        # try the locationre pattern first; groups 1 and 2 are start and end
        m = locationre.match(line)
        if m:
            start = m.group(1)
            end = m.group(2)
        else:
            # fall back to locationrerc; a match here flags the complement
            m = locationrerc.match(line)
            if m:
                complement = True
                start = m.group(1)
                end = m.group(2)
            else:
                # Neither pattern matched: report the line on stderr.
                # NOTE(review): start and end are not reset in this branch, so
                # they keep whatever value they had before -- confirm that the
                # code that follows tolerates this.
                sys.stderr.write("Can't parse an apparent location at : " + line + "\n")

# Read the sequences and print one fasta record per stored location.
fa = rob.readFasta(fnaf)

# captures the word characters that follow "ref|" in a sequence id
ncre = re.compile('.*ref\|(\w+)')
for id in fa:
    m = ncre.match(id)
    if not m:
        sys.stderr.write("No apparent NC_ idenitifer in this sequence id: " + id + "\n")
        continue

    locus = m.group(1)
    # NOTE(review): locations[locus] raises KeyError when the locus has no
    # entry in locations -- confirm every id in the fasta file has one.
    for l in locations[locus]:
        # each entry unpacks to start, end and a complement flag
        [start, end, complement] = locations[locus][l]
        if complement:
            # complement: the header lists end before start, and the slice
            # [start-1:end] is passed through rob.rc (presumably the reverse
            # complement -- confirm in rob)
            print ">" + l + " "  + locus + " " + end + "_" + start + " COMPLEMENT"
            print rob.rc(fa[id][int(start)-1:int(end)])
        else:
Exemplo n.º 5
0
import rob
import sys

# 1404927386.fasta  analyzed_sequences.txt  annotations.txt
#

# Cross-check the ids listed in analyzed_sequences.txt against the ids of the
# fasta file given on the command line, reporting mismatches on stderr.
faf = None

try:
    faf = sys.argv[1]
except IndexError:
    sys.stderr.write("Please provide a fasta file\n")
    sys.exit(0)

fa = rob.readFasta(faf)

# one id per line, trailing whitespace removed
analyzed = []
with open('analyzed_sequences.txt', 'r') as asf:
    for line in asf:
        pieces = line.rstrip()
        analyzed.append(pieces)
        if pieces not in fa:
            sys.stderr.write(pieces + " has been analyzed but is not in " +
                             faf + "\n")

# Build the set once so each membership test below is O(1) instead of a scan
# of the whole list; the analyzed list itself is left untouched.
analyzed_ids = set(analyzed)
for f in fa:
    if f not in analyzed_ids:
        sys.stderr.write("NOT ANALYZED: " + f + "\n")

annotated = []
with open('annotations.txt', 'r') as asf:
import rob
import sys
import os
import re

# Split a fasta file of ORFs into one file per genome inside an output
# directory. Usage: <orfs file> <dir to put them> <phage|genome>
try:
    orfF = sys.argv[1]
    dir  = sys.argv[2]
    type = sys.argv[3]
except IndexError:
    # indexing sys.argv can only fail with IndexError (too few arguments)
    sys.exit(sys.argv[0] + " <orfs file> <dir to put them> <type must be one of phage or genome>")

if type not in ['phage', 'genome']:
    sys.exit(type + " is not valid: must be either phage or genome\n")

fa = rob.readFasta(orfF)
if not os.path.exists(dir):
    os.mkdir(dir)

# genome headers look like: <gene> [<genome>] [<locus>]
genomere = re.compile(r'([\w\.\-]+)\s+\[(\w+)\]\s+\[(.*)\]')

for i in fa:
    if type == 'genome':
        m = genomere.findall(i)
        if m == []:
            continue
        gene, genome, locus = m[0]
    else:
        # phage headers are "<gene> <genome> <locus>" with an optional
        # trailing " COMPLEMENT" marker
        m = i.replace(' COMPLEMENT', '').split(' ')
        if len(m) != 3:
            sys.stderr.write("Can't parse gene, genome and locus from: " +
                             i + "\n")
            continue
        gene, genome, locus = m
    # Written for both types: previously this sat inside the else branch, so
    # sequences parsed in 'genome' mode were never written out.
    with open(os.path.sep.join([dir, genome]), 'a') as out:
        out.write('>' + i + "\n" + fa[i] + "\n")
Exemplo n.º 7
0
        # try the locationre pattern first; groups 1 and 2 are start and end
        m = locationre.match(line)
        if m:
            start = m.group(1)
            end = m.group(2)
        else:
            # fall back to locationrerc; a match here flags the complement
            m = locationrerc.match(line)
            if m:
                complement = True
                start = m.group(1)
                end = m.group(2)
            else:
                # Neither pattern matched: report the line on stderr.
                # NOTE(review): start and end are not reset in this branch, so
                # they keep whatever value they had before -- confirm that the
                # code that follows tolerates this.
                sys.stderr.write("Can't parse an apparent location at : " +
                                 line + "\n")

# Read the sequences and print one fasta header per stored location.
fa = rob.readFasta(fnaf)

# previous pattern, kept for reference:
#ncre = re.compile('.*ref\|(\w+)')
# NOTE(review): the pattern is applied with .match(), which is anchored at the
# start of the string, so only ids that begin with NC_ are accepted -- confirm
# that this is the id format produced for fnaf.
ncre = re.compile('(NC_\d+)')
for id in fa:
    m = ncre.match(id)
    if not m:
        sys.stderr.write("No apparent NC_ idenitifer in this sequence id: " +
                         id + "\n")
        continue

    locus = m.group(1)
    # NOTE(review): locations[locus] raises KeyError when the locus has no
    # entry in locations -- confirm every id in the fasta file has one.
    for l in locations[locus]:
        # each entry unpacks to start, end and a complement flag
        [start, end, complement] = locations[locus][l]
        if complement:
            # complement: the header lists end before start
            print ">" + l + " " + locus + " " + end + "_" + start + " COMPLEMENT"
def longestCommonSubstring(s1, s2):
    '''Return the longest substring that occurs in both s1 and s2.

    Dynamic programming approach adapted from the wikibooks page: for each
    pair of positions, track the length of the common run ending there. Only
    the previous and the current row of the table are kept, so memory use is
    proportional to len(s2) rather than len(s1) * len(s2).

    When several common substrings share the maximum length, the one that
    ends first in s1 is returned. An empty string is returned when there is
    nothing in common (including when either input is empty).
    '''
    width = 1 + len(s2)
    previous = [0] * width
    longest, x_longest = 0, 0
    for x, c1 in enumerate(s1, 1):
        current = [0] * width
        for y, c2 in enumerate(s2, 1):
            if c1 == c2:
                # extend the run that ended at the previous position of both
                run = previous[y - 1] + 1
                current[y] = run
                # strict comparison keeps the earliest of equally long runs
                if run > longest:
                    longest = run
                    x_longest = x
        previous = current
    return s1[x_longest - longest: x_longest]

# Compare every sequence of file1 (both as read and passed through rob.rc)
# with every sequence of file2 and report the longest common substring found.
fa1 = rob.readFasta(file1)
fa2 = rob.readFasta(file2)

longest = ""
for id1 in fa1:
    forward = fa1[id1]
    # computed once per id1 instead of once per (id1, id2) pair
    reverse = rob.rc(forward)
    for id2 in fa2:
        sys.stderr.write("Comparing " + id1 + " to " + id2 + "\n")
        test = longestCommonSubstring(forward, fa2[id2])
        if len(test) > len(longest):
            longest = test
        sys.stderr.write("Comparing rc " + id1 + " to " + id2 + "\n")
        test = longestCommonSubstring(reverse, fa2[id2])
        if len(test) > len(longest):
            longest = test

# str.join only accepts strings, so the length has to be converted first;
# joining the bare integer raised a TypeError.
sys.stdout.write("\t".join([file1, file2, str(len(longest)), longest]) + "\n")
Exemplo n.º 9
0
import rob
import sys

# 1404927386.fasta  analyzed_sequences.txt  annotations.txt
#

# Cross-check the ids listed in analyzed_sequences.txt against the ids of the
# fasta file given on the command line, reporting mismatches on stderr.
faf = None

try:
    faf = sys.argv[1]
except IndexError:
    sys.stderr.write("Please provide a fasta file\n")
    sys.exit(0)


fa = rob.readFasta(faf)


# one id per line, trailing whitespace removed
analyzed = []
with open('analyzed_sequences.txt', 'r') as asf:
    for line in asf:
        pieces = line.rstrip()
        analyzed.append(pieces)
        if pieces not in fa:
            sys.stderr.write(pieces + " has been analyzed but is not in " + faf + "\n")

# Build the set once so each membership test below is O(1) instead of a scan
# of the whole list; the analyzed list itself is left untouched.
analyzed_ids = set(analyzed)
for f in fa:
    if f not in analyzed_ids:
        sys.stderr.write("NOT ANALYZED: " + f + "\n")