def Initialize(): header,seq = fasta.load('MHC_hg18.fa') sixFrameIter = sequence.sixFrameTranslationIter(seq) writer = fasta.MfaWriter('6frames.fa') for frame,p in sixFrameIter: print 'Frame:', frame writer.write('%s:%i' % (header,frame),p) writer.close() sys.exit()
def Initialize(): header, seq = fasta.load('MHC_hg18.fa') sixFrameIter = sequence.sixFrameTranslationIter(seq) writer = fasta.MfaWriter('6frames.fa') for frame, p in sixFrameIter: print 'Frame:', frame writer.write('%s:%i' % (header, frame), p) writer.close() sys.exit()
def loadCdsSeq(codingSeqFnL):
    '''Load coding sequences from several fasta files into one dictionary.

    codingSeqFnL is a list of file names (described by the author as NCBI
    coding sequence .fna files).  Each header must contain the text
    "locus_tag="; whatever lies between it and the next "]" is used as the
    dictionary key, and the sequence is the value.  When a locus tag occurs
    more than once, the sequence read last is kept.  A header without
    "locus_tag=" raises IndexError.
    '''
    tagToSeq = {}
    for fileName in codingSeqFnL:
        for hdr, cds in fasta.load(fileName):
            afterMarker = hdr.split('locus_tag=')[1]
            tag = afterMarker.split(']')[0]
            tagToSeq[tag] = cds
    return tagToSeq
def loadProt(protFnL):
    '''Collect the sequences of several fasta files into one dictionary.

    protFnL is a list of fasta file names.  The key for each sequence is
    the first whitespace-delimited word of its header with the first
    character dropped (presumably the leading ">" -- confirm against
    fasta.load); the author refers to this as the gene / protein name.
    When a key occurs more than once, the sequence read last is kept.
    '''
    nameToSeq = {}
    for fileName in protFnL:
        for hdr, protein in fasta.load(fileName):
            firstWord = hdr.split()[0]
            nameToSeq[firstWord[1:]] = protein
    return nameToSeq
def loadProt(protFnL):
    '''Collect the sequences of several fasta files into one dictionary.

    protFnL is a list of fasta file names.  The key for each sequence is
    the first whitespace-delimited word of its header with the first
    character dropped (presumably the leading ">" -- confirm against
    fasta.load); the author refers to this as the gene / protein name.
    When a key occurs more than once, the sequence read last is kept.
    '''
    nameToSeq = {}
    for fileName in protFnL:
        for hdr, protein in fasta.load(fileName):
            firstWord = hdr.split()[0]
            nameToSeq[firstWord[1:]] = protein
    return nameToSeq
def createGeneSpeciesMap(protFile): f=open(protFile,'r') while True: s=f.readline() if s=='': break s=s.rstrip('\n') L=s.rstrip().split('/') k=L[2].split('.')[0] print k, strainInfo=fasta.load(s) for gene in strainInfo: q=gene[0] q=q[1:] q=q.split() q=q[0] print q, print return
except: print """ Your input files cannot be found. """ sys.exit(2) # Read in the cd-hit clustered file try: cluster_list = cluster_file.read() except: print 'Cannot open', cluster_file, '\n' sys.exit(2) # Read in the FASTA file and check to make sure it's in FASTA format try: fasta_dict_raw = fasta.load(fasta_file) except: print '\n', sys.argv[2], 'does not appear to be a fasta file\n' sys.exit(2) fasta_dict = {} # Just take everything before the first space in the first line of the FASTA file as # the key. This is also how cd-hit takes the name, so the keys will match. for fasta_key in fasta_dict_raw: new_fasta = fasta_key.split(' ') new_key = new_fasta[0] fasta_dict[new_key] = fasta_dict_raw[fasta_key]
usage = "%prog [options] <gff file> <fasta reference sequence>" parser = OptionParser(usage=usage) parser.add_option("-o", "--output", dest="oFilename", help="Output filename", default=None) parser.add_option("-f", "--features", dest="features", help="Features to extract", default=['exon']) options, args = parser.parse_args(sys.argv) if len(args)!=3: sys.exit(__doc__) gffFilename = args[1] faFilename = args[2] data = gff.load(gffFilename) header,seq = fasta.load(faFilename) if options.oFilename: oFile = open(options.oFilename, 'w') else: oFile = sys.stdout writer = fasta.MfaWriter(oFile) for name in data: s = [] extrema = [] for f in data[name]: if f.type in options.features: if f.strand=='+': start,end = f.start,f.end _seq = seq[start-1:end]
action="store_true", dest="complement", help="Complement sequence", default=False) parser.add_option( "-b", "--reverseComplement", "--revComp", action="store_true", dest="reverseComplement", help="Reverse complement sequence", default=False) options, args = parser.parse_args(sys.argv) iFilename = args[1] start = int(args[2]) end = int(args[3]) header,seq = fasta.load(iFilename) s = seq[start-1:end] h = '%s %i-%i' % (header,start,end) if options.reverse: s = sequence.reverse(s) h += '(r)' elif options.complement: s = sequence.complement(s) h += '(c)' elif options.reverseComplement: s = sequence.reverse_complement(s) h += '(rc)' fasta.pretty(h, s, width=options.width)
if __name__ == "__main__": if len(sys.argv) !=4: sys.stderr.write(""" Usage: python seedAlign.fa protDB.fa outfile.txt """) sys.exit(-1) seedAlignFN = sys.argv[1] dbSeqFN = sys.argv[2] outFN = sys.argv[3] # create profile HMM based on seed alignment seedAlignL = [seq for header,seq in fasta.load(seedAlignFN)] # load db dbL = fasta.load(dbSeqFN) outL=[] model=hmmmodel.HmmModel(seedAlignFN) for hd,sseq in dbL: # load db print("aligning: "+hd) alignment= hmmAlign.HmmAlign(sseq,model) score=alignment.subtractShuffleMean() outL.append((score,hd))
#!/usr/bin/python # chrom.py import sys import fasta assembly_file = sys.argv[1] contigs_file = sys.argv[2] contigs = {} for f in fasta.load(open(contigs_file)): contigs[f.name] = f chrom = "chrI" print ">" + chrom for line in open(assembly_file): fields = line.strip().split() s, e = int(fields[1]), int(fields[2]) l = e - s if fields[3][0:3] == "gap": sys.stdout.write("N" * l) else: strand = fields[5] if strand == "+": sys.stdout.write(contigs[fields[3]].seq) elif strand == "-": sys.stdout.write(contigs[fields[3]].reverse_complement().seq) elif strand == ".": sys.stdout.write(contigs[fields[3]].seq)
sys.exit(2) # Check to make sure the input file exists and can be opened try: fasta_file = open(filename) except: print "This file could not be opened" sys.exit(2) # Check to make sure the input file is in FASTA format try: fasta_data = fasta.load(fasta_file) except: print 'This file does not seem to be a fasta file. Please try again with a fasta file' sys.exit(0) # Write out the FASTA input file to a new file, because CD-HIT doesn't handle # all input file types correctly new_fasta_file = dirname+'/tmp/input_fasta_file.fa' try: input_fasta_file = open(new_fasta_file, 'wt') except: print 'Cannot open', new_fasta_file, 'for writing a temporary fasta file'
#!/usr/bin/env python """ testHmmer.py Author: Tony Papenfuss Date: Fri Sep 1 09:09:02 EST 2006 """ import os, sys, re import hmmer4, fasta, sequence h,s = fasta.load('seq/HLA-A.fa') L = len(s) if False: domains = hmmer4.load_domains('hmmer/6frames.txt') for d in domains: p = hmmer4.parseSixFrameHeader(d.accession) print d print p.name, p.frame gStart,gEnd,strand = hmmer4.convert6FrameToGenomic(d.sStart,d.sEnd,p.frame,L) print gStart,gEnd,strand if strand=='+': dna = s[gStart-1:gEnd] print len(dna), len(dna) % 3==0 print sequence.codons(dna, remainder=True) print sequence.translate(dna) else:
except: print """ Your input files cannot be found. """ sys.exit(2) # Read in the cd-hit clustered file try: cluster_list = cluster_file.read() except: print 'Cannot open', cluster_file, '\n' sys.exit(2) # Read in the FASTA file and check to make sure it's in FASTA format try: fasta_dict_raw = fasta.load(fasta_file) except: print '\n', sys.argv[2], 'does not appear to be a fasta file\n' sys.exit(2) fasta_dict = {} # Just take everything before the first space in the first line of the FASTA file as # the key. This is also how cd-hit takes the name, so the keys will match. for fasta_key in fasta_dict_raw: new_fasta = fasta_key.split(' ') new_key = new_fasta[0] fasta_dict[new_key] = fasta_dict_raw[fasta_key] # Output file for the list of all the sequences in each cluster
#!/usr/bin/env python """ extractORFs.py Author: Tony Papenfuss Date: Wed Aug 23 08:52:58 EST 2006 """ import os, sys import re, copy import fasta, sequence, hmmer3 seqFilename = sys.argv[1] header, seq = fasta.load(seqFilename) header = header.split()[0] L = len(seq) pattern = re.compile('\*') minLen = 10 sixFrameIter = sequence.sixFrameTranslationIter(seq) writer = fasta.MfaWriter(sys.stdout) i = 0 for frame, p in sixFrameIter: print >> sys.stderr, 'Frame:', frame matchIter = pattern.finditer(p) match = matchIter.next()
def Initialize(): header,seq = fasta.load('MHC_hg18.fa') sixFrameIter = sequence.sixFrameTranslationIter(seq) writer = fasta.MfaWriter('6frames.fa') for frame,p in sixFrameIter: print 'Frame:', frame writer.write('%s:%i' % (header,frame),p) writer.close() sys.exit() # Initialize() header,seq = fasta.load('MHC_hg18.fa') L = len(seq) hstart = header.split()[0] pattern = re.compile('\*|X{200,}') minLen = 20 # sixFrameIter = sequence.sixFrameTranslationIter(seq) sixFrameIter = fasta.load_iter('6frames.fa') writer = fasta.MfaWriter('ORFs.fa') i = 0 for h,p in sixFrameIter: hmmerFrame = int(h.split(':')[-1]) frame = hmmer.hmmer2frame[hmmerFrame] print >> sys.stderr, 'Frame:', frame
def Initialize(): header, seq = fasta.load('MHC_hg18.fa') sixFrameIter = sequence.sixFrameTranslationIter(seq) writer = fasta.MfaWriter('6frames.fa') for frame, p in sixFrameIter: print 'Frame:', frame writer.write('%s:%i' % (header, frame), p) writer.close() sys.exit() # Initialize() header, seq = fasta.load('MHC_hg18.fa') L = len(seq) hstart = header.split()[0] pattern = re.compile('\*|X{200,}') minLen = 20 # sixFrameIter = sequence.sixFrameTranslationIter(seq) sixFrameIter = fasta.load_iter('6frames.fa') writer = fasta.MfaWriter('ORFs.fa') i = 0 for h, p in sixFrameIter: hmmerFrame = int(h.split(':')[-1]) frame = hmmer.hmmer2frame[hmmerFrame] print >> sys.stderr, 'Frame:', frame
f = open(aabrhAlignmentFN, "w") randStr = str(random.randrange(1e5)) intempAlignFN = "/tmp/tempAlign" + randStr + ".fa" outtempAlignFN = "/tmp/tempAlign" + randStr + ".afa" for orthos in aabrhL: tempf = open(intempAlignFN, "w") writeSeqBlock(tempf, orthos, seqD) tempf.close() # align the temp file os.system("muscle -in " + intempAlignFN + " -out " + outtempAlignFN) # load aligned file into a sequence dict alSeqL = fasta.load(outtempAlignFN) alSeqD = {} for hd, sq in alSeqL: alSeqD[hd[1:]] = sq # write aligned fasta block into main output file, in the # original order (muscle messes up this order). for gene in orthos: commonName, locusTag, descrip, chrom, start, end, strand = geneInfoD[ gene] f.write(">" + gene + " " + locusTag + "\n") f.write(alSeqD[gene] + "\n") f.write("\n") # delete the temp files os.system("rm " + intempAlignFN)
""" extractORFs.py Author: Tony Papenfuss Date: Wed Aug 23 08:52:58 EST 2006 """ import os, sys import re, copy import fasta, sequence, hmmer3 seqFilename = sys.argv[1] header,seq = fasta.load(seqFilename) header = header.split()[0] L = len(seq) pattern = re.compile('\*') minLen = 10 sixFrameIter = sequence.sixFrameTranslationIter(seq) writer = fasta.MfaWriter(sys.stdout) i = 0 for frame,p in sixFrameIter: print >> sys.stderr, 'Frame:', frame matchIter = pattern.finditer(p) match = matchIter.next()
Date: Wed Aug 23 08:52:58 EST 2006 """ import os, sys import re, copy import fasta, sequence pattern = re.compile('[\*|X{200,}]') minLen = 20 i = 0 writer = fasta.MfaWriter('ORFs.fa') filename = sys.argv[1] header, dna = fasta.load(filename) header = header.strip() orfIter = sequence.extractOrfsIter(dna, minLen=minLen, pattern=pattern) for i, gStart, gEnd, orf in orfIter: h = '%s.%i.%i-%i Length=%i' % (header, i, gStart, gEnd, len(orf)) writer.write(h, orf) fasta.pretty(h, orf) if gStart < gEnd: s = dna[gStart - 1:gEnd] print gStart, gEnd, len(s), len(s) % 3 == 0 print sequence.codons(s, remainder=True) print sequence.translate(s) else:
#!/usr/bin/env python """ orfTest.py Author: Tony Papenfuss Date: Tue Aug 22 20:14:57 EST 2006 """ import os, sys import fasta, sequence header,seq = fasta.load('NKC.fa') orfIterator = fasta.load_iter('ORFs.fa') writer = fasta.MfaWriter('ORFs2.fa') for h,orf in orfIterator: chrom,block,orfId,limits = h.split()[0].split('.') start,end = limits.split('-') start = int(start) end = int(end) if start>end: strand = '-' start,end = end,start s = sequence.translate(sequence.reverseComplement(seq[start-1:end])) else: strand = '+' s = sequence.translate(seq[start-1:end])
help="Complement sequence", default=False) parser.add_option("-b", "--reverseComplement", "--revComp", action="store_true", dest="reverseComplement", help="Reverse complement sequence", default=False) options, args = parser.parse_args(sys.argv) iFilename = args[1] start = int(args[2]) end = int(args[3]) header, seq = fasta.load(iFilename) s = seq[start - 1:end] h = '%s %i-%i' % (header, start, end) if options.reverse: s = sequence.reverse(s) h += '(r)' elif options.complement: s = sequence.complement(s) h += '(c)' elif options.reverseComplement: s = sequence.reverse_complement(s) h += '(rc)' fasta.pretty(h, s, width=options.width)
#!/usr/bin/python # Usage: chromByLen.py 454LargeContigs.fna chrI import fasta import sys gap = "N" * 100 contig_file, cn, outdir = sys.argv[1], sys.argv[2], sys.argv[3] # Load fasta file w/ contigs sequences. Typically, 454LargeContigs.fna contigs = dict([(x.name, x) for x in fasta.load(open(sys.argv[1]))]) # get sorted list of contigs by length lengths = [(contigs[x].length, x) for x in contigs] lengths.sort(reverse=True) cn = sys.argv[2] # output BED tracks cf = open(outdir + '/contigs.bed', 'w') gf = open(outdir + '/gaps.bed', 'w') cstart = 0 gn = 1 for length, name in lengths: print >> cf, cn, cstart, cstart+length, name, 1000, '+' cstart = cstart + length print >> gf, cn, cstart, cstart+100, 'gap%s' % gn cstart = cstart + len(gap)
""" import os, sys import re, copy import fasta, sequence pattern = re.compile('[\*|X{200,}]') minLen = 20 i = 0 writer = fasta.MfaWriter('ORFs.fa') filename = sys.argv[1] header,dna = fasta.load(filename) header = header.strip() orfIter = sequence.extractOrfsIter(dna, minLen=minLen, pattern=pattern) for i,gStart,gEnd,orf in orfIter: h = '%s.%i.%i-%i Length=%i' % (header,i,gStart,gEnd,len(orf)) writer.write(h, orf) fasta.pretty(h, orf) if gStart<gEnd: s = dna[gStart-1:gEnd] print gStart, gEnd, len(s), len(s) % 3==0 print sequence.codons(s, remainder=True) print sequence.translate(s) else:
#!/usr/bin/env python """codingscore2.py <fasta file> <ghmm cfg> <gff file> In windows of step evaluates log(P(S|q = coding) / P(S|q = noncoding)) and outputs to file """ from common.GHMM import * from common.feature import * import fasta, sys if len(sys.argv) != 4: print 'Invalid arguments' print __doc__ sys.exit(0) head,seq = fasta.load(sys.argv[1]) ghmm = GHMM(sys.argv[2]) features = Features(sys.argv[3]) sequence = sequenceDict(seq, 5) outfile = open(sys.argv[1] + '.scr', 'w') posscorecnc = [] posscore = [] for ref in features: pos = 0 for generef in features[ref]: gene = features[ref][generef] genecoords = gene.coords for (start, end) in genecoords: coding1 = max(ghmm.content['eintn'].probEmit(sequence, start, end+1, 0, '+'), ghmm.content['eintn'].probEmit(sequence, start, end+1, 1, '+'), ghmm.content['eintn'].probEmit(sequence, start, end+1, 2, '+'))
f=open(aabrhAlignmentFN,"w") randStr = str(random.randrange(1e5)) intempAlignFN="/tmp/tempAlign"+randStr+".fa" outtempAlignFN="/tmp/tempAlign"+randStr+".afa" for orthos in aabrhL: tempf = open(intempAlignFN,"w") writeSeqBlock(tempf,orthos,seqD) tempf.close() # align the temp file os.system("muscle -in "+ intempAlignFN + " -out " + outtempAlignFN) # load aligned file into a sequence dict alSeqL=fasta.load(outtempAlignFN) alSeqD={} for hd,sq in alSeqL: alSeqD[hd[1:]]=sq # write aligned fasta block into main output file, in the # original order (muscle messes up this order). for gene in orthos: commonName,locusTag,descrip,chrom,start,end,strand=geneInfoD[gene] f.write(">"+gene+" "+locusTag+"\n") f.write(alSeqD[gene]+"\n") f.write("\n") # delete the temp files os.system("rm "+intempAlignFN) os.system("rm "+outtempAlignFN)
#!/usr/bin/python # Usage: chromByLen.py 454LargeContigs.fna chrI import fasta import sys gap = "N" * 100 contig_file, cn, outdir = sys.argv[1], sys.argv[2], sys.argv[3] # Load fasta file w/ contigs sequences. Typically, 454LargeContigs.fna contigs = dict([(x.name, x) for x in fasta.load(open(sys.argv[1]))]) # get sorted list of contigs by length lengths = [(contigs[x].length, x) for x in contigs] lengths.sort(reverse=True) cn = sys.argv[2] # output BED tracks cf = open(outdir + '/contigs.bed', 'w') gf = open(outdir + '/gaps.bed', 'w') cstart = 0 gn = 1 for length, name in lengths: print >> cf, cn, cstart, cstart + length, name, 1000, '+' cstart = cstart + length print >> gf, cn, cstart, cstart + 100, 'gap%s' % gn cstart = cstart + len(gap) gn += 1
#!/usr/bin/python # chrom.py import sys import fasta assembly_file = sys.argv[1] contigs_file = sys.argv[2] contigs = {} for f in fasta.load(open(contigs_file)): contigs[f.name] = f chrom = 'chrI' print '>' + chrom for line in open(assembly_file): fields = line.strip().split() s, e = int(fields[1]), int(fields[2]) l = e - s if fields[3][0:3] == 'gap': sys.stdout.write('N' * l) else: strand = fields[5] if strand == '+': sys.stdout.write(contigs[fields[3]].seq) elif strand == '-': sys.stdout.write(contigs[fields[3]].reverse_complement().seq) elif strand == '.': sys.stdout.write(contigs[fields[3]].seq)
import sys, fasta
import hmmmodel
import hmmAlign

#### Main
# Builds an hmmmodel.HmmModel from a seed alignment file, aligns the first
# sequence of a fasta file to it with hmmAlign.HmmAlign and prints the
# value returned by subtractShuffleMean().
if __name__ == "__main__":

    if len(sys.argv) != 3:
        # Name the script in the usage line so the message shows the full
        # command, not only its two arguments.
        sys.stderr.write("""
    Usage: python %s seedAlign.fa protSeq.fa

""" % sys.argv[0])
        sys.exit(-1)

    seedAlignFN = sys.argv[1]
    protSeqFN = sys.argv[2]

    # create profile HMM based on seed alignment
    model = hmmmodel.HmmModel(seedAlignFN)

    # load the query sequence; only the first record of the file is scored
    records = fasta.load(protSeqFN)
    if not records:
        # report an empty file plainly instead of failing with IndexError
        sys.stderr.write("No sequences found in %s\n" % protSeqFN)
        sys.exit(-1)
    sseq = records[0][1]

    alignment = hmmAlign.HmmAlign(sseq, model)
    score = alignment.subtractShuffleMean()
    print(score)