def saveNexus0(self, oFileHandle, datatype='protein', gap='-', interleave=False, width=60):
    """Write the alignment as an old-style NEXUS file (single data block).

    @param oFileHandle: Output filename or file.
    @param datatype: NEXUS datatype (dna, rna, protein, standard, restriction).
    @param gap: Gap character declared in the FORMAT line.
    @param interleave: Write sequences in interleaved blocks (default False).
    @param width: Block width used when interleave is True (default 60).
    """
    assert datatype.lower() in ['dna', 'rna', 'protein', 'standard', 'restriction']
    yesno = {True: 'yes', False: 'no'}
    oFile = smartopen(oFileHandle, 'w')
    print >> oFile, '#NEXUS'
    print >> oFile, 'BEGIN data;'
    print >> oFile, 'DIMENSIONS ntax=%i nchar=%i;' % (self.numberOfSeqs(), self.__len__())
    print >> oFile, 'FORMAT datatype=%s interleave=%s gap=%s;' \
        % (datatype, yesno[interleave], gap)
    print >> oFile
    print >> oFile, 'MATRIX'
    # Pad names to the longest name so the matrix columns line up.
    format = '%%-%is %%s' % max([len(name) for name in self.seqDict])
    if interleave:
        for i in xrange(0, self.__len__(), width):
            for name in self.order:
                print >> oFile, format % (name, self.seqDict[name][i:i+width])
            print >> oFile
    else:
        for name in self.order:
            print >> oFile, format % (name, self.seqDict[name])
        print >> oFile
    print >> oFile, ';'
    print >> oFile, 'END;'
    oFile.close()
def loadPhylip(iFileHandle, multipleDatasets=False):
    """
    Load phylip alignment data.

    @param iFileHandle: Input filename or file.
    @param multipleDatasets: Do not close file (default False)
    @return: Alignment populated from the file.
    """
    iFile = smartopen(iFileHandle)
    # Header line: number of sequences and total alignment length.
    nSeq, L = [int(x) for x in iFile.readline().strip().split()]
    alignment = Alignment()
    # First block: each line carries a name followed by sequence chunks.
    for i in xrange(nSeq):
        line = iFile.readline()
        tokens = line.strip().split()
        seq = ''.join(tokens[1:])
        alignment.append(tokens[0], seq)
    skip = iFile.readline()  # blank separator line between blocks
    # Width of the first block determines how many more blocks follow.
    width = len(alignment)
    nBlocks = int(math.ceil(float(L)/width))
    for j in xrange(nBlocks-1):
        for i in xrange(nSeq):
            line = iFile.readline().strip()
            name = alignment.getName(i)
            alignment.append(name, line, cleanup=True)
        # FIX: the bare try/except around this readline was pointless --
        # readline() returns '' at EOF rather than raising.
        skip = iFile.readline()
    if not multipleDatasets:
        iFile.close()
    return alignment
def load(iFilename, offset=0):
    """Load a BED file.

    @param iFilename: Input filename or file.
    @param offset: Offset subtracted from positions (Default: 0).
    @return: List of features.
    """
    iFile = smartopen(iFilename)
    data = []
    for line in iFile:
        line = line.strip()
        # Skip blank lines and comment lines.
        if line and line[0] != "#":
            tokens = line.split("\t")
            f = Feature(tokens)
            try:
                f.chromStart -= offset
                f.chromEnd -= offset
                # thickStart/thickEnd are optional BED columns; this is a
                # deliberate best-effort shift -- NOTE(review): the bare
                # except also hides errors from the chromStart/chromEnd
                # lines above; confirm that is intended.
                f.thickStart -= offset
                f.thickEnd -= offset
            except:
                pass
            data.append(f)
    return data
def saveMolphy(self, oFileHandle, width=60):
    """Write the alignment in Molphy format.

    @param oFileHandle: Output filename or file.
    @param width: Number of residues per sequence line (default 60).
    """
    out = smartopen(oFileHandle, 'w')
    nChar = self.__len__()
    print >> out, '%i %i' % (self.numberOfSeqs(), nChar)
    for seqName in self.order:
        print >> out, seqName
        residues = self.seqDict[seqName]
        start = 0
        while start < nChar:
            print >> out, residues[start:start+width]
            start += width
    out.close()
def load2(iFileHandle):
    """Load genes from a file.

    @param iFileHandle: Input filename or file.
    @return: List of Gene objects, each with its exons attached.
    """
    iFile = smartopen(iFileHandle)
    genes = []
    for line in iFile:
        tokens = line.strip().split()
        if not line or line[0] == '#':
            continue
        elif line[0] in ['<', '>']:
            # A strand-marker line starts a new gene.
            genes.append(Gene.fromTokens(tokens))
        else:
            # Any other line is an exon of the most recent gene.
            genes[-1].add(Exon.fromTokens(tokens))
    # BUG FIX: the list was previously built but never returned, so the
    # function always returned None.
    return genes
def __init__(self, fileHandle, mode='w', width=60, blockSize=None, **kw):
    """
    @param fileHandle: Output file or name
    @keyword mode: File mode - write(w) or append(a)
    """
    assert mode in ('w', 'a')
    self.iFile = smartopen(fileHandle, mode)
    self.iFilename = self.iFile.name
    # NOTE(review): width and blockSize are accepted but not stored here;
    # presumably consumed by subclasses or later writes -- confirm.
    if kw:
        print 'Uncaptured keywords'
        print kw
def load(iFileHandle):
    """Load WIG file.

    @param iFileHandle: Input file or filename
    @return: (header, data)
    """
    iFile = smartopen(iFileHandle)
    header = iFile.readline()
    data = []
    for line in iFile:
        line = line.strip()
        # FIX: skip blank lines (e.g. a trailing newline) instead of
        # crashing on float('').
        if not line:
            continue
        data.append(float(line))
    # FIX: close the file like the other loaders in this module do.
    iFile.close()
    return header, data
def load_iter(iFileHandle, skip=5, splitOn=None):
    """Load TRANSFAC match output.

    Arguments:
    iFileHandle -- Input file or filename.
    skip -- Number of header lines to discard (default 5).
    splitOn -- Delimiter passed to str.split (default None = whitespace).
    """
    matchFile = smartopen(iFileHandle)
    # Discard the fixed-size header before the data rows.
    nSkipped = 0
    while nSkipped < skip:
        matchFile.next()
        nSkipped += 1
    for row in matchFile:
        yield Match(row.strip().split(splitOn))
def load_iter(iFileHandle):
    """Iterate over genes in a file, yielding one Gene at a time.

    @param iFileHandle: Input filename or file.
    """
    iFile = smartopen(iFileHandle)
    gene = None
    for line in iFile:
        tokens = line.strip().split()
        if not line or line[0] == '#':
            continue
        elif line[0] in ['<', '>']:
            # A strand-marker line starts a new gene; emit the previous one.
            if gene is not None:
                yield gene
            gene = Gene.fromTokens(tokens)
        else:
            gene.add(Exon.fromTokens(tokens))
    # BUG FIX: the trailing yield previously raised NameError when the
    # input contained no gene lines at all; now it is emitted only if a
    # gene was actually parsed.
    if gene is not None:
        yield gene
def savePhylip(self, oFileHandle, width=10, **kw):
    """Write the alignment in interleaved PHYLIP format.

    @param oFileHandle: Output filename or file.
    @param width: Width of the name column; names are truncated (default 10).
    """
    oFile = smartopen(oFileHandle, 'w')
    print >> oFile, '%7i%7i' % (self.numberOfSeqs(), self.__len__())
    L = self.__len__()
    # FIX: removed the unused local 'format' that was computed here but
    # never referenced.
    # 50 residues per line, written as five 10-residue words.
    for i in xrange(0, L, 50):
        for name in self.order:
            # Only the first block carries the (truncated) sequence names.
            if i == 0:
                label = name[0:width]
            else:
                label = ''
            print >> oFile, ('%%-%is' % width) % label,
            for j in xrange(0, 50, 10):
                print >> oFile, self.seqDict[name][i+j:i+j+10],
            print >> oFile
        print >> oFile
    oFile.close()
def __init__(self, iFileHandle, force=False, section=None, **kw):
    """Constructor

    @param iFileHandle: CAP3 file name or object
    @param force: Force a rebuild of the index file (default False).
    @param section: Which half of the file to process: 1 stops at the
        middle; 2 seeks past it before iterating.
    """
    self.iFile = smartopen(iFileHandle)
    self.iFilename = self.iFile.name
    self.indexFile = CapIndexFile(self.iFilename)
    self.indexFile.build(force=force)
    self.stopAtMiddle = False
    if section == 1:
        self.stopAtMiddle = True
    elif section == 2:
        # NOTE(review): seek(0, 2) positions at end-of-file; if section 2
        # is meant to start at the middle marker, an index offset may have
        # been intended -- confirm against this class's seek().
        self.seek(0, 2)
    self._iter = None
    self._initIter = True
    self._section = section
def load_iter(iFileHandle, format='psl', **kw):
    """Return an iterator to the BLAT file.

    @param iFileHandle: Input filename or file.
    @param format: BLAT output format (optional; default: 'psl')
    @raise ValueError: If an unsupported format is requested.
    """
    if not format in ['psl']:
        # BUG FIX: raising a plain string is invalid (string exceptions
        # were removed from Python); raise a proper exception type.
        raise ValueError('Only psl is currently supported.')
    iFile = smartopen(iFileHandle)
    skip = kw.pop('skip', 5)
    # Skip the fixed psl header lines.
    for i in xrange(skip):
        junk = iFile.readline()
    for line in iFile:
        if line:
            tokens = line.strip().split('\t')
            yield Chain(tokens, **kw)
def saveNexus(self, oFileHandle, datatype='protein', gap='-', interleave=False, width=60):
    """Write the alignment as a NEXUS file with Taxa and Characters blocks.

    @param oFileHandle: Output filename or file.
    @param datatype: NEXUS datatype (dna, rna, protein, standard, restriction).
    @param gap: Gap character declared in the FORMAT section.
    @param interleave: Write sequences in interleaved blocks (default False).
    @param width: Block width used when interleave is True (default 60).
    """
    assert datatype.lower() in ['dna', 'rna', 'protein', 'standard', 'restriction']
    yesno = {True: 'yes', False: 'no'}
    oFile = smartopen(oFileHandle, 'w')
    print >> oFile, '#nexus'
    print >> oFile
    print >> oFile, 'BEGIN Taxa;'
    print >> oFile, 'DIMENSIONS ntax=%i;' % self.numberOfSeqs()
    print >> oFile, 'TAXLABELS'
    for i, name in enumerate(self.order):
        print >> oFile, "[%i] '%s'" % (i+1, name)
    print >> oFile, ';'
    print >> oFile, 'END; [Taxa]'
    print >> oFile
    print >> oFile, 'BEGIN Characters;'
    print >> oFile, 'DIMENSIONS nchar=%i;' % self.__len__()
    print >> oFile, 'FORMAT'
    print >> oFile, ' datatype=%s' % datatype
    print >> oFile, ' missing=?'
    print >> oFile, ' gap=%s' % gap
    print >> oFile, ' symbols="a r n d c q e g h i l k m f p s t w y v z"'
    print >> oFile, ' labels=left'
    print >> oFile, ' transpose=no'
    print >> oFile, ' interleave=%s' % yesno[interleave]
    print >> oFile, ';'
    print >> oFile, 'MATRIX'
    # Pad names to the longest name so the matrix columns line up.
    format = "%%-%is %%s" % max([len(name) for name in self.seqDict])
    if interleave:
        for i in xrange(0, self.__len__(), width):
            for name in self.order:
                print >> oFile, format % (name, self.seqDict[name][i:i+width])
            print >> oFile
    else:
        for name in self.order:
            print >> oFile, format % (name, self.seqDict[name])
        print >> oFile
    print >> oFile, ';'
    print >> oFile, 'END;'
    oFile.close()
def loadClustal(iFileHandle, headerCheck=True):
    """
    Load clustal alignment data.

    @param iFileHandle: Input filename or file.
    @param headerCheck: Test that CLUSTAL appears on first line (default True)
    @raise ValueError: If headerCheck is set and the first line is not a
        recognized CLUSTAL/MUSCLE header.
    """
    iFile = smartopen(iFileHandle)
    clustalHeader = iFile.readline().strip()
    fields = clustalHeader.split()
    # BUG FIX: raising a plain string is invalid in modern Python; also
    # guard against an empty first line, which previously raised
    # IndexError instead of the intended error.
    if headerCheck and (not fields or fields[0] not in ['CLUSTAL', 'MUSCLE']):
        raise ValueError('Not a CLUSTAL file')
    alignment = Alignment()
    for line in iFile:
        # Sequence lines start in column 0; conservation lines are indented.
        if line[0] != ' ':
            tokens = line.strip().split()
            if len(tokens) == 2:
                alignment.append(tokens[0], tokens[1])
    iFile.close()
    return alignment
def saveClustal(self, oFileHandle, nameWidth=None, width=60, interleaved=True, **kw):
    """Write the alignment in CLUSTAL W format.

    @param oFileHandle: Output filename or file.
    @param nameWidth: Width of the name column; defaults to the longest name.
    @param width: Residues per line when interleaved (default 60).
    @param interleaved: Write interleaved blocks (default True).
    """
    oFile = smartopen(oFileHandle, 'w')
    print >> oFile, 'CLUSTAL W (1.83) multiple sequence alignment\n\n'
    if not nameWidth:
        nameWidth = max([len(name) for name in self.order])
    format = '%%-%is %%s' % nameWidth
    L = self.__len__()
    if interleaved:
        for i in xrange(0, L, width):
            for name in self.order:
                print >> oFile, format % (name[0:nameWidth], self.seqDict[name][i:i+width])
            # Blank spacer row written in place of the conservation line.
            print >> oFile, format % (' '*nameWidth, ' '*width)
            print >> oFile
    else:
        for name in self.order:
            print >> oFile, format % (name[0:nameWidth], self.seqDict[name])
            print >> oFile, format % (' '*nameWidth, ' '*len(self.seqDict[name]))
            print >> oFile
    oFile.close()
def __init__(self, iFileHandle, clobber=False, interface=Interface.CONTAINER, method=IndexMethod.SQLITE, **kw):
    """
    @param iFileHandle: Fasta file name or object
    @param clobber: Force a rebuild of the index (default False).
    @param interface: Access interface (default Interface.CONTAINER).
    @param method: Index storage backend (default IndexMethod.SQLITE).
    """
    self.iFile = smartopen(iFileHandle)
    self.iFilename = self.iFile.name
    # Choose the index backend; sqlite3 is the default.
    if method == IndexMethod.PICKLE:
        self.indexFile = FastaIndexPickleFile(self.iFilename)
    elif method == IndexMethod.TEXT:
        self.indexFile = FastaIndexTextFile(self.iFilename)
    else:
        # sqlite3 method is the default
        self.indexFile = FastaIndexFile(self.iFilename)
    self.indexFile.build(clobber=clobber)
    self.interface = interface
    self._iter = None
    self._initIter = True
    if kw:
        print 'Uncaptured keywords'
        print kw
def loadStockholm(iFileHandle, **kw):
    """Load Stockholm alignment data.

    @param iFileHandle: Input filename or file.
    @returns: an Alignment populated from the file; #=GS lines are stored
        in alignment.headers.
    """
    iFile = smartopen(iFileHandle)
    alignment = Alignment()
    alignment.headers = {}
    # Skip hmmalign header
    start = False
    for line in iFile:
        line = line.strip()
        if line == '# STOCKHOLM 1.0':
            break
    # Parse sto header info
    for line in iFile:
        line = line.strip()
        if not line:
            # Blank lines
            continue
        elif line[0:4] == '#=GS':
            # Fasta headers
            header = line.strip()[1:]
            tokens = header.split()
            # NOTE(review): tokens[0] here is '=GS', so every header lands
            # on the same key; tokens[1] (the sequence name) may have been
            # intended -- confirm.
            alignment.headers[tokens[0]] = header
            continue
        elif line[0] == '#':
            # Other boring comment lines
            continue
        elif line == '//':
            # End of file
            break
        # The real stuff
        name, seq = line.split()
        alignment.append(name, seq)
    return alignment
def load_preprocessed(iFileHandle):
    """Load genscan predictions when predictions have been preprocessed
    and only contain the gene prediction lines

    Arguments:
    iFileHandle -- Input file or filename.

    Return values:
    data -- Annotation data (a list of lists, each list in one gene)
    """
    inFile = smartopen(iFileHandle)
    byGene = {}
    sliceMarker = 'Slice no. '
    for rawLine in inFile:
        stripped = rawLine.strip()
        # Ignore blank lines and slice-marker lines.
        if not stripped or sliceMarker in stripped:
            continue
        prediction = Predicted(stripped.split())
        # gene_exon looks like "<gene>.<exon>"; group predictions by gene.
        geneId = int(prediction.gene_exon.split('.')[0])
        byGene.setdefault(geneId, []).append(prediction)
    pairs = byGene.items()
    pairs.sort()
    return [exons for (geneId, exons) in pairs]
def loadStockholm(iFileHandle, **kw):
    """Load Stockholm alignment data.

    @param iFileHandle: Input filename or file.
    @returns: an Alignment populated from the file; #=GS lines are stored
        in alignment.headers.
    """
    iFile = smartopen(iFileHandle)
    alignment = Alignment()
    alignment.headers = {}
    # Skip hmmalign header
    start = False
    for line in iFile:
        line = line.strip()
        if line=='# STOCKHOLM 1.0':
            break
    # Parse sto header info
    for line in iFile:
        line = line.strip()
        if not line:
            # Blank lines
            continue
        elif line[0:4]=='#=GS':
            # Fasta headers
            header = line.strip()[1:]
            tokens = header.split()
            # NOTE(review): tokens[0] here is '=GS', so every header lands
            # on the same key; tokens[1] (the sequence name) may have been
            # intended -- confirm.
            alignment.headers[tokens[0]] = header
            continue
        elif line[0]=='#':
            # Other boring comment lines
            continue
        elif line=='//':
            # End of file
            break
        # The real stuff
        name,seq = line.split()
        alignment.append(name, seq)
    return alignment
def __init__(self, iFileHandle1, iFileHandle2):
    """Open a pair of input files.

    @param iFileHandle1: First input filename or file.
    @param iFileHandle2: Second input filename or file.
    """
    self.iFile1 = smartopen(iFileHandle1)
    self.iFile2 = smartopen(iFileHandle2)
def load_full(iFileHandle):
    """Load genscan predictions.

    Arguments:
    iFileHandle -- Input file or filename.

    Return values:
    data -- Annotation data (a list of lists, each list in one gene)
    proteins -- Predicted proteins (a list of tuples (header, sequence))
    meta -- Meta-data in first 8 lines of genscan output
    """
    iFile = smartopen(iFileHandle)
    data = {}
    proteins = []
    meta = []
    # Sentinel lines that switch the parser state.
    startPredState = '----- ---- - ------ ------ ---- -- -- ---- ---- ----- ----- ------'
    endPredState = 'Predicted peptide sequence(s):'
    skipState = 'Slice no. '
    metaState = 'GENSCAN 1.0'
    state = None
    for line in iFile:
        line = line.strip()
        if metaState in line:
            state = 'meta'
        if line == startPredState:
            state = 'pred'
        elif line == 'NO EXONS/GENES PREDICTED IN SEQUENCE':
            state = 'fail'
        elif line == endPredState:
            state = 'prot'
        elif skipState in line:
            state = 'skip'
        else:
            if state == 'meta':
                if line:
                    meta.append(line)
            elif state == 'pred':
                if line:
                    tokens = line.split()
                    d = Predicted(tokens)
                    # gene_exon looks like "<gene>.<exon>"; group by gene.
                    gene = int(d.gene_exon.split('.')[0])
                    try:
                        data[gene].append(d)
                    except KeyError:
                        data[gene] = [d]
            elif state == 'prot':
                break
            elif state == 'fail':
                return [], [], ''
    # The peptide section is parsed as multi-fasta from the current offset.
    if state == 'prot':
        proteins = fasta.load_mfa(iFile)
    # Flatten {gene: [predictions]} into a gene-ordered list of lists.
    data = data.items()
    data.sort()
    data = [x[1] for x in data]
    return data, proteins, meta
def load(iFileHandle):
    """Load a blastz/lastz lav alignment file.

    @param iFileHandle: Input filename or file.
    @return: Alignment with matrix and chains populated.
    """
    iFile = smartopen(iFileHandle)
    aln = Alignment()
    state = None
    for line in iFile:
        line = line.rstrip()
        if not line:
            continue
        elif line == '#:lav':
            # Section break
            state = 'BLOCK'
            block = Block()
            continue
        elif line == '#:eof':
            # End of file
            state = 'EOF'
            break
        elif line[0] == 'd':
            # Substitution matrix stanza
            state = 'MATRIX'
            continue
        elif line[0] == 's':
            # Sequence files stanza
            state = 'FILES'
            continue
        elif line[0] == 'h':
            # Fasta headers stanza
            state = 'HEADERS'
            continue
        elif line[0] == 'a':
            # Alignment stanza
            state = 'ALIGN'
            chain = Chain.fromBlock(copy.copy(block))
            continue
        elif line[0] in ['x', 'm']:
            state = 'BORING'
            continue
        elif state == 'MATRIX' and line[0] == '}':
            state = 'MATRIX_END'
            aln.matrix = '\n'.join(aln.matrix)
            continue
        elif state == 'ALIGN' and line[0] == '}':
            state = 'ALIGN_END'
            aln.chains.append(chain)
            chain = None
            continue
        elif line[0] == '}':
            # End of state
            continue
        # Interior stanza line: dispatch on the current state.
        tokens = line.lstrip().split()
        if state == None:
            print line
            raise Exception('Wrong')
        elif state == 'MATRIX':
            aln.matrix.append(line)
        elif state == 'FILES':
            block.filenames.append(tokens[0])
            block.lengths.append(int(tokens[2]))
            block.strands.append(strandDict[tokens[3]])
            # Next line in stanza
            tokens = iFile.next().strip().split()
            block.filenames.append(tokens[0])
            block.lengths.append(int(tokens[2]))
            block.strands.append(strandDict[tokens[3]])
        elif state == 'HEADERS':
            block.headers.append(tokens[0])
            # Next line in stanza
            tokens = iFile.next().strip().split()
            block.headers.append(tokens[0])
        elif state == 'ALIGN':
            # s=score, b=begin coords, e=end coords, l=HSP line.
            if tokens[0] == 's':
                chain.score = int(tokens[1])
            elif tokens[0] == 'b':
                chain.interval1[0] = int(tokens[1])
                chain.interval2[0] = int(tokens[2])
            elif tokens[0] == 'e':
                chain.interval1[1] = int(tokens[1])
                chain.interval2[1] = int(tokens[2])
            elif tokens[0] == 'l':
                chain.hsps.append(HSP(tokens[1:]))
    return aln
def load(iFileHandle):
    """Load a blastz/lastz lav alignment file.

    @param iFileHandle: Input filename or file.
    @return: Alignment with matrix and chains populated.
    """
    iFile = smartopen(iFileHandle)
    aln = Alignment()
    state = None
    for line in iFile:
        line = line.rstrip()
        if not line:
            continue
        elif line=='#:lav':
            # Section break
            state = 'BLOCK'
            block = Block()
            continue
        elif line=='#:eof':
            # End of file
            state = 'EOF'
            break
        elif line[0]=='d':
            # Substitution matrix stanza
            state = 'MATRIX'
            continue
        elif line[0]=='s':
            # Sequence files stanza
            state = 'FILES'
            continue
        elif line[0]=='h':
            # Fasta headers stanza
            state = 'HEADERS'
            continue
        elif line[0]=='a':
            # Alignment stanza
            state = 'ALIGN'
            chain = Chain.fromBlock(copy.copy(block))
            continue
        elif line[0] in ['x', 'm']:
            state = 'BORING'
            continue
        elif state=='MATRIX' and line[0]=='}':
            state = 'MATRIX_END'
            aln.matrix = '\n'.join(aln.matrix)
            continue
        elif state=='ALIGN' and line[0]=='}':
            state = 'ALIGN_END'
            aln.chains.append(chain)
            chain = None
            continue
        elif line[0]=='}':
            # End of state
            continue
        # Interior stanza line: dispatch on the current state.
        tokens = line.lstrip().split()
        if state==None:
            print line
            raise Exception('Wrong')
        elif state=='MATRIX':
            aln.matrix.append(line)
        elif state=='FILES':
            block.filenames.append(tokens[0])
            block.lengths.append(int(tokens[2]))
            block.strands.append(strandDict[tokens[3]])
            # Next line in stanza
            tokens = iFile.next().strip().split()
            block.filenames.append(tokens[0])
            block.lengths.append(int(tokens[2]))
            block.strands.append(strandDict[tokens[3]])
        elif state=='HEADERS':
            block.headers.append(tokens[0])
            # Next line in stanza
            tokens = iFile.next().strip().split()
            block.headers.append(tokens[0])
        elif state=='ALIGN':
            # s=score, b=begin coords, e=end coords, l=HSP line.
            if tokens[0]=='s':
                chain.score = int(tokens[1])
            elif tokens[0]=='b':
                chain.interval1[0] = int(tokens[1])
                chain.interval2[0] = int(tokens[2])
            elif tokens[0]=='e':
                chain.interval1[1] = int(tokens[1])
                chain.interval2[1] = int(tokens[2])
            elif tokens[0]=='l':
                chain.hsps.append(HSP(tokens[1:]))
    return aln
('-p', 'Predicted peptides only'), ('-a', email), ] files = [('-u', '', ''), ('-v', '', '')] try: html = multipart.post(genomeScanURL, fields, files, proxy, proxyPort) except multipart.FormSubmissionException: print >> sys.stderr, "*** %s submission failed. Retry later" % description return except Exception, e: print e sys.exit('Argh!') if oFileHandle: oFile = smartopen(oFileHandle, 'w') print >> oFile, html oFile.close() return html def extractSeq(feature, blastDb, dx, dy): """Extract the translated sequence of a feature and DNA sequence of the surrounding the genomic region. @param feature: Feature object. Mandatory attributes: accession, sStart, sEnd. @param blastDb: Blast database. @param dx: Length of sequence to extract upstream. @param dy: Length of sequence to extract downstream. @returns: tuple of fasta strings (DNA, protein).
def __init__(self, iFileHandle):
    """
    @param iFileHandle: Input filename or file.
    """
    self.iFile = smartopen(iFileHandle)
    self.iFilename = self.iFile.name
    self._iter = None
def load_full(iFileHandle):
    """Load genscan predictions.

    Arguments:
    iFileHandle -- Input file or filename.

    Return values:
    data -- Annotation data (a list of lists, each list in one gene)
    proteins -- Predicted proteins (a list of tuples (header, sequence))
    meta -- Meta-data in first 8 lines of genscan output
    """
    iFile = smartopen(iFileHandle)
    data = {}
    proteins = []
    meta = []
    # Sentinel lines that switch the parser state.
    startPredState = '----- ---- - ------ ------ ---- -- -- ---- ---- ----- ----- ------'
    endPredState = 'Predicted peptide sequence(s):'
    skipState = 'Slice no. '
    metaState = 'GENSCAN 1.0'
    state = None
    for line in iFile:
        line = line.strip()
        if metaState in line:
            state = 'meta'
        if line==startPredState:
            state = 'pred'
        elif line=='NO EXONS/GENES PREDICTED IN SEQUENCE':
            state = 'fail'
        elif line==endPredState:
            state = 'prot'
        elif skipState in line:
            state = 'skip'
        else:
            if state=='meta':
                if line:
                    meta.append(line)
            elif state=='pred':
                if line:
                    tokens = line.split()
                    d = Predicted(tokens)
                    # gene_exon looks like "<gene>.<exon>"; group by gene.
                    gene = int(d.gene_exon.split('.')[0])
                    try:
                        data[gene].append(d)
                    except KeyError:
                        data[gene] = [d]
            elif state=='prot':
                break
            elif state=='fail':
                return [], [], ''
    # The peptide section is parsed as multi-fasta from the current offset.
    if state=='prot':
        proteins = fasta.load_mfa(iFile)
    # Flatten {gene: [predictions]} into a gene-ordered list of lists.
    data = data.items()
    data.sort()
    data = [x[1] for x in data]
    return data, proteins, meta