def full_transposon_treatment(seq, overlap, gap, minlength, fastaout,
                              evalue=None, fname=None):
    '''Top-level driver: turn a set of BLAST hits into multiple-alignment
    input and write it to *fastaout* in FASTA format.

    *seq* is a sequence of hits, assumed to be the complete result of one
    BLAST search of a single transposon against a single fly genome.
    Alternatively pass None for *seq* and supply *fname*, which is then
    handed to hitsfromcsv() to load the hits.

    Arguments:
      seq       -- the hits, or None to load them from *fname*.
      overlap   -- forwarded to classifyrecords() and
                   resolve_query_overlap().
      gap       -- forwarded to makeislands().
      minlength -- forwarded to stratify().
      fastaout  -- writeable fasta object (see module *fasta*); only its
                   writeentries() method is used here.  It is not closed
                   by this function -- that is the caller's job.
      evalue    -- accepted but not used by this function.
      fname     -- file object or filename for hitsfromcsv(); only
                   consulted when *seq* is None (it also appears in the
                   error message below).

    Raises Error if both *seq* and *fname* are given, or if an island
    yields neither standalone hits nor nests.
    '''
    both_given = None not in (seq, fname)
    if both_given:
        raise Error("Cannot give both seq and fname arguments")
    if seq is None:
        seq = hitsfromcsv(fname)

    # Hits are handled one subject sequence at a time.
    by_subject = utils.groupby(seq, key=_attrget('SSEQID'))
    for _sseqid, subject_hits in by_subject.iteritems():
        for island in makeislands(subject_hits, gap):
            singles, nests = classifyrecords(island, overlap)
            nests = [stratify(nest, minlength) for nest in nests]
            if not (singles or any(nests)):
                raise Error('No records result from file {!r}'.format(fname))
            entries = resolve_query_overlap(singles, nests, overlap)
            fastaout.writeentries(entries)
def stratify(nest, minlength):
    '''Resolve subject-ordinate overlap inside a nest.

    A nest is a list of hits in which each adjacent pair overlaps
    nontrivially in the subject ordinates.  The intended process is:

      - Yield x, the hit with the lowest EVALUE (largest length breaks
        ties).
      - Truncate (or, if necessary, split) every other hit y so that it
        no longer overlaps x.  A y that becomes trivial is removed from
        the nest entirely.
      - Repeat until the nest has been exhausted.

    This function only binds the subject-specific pieces; the work itself
    is done by the generic helper _stratify(), whose return value is
    passed straight back.  Ranking is delegated to hit_rank.  The filter
    passed along accepts a hit only when its LENGTH is strictly greater
    than *minlength*; hits failing it are presumably the "trivial" ones
    that _stratify() discards.
    '''
    def long_enough(hit):
        return hit.LENGTH > minlength

    subject_spec = dict(
        rank=hit_rank,
        filterfunc=long_enough,
        sget=_attrget('_SSTART'),
        eget=_attrget('_SEND'),
        sset=set__SSTART,
        eset=set__SEND,
    )
    return _stratify(nest, **subject_spec)
def resolve_query_overlap(standalones, nests, overlap):
    '''Turn standalone fragments and nests into a list of fasta entries.

    *standalones* is an iterable of standalone fragments and *nests* an
    iterable of nests (each an iterable of hits).  Nests are expected to
    have undergone the subject overlap truncation scheme already (see
    *stratify*).  Both arguments are copied into lists, so generators are
    accepted.

    Every hit is first given a name through setname(): 'standalone[j]'
    for the j-th standalone and 'nest<i>[j]' for the j-th member of the
    i-th nest (nests are numbered from 1).  All hits are then sorted by
    QSTART.

    Return value (always a list):

      - If any two hits that are adjacent in that order overlap in the
        query ordinates by at least *overlap* (as measured by
        q_overlap), or if there is only one hit, each hit becomes its own
        entry via make_entry().
      - Otherwise the hits are "assembled" (non-technical term) in order
        of query ordinates into a single fasta entry, which is the only
        element of the returned list.  Stretches of the query between
        consecutive hits are padded with '-'.

    Raises Error when there are no hits at all.
    '''
    standalones = list(standalones)
    nests = [list(nest) for nest in nests]
    if not (standalones or any(nests)):
        raise Error('Tried to resolve query overlap on an empty set of records!')

    # Names encode where a hit came from, so they must be assigned before
    # sorting interleaves standalones with nest members.
    for j, hit in enumerate(standalones):
        setname(hit, 'standalone[{}]'.format(j))
    for i, nest in enumerate(nests, 1):
        for j, hit in enumerate(nest):
            setname(hit, 'nest{}[{}]'.format(i, j))

    # Reorder by query ordinate.
    recs = sorted(_it.chain(standalones, *nests), key=_attrget('QSTART'))

    clash = any(q_overlap(x, y) >= overlap for x, y in zip(recs, recs[1:]))
    if clash or len(recs) == 1:
        return [make_entry(hit) for hit in recs]

    # Assemble one sequence.  Each hit is preceded by enough '-' to cover
    # the query positions between the end of the previous hit and its own
    # start; a non-positive count simply yields no padding.
    pieces = []
    prev_qend = 0
    for hit in recs:
        pieces.append('-' * (hit.QSTART - 1 - prev_qend))
        pieces.append(hit.SSEQ)
        prev_qend = hit.QEND

    name = _name_fmt.format(_GRP='all',
                            SSEQID=recs[0].SSEQID,
                            SSTART=min(h._SSTART for h in recs),
                            SEND=max(h._SEND for h in recs))
    return [fasta.seq_entry({'SEQ': ''.join(pieces), 'NAME': name})]
def s_overlap(x, y):
    '''Overlap, in base pairs, between the subject ordinates of x and y.

    The subject range of a hit is read from its _SSTART and _SEND
    attributes; the arithmetic is delegated to _overlap().  The result is
    zero if and only if the two ranges are disjoint.
    '''
    subject_start = _attrget('_SSTART')
    subject_end = _attrget('_SEND')
    return _overlap(x, y, subject_start, subject_end)
def q_overlap(x, y):
    '''Overlap, in base pairs, between the query ordinates of x and y.

    The query range of a hit is read from its QSTART and QEND
    attributes; the arithmetic is delegated to _overlap().  The result is
    zero if and only if the two ranges are disjoint.
    '''
    query_start = _attrget('QSTART')
    query_end = _attrget('QEND')
    return _overlap(x, y, query_start, query_end)
def s_distance(x, y):
    '''Measures distance between the subject ordinates of x and y.

    The subject range of a hit is read from its _SSTART and _SEND
    attributes -- the same attributes s_overlap() uses -- and the
    arithmetic is delegated to _dist().  Returns 1 if the ranges are
    adjacent, 0 if they overlap.

    NOTE(review): this function previously read QSTART/QEND, i.e. the
    *query* ordinates, which contradicted both its name and s_overlap().
    Confirm that no caller relied on the query-based result.
    '''
    return _dist(x, y, _attrget('_SSTART'), _attrget('_SEND'))