def retrieve_tophit_names(hitsfile, filetype, mincols=False, evalue=False, title_split_char='', useScore=False):
    """
    Return the name(s) of the best hit(s) for every query in a hits file.

    The best hits are those with the lowest evalue. If the evalue of the first
    HSP found is None (or if useScore is True), the hits with the highest
    bitscore are taken instead. Hits that tie for best are all returned.

    Parameters:
        hitsfile, filetype: passed straight to reptools.hitsParser. The file is
            parsed twice when useScore is False, so it must be re-readable
            (presumably a path rather than an open handle -- TODO confirm).
        mincols, evalue: forwarded to reptools.hitsParser (minimum alignment
            span and maximum evalue filters, per the original docstring).
        title_split_char: separator used to split the query id; only the first
            field is used as the key. An empty string (the default) or None
            splits on whitespace.
        useScore: if True, always rank by bitscore.

    Returns:
        dict mapping the first field of each query id to a set of hit ids.
        Queries without any HSP map to an empty set.
    """
    # str.split('') raises "ValueError: empty separator", so the default value
    # could never work as a literal separator; fall back to whitespace splitting.
    separator = title_split_char if title_split_char else None

    if not useScore:
        # Check that evalues are present: look at the first result that has a hit
        # with at least one HSP, and switch to bitscore if its evalue is missing.
        # NOTE(review): the break is assumed to apply as soon as such a result is
        # found (whether or not its evalue is None) -- confirm against history.
        for result in reptools.hitsParser(hitsfile, filetype, mincols=mincols, evalue=evalue):
            if len(result.hits) > 0 and len(result.hits[0].hsps) > 0:
                if result.hits[0].hsps[0].evalue is None:
                    useScore = True
                break

    hitnames = {}
    for result in reptools.hitsParser(hitsfile, filetype, mincols=mincols, evalue=evalue):
        best_names = set()
        best_value = None
        # An explicit flag (rather than a 0/False sentinel) so that negative
        # bitscores and evalues of exactly 0 are ranked correctly.
        have_best = False
        for hit in result.hits:
            for hsp in hit.hsps:
                if useScore:
                    # ranking on bitscore: higher is better
                    value = hsp.bitscore
                    improves = (not have_best) or value > best_value
                else:
                    # ranking on evalue: lower is better
                    value = hsp.evalue
                    improves = (not have_best) or value < best_value
                if improves:
                    best_names = {hit.id}
                    best_value = value
                    have_best = True
                elif value == best_value:
                    best_names.add(hit.id)
        hitnames[result.id.split(separator)[0]] = best_names
    return hitnames
def parseHitfile(fn,aligner,title_split=' ',clusterID_position=0,mincols=False,id=False,evalue=False):
    """
    Parse a hits file and return the filtered hits of every query.

    Each result produced by reptools.hitsParser(fn, aligner) is keyed by the
    field at index clusterID_position of its id (split on title_split), and
    its hits are passed through reptools.filterHits with the given mincols,
    id and evalue settings.

    Returns:
        dict mapping query key to the filtered hits; queries whose filtered
        hit collection is empty are omitted.
    """
    filtered_by_query = {}
    for record in reptools.hitsParser(fn, aligner):
        kept_hits = reptools.filterHits(
            record.hits,
            mincols=mincols,
            id=id,
            evalue=evalue,
        )
        query_key = record.id.split(title_split)[clusterID_position]
        # A later record with the same key replaces the earlier entry.
        filtered_by_query[query_key] = kept_hits

    # Drop the queries that the filter left without any hits.
    return {
        query_key: kept_hits
        for query_key, kept_hits in filtered_by_query.items()
        if len(kept_hits) > 0
    }
def _select_extreme_hits(hits, attribute, selectionoperator):
    """
    Return the hits whose best HSP value of *attribute* equals the best value over all *hits*.

    *selectionoperator* is max or min. Raises ValueError (from max/min) if a
    hit has no HSPs, as the original inline selection did.
    """
    hits = list(hits)
    if not hits:
        return []
    # Best value within each hit, computed once rather than once per comparison.
    per_hit_best = [
        selectionoperator([getattr(hsp, attribute) for hsp in hit.hsps])
        for hit in hits
    ]
    overall_best = selectionoperator(per_hit_best)
    return [hit for hit, value in zip(hits, per_hit_best) if value == overall_best]


def get_top_hits(
        hitsfile, filetype='blast',
        title_split=None, clusterID_position=0, criteria=('bitscore', 'high'),
        mincols=False, id=False, evalue=False
):
    """
    Return the best hit or hits for each query.

    Filetype should be "stellar", "usearch", "ublast", "local", "global",
    "blast", "swipe", or "dict".

    Parameters:
        hitsfile: input handed to reptools.hitsParser, or, when filetype is
            'dict', a mapping of query title to a collection of hits.
        filetype: see above; any value other than 'dict' is passed to
            reptools.hitsParser.
        title_split, clusterID_position: the query title is split on
            title_split and the field at clusterID_position becomes the key.
        criteria: two-item sequence: the HSP attribute name to select on, and a
            string starting with "high" or "low" (case-insensitive) giving the
            direction that counts as best.
        mincols, id, evalue: forwarded to reptools.filterHits, which is applied
            to the selected top hits (i.e. after selection, not before).

    Returns:
        dict keyed by query id, containing the list of best hits that survive
        the filter. Queries left without hits are omitted.

    Raises:
        ValueError: if criteria[1] starts with neither "high" nor "low".
    """
    direction = criteria[1].lower()
    if direction.startswith('high'):
        selectionoperator = max
    elif direction.startswith('low'):
        selectionoperator = min
    else:
        raise ValueError(
            '"criteria" must be a list of len=2, with the first item giving the name of the property to select on, and '
            'the second being either "low" or "high"'
        )
    attribute = criteria[0]

    # A hit is kept if its best HSP value equals the best value of any hit of
    # the same query, so ties are all retained.
    besthits = {}
    if filetype == 'dict':
        for title in hitsfile:
            key = title.split(title_split)[clusterID_position]
            besthits[key] = _select_extreme_hits(hitsfile[title], attribute, selectionoperator)
    else:
        for result in reptools.hitsParser(hitsfile, filetype=filetype):
            key = result.id.split(title_split)[clusterID_position]
            besthits[key] = _select_extreme_hits(result.hits, attribute, selectionoperator)

    filtered = {}
    for key, hits in besthits.items():
        kept = reptools.filterHits(hits, mincols=mincols, id=id, evalue=evalue)
        if len(kept) > 0:  # remove queries left empty by filter
            filtered[key] = kept
    return filtered
def _hsp_distance_from_position(hsp, position):
    """Return 0 if *hsp* strictly spans *position* on the hit, else the distance from its nearer end."""
    start = hsp.hit_start
    end = hsp.hit_end
    # The HSP may be stored in either orientation, so both orders are tested.
    if (start < position < end) or (end < position < start):
        return 0
    return min(abs(start - position), abs(end - position))


def find_nearest_to_overlap_hits(
        hitsfile, gene, genedictionary, dict_entry,
        filetype='blast', evalue=False, title_split=None, clusterID_position=0,
        criteria=('bitscore', 'high'),
        top_hits_only=False, returnhitstrings=False,
        mincols=False,
        id=False
):
    """
    For each query, return the HSP lying closest to a reference position on its hit.

    Filetype should be "stellar", "usearch", "ublast", "local", "global",
    "blast", or "swipe".

    Parameters:
        hitsfile, filetype, evalue: passed to reptools.hitsParser.
        gene, genedictionary, dict_entry: the reference position (the
            "junction") for a hit is genedictionary[gene][dict_entry][hit.id].
        title_split, clusterID_position: the query id is split on title_split
            and the field at clusterID_position becomes the key.
        criteria: two-item sequence: the HSP attribute name used to break ties,
            and a string starting with "high" or "low" (case-insensitive).
        top_hits_only: if True, only the hits that are best by criteria are
            considered.
        returnhitstrings: if True, also return the considered hit ids joined
            with '+'. Requires top_hits_only=True.
        mincols, id: forwarded to reptools.filterHits.

    Returns:
        dict mapping query key to the selected HSP object, or, when
        returnhitstrings is True, a tuple (that dict, dict of hit-id strings).
        Queries with no hits (before or after filtering) are omitted.

    Raises:
        ValueError: if returnhitstrings is set without top_hits_only, or if
            criteria[1] starts with neither "high" nor "low".
    """
    if returnhitstrings and not top_hits_only:
        raise ValueError('returnhitstrings=True must be used with top_hits_only=True')

    direction = criteria[1].lower()
    if direction.startswith('high'):
        selectionoperator = max
    elif direction.startswith('low'):
        selectionoperator = min
    else:
        raise ValueError(
            '"criteria" must be a list of len=2, with the first item giving the name of the property to select on, and '
            'the second being either "low" or "high"'
        )
    attribute = criteria[0]

    def hsp_value(hsp):
        return getattr(hsp, attribute)

    besthits = {}
    hitstrings = {}
    for result in reptools.hitsParser(hitsfile, filetype=filetype, evalue=evalue):
        if len(result.hits) == 0:
            continue
        hits = result.hits

        if top_hits_only:
            # Keep only the hits whose best HSP value equals the best value of
            # any hit; the per-hit values are computed once.
            per_hit_best = [selectionoperator([hsp_value(hsp) for hsp in h.hsps]) for h in hits]
            overall_best = selectionoperator(per_hit_best)
            hits = [h for h, value in zip(hits, per_hit_best) if value == overall_best]

        # NOTE(review): the filter is assumed to apply whether or not
        # top_hits_only is set -- confirm against history.
        hits = reptools.filterHits(hits, mincols=mincols, id=id)  # filter hits by mincols and id
        if len(hits) == 0:
            continue

        title = result.id.split(title_split)[clusterID_position]
        if returnhitstrings:
            hitstrings[title] = '+'.join([h.id for h in hits])

        # From each hit, select the HSP closest to the junction; among equally
        # close HSPs the best by criteria wins (first one on a tie).
        closesthsp = {}
        for hit in hits:
            pos_H = genedictionary[gene][dict_entry][hit.id]
            distances = [_hsp_distance_from_position(hsp, pos_H) for hsp in hit.hsps]
            closest_dist = min(distances)
            candidates = [hsp for hsp, d in zip(hit.hsps, distances) if d == closest_dist]
            closesthsp[hit.id] = (selectionoperator(candidates, key=hsp_value), closest_dist)

        # Now select the best overall: closest by distance first, then the best
        # of the closest by the selection criteria.
        overall_closest = min(d for _, d in closesthsp.values())
        closest_hsps = [hsp for hsp, d in closesthsp.values() if d == overall_closest]
        besthits[title] = selectionoperator(closest_hsps, key=hsp_value)

    if returnhitstrings:
        return (besthits, hitstrings)
    return besthits