def retrieve_tophit_names(hitsfile,
                          filetype,
                          mincols=False,
                          evalue=False,
                          title_split_char='',
                          useScore=False):
    """
    Returns, for each query, the set of hit names that share the best HSP value.

    "Best" is the lowest evalue. If useScore is True, or if the first HSP found in the file has no evalue
    (evalue is None), the highest bitscore is used instead. Hits that tie for the best value are all kept.

    hitsfile, filetype: passed to reptools.hitsParser
    mincols, evalue: passed to reptools.hitsParser (minimum alignment span and maximum evalue filters)
    title_split_char: separator used to cut the query id down to its first field. The empty string (the
                      default) keeps the whole id, since str.split('') is an error. None splits on whitespace.
    useScore: force selection by bitscore rather than evalue

    Returns a dict mapping query name -> set of hit ids (an empty set if the query has no HSPs).
    """
    if not useScore:
        # Probe the first result that has a hit with an HSP: no evalue there means we must rank by bitscore.
        for result in reptools.hitsParser(hitsfile,
                                          filetype,
                                          mincols=mincols,
                                          evalue=evalue):
            if len(result.hits) > 0:
                if len(result.hits[0].hsps) > 0:
                    useScore = result.hits[0].hsps[0].evalue is None
                    break

    hitnames = {}
    for result in reptools.hitsParser(hitsfile,
                                      filetype,
                                      mincols=mincols,
                                      evalue=evalue):
        if title_split_char == '':
            queryname = result.id  # nothing to split on: keep the full id
        else:
            queryname = result.id.split(title_split_char)[0]

        best = None  # None = nothing seen yet, so the first HSP always wins (even a negative bitscore)
        names = set()
        for hit in result.hits:
            for hsp in hit.hsps:
                if useScore:
                    value = hsp.bitscore
                    improves = best is None or value > best
                else:
                    value = hsp.evalue
                    improves = best is None or value < best
                if improves:
                    names = {hit.id}
                    best = value
                elif value == best:
                    names.add(hit.id)
        hitnames[queryname] = names
    return hitnames
# Example #2
0
def parseHitfile(fn,aligner,title_split=' ',clusterID_position=0,mincols=False,id=False,evalue=False):
    """
    Parses a hits file and returns the filtered hits of every query that still has some.

    fn, aligner: passed to reptools.hitsParser
    title_split, clusterID_position: the query id is split on title_split and the field at
                                     clusterID_position becomes the dictionary key
    mincols, id, evalue: passed to reptools.filterHits

    Returns a dict mapping query key -> filtered hits; queries whose filtered hits are empty are left out.
    """
    filtered_by_query = {}
    for record in reptools.hitsParser(fn, aligner):
        kept = reptools.filterHits(record.hits, mincols=mincols, id=id, evalue=evalue)
        query_key = record.id.split(title_split)[clusterID_position]
        # a later record with the same key replaces the earlier one, empty or not
        filtered_by_query[query_key] = kept
    # drop the queries that the filter left with nothing
    return {
        query_key: kept
        for query_key, kept in filtered_by_query.items()
        if len(kept) > 0
    }
# Example #3
0
def _best_scoring_hits(hits, attribute, selectionoperator):
    """
    Returns the hits whose best HSP value of `attribute` equals the best value found over all hits.

    selectionoperator is max or min; it is applied within each hit's HSPs and then across the hits.
    Hits with no HSPs have nothing to compare and are left out.
    """
    scored = [
        (hit, selectionoperator([getattr(hsp, attribute) for hsp in hit.hsps]))
        for hit in hits
        if len(hit.hsps) > 0
    ]
    if len(scored) == 0:
        return []
    # computed once, rather than once per hit
    overall_best = selectionoperator([value for _, value in scored])
    return [hit for hit, value in scored if value == overall_best]


def get_top_hits(
                 hitsfile,filetype='blast',
                 title_split=None,clusterID_position=0,criteria=('bitscore','high'),
                 mincols=False,id=False,evalue=False
                 ):
    """
    Filetype should be "stellar", "usearch", "ublast", "local", "global", "blast", "swipe", or "dict"
    Returns a dictionary keyed by query id, containing the best hit or hits for each query

    hitsfile: passed to reptools.hitsParser, or, if filetype is "dict", a mapping of query id -> hits
    title_split, clusterID_position: the query id is split on title_split and the field at
                                     clusterID_position becomes the dictionary key
    criteria: sequence of length 2: the HSP attribute to select on, then "high" or "low"
    mincols, id, evalue: passed to reptools.filterHits, which is applied AFTER the best hits are selected

    Queries left with no hits after filtering are removed from the result.
    Raises ValueError if criteria[1] starts with neither "high" nor "low".
    """
    direction = criteria[1].lower()
    if direction.startswith('high'):
        selectionoperator = max
    elif direction.startswith('low'):
        selectionoperator = min
    else:
        raise ValueError(
            '"criteria" must be a list of len=2, with the first item giving the name of the property to select on, and '
            'the second being either "low" or "high"'
        )

    # Both input kinds reduce to a stream of (query id, hits) pairs
    if filetype == 'dict':
        queries = ((queryid, hitsfile[queryid]) for queryid in hitsfile)
    else:
        queries = ((result.id, result.hits) for result in reptools.hitsParser(hitsfile,filetype=filetype))

    besthits = {}
    for queryid, hits in queries:
        title = queryid.split(title_split)[clusterID_position]
        besthits[title] = _best_scoring_hits(hits, criteria[0], selectionoperator)

    filtered = {
        title: reptools.filterHits(besthits[title],mincols=mincols,id=id,evalue=evalue)
        for title in besthits
    }
    # remove queries left empty by filter
    return {title: kept for title, kept in filtered.items() if len(kept) > 0}
# Example #4
0
def _hsp_distance_to_position(hsp, position):
    """
    Returns 0 if `position` lies strictly between the HSP's hit_start and hit_end (in either orientation),
    otherwise the distance from `position` to the nearer of the two ends.
    """
    low = min(hsp.hit_start, hsp.hit_end)
    high = max(hsp.hit_start, hsp.hit_end)
    if low < position < high:
        return 0
    return min(abs(hsp.hit_start - position), abs(hsp.hit_end - position))


def find_nearest_to_overlap_hits(
                                 hitsfile,gene,genedictionary,dict_entry,
                                 filetype='blast',evalue=False,title_split=None,clusterID_position=0,
                                 criteria=('bitscore','high'),
                                 top_hits_only=False,returnhitstrings=False,
                                 mincols=False, id=False
                                 ):
    """
    Filetype should be "stellar", "usearch", "ublast", "local", "global", "blast", or "swipe"

    For each query, picks the HSP lying closest to a reference position on its hit. The position for a hit
    is read from genedictionary[gene][dict_entry][hit.id]. Ties in distance are broken by criteria, and the
    first HSP encountered wins any remaining tie.

    hitsfile, filetype, evalue: passed to reptools.hitsParser
    title_split, clusterID_position: the query id is split on title_split and the field at
                                     clusterID_position becomes the dictionary key
    criteria: sequence of length 2: the HSP attribute to select on, then "high" or "low"
    top_hits_only: if True, only the hits tying for the best criteria value are considered
    returnhitstrings: if True (requires top_hits_only), also return the considered hit ids joined by "+"
    mincols, id: passed to reptools.filterHits

    Returns a dict mapping query key -> the selected HSP object, or, if returnhitstrings is True, a tuple
    of (that dict, dict mapping query key -> hit id string). Queries with no usable hits are omitted.
    Raises ValueError for returnhitstrings without top_hits_only, or for an unrecognised criteria[1].
    """
    if returnhitstrings and not top_hits_only:
        raise ValueError('returnhitstrings=True must be used with top_hits_only=True')
    direction = criteria[1].lower()
    if direction.startswith('high'):
        selectionoperator = max
    elif direction.startswith('low'):
        selectionoperator = min
    else:
        raise ValueError(
            '"criteria" must be a list of len=2, with the first item giving the name of the property to select on, and '
            'the second being either "low" or "high"'
        )

    def score_of(hsp):
        return getattr(hsp, criteria[0])

    besthits = {}
    hitstrings = {}
    for result in reptools.hitsParser(hitsfile,filetype=filetype,evalue=evalue):
        # a hit with no HSPs has neither a score nor a distance, so it cannot take part
        hits = [h for h in result.hits if len(h.hsps) > 0]
        if len(hits) == 0:
            continue
        if top_hits_only: #first, select only the top hits, if instructed
            hitscores = [selectionoperator([score_of(hsp) for hsp in h.hsps]) for h in hits]
            topscore = selectionoperator(hitscores)
            hits = [h for h, s in zip(hits, hitscores) if s == topscore]
        hits = reptools.filterHits(hits, mincols=mincols, id=id) #filter hits by mincols and id
        hits = [h for h in hits if len(h.hsps) > 0] #in case the filter left a hit without HSPs
        if len(hits) == 0:
            continue
        title = result.id.split(title_split)[clusterID_position]
        if returnhitstrings:
            hitstrings[title] = '+'.join([h.id for h in hits])

        #from each hit, select the hsp closest to the junction (best by criteria among equally close hsps)
        closesthsp = {}
        for hit in hits:
            pos_H = genedictionary[gene][dict_entry][hit.id]
            distances = [_hsp_distance_to_position(hsp, pos_H) for hsp in hit.hsps]
            closest_dist = min(distances)
            candidatehsps = [hsp for hsp, d in zip(hit.hsps, distances) if d == closest_dist]
            #max/min with a key return the first of several equally good items
            closesthsp[hit.id] = (selectionoperator(candidatehsps, key=score_of), closest_dist)

        #now select the best overall: closest by distance, then best of the closest by selection criteria
        overall_dist = min(d for _, d in closesthsp.values())
        closesthits = [hsp for hsp, d in closesthsp.values() if d == overall_dist]
        besthits[title] = selectionoperator(closesthits, key=score_of)

    if returnhitstrings:
        return (besthits, hitstrings)
    return besthits