示例#1
0
def FilterSignalPeptideInTopology(topo, sp_pos):#{{{
    """
    Filter signal peptide in topology

    topo:   topology string; GAP characters are removed before the first TM
            helix is compared with the signal peptide
    sp_pos: location of signal peptide; the range (0, sp_pos) is compared
            with the first TM helix by myfunc.coverage

    When more than half of the first TM helix is covered by (0, sp_pos),
    the part of topo up to the end of the first TM helix is rewritten: the
    N-terminal state and 'M' are both replaced by 'o' if the N-terminal
    state is 'i', otherwise by 'i'.  In every other case topo is returned
    unchanged.
    """
    ungapped = topo.replace(GAP, '')
    # position of the N-terminal TM helix in the gapless topology
    (fb, fe) = myfunc.GetFirstTMPosition(ungapped)
    if fb == -1:
        # presumably -1 means that no TM helix was found
        return topo

    cov = myfunc.coverage(0, sp_pos, fb, fe)
    if cov / float(fe - fb) > 0.5:
        # the first TM helix is a signal peptide;
        # locate the end of that helix in the aligned (gapped) form
        (_, fe_aln) = myfunc.GetFirstTMPosition(topo)
        Nterm = GetNtermState(topo)
        newNterm = 'o' if Nterm == 'i' else 'i'
        head = topo[:fe_aln]
        tail = topo[fe_aln:]
        head = head.replace(Nterm, newNterm)
        head = head.replace('M', newNterm)
        return head + tail

    return topo
示例#2
0
def FilterSignalPeptideInTopology(topo, sp_pos):#{{{
    """
    Filter signal peptide in topology

    topo:   topology string; GAP characters are removed before the first TM
            helix is compared with the signal peptide
    sp_pos: location of signal peptide; the range (0, sp_pos) is compared
            with the first TM helix by myfunc.coverage

    When more than half of the first TM helix is covered by (0, sp_pos),
    the part of topo up to the end of the first TM helix is rewritten: the
    N-terminal state and 'M' are both replaced by 'o' if the N-terminal
    state is 'i', otherwise by 'i'.  In every other case topo is returned
    unchanged.
    """
    ungapped = topo.replace(GAP, '')
    # position of the N-terminal TM helix in the gapless topology
    (fb, fe) = myfunc.GetFirstTMPosition(ungapped)
    if fb == -1:
        # presumably -1 means that no TM helix was found
        return topo

    cov = myfunc.coverage(0, sp_pos, fb, fe)
    if cov / float(fe - fb) > 0.5:
        # the first TM helix is a signal peptide;
        # locate the end of that helix in the aligned (gapped) form
        (_, fe_aln) = myfunc.GetFirstTMPosition(topo)
        Nterm = GetNtermState(topo)
        newNterm = 'o' if Nterm == 'i' else 'i'
        head = topo[:fe_aln]
        tail = topo[fe_aln:]
        head = head.replace(Nterm, newNterm)
        head = head.replace('M', newNterm)
        return head + tail

    return topo
示例#3
0
def IsOverlappingDomain(thisRangeList, totalRangeList, seqid, hit):#{{{
    """
    Check whether one list of ranges overlaps substantially with another.

    thisRangeList:  list of (begin, end) ranges to be tested
    totalRangeList: list of (begin, end) ranges to test against
    seqid:          sequence identifier, used only in the log message
    hit:            dict with the keys 'pfamid', 'alnBeg' and 'alnEnd',
                    used only in the log message

    The percent coverage is the sum of all pairwise overlaps (negative
    values returned by myfunc.coverage count as 0) divided by the mean of
    the two summed range lengths, times 100.

    Returns True, after writing one line to stderr, when the percent
    coverage is at least 30.  Returns False otherwise, and also when either
    list is empty or when the summed range lengths are not positive.
    """
    if len(totalRangeList) < 1 or len(thisRangeList) < 1:
        return False

    thisSumLen = sum(e1 - b1 for (b1, e1) in thisRangeList)
    totalSumLen = sum(e2 - b2 for (b2, e2) in totalRangeList)
    meanLen = float(thisSumLen + totalSumLen) / 2.0
    if meanLen <= 0:
        # nothing to normalise by (e.g. only zero-length ranges); a
        # non-positive length can never reach the 30% threshold and a zero
        # length would raise ZeroDivisionError below
        return False

    sumCover = 0
    for (b1, e1) in thisRangeList:
        for (b2, e2) in totalRangeList:
            sumCover += max(myfunc.coverage(b1, e1, b2, e2), 0)

    percentCoverage = (sumCover / meanLen) * 100
    if percentCoverage < 30.0:
        return False

    # write() produces the same line as the former "print >> sys.stderr"
    # statement and behaves identically under Python 2 and Python 3
    sys.stderr.write("%s %s %s\n" % (
        "Overlapping found:",
        "%s %s %4d %4d" % (
            seqid, hit['pfamid'], hit['alnBeg'], hit['alnEnd']),
        "%4d %4d %4d %.1f" % (
            sumCover, thisSumLen, totalSumLen, percentCoverage)))
    return True
def MaskTopologyBySignalPeptide(idList, topoList, signalpDict):
    """
    Mask the first TM helix of each topology when it is a signal peptide.

    idList:      sequence identifiers
    topoList:    topology strings; topoList[i] belongs to idList[i]
    signalpDict: maps a sequence identifier to its signal peptide position;
                 the range (0, position) is compared with the first TM helix
                 by myfunc.coverage

    For an identifier present in signalpDict, the first TM helix (b, e)
    reported by myfunc.GetTMPosition is masked when more than half of it is
    covered by the signal peptide: topo[:e] is overwritten with the state
    found at topo[e].  The topology is kept unchanged when the identifier is
    not in signalpDict, when no TM helix is found, or when there is no
    position e in the topology to take the masking state from.

    Returns a new list holding one topology per identifier.
    """
    newTopoList = []
    for (i, seqid) in enumerate(idList):
        topo = topoList[i]
        if seqid not in signalpDict:
            newTopoList.append(topo)
            continue

        posTMList = myfunc.GetTMPosition(topo)
        # only the lookups that may legitimately fail are guarded, so that
        # an error raised inside myfunc.coverage is no longer swallowed
        try:
            posSigP = signalpDict[seqid]
            (b, e) = (posTMList[0][0], posTMList[0][1])
            masked_state = topo[e]
        except (KeyError, IndexError):
            newTopoList.append(topo)
            continue

        cov = myfunc.coverage(0, posSigP, b, e)
        if float(cov)/(e-b) > 0.5:
            # mask everything up to the end of the first TM helix
            newTopo = masked_state * e + topo[e:]
            newTopoList.append(newTopo)
            if DEBUG:
                # single-argument print(...) gives the same output under
                # Python 2 and Python 3
                print("")
                print(" ".join(["posTM", str((b, e)),
                                "SignalPeptide", str(posSigP)]))
                print(topo)
                print(newTopo)
        else:
            newTopoList.append(topo)
    return newTopoList
示例#5
0
def GetCoveredTM(segment, posTM):#{{{
    """
    Select the TM helices that are mostly covered by a segment.

    segment: indexable whose items 0 and 1 are the begin and end of the
             segment
    posTM:   iterable of (begin, end) positions of TM helices

    A helix (b, e) is selected when the value returned by myfunc.coverage
    for the helix and the segment exceeds 75% of the helix length e - b.

    Returns (newPosTM, indexList): the selected (b, e) tuples and their
    indices in posTM.
    """
    selected = []
    for (idx, (b, e)) in enumerate(posTM):
        overlap = myfunc.coverage(b, e, segment[0], segment[1])
        if overlap / float(e - b) > 0.75:
            selected.append((idx, (b, e)))
    newPosTM = [pos for (_, pos) in selected]
    indexList = [idx for (idx, _) in selected]
    return (newPosTM, indexList)
示例#6
0
def IsDuplicated(hitList, seqlen1, seqlen2):#{{{
    """
    Check whether the template is a duplicated form of the query.

    hitList: list of hits retrieved by HHsearch; each hit is a dict with
        posQuery:    (begin, end) of the hit in the query sequence
        posTemplate: (begin, end) of the hit in the template sequence
        numTM1:      numTM of the hit in the query sequence
        numTM2:      numTM of the hit in the template sequence
    seqlen1, seqlen2: not used by this function

    Returns True as soon as one pair of hits satisfies all of:
      - the overlap of the two hits in the query is >= 75% of the query
        length of at least one of them,
      - the overlap of the two hits in the template is < 25% of the
        template length of each of them,
      - numTM1 and numTM2 are > 0 for both hits,
      - the numTM2 values of the two hits differ by at most 2.
    Returns False when no pair qualifies.
    """
    for (hit1, hit2) in itertools.combinations(hitList, 2):
        (qb1, qe1) = hit1['posQuery']
        (qb2, qe2) = hit2['posQuery']
        (tb1, te1) = hit1['posTemplate']
        (tb2, te2) = hit2['posTemplate']
        ovl_query = max(0, myfunc.coverage(qb1, qe1, qb2, qe2))
        ovl_temp = max(0, myfunc.coverage(tb1, te1, tb2, te2))

        # both hits must come from (largely) the same query segment ...
        if not (ovl_query / float(qe1 - qb1) >= 0.75
                or ovl_query / float(qe2 - qb2) >= 0.75):
            continue
        # ... but land on two (largely) separate template segments
        if not (ovl_temp / float(te1 - tb1) < 0.25):
            continue
        if not (ovl_temp / float(te2 - tb2) < 0.25):
            continue
        # every segment involved must contain TM helices
        if not all(hit[key] > 0
                   for hit in (hit1, hit2)
                   for key in ('numTM1', 'numTM2')):
            continue
        # similar number of TM helices in the two template segments
        if abs(hit1['numTM2'] - hit2['numTM2']) <= 2:
            return True
    return False
示例#7
0
def IsDuplicatedByHHSearch(hhrfile):  #{{{
    """
    Check from an HHsearch result file whether the template is a duplicated
    form of the query.

    hhrfile: path of the hhr file

    The lines following the " No Hit" header line are parsed with
    ExtractHit until a blank line or a line for which ExtractHit returns {}.
    The two hits with the lowest 'evalue' are then compared: the result is
    True when the second lowest E-value is <= 1e-3 and the overlap of the
    two hits in the template ('posTemplate') is below 20% of the template
    length of each of them.

    Returns False when fewer than two hits are found, and also when the file
    cannot be read (a message is then written to stderr).
    """
    try:
        # "with" closes the file even when reading fails
        with open(hhrfile, "r") as fpin:
            lines = fpin.readlines()
    except IOError:
        sys.stderr.write("Failed to read hhrfile %s" % hhrfile + "\n")
        return False

    hitList = []
    for (i, line) in enumerate(lines):
        if not line.startswith(" No Hit"):
            continue
        for hitLine in lines[i + 1:]:
            # readlines() keeps the newline, so a blank line is "\n" and
            # never ""; strip before testing for the end of the hit table
            if hitLine.strip() == "":
                break
            hit = ExtractHit(hitLine)
            if hit == {}:
                break
            hitList.append(hit)
        break

    if len(hitList) < 2:
        return False

    # take the two best hits from the list sorted by E-value; the threshold
    # test on the second hit is only meaningful on the sorted list
    sortedHitList = sorted(hitList, key=lambda x: x['evalue'])
    (hit1, hit2) = sortedHitList[:2]
    if hit2['evalue'] > 1e-3:
        return False

    (b1, e1) = hit1['posTemplate']
    (b2, e2) = hit2['posTemplate']
    overlap = max(0, myfunc.coverage(b1, e1, b2, e2))
    return (overlap / float(e1 - b1) < 0.2
            and overlap / float(e2 - b2) < 0.2)
示例#8
0
def IsDuplicatedByHHSearch(hhrfile, seqid1="", seqid2="", cnt=0):#{{{
    """
    Check from an HHsearch result file whether the template is a duplicated
    form of the query, and report the result on stdout.

    hhrfile: path of the hhr file
    seqid1:  first identifier, used only in the report line
    seqid2:  second identifier, used only in the report line
    cnt:     counter, used only in the report line

    The query length is read from the "Match_columns" line.  The lines
    following the " No Hit" header line are parsed with ExtractHit until a
    blank line or a line for which ExtractHit returns {}.

    A duplication is reported when
      - there are at least two hits and the second lowest 'evalue' is
        <= 1e-3,
      - at least two hits are "good", i.e. their 'posQuery' range spans more
        than half of the query length, and
      - for some pair of good hits the overlap in the template
        ('posTemplate') is below 20% of the template length of each hit.

    One line "cnt: seqid1-seqid2 y|n numHit=... numGoodHit=..." is written
    to stdout.  Returns True for a duplication, otherwise False.  Returns
    False without writing to stdout when the file cannot be read or the
    "Match_columns" line cannot be parsed (a message is then written to
    stderr).
    """
    try:
        # Read in hhsearch hits; "with" closes the file even on failure
        with open(hhrfile, "r") as fpin:
            lines = fpin.readlines()
    except IOError:
        sys.stderr.write("Failed to read hhrfile %s"%hhrfile + "\n")
        return False

    lengthQuery = 0
    hitList = []
    for (i, line) in enumerate(lines):
        if line.startswith("Match_columns"):
            try:
                lengthQuery = int(line.split()[1])
            except (IndexError, ValueError):
                sys.stderr.write(
                    "Error in hhrfile %s. Ignore"%(hhrfile) + "\n")
                return False
        elif line.startswith(" No Hit"):
            for hitLine in lines[i+1:]:
                # readlines() keeps the newline, so a blank line is "\n"
                # and never ""; strip before testing for the table end
                if hitLine.strip() == "":
                    break
                hit = ExtractHit(hitLine)
                if hit == {}:
                    break
                hitList.append(hit)
            break

    isDup = False
    # checking whether the template is a duplicated form of the query
    numHit = len(hitList)
    numGoodHit = 0
    # without a positive query length no hit can be rated (and the ratio
    # below would divide by zero)
    if numHit >= 2 and lengthQuery > 0:
        sortedHitList = sorted(hitList, key=lambda x: x['evalue'])
        # on the sorted list, the second entry being below the threshold
        # means that at least two hits are below it
        if sortedHitList[1]['evalue'] <= 1e-3:
            goodHitList = []
            for hit in sortedHitList:
                (b1, e1) = hit['posQuery']
                if (e1-b1)/float(lengthQuery) > 0.5:
                    goodHitList.append(hit)
            numGoodHit = len(goodHitList)
            # if any two good hits do not overlap in the template, consider
            # it a duplication; fewer than two good hits yield no pair
            for (hit1, hit2) in itertools.combinations(goodHitList, 2):
                (b1, e1) = hit1['posTemplate']
                (b2, e2) = hit2['posTemplate']
                overlap = max(0, myfunc.coverage(b1, e1, b2, e2))
                if ((overlap / float(e1-b1) < 0.2)
                        and (overlap / float(e2-b2) < 0.2)):
                    isDup = True
                    break

    if isDup:
        ss_isdup = 'y'
    else:
        ss_isdup = 'n'
    sys.stdout.write("%d: %s-%s %s numHit=%d numGoodHit=%d\n" %(cnt, seqid1,
        seqid2, ss_isdup, numHit, numGoodHit))
    return isDup