def ReadPercentTM(infile):#{{{
    """Read a six-column table into a dict of [numTM, numSeq] lists.

    Only lines that split into exactly six whitespace-separated fields are
    used: field 0 is the key, fields 1 and 3 are parsed as integers and
    stored as a two-element list.  Empty lines and lines starting with "#"
    are skipped.  A six-field line whose integer fields cannot be parsed is
    reported on stderr (with its running line number) and ignored.

    Returns an empty dict when infile cannot be read.
    """
    reader = myfunc.ReadLineByBlock(infile)
    if reader.failure:
        return {}
    percentTMDict = {}
    lineno = 0
    # readlines() hands back one block of lines per call and None at the end.
    for block in iter(reader.readlines, None):
        for line in block:
            lineno += 1
            if not line or line[0] == "#":
                continue
            fields = line.split()
            if len(fields) != 6:
                continue
            try:
                percentTMDict[fields[0]] = [int(fields[1]), int(fields[3])]
            except (IndexError, ValueError):
                msg = "Error in mapfile %s at line %d: \"%s\""
                print >> sys.stderr, msg%(infile, lineno, line)
    reader.close()
    return percentTMDict
예제 #2
0
def MPA2MSA_old(infile, output_format, fpout):  #{{{
    """Copy header/sequence line pairs from infile to fpout, expanding gaps.

    The input is consumed as consecutive pairs of lines.  The first line of
    each pair is written unchanged.  The second line is split on whitespace;
    a token containing "-" is read as "<b>-<e>" and replaced by (e - b) gap
    characters, every other token is written as is.  Tokens are emitted
    without separators and the record is terminated by a newline.

    output_format is accepted but not used.  An unpaired line left at the
    very end of the input is dropped.

    Returns 1 when infile cannot be read, otherwise 0.
    """
    reader = myfunc.ReadLineByBlock(infile)
    if reader.failure:
        return 1
    pending = []  # odd line carried over from the previous block
    for block in iter(reader.readlines, None):
        block = pending + block
        numPair = len(block) // 2
        headers = block[0:2 * numPair:2]
        bodies = block[1:2 * numPair:2]
        for header, body in zip(headers, bodies):
            fpout.write("%s\n" % header)
            for token in body.split():
                if "-" in token:
                    bounds = token.split("-")
                    begin = int(bounds[0])
                    end = int(bounds[1])
                    fpout.write("%s" % ("-" * (end - begin)))
                else:
                    fpout.write("%s" % (token))
            fpout.write("\n")
        # Either the single unpaired trailing line or an empty list.
        pending = block[2 * numPair:]
    reader.close()

    return 0
예제 #3
0
def ReadDupPairDict(infile):#{{{
    """Collect the pairs flagged 'y' in a duplication file, with their hits.

    A line is used when its second whitespace-separated field is 'y' and its
    first field has the form "<id1>-<id2>".  The result maps the tuple
    (id1, id2) to {'isDup': 'y', 'hit': [...]}, where the hit list holds
    ParseDupHit() applied to every '|'-separated section of the line after
    the first one.  Empty lines and lines starting with "#" are skipped.

    Returns an empty dict when infile cannot be read.
    """
    reader = myfunc.ReadLineByBlock(infile)
    if reader.failure:
        return {}

    dupPairs = {}
    for block in iter(reader.readlines, None):
        for line in block:
            if line == "" or line[0] == "#":
                continue
            fields = line.split()
            if len(fields) < 2 or fields[1] != 'y':
                continue
            ids = fields[0].split("-")
            if len(ids) != 2:
                continue
            sections = line.split('|')
            # Each hit pairs a segment from the query with one from the
            # template (per the original author's note).
            hits = [ParseDupHit(section.strip()) for section in sections[1:]]
            dupPairs[(ids[0], ids[1])] = {'isDup': 'y', 'hit': hits}
    reader.close()
    return dupPairs
예제 #4
0
def MPA2MSA(infile, output_format, fpout):  #{{{
    """Copy header/sequence line pairs from infile to fpout, expanding gaps.

    The input is consumed as consecutive pairs of lines.  The first line of
    each pair is written unchanged.  The second line is split on whitespace;
    a token whose first character is a digit is parsed as an integer n and
    replaced by n gap characters, every other token is kept as is.  The
    pieces are joined without separators and written as one line.

    output_format is accepted but not used.  An unpaired line left at the
    very end of the input is dropped.

    Returns 1 when infile cannot be read, otherwise 0.
    """
    reader = myfunc.ReadLineByBlock(infile)
    if reader.failure:
        return 1
    pending = []  # odd line carried over from the previous block
    for block in iter(reader.readlines, None):
        block = pending + block
        numPair = len(block) // 2
        headers = block[0:2 * numPair:2]
        bodies = block[1:2 * numPair:2]
        for header, body in zip(headers, bodies):
            fpout.write("%s\n" % header)
            pieces = []
            for token in body.split():
                if token[0].isdigit():
                    pieces.append("-" * int(token))
                else:
                    pieces.append(token)
            fpout.write("%s\n" % ("".join(pieces)))
        # Either the single unpaired trailing line or an empty list.
        pending = block[2 * numPair:]
    reader.close()

    return 0
def ReadPairInfo(infile):  #{{{
    """Read a whitespace-separated pair table into a list of records.

    Every non-empty line not starting with "#" yields one nine-element list:
    [seqid1, seqid2, NtermState1, NtermState2, numTM1, numTM2,
     seqLen1, seqLen2, seqidt].
    Fields 4-7 are converted with int() and field 8 with float(); a line
    with too few fields or a non-numeric value raises IndexError/ValueError.

    Returns an empty list when infile cannot be read.
    """
    reader = myfunc.ReadLineByBlock(infile)
    if reader.failure:
        return []
    records = []
    for block in iter(reader.readlines, None):
        for line in block:
            if line == "" or line[0] == "#":
                continue
            cols = line.split()
            records.append([
                cols[0], cols[1], cols[2], cols[3],
                int(cols[4]), int(cols[5]),
                int(cols[6]), int(cols[7]),
                float(cols[8]),
            ])
    reader.close()
    return records
예제 #6
0
def ReadPfamScan2(infile):#{{{
    """Read a scan result file into {seqid: [(alnBeg, alnEnd, pfamid, clanid)]}.

    A quick solution that saves a little memory by storing each hit as a
    plain tuple.  Lines with at least 15 whitespace-separated fields are
    used: field 0 is the sequence ID, fields 1 and 2 the alignment begin and
    end (int), field 5 the family ID with everything after the first "."
    removed, field 12 the e-value (float) and field 14 the clan ID.  A hit
    is kept only when its e-value is <= g_params['evalue_threshold'].
    Empty lines and lines starting with "#" are skipped.

    Returns an empty dict when infile cannot be read.
    """
    maxEvalue = g_params['evalue_threshold']
    hitsBySeqID = {}
    reader = myfunc.ReadLineByBlock(infile)
    if reader.failure:
        return {}

    for block in iter(reader.readlines, None):
        for line in block:
            if line == "" or line[0] == "#":
                continue
            cols = line.split()
            if len(cols) < 15:
                continue
            seqid = cols[0]
            alnBeg = int(cols[1])
            alnEnd = int(cols[2])
            pfamid = cols[5].split('.')[0]
            evalue = float(cols[12])
            clanid = cols[14]
            if evalue <= maxEvalue:
                hit = (alnBeg, alnEnd, pfamid, clanid)
                hitsBySeqID.setdefault(seqid, []).append(hit)
    if reader:
        reader.close()
    return hitsBySeqID
def Filter_seqid2fam_map(infile, keyIDSet, contentIDSet, isKeyIDSet,
        isContentIDSet, fpout):
    """Filter a "key count id1 id2 ..." map file and write the result to fpout.

    Each non-empty input line is split on whitespace into a key, a count and
    a list of IDs.  A line is kept when the key passes the key filter and at
    least one ID passes the content filter; it is rewritten as
    "key <number of kept IDs> <kept IDs...>".

    Parameters:
        infile          path of the map file
        keyIDSet        container of accepted keys (used if isKeyIDSet)
        contentIDSet    container of accepted IDs (used if isContentIDSet)
        isKeyIDSet      when false, every key is accepted
        isContentIDSet  when false, every ID is accepted
        fpout           writable file-like object

    Returns 0 on success.  Returns 1 when infile cannot be read or when a
    line lacks the key/count fields or has a non-integer count; in the
    latter case the line is reported on stderr and processing stops.
    """
    hdl = myfunc.ReadLineByBlock(infile)
    if hdl.failure:
        return 1
    try:
        for lines in iter(hdl.readlines, None):
            for line in lines:
                if not line:
                    continue
                strs = line.split()
                try:
                    key = strs[0]
                    int(strs[1])  # the count is only validated, not reused
                except (IndexError, ValueError):
                    msg = "Error in infile %s with line \"%s\""
                    sys.stderr.write(msg%(infile, line) + "\n")
                    return 1
                if isKeyIDSet and key not in keyIDSet:
                    continue
                kept = [idd for idd in strs[2:]
                        if (not isContentIDSet) or (idd in contentIDSet)]
                if kept:
                    fpout.write("%s %d %s\n"%(key, len(kept), " ".join(kept)))
    finally:
        # Also reached on the error return above, so the reader never leaks.
        hdl.close()
    return 0
예제 #8
0
def ReadDGScore(infile):#{{{
    """Read per-sequence scores into a dict mapping seqid -> list of floats.

    Non-empty lines not starting with "#" are split on whitespace.  With two
    fields the score is field 1; with three fields it is field 2.  The score
    is appended to the list stored under field 0.  Lines with any other
    number of fields, or with a score that is not a valid float, are
    skipped.  (Previously a line with four or more fields re-used the score
    of the preceding line, or raised NameError when it came first.)

    Returns an empty dict, after reporting on stderr, when infile cannot be
    read.
    """
    hdl = myfunc.ReadLineByBlock(infile)
    if hdl.failure:
        msg = "Failed to read file %s in function %s"
        print(msg%(infile, sys._getframe().f_code.co_name), file=sys.stderr)
        return {}
    dgScoreDict = {}
    try:
        for lines in iter(hdl.readlines, None):
            for line in lines:
                if not line or line[0] == "#":
                    continue
                strs = line.split()
                numStr = len(strs)
                if numStr == 2:
                    scoreField = strs[1]
                elif numStr == 3:
                    scoreField = strs[2]
                else:
                    # No rule says which column holds the score: skip.
                    continue
                try:
                    dgscore = float(scoreField)
                except ValueError:
                    continue
                dgScoreDict.setdefault(strs[0], []).append(dgscore)
    finally:
        hdl.close()
    return dgScoreDict
예제 #9
0
def ReadPSIPREDSS2(infile):
    """Read a six-column per-residue table into sequences and score arrays.

    A line is a record when it has exactly six whitespace-separated fields
    and field 0 consists of digits.  Field 1 is appended to the first
    returned string, field 2 to the second, and fields 3-5 are each
    multiplied by 1000, truncated to int and appended to one of three
    array('h') objects.

    Returns (aaSeq, ssSeq, arrayList).  Returns (None, None, None) when
    infile cannot be read or when a record has a non-numeric value in
    fields 3-5 (the record is reported on stderr).  The reader is closed on
    every path, including the bad-record return.
    """
    hdl = myfunc.ReadLineByBlock(infile)
    if hdl.failure:
        return (None, None, None)
    aaSeqList = []
    ssSeqList = []
    arrayList = [array('h') for _ in range(3)]

    try:
        for lines in iter(hdl.readlines, None):
            for line in lines:
                strs = line.split()
                if len(strs) != 6 or not strs[0].isdigit():
                    continue
                aaSeqList.append(strs[1])
                ssSeqList.append(strs[2])
                try:
                    values = [int(float(field) * 1000) for field in strs[3:6]]
                except (ValueError, IndexError):
                    msg = "Bad record \"%s\" in file %s"
                    sys.stderr.write(msg % (line, infile) + "\n")
                    return (None, None, None)
                for scores, value in zip(arrayList, values):
                    scores.append(value)
    finally:
        hdl.close()
    aaSeq = "".join(aaSeqList)
    ssSeq = "".join(ssSeqList)
    return (aaSeq, ssSeq, arrayList)
def GetFullSeq(infile, hdl_seqdb, fpout):#{{{
    """Write the database record of every ID found in infile to fpout.

    IDs are extracted block by block with GetDatabaseIDList() and reduced
    with myfunc.uniquelist().  For each remaining ID the record returned by
    hdl_seqdb.GetRecord() is written to fpout; an ID without a record is
    reported on stderr.

    Returns (status, numID, numRetrieved): status is 0 on success, and
    (1, 0, 0) is returned when infile cannot be read.
    """
    reader = myfunc.ReadLineByBlock(infile)
    if reader.failure:
        return (1, 0, 0)

    collected = []
    for block in iter(reader.readlines, None):
        collected += GetDatabaseIDList(block)
    reader.close()

    uniqueIDs = myfunc.uniquelist(collected)
    numID = len(uniqueIDs)
    numRetrieved = 0
    for seqid in uniqueIDs:
        record = hdl_seqdb.GetRecord(seqid)
        if not record:
            msg = "Failed to retrieve record for ID %s"
            print >> sys.stderr, msg%(seqid)
            continue
        fpout.write(record)
        numRetrieved += 1
    return (0, numID, numRetrieved)
예제 #11
0
def ReadPairAlnTableInfo(infile):#{{{
    """Read a 13-column pairwise alignment table into a dict of dicts.

    Non-empty lines not starting with "#" that have exactly 13
    whitespace-separated fields are used.  The key is "<field0>-<field1>"
    and the value is a dict with:
        'seqidt'      float(field 2)
        'seqidt1'     float(field 11)
        'seqidt2'     float(field 12)
        'seqLength1'  int(field 5)
        'seqLength2'  int(field 6)
    Field 4 must also parse as a float but is not stored.  Lines with a
    non-numeric value in any of these fields are skipped.

    Returns an empty dict, after reporting on stderr, when infile cannot be
    read.
    """
    hdl = myfunc.ReadLineByBlock(infile)
    if hdl.failure:
        msg = "Failed to read file %s in function %s"
        print(msg%(infile, sys._getframe().f_code.co_name), file=sys.stderr)
        return {}
    pairalnStat = {}
    try:
        for lines in iter(hdl.readlines, None):
            for line in lines:
                if not line or line[0] == "#":
                    continue
                strs = line.split()
                if len(strs) != 13:
                    continue
                try:
                    seqidt = float(strs[2])
                    float(strs[4])  # alignment length: validated only
                    seqlen1 = int(strs[5])
                    seqlen2 = int(strs[6])
                    seqidt1 = float(strs[11])
                    seqidt2 = float(strs[12])
                except ValueError:
                    continue
                pairalnStat[strs[0] + '-' + strs[1]] = {
                    'seqidt': seqidt,
                    'seqidt1': seqidt1,
                    # Was assigned seqidt1 by mistake.
                    'seqidt2': seqidt2,
                    'seqLength1': seqlen1,
                    'seqLength2': seqlen2,
                }
    finally:
        hdl.close()
    return pairalnStat
예제 #12
0
def ReadPfamDefFile(infile):#{{{
    """Read a tab-separated definition file into two lookup dicts.

    Lines with at least four tab-separated columns are used:
    column 0 = family ID, column 1 = clan ID, column 2 = clan definition,
    column 3 = family definition.

    Returns (dtPfam, dtClan).  dtPfam maps family ID -> family definition.
    dtClan maps clan ID -> clan definition, except that when the clan ID is
    the literal \\N the family ID and family definition are stored instead.
    Returns ({}, {}) when infile cannot be read.
    """
    reader = myfunc.ReadLineByBlock(infile)
    if reader.failure:
        return ({}, {})

    dtClan = {}
    dtPfam = {}
    for block in iter(reader.readlines, None):
        for line in block:
            cols = line.split("\t")
            if len(cols) < 4:
                continue
            pfamid = cols[0]
            pfamDef = cols[3]
            dtPfam[pfamid] = pfamDef
            if cols[1] == r"\N":
                # No clan assigned: the family stands in for its own clan.
                dtClan[pfamid] = pfamDef
            else:
                dtClan[cols[1]] = cols[2]
    reader.close()
    return (dtPfam, dtClan)
def CountUniquePairInvertedInfo(infile, pfamidDefDict, fpout):
    """Summarise one pair file as a single formatted line on fpout.

    Lines starting with "General" provide numInvPair, numAllPair and ratio
    (fields 1-3).  Lines starting with "Pair" provide two IDs (fields 1, 2),
    the N-terminal state of the first (field 3) and a TM count (field 5).
    The ID belonging to an 'i' first state goes into the first ID set and
    its partner into the second; otherwise the two are swapped.

    The family ID is the base name of infile up to the first "."; its
    definition is looked up in pfamidDefDict ("N/A" when missing).  A summary
    line is written only if at least one ID set is non-empty.

    Returns 1 when infile cannot be read; otherwise returns None.
    """
    firstGroup = set()
    secondGroup = set()
    numTMSet = set()
    numInvPair = 0
    numAllPair = 0
    ratio = 0.0
    reader = myfunc.ReadLineByBlock(infile)
    if reader.failure:
        return 1
    for block in iter(reader.readlines, None):
        for line in block:
            if line.startswith("General"):
                cols = line.split()
                numInvPair = int(cols[1])
                numAllPair = int(cols[2])
                ratio = float(cols[3])
            if line.startswith("Pair"):
                cols = line.split()
                id1, id2 = cols[1], cols[2]
                ntermState1 = cols[3]
                numTMSet.add(int(cols[5]))
                if ntermState1 != 'i':
                    id1, id2 = id2, id1
                firstGroup.add(id1)
                secondGroup.add(id2)
    reader.close()
    pfamid = os.path.basename(infile).split(".")[0]
    try:
        pfamdef = pfamidDefDict[pfamid]
    except KeyError:
        pfamdef = "N/A"
    if firstGroup or secondGroup:
        summary = (pfamid, pfamdef, len(firstGroup), len(secondGroup),
                   len(numTMSet), str(list(numTMSet)),
                   numInvPair, numAllPair, ratio)
        fpout.write("%-8s %20s %4d %4d %2d %8s   %5d %5d %6.3f\n"%summary)
예제 #14
0
def IDMap2SeqID(infile, fpout):
    """Print every field from the third onwards of each line, one per line.

    Lines are split on whitespace; empty lines and lines starting with "#"
    are skipped.  Lines with two or fewer fields produce no output.

    Returns 1 when infile cannot be read; otherwise returns None.
    """
    reader = myfunc.ReadLineByBlock(infile)
    if reader.failure:
        return 1
    for block in iter(reader.readlines, None):
        for line in block:
            if not line or line[0] == "#":
                continue
            # The slice is empty for lines with at most two fields.
            for seqid in line.split()[2:]:
                print >> fpout, seqid
    reader.close()
예제 #15
0
def ReadSeqPathMapDict(infile):#{{{
    """Read a two-column file into a dict mapping field 0 -> field 1.

    Lines are split on whitespace and used only when they have exactly two
    fields.  Empty lines and lines starting with "#" are skipped.  Later
    lines overwrite earlier ones with the same key.

    Returns an empty dict when infile cannot be read.  The reader is closed
    before returning (it was previously left open).
    """
    hdl = myfunc.ReadLineByBlock(infile)
    if hdl.failure:
        return {}
    dt = {}
    try:
        for lines in iter(hdl.readlines, None):
            for line in lines:
                if not line or line[0] == "#":
                    continue
                strs = line.split()
                if len(strs) == 2:
                    dt[strs[0]] = strs[1]
    finally:
        hdl.close()
    return dt
예제 #16
0
def ReadGOTerm(infile):  #{{{
    """Read a tab-separated file into a dict: column 0 -> column 1.

    Both columns are stripped of surrounding whitespace.  Empty lines, lines
    starting with "#" and lines with fewer than two tab-separated columns
    are ignored.  Later lines overwrite earlier ones with the same key.

    NOTE: returns the integer 1, not an empty dict, when infile cannot be
    read, so callers must check the result before indexing it.
    """
    reader = myfunc.ReadLineByBlock(infile)
    if reader.failure:
        return 1
    goTerms = {}
    while True:
        block = reader.readlines()
        if block is None:
            break
        for line in block:
            if line == "" or line[0] == "#":
                continue
            cols = line.split("\t")
            if len(cols) < 2:
                continue
            goTerms[cols[0].strip()] = cols[1].strip()
    reader.close()
    return goTerms
예제 #17
0
def ReadGOInfo(infile):  #{{{
    """Parse every data line with ScanfGOInfo() and collect the results.

    Empty lines and lines starting with "#" are skipped.  A parse result
    equal to {} is discarded; all others are appended in file order.

    NOTE: returns the integer 1, not an empty list, when infile cannot be
    read.
    """
    reader = myfunc.ReadLineByBlock(infile)
    if reader.failure:
        return 1
    records = []
    for block in iter(reader.readlines, None):
        for line in block:
            if line == "" or line[0] == "#":
                continue
            parsed = ScanfGOInfo(line)
            if parsed != {}:
                records.append(parsed)
    reader.close()

    return records
def ReadSignalPFile(infile):  #{{{
    """Index the lines of infile by the sequence ID found in each line.

    The key is myfunc.GetSeqIDFromAnnotation(line) and the value is the
    whole line.  Empty lines and lines starting with "#" are skipped.  Later
    lines overwrite earlier ones with the same ID.

    NOTE: returns the integer 1, not an empty dict, when infile cannot be
    read, so callers must check the result before indexing it.
    """
    reader = myfunc.ReadLineByBlock(infile)
    if reader.failure:
        return 1
    lineBySeqID = {}
    while True:
        block = reader.readlines()
        if block is None:
            break
        for line in block:
            if line != "" and line[0] != "#":
                lineBySeqID[myfunc.GetSeqIDFromAnnotation(line)] = line
    reader.close()
    return lineBySeqID
예제 #19
0
def ReadGOAnc(infile):
    """Index the lines of a ';'-separated file by their first column.

    The key is the text before the first ";" with surrounding whitespace
    removed; the value is the whole line.  Empty lines and lines starting
    with "#" are skipped.  Later lines overwrite earlier ones.

    NOTE: returns the integer 1, not an empty dict, when infile cannot be
    read.
    """
    reader = myfunc.ReadLineByBlock(infile)
    if reader.failure:
        return 1
    goAncDict = {}
    for block in iter(reader.readlines, None):
        for line in block:
            if line == "" or line[0] == "#":
                continue
            goid = line.split(";")[0].strip()
            goAncDict[goid] = line
    reader.close()

    return goAncDict
예제 #20
0
def ReadDupPairList(infile):#{{{
    """Return the (id1, id2) tuples of all pairs flagged 'y' in infile.

    A line contributes a pair when its second whitespace-separated field is
    'y' and its first field splits on "-" into exactly two parts.  Pairs are
    returned in file order; duplicates are not removed.

    Returns an empty list when infile cannot be read.
    """
    reader = myfunc.ReadLineByBlock(infile)
    if reader.failure:
        return []

    pairs = []
    for block in iter(reader.readlines, None):
        for line in block:
            fields = line.split()
            if len(fields) < 2 or fields[1] != 'y':
                continue
            ids = fields[0].split("-")
            if len(ids) == 2:
                pairs.append(tuple(ids))
    reader.close()
    return pairs
예제 #21
0
def ReadSeqLengthDict(infile):#{{{
    """Read a two-column file into a dict mapping field 0 -> int(field 1).

    Lines are split on whitespace and used only when they have exactly two
    fields.  Empty lines and lines starting with "#" are skipped.  A
    non-integer second field raises ValueError.

    Returns an empty dict when infile cannot be read.
    """
    reader = myfunc.ReadLineByBlock(infile)
    if reader.failure:
        return {}
    lengthBySeqID = {}
    while True:
        block = reader.readlines()
        if block is None:
            break
        for line in block:
            if not line or line[0] == "#":
                continue
            fields = line.split()
            if len(fields) != 2:
                continue
            lengthBySeqID[fields[0]] = int(fields[1])
    reader.close()
    return lengthBySeqID
예제 #22
0
def ReadIDWithAnnoInfo(infile):#{{{
    """Read a two-column tab-separated file into a dict: ID -> annotation.

    Only lines with exactly two tab-separated columns are used; the second
    column is stripped of surrounding whitespace, the first is used as is.
    Empty lines and lines starting with "#" are skipped.

    Returns an empty dict when infile cannot be read.
    """
    reader = myfunc.ReadLineByBlock(infile)
    if reader.failure:
        return {}
    annoBySeqID = {}
    for block in iter(reader.readlines, None):
        for line in block:
            if not line or line[0] == "#":
                continue
            cols = line.split("\t")
            if len(cols) != 2:
                continue
            annoBySeqID[cols[0]] = cols[1].strip()
    reader.close()
    return annoBySeqID
예제 #23
0
def SelectLineByID(infile, idListSet, fpout):#{{{
    """Copy to fpout the lines of infile whose ID is a member of idListSet.

    Empty lines and lines starting with "#" are always copied.  For every
    other line the ID is extracted according to g_params['method_getid']:
        0  first whitespace-separated word
        1  first word, cut at the first ";"
        2  myfunc.GetSeqIDFromAnnotation(line)
        3  field(s) selected by g_params['sel_field_list'] (1-based).  With
           one entry that field is the ID; with two or more the ID is the
           tuple of those fields; with none, index 0 is used, which selects
           the last field of the line.
    For any other method value the value is printed to stdout and the line
    is skipped.  A line from which the ID cannot be extracted (IndexError)
    is reported on stderr and skipped.  Previously both cases fell through
    to the membership test with an unbound or stale ID.

    Returns 1 when infile cannot be read, otherwise 0.
    """
    hdl = myfunc.ReadLineByBlock(infile)
    if hdl.failure:
        return 1
    try:
        method_getid = g_params['method_getid']
        sel_field_list = g_params['sel_field_list']
        sel_field = 0
        if method_getid == 3 and len(sel_field_list) == 1:
            sel_field = sel_field_list[0]

        for lines in iter(hdl.readlines, None):
            for line in lines:
                if not line or line[0] == "#":
                    fpout.write("%s\n"%line)
                    continue
                try:
                    if method_getid == 0:
                        idd = line.split(None, 1)[0]
                    elif method_getid == 1:
                        idd = (line.split(None, 1)[0]).partition(";")[0]
                    elif method_getid == 2:
                        idd = myfunc.GetSeqIDFromAnnotation(line)
                    elif method_getid == 3:
                        if len(sel_field_list) < 2:
                            idd = line.split()[sel_field-1]
                        else:
                            strs = line.split()
                            idd = tuple([strs[ff-1] for ff in sel_field_list])
                    else:
                        sys.stdout.write("%s\n" % (method_getid,))
                        continue
                except IndexError:
                    # Same text the print statement produced (message plus
                    # the statement's own newline).
                    sys.stderr.write("Bad line \"%s\"\n"%line + "\n")
                    continue
                if idd in idListSet:
                    fpout.write("%s\n"%line)
    finally:
        hdl.close()
    return 0
예제 #24
0
def ReadMapFile(infile):#{{{
    """Read a tab-separated file into a dict: column 0 -> column 1.

    Columns are stored exactly as split (no stripping).  Empty lines, lines
    starting with "#" and lines without a second column are ignored.

    Returns an empty dict when infile cannot be read.
    """
    mapping = {}
    reader = myfunc.ReadLineByBlock(infile)
    if reader.failure:
        return mapping

    for block in iter(reader.readlines, None):
        for line in block:
            if not line or line[0] == "#":
                continue
            cols = line.split("\t")
            if len(cols) >= 2:
                mapping[cols[0]] = cols[1]
    reader.close()
    return mapping
예제 #25
0
def ReadRLTYInfo(infile):#{{{
    """Read a two-column file into a dict mapping field 0 -> float(field 1).

    Lines are split on whitespace and used only when they have exactly two
    fields; a line whose second field is not a valid float is skipped.

    Returns an empty dict, after reporting on stderr, when infile cannot be
    read.
    """
    reader = myfunc.ReadLineByBlock(infile)
    if reader.failure:
        msg = "Failed to read file %s in function %s"
        print(msg%(infile, sys._getframe().f_code.co_name), file=sys.stderr)
        return {}
    rltyDict = {}
    while True:
        block = reader.readlines()
        if block is None:
            break
        for line in block:
            fields = line.split()
            if len(fields) != 2:
                continue
            try:
                rltyDict[fields[0]] = float(fields[1])
            except (ValueError, TypeError, KeyError):
                pass
    reader.close()
    return rltyDict
def ReadPairInfo(infile):  #{{{
    """
    Group the pairs of a pair-list file by family ID.

    Format of the pairlistfile
    #seqid1 seqid2 seqidt   famid         pfamdef numSeqCls1 numSeqCls2 numSeq nTM1 nTM2  isSP isPDB

    Output:
        dict keyed by family ID (column 3).  Each value holds the keys
        'pfamdef', 'numSeqCls1', 'numSeqCls2', 'numseq', 'nTM_Group1',
        'nTM_Group2' (taken from the first line seen for that family) and
        'pairlist', a list of (seqid1, seqid2, seqidt, isSP, isPDB) tuples.

    Lines with fewer than 12 fields, empty lines and lines starting with "#"
    are skipped.  Returns an empty dict when infile cannot be read.
    """
    reader = myfunc.ReadLineByBlock(infile)
    if reader.failure:
        return {}
    families = {}
    while True:
        block = reader.readlines()
        if block is None:
            break
        for line in block:
            if not line or line[0] == "#":
                continue
            cols = line.split()
            if len(cols) < 12:
                continue
            pfamid = cols[3]
            if pfamid not in families:
                families[pfamid] = {
                    'pfamdef': cols[4],
                    'numSeqCls1': int(cols[5]),
                    'numSeqCls2': int(cols[6]),
                    'numseq': int(cols[7]),
                    'nTM_Group1': int(cols[8]),
                    'nTM_Group2': int(cols[9]),
                    'pairlist': [],
                }
            pair = (cols[0], cols[1], float(cols[2]),
                    int(cols[10]), int(cols[11]))
            families[pfamid]['pairlist'].append(pair)
    reader.close()
    return families
예제 #27
0
def ReadSignalPDict(infile):#{{{
    """Read a signal peptide file into a dict: field 0 -> int(field 1).

    Format of the file (per the original note): SeqID location Y
    Lines are split on whitespace and need at least two fields; a line whose
    second field is not an integer is skipped, as are empty lines and lines
    starting with "#".

    Returns an empty dict when infile cannot be read.
    """
    reader = myfunc.ReadLineByBlock(infile)
    if reader.failure:
        return {}

    signalpDict = {}
    for block in iter(reader.readlines, None):
        for line in block:
            if not line or line[0] == "#":
                continue
            fields = line.split()
            if len(fields) < 2:
                continue
            try:
                signalpDict[fields[0]] = int(fields[1])
            except ValueError:
                pass
    reader.close()
    return signalpDict
예제 #28
0
def FilterUniprotIDMap(infile, fpout):
    """Copy a tab-separated ID map to fpout, dropping self-referring lines.

    For each line, when column 2 contains column 0 as a substring, the ID
    returned by GetUniprotIDFromLongName(column 2) is examined:
      - equal to column 0: the line is dropped;
      - a different non-empty ID: "Error" plus the line goes to stderr;
      - empty: "Null" plus the line goes to stderr.
    Every line that is not dropped is written to fpout.  Lines with fewer
    than three columns are reported on stderr as "IndexError" and skipped.

    Returns 1 when infile cannot be read; otherwise returns None.  The
    failure test now uses the reader's `failure` flag like the other readers
    in this file, and the reader is closed when done.
    """
    hdl = myfunc.ReadLineByBlock(infile)
    if not hdl or hdl.failure:
        return 1
    try:
        for lines in iter(hdl.readlines, None):
            for line in lines:
                isIgnore = False
                strs = line.split("\t")
                try:
                    if strs[2].find(strs[0]) != -1:
                        uniprotid = GetUniprotIDFromLongName(strs[2])
                        if uniprotid == "":
                            sys.stderr.write("Null\t%s\n" % line)
                        elif uniprotid == strs[0]:
                            isIgnore = True
                        else:
                            sys.stderr.write("Error\t%s\n" % line)
                    if not isIgnore:
                        fpout.write("%s\n" % line)
                except IndexError:
                    sys.stderr.write("IndexError\t%s\n" % line)
    finally:
        hdl.close()
def main(g_params):  #{{{
    """Command-line entry point: select per-sequence lines by taxonomy.

    Options parsed from sys.argv:
        -o / --o / -outfile FILE   output file (stdout is passed to
                                   myfunc.myopen as the default)
        -gram+ / --gram+ FILE      file read with ReadSignalPFile for Gram+
        -gram- / --gram- FILE      file read with ReadSignalPFile for Gram-
        -euk / --euk FILE          file read with ReadSignalPFile for Euk
        -q / --q                   sets g_params['isQuiet']
        -debug / --debug           sets g_params['isDEBUG']
        -h / --help                print help and return 1
        --                         the next argument is the input file
    Any other argument not starting with "-" is the input file.

    The input file is read as tab-separated lines with exactly three
    columns; column 0 is the sequence ID and column 2 the taxonomy label
    ("Gram+", "Gram-", "Euk", or the same with a lower-case first letter).
    The line stored for that ID in the matching file is written to the
    output; IDs that are missing or have another label produce no output.

    Returns 1 on a usage error or when a file check fails; otherwise falls
    through and returns None.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outfile = ""
    infile = ""
    gramPositiveFile = ""
    gramNegativeFile = ""
    eukFile = ""

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        arg = argv[i]
        if isNonOptionArg:
            infile = arg
            isNonOptionArg = False
            i += 1
        elif arg == "--":
            isNonOptionArg = True
            i += 1
        elif arg.startswith("-"):
            # startswith() also copes with an empty-string argument, which
            # argv[i][0] did not.
            if arg in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif arg in ["-o", "--o", "-outfile"]:
                (outfile, i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-gram+", "--gram+"]:
                (gramPositiveFile, i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-gram-", "--gram-"]:
                (gramNegativeFile, i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-euk", "--euk"]:
                (eukFile, i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-q", "--q"]:
                g_params['isQuiet'] = True
                i += 1
            elif arg in ["-debug", "--debug"]:
                g_params['isDEBUG'] = True
                i += 1
            else:
                sys.stderr.write("Error! Wrong argument: %s\n" % arg)
                return 1
        else:
            infile = arg
            i += 1

    if myfunc.checkfile(infile, "taxidwithtaxoFile") != 0:
        return 1
    if myfunc.checkfile(gramPositiveFile, "gramPositiveFile") != 0:
        return 1
    if myfunc.checkfile(gramNegativeFile, "gramNegativeFile") != 0:
        return 1
    if myfunc.checkfile(eukFile, "eukFile") != 0:
        return 1

    gramPositiveDict = ReadSignalPFile(gramPositiveFile)
    gramNegativeDict = ReadSignalPFile(gramNegativeFile)
    eukDict = ReadSignalPFile(eukFile)

    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)

    hdl = myfunc.ReadLineByBlock(infile)
    if hdl.failure:
        # Release the output handle opened just above before bailing out.
        myfunc.myclose(fpout)
        return 1
    lines = hdl.readlines()
    while lines is not None:
        for line in lines:
            strs = line.split("\t")
            if len(strs) != 3:
                continue
            seqid = strs[0].strip()
            taxo = strs[2].strip()
            info = ""
            try:
                if taxo == "Gram+" or taxo == "gram+":
                    info = gramPositiveDict[seqid]
                elif taxo == "Gram-" or taxo == "gram-":
                    info = gramNegativeDict[seqid]
                elif taxo == "Euk" or taxo == "euk":
                    info = eukDict[seqid]

                if g_params['isDEBUG']:
                    sys.stderr.write("%s: %s\n" % (seqid, taxo))
            except KeyError:
                info = ""
            if info != "":
                fpout.write("%s\n" % info)

        lines = hdl.readlines()
    hdl.close()
    myfunc.myclose(fpout)
예제 #30
0
def RunHHSearchPairwise(tableinfoFile,  #{{{
        hhprofilepathList, hhprofilepathMapDictList,
        hhsearchpathList, hhsearchpathMapDictList,
        topoDict, outpath, dupfile):
    """Run hhsearch for every sequence pair listed in tableinfoFile.

    Each data line of tableinfoFile supplies two sequence IDs (fields 0 and
    1).  The result file is "<outpath>/<id1>_<id2>.hhr" in the order given
    on the line, or a prebuilt file located with GetProfileFileName() when
    g_params['isUsePreBuildHHSearchResult'] is set and that file exists.

    The two IDs are then ordered by topology length taken from topoDict (a
    missing ID counts as length 0): the shorter one is the query (.a3m
    profile), the longer one the template (.hhm profile).  hhsearch is run
    when the result file is missing or g_params['isForceOverWrite'] is set.
    Its output goes to "<hhrfile>.tmp" and is moved into place afterwards.

    When dupfile is not "", one line per pair with a non-empty
    CheckDuplication() result is written to it.

    Returns 1 when tableinfoFile cannot be read, otherwise 0.
    """
    fpoutDup = None
    if dupfile != "":
        fpoutDup = myfunc.myopen(dupfile, sys.stdout, "w", False)

    hdl = myfunc.ReadLineByBlock(tableinfoFile)
    if hdl.failure:
        # The report handle may already be open; do not leak it.
        if fpoutDup is not None:
            myfunc.myclose(fpoutDup)
        return 1
    cnt = 0  # number of pairs processed so far, passed to CheckDuplication
    lines = hdl.readlines()
    while lines is not None:
        for line in lines:
            if not line or line[0] == "#":
                continue
            strs = line.split()
            if len(strs) < 2:
                sys.stderr.write("Bad record line \"%s\""%(line) + "\n")
                continue
            listedID1 = strs[0]
            listedID2 = strs[1]

            # The result file name always follows the order in the table.
            hhrfile = "%s%s%s_%s.hhr"%(outpath, os.sep, listedID1, listedID2)
            if g_params['isUsePreBuildHHSearchResult']:
                keystr = "%s_%s"%(listedID1, listedID2)
                tmp_hhrfile = GetProfileFileName(hhsearchpathList,
                        hhsearchpathMapDictList, keystr, ".hhr")
                if os.path.exists(tmp_hhrfile):
                    hhrfile = tmp_hhrfile
                else:
                    sys.stderr.write("hhrfile %s does not exist in"\
                            " the prebuilt path"%(hhrfile) + "\n")

            # Order the pair shorter - longer by topology length.  sorted()
            # is stable, so equal lengths keep the order from the table.
            candidates = []
            for sid in (listedID1, listedID2):
                try:
                    candidates.append((sid, topoDict[sid]))
                except KeyError:
                    candidates.append((sid, ""))
            candidates = sorted(candidates, key=lambda x: len(x[1]))
            (seqid1, topo1) = candidates[0]  # shorter sequence: query
            (seqid2, topo2) = candidates[1]  # longer sequence: template

            numTM1 = len(myfunc.GetTMPosition(topo1))
            numTM2 = len(myfunc.GetTMPosition(topo2))

            if not os.path.exists(hhrfile) or g_params['isForceOverWrite']:
                a3mfile = GetProfileFileName(hhprofilepathList, #query
                        hhprofilepathMapDictList, seqid1, ".a3m")
                hhmfile = GetProfileFileName(hhprofilepathList,  #template
                        hhprofilepathMapDictList, seqid2, ".hhm")
                if a3mfile == "" or not os.path.exists(a3mfile):
                    sys.stderr.write(
                        "a3mfile not found for %s. Ignore." %(seqid1) + "\n")
                elif hhmfile == "" or not os.path.exists(hhmfile):
                    sys.stderr.write(
                        "hhmfile not found for %s. Ignore." %(seqid2) + "\n")
                else:
                    # Write to a temporary name first so an interrupted run
                    # does not leave a partial file under the final name.
                    # NOTE(review): both commands are built by string
                    # formatting and run through the shell; paths with
                    # spaces or shell metacharacters will break them.
                    tmp_hhrfile = "%s.tmp"%(hhrfile)
                    cmd = "hhsearch -i %s -d %s -o %s -v 0 -nocons -nopred -nodssp" % (
                            a3mfile, hhmfile, tmp_hhrfile)
                    os.system(cmd)
                    if os.path.exists(tmp_hhrfile):
                        os.system("/bin/mv -f %s %s"%(tmp_hhrfile, hhrfile))
                        sys.stdout.write("%s output\n" % (hhrfile,))
            if fpoutDup and os.path.exists(hhrfile):
                hitinfo = CheckDuplication(hhrfile, seqid1, seqid2, topoDict, cnt)
                if hitinfo != {}:
                    fpoutDup.write("%s-%s %s %4d %4d %4d %4d" %(
                        seqid1, seqid2, hitinfo['isDup'],
                        len(topo1), len(topo2), numTM1, numTM2))
                    if 'hit' in hitinfo:
                        for hit in hitinfo['hit']:
                            ss_hit = "%d-%d(nTM=%d) %d-%d(nTM=%d)"%(
                                    hit['posQuery'][0], hit['posQuery'][1], hit['numTM1'],
                                    hit['posTemplate'][0], hit['posTemplate'][1], hit['numTM2'])
                            fpoutDup.write(" | %35s"%(ss_hit))
                    fpoutDup.write("\n")
            cnt += 1

        lines = hdl.readlines()
    hdl.close()
    myfunc.myclose(fpoutDup)
    return 0