def FilterSignalPeptide(topofile, sigpepDict, outfile, isDeleteSeqWithSignalPeptide):
    """Filter signal-peptide regions out of a topology FASTA file.

    For each record in topofile whose seqid appears in sigpepDict, either drop
    the sequence entirely (isDeleteSeqWithSignalPeptide) or rewrite its topology
    with lcmp.FilterSignalPeptideInTopology.  Only topologies that still contain
    at least one TM segment are written to outfile.

    Returns 0 on success, 1 if the input file cannot be read.
    """
    hdl = myfunc.ReadFastaByBlock(topofile)
    if hdl.failure:
        return 1
    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    batch = hdl.readseq()
    while batch != None:
        for record in batch:
            # -1 means "no signal peptide annotated for this sequence"
            sp_pos = sigpepDict.get(record.seqid, -1)
            if sp_pos == -1:
                newtopo = record.seq
            elif isDeleteSeqWithSignalPeptide:
                newtopo = ""
            else:
                newtopo = lcmp.FilterSignalPeptideInTopology(record.seq, sp_pos)
            # keep only topologies that still have at least one TM helix
            if newtopo != "" and myfunc.CountTM(newtopo) > 0:
                fpout.write(">%s\n"%(record.description))
                fpout.write("%s\n"%(newtopo))
        batch = hdl.readseq()
    hdl.close()
    myfunc.myclose(fpout)
    return 0
def MatchMSATopo_using_topofile( msafile, topofile, isIgnoreBadseq, #{{{
        method_match, outfile):
    """Map topologies from a topology file onto the sequences of an MSA.

    Each MSA record is matched to its topology (looked up by seqid) with
    MatchSeqToTopo.  Records whose match yields "BADSEQ" are skipped when
    isIgnoreBadseq is set.  Returns 0 on success, 1 on read failure.
    """
    topoDict = GetTopoDict(topofile)
    hdl = myfunc.ReadFastaByBlock(msafile)
    if hdl.failure:
        return 1
    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    batch = hdl.readseq()
    while batch != None:
        for record in batch:
            if record.seqid in topoDict:
                topo = topoDict[record.seqid]
            else:
                # no topology available: match against the empty topology
                print("topo not found for ID %s" % (record.seqid), file=sys.stderr)
                topo = ""
            matchedtopo = MatchSeqToTopo(record.seq, topo, method_match)
            if matchedtopo == "BADSEQ" and isIgnoreBadseq:
                continue
            print(">%s" % (record.description), file=fpout)
            print("%s" % (matchedtopo), file=fpout)
        batch = hdl.readseq()
    myfunc.myclose(fpout)
    hdl.close()
    return 0
def MFA2MPA_obsolete(infile, fpout):#{{{
    # Obsolete converter from multi-FASTA alignment (MFA) to a per-residue
    # positional format: for every non-gap residue it writes "pos res ", where
    # the first residue uses its absolute column index and subsequent residues
    # use the offset from that first residue.
    # GAP is a module-level gap character constant (presumably '-' — defined
    # elsewhere in the file).
    # Returns 0 on success, 1 if the input cannot be read.
    hdl = myfunc.ReadFastaByBlock(infile, 0, 1)
    if hdl.failure:
        return 1
    lengthList = []  # alignment lengths, used for the consistency warning below
    recordList = hdl.readseq()
    while recordList != None:
        for rd in recordList:
            print >> fpout, ">%s"%(rd.description)
            print >> fpout, "Length: %d"%(len(rd.seq))
            seq = rd.seq
            firstrespos = -1  # column index of the first non-gap residue
            for i in xrange(len(seq)):
                if seq[i] != GAP:
                    if firstrespos == -1:
                        # first residue: record anchor and emit absolute index
                        firstrespos = i
                        fpout.write("%d %s "%(i, seq[i]))
                    else:
                        # later residues: emit offset relative to the anchor
                        fpout.write("%d %s "%(i-firstrespos, seq[i]))
            fpout.write("\n")
            lengthList.append(len(seq))
        recordList = hdl.readseq()
    hdl.close()
    # a proper MSA has equal-length rows; warn (but do not fail) otherwise
    if len(set(lengthList)) > 1:
        msg = "Warning! Length of the MSA file %s are not equal!"
        print >> sys.stderr, msg%(infile)
    return 0
def MFA2MPA(infile, fpout):#{{{
    # Convert a multi-FASTA alignment (MFA) to MPA format: runs of gaps are
    # replaced by their length (an integer), while residue stretches are kept
    # verbatim, e.g. "AAA--BB" -> "AAA 2 BB".
    # GAP is a module-level gap character constant (presumably '-' — defined
    # elsewhere in the file); myfunc.GetSegPos returns [(start, end), ...]
    # half-open spans of gap runs.
    # Returns 0 on success, 1 if the input cannot be read.
    hdl = myfunc.ReadFastaByBlock(infile, 0, 1)
    if hdl.failure:
        return 1
    lengthList = []  # alignment lengths, used for the consistency warning below
    recordList = hdl.readseq()
    while recordList != None:
        for rd in recordList:
            print >> fpout, ">%s"%(rd.description)
            seq = rd.seq
            gapPosList = myfunc.GetSegPos(seq, GAP)
            num = len(gapPosList)
            length = len(seq)
            if num < 1 :
                # no gaps at all: emit the sequence unchanged
                fpout.write("%s\n"%(seq))
            else:
                # leading residues before the first gap run, if any
                if gapPosList[0][0] > 0:
                    fpout.write("%s "%(seq[0:gapPosList[0][0]]))
                # alternate gap-run length / following residue stretch
                for i in xrange(num-1):
                    fpout.write("%d "%(gapPosList[i][1] - gapPosList[i][0]))
                    fpout.write("%s "%(seq[gapPosList[i][1]:gapPosList[i+1][0]]))
                # last gap run, then any trailing residues
                fpout.write("%d "%(gapPosList[num-1][1] - gapPosList[num-1][0]))
                if gapPosList[num-1][1] < length:
                    fpout.write("%s"%(seq[gapPosList[num-1][1]:length]))
                fpout.write("\n")
            lengthList.append(length)
        recordList = hdl.readseq()
    hdl.close()
    # a proper MSA has equal-length rows; warn (but do not fail) otherwise
    if len(set(lengthList)) > 1:
        msg = "Warning! Length of the MSA file %s are not equal!"
        print >> sys.stderr, msg%(infile)
    return 0
def IsUniqueSeq(infile, method, isUseMD5): #{{{
    """Check whether the records of a FASTA file are unique.

    method: "id" compares sequence IDs, "seq" compares sequence contents
            (optionally hashed with MD5 when isUseMD5 is set, to save memory).

    Return value
        yes    :  1
        no     :  0
        failed : -1
    """
    import hashlib  # local import; replaces the deprecated md5 module
    hdl = myfunc.ReadFastaByBlock(infile)
    if hdl.failure:
        return -1
    myset = set([])
    recordList = hdl.readseq()
    while recordList != None:
        for rd in recordList:
            if method == "id":
                key = rd.seqid
            elif method == "seq":
                if isUseMD5:
                    # hashlib.md5 produces the same digest as md5.new
                    key = hashlib.md5(rd.seq).digest()
                else:
                    key = rd.seq
            if key in myset: # duplicated
                hdl.close()  # fix: close the handle before the early return
                return 0 # not unique
            myset.add(key)
        recordList = hdl.readseq()
    hdl.close()
    return 1 #unique
def ExcludeConsensus(infile, g_outpath):
    """Copy a FASTA file, dropping records whose seqid contains 'consensus'.

    The output is written to <outpath>/<rootname>.nocons.fasta, where outpath
    defaults to the directory of infile when g_outpath is empty.

    Returns 0 on success, 1 on failure.
    """
    if g_outpath == "":
        outpath = os.path.dirname(infile)
        if outpath == "":
            outpath = "."
    else:
        outpath = g_outpath
    rootname = os.path.basename(os.path.splitext(infile)[0])
    outfile = "%s%s%s.nocons.fasta" % (outpath, os.sep, rootname)
    try:
        fpout = open(outfile, "w")
        hdl = myfunc.ReadFastaByBlock(infile)
        if hdl.failure:
            fpout.close()  # fix: do not leak the output handle on read failure
            return 1
        recordList = hdl.readseq()
        while recordList != None:
            for record in recordList:
                # keep everything that is not a consensus record
                if record.seqid.lower().find("consensus") == -1:
                    fpout.write(">%s\n%s\n" % (record.description, record.seq))
            recordList = hdl.readseq()
        fpout.close()
        hdl.close()
        return 0  # fix: explicit success status (was implicit None)
    except IOError:
        # fix: original `print sys.stderr, ...` was missing `>>` and printed
        # the stderr file object to stdout instead of the message to stderr
        sys.stderr.write("Failed to write to file %s\n" % (outfile))
        return 1
def IsUniqueSeq(infile, method):#{{{
    """Check whether the records of a FASTA file are unique.

    method: "id" compares sequence IDs, "seq" compares sequence contents.

    Returns 1 if unique, 0 if a duplicate exists, -1 on read failure.
    """
    hdl = myfunc.ReadFastaByBlock(infile)
    if hdl.failure:
        return -1
    isunique = 1 #init value
    myset = set([])
    recordList = hdl.readseq()
    while recordList != None:
        for rd in recordList:
            if method == "id":
                if rd.seqid in myset:
                    isunique = 0
                    break
                myset.add(rd.seqid)
            elif method == "seq":
                if rd.seq in myset:
                    isunique = 0
                    break
                myset.add(rd.seq)
        if isunique == 0:
            # fix: the inner break only left the for-loop; without this the
            # remainder of the file was still read although the answer was known
            break
        recordList = hdl.readseq()
    hdl.close()
    return isunique
def SplitPfamFasta(infile, outpath):
    """Split a FASTA file grouped by Pfam family into one file per family.

    Records are assumed to be sorted so that all members of a family are
    contiguous; each run of records sharing a Pfam ID is exported with
    ExportFastaForPfamID.  Returns 0 on success, 1 on read failure.
    """
    hdl = myfunc.ReadFastaByBlock(infile, 0, 0)
    if hdl.failure:
        return 1
    currentPfamID = ""
    currentGroup = []
    cnt = 0
    batch = hdl.readseq()
    while batch != None:
        for record in batch:
            pfamid = ExtractPfamIDFromDescription(record.description)
            if currentPfamID == "":
                currentPfamID = pfamid
            if pfamid != currentPfamID:
                # family boundary reached: flush the accumulated group
                ExportFastaForPfamID(currentPfamID, currentGroup, outpath, cnt)
                cnt += 1
                currentPfamID = pfamid
                currentGroup = []
            currentGroup.append(record)
        batch = hdl.readseq()
    # flush the trailing group, if any
    if len(currentGroup) > 0:
        ExportFastaForPfamID(currentPfamID, currentGroup, outpath, cnt)
        cnt += 1
    hdl.close()
    return 0
def ReWriteFasta(infile, outfile): #{{{
    """Re-emit a FASTA file with exactly one header line and one sequence
    line per record.  Returns 0 on success, 1 on read failure."""
    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    hdl = myfunc.ReadFastaByBlock(infile, 0, 1)
    if hdl.failure:
        return 1
    batch = hdl.readseq()
    while batch != None:
        for record in batch:
            fpout.write(">%s\n%s\n" % (record.description, record.seq))
        batch = hdl.readseq()
    hdl.close()
    myfunc.myclose(fpout)
    return 0
def IsHasNonStandardAminoAcid(infile): #{{{
    """Return 1 if any sequence in the FASTA file contains a residue outside
    STD1CharAA_alphabet (a module-level string of standard one-letter amino
    acid codes), 0 otherwise, -1 on read failure."""
    hdl = myfunc.ReadFastaByBlock(infile, 0, 0)
    if hdl.failure:
        return -1
    recordList = hdl.readseq()
    while recordList != None:
        for rd in recordList:
            for aa in rd.seq:
                if STD1CharAA_alphabet.find(aa) == -1:
                    # fix: the original `break` only left the character loop
                    # and kept scanning the rest of the file; return as soon
                    # as the answer is known, closing the handle first
                    hdl.close()
                    return 1
        recordList = hdl.readseq()
    hdl.close()
    return 0
def CleanAASeq(infile, isOverWrite, fpout): #{{{
    # Replace non-standard amino acids in a FASTA file.
    # isOverWrite: rewrite infile in place (via a temporary file); otherwise
    # the cleaned records are written to fpout.
    # Returns 0 when nothing needed cleaning, -1 on failure; the in-place
    # rewrite path returns None implicitly.
    isHasNonStandardAminoAcid = IsHasNonStandardAminoAcid(infile)
    if isHasNonStandardAminoAcid == 0:
        # file is already clean
        if isOverWrite:
            msg = "seqfile %s is already cleaned. Ignore"
            print >> sys.stderr, msg % (infile)
        else:
            # plain block-wise copy of infile to fpout
            fpin = open(infile, "r")
            BLOCK_S = 10000
            buff = fpin.read(BLOCK_S)
            while buff:
                fpout.write(buff)
                buff = fpin.read(BLOCK_S)
            fpin.close()
        return 0
    elif isHasNonStandardAminoAcid == -1:
        return -1
    # choose the destination: caller-supplied stream, or a temp file that will
    # later replace infile when overwriting in place
    fpout_local = None
    if not isOverWrite:
        fpout_local = fpout
    else:
        try:
            fpout_local = tempfile.NamedTemporaryFile(delete=False)
        except IOError:
            msg = "Failed to write to temporary file for running seqfile %s"
            print >> sys.stderr, msg % (infile)
            return -1
    hdl = myfunc.ReadFastaByBlock(infile, 0, 0)
    if hdl.failure:
        return -1
    recordList = hdl.readseq()
    while recordList != None:
        for rd in recordList:
            seq = ReplaceNonStandardAminoAcid(rd.seq)
            fpout_local.write(">%s\n" % (rd.description))
            fpout_local.write("%s\n" % (seq))
        recordList = hdl.readseq()
    hdl.close()
    if isOverWrite:
        # atomically-ish replace the original file with the cleaned temp file
        # NOTE(review): shelling out to mv/chmod is fragile (unquoted paths);
        # shutil.move + os.chmod would be safer — left unchanged here
        fpout_local.close()
        #print "tmpfile=",fpout_local.name
        os.system("/bin/mv -f %s %s" % (fpout_local.name, infile))
        os.system("chmod 644 %s" % (infile))
        if not g_params['isQuiet']:
            msg = "seqfile %s cleaned"
            print msg % (infile)
def MatchMSATopo_using_topodb( msafile, topodb, isIgnoreBadseq, #{{{
        method_match, outfile):
    """Map topologies from a topology database onto the sequences of an MSA.

    Each MSA record's topology is fetched from the MyDB database by seqid and
    matched with MatchSeqToTopo.  Records whose match yields "BADSEQ" are
    skipped when isIgnoreBadseq is set.  Returns 0 on success, 1 on failure.
    """
    hdl_topo = myfunc.MyDB(topodb)
    if hdl_topo.failure:
        return 1
    hdl = myfunc.ReadFastaByBlock(msafile)
    if hdl.failure:
        return 1
    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    batch = hdl.readseq()
    while batch != None:
        for record in batch:
            topo = ""
            topowithanno = hdl_topo.GetRecord(record.seqid)
            if topowithanno != None:
                (topoid, topoanno, topo) = myfunc.ExtractFromSeqWithAnno(topowithanno)
            else:
                # no topology in the DB: match against the empty topology
                print("topo not found for ID %s" % (record.seqid), file=sys.stderr)
            matchedtopo = MatchSeqToTopo(record.seq, topo, method_match)
            if matchedtopo == "BADSEQ" and isIgnoreBadseq:
                continue
            print(">%s" % (record.description), file=fpout)
            print("%s" % (matchedtopo), file=fpout)
        batch = hdl.readseq()
    myfunc.myclose(fpout)
    hdl.close()
    hdl_topo.close()
    return 0
def RemoveDupSeq(infile, g_outpath, method, isUseMD5): #{{{
    """Write a copy of a FASTA file with duplicated records removed.

    method: "id" deduplicates by sequence ID, "seq" by sequence content
            (optionally MD5-hashed when isUseMD5 is set, to save memory).
    The output file is <outpath>/<rootname>, where outpath defaults to the
    directory of infile when g_outpath is empty.

    Returns 0 on success, 1 if the output cannot be opened, -1 on read failure.
    """
    import hashlib  # local import; replaces the deprecated md5 module
    if g_outpath == "":
        outpath = myfunc.my_dirname(infile)
    else:
        outpath = g_outpath
    rootname = os.path.basename(os.path.splitext(infile)[0])
    outfile = "%s%s%s" % (outpath, os.sep, rootname)
    fpout = myfunc.myopen(outfile, None, "w", False)
    if fpout == None:
        return 1
    hdl = myfunc.ReadFastaByBlock(infile)
    if hdl.failure:
        myfunc.myclose(fpout)  # fix: do not leak the output handle on failure
        return -1
    myset = set([])
    recordList = hdl.readseq()
    while recordList != None:
        for rd in recordList:
            if method == "id":
                key = rd.seqid
            elif method == "seq":
                if isUseMD5:
                    # hashlib.md5 produces the same digest as md5.new
                    key = hashlib.md5(rd.seq).digest()
                else:
                    key = rd.seq
            # write only the first occurrence of each key
            if not key in myset:
                myset.add(key)
                fpout.write(">%s\n%s\n" % (rd.description, rd.seq))
        recordList = hdl.readseq()
    hdl.close()
    myfunc.myclose(fpout)
    return 0
def RunJob(infile, outpath, tmpdir, email, jobid, g_params):#{{{
    # Run a whole-file TOPCONS prediction job: build an id/description map for
    # the query sequences, run the external workflow script once on the whole
    # input, copy the results from tmpdir to outpath, write start/finish tag
    # files, and zip the result folder.
    # Errors are accumulated in g_params['runjob_err'] / ['runjob_log'].
    # NOTE(review): runscript, blastdir, blastdb and WriteTextResultFile are
    # module-level names defined elsewhere in the file.
    rootname = os.path.basename(os.path.splitext(infile)[0])
    starttagfile = "%s/runjob.start"%(outpath)
    runjob_errfile = "%s/runjob.err"%(outpath)
    runjob_logfile = "%s/runjob.log"%(outpath)
    finishtagfile = "%s/runjob.finish"%(outpath)
    rmsg = ""
    resultpathname = jobid
    outpath_result = "%s/%s"%(outpath, resultpathname)
    tarball = "%s.tar.gz"%(resultpathname)
    zipfile = "%s.zip"%(resultpathname)
    tarball_fullpath = "%s.tar.gz"%(outpath_result)
    zipfile_fullpath = "%s.zip"%(outpath_result)
    outfile = "%s/%s/Topcons/topcons.top"%(outpath_result, "seq_%d"%(0))
    resultfile_text = "%s/%s"%(outpath_result, "query.result.txt")
    tmp_outpath_result = "%s/%s"%(tmpdir, resultpathname)
    isOK = True
    try:
        os.makedirs(tmp_outpath_result)
        isOK = True
    except OSError:
        msg = "Failed to create folder %s"%(tmp_outpath_result)
        myfunc.WriteFile(msg+"\n", runjob_errfile, "a")
        isOK = False
    print "isOK =", isOK
    if isOK:
        # build the seqid -> index map for the query sequences
        tmp_mapfile = "%s/seqid_index_map.txt"%(tmp_outpath_result)
        maplist = []
        maplist_simple = []
        hdl = myfunc.ReadFastaByBlock(infile, method_seqid=0, method_seq=0)
        if hdl.failure:
            isOK = False
        else:
            recordList = hdl.readseq()
            cnt = 0
            while recordList != None:
                for rd in recordList:
                    # full map line includes the sequence, the simple one does not
                    maplist.append("%s\t%d\t%s\t%s"%("seq_%d"%cnt, len(rd.seq), rd.description, rd.seq))
                    maplist_simple.append("%s\t%d\t%s"%("seq_%d"%cnt, len(rd.seq), rd.description))
                    cnt += 1
                recordList = hdl.readseq()
            hdl.close()
            myfunc.WriteFile("\n".join(maplist_simple), tmp_mapfile)
    if isOK:
        # g_params['runjob_log'].append("tmpdir = %s"%(tmpdir))
        #cmd = [script_getseqlen, infile, "-o", tmp_outfile , "-printid"]
        # mark the job as started
        datetime = time.strftime("%Y-%m-%d %H:%M:%S")
        rt_msg = myfunc.WriteFile(datetime, starttagfile)
        if rt_msg:
            g_params['runjob_err'].append(rt_msg)
        # run the external prediction workflow on the whole input file
        cmd = [runscript, infile, tmp_outpath_result, blastdir, blastdb ]
        g_params['runjob_log'].append(" ".join(cmd))
        begin_time = time.time()
        try:
            rmsg = subprocess.check_output(cmd)
        except subprocess.CalledProcessError, e:
            g_params['runjob_err'].append(str(e)+"\n")
            g_params['runjob_err'].append(rmsg + "\n")
            # collect the queue-system output file, if any, for diagnostics
            suqoutfilelist = glob.glob("%s/*.sh.*.out"%(tmpdir))
            if len(suqoutfilelist)>0:
                suqoutfile = suqoutfilelist[0]
                g_params['runjob_err'].append(myfunc.ReadFile(suqoutfile))
        end_time = time.time()
        runtime_in_sec = end_time - begin_time
        # copy results from the scratch area to the final output location
        if os.path.exists(tmp_outpath_result):
            cmd = ["cp","-rf", tmp_outpath_result, outpath]
            try:
                subprocess.check_output(cmd)
            except subprocess.CalledProcessError, e:
                g_params['runjob_err'].append(str(e))
        if len(g_params['runjob_log']) > 0 :
            rt_msg = myfunc.WriteFile("\n".join(g_params['runjob_log']), runjob_logfile, "a")
            if rt_msg:
                g_params['runjob_err'].append(rt_msg)
        datetime = time.strftime("%Y-%m-%d %H:%M:%S")
        # the job is considered finished only if the expected topology file exists
        if os.path.exists(outfile):
            rt_msg = myfunc.WriteFile(datetime, finishtagfile)
            if rt_msg:
                g_params['runjob_err'].append(rt_msg)
            # now write the text output to a single file
            WriteTextResultFile(resultfile_text, maplist, runtime_in_sec)
            # now making zip instead (for windows users)
            pwd = os.getcwd()
            os.chdir(outpath)
            # cmd = ["tar", "-czf", tarball, resultpathname]
            cmd = ["zip", "-rq", zipfile, resultpathname]
            try:
                subprocess.check_output(cmd)
            except subprocess.CalledProcessError, e:
                g_params['runjob_err'].append(str(e))
def RunJob(infile, outpath, tmpdir, email, jobid, g_params): #{{{
    # Run a per-sequence TOPCONS prediction job with MD5 caching.
    # For every query sequence: reuse a cached result (symlinked by the MD5 of
    # the sequence) when available, otherwise queue it for a fresh run.  The
    # remaining sequences are pre-screened with SCAMPI to estimate their number
    # of TM helices and then run one at a time, longest-predicted first.
    # Errors/logs are accumulated in g_params['runjob_err'] / ['runjob_log'].
    # NOTE(review): runscript, script_scampi, path_md5cache, blastdir and
    # blastdb are module-level names defined elsewhere in the file.
    all_begin_time = time.time()
    rootname = os.path.basename(os.path.splitext(infile)[0])
    starttagfile = "%s/runjob.start" % (outpath)
    runjob_errfile = "%s/runjob.err" % (outpath)
    runjob_logfile = "%s/runjob.log" % (outpath)
    finishtagfile = "%s/runjob.finish" % (outpath)
    rmsg = ""
    resultpathname = jobid
    outpath_result = "%s/%s" % (outpath, resultpathname)
    tarball = "%s.tar.gz" % (resultpathname)
    zipfile = "%s.zip" % (resultpathname)
    tarball_fullpath = "%s.tar.gz" % (outpath_result)
    zipfile_fullpath = "%s.zip" % (outpath_result)
    outfile = "%s/%s/Topcons/topcons.top" % (outpath_result, "seq_%d" % (0))
    resultfile_text = "%s/%s" % (outpath_result, "query.result.txt")
    mapfile = "%s/seqid_index_map.txt" % (outpath_result)
    finished_seq_file = "%s/finished_seqs.txt" % (outpath_result)
    tmp_outpath_result = "%s/%s" % (tmpdir, resultpathname)
    isOK = True
    # create the scratch and final result folders
    try:
        os.makedirs(tmp_outpath_result)
        isOK = True
    except OSError:
        msg = "Failed to create folder %s" % (tmp_outpath_result)
        myfunc.WriteFile(msg + "\n", runjob_errfile, "a")
        isOK = False
        pass
    try:
        os.makedirs(outpath_result)
        isOK = True
    except OSError:
        msg = "Failed to create folder %s" % (outpath_result)
        myfunc.WriteFile(msg + "\n", runjob_errfile, "a")
        isOK = False
        pass
    if isOK:
        # truncate/create the progress file listing finished sequences
        try:
            open(finished_seq_file, 'w').close()
        except:
            pass
        #first getting result from caches
        # ==================================
        maplist = []
        maplist_simple = []
        toRunDict = {}  # origIndex -> [seq, numTM, description] for uncached seqs
        hdl = myfunc.ReadFastaByBlock(infile, method_seqid=0, method_seq=0)
        if hdl.failure:
            isOK = False
        else:
            # mark the job as started
            datetime = time.strftime("%Y-%m-%d %H:%M:%S")
            rt_msg = myfunc.WriteFile(datetime, starttagfile)
            recordList = hdl.readseq()
            cnt = 0
            origpath = os.getcwd()
            while recordList != None:
                for rd in recordList:
                    isSkip = False
                    # temp outpath for the sequence is always seq_0, and I feed
                    # only one seq a time to the workflow
                    tmp_outpath_this_seq = "%s/%s" % (tmp_outpath_result, "seq_%d" % 0)
                    outpath_this_seq = "%s/%s" % (outpath_result, "seq_%d" % cnt)
                    subfoldername_this_seq = "seq_%d" % (cnt)
                    if os.path.exists(tmp_outpath_this_seq):
                        try:
                            shutil.rmtree(tmp_outpath_this_seq)
                        except OSError:
                            pass
                    maplist.append("%s\t%d\t%s\t%s" % ("seq_%d" % cnt, len(rd.seq), rd.description, rd.seq))
                    maplist_simple.append("%s\t%d\t%s" % ("seq_%d" % cnt, len(rd.seq), rd.description))
                    if not g_params['isForceRun']:
                        # look up this sequence in the MD5-keyed result cache
                        md5_key = hashlib.md5(rd.seq).hexdigest()
                        subfoldername = md5_key[:2]
                        md5_link = "%s/%s/%s" % (path_md5cache, subfoldername, md5_key)
                        if os.path.exists(md5_link):
                            # create a symlink to the cache
                            rela_path = os.path.relpath(md5_link, outpath_result) #relative path
                            os.chdir(outpath_result)
                            os.symlink(rela_path, subfoldername_this_seq)
                            if os.path.exists(outpath_this_seq):
                                # cached result usable: record it as finished
                                runtime = 0.0 #in seconds
                                topfile = "%s/%s/topcons.top" % (outpath_this_seq, "Topcons")
                                top = myfunc.ReadFile(topfile).strip()
                                numTM = myfunc.CountTM(top)
                                posSP = myfunc.GetSPPosition(top)
                                if len(posSP) > 0:
                                    isHasSP = True
                                else:
                                    isHasSP = False
                                info_finish = [ "seq_%d" % cnt, str(len(rd.seq)), str(numTM), str(isHasSP), "cached", str(runtime), rd.description ]
                                myfunc.WriteFile("\t".join(info_finish) + "\n", finished_seq_file, "a", isFlush=True)
                                isSkip = True
                    if not isSkip:
                        # first try to delete the outfolder if exists
                        if os.path.exists(outpath_this_seq):
                            try:
                                shutil.rmtree(outpath_this_seq)
                            except OSError:
                                pass
                        origIndex = cnt
                        numTM = 0
                        toRunDict[origIndex] = [rd.seq, numTM, rd.description ] #init value for numTM is 0
                    cnt += 1
                recordList = hdl.readseq()
            hdl.close()
        myfunc.WriteFile("\n".join(maplist_simple) + "\n", mapfile)
        # run scampi single to estimate the number of TM helices and then run
        # the query sequences in the descending order of numTM
        torun_all_seqfile = "%s/%s" % (tmp_outpath_result, "query.torun.fa")
        dumplist = []
        for key in toRunDict:
            top = toRunDict[key][0]
            dumplist.append(">%s\n%s" % (str(key), top))
        myfunc.WriteFile("\n".join(dumplist) + "\n", torun_all_seqfile, "w")
        del dumplist
        topfile_scampiseq = "%s/%s" % (tmp_outpath_result, "query.torun.fa.topo")
        if os.path.exists(torun_all_seqfile):
            # run scampi to estimate the number of TM helices
            cmd = [ script_scampi, torun_all_seqfile, "-outpath", tmp_outpath_result ]
            try:
                rmsg = subprocess.check_output(cmd)
            except subprocess.CalledProcessError, e:
                g_params['runjob_err'].append(str(e) + "\n")
                pass
        if os.path.exists(topfile_scampiseq):
            # feed the estimated TM counts back into toRunDict
            (idlist_scampi, annolist_scampi, toplist_scampi) = myfunc.ReadFasta(topfile_scampiseq)
            for jj in xrange(len(idlist_scampi)):
                numTM = myfunc.CountTM(toplist_scampi[jj])
                try:
                    toRunDict[int(idlist_scampi[jj])][1] = numTM
                except (KeyError, ValueError, TypeError):
                    pass
        sortedlist = sorted(toRunDict.items(), key=lambda x: x[1][1], reverse=True)
        #format of sortedlist [(origIndex: [seq, numTM, description]), ...]
        # submit sequences one by one to the workflow according to orders in
        # sortedlist
        for item in sortedlist:
            # g_params['runjob_log'].append("tmpdir = %s"%(tmpdir))
            #cmd = [script_getseqlen, infile, "-o", tmp_outfile , "-printid"]
            origIndex = item[0]
            seq = item[1][0]
            description = item[1][2]
            outpath_this_seq = "%s/%s" % (outpath_result, "seq_%d" % origIndex)
            tmp_outpath_this_seq = "%s/%s" % (tmp_outpath_result, "seq_%d" % (0))
            if os.path.exists(tmp_outpath_this_seq):
                try:
                    shutil.rmtree(tmp_outpath_this_seq)
                except OSError:
                    pass
            # write a one-sequence FASTA file for this query
            seqfile_this_seq = "%s/%s" % (tmp_outpath_result, "query_%d.fa" % (origIndex))
            seqcontent = ">%d\n%s\n" % (origIndex, seq)
            myfunc.WriteFile(seqcontent, seqfile_this_seq, "w")
            if not os.path.exists(seqfile_this_seq):
                g_params['runjob_err'].append("failed to generate seq index %d" % (origIndex))
                continue
            # run the external prediction workflow for this single sequence
            cmd = [ runscript, seqfile_this_seq, tmp_outpath_result, blastdir, blastdb ]
            g_params['runjob_log'].append(" ".join(cmd))
            begin_time = time.time()
            try:
                rmsg = subprocess.check_output(cmd)
                g_params['runjob_log'].append("workflow:\n" + rmsg + "\n")
            except subprocess.CalledProcessError, e:
                g_params['runjob_err'].append(str(e) + "\n")
                g_params['runjob_err'].append(rmsg + "\n")
                pass
            #suqoutfilelist = glob.glob("%s/*.sh.*.out"%(tmpdir))
            #if len(suqoutfilelist)>0:
            #    suqoutfile = suqoutfilelist[0]
            #g_params['runjob_err'].append(myfunc.ReadFile(suqoutfile))
            end_time = time.time()
            runtime_in_sec = end_time - begin_time
            if os.path.exists(tmp_outpath_this_seq):
                # move the per-sequence result from scratch to its final folder
                cmd = ["mv", "-f", tmp_outpath_this_seq, outpath_this_seq]
                isCmdSuccess = False
                try:
                    subprocess.check_output(cmd)
                    isCmdSuccess = True
                except subprocess.CalledProcessError, e:
                    msg = "Failed to run prediction for sequence No. %d\n" % ( origIndex)
                    g_params['runjob_err'].append(msg)
                    g_params['runjob_err'].append(str(e) + "\n")
                    pass
                timefile = "%s/time.txt" % (tmp_outpath_result)
                targetfile = "%s/time.txt" % (outpath_this_seq)
                if os.path.exists(timefile) and os.path.exists( outpath_this_seq):
                    try:
                        shutil.move(timefile, targetfile)
                    except:
                        g_params['runjob_err'].append("Failed to move %s/time.txt" % (tmp_outpath_result) + "\n")
                        pass
                if isCmdSuccess:
                    # summarize the prediction and append to the progress file
                    runtime = runtime_in_sec #in seconds
                    topfile = "%s/%s/topcons.top" % (outpath_this_seq, "Topcons")
                    top = myfunc.ReadFile(topfile).strip()
                    numTM = myfunc.CountTM(top)
                    posSP = myfunc.GetSPPosition(top)
                    if len(posSP) > 0:
                        isHasSP = True
                    else:
                        isHasSP = False
                    info_finish = [ "seq_%d" % origIndex, str(len(seq)), str(numTM), str(isHasSP), "newrun", str(runtime), description ]
                    myfunc.WriteFile("\t".join(info_finish) + "\n", finished_seq_file, "a", isFlush=True)
                    # now write the text output for this seq
                    info_this_seq = "%s\t%d\t%s\t%s" % ( "seq_%d" % origIndex, len(seq), description, seq)
                    resultfile_text_this_seq = "%s/%s" % (outpath_this_seq, "query.result.txt")
                    myfunc.WriteTOPCONSTextResultFile(resultfile_text_this_seq, outpath_result, [info_this_seq], runtime_in_sec, g_params['base_www_url'])
                    # create or update the md5 cache
                    # create cache only on the front-end
                    if g_params['base_www_url'].find("topcons.net") != -1:
                        md5_key = hashlib.md5(seq).hexdigest()
                        subfoldername = md5_key[:2]
                        md5_subfolder = "%s/%s" % (path_md5cache, subfoldername)
                        md5_link = "%s/%s/%s" % (path_md5cache, subfoldername, md5_key)
                        if os.path.exists(md5_link):
                            # drop the stale cache link before re-creating it
                            try:
                                os.unlink(md5_link)
                            except:
                                pass
                        subfolder_md5 = "%s/%s" % (path_md5cache, subfoldername)
                        if not os.path.exists(subfolder_md5):
                            try:
                                os.makedirs(subfolder_md5)
                            except:
                                pass
                        rela_path = os.path.relpath( outpath_this_seq, md5_subfolder) #relative path
                        try:
                            os.chdir(md5_subfolder)
                            os.symlink(rela_path, md5_key)
                        except:
                            pass
def main(): #{{{ if 0: #{{{ strTop1 = "---MMMM-----i-i-i---MMM----MMMM-ooo" strTop2 = "----MMMM-----i-ii-----MMM---MMM--oo" strProtein1 = "id1" strProtein2 = "id2" fpLog = sys.stdout class_gapless, num1_gapless, num2_gapless = ct.CompareToposGaplesslyNew( strTop1, strTop2, strProtein1, strProtein2, fpLog) # Note: calling the int, float, string will not change their original value # calling the dict, list will change their original value print "strTop1:", strTop1 print "strTop2:", strTop2 #}}} if 0: #{{{ PrintFuncName() print("this file name is: %s" % __file__) #}}} if 0: #{{{ # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta" filename = sys.argv[1] print filename fp = open(filename, "r") lines = fp.readlines() fp.close() #}}} if 0: #{{{ # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta" filename = sys.argv[1] print filename BLOCK_SIZE = 100000 fp = open(filename, "r") buff = fp.read(BLOCK_SIZE) while buff: buff = fp.read(BLOCK_SIZE) fp.close() #}}} if 0: #{{{ # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta" filename = sys.argv[1] print filename fp = open(filename, "r") line = fp.readline() while line: line = fp.readline() fp.close() #}}} if 0: #{{{ try: BLOCK_SIZE = 100000 infile = sys.argv[1] fpin = open(infile, 'rb') unprocessedBuffer = "" isEOFreached = False while 1: buff = fpin.read(BLOCK_SIZE) if len(buff) < BLOCK_SIZE: isEOFreached = True buff = unprocessedBuffer + buff recordList = [] unprocessedBuffer = myfunc.ReadFastaFromBuffer( buff, recordList, isEOFreached) if len(recordList) > 0: for record in recordList: sys.stdout.write(">%s\n" % record[1]) sys.stdout.write("%s\n" % record[2]) if isEOFreached == True: break fpin.close() except IOError: raise #}}} if 0: #{{{ try: infile = sys.argv[1] (annoList, seqList) = myfunc.ReadFasta_without_id(infile) for i in xrange(len(seqList)): sys.stdout.write(">%s\n" % annoList[i]) sys.stdout.write("%s\n" % seqList[i]) except IOError: raise #}}} if 0: #{{{ hhrfile = 
"hhsearch/A1RZ92-Q74DY9.hhr" if IsDuplicatedByHHSearch(hhrfile): print "yes" #}}} if 0: #{{{ import pairlistwithfamid2pairaln_by_msa seq1 = "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------MLSSTATTMLRAGVSRSSGALQPMLLRSAACPCSPFSMNTKLSQPTSV-----RPLSTSPSALVLRFRAQQQAQLAQQQLRRASSSSSSSSSSTRPRSDAELDANAAEAAAAAQSAAHAGEPVLDWNTFFKLRKTRRRVQLAFSVIMTLITSGAGGAVLSTGVADAMVAQVPLEPMFAVGLMTASFGALGWLMGPAMGGMVFNALKSKYRGQMEIKEGQFFARIKKHRVDPSASSMGNPVPDFYGEKISSVAGYRQWLKDQRAFNKKRTTFV" seq2 = "MDILLAVLEQGFIFSIVCFGVYITYKILDFPDLSVDGTFPLGAAVAAAFLVKGYSPVLSSLAALVAGAIAGGITGILHVKFKITNLLSGILVMVGLYSINLRIMGKSNIPLFNKIHLFSDTMNPIIIITVFLLICKITLDLFLKTKAGFILKATGDNEQLVLSLGVNKDLVKIMGLMLSNALVALGGALMAQYQGFSDVGMGTGIVVMGLASVIIGESLFGRIKALNATTRVLLGALVYKLSVSI---ALTVGLAP-------TDLKLVTAIIVVIALSLNKNPLKIITKQKTKEGGIL------NASNTKSAQSVQ-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------" seq1 = "---------------------------------------------------------------------------------------------------------------------------------------MALSSLFFTASALLLMFLAFLGGARNSNPLDRIYWLEAATGNIPGAPALSRWTYWNLCAVNSEGHNECGKSYPDYPFDPPSHRNFNTHVNIPAAFIGTRHYFLTSRFMFPFHIIALFFATCSLLTGFLAMCTRIGNWVSAFSAYFALTFQTITTCLMTAVYVQGRDKFNNNGQSSHLGVKAFAFMWTSVALLFLSCVIYCMGGAVGRKDGGYSGREQRRRGFFNSHRSGSLRSNKETAP" seq2 = "MRKIAAIGGIVFISFILTIVAMFTKLWISWSIGKFSYGIGIVPYHSNSAGWFTAASWMVFISFGLFIPLILVVLFTAYKVHHDGCCHSIRHCFNSICLICSIIAVLEIIAFVLMAVNASRYVKGASISEKKSLLQLGSSAYLDLVSAILIIVATVLSGHASHHDCH----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------" alignFactor = pairlistwithfamid2pairaln_by_msa.GetAlignmentFactorFromPairAlignment( 
seq1, seq2) print alignFactor #}}} if 0: #{{{ try: dbname = sys.argv[1] print dbname from myfunc import MyDB cls = MyDB(dbname) # print cls.idList record = cls.GetRecord("A0FGX9") if record: print record # for rd in cls.GetAllRecord(): # print rd (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record) print(seqid, anno, seq) except IndexError: pass #}}} if 0: #{{{ import my_extractdb #miniking my_extractdb.py see which one is faster try: dbname = sys.argv[1] idlistfile = sys.argv[2] cls = myfunc.MyDB(dbname) if cls.failure: print >> sys.stderr, "MyDB init failed" else: idlist = open(idlistfile, "r").read().split("\n") fpout = sys.stdout for seqid in idlist: if seqid: record = cls.GetRecord(seqid) fpout.write(record) # for rd in cls.GetAllRecord(): # print rd # (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record) # print (seqid, anno, seq) except IndexError: print "error" pass #}}} if 0: #{{{ #test ReadLineByBlock try: infile = sys.argv[1] from myfunc import ReadLineByBlock cls = ReadLineByBlock(infile) lines = cls.readlines() while lines != None: for line in lines: print line lines = cls.readlines() except IndexError: pass #}}} if 0: #{{{ #test speed of ReadLineByBlock # ReadLineByBlock is about 3 times fater than file.readline() try: from myfunc import ReadLineByBlock infile = sys.argv[1] start = time.time() hdl = ReadLineByBlock(infile) lines = hdl.readlines() while lines != None: lines = hdl.readlines() hdl.close() end = time.time() msg = "Reading %s by ReadLineByBlock costs %.3fs seconds" print msg % (infile, (end - start)) start = time.time() hdl = open(infile, "r") line = hdl.readline() while line: line = hdl.readline() hdl.close() end = time.time() msg = "Reading %s by readline() costs %.3fs seconds" print msg % (infile, (end - start)) except IndexError: pass #}}} if 0: #{{{ #test readline try: infile = sys.argv[1] fp = open(infile, "r") line = fp.readline() while line: print line line = fp.readline() fp.close() except IndexError: pass #}}} if 0: 
#{{{ #test the speed of GetFirstWord try: nloop = int(sys.argv[1]) string = "kjdafk jasdfj j" #string = "askdf askdf " # string = "kajsdfasdfsdfjakasjdfka" # string = "kajsdfasdf,sdfjakasjdfka" delimiter = " \t\r,.\n" delimiter = " " for i in xrange(nloop): #firstword = myfunc.GetFirstWord(string, delimiter) #firstword = string.split()[0] #firstword = string.partition(" ")[0] firstword = myfunc.GetFirstWord(string) #pass #print firstword except (IndexError, ValueError): pass #}}} if 0: #{{{ # read seq by SeqIO from Bio import SeqIO try: seqfile = sys.argv[1] # 1. SeqIO #################### start = time.time() handle = open(seqfile, "rU") cnt = 0 for record in SeqIO.parse(handle, "fasta"): cnt += 1 handle.close() end = time.time() msg = "Reading %d sequences by SeqIO costs %.3fs seconds" print msg % (cnt, (end - start)) # 2. ReadFasta #################### start = time.time() seqfile = sys.argv[1] (idList, annoList, seqList) = myfunc.ReadFasta(seqfile) end = time.time() msg = "Reading %d sequences by ReadFasta costs %.3fs seconds" print msg % (len(idList), (end - start)) # 3. ReadFasta from buffer BLOCK_SIZE = 100000 start = time.time() cnt = 0 fpin = open(seqfile, 'rb') unprocessedBuffer = "" isEOFreached = False while 1: buff = fpin.read(BLOCK_SIZE) if len(buff) < BLOCK_SIZE: isEOFreached = True buff = unprocessedBuffer + buff recordList = [] unprocessedBuffer = myfunc.ReadFastaFromBuffer( buff, recordList, isEOFreached) cnt += len(recordList) if isEOFreached == True: break fpin.close() end = time.time() msg = "Reading %d sequences by ReadFastaFromBuffer costs %.3fs seconds" print msg % (cnt, (end - start)) # 4. 
ReadFastaByBlock #################### start = time.time() seqfile = sys.argv[1] hdl = myfunc.ReadFastaByBlock(seqfile, 0, 0) if hdl.failure: print >> sys.stderr, "Failed to init ReadFastaByBlock" return 1 recordList = hdl.readseq() cnt = 0 while recordList != None: cnt += len(recordList) # for rd in recordList: # print ">%s"%rd.description # print rd.seq recordList = hdl.readseq() hdl.close() end = time.time() msg = "Reading %d sequences by ReadFastaByBlock costs %.3fs seconds" print msg % (cnt, (end - start)) except (IndexError, ValueError): pass #}}} if 0: #{{{ #test RemoveUnnecessaryGap try: infile = sys.argv[1] start = time.time() (idList, seqList) = myfunc.ReadFasta_without_annotation(infile) seqList = lcmp.RemoveUnnecessaryGap_old(seqList) end = time.time() msg = "Run RemoveUnnecessaryGap_old for %s costs %.3fs seconds" print >> sys.stderr, msg % (infile, (end - start)) for seq in seqList: print seq start = time.time() (idList, seqList) = myfunc.ReadFasta_without_annotation(infile) seqList = lcmp.RemoveUnnecessaryGap(seqList) end = time.time() msg = "Run RemoveUnnecessaryGap for %s costs %.3fs seconds" print >> sys.stderr, msg % (infile, (end - start)) for seq in seqList: print seq except IndexError: pass #}}} if 0: #{{{ #test ReadMPAByBlock try: infile = sys.argv[1] hdl = myfunc.ReadMPAByBlock(infile) if hdl.failure: return recordList = hdl.readseq() while recordList != None: for rd in recordList: #print rd.seqid print ">%s" % (rd.description) print "%s" % (myfunc.mpa2seq(rd.mpa)) recordList = hdl.readseq() hdl.close() except IndexError: pass #}}} if 0: #{{{ try: dbname = sys.argv[1] print dbname from myfunc import MyDB cls = MyDB(dbname) # print cls.idList record = cls.GetRecord("A0FGX9") if record: print record # for rd in cls.GetAllRecord(): # print rd (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record) print(seqid, anno, seq) except IndexError: pass #}}} if 0: #{{{ #test subprocess import glob #invoke shell explicitly, not very good, may have 
security problems subprocess.call("seq 10", shell=True) subprocess.call("echo wait for 2 seconds...; sleep 2", shell=True) subprocess.call("ls topo*.py", shell=True) if 1: #{{{ #test subprocess import glob #invoke shell implicitly, recommended way subprocess.call(["seq", "10"], shell=False) subprocess.call(["echo", "wait for 1 seconds..."]) subprocess.call(["sleep", "1"]) try: print subprocess.check_call(["ls", "topo*.py"]) #This will not work except subprocess.CalledProcessError, e: print "error message:", e subprocess.call(["ls"] + glob.glob("topo*.py"))