def toWellMappedContigs(inFastaFile, inTaxonomyWFile, outFastaFile, outFastaMisAssembledFile, outTaxonomyFile,
                        weightThreshold=0.99):
    """
        Creates the fasta and mapping files that contain well assembled contigs (filters out misassembled contigs).

        @param inFastaFile: input fasta file with contigs
        @param inTaxonomyWFile: input file that contains taxonomy with weights (seqId, weight, taxonId)
        @param outFastaFile: fasta file containing well assembled sequences
        @param outFastaMisAssembledFile: fasta file containing misassembled contigs
        @param outTaxonomyFile: resulting taxonomy of the well assembled sequences (seqId, taxonId)
        @param weightThreshold: only contigs whose weight is at least this value will be taken
        @return: statistics
    """
    seqIdToTaxonId = csv.predToDict(inTaxonomyWFile)
    seqIdToWeight = csv.getMapping(inTaxonomyWFile, 0, 1, '\t')
    outFastaOk = csv.OutFileBuffer(outFastaFile)
    outFastaMis = csv.OutFileBuffer(outFastaMisAssembledFile)
    outTaxonomyOk = csv.OutFileBuffer(outTaxonomyFile)

    totalBp = 0.0
    totalCount = 0.0
    okBp = 0.0
    okCount = 0.0
    avgSumBp = 0.0
    for seqId, seq in fas.fastaFileToDictWholeNames(inFastaFile).iteritems():
        bp = len(seq)
        totalBp += bp
        totalCount += 1
        seqIdPrefix = str(seqId).split(' ')[0]
        weight = seqIdToWeight[seqIdPrefix][0]
        fastaEntry = '>' + str(seqIdPrefix) + '\n' + str(seq) + '\n'
        if float(weight) >= weightThreshold:
            # a well assembled contig
            outFastaOk.writeText(fastaEntry)
            outTaxonomyOk.writeText(str(seqIdPrefix) + '\t' + str(seqIdToTaxonId[seqIdPrefix]) + '\n')
            okBp += bp
            okCount += 1
            avgSumBp += getCoverage(seqId) * bp
        else:
            # a misassembled contig
            outFastaMis.writeText(fastaEntry)

    outFastaOk.close()
    outFastaMis.close()
    outTaxonomyOk.close()
    # note: assumes at least one contig passed the threshold (okBp > 0)
    return 'Taken: %s/%sMB, %s/%sseq, %s%% bp %s%% seq, avg coverage %s' % (
        round(okBp / 1000000, 2), round(totalBp / 1000000, 2), okCount, totalCount,
        round((okBp / totalBp) * 100, 2), round((okCount / totalCount) * 100, 2),
        round(avgSumBp / okBp, 3))
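
# Usage sketch for toWellMappedContigs (illustrative only; the paths below are
# hypothetical and the weight file is expected in the (seqId, weight, taxonId) format):
#
# stats = toWellMappedContigs('contigs.fna', 'contigs_taxonomy_w.tsv',
#                             'contigs_ok.fna', 'contigs_mis.fna', 'contigs_ok.tax',
#                             weightThreshold=0.99)
# print stats
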
def createConcatFasta(inFastaList, outFasta):
    """
        Concatenates the contents of all fasta files listed in the input file into one fasta file.

        @param inFastaList: file containing one fasta file path per line
        @param outFasta: output fasta file containing all the sequences
    """
    out = csv.OutFileBuffer(outFasta)
    for line in open(inFastaList):
        line = line.strip()
        if line == '':
            continue
        for seqId, seq in fas.fastaFileToDictWholeNames(line).iteritems():
            out.writeText('>%s\n%s\n' % (seqId, seq))
    out.close()
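
# Usage sketch for createConcatFasta (illustrative only; 'fasta_list.txt' is a
# hypothetical file listing one fasta file path per line):
#
# createConcatFasta('fasta_list.txt', 'all_sequences.fna')
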
def toLongSeq(inFastaFileName, outFastaFileName, minLength=1000):
    """
        Creates a fasta file that contains only the sequences that are at least minLength long.

        @param inFastaFileName: input fasta file
        @param outFastaFileName: output fasta file containing only the sufficiently long sequences
        @param minLength: minimum sequence length (bp)
    """
    out = csv.OutFileBuffer(outFastaFileName)
    for seqId, seq in fas.fastaFileToDictWholeNames(inFastaFileName).iteritems():
        if len(seq) >= minLength:
            out.writeText('>' + str(seqId) + '\n' + str(seq) + '\n')
    out.close()
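
# Usage sketch for toLongSeq (illustrative only; hypothetical paths): keep only the
# sequences that are at least 1000 bp long.
#
# toLongSeq('contigs.fna', 'contigs_min1000.fna', minLength=1000)
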
def parse(inFq, inDomtblout, inProtFna=None):
    """
        Reads in joined pair-end reads and their HMM annotation from the FASTQ and DOMTBLOUT files
        (and optionally from the PROT FASTA file).

        @param inFq: FASTQ file containing joined pair-end reads
        @param inDomtblout: HMM annotation file
        @param inProtFna: corresponding prot sequences (can be None)
        @type inFq: str
        @type inDomtblout: str
        @type inProtFna: str | None
        @rtype: list[read_rec.ReadRec]
        @return: a list of read-records
    """
    recList = []

    # read in prot sequences
    if inProtFna is None:
        nameToProtSeq = {}
    else:
        nameToProtSeq = fas.fastaFileToDictWholeNames(inProtFna)

    # read in dom file
    nameToDom = hmm.readDomblout(inDomtblout)
    assert inProtFna is None or len(nameToProtSeq) == len(nameToDom)

    # read in pair-end reads, create ReadRec
    for readName, dna, comment, qs in fq.ReadFqGen(inFq):
        readName = readName[1:]  # strip the starting '@'
        protSeq = nameToProtSeq.get(readName, None)
        hit, frameTag = nameToDom[readName]
        annotStart, annotLen, strain, score, acc = hmm.dnaHitInfo(hit, dna, protSeq)  # strain, score, acc not used
        if strain == 1:
            assert 1 <= frameTag <= 3
        else:
            assert 4 <= frameTag <= 6

        # alignment env coordinates
        protStart = int(hit[19]) - 1
        protLen = int(hit[20]) - protStart
        assert annotLen == 3 * protLen

        # alignment coordinates
        protStartAli = int(hit[17]) - 1
        protLenAli = int(hit[18]) - protStartAli

        # hmm coordinates
        hmmCoordStart = int(hit[15]) - 1
        hmmCoordLen = int(hit[16]) - hmmCoordStart

        # the env coordinates often start before the alignment coordinates and end after them, get the offsets
        offsetEnv = protStartAli - protStart
        offsetEnvE = protLen - offsetEnv - protLenAli
        assert offsetEnv >= 0
        assert offsetEnvE >= 0
        hmmCoordStart -= offsetEnv
        hmmCoordLen += offsetEnv + offsetEnvE

        tokens = comment.split('\t')
        if len(tokens) == 5:
            # get the ends of the pair-end read and the corresponding quality-scores
            p, dna1, qs1, dna2, qs2 = tokens
            # get the QSArray representation of the first read-end
            qsA1 = qs_man.QSArray(dna=dna1, qsArrayFq=qs1)
            # get the reverse complement of the second read-end
            qsA2 = qs_man.QSArray(dna=dna2, qsArrayFq=qs2)
            qsA2.revCompl()
            # get the consensus QSArray representing the joined read
            qsA = qs_man.QSArray(qsA1=qsA1, qsA2=qsA2, pos1Within2=len(dna2) - len(dna))
        else:
            # there is just a simple dna sequence (i.e. not joined reads)
            qsA = qs_man.QSArray(dna=dna, qsArrayFq=qs)

        recList.append(read_rec.ReadRec(readName, qsA, frameTag, annotStart, annotLen,
                                        hmmCoordStart, hmmCoordLen, protSeq, protStart, protLen))
    return recList
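
# Usage sketch for parse (illustrative only; hypothetical paths). The prot fasta file
# is optional; when it is given, each annotated read must have a prot sequence.
#
# recList = parse('0_join.fq.gz', '0_join_prot.domtblout.gz', inProtFna='0_join_prot.fna.gz')
# print len(recList)
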
def partitionReads(sampleDir, scoreThreshold, accuracyThreshold, shuffleRandSeed, pfamPartitionedDir,
                   joinedReads=True, considerSam=True):
    """
        Partitioning reads into the individual gene-domains.
    """
    try:
        strainDirList = map(lambda x: os.path.join(sampleDir, x), os.listdir(sampleDir))
        samplePartDir = os.path.join(sampleDir, pfamPartitionedDir)
        if not os.path.isdir(samplePartDir):
            os.mkdir(samplePartDir)

        # for each gene-dom
        fqOutDict = {}
        fqProtOutDict = {}
        fqSamOutDict = {}
        fqDomOutDict = {}

        for strainDir in strainDirList:
            strainAcc = os.path.basename(strainDir)
            if strainAcc == 'sample_partitioned':
                # skip the directory containing the partitioned data
                continue
            if os.path.isdir(strainDir):
                # for each dom file
                for f in os.listdir(strainDir):
                    if (joinedReads and f.endswith('join_prot.domtblout.gz')) \
                            or (not joinedReads and f.endswith('pair1_prot.domtblout.gz')):
                        # a domtblout file found
                        i = f.split('_', 1)[0]
                        if i.isdigit():
                            if joinedReads:
                                domPath = os.path.join(strainDir, f)
                                domPath2 = None
                                fqPath = os.path.join(strainDir, '%s_join.fq.gz' % (i,))
                                fqPair1Path = os.path.join(strainDir, '%s_pair1.fq.gz' % (i,))
                                fqPair2Path = os.path.join(strainDir, '%s_pair2.fq.gz' % (i,))
                                fqProtPath = os.path.join(strainDir, '%s_join_prot.fna.gz' % (i,))
                                fqProtPath2 = None
                                samPath = os.path.join(strainDir, '%s_join_gmap.sam.gz' % (i,))
                                assert os.path.isfile(fqPath)
                            else:
                                domPath = os.path.join(strainDir, f)
                                domPath2 = os.path.join(strainDir, '%s_pair2_prot.domtblout.gz' % (i,))
                                fqPath = None
                                fqPair1Path = os.path.join(strainDir, '%s_pair1.fq.gz' % (i,))
                                fqPair2Path = os.path.join(strainDir, '%s_pair2.fq.gz' % (i,))
                                fqProtPath = os.path.join(strainDir, '%s_pair1_prot.fna.gz' % (i,))
                                fqProtPath2 = os.path.join(strainDir, '%s_pair2_prot.fna.gz' % (i,))
                                samPath = os.path.join(strainDir, '%s_pair.sam.gz' % (i,))
                                assert os.path.isfile(domPath2) and os.path.isfile(fqProtPath2)

                            assert os.path.isfile(domPath) and os.path.isfile(fqPair1Path) \
                                and os.path.isfile(fqPair2Path) and os.path.isfile(fqProtPath)
                            if considerSam:
                                assert os.path.isfile(samPath)

                            # map: read-name -> list of hits (lists sorted according to the scores)
                            nameToHitList = getReadNameToHitList(domPath)
                            if not joinedReads:
                                nameToHitList2 = getReadNameToHitList(domPath2)
                                len1 = len(nameToHitList)
                                len2 = len(nameToHitList2)
                                nameToHitList.update(nameToHitList2)
                                assert len(nameToHitList) == len1 + len2

                            # map: read-name-prot -> seq-prot
                            protNameToSeq = fas.fastaFileToDictWholeNames(fqProtPath)
                            if not joinedReads:
                                protNameToSeq.update(fas.fastaFileToDictWholeNames(fqProtPath2))

                            # map: read-name -> sam-line-entry
                            if considerSam:
                                readNameToSam = {}
                                if joinedReads:
                                    for line in gzip.open(samPath):
                                        line = line.strip()
                                        if line.startswith('#'):
                                            continue
                                        readName = line.split('\t', 1)[0]
                                        # lines with only 11 entries will be padded with * to 12
                                        if len(line.split('\t')) == 11:
                                            line += '\t*'
                                        readNameToSam[readName] = line + '\t' + strainAcc
                                else:
                                    entry = []
                                    for line in gzip.open(samPath):
                                        line = line.strip()
                                        if line.startswith('#') or line.startswith('@'):
                                            continue
                                        if len(entry) < 2:
                                            entry.append(line)
                                        if len(entry) == 2:
                                            readName = entry[0].split('\t', 1)[0]
                                            assert readName == entry[1].split('\t', 1)[0]
                                            readNameToSam[readName] = entry[0] + '\t*\t' + strainAcc + '\n' \
                                                + entry[1] + '\t*\t' + strainAcc
                                            entry = []

                            # map: read-name -> "pair1-dna tab pair1-qs tab pair2-dna tab pair2-qs"
                            if joinedReads:
                                readNameToPairReads = fq.getFqToDict(fqPair1Path, fqPair2Path)
                            else:
                                readNameToPairReads = None

                            if joinedReads:
                                g1 = fq.ReadFqGen(fqPath)
                                g2 = []
                            else:
                                g1 = fq.ReadFqGen(fqPair1Path)
                                g2 = fq.ReadFqGen(fqPair2Path)

                            # go over all reads
                            for readName, dna, p, qs in list(g1) + list(g2):
                                readName = readName[1:]  # strip starting '@'

                                # take the hit with the highest score
                                topHit = None
                                if readName in nameToHitList:
                                    topHit = nameToHitList[readName][0]

                                # is the hit significant, filter according to the score and accuracy
                                if topHit is None or float(topHit[13]) < scoreThreshold \
                                        or float(topHit[21]) < accuracyThreshold:
                                    continue

                                famName = topHit[3]
                                if famName not in fqOutDict:
                                    fqOutDict[famName] = []
                                    fqProtOutDict[famName] = []
                                    fqSamOutDict[famName] = []
                                    fqDomOutDict[famName] = []

                                if joinedReads:
                                    comment = readNameToPairReads[readName]
                                else:
                                    comment = ''
                                fqOutDict[famName].append((readName, dna, qs, comment))

                                protSeqName = topHit[0]
                                protSeq = protNameToSeq[protSeqName]
                                fqProtOutDict[famName].append((readName, protSeq))

                                if considerSam:
                                    if joinedReads:
                                        fqSamOutDict[famName].append(readNameToSam[readName])
                                    else:
                                        fqSamOutDict[famName].append(readNameToSam[readName[:-2]])

                                # top hit coordinates within the read
                                startOnRead, overlapLen, strain = hmm.dnaHitInfo(topHit, dna, protSeq)[:3]
                                fqDomOutDict[famName].append('\t'.join(topHit)
                                                             + '\t%s\t%s\t%s' % (startOnRead, overlapLen, strain))
        if joinedReads:
            ident = 'join'
        else:
            ident = 'pair'

        # for each gene dom, store reads into a file
        for famName, fqContentList in fqOutDict.iteritems():
            # get the tagged fam-dom-name that can be used in file names
            pf = comh.getGeneNameToFileName(famName)[:-4]

            # define output files
            fqOutO = os.path.join(samplePartDir, 'o_%s_%s.fq.gz' % (pf, ident))  # 'o_' ~ ordered entries
            fqOutR = os.path.join(samplePartDir, 'r_%s_%s.fq.gz' % (pf, ident))  # 'r_' ~ random shuffled entries
            fqProtOutO = os.path.join(samplePartDir, 'o_%s_%s_prot.fna.gz' % (pf, ident))
            fqProtOutR = os.path.join(samplePartDir, 'r_%s_%s_prot.fna.gz' % (pf, ident))
            fqSamOutO = os.path.join(samplePartDir, 'o_%s_%s_gmap.sam.gz' % (pf, ident))
            fqSamOutR = os.path.join(samplePartDir, 'r_%s_%s_gmap.sam.gz' % (pf, ident))
            fqDomOutO = os.path.join(samplePartDir, 'o_%s_%s_prot.domtblout.gz' % (pf, ident))
            fqDomOutR = os.path.join(samplePartDir, 'r_%s_%s_prot.domtblout.gz' % (pf, ident))

            # write FASTQ
            fqOut = fq.WriteFq(fqOutO)
            for e in fqContentList:
                fqOut.writeFqEntry('@' + e[0], e[1], e[2], e[3])
            fqOut.close()

            # write PROT
            fqProtOut = fq.WriteFq(fqProtOutO)
            fqProtOut.write('\n'.join(map(lambda x: '>%s\n%s' % (x[0], x[1]), fqProtOutDict[famName])) + '\n')
            fqProtOut.close()

            # write SAM
            if considerSam:
                fqSamOut = fq.WriteFq(fqSamOutO)
                fqSamOut.write('\n'.join(fqSamOutDict[famName]) + '\n')
                fqSamOut.close()

            # write DOM
            fqDomOut = fq.WriteFq(fqDomOutO)
            fqDomOut.write('\n'.join(fqDomOutDict[famName]) + '\n')
            fqDomOut.close()

            # shuffle file entries (to remove any bias imposed by the ordering)
            rand.shuffleLines(fqOutO, fqOutR, 4, shuffleRandSeed)
            rand.shuffleLines(fqProtOutO, fqProtOutR, 2, shuffleRandSeed)
            rand.shuffleLines(fqDomOutO, fqDomOutR, 1, shuffleRandSeed)
            if considerSam:
                if joinedReads:
                    rand.shuffleLines(fqSamOutO, fqSamOutR, 1, shuffleRandSeed)
                else:
                    rand.shuffleLines(fqSamOutO, fqSamOutR, 2, shuffleRandSeed)

            # delete ordered files (keep only the shuffled ones)
            os.remove(fqOutO)
            os.remove(fqProtOutO)
            if considerSam:
                os.remove(fqSamOutO)
            os.remove(fqDomOutO)
    except Exception as e:
        print 'Exception in partitionReads:'
        print sampleDir, scoreThreshold, accuracyThreshold, shuffleRandSeed, pfamPartitionedDir, joinedReads
        print e.message
        print type(e)
        print e.args
        raise e
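
# Usage sketch for partitionReads (illustrative only; the sample directory and the
# threshold values are hypothetical): partition the reads of one sample into
# per-gene-domain files, keeping only hits with score >= 40 and accuracy >= 0.6.
#
# partitionReads('/data/samples/sample_0', 40.0, 0.6, 1, 'sample_partitioned',
#                joinedReads=True, considerSam=True)
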
def getProfile(readsFFastaFile, communityFile, contigMFastaFile, contigLFastaFile, taxonomyMFile,
               taxonomyDbFile, outProfileFile):
    """
        Gets the profile of the dataset.

        @param readsFFastaFile: fasta file containing the reads
        @param communityFile: community file (used to map the reads to taxon ids)
        @param contigMFastaFile: fasta file containing the contigs
        @param contigLFastaFile: fasta file containing the contigs with coverage info in the sequence names
        @param taxonomyMFile: taxonomy mapping (seqId, taxonId) of the contigs
        @param taxonomyDbFile: taxonomy in the sqlite3 format
        @param outProfileFile: output file
    """
    # get map: taxonId -> read count
    taxonIdToReadCount = {}
    readTotalCount = 0
    for taxonId in getReadsTaxonIdList(readsFFastaFile, communityFile,
                                       readHeaderToCommunityId=getCommunityId)[1:]:
        if taxonId in taxonIdToReadCount:
            taxonIdToReadCount[taxonId] += 1
        else:
            taxonIdToReadCount[taxonId] = 1
        readTotalCount += 1

    # get map: taxonId -> contig count
    # get map: taxonId -> contig bp
    taxonIdToContigCount = {}
    taxonIdToContigBp = {}
    totalContigCount = 0
    seqIdToTaxonId = csv.predToDict(taxonomyMFile)
    seqIdToBp = fas.getSequenceToBpDict(contigMFastaFile)
    for seqId, bp in seqIdToBp.iteritems():
        totalContigCount += 1
        taxonId = seqIdToTaxonId[seqId]
        if taxonId in taxonIdToContigBp:
            taxonIdToContigBp[taxonId] += bp
        else:
            taxonIdToContigBp[taxonId] = bp
        if taxonId in taxonIdToContigCount:
            taxonIdToContigCount[taxonId] += 1
        else:
            taxonIdToContigCount[taxonId] = 1

    taxonIdToTotalBp = {}
    taxonIdToAvgSumCov = {}
    taxonIdToAvgCov = {}
    totalBp = 0.0
    for taxonId in taxonIdToContigBp:
        taxonIdToTotalBp[taxonId] = 0.0
        taxonIdToAvgSumCov[taxonId] = 0.0
        taxonIdToAvgCov[taxonId] = 0.0

    for seqId in fas.fastaFileToDictWholeNames(contigLFastaFile):
        shortSeqId = getShortContigId(seqId)
        if shortSeqId in seqIdToBp:
            coverage = getCoverage(seqId)
            bp = seqIdToBp[shortSeqId]
            taxonId = seqIdToTaxonId[shortSeqId]
            taxonIdToTotalBp[taxonId] += bp
            taxonIdToAvgSumCov[taxonId] += float(coverage) * float(bp)
            totalBp += bp

    for taxonId, bp in taxonIdToTotalBp.iteritems():
        if bp > 0:
            taxonIdToAvgCov[taxonId] = taxonIdToAvgSumCov[taxonId] / float(bp)

    tupleList = []
    taxonomy = taxonomy_ncbi.TaxonomyNcbi(taxonomyDbFile, considerNoRank=True)
    ranks = taxonomy_ncbi.TAXONOMIC_RANKS[2:]
    avgCoverage = 0.0
    for taxonId, readCount in taxonIdToReadCount.iteritems():
        scName = ScientificNameAtRank(taxonId, taxonomy, ranks)
        tupleList.append((taxonId,
                          round(100 * (readCount / float(readTotalCount)), 1),
                          round(100 * (taxonIdToTotalBp.get(taxonId, 0) / float(totalBp)), 1),
                          round(taxonIdToAvgCov.get(taxonId, 0), 2),
                          round(taxonIdToTotalBp.get(taxonId, 0) / 1000000.0, 2),
                          taxonIdToContigCount.get(taxonId, 0),
                          taxonomy.getScientificName(taxonId),
                          scName.getNameAtRank('phylum'),
                          scName.getNameAtRank('class'),
                          scName.getNameAtRank('order'),
                          scName.getNameAtRank('family'),
                          scName.getNameAtRank('genus'),
                          scName.getNameAtRank('species')))  # this could be done in a nicer way
        avgCoverage += taxonIdToAvgCov.get(taxonId, 0) * taxonIdToTotalBp.get(taxonId, 0)
    avgCoverage /= float(totalBp)

    tupleList.sort(key=lambda x: x[2], reverse=True)

    out = csv.OutFileBuffer(outProfileFile)
    out.writeText('#taxonId, % reads, % contigs, avg coverage, MB contigs, contigs count, strain name, '
                  + ",".join(ranks) + '\n')
    for entry in tupleList:
        out.writeText(','.join(map(str, entry)) + '\n')
    out.writeText('#Sum/Avg., -, -, ' + str(round(avgCoverage, 2)) + ', '
                  + str(round(totalBp / 1000000.0, 2)) + ', ' + str(totalContigCount) + ', -\n')
    out.close()
    taxonomy.close()
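
# Usage sketch for getProfile (illustrative only; hypothetical paths):
#
# getProfile('reads_f.fna', 'community.txt', 'contigs_m.fna', 'contigs_l.fna',
#            'taxonomy_m.tsv', 'ncbitax_sqlite.db', 'profile.csv')
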