Example #1
def _getLabelsCreateFasta():
    """
        To process the original mercier dataset with 59 strains. Take only contigs that were mapped to the reference
        genomes. Output a fasta file and a mapping file.
    :rtype : None
    """
    # input fasta file
    fastaFilePath = '/net/metagenomics/projects/PPSmg/data/V35/contigs_1000.txt'  #contigs_1000.txt
    seqIdToSeq = fas.fastaFileToDict(fastaFilePath)

    # contigs mapped to genome names
    nameLabelsFilePath = '/net/metagenomics/projects/PPSmg/data/V35/contigs_1000_blast_labels.txt'  #contigs_1000_blast_labels.txt
    seqIdToNameLabels = csv.getMapping(nameLabelsFilePath,
                                       0,
                                       1,
                                       sep='\t',
                                       comment='#')

    # mapping: genome name -> taxon id
    genomeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_list2.txt'  #genome_list.txt
    nameLabelToNcbid = csv.getMapping(genomeListFilePath,
                                      0,
                                      2,
                                      sep=';',
                                      comment='#')

    # to store mapped sequences
    outFastaFilePath = '/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000.fna'  #contigsMappedBlast1000.fna
    outFasta = csv.OutFileBuffer(outFastaFilePath)
    # to store the taxonomic mapping of the mapped sequences
    outLabelsFilePath = '/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000Labels.txt'  #contigsMappedBlast1000Labels.txt
    outLabels = csv.OutFileBuffer(outLabelsFilePath)

    for seqId in seqIdToSeq:
        if seqId in seqIdToNameLabels:
            outFasta.writeText('>' + str(seqId) + '\n' + seqIdToSeq[seqId] + '\n')

    outFasta.close()
    print 'fasta created'

    for seqId in seqIdToSeq:
        if seqId in seqIdToNameLabels:
            nameLabel = seqIdToNameLabels[seqId][0]
            ncbid = nameLabelToNcbid[nameLabel][0]
            outLabels.writeText(str(seqId) + '\t' + str(ncbid) + '\n')

    outLabels.close()
    print 'labels created'
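The csv helpers used above come from the project's own module, not the standard library, and their implementations
are not shown here. A minimal stand-in for getMapping, assuming it returns a dict from the key column to the list of
value-column entries (which is how all examples below index its result), might look like this:

def getMapping(filePath, keyCol, valCol, sep='\t', comment='#'):
    """Hypothetical sketch of the assumed csv.getMapping contract."""
    mapping = {}
    with open(filePath) as f:
        for line in f:
            line = line.rstrip('\n')
            if not line or line.startswith(comment):
                continue  # skip empty and comment lines
            tokens = line.split(sep)
            mapping.setdefault(tokens[keyCol], []).append(tokens[valCol])
    return mapping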
Example #2
def scafToContigOutput(scaffContigMapFile, scaffPPSOutFile, contigPPSOutFile):
    """
        Takes scaffold-contigs mapping and scaffold placement (.out file), outputs contigs placement (.out file)

        @param scaffContigMapFile: tab separated scaffold-contigs mapping (scaffoldName \t contigName)
        @param scaffPPSOutFile: scaffold predictions (PPS output file)
        @param contigPPSOutFile: contigs predictions (as if it was a PPS output file)
    """
    # init output
    out = csv.OutFileBuffer(contigPPSOutFile)

    # read scaffold predictions
    scaffNameToTaxonId = csv.predToDict(scaffPPSOutFile)

    # read mapping: scaffName -> contigNameList
    scaffNameToContigNameList = csv.getMapping(scaffContigMapFile, 0, 1, sep='\t')

    # store contigs' predictions (according to scaffolds' predictions)
    for scaffName, contigNameList in scaffNameToContigNameList.iteritems():
        taxonId = scaffNameToTaxonId.get(scaffName, 1)  # scaffolds without a prediction default to the root
        for contigName in contigNameList:
            out.writeText(contigName + '\t' + str(taxonId) + '\n')
    out.close()
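A minimal usage sketch (file names hypothetical). Given a mapping file with lines such as "scaff1\tcontig7" and a
PPS .out file with lines such as "scaff1\t543", every contig inherits the taxon id predicted for its scaffold:

scafToContigOutput('scaff_contig_map.tsv', 'scaffolds.pps.out', 'contigs.pps.out')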
Example #3
def toContigsLabelList(inFastaFileName, readsF, readsR, readOnContig,
                       community, outMappingFileName):
    """
        Gets mapping from contigIds to lists of taxonIds of individual reads of the contigs.

        @param inFastaFileName:
        @param readsF:
        @param readsR:
        @param readOnContig:
        @param community:
        @param outMappingFileName:
    """
    # contigIds
    contigIdToBp = fas.getSequenceToBpDict(inFastaFileName)

    # map: contigId -> list of readIds
    contigIdToReadList = csv.getMapping(readOnContig,
                                        1,
                                        0,
                                        sep='\t',
                                        comment='r')

    # taxonIds as a list for reads
    readFTaxonIdList = getReadsTaxonIdList(readsF, community)
    print 's1'
    readRTaxonIdList = getReadsTaxonIdList(readsR, community)
    print 's2'

    if len(readFTaxonIdList) != len(readRTaxonIdList):
        print('toContigsLabels: different number of reads in the reads files, exit')
        return

    for i in range(1, len(readFTaxonIdList)):
        if readFTaxonIdList[i] != readRTaxonIdList[i]:
            print(
                'toContigsLabels: at index %s different taxon ids %s and %s' %
                (i, readFTaxonIdList[i], readRTaxonIdList[i]))
        if readFTaxonIdList[i] is None or readRTaxonIdList[i] is None:
            print('toContigsLabels: at index %s, one is None %s or %s' %
                  (i, readFTaxonIdList[i], readRTaxonIdList[i]))
    print 's3'
    #
    out = csv.OutFileBuffer(outMappingFileName)
    for contigId in contigIdToBp:
        try:
            readList = contigIdToReadList[contigId]
            taxonIdList = []
            for readId in readList:
                taxonIdList.append(readFTaxonIdList[int(readId)])
            out.writeText(
                str(contigId) + '\t' + ','.join(map(str, taxonIdList)) + '\n')
        except KeyError:
            print("No label for contigId: %s" % contigId)
    out.close()
    print 's4'
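If a single label per contig is needed downstream, a majority vote over the per-read taxon id lists written above is
a natural post-processing step; a small sketch:

from collections import Counter

def majorityLabel(taxonIdList):
    """Returns the most frequent taxon id among a contig's reads (ties broken arbitrarily)."""
    return Counter(taxonIdList).most_common(1)[0][0]

# e.g. majorityLabel([562, 562, 561]) returns 562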
Example #4
def mergeSequences(mapFilePathList, fastaFilePathList, outputDir):
    """
        Reads all sequences. For each taxonId creates a file that contain all sequences
        mapped to this taxonId. If a seqId appears more than one it is ignored since
        acession numbers are unique.

        @param mapFilePathList: list of files where each contain mapping: seqId -> taxonId
        @param fastaFilePathList: list of fasta files that contain mapping: seqId -> seq
    """
    taxonIdToOutBuffer = {}
    seqIdSet = set()

    totalSeqCount = 0
    totalStoredSeqCount = 0
    totalIdenticalSeqCount = 0

    for mapFilePath, fastaFilePath in zip(mapFilePathList, fastaFilePathList):
        print 'processing', mapFilePath, fastaFilePath
        seqCount = 0
        storedSeqCount = 0

        seqIdToSeq = fasta.fastaFileToDict(fastaFilePath)
        seqIdToNcbidList = csv.getMapping(mapFilePath, 0, 1, sep='\t', comment='#')

        for seqId, seq in seqIdToSeq.iteritems():
            seqCount += 1
            if seqId in seqIdSet:
                totalIdenticalSeqCount += 1
                continue
            else:
                seqIdSet.add(seqId)

            taxonId = seqIdToNcbidList[seqId][0]

            if taxonId not in taxonIdToOutBuffer:
                outBuffer = csv.OutFileBuffer(os.path.join(outputDir, str(str(taxonId) + '.fna')))
                taxonIdToOutBuffer[taxonId] = outBuffer

            taxonIdToOutBuffer[taxonId].writeText(str('>' + seqId + '\n' + seq + '\n'))
            storedSeqCount += 1

            if len(common.noNewLine(seq).replace('N', '')) == 0:
                print 'zeros', seqId, fastaFilePath, len(common.noNewLine(seq))


        print 'totalSeq, storedSeq', seqCount, storedSeqCount
        totalSeqCount += seqCount
        totalStoredSeqCount += storedSeqCount


    # close all output buffers only after all input files have been processed
    for buff in taxonIdToOutBuffer.values():
        buff.close()

    print 'totalSeqCount, totalStoredSeqCount, totalIdenticalSeqCount', totalSeqCount, totalStoredSeqCount, totalIdenticalSeqCount

    print 'sequences merged'
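OutFileBuffer is used in all of these examples; a minimal stand-in capturing the assumed contract (text is buffered
by writeText and written out on close) could look like this:

class OutFileBuffer(object):
    """Hypothetical sketch of the assumed csv.OutFileBuffer contract."""
    def __init__(self, filePath):
        self._filePath = filePath
        self._parts = []

    def writeText(self, text):
        self._parts.append(text)  # buffer in memory

    def close(self):
        with open(self._filePath, 'w') as f:
            f.write(''.join(self._parts))  # flush everything on close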
Example #5
def toWellMappedContigs(inFastaFile,
                        inTaxonomyWFile,
                        outFastaFile,
                        outFastaMisAssembledFile,
                        outTaxonomyFile,
                        weightThreshold=0.99):
    """
        Creates the fasta and mapping files that contain well assembled contigs (filter out misassembled contigs).

        @param inFastaFile: input fasta file with contigs
        @param inTaxonomyWFile: input file that contains taxonomy with weights (seqId, weight, taxonId)
        @param outFastaFile: fasta file containing well assembled sequences
        @param outFastaMisAssembledFile: fasta file containing misassembled contigs
        @param outTaxonomyFile: resulting taxonomy of the well assembled sequences (seqId, taxonId)
        @param weightThreshold: only contigs whose weight is at least this value are taken
        @return: statistics
    """
    seqIdToTaxonId = csv.predToDict(inTaxonomyWFile)
    seqIdToWeight = csv.getMapping(inTaxonomyWFile, 0, 1, '\t')
    outFastaOk = csv.OutFileBuffer(outFastaFile)
    outFastaMis = csv.OutFileBuffer(outFastaMisAssembledFile)
    outTaxonomyOk = csv.OutFileBuffer(outTaxonomyFile)

    totalBp = 0.0
    totalCount = 0.0
    okBp = 0.0
    okCount = 0.0
    avgSumBp = 0.0

    for seqId, seq in fas.fastaFileToDictWholeNames(inFastaFile).iteritems():
        bp = len(seq)
        totalBp += bp
        totalCount += 1
        seqIdPrefix = str(seqId).split(' ')[0]
        weight = seqIdToWeight[seqIdPrefix][0]
        fastaEntry = '>' + str(seqIdPrefix) + '\n' + str(seq) + '\n'
        if float(weight) >= weightThreshold:
            outFastaOk.writeText(fastaEntry)
            outTaxonomyOk.writeText(
                str(seqIdPrefix) + '\t' + str(seqIdToTaxonId[seqIdPrefix]) +
                '\n')
            okBp += bp
            okCount += 1
            avgSumBp += getCoverage(seqId) * bp
        else:
            outFastaMis.writeText(fastaEntry)

    outFastaOk.close()
    outFastaMis.close()
    outTaxonomyOk.close()

    return 'Taken: %s/%sMB, %s/%sseq, %s%% bp %s%% seq, avg coverage %s' % (
        round(okBp / 1000000, 2), round(totalBp / 1000000, 2),
        okCount, totalCount,
        round((okBp / totalBp) * 100, 2),
        round((okCount / totalCount) * 100, 2),
        round(avgSumBp / okBp, 3))
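A usage sketch (paths hypothetical; getCoverage is an external helper that extracts a coverage value from a sequence
id, so the reported average coverage depends on the id format):

stats = toWellMappedContigs('contigs.fna', 'taxonomy_w.tsv', 'contigs_ok.fna',
                            'contigs_mis.fna', 'taxonomy_ok.tsv', weightThreshold=0.99)
print(stats)  # e.g. 'Taken: 12.3/45.6MB, 1000/2000seq, 26.97% bp 50.0% seq, avg coverage 8.123'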
Example #6
def filterSequences():
    """
        To filter sequences with a specific label.
    """
    inFileName = '/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000.fna'
    outFileName = '/net/metagenomics/projects/PPSmg/data/V35/nostocRemoved/contigsMappedBlast1000NostocRm.fna'
    mapFileName = '/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000Labels.txt'
    labelRemove = 103690
    # label -> seq ids
    labelToIdsDict = csv.getMapping(mapFileName, 1, 0, sep='\t', comment='#')
    allowedNamesSet = set()
    for i in labelToIdsDict:
        if int(i) != int(labelRemove):
            for j in labelToIdsDict[i]:
                allowedNamesSet.add(j)

    fas.filterOutSequences(inFileName, outFileName, allowedNamesSet)
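The label to remove is hard-coded above; a parametrized variant is a natural refactor (a sketch, assuming
fas.filterOutSequences keeps exactly the sequences whose ids are in the allowed set, as the call above suggests):

def filterSequencesByLabel(inFileName, outFileName, mapFileName, labelRemove):
    """Keeps all sequences except those mapped to labelRemove (hypothetical helper)."""
    labelToIdsDict = csv.getMapping(mapFileName, 1, 0, sep='\t', comment='#')
    allowedNamesSet = set()
    for label, seqIds in labelToIdsDict.items():
        if int(label) != int(labelRemove):
            allowedNamesSet.update(seqIds)
    fas.filterOutSequences(inFileName, outFileName, allowedNamesSet)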
Example #7
def samToMap(samFile, accToNcbiFile, outMapFile):
    """

        @param samFile: sam file from an assembler
        @param accToNcbiFile: mapping: accessions -> ncbi taxon ids
        @param outMapFile: output file or directory
    """
    accToNcbi = csv.getMapping(accToNcbiFile, 0, 1, sep='\t')
    contigToAcc = parseSam(samFile)
    out = csv.OutFileBuffer(outMapFile)
    for contigId, acc in contigToAcc.iteritems():
        taxonId = accToNcbi.get(acc, None)
        if taxonId is None:
            print("No mapping for %s %s" % (contigId, acc))
        else:
            out.writeText(contigId + '\t' + taxonId[0] + '\n')
    out.close()
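A minimal usage sketch (file names hypothetical; parseSam is assumed to return a dict: contigId -> accession):

samToMap('assembly.sam', 'acc_to_ncbi.tsv', 'contig_to_taxon.tsv')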
Example #8
def getFirstLabelAtAllowedRank():
    rank = 'species'  # !!!!!!!

    predFile1 = '/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000Labels.txt'
    predFile2 = '/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000LabelsSpecies.txt'
    seqIdToLabel = csv.getMapping(predFile1, 0, 1, sep='\t', comment='#')
    outPred = csv.OutFileBuffer(predFile2)

    taxonomy = tax.TaxonomyNcbi('/net/metagenomics/projects/PPSmg/data/nobackup/NCBI20120828/ncbiTax/ncbitax_sqlite.db')

    for seqId in seqIdToLabel:
        ncbid = int(seqIdToLabel[seqId][0])
        while not taxonomy.isRankNcbidAllowed(ncbid):
            ncbid = taxonomy.getParentNcbid(ncbid)
        outPred.writeText(str(seqId) + '\t' + str(ncbid) + '\n')

    taxonomy.close()
    outPred.close()
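The while loop above climbs the NCBI taxonomy until it reaches a node whose rank is allowed; extracted as a reusable
helper (a sketch against the tax.TaxonomyNcbi methods used above):

def firstAllowedAncestor(taxonomy, ncbid):
    """Walks from ncbid towards the root until a node with an allowed rank is found."""
    while not taxonomy.isRankNcbidAllowed(ncbid):
        ncbid = taxonomy.getParentNcbid(ncbid)
    return ncbid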
Example #9
def _main():
    # define arguments
    parser = argparse.ArgumentParser(
        description='Default task: PPS+ evaluation', epilog='')

    parser.add_argument(
        '-b',
        '--cont-binning-file',
        nargs=1,
        type=file,
        required=True,
        help='Binning file containing labels assigned to contigs.',
        metavar='assignments.csv',
        dest='b')

    parser.add_argument(
        '-t',
        '--cont-true-binning-file',
        nargs=1,
        type=file,
        required=True,
        help='Binning file containing true labels for the contigs.',
        metavar='labels.csv',
        dest='t')

    parser.add_argument('-f',
                        '--cont-contigs-file-listing',
                        nargs=1,
                        type=file,
                        required=False,
                        help='A list of paths of FASTA contigs files.',
                        metavar='fasta_listing.txt',
                        dest='f')

    parser.add_argument('-m',
                        '--cont-scaffold-contig-mapping',
                        nargs=1,
                        type=file,
                        required=False,
                        help='Scaffold contig mapping, tab separated.',
                        metavar='mapping.csv',
                        dest='m')

    parser.add_argument(
        '-n',
        '--cont-ncbi-taxonomy',
        nargs=1,
        required=False,
        help='Directory containing the NCBI names.dmp and nodes.dmp files.',
        metavar='taxonomy_dir',
        dest='n')

    parser.add_argument('-o',
                        '--cont-output-dir',
                        nargs=1,
                        required=True,
                        help='Output directory.',
                        metavar='output_dir',
                        dest='o')

    parser.add_argument(
        '-j',
        '--default-job',
        nargs='+',
        help='What task/job should be performed (p~precision/recall, s~scaff-contig consistency, '
             'c~confusion tables; if not specified, compute all)',
        metavar='',
        dest='j')

    args = parser.parse_args()

    # read and check the arguments
    seqIdToBp = None
    scaffToContig = None
    binning = None
    trueBinning = None
    outputDir = None
    job = None

    if args.o and len(args.o) == 1 and os.path.isdir(args.o[0]):
        outputDir = args.o[0]

    if args.b and len(args.b) == 1 and os.path.isfile(args.b[0].name):
        binningFile = args.b[0].name
        binning = cami.readAssignments(binningFile)

    if args.t and len(args.t) == 1 and os.path.isfile(args.t[0].name):
        trueBinningFile = args.t[0].name
        trueBinning = cami.readAssignments(trueBinningFile)

    if args.f and len(args.f) == 1 and os.path.isfile(args.f[0].name):
        seqIdToBp = fasta.getSequenceToBpDict(args.f[0].name)

        # contigsFileListing = args.f[0].name
        # for line in open(contigsFileListing):
        #     if os.path.isfile(line.strip()):
        #         d = fasta.getSequenceToBpDict(line.strip())
        #         if seqIdToBp is None:
        #             seqIdToBp = d
        #         else:
        #             count = len(d) + len(seqIdToBp)
        #             seqIdToBp.update(d)
        #             if count > len(seqIdToBp):
        #                 sys.stderr.write('The fasta files contain duplicate entries!')

    if args.m and len(args.m) == 1 and os.path.isfile(args.m[0].name):
        scaffoldContigMapping = args.m[0].name
        scaffToContig = csv.getMapping(scaffoldContigMapping, 0, 1, '\t')

    taxonomyPath = os.path.join(outputDir, 'taxonomy_ncbi.db')
    if not os.path.isfile(taxonomyPath):
        if args.n and len(args.n) == 1 and os.path.isdir(args.n[0]):
            # build the ncbi taxonomy in the case it doesn't exist
            ncbitax2sqlite.build_database(Args(db=taxonomyPath, dmp=args.n[0]))
        else:
            taxonomyPath = None

    if args.j and len(args.j) > 0 and len(set(args.j).intersection(set(['p', 's', 'c']))) > 0:
        job = set(args.j)

    # print job
    # print args.j
    # print len(seqIdToBp)
    # print len(binning)
    # print len(trueBinning)
    # print taxonomyPath
    # print outputDir

    if (job is None or 'p' in args.j) and seqIdToBp and binning and trueBinning and taxonomyPath and outputDir:
        print('Computing precision/recall')
        # precision/recall - no correction
        acc = accuracy.Accuracy(seqIdToBp, binning, trueBinning, taxonomyPath)
        out = csv.OutFileBuffer(os.path.join(outputDir,
                                             'precision_recall.csv'))
        out.writeText(
            acc.getAccuracyPrint(RANKS, MIN_FRAC_CLADE, MIN_FRAC_CLADE))
        out.close()
        acc.close()

        # precision/recall - with correction
        acc = accuracy.Accuracy(seqIdToBp, binning, trueBinning, taxonomyPath,
                                CORRECT_LABEL_THRESHOLD)
        out = csv.OutFileBuffer(
            os.path.join(outputDir, 'precision_recall_correction.csv'))
        out.writeText(
            acc.getAccuracyPrint(RANKS, MIN_FRAC_CLADE, MIN_FRAC_CLADE))
        out.close()
        acc.close()

    # compute confusion matrices
    if (job is None or 'c' in args.j) and seqIdToBp and binning and trueBinning and taxonomyPath and outputDir:
        print('Computing confusion matrices')
        confusionMatrix = confusion_matrix.ConfusionMatrix(
            seqIdToBp, binning, trueBinning, taxonomyPath, RANKS)
        for rank in RANKS:
            confusionMatrix.generateConfusionMatrix(
                rank, os.path.join(outputDir, 'confusion_matrix'))
        confusionMatrix.close()

    # compute scaffold contig consistency
    if (job is None or 's' in args.j) and seqIdToBp and binning and scaffToContig and taxonomyPath \
            and outputDir:
        print('Computing scaffold-contig consistency')
        cons = consistency.Consistency(seqIdToBp, binning, scaffToContig,
                                       taxonomyPath)
        out = csv.OutFileBuffer(os.path.join(outputDir, 'consistency.txt'))
        out.writeText(cons.getGroupedScaffoldsPrint())
        cons.close()
        out.close()

    createEvalMetaFile(outputDir)
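An invocation sketch, following the commented command-line examples used elsewhere in this listing (script and file
names are hypothetical; -j restricts the run to a subset of the jobs):

# python pps_eval.py -b assignments.csv -t labels.csv -f contigs.fna -n taxonomy_dir -o output_dir -j p c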
Example #10
    def _init(self, align=True, dm=True, cluster=True):
        """
            Init data, compute: alignment, distance matrix, clusters.
        """
        if self._initDone:
            return
        self._initDone = True

        fastaPathList = [] # fasta files containing regions that correspond to particular marker genes
        self._mgList = [] # list of names of marker genes
        mgToFastaPath = dict([]) # marker gene name -> fasta file path

        #collect regions from Amphora mg
        for fastaFile in glob.glob(os.path.join(os.path.normpath(self._mgWorkingDir),'*.gff')):
            fastaPathList.append(fastaFile)
        for path in fastaPathList:
            name = re.sub(r'([^\.]+)\..*$', r'\1', os.path.basename(path))
            mg = re.sub(r'([^_]+)_dna', r'\1', name)
            dir = os.path.dirname(path)
            self._mgList.append(mg)
            mgToFastaPath[mg] = path

        #add 16S
        s16List = ['5S_rRNA', '16S_rRNA', '23S_rRNA']
        for mg in s16List:
            mgToFastaPath[mg] = str(self._s16Prefix + '.' + mg + '.fna')
            self._mgList.append(mg)

        #For each marker gene create filtered fasta file that contains for each mg and sequence at most one region.
        mgToFilteredFastaPath = dict([])
        mgToSeqNameToTaxPathDict = dict([]) #mg -> seqName (~region name) -> pred
        for mg in self._mgList:
            mgToSeqNameToTaxPathDict[mg] = dict([])

        for seq in self._sequences.sequences:
            id = str(str(seq.scaffold.id) + '_' + str(seq.id))
            for mg,tag,pred in zip(seq.getCandidateTaxPathSourceList(), seq.getCandidateTaxPathTagList(),
                                    seq.getCandidateTaxPathDictList()):
                mgToSeqNameToTaxPathDict[mg][str(id + '_' + tag)] = pred

        #for each marker gene: choose only one sequence region for each mg and sequence
        #all sequences are predicted at least at superkingdom
        for mg in self._mgList:
            seqNameToPred = mgToSeqNameToTaxPathDict[mg] #sequence region predictions for this mg
            seqNameToSeq = fastaFileToDict(mgToFastaPath[mg]) #read the fasta file
            outPath = os.path.normpath(os.path.join(self._clustDir, str(mg + '.filter.fna')))
            mgToFilteredFastaPath[mg] = outPath
            out = OutFileBuffer(outPath)
            seqBaseToSeqName = dict([]) # sequence base (scaffId_seqId) -> region name
            for seqName in seqNameToSeq:
                seqBase = re.sub(r'^([0-9]+_[0-9]+)[^0-9].*',r'\1', seqName)
                if seqBase not in seqBaseToSeqName:
                    seqBaseToSeqName[seqBase] = []
                seqBaseToSeqName[seqBase].append(seqName)
            for seqBase in seqBaseToSeqName:
                seqId = int(re.sub(r'^[0-9]+_([0-9]+)',r'\1', seqBase))
                seqBaseTaxPathDict = self._sequences.getSequence(seqId).getTaxonomyPath()
                seqNameList = seqBaseToSeqName[seqBase]
                candidateSeq = []  # sequence region is predicted at least at rank superkingdom
                for seqName in seqNameList:
                    if seqName not in seqNameToPred:
                        taxPathDict = None
                    else:
                        taxPathDict = seqNameToPred[seqName]
                    if taxPathDict != None:
                        candidateSeq.append(seqName)
                if len(candidateSeq) == 0:
                    continue
                candidateSeq2 = [] # sequence regions predicted at least at the same rank as the whole sequence
                for seqName in candidateSeq:
                    taxPathDict = seqNameToPred[seqName]
                    if ((seqBaseTaxPathDict == None)
                            or (len(taxPathDict) >= len(seqBaseTaxPathDict))):  #predict at least at the same level
                        candidateSeq2.append(seqName)
                if len(candidateSeq2) > 0: #take the longest sequence
                    sMax = candidateSeq2[0]
                    for s in candidateSeq2[1:]:
                        if len(seqNameToSeq[s]) > len(seqNameToSeq[sMax]):
                            sMax = s
                else: #all sequence regions are predicted higher than the sequence
                    sMax = candidateSeq[0] #sequence region with the most specific prediction
                    for s in candidateSeq[1:]:
                        taxPathDictMax = seqNameToPred[sMax]
                        taxPathDictS = seqNameToPred[s]
                        if taxPathDictS == None:
                            continue
                        if taxPathDictMax == None:
                            sMax = s
                            continue
                        if len(taxPathDictMax) < len(taxPathDictS):
                            sMax = s

                    candidateSeq3 = [] #get all sequence regions with the most specific prediction
                    taxPathDictMax = seqNameToPred[sMax]
                    for s in candidateSeq:
                        taxPathDictS = seqNameToPred[s]
                        if taxPathDictMax == None:
                            candidateSeq3.append(s)
                        elif len(taxPathDictS) == len(taxPathDictMax):
                            candidateSeq3.append(s)
                    sMax = candidateSeq3[0]
                    for s in candidateSeq3[1:]: #take the longest sequence
                        if len(seqNameToSeq[sMax]) < len(seqNameToSeq[s]):
                            sMax = s

                out.writeText('>' + str(sMax) + '\n' + str(seqNameToSeq[sMax]) + '\n')

            out.close()

        mgToAlignPath = dict([])
        for mg in self._mgList:
            mgToAlignPath[mg] = os.path.normpath(os.path.join(self._clustDir, str(mg + '.align.fna')))

        #build alignment
        if align:
            for mg in self._mgList:
                alignCmd = str(self._config.get('aligner') + ' -in ' + mgToFilteredFastaPath[mg]
                               + ' -out ' + mgToAlignPath[mg] + ' -quiet')
                assert os.name == 'posix'
                predictProc = subprocess.Popen(alignCmd, cwd=self._mgWorkingDir, shell=True, bufsize=-1) #stdout=subprocess.STDOUT, stderr=subprocess.STDOUT)
                predictProc.wait()
                print 'Muscle return code for', mg, ':', predictProc.returncode
                if predictProc.returncode != 0:
                    sys.stderr.write(str(alignCmd + ' \n'))

        #compute DM
        if dm:
            for mg in self._mgList:
                mothur = os.path.join(os.path.normpath(self._configRRNA16S.get('mothurInstallDir')), 'mothur')
                mothurCmd = str('time ' + mothur + ' "#dist.seqs(fasta=' + mgToAlignPath[mg]
                                + ', processors=2, countends=F, calc=nogaps, cutoff=0.3, output=lt)"')
                assert os.name == 'posix'
                mothurProc = subprocess.Popen(mothurCmd, shell=True, bufsize=-1, cwd=self._mgWorkingDir)
                mothurProc.wait()
                print 'Mothur return code dist:', mg, mothurProc.returncode
                #distFilePath = os.path.join(os.path.dirname(mgToAlignPath[mg]), str(mg + '.align.phylip.dist'))
                #self._mgToDM[mg] = forEachLine(distFilePath, DM())
                #self._mgToDM[mg].printDM()

        #cluster
        if cluster:
            for mg in self._mgList:
                distFilePath = os.path.join(os.path.dirname(mgToAlignPath[mg]), str(mg + '.align.phylip.dist'))
                mothur = os.path.join(os.path.normpath(self._configRRNA16S.get('mothurInstallDir')), 'mothur')
                mothurCmd = str('time ' + mothur + ' "#cluster(phylip=' + distFilePath
                                + ', method=furthest, hard=t, precision=1000)"')
                assert os.name == 'posix'
                mothurProc = subprocess.Popen(mothurCmd, shell=True, bufsize=-1, cwd=self._mgWorkingDir)
                mothurProc.wait()
                print 'Mothur return code cluster:', mg, mothurProc.returncode

        #read DM and clusters

        #sequence predictions
        self._seqIdToTaxPathDict = dict([])
        self._seqIdToWeight = dict([])
        for seq in self._sequences.sequences:
            id = int(seq.id)
            self._seqIdToTaxPathDict[id] = seq.getTaxonomyPath()
            self._seqIdToWeight[id] = seq.getTaxonomyPathWeight()

        #similarity thresholds
        thresholds = self._configMG.get('mgSimilarityThresholds')
        self._mgToMaxThreshold = dict([])
        tmpDict = getMapping(thresholds, 0, 1, sep='\t', comment='#')
        for k in tmpDict:
            self._mgToMaxThreshold[k] = float(tmpDict[k][0])

        self._mgToDM = dict([])
        self._mgToCluster = dict([])
        for mg in self._mgList:
            file = os.path.join(os.path.dirname(mgToAlignPath[mg]), str(mg + '.align.phylip.dist'))
            self._mgToDM[mg] = forEachLine(file, DM())
            file = os.path.join(os.path.dirname(mgToAlignPath[mg]), str(mg + '.align.phylip.fn.list'))
            self._mgToCluster[mg] = forEachLine(file, MCluster(self._seqIdToTaxPathDict, self._mgToMaxThreshold[mg]))
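The longest-region selection loops above can be written with the built-in max, which also keeps the first candidate
on ties; an equivalent sketch for the candidateSeq2 branch:

sMax = max(candidateSeq2, key=lambda s: len(seqNameToSeq[s]))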
Example #11
def _main():
    """ See the module description."""
    parser = argparse.ArgumentParser(description=__doc__, epilog="""""")

    parser.add_argument('-i', '--input-data-dir', action='store', nargs=1, required=True,
        help="""Directory that contains fasta files and corresponding mapping files, for each "*.tax" (or "*.csv")
                 file there must be a "*.fna" file with the same name. All files with suffix "tax" (or "*.csv")
                 will be considered. (Takes only Bacteria and Archaea)""",
        metavar='input_dir',
        dest='inDir')

    parser.add_argument('-o', '--output-dir', action='store', nargs=1, required=True,
        help='Directory that contains the output files.',
        metavar='out_dir',
        dest='outDir')

    parser.add_argument('-s', '--source-type', required=True, nargs=1, choices=["s","a"],
        help='To determine the source, use "s" for the Silva database and "a" for the Amphora database.',
        dest='srcType')

    parser.add_argument('-t', '--taxonomy-file', nargs=1, type=file, required=True,
        help='NCBI taxonomy database file in the sqlite3 format.', metavar='ncbitax_sqlite.db',
        dest='taxonomy')

    parser.add_argument('-n', '--not-considered-taxonIds', action='store', nargs=1,
        help='Comma separated leaf level or top level taxonIds (as a string) that will be filtered out. (optional)',
        metavar='"2759,10239,77133,155900,408172,32644, 408170,433727,749907,556182,702656,410661,652676,410659,797283'\
                ',408171,703336,256318,32630,433724,766747,488339,942017,1076179,717931,455559,527640,904678,552539,'\
                '54395,198431,358574,415540,511564,369433,380357,81726,198834,271928,311313,2759,749906,1077529,'\
                '1077529,361146,511563,361147"',
        dest='filterOut')

    # parse arguments
    args = parser.parse_args()
    inDir = args.inDir[0]
    outDir = args.outDir[0]
    srcType = args.srcType[0]
    filterOutTaxonIdsSet = set()
    try:
        if args.filterOut:
            filterOutTaxonIdsSet.update(set(map(int, str(args.filterOut[0]).split(','))))
    except ValueError:
        print('Taxon ids that are to be filtered out are in a wrong format! Comma separated integers are needed!')
        raise

    taxonomy = TaxonomyWrap(args.taxonomy[0].name)
    for dirPath in [inDir, outDir]:
        assert os.path.isdir(dirPath), 'Path: "' + dirPath + '" does not exist!'

    # create db for each gene
    mapDict = {}  # map: seqId -> ncbid
    for mapFilePath in glob.glob(os.path.join(os.path.normpath(inDir), r'*.[ct][sa][vx]')):  # *.csv or *.tax

        assert mapFilePath.endswith(('.csv', '.tax')), \
            'The mapping files can either end with .csv or .tax ' + mapFilePath

        base = os.path.basename(mapFilePath).rsplit('.', 1)[0]  # cut out dir path and suffix
        fastaDict = fas.fastaFileToDict(os.path.join(os.path.dirname(mapFilePath), (base + '.fna'))) # map: seqId -> seq
        print("Processing: %s seq count: %s" % (base, str(len(fastaDict))))

        if 'a' in srcType:  # Amphora
            mapDict = {}
            for k in csv.getColumnAsList(mapFilePath, colNum=0, sep='\t'):
                v = int(k.rsplit('|', 1)[1].split(':')[1])  # get ncbid
                assert ((k not in mapDict) or (mapDict[k] == v)), str(
                    'There are at least two different values for key: ' + str(k) + ' in ' + mapFilePath)
                mapDict[k] = v
        elif 's' in srcType:  # Silva
            mapTmp = csv.getMapping(mapFilePath, 0, 2, '\t')
            mapDict = {}
            for k, v in mapTmp.iteritems():
                mapDict[k] = int(v[0])
        else:
            assert False, 'Unsupported source type!'

        # same number of entries in both files (fasta and mapping) ?
        if len(mapDict) != len(fastaDict):
            print(str('%s: The mapping file and the corresponding fasta file have different number of entries: ' +
                      '"%s" "%s" these files will be skipped!') % (base, str(len(mapDict)), str(len(fastaDict))))
            continue

        # are duplicates in the mapping file ?
        count = len(csv.getColumnAsList(mapFilePath))
        if len(mapDict) != count:
            print('%s: The mapping file contained duplicates! unique: %s non-unique: %s' % (
                base, str(len(mapDict)), str(count)))

        # store data to the output directory
        outDna = csv.OutFileBuffer(os.path.join(outDir, str(base + '.fna')))
        outTax = csv.OutFileBuffer(os.path.join(outDir, str(base + '.tax')))
        count = 0
        filteredLeaf = 0
        filteredSup = 0
        notMapped = 0
        noBacArch = 0
        for seqId, taxonId in mapDict.iteritems():
            if taxonId in filterOutTaxonIdsSet:
                filteredLeaf += 1
                continue
            path = taxonomy.getPathToRoot(taxonId)
            if path is None:
                print('Could not find: %s for seqId: %s record skipped!' % (str(taxonId), seqId))
                notMapped += 1
                continue
            topLevel = int(path.split(';', 1)[0])
            if topLevel in filterOutTaxonIdsSet:
                filteredSup += 1
                continue
            if topLevel not in [2, 2157]:  # Bacteria, Archaea
                noBacArch += 1
                print('NoBactArch: ', topLevel)

            seq = fastaDict[seqId]
            if 'a' in srcType:  # Amphora
                id = seqId
            elif 's' in srcType:  # Silva
                id = str(seqId + '|ncbid:' + str(taxonId))

            outTax.writeText(str(id + '\t' + path + '\n'))
            outDna.writeText(str('>' + id + '\n' + seq + '\n'))
            count += 1

        outDna.close()
        outTax.close()
        print('Stored entries: %s filtered out: %s leaf, %s top level, not mapped: %s' %
              (count, filteredLeaf, filteredSup, notMapped))
        if noBacArch > 0:
            print('WARN: stored %s non-Bacterial and non-Archaeal sequences' % noBacArch)

        # Silva:
        #-i /Users/ivan/Documents/work/binning/database/silva111/arbGenerated -s s -t /Users/ivan/Documents/work/binning/taxonomy/20121122/ncbitax_sqlite.db
        # -o /Users/ivan/Documents/work/binning/database/silva111/db -n ...

        # Amphora
        # -i /Users/ivan/Documents/work/binning/database/markerGenes3/mGenesExtracted -s a -t /Users/ivan/Documents/work/binning/taxonomy/20121122/ncbitax_sqlite.db
        # -o /Users/ivan/Documents/work/binning/database/markerGenes3/db

    taxonomy.close()
    print 'done'
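The Amphora branch above extracts the taxon id from the sequence id itself; for a concrete (hypothetical) id the
parsing works like this:

k = 'someGene|someSeq|ncbid:511145'  # hypothetical Amphora-style id
v = int(k.rsplit('|', 1)[1].split(':')[1])  # rsplit -> 'ncbid:511145', split -> '511145', v == 511145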
Пример #19
0
    def _init(self, align=True, dm=True, cluster=True):
        """
            Init data, compute: alignment, distance matrix, clusters.
        """
        if self._initDone:
            return
        self._initDone = True

        fastaPathList = [
        ]  # fasta files containing regions that correspond to particular marker genes
        self._mgList = []  # list of names of marker genes
        mgToFastaPath = dict([])  # marker gene name -> fasta file path

        #collect regions from Amphora mg
        for fastaFile in glob.glob(
                os.path.join(os.path.normpath(self._mgWorkingDir), '*.gff')):
            fastaPathList.append(fastaFile)
        for path in fastaPathList:
            name = re.sub('([^\.]+)\..*$', r'\1', os.path.basename(path))
            mg = re.sub(r'([^_]+)_dna', r'\1', name)
            dir = os.path.dirname(path)
            self._mgList.append(mg)
            mgToFastaPath[mg] = path

        #add 16S
        s16List = ['5S_rRNA', '16S_rRNA', '23S_rRNA']
        for mg in s16List:
            mgToFastaPath[mg] = str(self._s16Prefix + '.' + mg + '.fna')
            self._mgList.append(mg)

        #For each marker gene create filtered fasta file that contains for each mg and sequence at most one region.
        mgToFilteredFastaPath = dict([])
        mgToSeqNameToTaxPathDict = dict(
            [])  #mg -> seqName (~region name) -> pred
        for mg in self._mgList:
            mgToSeqNameToTaxPathDict[mg] = dict([])

        for seq in self._sequences.sequences:
            id = str(str(seq.scaffold.id) + '_' + str(seq.id))
            for mg, tag, pred in zip(seq.getCandidateTaxPathSourceList(),
                                     seq.getCandidateTaxPathTagList(),
                                     seq.getCandidateTaxPathDictList()):
                mgToSeqNameToTaxPathDict[mg][str(id + '_' + tag)] = pred

        #for each marker gene: choose only one sequence region for each mg and sequence
        #all sequences are predicted at least at superkingdom
        for mg in self._mgList:
            seqNameToPred = mgToSeqNameToTaxPathDict[
                mg]  #sequence region predictions for this mg
            seqNameToSeq = fastaFileToDict(
                mgToFastaPath[mg])  #read the fasta file
            outPath = os.path.normpath(
                os.path.join(self._clustDir, str(mg + '.filter.fna')))
            mgToFilteredFastaPath[mg] = outPath
            out = OutFileBuffer(outPath)
            seqBaseToSeqName = dict(
                [])  # sequence base (scaffId_seqId) -> region name
            for seqName in seqNameToSeq:
                seqBase = re.sub(r'^([0-9]+_[0-9]+)[^0-9].*', r'\1', seqName)
                if seqBase not in seqBaseToSeqName:
                    seqBaseToSeqName[seqBase] = []
                seqBaseToSeqName[seqBase].append(seqName)
            for seqBase in seqBaseToSeqName:
                seqId = int(re.sub(r'^[0-9]+_([0-9]+)', r'\1', seqBase))
                seqBaseTaxPathDict = self._sequences.getSequence(
                    seqId).getTaxonomyPath()
                list = seqBaseToSeqName[seqBase]
                candidateSeq = [
                ]  # sequence region is predicted at least at rank superkingdom
                for seqName in list:
                    if seqName not in seqNameToPred:
                        taxPathDict = None
                    else:
                        taxPathDict = seqNameToPred[seqName]
                    if taxPathDict != None:
                        candidateSeq.append(seqName)
                if len(candidateSeq) == 0:
                    continue
                candidateSeq2 = [
                ]  # sequence regions predicted at least at the same rank as the whole sequence
                for seqName in candidateSeq:
                    taxPathDict = seqNameToPred[seqName]
                    if ((seqBaseTaxPathDict == None)
                            or (len(taxPathDict) >= len(seqBaseTaxPathDict))
                        ):  #predict at least at the same level
                        candidateSeq2.append(seqName)
                if len(candidateSeq2) > 0:  #take the longest sequence
                    sMax = candidateSeq2[0]
                    for s in candidateSeq2[1:]:
                        if len(seqNameToSeq[s]) > len(seqNameToSeq[sMax]):
                            sMax = s
                else:  #all sequence regions are predicted higher than the sequence
                    sMax = candidateSeq[
                        0]  #sequence region with the most specific prediction
                    for s in candidateSeq[1:]:
                        taxPathDictMax = seqNameToPred[sMax]
                        taxPathDictS = seqNameToPred[s]
                        if taxPathDictS == None:
                            continue
                        if taxPathDictMax == None:
                            sMax = s
                            continue
                        if len(taxPathDictMax) < len(taxPathDictS):
                            sMax = s

                    candidateSeq3 = [
                    ]  #get all sequence regions with the most specific prediction
                    taxPathDictMax = seqNameToPred[sMax]
                    for s in candidateSeq:
                        taxPathDictS = seqNameToPred[s]
                        if taxPathDictMax == None:
                            candidateSeq3.append(s)
                        elif len(taxPathDictS) == len(taxPathDictMax):
                            candidateSeq3.append(s)
                    sMax = candidateSeq3[0]
                    for s in candidateSeq3[1:]:  #take the longest sequence
                        if len(seqNameToSeq[sMax]) < len(seqNameToSeq[s]):
                            sMax = s

                out.writeText(
                    str('>' + str(sMax) + '\n' + str(seqNameToSeq[sMax]) +
                        '\n'))

            out.close()

        mgToAlignPath = dict([])
        for mg in self._mgList:
            mgToAlignPath[mg] = os.path.normpath(
                os.path.join(self._clustDir, str(mg + '.align.fna')))

        #build alignment
        if align:
            for mg in self._mgList:
                alignCmd = str(
                    self._config.get('aligner') + ' -in ' +
                    mgToFilteredFastaPath[mg] + ' -out ' + mgToAlignPath[mg] +
                    ' -quiet')
                assert os.name == 'posix'
                predictProc = subprocess.Popen(
                    alignCmd, cwd=self._mgWorkingDir, shell=True, bufsize=-1
                )  #stdout=subprocess.STDOUT, stderr=subprocess.STDOUT)
                predictProc.wait()
                print 'Muscle return code for', mg, ':', predictProc.returncode
                if predictProc.returncode != 0:
                    sys.stderr.write(str(alignCmd + ' \n'))

        #compute DM
        if dm:
            for mg in self._mgList:
                mothur = os.path.join(
                    os.path.normpath(
                        self._configRRNA16S.get('mothurInstallDir')), 'mothur')
                mothurCmd = str(
                    'time ' + mothur + ' "#dist.seqs(fasta=' +
                    mgToAlignPath[mg] +
                    ', processors=2, countends=F, calc=nogaps, cutoff=0.3, output=lt)"'
                )
                assert os.name == 'posix'
                mothurProc = subprocess.Popen(mothurCmd,
                                              shell=True,
                                              bufsize=-1,
                                              cwd=self._mgWorkingDir)
                mothurProc.wait()
                print 'Mothur return code dist:', mg, mothurProc.returncode
                #distFilePath = os.path.join(os.path.dirname(mgToAlignPath[mg]), str(mg + '.align.phylip.dist'))
                #self._mgToDM[mg] = forEachLine(distFilePath, DM())
                #self._mgToDM[mg].printDM()

        #cluster
        if cluster:
            for mg in self._mgList:
                distFilePath = os.path.join(os.path.dirname(mgToAlignPath[mg]),
                                            str(mg + '.align.phylip.dist'))
                mothur = os.path.join(
                    os.path.normpath(
                        self._configRRNA16S.get('mothurInstallDir')), 'mothur')
                mothurCmd = str('time ' + mothur + ' "#cluster(phylip=' +
                                distFilePath +
                                ', method=furthest, hard=t, precision=1000)"')
                assert os.name == 'posix'
                mothurProc = subprocess.Popen(mothurCmd,
                                              shell=True,
                                              bufsize=-1,
                                              cwd=self._mgWorkingDir)
                mothurProc.wait()
                print 'Mothur return code cluster:', mg, mothurProc.returncode

        #read DM and clusters

        #sequence predictions
        self._seqIdToTaxPathDict = dict([])
        self._seqIdToWeight = dict([])
        for seq in self._sequences.sequences:
            seqId = int(seq.id)  # avoid shadowing the builtin "id"
            self._seqIdToTaxPathDict[seqId] = seq.getTaxonomyPath()
            self._seqIdToWeight[seqId] = seq.getTaxonomyPathWeight()

        #similarity thresholds
        thresholds = self._configMG.get('mgSimilarityThresholds')
        self._mgToMaxThreshold = dict([])
        tmpDict = getMapping(thresholds, 0, 1, sep='\t', comment='#')
        for k in tmpDict:
            self._mgToMaxThreshold[k] = float(tmpDict[k][0])

        self._mgToDM = dict([])
        self._mgToCluster = dict([])
        for mg in self._mgList:
            distFilePath = os.path.join(os.path.dirname(mgToAlignPath[mg]),
                                        str(mg + '.align.phylip.dist'))
            self._mgToDM[mg] = forEachLine(distFilePath, DM())
            clustFilePath = os.path.join(os.path.dirname(mgToAlignPath[mg]),
                                         str(mg + '.align.phylip.fn.list'))
            self._mgToCluster[mg] = forEachLine(
                clustFilePath,
                MCluster(self._seqIdToTaxPathDict, self._mgToMaxThreshold[mg]))
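
For reference, the clustering result read back above is mothur's list output (the "*.align.phylip.fn.list" file). A list file stores one clustering per line: a distance label, the number of OTUs, and then one tab-separated column per OTU holding comma-separated sequence names. Below is a minimal standalone parser sketch under that assumption; the MCluster class above is the code's actual reader, and parseMothurList is only an illustrative name.

def parseMothurList(listFilePath):
    # A minimal sketch, assuming mothur's list format:
    # label <tab> numOtus <tab> otu1 <tab> otu2 ..., where each otu
    # column is a comma-separated list of sequence names.
    labelToClusters = {}
    for line in open(listFilePath):
        tokens = line.rstrip('\n').split('\t')
        if len(tokens) < 3:
            continue  # skip empty or malformed lines
        labelToClusters[tokens[0]] = [col.split(',') for col in tokens[2:]]
    return labelToClusters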
Example #21
0
def main():
    """
        Wraps pIRS read simulator to simulate Illumina paired end reads.

        Sample config: /Users/ivan/Documents/work/binning/data/V35/simMetagenome/configMetagenome01.cfg
    """
    if os.name != 'posix':
        print 'This script runs only on posix systems.'
        return

    #parse arguments
    parser = argparse.ArgumentParser(
        description='''A simple Metagenome Illumina read simulator that wraps pIRS''',
        epilog='''''')

    parser.add_argument('-c', '--config', nargs=1, type=file, required=True,
                        help='configuration file of the simulator', metavar='configMetagenome.cfg',
                        dest='config')

    parser.add_argument('-p', '--pIRS-param', action='store', nargs='+',
                        help='parameters of the pIRS simulator, e.g. "-Q 64 -E 1"',
                        dest='p')

    args = parser.parse_args()
    config = Config(args.config[0], 'Sim')

    pirsParam = ''
    if args.p:
        pirsParam = args.p[0]

    #reads configuration
    workingDir = config.get('workingDir')
    referenceSeq = config.get('referenceSeq')
    frequenciesInfo = config.get('frequenciesInfo')
    coverageFrequencyMultiplier = float(config.get('coverageFrequencyMultiplier'))
    pirsInstallDir = config.get('pirsInstallDir')
    insertSizeMean = int(config.get('insertSizeMean'))
    insertSizeSd = int(config.get('insertSizeSd'))
    readLength = int(config.get('readLength'))

    #check that the pIRS optional parameters don't override those predefined elsewhere (e.g. in the config)
    if ('-m' in pirsParam or '-v' in pirsParam or '-l' in pirsParam
        or '-x' in pirsParam or '-i' in pirsParam or '-o' in pirsParam):
        print 'pIRS parameters -m -v -l (-x) must be set in the configuration file, parameters -i -o cannot be set'
        return

    #check working directory, create temporary directory
    tmpDir = os.path.join(workingDir,'tmp')
    if not os.path.isdir(workingDir):
        print str('The working directory does not exist, create it first! (' + str(workingDir) + ')')
        return
    if not os.path.isdir(tmpDir):
        os.mkdir(tmpDir)

    seqNameToSeq = fastaFileToDict(referenceSeq)
    seqNameToFreq = getMapping(frequenciesInfo, 0, 1, sep='\t', comment = '#')

    outReads1Merged = OutFileBuffer(os.path.join(workingDir,'reads_1.fq'))
    outReads2Merged = OutFileBuffer(os.path.join(workingDir,'reads_2.fq'))

    for seqName in seqNameToFreq:
        seq = seqNameToSeq[seqName]
        coverage = float(seqNameToFreq[seqName][0])*coverageFrequencyMultiplier

        fastaFile = os.path.join(tmpDir,str(seqName + '.fna'))
        outBuffer = OutFileBuffer(fastaFile)
        outBuffer.writeText(str('>' + seqName + '\n' + seq + '\n'))
        outBuffer.close()

        cmd = str(os.path.join(pirsInstallDir,'pirs') + ' simulate -i ' + fastaFile + ' -x ' + str(coverage) +
                  ' -m ' + str(insertSizeMean) + ' -v ' + str(insertSizeSd) + ' -l ' + str(readLength)
                  + ' -o ' + seqName + ' ' + pirsParam)
        #print cmd
        proc = subprocess.Popen(cmd, shell=True, bufsize=-1, cwd=tmpDir)# stdout=subprocess.STDOUT, stderr=subprocess.STDOUT)
        proc.wait()
    if proc.returncode != 0:
            sys.stderr.write(str('command failed: ' + cmd + '\n'))

        #append generated reads to the merged files
        reads1 = gzip.open(os.path.join(tmpDir, str(seqName + '_' + str(readLength) + '_' + str(insertSizeMean) + '_1.fq.gz')), 'rb')
        file1Content = reads1.read()
        outReads1Merged.writeText(str(file1Content.replace('@read_',str('@read_' + seqName + '_')) + '\n'))
        reads1.close()

        reads2 = gzip.open(os.path.join(tmpDir, str(seqName + '_' + str(readLength) + '_' + str(insertSizeMean) + '_2.fq.gz')), 'rb')
        file2Content = reads2.read()
        outReads2Merged.writeText(str(file2Content.replace('@read_',str('@read_' + seqName + '_')) + '\n'))
        reads2.close()

    outReads1Merged.close()
    outReads2Merged.close()
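
For orientation, here is a minimal sketch of the 'Sim' configuration this wrapper reads. The key names come from the code above; the INI-style section header is an assumption about the Config class, and all values are illustrative placeholders:

[Sim]
workingDir=/path/to/workingDir
referenceSeq=/path/to/reference_sequences.fna
frequenciesInfo=/path/to/frequencies.tsv
coverageFrequencyMultiplier=30.0
pirsInstallDir=/path/to/pirs
insertSizeMean=300
insertSizeSd=10
readLength=100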
Example #22
0
    def __init__(self, contigNameToBp, contigNameToNcbid, scaffToContigList, taxonomy,
                 minScaffContigCount=None, minScaffBpLen=None, cladesSet=None, considerContigWithNoScaff=True,
                 ignoreScaffPredToRoot=True):
        """
            Initializes the main Consistency class.

            @param contigNameToBp: dictionary that maps contig names to bp (int);
                or a fasta file that contains the contigs
            @param contigNameToNcbid: dictionary that maps contig names to ncbids (int);
                or a prediction file - first column contig name, last column ncbid
            @param scaffToContigList: dictionary that maps scaffold names to list of contig names;
                or a file - first column scaffold name, second column contig name
            @param minScaffContigCount: consider only scaffolds that contain at least this number of contigs
            @param minScaffBpLen: consider only scaffolds with at least this collective length (in bp)
            @param cladesSet: consider only scaffolds that contain at least one contig from this set
            @param considerContigWithNoScaff: consider also contigs that are not assigned to scaffolds
                (as artificial scaffolds)
            @param ignoreScaffPredToRoot: ignore scaffolds that are assigned based on the root (uninformative)
        """
        # check input options
        assert minScaffContigCount is None or isinstance(minScaffContigCount, int)
        assert minScaffBpLen is None or isinstance(minScaffBpLen, int)
        assert cladesSet is None or isinstance(cladesSet, set)
        assert isinstance(considerContigWithNoScaff, bool)
        assert isinstance(ignoreScaffPredToRoot, bool)

        if isinstance(contigNameToBp, dict):
            self._contigNameToBp = contigNameToBp
        elif isinstance(contigNameToBp, str) and os.path.isfile(contigNameToBp):
            self._contigNameToBp = getSequenceToBpDict(contigNameToBp)
        else:
            print("Can't get contig info from: ", contigNameToBp)
            return
        if isinstance(contigNameToNcbid, dict):
            self._contigToPred = contigNameToNcbid
        elif isinstance(contigNameToNcbid, str) and os.path.isfile(contigNameToNcbid):
            self._contigToPred = cami.readAssignments(contigNameToNcbid)
        else:
            print("Can't get prediction info from: ", contigNameToNcbid)
            return
        if isinstance(scaffToContigList, dict):
            self._scaffToContigsList = scaffToContigList
        elif isinstance(scaffToContigList, str) and os.path.isfile(scaffToContigList):
            self._scaffToContigsList = getMapping(scaffToContigList, 0, 1, '\t')
        else:
            print("Can't get scaffold config mapping from: ", scaffToContigList)
            return

        if isinstance(taxonomy, _TaxonomyWrapper) and (not taxonomy.isClosed()):
            self._taxonomy = taxonomy
        elif isinstance(taxonomy, str) and os.path.isfile(taxonomy):
            self._taxonomy = _TaxonomyWrapper(taxonomy)
        else:
            print("Can't use taxonomy:", taxonomy)
            return

        # check the consistency of the data!

        # if a contig that is defined in the mapping doesn't exist (in the fasta file), we remove it
        for scaff, contigsList in self._scaffToContigsList.iteritems():
            removeList = []
            for contig in contigsList:
                if contig not in self._contigNameToBp:
                    removeList.append(contig)

            for contig in removeList:
                contigsList.remove(contig)

        # if a contig was predicted but there is no scaffold assigned to it then this
        # contig is assigned to an "artificial scaffold"
        if considerContigWithNoScaff:
            scaffContigSet = set()
            for s, l in self._scaffToContigsList.iteritems():
                for c in l:
                    scaffContigSet.add(c)
            aloneContigSet = set()
            for c in self._contigToPred:
                if c not in scaffContigSet:
                    aloneContigSet.add(c)

            for c in aloneContigSet:
                scaffName = str('scaffold_' + c)  # make up a scaffold name
                assert scaffName not in self._scaffToContigsList, 'The names of contigs are ambiguous!'
                self._scaffToContigsList[scaffName] = [c]

        # filter out scaffolds according to the input constraints
        self._scaffolds = dict()
        for scaffName, contigsList in self._scaffToContigsList.iteritems():
            if minScaffContigCount is not None:
                if len(contigsList) < minScaffContigCount:
                    continue

            if minScaffBpLen is not None:
                bpLen = 0  # avoid shadowing the builtin "sum"
                for contig in contigsList:
                    bpLen += self._contigNameToBp[contig]
                if bpLen < minScaffBpLen:
                    continue

            if cladesSet is not None:
                passScaff = False
                for contig in contigsList:
                    if (contig in self._contigToPred) and (self._contigToPred[contig] in cladesSet):
                        passScaff = True
                        break
                if not passScaff:
                    continue

            # process the scaffold, but if everything in the scaffold was assigned to the root, then ignore it!
            s = self._processScaffold(scaffName)
            if not ((s.getNcbid() == 1) and ignoreScaffPredToRoot):
                self._scaffolds[scaffName] = s
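
A hedged usage sketch: the class name Consistency is taken from the docstring, and all file paths below are illustrative placeholders.

consistency = Consistency(
    'contigs.fna',           # contig name -> bp, taken from a fasta file
    'predictions.csv',       # contig name -> ncbid, a PPS-style prediction file
    'scaff_to_contig.tsv',   # scaffold name \t contig name mapping
    'ncbitax_sqlite.db',     # NCBI taxonomy in the sqlite3 format
    minScaffContigCount=2,   # skip scaffolds with fewer than two contigs
    minScaffBpLen=10000)     # skip scaffolds spanning less than 10 kbp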
Example #23
0
def mergeSequences(mapFilePathList, fastaFilePathList, outputDir):
    """
        Reads all sequences. For each taxonId creates a file that contains all the sequences
        mapped to this taxonId. If a seqId appears more than once, it is ignored, since
        accession numbers are unique.

        @param mapFilePathList: list of files where each contain mapping: seqId -> taxonId
        @param fastaFilePathList: list of fasta files that contain mapping: seqId -> seq
    """
    taxonIdToOutBuffer = {}
    seqIdSet = set()

    totalSeqCount = 0
    totalStoredSeqCount = 0
    totalIdenticalSeqCount = 0

    for mapFilePath, fastaFilePath in zip(mapFilePathList, fastaFilePathList):
        print 'processing', mapFilePath, fastaFilePath
        seqCount = 0
        storedSeqCount = 0

        seqIdToSeq = fasta.fastaFileToDict(fastaFilePath)
        seqIdToNcbidList = csv.getMapping(mapFilePath,
                                          0,
                                          1,
                                          sep='\t',
                                          comment='#')

        for seqId, seq in seqIdToSeq.iteritems():
            seqCount += 1
            if seqId in seqIdSet:
                totalIdenticalSeqCount += 1
                continue
            else:
                seqIdSet.add(seqId)

            taxonId = seqIdToNcbidList[seqId][0]

            if taxonId not in taxonIdToOutBuffer:
                outBuffer = csv.OutFileBuffer(
                    os.path.join(outputDir, str(str(taxonId) + '.fna')))
                taxonIdToOutBuffer[taxonId] = outBuffer

            # do not close the buffer here; more sequences with the same
            # taxonId may follow (all buffers are closed at the end)
            taxonIdToOutBuffer[taxonId].writeText(
                str('>' + seqId + '\n' + seq + '\n'))
            storedSeqCount += 1

            if len(common.noNewLine(seq).replace('N', '')) == 0:
                print 'sequence contains only Ns:', seqId, fastaFilePath, len(common.noNewLine(seq))

        print 'totalSeq, storedSeq', seqCount, storedSeqCount
        totalSeqCount += seqCount
        totalStoredSeqCount += storedSeqCount

    # close all output buffers only after all the input files have been processed
    for buff in taxonIdToOutBuffer.values():
        buff.close()

    print 'totalSeqCount, totalStoredSeqCount, totalIdenticalSeqCount', totalSeqCount, totalStoredSeqCount, totalIdenticalSeqCount

    print 'sequences merged'
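
The mapping files are read with csv.getMapping(mapFilePath, 0, 1, sep='\t', comment='#'), i.e. tab-separated with the sequence id in the first column and the taxon id in the second; lines starting with '#' are comments. An illustrative fragment (the accession numbers and taxon ids are placeholders):

# seqId	taxonId
AB000001.1	562
AB000002.1	1280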
Example #25
0
def _main():
    """ See the module description."""
    parser = argparse.ArgumentParser(description=__doc__, epilog="""""")

    parser.add_argument(
        '-i',
        '--input-data-dir',
        action='store',
        nargs=1,
        required=True,
        help=
        """Directory that contains fasta files and corresponding mapping files, for each "*.tax" (or "*.csv")
                 file there must be a "*.fna" file with the same name. All files with suffix "tax" (or "*.csv")
                 will be considered. (Takes only Bacteria and Archaea)""",
        metavar='input_dir',
        dest='inDir')

    parser.add_argument('-o',
                        '--output-dir',
                        action='store',
                        nargs=1,
                        required=True,
                        help='Directory that contains the output files.',
                        metavar='out_dir',
                        dest='outDir')

    parser.add_argument(
        '-s',
        '--source-type',
        required=True,
        nargs=1,
        choices=["s", "a"],
        help=
        'To determine the source, use "s" for the Silva database and "a" for the Amphora database.',
        dest='srcType')

    parser.add_argument(
        '-t',
        '--taxonomy-file',
        nargs=1,
        type=file,
        required=True,
        help='NCBI taxonomy database file in the sqlite3 format.',
        metavar='ncbitax_sqlite.db',
        dest='taxonomy')

    parser.add_argument('-n', '--not-considered-taxonIds', action='store', nargs=1,
        help='Comma separated leaf level or top level taxonIds (as a string) that will be filtered out. (optional)',
        metavar='"2759,10239,77133,155900,408172,32644, 408170,433727,749907,556182,702656,410661,652676,410659,797283'\
                ',408171,703336,256318,32630,433724,766747,488339,942017,1076179,717931,455559,527640,904678,552539,'\
                '54395,198431,358574,415540,511564,369433,380357,81726,198834,271928,311313,2759,749906,1077529,'\
                '1077529,361146,511563,361147"',
        dest='filterOut')

    # parse arguments
    args = parser.parse_args()
    inDir = args.inDir[0]
    outDir = args.outDir[0]
    srcType = args.srcType[0]
    filterOutTaxonIdsSet = set()
    try:
        if args.filterOut:
            filterOutTaxonIdsSet.update(
                set(map(int,
                        str(args.filterOut[0]).split(','))))
    except ValueError:
        print(
            'The taxon ids to be filtered out are in a wrong format! Comma separated integers are required!'
        )
        raise

    taxonomy = _TaxonomyWrap(args.taxonomy[0].name)
    for dirPath in [inDir, outDir]:
        assert os.path.isdir(dirPath), 'Path: "' + dirPath + '" does not exist!'

    # create db for each gene
    mapDict = {}  # map: seqId -> ncbid
    for mapFilePath in glob.glob(
            os.path.join(os.path.normpath(inDir),
                         r'*.[ct][sa][vx]')):  # *.csv or *.tax

        assert mapFilePath.endswith(('.csv', '.tax')), \
            'The mapping files can either end with .csv or .tax ' + mapFilePath

        base = os.path.basename(mapFilePath).rsplit(
            '.', 1)[0]  # cut out dir path and suffix
        fastaDict = fas.fastaFileToDict(
            os.path.join(os.path.dirname(mapFilePath),
                         (base + '.fna')))  # map: seqId -> seq
        print("Processing: %s seq count: %s" % (base, str(len(fastaDict))))

        if 'a' in srcType:  # Amphora
            mapDict = {}
            for k in csv.getColumnAsList(mapFilePath, colNum=0, sep='\t'):
                v = int(k.rsplit('|', 1)[1].split(':')[1])  # get ncbid
                assert ((k not in mapDict) or (mapDict[k] == v)), str(
                    'There are at least two different values for key: ' +
                    str(k) + ' in ' + mapFilePath)
                mapDict[k] = v
        elif 's' in srcType:  # Silva
            mapTmp = csv.getMapping(mapFilePath, 0, 2, '\t')
            mapDict = {}
            for k, v in mapTmp.iteritems():
                mapDict[k] = int(v[0])
        else:
            assert False, 'Unsupported source type!'

        # same number of entries in both files (fasta and mapping) ?
        if len(mapDict) != len(fastaDict):
            print(
                str('%s: The mapping file and the corresponding fasta file have different numbers of entries: '
                    + '"%s" "%s", these files will be skipped!') %
                (base, str(len(mapDict)), str(len(fastaDict))))
            continue

        # are duplicates in the mapping file ?
        count = len(csv.getColumnAsList(mapFilePath))
        if len(mapDict) != count:
            print(
                '%s: The mapping file contained duplicates! unique: %s non-unique: %s'
                % (base, str(len(mapDict)), str(count)))

        # store data to the output directory
        outDna = csv.OutFileBuffer(os.path.join(outDir, str(base + '.fna')))
        outTax = csv.OutFileBuffer(os.path.join(outDir, str(base + '.tax')))
        count = 0
        filteredLeaf = 0
        filteredSup = 0
        notMapped = 0
        noBacArch = 0
        for seqId, taxonId in mapDict.iteritems():
            if taxonId in filterOutTaxonIdsSet:
                filteredLeaf += 1
                continue
            path = taxonomy.getPathToRoot(taxonId)
            if path is None:
                print('Could not find: %s for seqId: %s record skipped!' %
                      (str(taxonId), seqId))
                notMapped += 1
                continue
            topLevel = int(path.split(';', 1)[0])
            if topLevel in filterOutTaxonIdsSet:
                filteredSup += 1
                continue
            if topLevel not in [2, 2157]:  # Bacteria, Archaea
                noBacArch += 1
                print('NoBactArch: ', topLevel)

            seq = fastaDict[seqId]
            if 'a' in srcType:  # Amphora
                entryId = seqId  # avoid shadowing the builtin "id"
            elif 's' in srcType:  # Silva
                entryId = str(seqId + '|ncbid:' + str(taxonId))

            outTax.writeText(str(entryId + '\t' + path + '\n'))
            outDna.writeText(str('>' + entryId + '\n' + seq + '\n'))
            count += 1

        outDna.close()
        outTax.close()
        print(
            'Stored entries: %s filtered out: %s leaf, %s top level, not mapped: %s'
            % (count, filteredLeaf, filteredSup, notMapped))
        if noBacArch > 0:
            print(
                'WARN: stored %s non-Bacterial and non-Archaeal sequences'
                % (noBacArch))

        # Silva:
        #-i /Users/ivan/Documents/work/binning/database/silva111/arbGenerated -s s -t /Users/ivan/Documents/work/binning/taxonomy/20121122/ncbitax_sqlite.db
        # -o /Users/ivan/Documents/work/binning/database/silva111/db -n ...

        # Amphora
        # -i /Users/ivan/Documents/work/binning/database/markerGenes3/mGenesExtracted -s a -t /Users/ivan/Documents/work/binning/taxonomy/20121122/ncbitax_sqlite.db
        # -o /Users/ivan/Documents/work/binning/database/markerGenes3/db

    taxonomy.close()
    print 'done'
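
Judging from the code above, each line of an output "*.tax" file pairs the sequence id (for Silva sources tagged with "|ncbid:<taxonId>") with a semicolon-separated taxonomic path that starts at the superkingdom level. An illustrative line with placeholder values:

AB000001.1|ncbid:562	2;1224;1236;91347;543;561;562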