Example #1
def ppsOut2ppOut(inFile, outFile, taxonomicRanks, databaseFile):
    """
        Transforms a PPS output file into a file in the PP format.

        @param inFile: input file in the PPS format (first column: seq name, last column: ncbi taxon id)
        @param outFile: output file in the PP format
        @param taxonomicRanks: taxonomic ranks (starting from superkingdom)
        @param databaseFile: database file in the sqlite3 format
    """
    taxonomy = Taxonomy(databaseFile, taxonomicRanks)
    outBuff = csv.OutFileBuffer(outFile)
    namesList = csv.getColumnAsList(inFile,
                                    entryModifyFunction=None,
                                    colNum=0,
                                    sep='\t',
                                    comment='#')
    valCol = 1
    ncbidsList = csv.getColumnAsList(inFile,
                                     entryModifyFunction=None,
                                     colNum=valCol,
                                     sep='\t',
                                     comment='#')

    while True:  # scan to the last full column, since the taxon id is in the last column (this is not efficient!)
        valCol += 1
        tmpList = csv.getColumnAsList(inFile,
                                      entryModifyFunction=None,
                                      colNum=valCol,
                                      sep='\t',
                                      comment='#')
        if len(tmpList) == len(namesList):
            ncbidsList = tmpList
        else:
            break

    header = str('#PPS file transformed to PP format, input file: ' +
                 str(inFile) + '\n#ID' + '\t' + 'root')
    for rank in taxonomicRanks:
        header += str('\t' + rank)
    outBuff.writeText(str(header + '\n'))

    for i in range(len(namesList)):
        name = namesList[i]
        ncbid = ncbidsList[i]
        taxPathDict = taxonomy.getPathToRoot(int(ncbid))
        buff = str(name)
        if taxPathDict is None:
            buff += str('\t')
        else:
            buff += str('\t' + 'root')

        for rank in taxonomicRanks:
            if (taxPathDict is not None) and (rank in taxPathDict) and (
                    not taxPathDict[rank].isCopy()):
                buff += str('\t' + taxPathDict[rank].name)
            else:
                buff += '\t'
        outBuff.writeText(str(buff + '\n'))
    outBuff.close()
    taxonomy.close()
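
A minimal usage sketch (the paths, the rank list, and the example lines below are illustrative assumptions, not from the source):

# PPS input line (first column: seq name, last column: ncbi taxon id), e.g.:
#   contig_1 <TAB> ... <TAB> 1280
# produces a PP output line:
#   contig_1 <TAB> root <TAB> Bacteria <TAB> ... <TAB> Staphylococcus aureus
ppsOut2ppOut('sample.PPS.out', 'sample.PP.out',
             ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'],
             'ncbitax_sqlite.db')  # hypothetical file paths
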
Example #2
def removeLines(mg):
    removeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_ncbids.txt'
    #removeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_accession_silva.txt'
    srcFilePath = str('/net/metagenomics/projects/PPSmg/data/markerGenes/db/' +
                      mg + '_bact+arch_dnaV.tax')
    dstFilePath = str(
        '/net/metagenomics/projects/PPSmg/data/V35/genomesRemoved/markerGenes/db/'
        + mg + '_bact+arch_dnaV.tax')
    #srcFilePath = str('/net/metagenomics/projects/PPSmg/data/silva/' + mg + '_silva106_ncbitax.bacteria+archaea.tax' )
    #dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/genomesRemoved/silva/' + mg + '_silva106_ncbitax.bacteria+archaea.tax' )
    pattern = r'.*ncbid:([0-9]+)$'
    #pattern = r'^([^\-]+)\-.*$'

    removeSet = set(
        csv.getColumnAsList(removeListFilePath, colNum=0, comment='#'))
    col0 = csv.getColumnAsList(srcFilePath, colNum=0, sep='\t', comment='#')
    col1 = csv.getColumnAsList(srcFilePath, colNum=1, sep='\t', comment='#')
    out = csv.OutFileBuffer(dstFilePath)
    removed = 0
    for c0, c1 in zip(col0, col1):
        if re.sub(pattern, r'\1', c0) not in removeSet:
            out.writeText(str(c0 + '\t' + c1 + '\n'))
        else:
            removed += 1

    out.close()
    print mg, 'removeLines', removed
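
One subtlety worth noting: re.sub(pattern, r'\1', s) returns s unchanged when the pattern does not match, so a header without an 'ncbid:' suffix is kept unless the raw header itself happens to be in the removal list. A small illustration (hypothetical headers):

import re

pattern = r'.*ncbid:([0-9]+)$'
print(re.sub(pattern, r'\1', 'gene_A|ncbid:1280'))  # '1280' -> compared against removeSet
print(re.sub(pattern, r'\1', 'gene_B'))             # 'gene_B' -> unchanged, effectively kept
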
Example #3
def removeEntries(mg):
    """
        Removes sequences from the marker gene files at a given taxonomic level (species, genus, family, etc.).
    """
    removeListPath = '/net/metagenomics/projects/PPSmg/data/V35/genome_ncbids_species.txt'
    srcFilePath = str(
        '/net/metagenomics/projects/PPSmg/data/markerGenes2/db/' + mg +
        '_bact+arch_dnaV.tax')
    dstFilePath = str(
        '/net/metagenomics/projects/PPSmg/data/V35/mgScenarios/speciesRemoved/db/'
        + mg + '_bact+arch_dnaV.tax')
    out = csv.OutFileBuffer(dstFilePath)
    removeSet = set(csv.getColumnAsList(removeListPath, colNum=0, comment='#'))
    removeSetInt = set()
    removeSetIds = set()
    removed = 0
    for s in removeSet:
        if s != '':
            removeSetInt.add(int(s))
    col0 = csv.getColumnAsList(srcFilePath, colNum=0, sep='\t', comment='#')
    col1 = csv.getColumnAsList(srcFilePath, colNum=1, sep='\t', comment='#')
    for c0, c1 in zip(col0, col1):
        lineSetInt = set()
        for s in c1.split(';'):
            if s != '':
                lineSetInt.add(int(s))
        if len(removeSetInt.intersection(lineSetInt)) > 0:  # the intersection is not empty
            removed += 1
            removeSetIds.add(c0)
        else:
            out.writeText(str(c0 + '\t' + c1 + '\n'))
    out.close()

    print mg, 'removedEntries', removed

    srcFilePath = str(
        '/net/metagenomics/projects/PPSmg/data/markerGenes2/db/' + mg +
        '_bact+arch_dnaV.noalign.fna')
    dstFilePath = str(
        '/net/metagenomics/projects/PPSmg/data/V35/mgScenarios/speciesRemoved/db/'
        + mg + '_bact+arch_dnaV.noalign.fna')
    out = csv.OutFileBuffer(dstFilePath)
    seqIdToSeq = fas.fastaFileToDict(srcFilePath)
    removed = 0
    for seqId in seqIdToSeq:
        if seqId in removeSetIds:
            removed += 1
        else:
            out.writeText(
                str('>' + str(seqId) + '\n' + str(seqIdToSeq[seqId]) + '\n'))

    out.close()

    print mg, 'removedSeq', removed
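
The second column is treated as a semicolon-separated root-to-leaf taxonomy path, so an entry is dropped as soon as any taxon id on its path occurs in the removal list. A worked illustration (the ids are hypothetical):

removeSetInt = set([1280])                     # e.g. a species-level ncbi taxon id
lineage = '2;1239;91061;1385;90964;1279;1280'  # column 1 of one line
lineSetInt = set(int(s) for s in lineage.split(';') if s != '')
print(len(removeSetInt.intersection(lineSetInt)) > 0)  # True -> the line is removed
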
Example #4
def tmpCmp():
    communityList = csv.getColumnAsList(
        '/Users/ivan/Documents/work/binning/data/mercier51Strains/syn-mercier51strains/'
        'generation/community_20121116.tax',
        colNum=1, sep='\t')
    profileList = csv.getColumnAsList(
        '/Users/ivan/Documents/nobackup/assembly/uniform/soap_uniform.contig.profile.csv',
        colNum=0, sep=',')

    cSet = set(map(int, communityList))
    pSet = set(map(int, profileList))
    for i in cSet:
        if i not in pSet:
            print("Ncbid %s from community is not in profile" % i)
    for i in pSet:
        if i not in cSet:
            print("Ncbid %s from profile is not in community" % i)
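
The two loops report the symmetric difference of the id sets; with set operators the same check reads (an equivalent sketch):

for i in cSet - pSet:
    print("Ncbid %s from community is not in profile" % i)
for i in pSet - cSet:
    print("Ncbid %s from profile is not in community" % i)
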
Example #5
def toContigsLabels(inMapFile, outMapFile):
    """
        Creates the labels of contigs from the labels of their reads.

        @param inMapFile: maps contigId to a list of read taxonIds
        @param outMapFile: maps contigId to weight and the most prevalent taxonId
    """
    out = csv.OutFileBuffer(outMapFile)

    for line in csv.getColumnAsList(inMapFile, sep='\n'):
        contigId, taxonIds = str(line).split('\t')
        taxonIdsList = map(int, str(taxonIds).split(','))
        idToCount = {}
        totalCount = 0.0
        for taxonId in taxonIdsList:
            totalCount += 1
            if taxonId in idToCount:
                idToCount[taxonId] += 1
            else:
                idToCount[taxonId] = 1
        pairList = []
        for taxonId, count in idToCount.iteritems():
            pairList.append((taxonId, count))
        pairList.sort(key=lambda x: x[1], reverse=True)
        weight = round(float(pairList[0][1]) / totalCount, 3)
        out.writeText(str(contigId) + '\t' + str(weight) + '\t' + str(pairList[0][0]) + '\n')

    out.close()
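
The per-contig loop is a majority vote over the read labels; collections.Counter expresses the same computation more compactly (an equivalent sketch, not the author's code):

from collections import Counter

taxonIdsList = [561, 562, 562, 562]  # hypothetical read labels of one contig
(taxonId, count), = Counter(taxonIdsList).most_common(1)
weight = round(float(count) / len(taxonIdsList), 3)
print('%s %s' % (taxonId, weight))  # 562 0.75: the contig is labeled 562 with weight 0.75
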
Example #6
def getReadsTaxonIdList(readsFile,
                        communityFile,
                        readHeaderToCommunityId=getCommunityId):
    """
        Gets a list of taxonIds in the same order as they appear in the readOnContig file.
        The first taxonId is at index 1.

        @param readsFile:
        @param communityFile:
        @param readHeaderToCommunityId:
        @return:
    """
    communityIdToTaxonId = csv.predToDict(communityFile)
    d = [None]
    rowList = csv.getColumnAsList(readsFile, colNum=0, sep='\n')
    for line in rowList:
        if str(line).startswith('>'):
            try:
                taxonId = int(
                    communityIdToTaxonId.get(readHeaderToCommunityId(line)))
            except TypeError as ex:
                print(ex.message)
                # taxonId may be unbound here if the lookup returned None
                print("No taxonId found for read header: %s" % line)
                raise ex
            d.append(taxonId)
            d.append(taxonId)
    return d
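
getCommunityId is not shown in this excerpt; the code only requires that it maps a read header to a key of communityIdToTaxonId. Note that d starts as [None] and every header appends the taxonId twice, so a read's taxonId occupies two consecutive indices starting at 1. A hypothetical stand-in for illustration only:

def getCommunityId(readHeader):
    # hypothetical helper: extract the community id from a header such as
    # '>read_17|community:42' (the real header format is not shown here)
    return readHeader.rsplit('community:', 1)[1]
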
Example #7
def removeSequences(mg):
    removeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_ncbids.txt'
    #removeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_accession_silva.txt'
    srcFilePath = str('/net/metagenomics/projects/PPSmg/data/markerGenes/db/' +
                      mg + '_bact+arch_dnaV.noalign.fna')
    dstFilePath = str(
        '/net/metagenomics/projects/PPSmg/data/V35/genomesRemoved/markerGenes/db/'
        + mg + '_bact+arch_dnaV.noalign.fna')
    #srcFilePath = str('/net/metagenomics/projects/PPSmg/data/silva/' + mg + '_silva106_ncbitax.bacteria+archaea.fna' )
    #dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/genomesRemoved/silva/' + mg + '_silva106_ncbitax.bacteria+archaea.fna' )
    pattern = r'.*ncbid:([0-9]+)$'
    #pattern = r'^([^\-]+)\-.*$'

    removeSet = set(
        csv.getColumnAsList(removeListFilePath, colNum=0, comment='#'))
    seqIdToSeq = fas.fastaFileToDict(srcFilePath)
    out = csv.OutFileBuffer(dstFilePath)
    removed = 0
    for seqId in seqIdToSeq:
        if re.sub(pattern, r'\1', str(seqId)) not in removeSet:
            out.writeText(
                str('>' + str(seqId) + '\n' + str(seqIdToSeq[seqId]) + '\n'))
        else:
            removed += 1

    out.close()
    print mg, 'removeSequences', removed
Example #8
def genomesToMask():
    rank = 'genus'  #which rank will be masked
    fileName = '/Users/ivan/Documents/work/binning/data/simMC/fromJohannes/contigs_genus_ncbids.txt'
    outFile = '/Users/ivan/Documents/work/binning/data/simMC/fromJohannes/genome_genus_masked.txt'
    outFile2 = '/Users/ivan/Documents/work/binning/data/simMC/fromJohannes/genome_ncbids_genus.txt'
    #outFile = '/Users/ivan/Documents/work/binning/data/V35/genome_species_masked.txt' #output file
    #outFile2 = '/Users/ivan/Documents/work/binning/data/V35/genome_ncbids_species.txt' #output file
    #fileName='/Users/ivan/Documents/work/binning/data/V35/genome_ncbids.txt' #list of all genome ncbids
    dbFile = '/Users/ivan/Documents/work/binning/taxonomy/20120828/ncbitax_sqlite.db'  #DB
    out = csv.OutFileBuffer(outFile)
    out2 = csv.OutFileBuffer(outFile2)

    genomeNcbids = csv.getColumnAsList(fileName,
                                       entryModifyFunction=None,
                                       colNum=0,
                                       sep=None,
                                       comment='#')
    taxonomy = taxonomy_ncbi.TaxonomyNcbi(dbFile)

    maskNcbids = []
    #print len(genomeNcbids), genomeNcbids
    for ncbid in genomeNcbids:
        while taxonomy.getRank(ncbid) != rank:
            ncbid = taxonomy.getParentNcbid(ncbid)
            if int(ncbid) == 1:
                print 'root reached!'
                break
        maskNcbids.append(int(ncbid))

    #print len(Set(maskNcbids)), maskNcbids

    maskSet = set(maskNcbids)
    for i in maskSet:
        out2.writeText(str(str(i) + '\n'))

    for ncbid in maskSet:
        children = collectChildren(taxonomy, ncbid)
        for i in children:
            out.writeText(str(str(i) + '\n'))
        print ncbid, children

    #print taxonomy.childrenNcbids(818) #997888,818

    out.close()
    out2.close()
    taxonomy.close()
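
collectChildren is not defined in this excerpt; a recursive sketch consistent with how it is used here, assuming taxonomy_ncbi exposes childrenNcbids as the commented-out line above suggests:

def collectChildren(taxonomy, ncbid):
    # hypothetical reimplementation: ncbid plus all of its descendants
    result = [int(ncbid)]
    for child in (taxonomy.childrenNcbids(ncbid) or []):
        result.extend(collectChildren(taxonomy, child))
    return result
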
Example #9
def sortReads(inReadsFile, outReadsFile, headerToNum=lambda x: int(x.split('_', 2)[1].strip('nr'))):
    i = 0
    seqName = None
    tupleList = []
    for line in csv.getColumnAsList(inReadsFile, sep='\n'):
        if i % 2 == 0:
            seqName = line
        else:
            seq = line
            assert seqName is not None
            tupleList.append((seqName, zlib.compress(seq), headerToNum(seqName)))
            seqName = None
        i += 1
    tupleList.sort(key=lambda x: x[2])

    out = csv.OutFileBuffer(outReadsFile)
    for t in tupleList:
        out.writeText(str(t[0]) + '\n' + str(zlib.decompress(t[1])) + '\n')
    out.close()
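
The default headerToNum key implies headers of the form '>prefix_nr<NUMBER>_rest' (an assumption inferred from the lambda, not stated in the source):

headerToNum = lambda x: int(x.split('_', 2)[1].strip('nr'))
print(headerToNum('>read_nr42_length150'))  # 42: reads get sorted by this number
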
Example #10
def computeTrainingAccuracy(workingDir, taWorkingDir, sampleSpecificDir,
                            ppsTrainDataDir, outputDir, ppsInstallDir,
                            ppsScripts, ppsConfigFilePath, predictLogFileName,
                            modelTaxonIdFilePath, databaseFile):
    """
        Computes the training accuracy for the PPS training data.
        This function doesn't consider training data used to train intermediate (misc?) nodes!
        The training data that correspond to the sample specific data is fragmented (via PPS) and
        contained in the training data of different lengths.

        @param workingDir: working directory of the PPS+ pipeline
        @param taWorkingDir: working directory for the accuracy computation
        @param sampleSpecificDir: directory containing the sample specific data
        @param ppsTrainDataDir: directory 'sampled_fasta' containing PPS training data
        @param outputDir: directory for output files
        @param ppsScripts: directory containing PPS scripts
        @param ppsConfigFilePath: the PPS configuration file
        @param ppsInstallDir: directory where PPS is installed
        @param predictLogFileName: logging file for PPS prediction
        @param modelTaxonIdFilePath: file containing all leaf ncbi taxon ids that are modelled
        @param databaseFile: ncbi taxonomy file in the sqlite3 format
    """
    for d in [
            workingDir, taWorkingDir, sampleSpecificDir, ppsTrainDataDir,
            outputDir, ppsInstallDir, ppsScripts,
            os.path.dirname(predictLogFileName)
    ]:
        assert os.path.isdir(d), "Directory '%s' doesn't exist!" % d
    for f in [ppsConfigFilePath, databaseFile, modelTaxonIdFilePath]:
        assert os.path.isfile(f), "File '%s' doesn't exist!" % f

    # all directories that contain PPS training data
    trainDirList = [sampleSpecificDir]
    for d in os.listdir(ppsTrainDataDir):
        trainDirList.append(os.path.join(ppsTrainDataDir, d))

    # fasta file with all training sequences
    allTrainFastaFile = os.path.join(taWorkingDir, 'all_train_data.fna')
    out = csv.OutFileBuffer(allTrainFastaFile)
    seqIdToTruePred = {}

    # merge all training fasta files to one fasta file
    for d in trainDirList:
        dName = os.path.basename(d)
        for f in os.listdir(d):
            taxonId = int(os.path.basename(f).rsplit('.', 2)[0])
            for seqId, seq in fasta.fastaFileToDict(os.path.join(
                    d, f)).iteritems():
                if d == sampleSpecificDir:
                    # label = int(str(str(seqId).rsplit('|', 1)[1]).split(':', 1)[1])
                    newSeqId = (str(taxonId) + '|' + dName + '|' + seqId +
                                '|label:' + str(taxonId))
                else:
                    newSeqId = str(taxonId) + '|' + dName + '|' + seqId
                out.writeText('>' + newSeqId + '\n' + seq + '\n')
                seqIdToTruePred[newSeqId] = taxonId
    out.close()

    # predict the merged file using the generated model
    if os.name == 'posix':
        predictCmd = str(
            os.path.join(ppsScripts, 'predict.rb') + ' ' + allTrainFastaFile +
            ' ' + ppsConfigFilePath)
        #print(predictCmd)
        logOut = open(predictLogFileName, 'w')
        predictProc = subprocess.Popen(
            predictCmd,
            shell=True,
            bufsize=-1,
            cwd=ppsInstallDir,
            stdout=logOut,
            stderr=subprocess.STDOUT)  # stdout=subprocess.STDOUT
        predictProc.wait()
        logOut.close()
        if predictProc.returncode != 0:
            raise Exception(
                "PPS 'predict' training data returned with non-zero status: %s, cmd: %s"
                % (predictProc.returncode, predictCmd))
    else:
        print("Can't run PPS on a non-posix system!")
        return

    # read in predicted train data
    seqIdToPred = csv.predToDict(allTrainFastaFile + '.nox.fna.out')

    # read fasta file
    seqIdToBp = fasta.getSequenceToBpDict(allTrainFastaFile)

    # leaf taxonIds that are modelled
    modelLeafTaxonIds = set(map(int,
                                csv.getColumnAsList(modelTaxonIdFilePath)))

    taxonomyS = taxonomy_ncbi.TaxonomyNcbi(databaseFile, considerNoRank=True)
    notLeafTaxonIds = set()
    for leafId in modelLeafTaxonIds:
        notLeafTaxonIds.update(
            set(map(int, taxonomyS.getParentsNcbidSet(leafId))))
    taxonomyS.close()

    # get only sequences with true taxonId defined at leaf level that is modelled or lower
    seqIdToBp2 = {}
    seqIdToPred2 = {}
    seqIdToTruePred2 = {}
    seqIdToBpMisc = {}
    seqIdToPredMisc = {}
    seqIdToTruePredMisc = {}
    for seqId, bp in seqIdToBp.iteritems():
        label = int(str(str(seqId).rsplit('|', 1)[1]).split(':', 1)[1])
        if label not in notLeafTaxonIds:
            seqIdToBp2[seqId] = bp
            seqIdToPred2[seqId] = seqIdToPred[seqId]
            seqIdToTruePred2[seqId] = seqIdToTruePred[seqId]
        else:
            seqIdToBpMisc[seqId] = bp
            seqIdToPredMisc[seqId] = seqIdToPred[seqId]
            seqIdToTruePredMisc[seqId] = seqIdToTruePred[seqId]
    seqIdToBp = seqIdToBp2
    seqIdToPred = seqIdToPred2
    seqIdToTruePred = seqIdToTruePred2

    # accuracy for all, filter out sample specific data (whole length)
    seqIdToBpNoSampleSpec = {}
    for seqId, bp in seqIdToBp.iteritems():
        if str(seqId).split(
                '|',
                2)[1].strip() != os.path.basename(sampleSpecificDir).strip():
            seqIdToBpNoSampleSpec[seqId] = bp

    acc = accuracy.Accuracy(seqIdToBpNoSampleSpec, seqIdToPred,
                            seqIdToTruePred, databaseFile)
    out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_all.txt'))
    out.writeText(
        acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                             minFracClade=None,
                             minFracPred=None,
                             overview=True))
    out.close()
    taxonomyA = acc.getTaxonomy()
    acc.close(closeTaxonomy=False)

    # accuracy for (misc) nodes
    acc = accuracy.Accuracy(seqIdToBpMisc, seqIdToPredMisc,
                            seqIdToTruePredMisc, taxonomyA)
    out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_misc.txt'))
    out.writeText(
        acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                             minFracClade=None,
                             minFracPred=None,
                             overview=True))
    out.close()
    acc.close(closeTaxonomy=False)

    # generate the confusion matrices (for the "for all" scenario)
    cm = confusion_matrix.ConfusionMatrix(seqIdToBp, seqIdToPred,
                                          seqIdToTruePred, databaseFile,
                                          taxonomy_ncbi.TAXONOMIC_RANKS[1:])
    for rank in taxonomy_ncbi.TAXONOMIC_RANKS[1:]:
        cm.generateConfusionMatrix(
            rank, os.path.join(outputDir, 'train_accuracy_cmp_all'))
    taxonomyCM = cm.getTaxonomy()
    cm.close(closeTaxonomy=False)

    # accuracy for individual directories (seq lengths)
    # (the sample specific fragments are among PPS sampled fasta)
    for d in trainDirList:
        dName = os.path.basename(d)
        seqIdToBpSub = {}
        seqIdToPredSub = {}
        seqIdToTruePredSub = {}
        for seqId, bp in seqIdToBp.iteritems():
            if str(seqId).split('|', 2)[1].strip() == str(dName).strip():
                seqIdToBpSub[seqId] = seqIdToBp[seqId]
                seqIdToPredSub[seqId] = seqIdToPred[seqId]
                seqIdToTruePredSub[seqId] = seqIdToTruePred[seqId]

        # accuracy
        acc = accuracy.Accuracy(seqIdToBpSub, seqIdToPredSub,
                                seqIdToTruePredSub, taxonomyA)
        out = csv.OutFileBuffer(
            os.path.join(outputDir, 'train_accuracy_' + dName + '.txt'))
        out.writeText(
            acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                                 minFracClade=None,
                                 minFracPred=None,
                                 overview=True))

        # confusion matrices
        cm = confusion_matrix.ConfusionMatrix(
            seqIdToBpSub, seqIdToPredSub, seqIdToTruePredSub, taxonomyCM,
            taxonomy_ncbi.TAXONOMIC_RANKS[1:])
        for rank in taxonomy_ncbi.TAXONOMIC_RANKS[1:]:
            cm.generateConfusionMatrix(
                rank, os.path.join(outputDir, 'train_accuracy_cmp_' + dName))
        cm.close(closeTaxonomy=False)

        out.close()
        acc.close(closeTaxonomy=False)
    taxonomyA.close()
    taxonomyCM.close()
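
Sequence ids are assembled above as '<taxonId>|<dirName>|<seqId>[|label:<taxonId>]' and the true label is parsed back with rsplit/split; a round-trip illustration (the id is hypothetical):

seqId = '1280|sample_specific|contig_7|label:1280'
label = int(str(str(seqId).rsplit('|', 1)[1]).split(':', 1)[1])
print(label)  # 1280
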
Example #11
def _main():
    """ See the module description."""
    parser = argparse.ArgumentParser(description=__doc__, epilog="""""")

    parser.add_argument(
        '-i',
        '--input-data-dir',
        action='store',
        nargs=1,
        required=True,
        help=
        """Directory that contains fasta files and corresponding mapping files, for each "*.tax" (or "*.csv")
                 file there must be a "*.fna" file with the same name. All files with suffix "tax" (or "*.csv")
                 will be considered. (Takes only Bacteria and Archaea)""",
        metavar='input_dir',
        dest='inDir')

    parser.add_argument('-o',
                        '--output-dir',
                        action='store',
                        nargs=1,
                        required=True,
                        help='Directory that contains the output files.',
                        metavar='out_dir',
                        dest='outDir')

    parser.add_argument(
        '-s',
        '--source-type',
        required=True,
        nargs=1,
        choices=["s", "a"],
        help=
        'To determine the source, use "s" for the Silva database and "a" for the Amphora database.',
        dest='srcType')

    parser.add_argument(
        '-t',
        '--taxonomy-file',
        nargs=1,
        type=file,
        required=True,
        help='NCBI taxonomy database file in the sqlite3 format.',
        metavar='ncbitax_sqlite.db',
        dest='taxonomy')

    parser.add_argument('-n', '--not-considered-taxonIds', action='store', nargs=1,
        help='Comma separated leaf level or top level taxonIds (as a string) that will be filtered out. (optional)',
        metavar='"2759,10239,77133,155900,408172,32644, 408170,433727,749907,556182,702656,410661,652676,410659,797283'\
                ',408171,703336,256318,32630,433724,766747,488339,942017,1076179,717931,455559,527640,904678,552539,'\
                '54395,198431,358574,415540,511564,369433,380357,81726,198834,271928,311313,2759,749906,1077529,'\
                '1077529,361146,511563,361147"',
        dest='filterOut')

    # parse arguments
    args = parser.parse_args()
    inDir = args.inDir[0]
    outDir = args.outDir[0]
    srcType = args.srcType[0]
    filterOutTaxonIdsSet = set()
    try:
        if args.filterOut:
            filterOutTaxonIdsSet.update(
                set(map(int,
                        str(args.filterOut[0]).split(','))))
    except ValueError:
        print(
            'Taxon ids that are to be filtered out are in the wrong format! Comma separated integers are needed!'
        )
        raise

    taxonomy = _TaxonomyWrap(args.taxonomy[0].name)
    for d in [inDir, outDir]:
        assert os.path.isdir(d), 'Path: "' + d + '" does not exist!'

    # create db for each gene
    mapDict = {}  # map: seqId -> ncbid
    for mapFilePath in glob.glob(
            os.path.join(os.path.normpath(inDir),
                         r'*.[ct][sa][vx]')):  # *.csv or *.tax

        assert mapFilePath.endswith(('.csv', '.tax')), \
            'The mapping files can either end with .csv or .tax ' + mapFilePath

        base = os.path.basename(mapFilePath).rsplit(
            '.', 1)[0]  # cut out dir path and suffix
        fastaDict = fas.fastaFileToDict(
            os.path.join(os.path.dirname(mapFilePath),
                         (base + '.fna')))  # map: seqId -> seq
        print("Processing: %s seq count: %s" % (base, str(len(fastaDict))))

        if 'a' in srcType:  # Amphora
            mapDict = {}
            for k in csv.getColumnAsList(mapFilePath, colNum=0, sep='\t'):
                v = int(k.rsplit('|', 1)[1].split(':')[1])  # get ncbid
                assert ((k not in mapDict) or (mapDict[k] == v)), str(
                    'There are at least two different values for key: ' +
                    str(k) + ' in ' + mapFilePath)
                mapDict[k] = v
        elif 's' in srcType:  # Silva
            mapTmp = csv.getMapping(mapFilePath, 0, 2, '\t')
            mapDict = {}
            for k, v in mapTmp.iteritems():
                mapDict[k] = int(v[0])
        else:
            assert False, 'Unsupported source type!'

        # same number of entries in both files (fasta and mapping) ?
        if len(mapDict) != len(fastaDict):
            print(
                str('%s: The mapping file and the corresponding fasta file have different number of entries: '
                    + '"%s" "%s" these files will be skipped!') %
                (base, str(len(mapDict)), str(len(fastaDict))))
            continue

        # are duplicates in the mapping file ?
        count = len(csv.getColumnAsList(mapFilePath))
        if len(mapDict) != count:
            print(
                '%s: The mapping file contained duplicates! unique: %s non-unique: %s'
                % (base, str(len(mapDict)), str(count)))

        # store data to the output directory
        outDna = csv.OutFileBuffer(os.path.join(outDir, str(base + '.fna')))
        outTax = csv.OutFileBuffer(os.path.join(outDir, str(base + '.tax')))
        count = 0
        filteredLeaf = 0
        filteredSup = 0
        notMapped = 0
        noBacArch = 0
        for seqId, taxonId in mapDict.iteritems():
            if taxonId in filterOutTaxonIdsSet:
                filteredLeaf += 1
                continue
            path = taxonomy.getPathToRoot(taxonId)
            if path is None:
                print('Could not find: %s for seqId: %s record skipped!' %
                      (str(taxonId), seqId))
                notMapped += 1
                continue
            topLevel = int(path.split(';', 1)[0])
            if topLevel in filterOutTaxonIdsSet:
                filteredSup += 1
                continue
            if topLevel not in [2, 2157]:  # Bacteria, Archaea
                noBacArch += 1
                print('NoBactArch: ', topLevel)

            seq = fastaDict[seqId]
            if 'a' in srcType:  # Amphora
                outSeqId = seqId
            elif 's' in srcType:  # Silva
                outSeqId = str(seqId + '|ncbid:' + str(taxonId))

            outTax.writeText(str(outSeqId + '\t' + path + '\n'))
            outDna.writeText(str('>' + outSeqId + '\n' + seq + '\n'))
            count += 1

        outDna.close()
        outTax.close()
        print(
            'Stored entries: %s filtered out: %s leaf, %s top level, not mapped: %s'
            % (count, filteredLeaf, filteredSup, notMapped))
        if noBacArch > 0:
            print(
                'WARN: stored %s non-Bacterial and non-Archaeal sequences'
                % noBacArch)

        # Silva:
        #-i /Users/ivan/Documents/work/binning/database/silva111/arbGenerated -s s -t /Users/ivan/Documents/work/binning/taxonomy/20121122/ncbitax_sqlite.db
        # -o /Users/ivan/Documents/work/binning/database/silva111/db -n ...

        # Amphora
        # -i /Users/ivan/Documents/work/binning/database/markerGenes3/mGenesExtracted -s a -t /Users/ivan/Documents/work/binning/taxonomy/20121122/ncbitax_sqlite.db
        # -o /Users/ivan/Documents/work/binning/database/markerGenes3/db

    taxonomy.close()
    print 'done'
Example #12
def maskDb(action,
           inDir,
           outDir,
           rank,
           clades,
           taxonomyFilePath,
           verbose=False):
    """
        Main function (function interface), see module description.

        @param action: one action that will be performed [cl, mr, mg] ~ (generate list, mask seq, mask mg)
        @type action: str
        @param inDir: directory containing input files
        @type inDir: str
        @param outDir: directory containing output files
        @type outDir: str
        @param rank: the data will be excluded at this rank
        @type rank: str
        @param clades: a file containing clades that will be masked (one ncbi taxon id at a line),
            or a set of ncbi taxon ids that will be masked
        @type clades: file or set of int
        @param taxonomyFilePath: taxonomy database file in the sqlite3 format
        @type taxonomyFilePath: str
    """
    # check input parameters
    assert action in ['cl', 'mr',
                      'mg'], str('Given action is not supported: ' + action)
    if action == 'mr':
        assert os.name == 'posix', 'Symbolic links can be created only on posix systems, action "mr" is not valid!'
    for d in [inDir, outDir]:
        assert os.path.isdir(d), str("Directory doesn't exist: " + d)
    assert rank in _RANKS, str('Not supported rank: ' + rank)
    assert os.path.isfile(taxonomyFilePath), str(
        "Taxonomy database file doesn't exist: " + taxonomyFilePath)
    assert isinstance(
        clades, set
    ) or (isinstance(clades, str) and os.path.isfile(clades)), str(
        "Parameter 'clades' can be either a file or a set of ncbi taxonIds to be excluded."
    )

    # maps a rank to a lower rank
    toLowerRank = {}
    for i in range(1, len(_RANKS)):
        toLowerRank[_RANKS[i - 1]] = _RANKS[i]

    taxonomy = _TaxonomyWrapMD(taxonomyFilePath)

    # leaf clades to mask
    if isinstance(clades, set):
        inCladesSet = set(map(int, clades))
    else:
        inCladesSet = set(map(int, csv.getColumnAsList(clades)))

    # clades in the reference
    refCladesSet = set()
    if action in ['cl', 'mr']:
        # get the list of all taxon ids that appear in the directory (as PPS reference)
        for fastaFilePath in glob.glob(
                os.path.join(os.path.normpath(inDir),
                             r'*.f[na][as]')):  # *.fas or *.fna
            refCladesSet.add(_refFilePathToTaxonId(
                fastaFilePath))  # taxonId.1.fna or taxonId.1.fas
    elif action in ['mg']:
        # get the list of all taxon ids that appear in any file in the input directory as taxonomy ".tax"
        for mapFilePath in glob.glob(
                os.path.join(os.path.normpath(inDir), r'*.tax')):  # *.tax
            refCladesSet.update(
                set(
                    map(_mgSeqIdToTaxonId,
                        csv.getColumnAsList(mapFilePath, sep='\t'))))
    else:
        assert False, str('Not supported action: ' + action)

    # checks whether taxonIds are in the taxonomy
    for taxonId in inCladesSet:
        assert taxonomy.exists(taxonId), str(
            'taxonId: %s from clades list is not contained in the taxonomy!' %
            taxonId)
    for taxonId in refCladesSet:
        assert taxonomy.exists(taxonId), str(
            'taxonId: %s from the reference is not contained in the taxonomy!'
            % taxonId)

    # checks whether the taxonIds are leaves (they don't have to be, unless you want to mask at the strain level)
    for taxonId in inCladesSet:
        if not taxonomy.isLeaf(taxonId):
            print(
                'Taxon id %s does not represent a leaf clade in the taxonomy.'
                % taxonId)

    if verbose:
        print('Initial checks done.')

    # taxonIds that should be excluded
    toExcludeSet = set()
    for taxonId in inCladesSet:
        taxonIdAtRank = taxonomy.getTaxonIdAtRank(taxonId, rank)
        if taxonIdAtRank is None:  # the lineage is not defined at this rank ! try a lower rank !
            print('Taxon id: "%s" is not defined at rank: "%s"' %
                  (taxonId, rank))
            currentRank = rank  # find a lower rank at which it's defined
            while currentRank in toLowerRank:
                currentRank = toLowerRank[currentRank]
                taxonIdAtRank = taxonomy.getTaxonIdAtRank(taxonId, currentRank)
                if taxonIdAtRank is not None:
                    break
            if taxonIdAtRank is None:
                taxonIdAtRank = taxonId
                currentRank = _STRAIN
            print('Taxon id: %s will be masked at rank: %s' %
                  (taxonId, currentRank))

        # all child clades (and itself)
        toExcludeSet.add(int(taxonIdAtRank))
        toExcludeSet.update(
            set(map(int, taxonomy.getAllChildren(taxonIdAtRank))))

    # all clades that should be excluded (there is at least one sequence for each taxonId in the reference)
    toExcludeSet.intersection_update(refCladesSet)
    if verbose:
        print('Collecting data to mask done.')

    print('To exclude: ', len(toExcludeSet))

    # exclude data from the reference
    if action == 'cl':
        # generates a list of taxonIds
        out = csv.OutFileBuffer(os.path.join(outDir, 'exclude_list.txt'))
        for taxonId in toExcludeSet:
            out.writeText(str(taxonId) + '\n')
        out.close()
    elif action == 'mr':
        # masked reference sequences (create sim links to files that were not excluded)
        for fastaFilePath in glob.glob(
                os.path.join(os.path.normpath(inDir),
                             r'*.f[na][as]')):  # *.fas or *.fna
            taxonId = _refFilePathToTaxonId(
                fastaFilePath)  # taxonId.1.fna or taxonId.1.fas
            if taxonId not in toExcludeSet:
                # assert os.name == 'posix'
                os.symlink(
                    fastaFilePath,
                    os.path.join(outDir, os.path.basename(fastaFilePath)))
    elif action == 'mg':
        # exclude sequences from the marker gene databases
        for mapFilePath in glob.glob(
                os.path.join(os.path.normpath(inDir), r'*.tax')):

            # get entries that can stay in the mapping and fasta files
            allowedEntriesSet = set(
                map(_mgSeqIdToTaxonId,
                    csv.getColumnAsList(mapFilePath, sep='\t')))
            allowedEntriesSet.difference_update(toExcludeSet)

            # filter out entries from the mapping file
            csv.filterOutLines(mapFilePath,
                               os.path.join(outDir,
                                            os.path.basename(mapFilePath)),
                               allowedEntriesSet,
                               entryModifyFunction=_mgSeqIdToTaxonId,
                               colNum=0,
                               sep='\t')

            # filter out entries from the fasta file
            fastaFilePath = str(mapFilePath.rsplit('.', 1)[0] + '.fna')
            fas.filterOutSequences(fastaFilePath,
                                   os.path.join(
                                       outDir,
                                       os.path.basename(fastaFilePath)),
                                   allowedEntriesSet,
                                   seqNameModifyFunction=_mgSeqIdToTaxonId)
    else:
        assert False, 'Not supported action!'

    taxonomy.close()
    if verbose:
        print('Data masking done.')
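
A minimal invocation sketch (the paths and taxon ids are hypothetical; action 'cl' only writes exclude_list.txt and, unlike 'mr', does not require a posix system):

maskDb('cl',
       '/path/to/reference',          # input dir with *.fna / *.fas files
       '/path/to/output',             # output dir
       'genus',
       set([1280, 562]),              # leaf ncbi taxon ids to mask
       '/path/to/ncbitax_sqlite.db')
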
Example #13
def _main():
    mergeS = False
    sortS = False
    clusterS = False
    filterOutSeq = True  # this is optional, e.g. to remove plasmids

    # handle broken pipes
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)

    #printStatDbk()
    #checkForPlasmids()

    # mapFilePathList = ['/local/johdro/refdata/static/ncbi-genomes-bacteria_20121122/nobackup/dna_acc.nuc.tax',
    #                    '/local/johdro/refdata/static/ncbi-draftgenomes-bacteria_20121122/nobackup/dna-contigs_acc.nuc.tax',
    #                    '/local/johdro/refdata/static/ncbi-draftgenomes-bacteria_20121122/nobackup/dna-scaffolds_acc.nuc.tax',
    #                    '/local/johdro/refdata/static/ncbi-hmp_20121016/nobackup/dna-contigs_acc.nuc.tax',
    #                    '/local/johdro/refdata/static/ncbi-hmp_20121016/nobackup/dna-scaffolds_acc.nuc.tax',
    #                    '/local/johdro/refdata/static/ncbi-refseq-microbial_56/nobackup/dna_acc.nuc.tax'
    #                    ]

    # fastaFilePathList = ['/local/johdro/refdata/static/ncbi-genomes-bacteria_20121122/nobackup/dna_acc.nuc.fna',
    #                      '/local/johdro/refdata/static/ncbi-draftgenomes-bacteria_20121122/nobackup/dna-contigs_acc.nuc.fna',
    #                      '/local/johdro/refdata/static/ncbi-draftgenomes-bacteria_20121122/nobackup/dna-scaffolds_acc.nuc.fna',
    #                      '/local/johdro/refdata/static/ncbi-hmp_20121016/nobackup/dna-contigs_acc.nuc.fna',
    #                      '/local/johdro/refdata/static/ncbi-hmp_20121016/nobackup/dna-scaffolds_acc.nuc.fna',
    #                      '/local/johdro/refdata/static/ncbi-refseq-microbial_56/nobackup/dna_acc.nuc.fna'
    #                      ]

    # input files
    mapFilePathList = ['/net/refdata/static/nonredundant-microbial_20140513/nobackup/ncbi-genomes-bacteria_20140428/dna_acc.tax',
                        '/net/refdata/static/nonredundant-microbial_20140513/nobackup/ncbi-draftgenomes-bacteria_20140508/dna-scaffolds_acc.tax',
                        '/net/refdata/static/nonredundant-microbial_20140513/nobackup/ncbi-hmp_20131125/dna-contigs_acc.tax',
                        '/net/refdata/static/nonredundant-microbial_20140513/nobackup/ncbi-hmp_20131125/dna-scaffolds_acc.tax',
                        '/net/refdata/static/nonredundant-microbial_20140513/nobackup/ncbi-refseq-microbial_64/dna_acc.tax'
                        ]

    fastaFilePathList = ['/net/refdata/static/nonredundant-microbial_20140513/nobackup/ncbi-genomes-bacteria_20140428/dna_acc.fna',
                         '/net/refdata/static/nonredundant-microbial_20140513/nobackup/ncbi-draftgenomes-bacteria_20140508/dna-scaffolds_acc.fna',
                         '/net/refdata/static/nonredundant-microbial_20140513/nobackup/ncbi-hmp_20131125/dna-contigs_acc.fna',
                         '/net/refdata/static/nonredundant-microbial_20140513/nobackup/ncbi-hmp_20131125/dna-scaffolds_acc.fna',
                         '/net/refdata/static/nonredundant-microbial_20140513/nobackup/ncbi-refseq-microbial_64/dna_acc.fna'
                         ]

    # output dirs
    mergedDir = '/net/refdata/static/nonredundant-microbial_20140513/nobackup/merged'  # '/local/igregor/ref_20121122/nobackup/merged'
    sortedDir = '/net/refdata/static/nonredundant-microbial_20140513/nobackup/sorted'  # '/local/igregor/ref_20121122/nobackup/sorted'
    centroidsDir = '/net/refdata/static/nonredundant-microbial_20140513/nobackup/centroids'  # '/local/igregor/ref_20121122/nobackup/centroids_1_0'
    # clustersDir = '/net/refdata/static/nonredundant-microbial_20140513/nobackup/clusters'  # '/local/igregor/ref_20121122/nobackup/clusters_1_0'

    # tools
    usearch5 = '/net/metagenomics/projects/PPSmg/tools/usearch/usearch5.2.32_i86linux32'
    usearch6 = '/net/metagenomics/projects/PPSmg/tools/usearch/usearch6.0.307_i86linux32'
    # dnaClust = '/net/metagenomics/projects/PPSmg/tools/dnaclust_64bit_parallel/dnaclust'

    # merge sequences from multiple files
    if mergeS:
        mergeSequences(mapFilePathList, fastaFilePathList, mergedDir)

    if sortS:
        sortSeqDesc(usearch5, usearch6, mergedDir, sortedDir, mapFilePathList)

    # sort and cluster sequences
    if clusterS:
        toCentroids(sortedDir, centroidsDir, mapFilePathList)
        move(centroidsDir)

    if filterOutSeq:
        taxonIdSet = getAllTaxonIdSet(mapFilePathList)
        srcDir = '/net/refdata/static/nonredundant-microbial_20140513/nobackup/centroids'  # '/local/igregor/ref_20121122/nobackup/centroids_1_0'
        dstDir = '/net/refdata/static/nonredundant-microbial_20140513/nobackup/centroids_noplasmids'  # '/local/igregor/ref_20121122/nobackup/centroids_1_0_no_plasmids'
        notAllowedSet = set(csv.getColumnAsList('/net/refdata/static/nonredundant-microbial_20140513/nobackup/plasmids_accessions.txt', colNum=0))  # /local/igregor/ref_20121122/nobackup/plasmid_accessions2.txt
        filterOutSequencesBatch(taxonIdSet, srcDir, dstDir, notAllowedSet)
Example #14
def getAllTaxonIdSet(mapFilePathList):
    taxonIdSet = set()
    for mapFile in mapFilePathList:
        taxonIdSet.update(csv.getColumnAsList(mapFile, colNum=1, sep='\t'))
    return taxonIdSet
Пример #24
0
def _main():
    """ See the module description."""
    parser = argparse.ArgumentParser(description=__doc__, epilog="""""")

    parser.add_argument('-i', '--input-data-dir', action='store', nargs=1, required=True,
        help="""Directory that contains fasta files and corresponding mapping files, for each "*.tax" (or "*.csv")
                 file there must be a "*.fna" file with the same name. All files with suffix "tax" (or "*.csv")
                 will be considered. (Takes only Bacteria and Archaea)""",
        metavar='input_dir',
        dest='inDir')

    parser.add_argument('-o', '--output-dir', action='store', nargs=1, required=True,
        help='Directory that contains the output files.',
        metavar='out_dir',
        dest='outDir')

    parser.add_argument('-s', '--source-type', required=True, nargs=1, choices=["s","a"],
        help='To determine the source, use "s" for the Silva database and "a" for the Amphora database.',
        dest='srcType')

    parser.add_argument('-t', '--taxonomy-file', nargs=1, type=file, required=True,
        help='NCBI taxonomy database file in the sqlite3 format.', metavar='ncbitax_sqlite.db',
        dest='taxonomy')

    parser.add_argument('-n', '--not-considered-taxonIds', action='store', nargs=1,
        help='Comma separated leaf level or top level taxonIds (as a string) what fill be filtered out. (optional)',
        metavar='"2759,10239,77133,155900,408172,32644, 408170,433727,749907,556182,702656,410661,652676,410659,797283'\
                ',408171,703336,256318,32630,433724,766747,488339,942017,1076179,717931,455559,527640,904678,552539,'\
                '54395,198431,358574,415540,511564,369433,380357,81726,198834,271928,311313,2759,749906,1077529,'\
                '1077529,361146,511563,361147"',
        dest='filterOut')

    # parse arguments
    args = parser.parse_args()
    inDir = args.inDir[0]
    outDir =  args.outDir[0]
    srcType = args.srcType[0]
    filterOutTaxonIdsSet = set()
    try:
        if args.filterOut:
            filterOutTaxonIdsSet.update(set(map(int, str(args.filterOut[0]).split(','))))
    except:
        print('Taxon ids that are to be filtered out are in a wrong format! Comma separated integers are needed!')
        raise

    taxonomy = TaxonomyWrap(args.taxonomy[0].name)
    for dir in [inDir, outDir]:
        assert os.path.isdir(dir), 'Path: "' + dir + '" does not exists!'

    # create db for each gene
    mapDict = {}  # map: seqId -> ncbid
    for mapFilePath in glob.glob(os.path.join(os.path.normpath(inDir), r'*.[ct][sa][vx]')):  # *.csv or *.tax

        assert mapFilePath.endswith(('.csv', '.tax')), \
            'The mapping files can either end with .csv or .tax ' + mapFilePath

        base = os.path.basename(mapFilePath).rsplit('.', 1)[0]  # cut out dir path and suffix
        fastaDict = fas.fastaFileToDict(os.path.join(os.path.dirname(mapFilePath), (base + '.fna'))) # map: seqId -> seq
        print("Processing: %s seq count: %s" % (base, str(len(fastaDict))))

        if 'a' in srcType:  # Amphora
            mapDict = {}
            for k in csv.getColumnAsList(mapFilePath, colNum=0, sep='\t'):
                v =  int(k.rsplit('|', 1)[1].split(':')[1]) # get ncbid
                assert ((k not in mapDict) or (mapDict[k] == v)), str(
                    'There are at least two different values for key: ' + str(k) + ' in ' + mapFilePath)
                mapDict[k] = v
        elif 's' in srcType:  # Silva
            mapTmp = csv.getMapping(mapFilePath, 0, 2, '\t')
            mapDict = {}
            for k, v in mapTmp.iteritems():
                mapDict[k] = int(v[0])
        else:
            assert False, 'Unsupported source type!'

        # same number of entries in both files (fasta and mapping) ?
        if len(mapDict) != len(fastaDict):
            print(str('%s: The mapping file and the corresponding fasta file have different number of entries: ' +
                      '"%s" "%s" these files will be skipped!') % (base, str(len(mapDict)), str(len(fastaDict))))
            continue

        # are duplicates in the mapping file ?
        count = len(csv.getColumnAsList(mapFilePath))
        if len(mapDict) != count:
            print('%s: The mapping file contains duplicate keys! unique: %s total: %s' % (
                base, str(len(mapDict)), str(count)))

        # store data to the output directory
        outDna = csv.OutFileBuffer(os.path.join(outDir, str(base + '.fna')))
        outTax = csv.OutFileBuffer(os.path.join(outDir, str(base + '.tax')))
        count = 0
        filteredLeaf = 0
        filteredSup = 0
        notMapped = 0
        noBacArch = 0
        for seqId, taxonId in mapDict.iteritems():
            if taxonId in filterOutTaxonIdsSet:
                filteredLeaf += 1
                continue
            path = taxonomy.getPathToRoot(taxonId)
            if path is None:
                print('Could not find taxonId: %s for seqId: %s; record skipped!' % (str(taxonId), seqId))
                notMapped += 1
                continue
            topLevel = int(path.split(';', 1)[0])
            if topLevel in filterOutTaxonIdsSet:
                filteredSup += 1
                continue
            if topLevel not in [2, 2157]:  # Bacteria, Archaea
                noBacArch += 1
                print('NoBactArch: %s' % topLevel)

            seq = fastaDict[seqId]
            if 'a' in srcType:  # Amphora
                id = seqId
            elif 's' in srcType:  # Silva
                id = str(seqId + '|ncbid:' + str(taxonId))

            outTax.writeText(str(id + '\t' + path + '\n'))
            outDna.writeText(str('>' + id + '\n' + seq + '\n'))
            count += 1

        outDna.close()
        outTax.close()
        print('Stored entries: %s, filtered out: %s at leaf level, %s at top level, not mapped: %s' %
              (count, filteredLeaf, filteredSup, notMapped))
        if noBacArch > 0:
            print('WARN: stored %s non-Bacterial and non-Archaeal sequences!' % noBacArch)

    # Silva:
    # -i /Users/ivan/Documents/work/binning/database/silva111/arbGenerated -s s -t /Users/ivan/Documents/work/binning/taxonomy/20121122/ncbitax_sqlite.db
    # -o /Users/ivan/Documents/work/binning/database/silva111/db -n ...

    # Amphora:
    # -i /Users/ivan/Documents/work/binning/database/markerGenes3/mGenesExtracted -s a -t /Users/ivan/Documents/work/binning/taxonomy/20121122/ncbitax_sqlite.db
    # -o /Users/ivan/Documents/work/binning/database/markerGenes3/db

    taxonomy.close()
    print('done')
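
For reference, a minimal sketch (hypothetical seqId, taxon id, and sequence; the semicolon-separated root-to-leaf path format is an assumption based on the split(';', 1) call above) of one record pair the Silva branch writes:

recId = 'ARB_XYZ' + '|ncbid:' + str(511145)        # hypothetical Silva seqId and ncbi taxon id
path = '2;1224;1236;543;561;562'                   # assumed path string from taxonomy.getPathToRoot
taxLine = str(recId + '\t' + path + '\n')          # one line of the <base>.tax output
fnaLine = str('>' + recId + '\n' + 'ACGT' + '\n')  # one record of the <base>.fna output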
Example #25
0
def computeTrainingAccuracy(workingDir, taWorkingDir, sampleSpecificDir, ppsTrainDataDir, outputDir, ppsInstallDir,
                            ppsScripts, ppsConfigFilePath, predictLogFileName, modelTaxonIdFilePath, databaseFile):
    """
        Computes the training accuracy for the PPS training data.
        This function doesn't consider training data used to train intermediate (misc) nodes.
        The training data corresponding to the sample-specific data are fragmented (via PPS) and
        contained in the training data of different lengths.

        @param workingDir: working directory of the PPS+ pipeline
        @param taWorkingDir: working directory for the accuracy computation
        @param sampleSpecificDir: directory containing the sample specific data
        @param ppsTrainDataDir: directory 'sampled_fasta' containing PPS training data
        @param outputDir: directory for output files
        @param ppsInstallDir: directory where PPS is installed
        @param ppsScripts: directory containing PPS scripts
        @param ppsConfigFilePath: the PPS configuration file
        @param predictLogFileName: logging file for PPS prediction
        @param modelTaxonIdFilePath: file containing all leaf ncbi taxon ids that are modelled
        @param databaseFile: ncbi taxonomy file in the sqlite3 format
    """
    for d in [workingDir, taWorkingDir, sampleSpecificDir,
              ppsTrainDataDir, outputDir, ppsInstallDir, ppsScripts, os.path.dirname(predictLogFileName)]:
        assert os.path.isdir(d), "Directory '%s' doesn't exist!" % d
    for f in [ppsConfigFilePath, databaseFile, modelTaxonIdFilePath]:
        assert os.path.isfile(f), "File '%s' doesn't exist!" % f

    # all directories that contain PPS training data
    trainDirList = [sampleSpecificDir]
    for d in os.listdir(ppsTrainDataDir):
        trainDirList.append(os.path.join(ppsTrainDataDir, d))

    # fasta file with all training sequences
    allTrainFastaFile = os.path.join(taWorkingDir, 'all_train_data.fna')
    out = csv.OutFileBuffer(allTrainFastaFile)
    seqIdToTruePred = {}

    # merge all training fasta files to one fasta file
    for d in trainDirList:
        dName = os.path.basename(d)
        for f in os.listdir(d):
            taxonId = int(os.path.basename(f).rsplit('.', 2)[0])
            for seqId, seq in fasta.fastaFileToDict(os.path.join(d, f)).iteritems():
                if d == sampleSpecificDir:
                    #label = int(str(str(seqId).rsplit('|', 1)[1]).split(':', 1)[1])
                    id = str(taxonId) + '|' + dName + '|' + seqId + '|label:' + str(taxonId)
                else:
                    id = str(taxonId) + '|' + dName + '|' + seqId
                out.writeText('>' + id + '\n' + seq + '\n')
                seqIdToTruePred[id] = taxonId
    out.close()
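    # Illustration (hypothetical values): a sample-specific sequence gets a composite id
    # such as '511145|sample_spec_dir|contig7|label:511145'; the 'label:' suffix is what
    # rsplit('|', 1)[1].split(':', 1)[1] recovers as the true taxon id further below.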

    # predict the merged file using the generated model
    if os.name == 'posix':
        predictCmd = str(os.path.join(ppsScripts, 'predict.rb') + ' ' + allTrainFastaFile + ' ' + ppsConfigFilePath)
        #print(predictCmd)
        logOut = open(predictLogFileName, 'w')
        predictProc = subprocess.Popen(predictCmd, shell=True, bufsize=-1, cwd=ppsInstallDir, stdout=logOut,
                                       stderr=subprocess.STDOUT)
        predictProc.wait()
        logOut.close()
        if predictProc.returncode != 0:
            raise Exception("PPS 'predict' on the training data returned a non-zero status: %s, cmd: %s" %
                            (predictProc.returncode, predictCmd))
    else:
        print("Can't run PPS on a non-posix system!")
        return

    # read in predicted train data
    seqIdToPred = csv.predToDict(allTrainFastaFile + '.nox.fna.out')

    # read fasta file
    seqIdToBp = fasta.getSequenceToBpDict(allTrainFastaFile)

    # leaf taxonIds that are modelled
    modelLeafTaxonIds = set(map(int, csv.getColumnAsList(modelTaxonIdFilePath)))

    taxonomyS = taxonomy_ncbi.TaxonomyNcbi(databaseFile, considerNoRank=True)
    notLeafTaxonIds = set()
    for id in modelLeafTaxonIds:
        notLeafTaxonIds.update(set(map(int, (taxonomyS.getParentsNcbidSet(id)))))
    taxonomyS.close()

    # keep only sequences whose true taxonId is defined at a modelled leaf level or lower
    seqIdToBp2 = {}
    seqIdToPred2 = {}
    seqIdToTruePred2 = {}
    seqIdToBpMisc = {}
    seqIdToPredMisc = {}
    seqIdToTruePredMisc = {}
    for seqId, bp in seqIdToBp.iteritems():
        label = int(str(str(seqId).rsplit('|', 1)[1]).split(':', 1)[1])
        if label not in notLeafTaxonIds:
            seqIdToBp2[seqId] = bp
            seqIdToPred2[seqId] = seqIdToPred[seqId]
            seqIdToTruePred2[seqId] = seqIdToTruePred[seqId]
        else:
            seqIdToBpMisc[seqId] = bp
            seqIdToPredMisc[seqId] = seqIdToPred[seqId]
            seqIdToTruePredMisc[seqId] = seqIdToTruePred[seqId]
    seqIdToBp = seqIdToBp2
    seqIdToPred = seqIdToPred2
    seqIdToTruePred = seqIdToTruePred2

    # accuracy for all, filter out sample specific data (whole length)
    seqIdToBpNoSampleSpec = {}
    for seqId, bp in seqIdToBp.iteritems():
        if str(seqId).split('|', 2)[1].strip() != os.path.basename(sampleSpecificDir).strip():
            seqIdToBpNoSampleSpec[seqId] = bp

    acc = accuracy.Accuracy(seqIdToBpNoSampleSpec, seqIdToPred, seqIdToTruePred, databaseFile)
    out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_all.txt'))
    out.writeText(acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                                       minFracClade=None, minFracPred=None, overview=True))
    out.close()
    taxonomyA = acc.getTaxonomy()
    acc.close(closeTaxonomy=False)

    # accuracy for (misc) nodes
    acc = accuracy.Accuracy(seqIdToBpMisc, seqIdToPredMisc, seqIdToTruePredMisc, taxonomyA)
    out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_misc.txt'))
    out.writeText(acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                                       minFracClade=None, minFracPred=None, overview=True))
    out.close()
    acc.close(closeTaxonomy=False)

    # generate the confusion matrices (for the "for all" scenario)
    cm = confusion_matrix.ConfusionMatrix(seqIdToBp, seqIdToPred, seqIdToTruePred, databaseFile,
                                          taxonomy_ncbi.TAXONOMIC_RANKS[1:])
    for rank in taxonomy_ncbi.TAXONOMIC_RANKS[1:]:
        cm.generateConfusionMatrix(rank, os.path.join(outputDir, 'train_accuracy_cmp_all'))
    taxonomyCM = cm.getTaxonomy()
    cm.close(closeTaxonomy=False)

    # accuracy for individual directories (seq lengths)
    # (the sample specific fragments are among PPS sampled fasta)
    for d in trainDirList:
        dName = os.path.basename(d)
        seqIdToBpSub = {}
        seqIdToPredSub = {}
        seqIdToTruePredSub = {}
        for seqId, bp in seqIdToBp.iteritems():
            if str(seqId).split('|', 2)[1].strip() == str(dName).strip():
                seqIdToBpSub[seqId] = seqIdToBp[seqId]
                seqIdToPredSub[seqId] = seqIdToPred[seqId]
                seqIdToTruePredSub[seqId] = seqIdToTruePred[seqId]

        # accuracy
        acc = accuracy.Accuracy(seqIdToBpSub, seqIdToPredSub, seqIdToTruePredSub, taxonomyA)
        out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_' + dName + '.txt'))
        out.writeText(acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                                           minFracClade=None, minFracPred=None, overview=True))

        # confusion matrices
        cm = confusion_matrix.ConfusionMatrix(seqIdToBpSub, seqIdToPredSub, seqIdToTruePredSub, taxonomyCM,
                                              taxonomy_ncbi.TAXONOMIC_RANKS[1:])
        for rank in taxonomy_ncbi.TAXONOMIC_RANKS[1:]:
            cm.generateConfusionMatrix(rank, os.path.join(outputDir, 'train_accuracy_cmp_' + dName))
        cm.close(closeTaxonomy=False)

        out.close()
        acc.close(closeTaxonomy=False)
    taxonomyA.close()
    taxonomyCM.close()
Example #26
0
def getAllTaxonIdSet(mapFilePathList):
    taxonIdSet = set()
    for mapFile in mapFilePathList:
        taxonIdSet = taxonIdSet.union(
            set(csv.getColumnAsList(mapFile, colNum=1, sep='\t')))
    return taxonIdSet
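
Since csv.getColumnAsList returns the column entries as strings (the other examples wrap it in map(int, ...)), the returned set holds taxon ids as strings. A minimal usage sketch (hypothetical file paths):

taxonIdSet = getAllTaxonIdSet(['/tmp/a.tax', '/tmp/b.tax'])  # hypothetical mapping files
taxonIdIntSet = set(map(int, taxonIdSet))  # convert if integer taxon ids are required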
Example #27
0
def _main():
    mergeS = False
    sortS = False
    clusterS = False
    filterOutSeq = True  # this is optional, e.g. to remove plasmids

    # handle broken pipes
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)

    #printStatDbk()
    #checkForPlasmids()

    # mapFilePathList = ['/local/johdro/refdata/static/ncbi-genomes-bacteria_20121122/nobackup/dna_acc.nuc.tax',
    #                    '/local/johdro/refdata/static/ncbi-draftgenomes-bacteria_20121122/nobackup/dna-contigs_acc.nuc.tax',
    #                    '/local/johdro/refdata/static/ncbi-draftgenomes-bacteria_20121122/nobackup/dna-scaffolds_acc.nuc.tax',
    #                    '/local/johdro/refdata/static/ncbi-hmp_20121016/nobackup/dna-contigs_acc.nuc.tax',
    #                    '/local/johdro/refdata/static/ncbi-hmp_20121016/nobackup/dna-scaffolds_acc.nuc.tax',
    #                    '/local/johdro/refdata/static/ncbi-refseq-microbial_56/nobackup/dna_acc.nuc.tax'
    #                    ]

    # fastaFilePathList = ['/local/johdro/refdata/static/ncbi-genomes-bacteria_20121122/nobackup/dna_acc.nuc.fna',
    #                      '/local/johdro/refdata/static/ncbi-draftgenomes-bacteria_20121122/nobackup/dna-contigs_acc.nuc.fna',
    #                      '/local/johdro/refdata/static/ncbi-draftgenomes-bacteria_20121122/nobackup/dna-scaffolds_acc.nuc.fna',
    #                      '/local/johdro/refdata/static/ncbi-hmp_20121016/nobackup/dna-contigs_acc.nuc.fna',
    #                      '/local/johdro/refdata/static/ncbi-hmp_20121016/nobackup/dna-scaffolds_acc.nuc.fna',
    #                      '/local/johdro/refdata/static/ncbi-refseq-microbial_56/nobackup/dna_acc.nuc.fna'
    #                      ]

    # input files
    mapFilePathList = [
        '/net/refdata/static/nonredundant-microbial_20140513/nobackup/ncbi-genomes-bacteria_20140428/dna_acc.tax',
        '/net/refdata/static/nonredundant-microbial_20140513/nobackup/ncbi-draftgenomes-bacteria_20140508/dna-scaffolds_acc.tax',
        '/net/refdata/static/nonredundant-microbial_20140513/nobackup/ncbi-hmp_20131125/dna-contigs_acc.tax',
        '/net/refdata/static/nonredundant-microbial_20140513/nobackup/ncbi-hmp_20131125/dna-scaffolds_acc.tax',
        '/net/refdata/static/nonredundant-microbial_20140513/nobackup/ncbi-refseq-microbial_64/dna_acc.tax'
    ]

    fastaFilePathList = [
        '/net/refdata/static/nonredundant-microbial_20140513/nobackup/ncbi-genomes-bacteria_20140428/dna_acc.fna',
        '/net/refdata/static/nonredundant-microbial_20140513/nobackup/ncbi-draftgenomes-bacteria_20140508/dna-scaffolds_acc.fna',
        '/net/refdata/static/nonredundant-microbial_20140513/nobackup/ncbi-hmp_20131125/dna-contigs_acc.fna',
        '/net/refdata/static/nonredundant-microbial_20140513/nobackup/ncbi-hmp_20131125/dna-scaffolds_acc.fna',
        '/net/refdata/static/nonredundant-microbial_20140513/nobackup/ncbi-refseq-microbial_64/dna_acc.fna'
    ]

    # output dirs
    mergedDir = '/net/refdata/static/nonredundant-microbial_20140513/nobackup/merged'  # '/local/igregor/ref_20121122/nobackup/merged'
    sortedDir = '/net/refdata/static/nonredundant-microbial_20140513/nobackup/sorted'  # '/local/igregor/ref_20121122/nobackup/sorted'
    centroidsDir = '/net/refdata/static/nonredundant-microbial_20140513/nobackup/centroids'  # '/local/igregor/ref_20121122/nobackup/centroids_1_0'
    # clustersDir = '/net/refdata/static/nonredundant-microbial_20140513/nobackup/clusters'  # '/local/igregor/ref_20121122/nobackup/clusters_1_0'

    # tools
    usearch5 = '/net/metagenomics/projects/PPSmg/tools/usearch/usearch5.2.32_i86linux32'
    usearch6 = '/net/metagenomics/projects/PPSmg/tools/usearch/usearch6.0.307_i86linux32'
    # dnaClust = '/net/metagenomics/projects/PPSmg/tools/dnaclust_64bit_parallel/dnaclust'

    # merge sequences from multiple files
    if mergeS:
        mergeSequences(mapFilePathList, fastaFilePathList, mergedDir)

    if sortS:
        sortSeqDesc(usearch5, usearch6, mergedDir, sortedDir, mapFilePathList)

    # sort and cluster sequences
    if clusterS:
        toCentroids(sortedDir, centroidsDir, mapFilePathList)
        move(centroidsDir)

    if filterOutSeq:
        taxonIdSet = getAllTaxonIdSet(mapFilePathList)
        srcDir = '/net/refdata/static/nonredundant-microbial_20140513/nobackup/centroids'  # '/local/igregor/ref_20121122/nobackup/centroids_1_0'
        dstDir = '/net/refdata/static/nonredundant-microbial_20140513/nobackup/centroids_noplasmids'  # '/local/igregor/ref_20121122/nobackup/centroids_1_0_no_plasmids'
        notAllowedSet = set(
            csv.getColumnAsList(
                '/net/refdata/static/nonredundant-microbial_20140513/nobackup/plasmids_accessions.txt',
                colNum=0)
        )  # /local/igregor/ref_20121122/nobackup/plasmid_accessions2.txt
        filterOutSequencesBatch(taxonIdSet, srcDir, dstDir, notAllowedSet)
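
The boolean flags at the top of _main act as a simple stage switchboard. For instance, to re-run only the clustering stage on already sorted data, one would set (sketch):

mergeS, sortS, clusterS, filterOutSeq = False, False, True, False  # enable only clustering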
Example #28
0
def maskDb(action, inDir, outDir, rank, clades, taxonomyFilePath, verbose=False):
    """
        Main function (function interface), see module description.

        @param action: one action that will be performed [cl, mr, mg] ~ (generate list, mask seq, mask mg)
        @type action: str
        @param inDir: directory containing input files
        @type inDir: str
        @param outDir: directory containing output files
        @type outDir: str
        @param rank: the data will be excluded at this rank
        @type rank: str
        @param clades: a file containing clades that will be masked (one ncbi taxon id per line),
            or a set of ncbi taxon ids that will be masked
        @type clades: file or set of int
        @param taxonomyFilePath: taxonomy database file in the sqlite3 format
        @type taxonomyFilePath: str
    """
    # check input parameters
    assert action in ['cl', 'mr', 'mg'], str('Given action is not supported: ' + action)
    if action == 'mr':
        assert os.name == 'posix', 'Symbolic links can be created only on posix systems, action "mr" is not valid!'
    for dir in [inDir, outDir]:
        assert os.path.isdir(dir), str("Directory doesn't exist: " + dir)
    assert rank in _RANKS, str('Unsupported rank: ' + rank)
    assert os.path.isfile(taxonomyFilePath), str("Taxonomy database file doesn't exist: " + taxonomyFilePath)
    assert isinstance(clades, set) or (isinstance(clades, str) and os.path.isfile(clades)), str(
        "Parameter 'clades' can be either a file or a set of ncbi taxonIds to be excluded.")

    # maps a rank to a lower rank
    toLowerRank = {}
    for i in range(1, len(_RANKS)):
        toLowerRank[_RANKS[i-1]] = _RANKS[i]
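    # E.g. assuming _RANKS == ['superkingdom', 'phylum', 'class', ...], this gives
    # toLowerRank['superkingdom'] == 'phylum', toLowerRank['phylum'] == 'class', etc.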

    taxonomy = _TaxonomyWrapMD(taxonomyFilePath)

    # leaf clades to mask
    if isinstance(clades, set):
        inCladesSet = set(map(int, clades))
    else:
        inCladesSet = set(map(int, csv.getColumnAsList(clades)))

    # clades in the reference
    refCladesSet = set()
    if action in ['cl', 'mr']:
        # get the list of all taxon ids that appear in the directory (as PPS reference)
        for fastaFilePath in glob.glob(os.path.join(os.path.normpath(inDir), r'*.f[na][as]')):  # *.fas or *.fna
            refCladesSet.add(_refFilePathToTaxonId(fastaFilePath))  # taxonId.1.fna or taxonId.1.fas
    elif action in ['mg']:
        # get the list of all taxon ids that appear in any file in the input directory as taxonomy ".tax"
        for mapFilePath in glob.glob(os.path.join(os.path.normpath(inDir), r'*.tax')):  # *.tax
            refCladesSet.update(set(map(_mgSeqIdToTaxonId, csv.getColumnAsList(mapFilePath, sep='\t'))))
    else:
        assert False, str('Unsupported action: ' + action)

    # checks whether taxonIds are in the taxonomy
    for taxonId in inCladesSet:
        assert taxonomy.exists(taxonId), str(
            'taxonId: %s from clades list is not contained in the taxonomy!' % taxonId)
    for taxonId in refCladesSet:
        assert taxonomy.exists(taxonId), str(
            'taxonId: %s from the reference is not contained in the taxonomy!' % taxonId)

    # checks whether the taxonIds are leaves (they don't have to be, unless you want to mask at the strain level)
    for taxonId in inCladesSet:
        if not taxonomy.isLeaf(taxonId):
            print('Taxon id %s does not represent a leaf clade in the taxonomy.' % taxonId)

    if verbose:
        print('Initial checks done.')

    # taxonIds that should be excluded
    toExcludeSet = set()
    for taxonId in inCladesSet:
        taxonIdAtRank = taxonomy.getTaxonIdAtRank(taxonId, rank)
        if taxonIdAtRank is None:  # the lineage is not defined at this rank! try a lower rank!
            print('Taxon id: "%s" is not defined at rank: "%s"' % (taxonId, rank))
            currentRank = rank  # find a lower rank at which it's defined
            while currentRank in toLowerRank:
                currentRank = toLowerRank[currentRank]
                taxonIdAtRank = taxonomy.getTaxonIdAtRank(taxonId, currentRank)
                if taxonIdAtRank is not None:
                    break
            if taxonIdAtRank is None:
                taxonIdAtRank = taxonId
                currentRank = _STRAIN
            print('Taxon id: %s will be masked at rank: %s' % (taxonId, currentRank))

        # all child clades (and itself)
        toExcludeSet.add(int(taxonIdAtRank))
        toExcludeSet.update(set(map(int, taxonomy.getAllChildren(taxonIdAtRank))))

    # all clades that should be excluded (there is at least one sequence for each taxonId in the reference)
    toExcludeSet.intersection_update(refCladesSet)
    if verbose:
        print('Collecting data to mask done.')

    print('To exclude: %s' % len(toExcludeSet))

    # exclude data from the reference
    if action == 'cl':
        # generates a list of taxonIds
        out = csv.OutFileBuffer(os.path.join(outDir, 'exclude_list.txt'))
        for taxonId in toExcludeSet:
            out.writeText(str(taxonId) + '\n')
        out.close()
    elif action == 'mr':
        # mask reference sequences (create symlinks to files that were not excluded)
        for fastaFilePath in glob.glob(os.path.join(os.path.normpath(inDir), r'*.f[na][as]')):  # *.fas or *.fna
            taxonId = _refFilePathToTaxonId(fastaFilePath)  # taxonId.1.fna or taxonId.1.fas
            if taxonId not in toExcludeSet:
                # assert os.name == 'posix'
                os.symlink(fastaFilePath, os.path.join(outDir, os.path.basename(fastaFilePath)))
    elif action == 'mg':
        # exclude sequences from the marker gene databases
        for mapFilePath in glob.glob(os.path.join(os.path.normpath(inDir), r'*.tax')):

            # get entries that can stay in the mapping and fasta files
            allowedEntriesSet = set(map(_mgSeqIdToTaxonId, csv.getColumnAsList(mapFilePath, sep='\t')))
            allowedEntriesSet.difference_update(toExcludeSet)

            # filter out entries from the mapping file
            csv.filterOutLines(mapFilePath, os.path.join(outDir, os.path.basename(mapFilePath)),
                               allowedEntriesSet, entryModifyFunction=_mgSeqIdToTaxonId, colNum=0, sep='\t')

            # filter out entries from the fasta file
            fastaFilePath = str(mapFilePath.rsplit('.', 1)[0] + '.fna')
            fas.filterOutSequences(fastaFilePath, os.path.join(outDir, os.path.basename(fastaFilePath)),
                                   allowedEntriesSet, seqNameModifyFunction=_mgSeqIdToTaxonId)
    else:
        assert False, 'Unsupported action!'

    taxonomy.close()
    if verbose:
        print('Data masking done.')
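
A minimal invocation sketch (hypothetical paths and taxon id; 'genus' is assumed to be contained in _RANKS), generating only the exclusion list via action 'cl':

maskDb('cl', '/data/refSeqDir', '/data/maskedOut', 'genus',
       set([511145]), '/data/ncbitax_sqlite.db', verbose=True)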