def filterOutSequencesBatch(taxonIdSet, srcDir, dstDir, notAllowedSeqIdSet):
    """
        For each taxon id in taxonIdSet, filters the fasta file "<taxonId>.1.fna" from directory srcDir
        into a file of the same name in directory dstDir. The sequence ids that are contained in
        notAllowedSeqIdSet are not passed on as allowed sequence names.

        @param taxonIdSet: taxon ids; each one identifies a file named "<taxonId>.1.fna" in srcDir
        @param srcDir: directory containing the source fasta files
        @param dstDir: directory in which the filtered fasta files are written
        @param notAllowedSeqIdSet: sequence ids that are excluded from the set of allowed names
    """
    for taxonId in taxonIdSet:
        fileName = str(taxonId) + '.1.fna'
        srcFilePath = os.path.join(srcDir, fileName)
        dstFilePath = os.path.join(dstDir, fileName)

        # Iterating the mapping directly yields its keys; unlike iterkeys() this works in Python 2 and 3.
        seqIdDict = fasta.getSequenceToBpDict(srcFilePath)
        allowedNamesSet = set(seqId for seqId in seqIdDict if seqId not in notAllowedSeqIdSet)

        # NOTE(review): fasta.filterOutSequences presumably keeps only the sequences named in
        # allowedNamesSet - confirm against its implementation.
        fasta.filterOutSequences(srcFilePath, dstFilePath, allowedNamesSet)
def filterOutSequencesBatch(taxonIdSet, srcDir, dstDir, notAllowedSeqIdSet):
    """
        For each taxon id in taxonIdSet, filters the fasta file "<taxonId>.1.fna" from directory srcDir
        into a file of the same name in directory dstDir. The sequence ids that are contained in
        notAllowedSeqIdSet are not passed on as allowed sequence names.

        @param taxonIdSet: taxon ids; each one identifies a file named "<taxonId>.1.fna" in srcDir
        @param srcDir: directory containing the source fasta files
        @param dstDir: directory in which the filtered fasta files are written
        @param notAllowedSeqIdSet: sequence ids that are excluded from the set of allowed names
    """
    for taxonId in taxonIdSet:
        fileName = str(taxonId) + '.1.fna'
        srcFilePath = os.path.join(srcDir, fileName)
        dstFilePath = os.path.join(dstDir, fileName)

        # Iterating the mapping directly yields its keys; unlike iterkeys() this works in Python 2 and 3.
        seqIdDict = fasta.getSequenceToBpDict(srcFilePath)
        allowedNamesSet = set(seqId for seqId in seqIdDict if seqId not in notAllowedSeqIdSet)

        # NOTE(review): fasta.filterOutSequences presumably keeps only the sequences named in
        # allowedNamesSet - confirm against its implementation.
        fasta.filterOutSequences(srcFilePath, dstFilePath, allowedNamesSet)
def filterSequences(
        inFileName='/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000.fna',
        outFileName='/net/metagenomics/projects/PPSmg/data/V35/nostocRemoved/contigsMappedBlast1000NostocRm.fna',
        mapFileName='/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000Labels.txt',
        labelRemove=103690):
    """
        To filter sequences with a specific label.

        All sequence ids whose label differs from labelRemove are collected and passed as the allowed names
        to fas.filterOutSequences. Called without arguments, the function uses the original hard-coded
        files and label.

        @param inFileName: input fasta file
        @param outFileName: output fasta file
        @param mapFileName: tab separated mapping file, lines starting with '#' are treated as comments
        @param labelRemove: label of the sequences that are left out of the allowed names (compared as int)
    """
    # label -> sequence ids carrying that label
    # NOTE(review): presumably column 1 of the mapping file is the label (key) and column 0 the
    # sequence id (value) - confirm against csv.getMapping.
    labelToIdsDict = csv.getMapping(mapFileName, 1, 0, sep='\t', comment='#')

    labelRemove = int(labelRemove)  # convert once, outside of the loop
    allowedNamesSet = set()
    for label in labelToIdsDict:
        if int(label) != labelRemove:
            allowedNamesSet.update(labelToIdsDict[label])

    fas.filterOutSequences(inFileName, outFileName, allowedNamesSet)
def filterSequences(
        inFileName='/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000.fna',
        outFileName='/net/metagenomics/projects/PPSmg/data/V35/nostocRemoved/contigsMappedBlast1000NostocRm.fna',
        mapFileName='/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000Labels.txt',
        labelRemove=103690):
    """
        To filter sequences with a specific label.

        All sequence ids whose label differs from labelRemove are collected and passed as the allowed names
        to fas.filterOutSequences. Called without arguments, the function uses the original hard-coded
        files and label.

        @param inFileName: input fasta file
        @param outFileName: output fasta file
        @param mapFileName: tab separated mapping file, lines starting with '#' are treated as comments
        @param labelRemove: label of the sequences that are left out of the allowed names (compared as int)
    """
    # label -> sequence ids carrying that label
    # NOTE(review): presumably column 1 of the mapping file is the label (key) and column 0 the
    # sequence id (value) - confirm against csv.getMapping.
    labelToIdsDict = csv.getMapping(mapFileName, 1, 0, sep='\t', comment='#')

    labelRemove = int(labelRemove)  # convert once, outside of the loop
    allowedNamesSet = set()
    for label in labelToIdsDict:
        if int(label) != labelRemove:
            allowedNamesSet.update(labelToIdsDict[label])

    fas.filterOutSequences(inFileName, outFileName, allowedNamesSet)
def maskDb(action, inDir, outDir, rank, clades, taxonomyFilePath, verbose=False):
    """
        Main function (function interface), see module description.

        Collects the taxon ids to be masked: each given clade is lifted to the given rank, all its child
        clades are added, and the result is restricted to the taxon ids found in the input directory.
        Depending on the action, the taxon ids are then written to the file "exclude_list.txt" ('cl'),
        symbolic links are created to the reference files that are not masked ('mr'), or the marker gene
        mapping (*.tax) and the corresponding fasta (*.fna) files are filtered ('mg').

        @param action: one action that will be performed [cl, mr, mg] ~ (generate list, mask seq, mask mg)
        @type action: str
        @param inDir: directory containing input files
        @type inDir: str
        @param outDir: directory containing output files
        @type outDir: str
        @param rank: the data will be excluded at this rank
        @type rank: str
        @param clades: a file containing clades that will be masked (one ncbi taxon id at a line),
            or a set of ncbi taxon ids that will be masked
        @type clades: str (file path) or set of int
        @param taxonomyFilePath: taxonomy database file in the sqlite3 format
        @type taxonomyFilePath: str
        @param verbose: whether progress messages are printed
        @type verbose: bool

        @raise AssertionError: if an input parameter is not valid or a taxon id is not contained in the taxonomy
    """
    # check input parameters
    assert action in ['cl', 'mr', 'mg'], str('Given action is not supported: ' + action)
    if action == 'mr':
        assert os.name == 'posix', 'Symbolic links can be created only on posix systems, action "mr" is not valid!'
    for dirPath in [inDir, outDir]:
        assert os.path.isdir(dirPath), str("Directory doesn't exists: " + dirPath)
    assert rank in _RANKS, str('Not supported rank: ' + rank)
    assert os.path.isfile(taxonomyFilePath), str("Taxonomy database file doesn't exist: " + taxonomyFilePath)
    assert isinstance(clades, set) or (isinstance(clades, str) and os.path.isfile(clades)), str(
        "Parameter 'clades' can be either a file or a set of ncbi taxonIds to be excluded.")

    # maps a rank to a lower rank (i.e. to the next entry of _RANKS)
    toLowerRank = dict((_RANKS[i - 1], _RANKS[i]) for i in range(1, len(_RANKS)))

    # glob patterns of the input files
    inDirNorm = os.path.normpath(inDir)
    # intended for *.fas or *.fna (the character classes also match *.faa and *.fns)
    fastaPattern = os.path.join(inDirNorm, r'*.f[na][as]')
    taxPattern = os.path.join(inDirNorm, r'*.tax')

    taxonomy = _TaxonomyWrapMD(taxonomyFilePath)
    try:
        # leaf clades to mask
        if isinstance(clades, set):
            inCladesSet = set(map(int, clades))
        else:
            inCladesSet = set(map(int, csv.getColumnAsList(clades)))

        # clades in the reference
        refCladesSet = set()
        if action in ['cl', 'mr']:
            # get the list of all taxon ids that appear in the directory (as PPS reference)
            for fastaFilePath in glob.glob(fastaPattern):
                refCladesSet.add(_refFilePathToTaxonId(fastaFilePath))  # taxonId.1.fna or taxonId.1.fas
        elif action in ['mg']:
            # get the list of all taxon ids that appear in any file in the input directory as taxonomy ".tax"
            for mapFilePath in glob.glob(taxPattern):
                refCladesSet.update(set(map(_mgSeqIdToTaxonId, csv.getColumnAsList(mapFilePath, sep='\t'))))
        else:
            assert False, str('Not supported action: ' + action)

        # checks whether taxonIds are in the taxonomy
        for taxonId in inCladesSet:
            assert taxonomy.exists(taxonId), str(
                'taxonId: %s from clades list is not contained in the taxonomy!' % taxonId)
        for taxonId in refCladesSet:
            assert taxonomy.exists(taxonId), str(
                'taxonId: %s from the reference is not contained in the taxonomy!' % taxonId)

        # checks whether the taxonIds are leafs (doesn't have to be (unless you want to mask at the strain level))
        for taxonId in inCladesSet:
            if not taxonomy.isLeaf(taxonId):
                print('Taxon id %s does not represent a leaf clade in the taxonomy.' % taxonId)

        if verbose:
            print('Initial checks done.')

        # taxonIds that should be excluded
        toExcludeSet = set()
        for taxonId in inCladesSet:
            taxonIdAtRank = taxonomy.getTaxonIdAtRank(taxonId, rank)
            if taxonIdAtRank is None:  # the lineage is not defined at this rank ! try a lower rank !
                print('Taxon id: "%s" is not defined at rank: "%s"' % (taxonId, rank))
                currentRank = rank  # find a lower rank at which it's defined
                while currentRank in toLowerRank:
                    currentRank = toLowerRank[currentRank]
                    taxonIdAtRank = taxonomy.getTaxonIdAtRank(taxonId, currentRank)
                    if taxonIdAtRank is not None:
                        break
                if taxonIdAtRank is None:
                    # not defined at any lower rank either, mask the taxon itself
                    taxonIdAtRank = taxonId
                    currentRank = _STRAIN
                print('Taxon id: %s will be masked at rank: %s' % (taxonId, currentRank))

            # all child clades (and itself)
            toExcludeSet.add(int(taxonIdAtRank))
            toExcludeSet.update(set(map(int, taxonomy.getAllChildren(taxonIdAtRank))))

        # all clades that should be excluded (there is at least one sequence for each taxonId in the reference)
        toExcludeSet.intersection_update(refCladesSet)

        if verbose:
            print('Data to mask collected done.')
            print('To exclude: ', len(toExcludeSet))

        # exclude data from the reference
        if action == 'cl':
            # generates a list of taxonIds
            out = csv.OutFileBuffer(os.path.join(outDir, 'exclude_list.txt'))
            try:
                for taxonId in toExcludeSet:
                    out.writeText(str(taxonId) + '\n')
            finally:
                # close the output buffer even if writing fails
                out.close()

        elif action == 'mr':
            # masked reference sequences (create sim links to files that were not excluded);
            # that the system is posix was asserted in the parameter checks above
            for fastaFilePath in glob.glob(fastaPattern):
                taxonId = _refFilePathToTaxonId(fastaFilePath)  # taxonId.1.fna or taxonId.1.fas
                if taxonId not in toExcludeSet:
                    os.symlink(fastaFilePath, os.path.join(outDir, os.path.basename(fastaFilePath)))

        elif action == 'mg':
            # exclude sequences from the marker gene databases
            for mapFilePath in glob.glob(taxPattern):

                # get entries that can stay in the mapping and fasta files
                allowedEntriesSet = set(map(_mgSeqIdToTaxonId, csv.getColumnAsList(mapFilePath, sep='\t')))
                allowedEntriesSet.difference_update(toExcludeSet)

                # filter out entries from the mapping file
                csv.filterOutLines(mapFilePath, os.path.join(outDir, os.path.basename(mapFilePath)),
                                   allowedEntriesSet, entryModifyFunction=_mgSeqIdToTaxonId, colNum=0, sep='\t')

                # filter out entries from the fasta file (same path as the mapping file, extension ".fna")
                fastaFilePath = str(mapFilePath.rsplit('.', 1)[0] + '.fna')
                fas.filterOutSequences(fastaFilePath, os.path.join(outDir, os.path.basename(fastaFilePath)),
                                       allowedEntriesSet, seqNameModifyFunction=_mgSeqIdToTaxonId)
        else:
            assert False, 'Not supported action!'
    finally:
        # release the taxonomy even if a check or a masking step fails
        taxonomy.close()

    if verbose:
        print('Data masked done.')
def maskDb(action, inDir, outDir, rank, clades, taxonomyFilePath, verbose=False):
    """
        Main function (function interface), see module description.

        Collects the taxon ids to be masked: each given clade is lifted to the given rank, all its child
        clades are added, and the result is restricted to the taxon ids found in the input directory.
        Depending on the action, the taxon ids are then written to the file "exclude_list.txt" ('cl'),
        symbolic links are created to the reference files that are not masked ('mr'), or the marker gene
        mapping (*.tax) and the corresponding fasta (*.fna) files are filtered ('mg').

        @param action: one action that will be performed [cl, mr, mg] ~ (generate list, mask seq, mask mg)
        @type action: str
        @param inDir: directory containing input files
        @type inDir: str
        @param outDir: directory containing output files
        @type outDir: str
        @param rank: the data will be excluded at this rank
        @type rank: str
        @param clades: a file containing clades that will be masked (one ncbi taxon id at a line),
            or a set of ncbi taxon ids that will be masked
        @type clades: str (file path) or set of int
        @param taxonomyFilePath: taxonomy database file in the sqlite3 format
        @type taxonomyFilePath: str
        @param verbose: whether progress messages are printed
        @type verbose: bool

        @raise AssertionError: if an input parameter is not valid or a taxon id is not contained in the taxonomy
    """
    # check input parameters
    assert action in ['cl', 'mr', 'mg'], str('Given action is not supported: ' + action)
    if action == 'mr':
        assert os.name == 'posix', 'Symbolic links can be created only on posix systems, action "mr" is not valid!'
    for dirPath in [inDir, outDir]:
        assert os.path.isdir(dirPath), str("Directory doesn't exists: " + dirPath)
    assert rank in _RANKS, str('Not supported rank: ' + rank)
    assert os.path.isfile(taxonomyFilePath), str("Taxonomy database file doesn't exist: " + taxonomyFilePath)
    assert isinstance(clades, set) or (isinstance(clades, str) and os.path.isfile(clades)), str(
        "Parameter 'clades' can be either a file or a set of ncbi taxonIds to be excluded.")

    # maps a rank to a lower rank (i.e. to the next entry of _RANKS)
    toLowerRank = dict((_RANKS[i - 1], _RANKS[i]) for i in range(1, len(_RANKS)))

    # glob patterns of the input files
    inDirNorm = os.path.normpath(inDir)
    # intended for *.fas or *.fna (the character classes also match *.faa and *.fns)
    fastaPattern = os.path.join(inDirNorm, r'*.f[na][as]')
    taxPattern = os.path.join(inDirNorm, r'*.tax')

    taxonomy = _TaxonomyWrapMD(taxonomyFilePath)
    try:
        # leaf clades to mask
        if isinstance(clades, set):
            inCladesSet = set(map(int, clades))
        else:
            inCladesSet = set(map(int, csv.getColumnAsList(clades)))

        # clades in the reference
        refCladesSet = set()
        if action in ['cl', 'mr']:
            # get the list of all taxon ids that appear in the directory (as PPS reference)
            for fastaFilePath in glob.glob(fastaPattern):
                refCladesSet.add(_refFilePathToTaxonId(fastaFilePath))  # taxonId.1.fna or taxonId.1.fas
        elif action in ['mg']:
            # get the list of all taxon ids that appear in any file in the input directory as taxonomy ".tax"
            for mapFilePath in glob.glob(taxPattern):
                refCladesSet.update(set(map(_mgSeqIdToTaxonId, csv.getColumnAsList(mapFilePath, sep='\t'))))
        else:
            assert False, str('Not supported action: ' + action)

        # checks whether taxonIds are in the taxonomy
        for taxonId in inCladesSet:
            assert taxonomy.exists(taxonId), str(
                'taxonId: %s from clades list is not contained in the taxonomy!' % taxonId)
        for taxonId in refCladesSet:
            assert taxonomy.exists(taxonId), str(
                'taxonId: %s from the reference is not contained in the taxonomy!' % taxonId)

        # checks whether the taxonIds are leafs (doesn't have to be (unless you want to mask at the strain level))
        for taxonId in inCladesSet:
            if not taxonomy.isLeaf(taxonId):
                print('Taxon id %s does not represent a leaf clade in the taxonomy.' % taxonId)

        if verbose:
            print('Initial checks done.')

        # taxonIds that should be excluded
        toExcludeSet = set()
        for taxonId in inCladesSet:
            taxonIdAtRank = taxonomy.getTaxonIdAtRank(taxonId, rank)
            if taxonIdAtRank is None:  # the lineage is not defined at this rank ! try a lower rank !
                print('Taxon id: "%s" is not defined at rank: "%s"' % (taxonId, rank))
                currentRank = rank  # find a lower rank at which it's defined
                while currentRank in toLowerRank:
                    currentRank = toLowerRank[currentRank]
                    taxonIdAtRank = taxonomy.getTaxonIdAtRank(taxonId, currentRank)
                    if taxonIdAtRank is not None:
                        break
                if taxonIdAtRank is None:
                    # not defined at any lower rank either, mask the taxon itself
                    taxonIdAtRank = taxonId
                    currentRank = _STRAIN
                print('Taxon id: %s will be masked at rank: %s' % (taxonId, currentRank))

            # all child clades (and itself)
            toExcludeSet.add(int(taxonIdAtRank))
            toExcludeSet.update(set(map(int, taxonomy.getAllChildren(taxonIdAtRank))))

        # all clades that should be excluded (there is at least one sequence for each taxonId in the reference)
        toExcludeSet.intersection_update(refCladesSet)

        if verbose:
            print('Data to mask collected done.')
            print('To exclude: ', len(toExcludeSet))

        # exclude data from the reference
        if action == 'cl':
            # generates a list of taxonIds
            out = csv.OutFileBuffer(os.path.join(outDir, 'exclude_list.txt'))
            try:
                for taxonId in toExcludeSet:
                    out.writeText(str(taxonId) + '\n')
            finally:
                # close the output buffer even if writing fails
                out.close()

        elif action == 'mr':
            # masked reference sequences (create sim links to files that were not excluded);
            # that the system is posix was asserted in the parameter checks above
            for fastaFilePath in glob.glob(fastaPattern):
                taxonId = _refFilePathToTaxonId(fastaFilePath)  # taxonId.1.fna or taxonId.1.fas
                if taxonId not in toExcludeSet:
                    os.symlink(fastaFilePath, os.path.join(outDir, os.path.basename(fastaFilePath)))

        elif action == 'mg':
            # exclude sequences from the marker gene databases
            for mapFilePath in glob.glob(taxPattern):

                # get entries that can stay in the mapping and fasta files
                allowedEntriesSet = set(map(_mgSeqIdToTaxonId, csv.getColumnAsList(mapFilePath, sep='\t')))
                allowedEntriesSet.difference_update(toExcludeSet)

                # filter out entries from the mapping file
                csv.filterOutLines(mapFilePath, os.path.join(outDir, os.path.basename(mapFilePath)),
                                   allowedEntriesSet, entryModifyFunction=_mgSeqIdToTaxonId, colNum=0, sep='\t')

                # filter out entries from the fasta file (same path as the mapping file, extension ".fna")
                fastaFilePath = str(mapFilePath.rsplit('.', 1)[0] + '.fna')
                fas.filterOutSequences(fastaFilePath, os.path.join(outDir, os.path.basename(fastaFilePath)),
                                       allowedEntriesSet, seqNameModifyFunction=_mgSeqIdToTaxonId)
        else:
            assert False, 'Not supported action!'
    finally:
        # release the taxonomy even if a check or a masking step fails
        taxonomy.close()

    if verbose:
        print('Data masked done.')