def ppsOut2ppOut(inFile, outFile, taxonomicRanks, databaseFile):
    """
        Transforms a PPS output file into a file in the PP format.

        Every output line consists of the sequence name, the literal 'root' and one tab separated column
        per entry of taxonomicRanks. The 'root' column is left empty when getPathToRoot returns None for
        the taxon id. A rank column is left empty when the returned path has no entry for that rank or the
        entry reports isCopy().

        The taxonomy and the output buffer are closed even if the conversion fails part way.

        @param inFile: input file in the PPS format (first column: seq name, last column: ncbi taxon id)
        @param outFile: output file in the PP format
        @param taxonomicRanks: taxonomic ranks (starting from superkingdom)
        @param databaseFile: database file in the sqlite3 format
    """
    taxonomy = Taxonomy(databaseFile, taxonomicRanks)
    try:
        outBuff = csv.OutFileBuffer(outFile)
        try:
            namesList = csv.getColumnAsList(inFile, entryModifyFunction=None, colNum=0, sep='\t', comment='#')
            valCol = 1
            ncbidsList = csv.getColumnAsList(inFile, entryModifyFunction=None, colNum=valCol, sep='\t',
                                             comment='#')

            # Find the last column: keep moving right while the probed column has as many entries as the
            # name column. The probing is skipped when there are no names at all, since an empty probe
            # result would then compare equal on every pass and the loop would never end.
            while namesList:  # this is not efficient! (the input file is read again for every column)
                valCol += 1
                tmpList = csv.getColumnAsList(inFile, entryModifyFunction=None, colNum=valCol, sep='\t',
                                              comment='#')
                if len(tmpList) != len(namesList):
                    break
                ncbidsList = tmpList

            columnNames = '\t'.join(['#ID', 'root'] + list(taxonomicRanks))
            outBuff.writeText(str('#PPS file transformed to PP format, input file: ' + str(inFile) + '\n'
                                  + columnNames + '\n'))

            for i, name in enumerate(namesList):
                taxPathDict = taxonomy.getPathToRoot(int(ncbidsList[i]))
                hasPath = taxPathDict is not None
                fields = [str(name), 'root' if hasPath else '']
                for rank in taxonomicRanks:
                    if hasPath and (rank in taxPathDict) and (not taxPathDict[rank].isCopy()):
                        fields.append(taxPathDict[rank].name)
                    else:
                        fields.append('')
                outBuff.writeText(str('\t'.join(fields) + '\n'))
        finally:
            outBuff.close()
    finally:
        taxonomy.close()
def main01():
    """
        Developer entry point: converts one hard-coded PPS output file into the PP format.

        ppsOut2ppOut takes (inFile, outFile, taxonomicRanks, databaseFile) and opens and closes its own
        Taxonomy, so the rank list and the path of the database file are handed over in that order and no
        Taxonomy is created here.
    """
    #config = Config(open(os.path.normpath('/Users/ivan/Documents/work/binning/tests/CowRumen/01/config.cfg')), 'pPPS')
    #config = Config(open(os.path.normpath('/net/metagenomics/projects/PPSmg/tests/V35/config.cfg')), 'pPPS')
    #configMl = Config2(config, 'MLTreeMap')
    #configPPS = Config2(config, 'PPS')

    #read sequences
    #sequences = Sequences(config)

    #write ids file
    #sequences.writeSequences(config.get('inputIdsFastaFile'))

    #taxonomy = Taxonomy(config.get('databaseFile'), config.get('taxonomicRanks').split(','))
    taxonomicRanks = 'superkingdom,phylum,class,order,family,genus,species'.split(',')
    databaseFile = '/Users/ivan/Documents/work/binning/taxonomy/20120828/ncbitax_sqlite.db'

    # NOTE: the disabled calls below pass a Taxonomy object as the third argument, which does not match
    # the signature of ppsOut2ppOut; adjust the arguments before enabling any of them.
    #ppsOut2ppOut('D:\\VM\\tmp\\simMC_AMD\\AMD.Arachne.genus', 'D:\\VM\\tmp\\simMC_AMD\\AMD.Arachne.genus.PP.out', taxonomy, config.get('taxonomicRanks').split(','))
    #ppsOut2ppOut('/Users/ivan/Documents/work/binning/data/CowRumen/cowRumenOrderNcbids.txt',
    #             '/Users/ivan/Documents/work/binning/data/CowRumen/cowRumenOrderNcbids.PP.txt', taxonomy, config.get('taxonomicRanks').split(','))
    #ppsOut2ppOut('/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000LabelsSpecies.txt',
    #             '/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000LabelsSpecies.PP.txt', taxonomy, config.get('taxonomicRanks').split(','))

    ppsOut2ppOut('/Users/ivan/Documents/work/binning/data/simMC/fromJohannes/contigs.genus.tax',
                 '/Users/ivan/Documents/work/binning/data/simMC/fromJohannes/contigs.genus.PP.tax',
                 taxonomicRanks, databaseFile)

    #readPPSOutput(sequences, taxonomy, config.get('inputIdsFastaFile'))
    #sequences.writePlacements(str(config.get('inputIdsFastaFile') + '.pOUT'), config.get('taxonomicRanks').split(','))
    #toRealNames(config, sequences)
def ppsOut2ppOut(inFile, outFile, taxonomicRanks, databaseFile):
    """
        Transforms a PPS output file into a file in the PP format.

        Every output line consists of the sequence name, the literal 'root' and one tab separated column
        per entry of taxonomicRanks. The 'root' column is left empty when getPathToRoot returns None for
        the taxon id. A rank column is left empty when the returned path has no entry for that rank or the
        entry reports isCopy().

        The taxonomy and the output buffer are closed even if the conversion fails part way.

        @param inFile: input file in the PPS format (first column: seq name, last column: ncbi taxon id)
        @param outFile: output file in the PP format
        @param taxonomicRanks: taxonomic ranks (starting from superkingdom)
        @param databaseFile: database file in the sqlite3 format
    """
    taxonomy = Taxonomy(databaseFile, taxonomicRanks)
    try:
        outBuff = csv.OutFileBuffer(outFile)
        try:
            namesList = csv.getColumnAsList(inFile, entryModifyFunction=None, colNum=0, sep='\t', comment='#')
            valCol = 1
            ncbidsList = csv.getColumnAsList(inFile, entryModifyFunction=None, colNum=valCol, sep='\t',
                                             comment='#')

            # Find the last column: keep moving right while the probed column has as many entries as the
            # name column. The probing is skipped when there are no names at all, since an empty probe
            # result would then compare equal on every pass and the loop would never end.
            while namesList:  # this is not efficient! (the input file is read again for every column)
                valCol += 1
                tmpList = csv.getColumnAsList(inFile, entryModifyFunction=None, colNum=valCol, sep='\t',
                                              comment='#')
                if len(tmpList) != len(namesList):
                    break
                ncbidsList = tmpList

            columnNames = '\t'.join(['#ID', 'root'] + list(taxonomicRanks))
            outBuff.writeText(str('#PPS file transformed to PP format, input file: ' + str(inFile) + '\n'
                                  + columnNames + '\n'))

            for i, name in enumerate(namesList):
                taxPathDict = taxonomy.getPathToRoot(int(ncbidsList[i]))
                hasPath = taxPathDict is not None
                fields = [str(name), 'root' if hasPath else '']
                for rank in taxonomicRanks:
                    if hasPath and (rank in taxPathDict) and (not taxPathDict[rank].isCopy()):
                        fields.append(taxPathDict[rank].name)
                    else:
                        fields.append('')
                outBuff.writeText(str('\t'.join(fields) + '\n'))
        finally:
            outBuff.close()
    finally:
        taxonomy.close()
def main01():
    """
        Developer entry point: converts one hard-coded PPS output file into the PP format.

        ppsOut2ppOut takes (inFile, outFile, taxonomicRanks, databaseFile) and opens and closes its own
        Taxonomy, so the rank list and the path of the database file are handed over in that order and no
        Taxonomy is created here.
    """
    #config = Config(open(os.path.normpath('/Users/ivan/Documents/work/binning/tests/CowRumen/01/config.cfg')), 'pPPS')
    #config = Config(open(os.path.normpath('/net/metagenomics/projects/PPSmg/tests/V35/config.cfg')), 'pPPS')
    #configMl = Config2(config, 'MLTreeMap')
    #configPPS = Config2(config, 'PPS')

    #read sequences
    #sequences = Sequences(config)

    #write ids file
    #sequences.writeSequences(config.get('inputIdsFastaFile'))

    #taxonomy = Taxonomy(config.get('databaseFile'), config.get('taxonomicRanks').split(','))
    taxonomicRanks = 'superkingdom,phylum,class,order,family,genus,species'.split(',')
    databaseFile = '/Users/ivan/Documents/work/binning/taxonomy/20120828/ncbitax_sqlite.db'

    # NOTE: the disabled calls below pass a Taxonomy object as the third argument, which does not match
    # the signature of ppsOut2ppOut; adjust the arguments before enabling any of them.
    #ppsOut2ppOut('D:\\VM\\tmp\\simMC_AMD\\AMD.Arachne.genus', 'D:\\VM\\tmp\\simMC_AMD\\AMD.Arachne.genus.PP.out', taxonomy, config.get('taxonomicRanks').split(','))
    #ppsOut2ppOut('/Users/ivan/Documents/work/binning/data/CowRumen/cowRumenOrderNcbids.txt',
    #             '/Users/ivan/Documents/work/binning/data/CowRumen/cowRumenOrderNcbids.PP.txt', taxonomy, config.get('taxonomicRanks').split(','))
    #ppsOut2ppOut('/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000LabelsSpecies.txt',
    #             '/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000LabelsSpecies.PP.txt', taxonomy, config.get('taxonomicRanks').split(','))

    ppsOut2ppOut('/Users/ivan/Documents/work/binning/data/simMC/fromJohannes/contigs.genus.tax',
                 '/Users/ivan/Documents/work/binning/data/simMC/fromJohannes/contigs.genus.PP.tax',
                 taxonomicRanks, databaseFile)

    #readPPSOutput(sequences, taxonomy, config.get('inputIdsFastaFile'))
    #sequences.writePlacements(str(config.get('inputIdsFastaFile') + '.pOUT'), config.get('taxonomicRanks').split(','))
    #toRealNames(config, sequences)
def test():
    """
        Runs createGeneDb for one marker gene using hard-coded local paths.

        Gene names are checked (relaxGeneNames is False), no record is skipped and the run does not stop
        at the first error.
    """
    geneName = 'rpsC'  # other marker gene names noted here: 'rpsI', 'rpsS', 'rpsK'
    annotationPath = os.path.normpath('D:/A_Phylo/A_Metagenomic/data/markerGenes/annotation')
    resultPath = os.path.normpath('D:/A_Phylo/A_Metagenomic/data/markerGenes/mGenesExtracted')

    dbPath = os.path.normpath('D:/A_Phylo/A_Metagenomic/data/ncbiTaxonomy20111007/ncbitax_sqlite.db')
    ranks = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
    tax = Taxonomy(dbPath, ranks)

    # positional arguments after the taxonomy: relaxGeneNames, recSkipCount, firstErrorStop
    createGeneDb(geneName, annotationPath, resultPath, tax, False, 0, False)
def main():
    """
        Command line entry point: parses the arguments and runs createGeneDb for one marker gene.

        Required options: -m (marker gene name), -a (annotation directory), -o (output directory) and
        -t (taxonomy database file). Optional: -r (do not check the gene names), -s (number of records
        skipped at the beginning, 0 when not given) and -p (stop after the first error).
    """
    # (flags, keyword arguments) of every option, in the order in which they are registered
    optionSpecs = (
        (('-m', '--marker-gene-name'),
         dict(action='store', nargs=1, required=True, dest='marker',
              help='The name of a specific marker gene.')),
        (('-a', '--annotation-dir'),
         dict(action='store', nargs=1, required=True, dest='annotationDir',
              help='The name of the directory that contains annotation files.')),
        (('-o', '--output-dir'),
         dict(action='store', nargs=1, required=True, dest='outDir',
              help='The name of the directory where the output files will be stored.')),
        (('-t', '--taxonomyDb'),
         dict(action='store', nargs=1, required=True, dest='taxonomyDb',
              help='Taxonomy database file (SQLite).')),
        (('-r', '--relax-gene-names'),
         dict(action='store_true', dest='relaxGeneNames',
              help='If enabled, the script doesn`t control if the gene names are correct.')),
        (('-s', '--rec-skip'),
         dict(action='store', nargs=1, dest='recSkip',
              help='The number of records that will be skipped at the beginning of the annotation file.')),
        (('-p', '--print-first-error'),
         dict(action='store_true', dest='firstErrorStop',
              help='The script stops after first error occurs')),
    )

    argParser = argparse.ArgumentParser(description='''Creates database for a marker gene''', epilog=''' ''')
    for flags, settings in optionSpecs:
        argParser.add_argument(*flags, **settings)
    parsed = argParser.parse_args()

    # options declared with nargs=1 are delivered as one-element lists
    geneName = str(parsed.marker[0])
    annotationPath = os.path.normpath(str(parsed.annotationDir[0]))
    resultPath = os.path.normpath(str(parsed.outDir[0]))
    tax = Taxonomy(os.path.normpath(str(parsed.taxonomyDb[0])),
                   ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'])

    skipCount = int(parsed.recSkip[0]) if parsed.recSkip else 0
    stopAtFirstError = bool(parsed.firstErrorStop)
    relaxNames = bool(parsed.relaxGeneNames)

    createGeneDb(geneName, annotationPath, resultPath, tax, relaxNames, skipCount, stopAtFirstError)
entry += str('\t' + taxPathDict[rank].name) else: entry += '\t' f.write(entry) except Exception: print "Cannot create a file or write to it:", outFile raise finally: f.close() if __name__ == "__main__": #test 2 #ppsOutFile = 'D:\A_Phylo\A_Metagenomic\data\humanGut\PPS_contigs.txt' #outPPOutFile = 'D:\A_Phylo\A_Metagenomic\data\humanGut\PPS_PP_contigs.txt' #ppsOutFile = 'C:/Documents and Settings/Administrator/Desktop/temp/johdroPred/inputTW.fas.ids04.lP' ppsOutFile = 'C:/Documents and Settings/Administrator/Desktop/temp/johdroPred/inputTW.fas.ids05.lP' #outPPOutFile = 'C:/Documents and Settings/Administrator/Desktop/temp/johdroPred/inputTW.fas.ids04.lP.PP.out' outPPOutFile = 'C:/Documents and Settings/Administrator/Desktop/temp/johdroPred/inputTW.fas.ids05.lP.PP.out' config = Config(open(os.path.normpath('D:\\A_Phylo\\A_Metagenomic\\pPPS\\workspace\\pPPS\\config01.cfg')), 'pPPS') databaseFile = os.path.normpath(config.get('databaseFile')) taxonomicRanks = config.get('taxonomicRanks').split(',') taxonomy = Taxonomy(databaseFile, taxonomicRanks) ppsOutToPPOut(ppsOutFile, outPPOutFile, taxonomicRanks, taxonomy) #test 1 #scafContigFile = 'D:/A_Phylo/A_Metagenomic/reindeer/data/scaffolds-contigs.tab' #scafPPSOutFile = 'D:/A_Phylo/A_Metagenomic/reindeer/predictions/pps04/scaffoldsOut/SRM_Scaffolds_namesOnly.fna.PP.out' #contigPPSOutFile = 'D:/A_Phylo/A_Metagenomic/reindeer/predictions/pps04/scaffoldsOut/SRM_Scaffolds_namesOnly.fna.PP.out_contigs' #scafToContigOutput(scafContigFile, scafPPSOutFile, contigPPSOutFile)