def _run_lib(inputArgs):
    """Handle the LIB subcommand: append taxon ids to reference sequence headers."""
    # 'X' is what the db user/password are compared against below; presumably
    # it is the parser's "not supplied" placeholder -- verify against the
    # argument definitions.
    NAs = 'X'
    if inputArgs.lib_dbuser != NAs and inputArgs.lib_dbpasswd == NAs:
        print('if you want to use mysql, make sure that you install pathoDB and '
              'also specify the corresponding mysql password correctly '
              '(Ask your mysql admin to access the database).')
    mysql_conf = (inputArgs.lib_dbhost, inputArgs.lib_dbport,
                  inputArgs.lib_dbuser, inputArgs.lib_dbpasswd,
                  inputArgs.lib_db)
    taxon_ids = pathoLib.parse_input_app_build_nt_tgt(inputArgs.lib_taxon_ids)
    exclude_taxon_ids = pathoLib.parse_input_app_build_nt_tgt(
        inputArgs.lib_exclude_taxon_ids)
    # The return value was only ever unpacked into unused locals, so it is
    # not bound here.
    pathoLib.append_ti_into_fasta_app(
        inputArgs.lib_reference, taxon_ids, exclude_taxon_ids,
        inputArgs.lib_subtax, mysql_conf, not inputArgs.lib_nodesc,
        inputArgs.lib_online_search, inputArgs.lib_outprefix,
        inputArgs.lib_outdir)


def _run_map(inputArgs):
    """Handle the MAP subcommand: fill a PathoMapOptions and run PathoMapA."""
    pathoMapOptions = PathoMapA.PathoMapOptions()
    pathoMapOptions.verbose = inputArgs.verbose
    pathoMapOptions.outDir = inputArgs.map_outdir
    pathoMapOptions.indexDir = inputArgs.map_indexdir
    pathoMapOptions.outAlignFile = inputArgs.map_outalign
    pathoMapOptions.inReadFile = inputArgs.map_inputread
    pathoMapOptions.inReadFilePair1 = inputArgs.map_inputread1
    pathoMapOptions.inReadFilePair2 = inputArgs.map_inputread2
    pathoMapOptions.targetAlignParameters = inputArgs.map_targetalignparams
    pathoMapOptions.filterAlignParameters = inputArgs.map_filteralignparams
    # Comma-separated arguments: the option keeps its PathoMapOptions default
    # unless a non-empty value was supplied. The truthiness test also
    # tolerates None, where len() would have raised.
    comma_separated = (
        ('targetRefFiles', inputArgs.map_targetref),
        ('filterRefFiles', inputArgs.map_filterref),
        ('targetIndexPrefixes', inputArgs.map_targetindex),
        ('filterIndexPrefixes', inputArgs.map_filterindex),
        ('targetAlignFiles', inputArgs.map_targetalign),
        ('filterAlignFiles', inputArgs.map_filteralign),
    )
    for option_name, raw_value in comma_separated:
        if raw_value:
            setattr(pathoMapOptions, option_name, raw_value.split(","))
    pathoMapOptions.btHome = inputArgs.map_bthome
    pathoMapOptions.numThreads = inputArgs.map_numthreads
    pathoMapOptions.exp_tag = inputArgs.map_exp_tag + "-"
    PathoMapA.processPathoMap(pathoMapOptions)


def _run_id(inputArgs):
    """Handle the ID subcommand: fill a PathoIdOptions and run the reassignment."""
    pathoIdOptions = PathoID.PathoIdOptions(inputArgs.id_ali_file)
    pathoIdOptions.ali_format = inputArgs.id_ali_format
    pathoIdOptions.verbose = inputArgs.verbose
    pathoIdOptions.out_matrix_flag = inputArgs.id_out_matrix
    pathoIdOptions.score_cutoff = inputArgs.id_score_cutoff
    pathoIdOptions.exp_tag = inputArgs.id_exp_tag
    pathoIdOptions.outdir = inputArgs.id_outdir
    pathoIdOptions.emEpsilon = inputArgs.id_emEpsilon
    pathoIdOptions.maxIter = inputArgs.id_maxIter
    pathoIdOptions.piPrior = inputArgs.id_piPrior
    pathoIdOptions.thetaPrior = inputArgs.id_thetaPrior
    pathoIdOptions.noalign = inputArgs.id_noalign
    pathoIdOptions.noCutOff = inputArgs.id_nocutoff
    # The return value of pathoscope_reassign is not used by the CLI.
    PathoID.pathoscope_reassign(pathoIdOptions)


def _run_rep(inputArgs):
    """Handle the REP subcommand: fill a PathoReportOptions and run PathoReportA."""
    pathoReportOptions = PathoReportA.PathoReportOptions(inputArgs.rep_ali_file)
    pathoReportOptions.verbose = inputArgs.verbose
    pathoReportOptions.contigFlag = inputArgs.rep_contig_flag
    pathoReportOptions.outDir = inputArgs.rep_outdir
    pathoReportOptions.samtoolsHome = inputArgs.rep_samtoolshome
    pathoReportOptions.noCutOff = inputArgs.rep_nocutoff
    pathoReportOptions.mysqlConf = (
        inputArgs.rep_dbhost, inputArgs.rep_dbport, inputArgs.rep_dbuser,
        inputArgs.rep_dbpasswd, inputArgs.rep_db)
    PathoReportA.processPathoReport(pathoReportOptions)


def _run_qc(inputArgs):
    """Handle the QC subcommand: forward the remaining CLI arguments to pathoqc.py."""
    import subprocess

    # Everything after the subcommand name is passed through untouched.
    qcargs = sys.argv[2:]
    pathoqcdir = os.path.join(pathoscopedir, 'pathoscope', 'pathoqc')
    pathoqcfile = os.path.join(pathoqcdir, 'pathoqc.py')
    if os.path.exists(pathoqcfile):
        cmd = [sys.executable, pathoqcfile] + qcargs
        print(" ".join(cmd))
        # An argument list (no shell) keeps arguments and paths containing
        # spaces or shell metacharacters intact; a joined string handed to
        # os.system would be re-parsed by the shell.
        subprocess.call(cmd)
    else:
        print("PathoQC (" + pathoqcfile
              + ") not found. Please download pathoqc_vXXX.tar.gz and "
              "install it (" + pathoqcdir
              + ") from http://sourceforge.net/projects/pathoscope/")


def main():
    """Parse the command line and run the selected subcommand.

    Reads arguments from the module-level ``parser``, dispatches on
    ``inputArgs.subcommand`` ('LIB', 'MAP', 'ID', 'REP' or 'QC'; any other
    value runs nothing) and, when ``inputArgs.verbose`` is set, prints the
    total elapsed time.
    """
    inputArgs = parser.parse_args()

    start = time()
    handlers = {
        'LIB': _run_lib,
        'MAP': _run_map,
        'ID': _run_id,
        'REP': _run_rep,
        'QC': _run_qc,
    }
    handler = handlers.get(inputArgs.subcommand)
    if handler is not None:
        handler(inputArgs)

    elapsed = time() - start
    if inputArgs.verbose:
        # Single-argument print(...) emits the same text under Python 2 and 3.
        print("Total Elapsed Time: %d" % elapsed)
def _write_ranked_guess(path, proportions, genomes):
    """Write (proportion, genome) rows to *path* as TSV, largest proportion first."""
    ranked = sorted(zip(proportions, genomes), reverse=True)
    # The csv module wants a binary handle on Python 2 and a text handle
    # opened with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        handle = open(path, 'w', newline='')
    else:
        handle = open(path, 'wb')
    # The context manager closes the file even if writing fails.
    with handle:
        csv.writer(handle, delimiter='\t').writerows(ranked)


def pathoscope_reassign(pathoIdOptions):
    """Run the EM read reassignment on an alignment file and write the report.

    Args:
        pathoIdOptions: object exposing the attributes read below
            (out_matrix_flag, verbose, score_cutoff, exp_tag, ali_format,
            ali_file, outdir, emEpsilon, maxIter, noalign, piPrior,
            thetaPrior, noCutOff).

    Returns:
        None when ``ali_format`` is not one of 'gnu-sam', 'sam' or 'bl8'.
        Otherwise a 13-tuple: the report path, eleven values returned by
        PathoReportA.write_tsv_report (its first value is placed after its
        fifth), and the path of the alignment file (rewritten unless
        ``noalign`` is set).

    Exits the process with status 1 when the alignment file is empty.
    """
    out_matrix = pathoIdOptions.out_matrix_flag
    verbose = pathoIdOptions.verbose
    scoreCutoff = pathoIdOptions.score_cutoff
    expTag = pathoIdOptions.exp_tag
    ali_format = pathoIdOptions.ali_format
    ali_file = pathoIdOptions.ali_file
    outdir = pathoIdOptions.outdir
    emEpsilon = pathoIdOptions.emEpsilon
    maxIter = pathoIdOptions.maxIter
    upalign = not pathoIdOptions.noalign
    piPrior = pathoIdOptions.piPrior
    thetaPrior = pathoIdOptions.thetaPrior
    noCutOff = pathoIdOptions.noCutOff

    if os.stat(ali_file).st_size == 0:
        print('the alignment file [%s] is empty.' % ali_file)
        sys.exit(1)

    # Numeric codes handed to conv_align2GRmat / rewrite_align.
    format_codes = {
        'gnu-sam': 0,
        'sam': 1,   # standard sam
        'bl8': 2,   # blat m8 format
    }
    if ali_format not in format_codes:
        print("unknown alignment format file...")
        # NOTE(review): callers that unpack the result get None here.
        return
    aliFormat = format_codes[ali_format]
    if verbose:
        print("parsing %s file/likelihood score/reads and mapped genomes..."
              % ali_format)

    (U, NU, genomes, reads) = conv_align2GRmat(ali_file, scoreCutoff, aliFormat)
    nG = len(genomes)
    nR = len(reads)
    if verbose:
        print("EM iteration...")
        print("(Genomes,Reads)=%dx%d" % (nG, nR))
        print("Delta Change:")

    if out_matrix:
        if verbose:
            print("writing initial alignment ...")
        out_initial_align_matrix(genomes, reads, U, NU, expTag, ali_file, outdir)

    # Best hits are computed before and after EM; pathoscope_em returns the
    # NU that the second call (and everything after it) uses.
    (bestHitInitialReads, bestHitInitial, level1Initial, level2Initial) = \
        PathoReportA.computeBestHit(U, NU, genomes, reads)

    (initPi, pi, _, NU) = pathoscope_em(U, NU, genomes, maxIter, emEpsilon,
                                        verbose, piPrior, thetaPrior)

    if out_matrix:
        _write_ranked_guess(outdir + os.sep + expTag + '-initGuess.txt',
                            initPi, genomes)

    (bestHitFinalReads, bestHitFinal, level1Final, level2Final) = \
        PathoReportA.computeBestHit(U, NU, genomes, reads)

    if out_matrix:
        _write_ranked_guess(outdir + os.sep + expTag + '-finGuess.txt',
                            pi, genomes)

    finalReport = outdir + os.sep + expTag + '-' + ali_format + '-report.tsv'
    header = ['Genome', 'Final Guess', 'Final Best Hit',
              'Final Best Hit Read Numbers',
              'Final High Confidence Hits', 'Final Low Confidence Hits',
              'Initial Guess',
              'Initial Best Hit', 'Initial Best Hit Read Numbers',
              'Initial High Confidence Hits', 'Initial Low Confidence Hits']
    (x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11) = PathoReportA.write_tsv_report(
        finalReport, nR, nG, pi, genomes, initPi, bestHitInitial,
        bestHitInitialReads, bestHitFinal, bestHitFinalReads, level1Initial,
        level2Initial, level1Final, level2Final, header, noCutOff)

    reAlignfile = ali_file
    if upalign:
        reAlignfile = rewrite_align(U, NU, ali_file, scoreCutoff, aliFormat, outdir)

    # x1 is deliberately kept in the position the original return used
    # (after x5) so existing callers see the same tuple layout.
    return (finalReport, x2, x3, x4, x5, x1, x6, x7, x8, x9, x10, x11, reAlignfile)
def _run_lib(inputArgs):
    """Handle the LIB subcommand: append taxon ids to reference sequence headers."""
    # 'X' is what the db user/password are compared against below; presumably
    # it is the parser's "not supplied" placeholder -- verify against the
    # argument definitions.
    NAs = 'X'
    if inputArgs.lib_dbuser != NAs and inputArgs.lib_dbpasswd == NAs:
        print('if you want to use mysql, make sure that you install pathoDB and '
              'also specify the corresponding mysql password correctly '
              '(Ask your mysql admin to access the database).')
    mysql_conf = (inputArgs.lib_dbhost, inputArgs.lib_dbport,
                  inputArgs.lib_dbuser, inputArgs.lib_dbpasswd,
                  inputArgs.lib_db)
    taxon_ids = pathoLib.parse_input_app_build_nt_tgt(inputArgs.lib_taxon_ids)
    exclude_taxon_ids = pathoLib.parse_input_app_build_nt_tgt(
        inputArgs.lib_exclude_taxon_ids)
    # The return value was only ever unpacked into unused locals, so it is
    # not bound here.
    pathoLib.append_ti_into_fasta_app(
        inputArgs.lib_reference, taxon_ids, exclude_taxon_ids,
        inputArgs.lib_subtax, mysql_conf, not inputArgs.lib_nodesc,
        inputArgs.lib_online_search, inputArgs.lib_outprefix,
        inputArgs.lib_outdir)


def _run_map(inputArgs):
    """Handle the MAP subcommand: fill a PathoMapOptions and run PathoMapA."""
    pathoMapOptions = PathoMapA.PathoMapOptions()
    pathoMapOptions.verbose = inputArgs.verbose
    pathoMapOptions.outDir = inputArgs.map_outdir
    pathoMapOptions.indexDir = inputArgs.map_indexdir
    pathoMapOptions.outAlignFile = inputArgs.map_outalign
    pathoMapOptions.inReadFile = inputArgs.map_inputread
    pathoMapOptions.inReadFilePair1 = inputArgs.map_inputread1
    pathoMapOptions.inReadFilePair2 = inputArgs.map_inputread2
    pathoMapOptions.targetAlignParameters = inputArgs.map_targetalignparams
    pathoMapOptions.filterAlignParameters = inputArgs.map_filteralignparams
    # Comma-separated arguments: the option keeps its PathoMapOptions default
    # unless a non-empty value was supplied. The truthiness test also
    # tolerates None, where len() would have raised.
    comma_separated = (
        ('targetRefFiles', inputArgs.map_targetref),
        ('filterRefFiles', inputArgs.map_filterref),
        ('targetIndexPrefixes', inputArgs.map_targetindex),
        ('filterIndexPrefixes', inputArgs.map_filterindex),
        ('targetAlignFiles', inputArgs.map_targetalign),
        ('filterAlignFiles', inputArgs.map_filteralign),
    )
    for option_name, raw_value in comma_separated:
        if raw_value:
            setattr(pathoMapOptions, option_name, raw_value.split(","))
    pathoMapOptions.btHome = inputArgs.map_bthome
    pathoMapOptions.numThreads = inputArgs.map_numthreads
    pathoMapOptions.exp_tag = inputArgs.map_exp_tag + "-"
    PathoMapA.processPathoMap(pathoMapOptions)


def _run_id(inputArgs):
    """Handle the ID subcommand: fill a PathoIdOptions and run the reassignment."""
    pathoIdOptions = PathoID.PathoIdOptions(inputArgs.id_ali_file)
    pathoIdOptions.ali_format = inputArgs.id_ali_format
    pathoIdOptions.verbose = inputArgs.verbose
    pathoIdOptions.out_matrix_flag = inputArgs.id_out_matrix
    pathoIdOptions.score_cutoff = inputArgs.id_score_cutoff
    pathoIdOptions.exp_tag = inputArgs.id_exp_tag
    pathoIdOptions.outdir = inputArgs.id_outdir
    pathoIdOptions.emEpsilon = inputArgs.id_emEpsilon
    pathoIdOptions.maxIter = inputArgs.id_maxIter
    pathoIdOptions.piPrior = inputArgs.id_piPrior
    pathoIdOptions.thetaPrior = inputArgs.id_thetaPrior
    pathoIdOptions.noalign = inputArgs.id_noalign
    pathoIdOptions.noCutOff = inputArgs.id_nocutoff
    # The return value of pathoscope_reassign is not used by the CLI.
    PathoID.pathoscope_reassign(pathoIdOptions)


def _run_rep(inputArgs):
    """Handle the REP subcommand: fill a PathoReportOptions and run PathoReportA."""
    pathoReportOptions = PathoReportA.PathoReportOptions(inputArgs.rep_ali_file)
    pathoReportOptions.verbose = inputArgs.verbose
    pathoReportOptions.contigFlag = inputArgs.rep_contig_flag
    pathoReportOptions.outDir = inputArgs.rep_outdir
    pathoReportOptions.samtoolsHome = inputArgs.rep_samtoolshome
    pathoReportOptions.noCutOff = inputArgs.rep_nocutoff
    pathoReportOptions.mysqlConf = (
        inputArgs.rep_dbhost, inputArgs.rep_dbport, inputArgs.rep_dbuser,
        inputArgs.rep_dbpasswd, inputArgs.rep_db)
    PathoReportA.processPathoReport(pathoReportOptions)


def _run_qc(inputArgs):
    """Handle the QC subcommand: forward the remaining CLI arguments to pathoqc.py."""
    import subprocess

    # Everything after the subcommand name is passed through untouched.
    qcargs = sys.argv[2:]
    pathoqcdir = os.path.join(pathoscopedir, 'pathoscope', 'pathoqc')
    pathoqcfile = os.path.join(pathoqcdir, 'pathoqc.py')
    if os.path.exists(pathoqcfile):
        cmd = [sys.executable, pathoqcfile] + qcargs
        print(" ".join(cmd))
        # An argument list (no shell) keeps arguments and paths containing
        # spaces or shell metacharacters intact; a joined string handed to
        # os.system would be re-parsed by the shell.
        subprocess.call(cmd)
    else:
        print("PathoQC (" + pathoqcfile
              + ") not found. Please download pathoqc_vXXX.tar.gz and "
              "install it (" + pathoqcdir
              + ") from http://sourceforge.net/projects/pathoscope/")


def main():
    """Parse the command line and run the selected subcommand.

    Reads arguments from the module-level ``parser``, dispatches on
    ``inputArgs.subcommand`` ('LIB', 'MAP', 'ID', 'REP' or 'QC'; any other
    value runs nothing) and, when ``inputArgs.verbose`` is set, prints the
    total elapsed time.
    """
    inputArgs = parser.parse_args()

    start = time()
    handlers = {
        'LIB': _run_lib,
        'MAP': _run_map,
        'ID': _run_id,
        'REP': _run_rep,
        'QC': _run_qc,
    }
    handler = handlers.get(inputArgs.subcommand)
    if handler is not None:
        handler(inputArgs)

    elapsed = time() - start
    if inputArgs.verbose:
        # Single-argument print(...) emits the same text under Python 2 and 3.
        print("Total Elapsed Time: %d" % elapsed)
def _write_ranked_guess(path, proportions, genomes):
    """Write (proportion, genome) rows to *path* as TSV, largest proportion first."""
    ranked = sorted(zip(proportions, genomes), reverse=True)
    # The csv module wants a binary handle on Python 2 and a text handle
    # opened with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        handle = open(path, 'w', newline='')
    else:
        handle = open(path, 'wb')
    # The context manager closes the file even if writing fails.
    with handle:
        csv.writer(handle, delimiter='\t').writerows(ranked)


def pathoscope_reassign(pathoIdOptions):
    """Run the EM read reassignment on an alignment file and write the report.

    Args:
        pathoIdOptions: object exposing the attributes read below
            (out_matrix_flag, verbose, score_cutoff, exp_tag, ali_format,
            ali_file, outdir, emEpsilon, maxIter, noalign, piPrior,
            thetaPrior, noCutOff).

    Returns:
        None when ``ali_format`` is not one of 'gnu-sam', 'sam' or 'bl8'.
        Otherwise a 13-tuple: the report path, eleven values returned by
        PathoReportA.write_tsv_report (its first value is placed after its
        fifth), and the path of the alignment file (rewritten unless
        ``noalign`` is set).

    Exits the process with status 1 when the alignment file is empty.
    """
    out_matrix = pathoIdOptions.out_matrix_flag
    verbose = pathoIdOptions.verbose
    scoreCutoff = pathoIdOptions.score_cutoff
    expTag = pathoIdOptions.exp_tag
    ali_format = pathoIdOptions.ali_format
    ali_file = pathoIdOptions.ali_file
    outdir = pathoIdOptions.outdir
    emEpsilon = pathoIdOptions.emEpsilon
    maxIter = pathoIdOptions.maxIter
    upalign = not pathoIdOptions.noalign
    piPrior = pathoIdOptions.piPrior
    thetaPrior = pathoIdOptions.thetaPrior
    noCutOff = pathoIdOptions.noCutOff

    if os.stat(ali_file).st_size == 0:
        print('the alignment file [%s] is empty.' % ali_file)
        sys.exit(1)

    # Numeric codes handed to conv_align2GRmat / rewrite_align.
    format_codes = {
        'gnu-sam': 0,
        'sam': 1,   # standard sam
        'bl8': 2,   # blat m8 format
    }
    if ali_format not in format_codes:
        print("unknown alignment format file...")
        # NOTE(review): callers that unpack the result get None here.
        return
    aliFormat = format_codes[ali_format]
    if verbose:
        print("parsing %s file/likelihood score/reads and mapped genomes..."
              % ali_format)

    (U, NU, genomes, reads) = conv_align2GRmat(ali_file, scoreCutoff, aliFormat)
    nG = len(genomes)
    nR = len(reads)
    if verbose:
        print("EM iteration...")
        print("(Genomes,Reads)=%dx%d" % (nG, nR))
        print("Delta Change:")

    if out_matrix:
        if verbose:
            print("writing initial alignment ...")
        out_initial_align_matrix(genomes, reads, U, NU, expTag, ali_file, outdir)

    # Best hits are computed before and after EM; pathoscope_em returns the
    # NU that the second call (and everything after it) uses.
    (bestHitInitialReads, bestHitInitial, level1Initial, level2Initial) = \
        PathoReportA.computeBestHit(U, NU, genomes, reads)

    (initPi, pi, _, NU) = pathoscope_em(U, NU, genomes, maxIter, emEpsilon,
                                        verbose, piPrior, thetaPrior)

    if out_matrix:
        _write_ranked_guess(outdir + os.sep + expTag + '-initGuess.txt',
                            initPi, genomes)

    (bestHitFinalReads, bestHitFinal, level1Final, level2Final) = \
        PathoReportA.computeBestHit(U, NU, genomes, reads)

    if out_matrix:
        _write_ranked_guess(outdir + os.sep + expTag + '-finGuess.txt',
                            pi, genomes)

    finalReport = outdir + os.sep + expTag + '-' + ali_format + '-report.tsv'
    header = ['Genome', 'Final Guess', 'Final Best Hit',
              'Final Best Hit Read Numbers',
              'Final High Confidence Hits', 'Final Low Confidence Hits',
              'Initial Guess',
              'Initial Best Hit', 'Initial Best Hit Read Numbers',
              'Initial High Confidence Hits', 'Initial Low Confidence Hits']
    (x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11) = PathoReportA.write_tsv_report(
        finalReport, nR, nG, pi, genomes, initPi, bestHitInitial,
        bestHitInitialReads, bestHitFinal, bestHitFinalReads, level1Initial,
        level2Initial, level1Final, level2Final, header, noCutOff)

    reAlignfile = ali_file
    if upalign:
        reAlignfile = rewrite_align(U, NU, ali_file, scoreCutoff, aliFormat, outdir)

    # x1 is deliberately kept in the position the original return used
    # (after x5) so existing callers see the same tuple layout.
    return (finalReport, x2, x3, x4, x5, x1, x6, x7, x8, x9, x10, x11, reAlignfile)
# Fragment of the subcommand dispatch. It copies the id_* values (plus verbose)
# from inputArgs onto a PathoID.PathoIdOptions and passes it to
# PathoID.pathoscope_reassign; when the subcommand is 'REP' it fills a
# PathoReportA.PathoReportOptions (including a mysqlConf tuple of host, port,
# user, password, db) and calls PathoReportA.processPathoReport; finally it
# computes elapsed = time() - start and prints it when inputArgs.verbose is set.
# NOTE(review): this span starts mid-definition -- the enclosing def and,
# presumably, an `if inputArgs.subcommand == 'ID':` header are not visible
# here, and `inputArgs` / `start` are bound outside it. Confirm the intended
# nesting before reformatting.
pathoIdOptions = PathoID.PathoIdOptions(inputArgs.id_ali_file) pathoIdOptions.ali_format = inputArgs.id_ali_format pathoIdOptions.verbose = inputArgs.verbose pathoIdOptions.out_matrix_flag = inputArgs.id_out_matrix pathoIdOptions.score_cutoff = inputArgs.id_score_cutoff pathoIdOptions.exp_tag = inputArgs.id_exp_tag pathoIdOptions.outdir = inputArgs.id_outdir pathoIdOptions.emEpsilon = inputArgs.id_emEpsilon pathoIdOptions.maxIter = inputArgs.id_maxIter pathoIdOptions.piPrior = inputArgs.id_piPrior pathoIdOptions.thetaPrior = inputArgs.id_thetaPrior pathoIdOptions.noalign = inputArgs.id_noalign PathoID.pathoscope_reassign(pathoIdOptions) if (inputArgs.subcommand == 'REP'): pathoReportOptions = PathoReportA.PathoReportOptions( inputArgs.rep_ali_file) pathoReportOptions.verbose = inputArgs.verbose pathoReportOptions.contigFlag = inputArgs.rep_contig_flag pathoReportOptions.outDir = inputArgs.rep_outdir pathoReportOptions.samtoolsHome = inputArgs.rep_samtoolshome mysqlConf = (inputArgs.rep_dbhost, inputArgs.rep_dbport, inputArgs.rep_dbuser, inputArgs.rep_dbpasswd, inputArgs.rep_db) pathoReportOptions.mysqlConf = mysqlConf PathoReportA.processPathoReport(pathoReportOptions) elapsed = time() - start if inputArgs.verbose: print "Total Elapsed Time: %d" % (elapsed)
# Subcommand dispatch for 'ID' and 'REP', followed by the elapsed-time report.
# `inputArgs` and `start` are bound before this span.
if inputArgs.subcommand == 'ID':
    pathoIdOptions = PathoID.PathoIdOptions(inputArgs.id_ali_file)
    # (option attribute, inputArgs attribute), applied in the original order.
    for option_attr, arg_attr in (
            ('ali_format', 'id_ali_format'),
            ('verbose', 'verbose'),
            ('out_matrix_flag', 'id_out_matrix'),
            ('score_cutoff', 'id_score_cutoff'),
            ('exp_tag', 'id_exp_tag'),
            ('outdir', 'id_outdir'),
            ('emEpsilon', 'id_emEpsilon'),
            ('maxIter', 'id_maxIter'),
            ('piPrior', 'id_piPrior'),
            ('thetaPrior', 'id_thetaPrior'),
            ('noalign', 'id_noalign')):
        setattr(pathoIdOptions, option_attr, getattr(inputArgs, arg_attr))
    PathoID.pathoscope_reassign(pathoIdOptions)

if inputArgs.subcommand == 'REP':
    pathoReportOptions = PathoReportA.PathoReportOptions(inputArgs.rep_ali_file)
    for option_attr, arg_attr in (
            ('verbose', 'verbose'),
            ('contigFlag', 'rep_contig_flag'),
            ('outDir', 'rep_outdir'),
            ('samtoolsHome', 'rep_samtoolshome')):
        setattr(pathoReportOptions, option_attr, getattr(inputArgs, arg_attr))
    # Connection settings travel as one (host, port, user, password, db) tuple.
    mysqlConf = tuple(
        getattr(inputArgs, arg_attr)
        for arg_attr in ('rep_dbhost', 'rep_dbport', 'rep_dbuser',
                         'rep_dbpasswd', 'rep_db'))
    pathoReportOptions.mysqlConf = mysqlConf
    PathoReportA.processPathoReport(pathoReportOptions)

elapsed = time() - start
if inputArgs.verbose:
    print("Total Elapsed Time: %d" % (elapsed))