def find_analyses(target, recordsToAnalyze, templateFastqFiles, complementFastqFiles, references, outputDir):
    """Queue an ``analyze`` child target for every selected read, then a ``merge`` follow-on.

    The template fastq files are scanned first, then the complement ones.  A
    read is selected when its name is a key of ``recordsToAnalyze``, whose
    value unpacks to ``(ref_name, ref_start, ref_stop)``; that slice of
    ``references[ref_name]`` is handed to ``analyze`` together with the read
    sequence and an output path inside the target's global temp dir.  The
    output paths, grouped under "template" and "complement", are passed to
    ``merge`` along with ``outputDir``.
    """
    files = {"template": [], "complement": []}
    # (key in `files`, progress message, temp-file prefix, fastq files), in scan order
    passes = (
        ("template", "Finding template analyses", "template_", templateFastqFiles),
        ("complement", "Finding complement analyses", "complement_", complementFastqFiles),
    )
    for key, message, prefix, fastq_files in passes:
        logger.info(message)
        for fastq_path in fastq_files:
            for name, seq, _qual in fastqRead(fastq_path):
                if name not in recordsToAnalyze:
                    continue
                outfile = os.path.join(target.getGlobalTempDir(), prefix + name)
                files[key].append(outfile)
                ref_name, ref_start, ref_stop = recordsToAnalyze[name]
                ref_seq = references[ref_name][ref_start:ref_stop]
                target.addChildTarget(
                    Target.makeTargetFn(analyze, args=[name, seq, ref_name, ref_seq, outfile]))
    target.setFollowOnTargetFn(merge, args=(files, outputDir))
def find_analyses(target, recordsToAnalyze, templateFastqFiles, complementFastqFiles, references, outputDir):
    """Find the sequences of the records to analyze and create one alignment target per read.

    Reads whose name is a key of ``recordsToAnalyze`` get an ``analyze`` child
    target; the value stored under that key gives the reference name and the
    start/stop used to slice ``references``.  Template files are processed
    before complement files, and ``merge`` is set as the follow-on with the
    collected output paths and ``outputDir``.
    """
    files = {"template": [], "complement": []}

    def queue_reads(key, prefix, fastq_paths):
        # Shared scan: `key` selects the list in `files`, `prefix` starts the temp file name.
        for fastq_path in fastq_paths:
            for name, seq, _qual in fastqRead(fastq_path):
                if name in recordsToAnalyze:
                    outfile = os.path.join(target.getGlobalTempDir(), prefix + name)
                    files[key].append(outfile)
                    ref_name, ref_start, ref_stop = recordsToAnalyze[name]
                    ref_seq = references[ref_name][ref_start:ref_stop]
                    job = Target.makeTargetFn(
                        analyze, args=[name, seq, ref_name, ref_seq, outfile])
                    target.addChildTarget(job)

    logger.info("Finding template analyses")
    queue_reads("template", "template_", templateFastqFiles)
    logger.info("Finding complement analyses")
    queue_reads("complement", "complement_", complementFastqFiles)
    target.setFollowOnTargetFn(merge, args=(files, outputDir))
def main():
    """Command-line entry point: gather inputs from a working directory and run the jobTree.

    Exactly one positional argument is expected, the working directory.  Read
    files are taken from ``<workingDir>/readFastqFiles`` and references from
    ``<workingDir>/referenceFastaFiles``; results go to ``<workingDir>/output``.

    Raises:
        RuntimeError: if the number of positional arguments is not one, or if
            ``startJobTree`` returns a non-zero value.
    """
    # Parse the inputs args/options
    parser = OptionParser(usage="usage: workingDir [options]", version="%prog 0.1")
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    if len(args) != 1:
        raise RuntimeError("Expected one argument, got %s arguments: %s" % (len(args), " ".join(args)))
    workingDir = args[0]

    # Assign the input files.  The extension tests are substring matches, so
    # any file name that merely contains the extension is picked up.
    readFastqFiles = []
    for entry in os.listdir(os.path.join(workingDir, "readFastqFiles")):
        if ".fq" in entry or ".fastq" in entry:
            readFastqFiles.append(os.path.join(workingDir, "readFastqFiles", entry))
    referenceFastaFiles = []
    for entry in os.listdir(os.path.join(workingDir, "referenceFastaFiles")):
        if ".fa" in entry or ".fasta" in entry:
            referenceFastaFiles.append(os.path.join(workingDir, "referenceFastaFiles", entry))
    outputDir = os.path.join(workingDir, "output")

    # Log the inputs
    logger.info("Using the following working directory: %s" % workingDir)
    logger.info("Using the following output directory: %s" % outputDir)
    for readFastqFile in readFastqFiles:
        logger.info("Got the following read fastq file: %s" % readFastqFile)
    for referenceFastaFile in referenceFastaFiles:
        logger.info("Got the following reference fasta files: %s" % referenceFastaFile)

    # This call invokes jobTree; `mappers` and `analyses` are module-level names.
    root = Target.makeTargetFn(
        setupExperiments,
        args=(readFastqFiles, referenceFastaFiles, mappers, analyses, outputDir))
    failed = Stack(root).startJobTree(options)
    if failed != 0:
        raise RuntimeError("Got failed jobs")
def chainNetStartup(opts):
    """Entry point that starts the jobTree for ``chainNetTarget``.

    The target receives the hal, genome, 2bit, chain and net values read from
    ``opts``.  An ``Exception`` is raised when ``startJobTree`` returns a
    non-zero value.
    """
    target_args = (
        opts.hal,
        opts.queryGenome,
        opts.queryTwoBit,
        opts.targetGenome,
        opts.targetTwoBit,
        opts.chainFile,
        opts.netFile,
        opts.synChainFile,
        opts.synNetFile,
    )
    root = Target.makeTargetFn(chainNetTarget, target_args)
    failures = Stack(root).startJobTree(opts)
    if failures == 0:
        return
    raise Exception("Error: " + str(failures) + " jobs failed")
def main():
    """Parse arguments, select the bam files to use and run ``main_hints_fn`` under jobTree.

    Bam files come either from ``--bamFiles`` (a list) or from ``--bamFofn``
    (a file with one path per line); both are stored in ``args.bams``.  Any
    bam whose path contains one of the ``--filterTissues`` or
    ``--filterCenters`` words is dropped before the jobTree starts.

    Raises:
        RuntimeError: if the bam fofn does not exist, or if ``startJobTree``
            returns a non-zero value.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--genome", required=True)
    parser.add_argument("--database", required=True)
    parser.add_argument("--hintsDir", required=True)
    parser.add_argument("--fasta", required=True)
    parser.add_argument("--filterTissues", nargs="+")
    parser.add_argument("--filterCenters", nargs="+")
    bamfiles = parser.add_mutually_exclusive_group(required=True)
    bamfiles.add_argument("--bamFiles", nargs="+", help="bamfiles being used", dest="bams")
    bamfiles.add_argument("--bamFofn", help="File containing list of bamfiles", dest="bams")
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()

    # --bamFiles yields a list, --bamFofn a single path; normalise both to a set.
    if isinstance(args.bams, list):
        args.bams = set(args.bams)
    else:
        if not os.path.exists(args.bams):
            raise RuntimeError("ERROR: bamFofn does not exist.")
        # Close the fofn once read instead of leaving the handle open.
        with open(args.bams) as fofn:
            bams = {line.rstrip() for line in fofn}
        # A blank line would otherwise turn into an empty bam path.
        bams.discard("")
        args.bams = bams

    # Ugly hack: tissues/centers are filtered out by looking for the words in
    # the file path.
    for to_remove_list in [args.filterTissues, args.filterCenters]:
        if isinstance(to_remove_list, list):
            to_remove = {b for b in args.bams if any(x in b for x in to_remove_list)}
            args.bams -= to_remove

    s = Stack(Target.makeTargetFn(main_hints_fn, memory=8 * 1024 ** 3,
                                  args=[args.bams, args.database, args.genome, args.fasta, args.hintsDir]))
    i = s.startJobTree(args)
    if i != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Parse arguments and run ``build_analyses`` as the root jobTree target.

    The root target is requested with ``memory=8 * (1024**3)``.

    Raises:
        RuntimeError: if ``startJobTree`` returns a non-zero value.
    """
    args = parse_args()
    root = Target.makeTargetFn(build_analyses, memory=8 * (1024**3), args=[args])
    failed = Stack(root).startJobTree(args)
    if failed != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Parse arguments and start the root jobTree target that matches ``args.mode``.

    "reference" runs ``main_ref_fn``, "transMap" runs ``main_fn`` (which also
    receives ``args.refGenome``) and every other mode runs ``main_augustus_fn``.

    Raises:
        RuntimeError: if ``startJobTree`` returns a non-zero value.
    """
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()

    if args.mode == "transMap":
        # args.refGenome is only read in this mode.
        root_fn = main_fn
        root_args = [args.comparativeAnnotationDir, args.gencode, args.genome, args.refGenome,
                     args.outDir, args.filterChroms]
    else:
        root_fn = main_ref_fn if args.mode == "reference" else main_augustus_fn
        root_args = [args.comparativeAnnotationDir, args.gencode, args.genome, args.outDir,
                     args.filterChroms]

    s = Stack(Target.makeTargetFn(root_fn, args=root_args))
    failed = s.startJobTree(args)
    if failed != 0:
        raise RuntimeError("Got failed jobs")
def mapThenAnalyse(target, readFastqFile, readType, referenceFastaFile, mapper, analyses, experimentDir):
    """Run ``mapper`` for one experiment if needed, then schedule ``runAnalyses``.

    The experiment directory is created when missing.  When
    ``<experimentDir>/mapping.sam`` does not exist yet, a ``mapper`` child
    target is added and ``remapped`` is True; otherwise mapping is reported as
    already complete.  ``runAnalyses`` is always set as the follow-on target.
    """
    if os.path.exists(experimentDir):
        target.logToMaster("Experiment dir already exists: %s" % experimentDir)
    else:
        os.mkdir(experimentDir)
        target.logToMaster("Creating experiment dir: %s" % experimentDir)

    samFile = os.path.join(experimentDir, "mapping.sam")
    hmmFileToTrain = os.path.join(experimentDir, "hmm.txt")

    # A missing sam file is the only trigger for (re)mapping.
    remapped = not os.path.exists(samFile)
    if remapped:
        target.logToMaster(
            "Starting mapper %s for reference file %s and read file %s" %
            (mapper.__name__, referenceFastaFile, readFastqFile))
        target.addChildTarget(
            mapper(readFastqFile, readType, referenceFastaFile, samFile, hmmFileToTrain))
    else:
        target.logToMaster(
            "Mapper %s for reference file %s and read file %s is already complete" %
            (mapper.__name__, referenceFastaFile, readFastqFile))

    follow_on_args = (readFastqFile, readType, referenceFastaFile, samFile, analyses,
                      experimentDir, remapped, mapper)
    target.setFollowOnTarget(Target.makeTargetFn(runAnalyses, args=follow_on_args))
def main():
    """Command-line entry point for the ``expectationMaximisationTrials`` jobTree.

    No positional arguments are accepted; the inputs come from ``--sequences``
    and ``--alignments`` plus the options registered by
    ``addExpectationMaximisationOptions``.

    Raises:
        RuntimeError: if positional arguments are given, or if
            ``startJobTree`` returns a non-zero value.
    """
    # Parse the inputs args/options
    parser = OptionParser(usage="usage: workingDir [options]", version="%prog 0.1")
    em_options = Options()
    parser.add_option("--sequences", dest="sequences",
                      help="Quoted list of fasta files containing sequences")
    parser.add_option("--alignments", dest="alignments", help="Cigar file ")
    addExpectationMaximisationOptions(parser, em_options)
    Stack.addJobTreeOptions(parser)

    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    if len(args) != 0:
        raise RuntimeError("Expected no arguments, got %s arguments: %s" % (len(args), " ".join(args)))

    # Log the inputs
    logger.info(
        "Got '%s' sequences, '%s' alignments file, '%s' output model and '%s' iterations of training"
        % (options.sequences, options.alignments, options.outputModel, options.iterations))

    # This call invokes jobTree
    trial_args = (options.sequences, options.alignments, options.outputModel, options)
    root = Target.makeTargetFn(expectationMaximisationTrials, args=trial_args)
    failed = Stack(root).startJobTree(options)
    if failed != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Parse arguments and run ``wrapper`` under jobTree for a fixed list of biotypes.

    Raises:
        RuntimeError: if ``startJobTree`` returns a non-zero value.
    """
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()

    # The biotypes are hard-coded here rather than read from args.attributePath.
    biotypes = ["protein_coding", "miRNA", "snoRNA", "snRNA", "lincRNA",
                "processed_pseudogenes", "unprocessed_pseudogenes", "pseudogenes"]
    job_args = (args.comparativeAnnotationDir, args.attributePath, args.annotationGp,
                args.gencode, args.genomes, biotypes, args.outDir)

    root = Target.makeTargetFn(wrapper, args=job_args)
    failed = Stack(root).startJobTree(args)
    if failed != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Parse arguments and run ``align_gp`` under jobTree in "cgp" or "consensus" mode.

    Exactly one of ``--cgpGp`` and ``--consensusGp`` must be given; it selects
    the genePred, the mode string and the chunk size handed to ``align_gp``.
    The output database path is ``--outDb`` joined onto ``--compAnnPath``.

    Raises:
        RuntimeError: if ``startJobTree`` returns a non-zero value.
    """
    parser = argparse.ArgumentParser()
    for required_flag in ("--genome", "--refGenome", "--refTranscriptFasta", "--targetGenomeFasta"):
        parser.add_argument(required_flag, required=True)
    parser.add_argument("--outDb", default="cgp_cds_metrics.db")
    parser.add_argument("--compAnnPath", required=True)
    gp_group = parser.add_mutually_exclusive_group(required=True)
    gp_group.add_argument("--cgpGp")
    gp_group.add_argument("--consensusGp")
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()

    out_db = os.path.join(args.compAnnPath, args.outDb)
    if args.cgpGp is None:
        gp, mode, chunk_size = args.consensusGp, "consensus", 40
    else:
        # smaller chunk size because we will do more alignments per transcript
        gp, mode, chunk_size = args.cgpGp, "cgp", 15

    job_args = [args.genome, args.refGenome, args.refTranscriptFasta, args.targetGenomeFasta,
                gp, mode, out_db, args.compAnnPath, chunk_size]
    s = Stack(Target.makeTargetFn(align_gp, args=job_args))
    failed = s.startJobTree(args)
    if failed != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Parse arguments, load the pickled cgquery dict and run ``build_analyses`` under jobTree.

    The ILP penalties and k-mer size are bundled with ``ilp_tuple`` and the
    file locations with ``paths_tuple``.  The output directory is created
    when it does not exist.

    Raises:
        IOError: if the cgquery file cannot be opened or read.
        RuntimeError: if ``startJobTree`` returns a non-zero value.
    """
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    ilp_config = ilp_tuple(args.breakpoint_penalty, args.data_penalty,
                           args.expected_value_penalty, args.trash_penalty, args.kmer_size)
    paths = paths_tuple(args.out_dir, args.aln_index, args.whitelist, args.masked_ref,
                        args.unmasked_ref, args.bad_kmers, args.normalizing, args.key_file)

    try:
        # Close the file once loaded instead of leaking the handle.
        # NOTE(review): pickle executes arbitrary code on load; only point
        # --cgquery_file at files you trust.
        with open(args.cgquery_file) as cgquery_handle:
            cgquery_dict = pickle.load(cgquery_handle)
    except IOError:
        raise IOError("Cgquery dict does not exist.")

    if not os.path.exists(paths.out_dir):
        os.makedirs(paths.out_dir)

    i = Stack(
        Target.makeTargetFn(build_analyses,
                            args=(paths, ilp_config, cgquery_dict))).startJobTree(args)
    if i != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Validate the command-line options and start the sorting jobTree with ``setup``.

    Options:
        --fileToSort: path of the file to sort; must exist.
        --N: positive integer threshold (default 10000) passed to ``setup``.

    Raises:
        RuntimeError: if no file is given, the file does not exist, N is not
            positive, or positional arguments are present.
    """
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)

    parser.add_option("--fileToSort", dest="fileToSort",
                      help="The file you wish to sort")
    parser.add_option(
        "--N", dest="N",
        help="The threshold below which a serial sort function is used to sort file. All lines must of length less than or equal to N or program will fail",
        default=10000)

    options, args = parser.parse_args()

    if options.fileToSort is None:
        raise RuntimeError("No file to sort given")
    if not os.path.exists(options.fileToSort):
        raise RuntimeError("File to sort does not exist: %s" % options.fileToSort)
    # Convert once; the value arrives as a string when given on the command line.
    threshold = int(options.N)
    if threshold <= 0:
        raise RuntimeError("Invalid value of N: %s" % options.N)
    if len(args) != 0:
        raise RuntimeError("Unrecognised input arguments: %s" % " ".join(args))

    # Now we are ready to run
    # NOTE(review): the value returned by startJobTree is not inspected here;
    # confirm whether failed jobs are meant to be ignored by this script.
    Stack(Target.makeTargetFn(setup, (options.fileToSort, threshold))).startJobTree(options)
def main():
    """Prepare the output location, run ``build_analysis`` under jobTree and merge the databases.

    With ``args.overwriteDb`` set to True, the merged database and each
    per-genome ``<genome>.db`` in the output directory are removed first.
    Input files are located with ``parse_dir`` and the reference 2bit must
    exist in the data directory.

    Raises:
        RuntimeError: if the reference 2bit is missing, or if ``startJobTree``
            returns a non-zero value.
    """
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    if not os.path.exists(args.outDir):
        os.mkdir(args.outDir)

    if args.overwriteDb is True:
        if os.path.exists(args.mergedDb):
            os.remove(args.mergedDb)
        for genome in args.genomes:
            genome_db = os.path.join(args.outDir, genome + ".db")
            if os.path.exists(genome_db):
                os.remove(genome_db)

    logger.info("Building paths to the required files")
    alnPslDict = parse_dir(args.genomes, args.dataDir, alignment_ext)
    seqTwoBitDict = parse_dir(args.genomes, args.dataDir, sequence_ext)
    geneCheckBedDict = parse_dir(args.genomes, args.dataDir, gene_check_ext)

    refSequence = os.path.join(args.dataDir, args.refGenome + ".2bit")
    if not os.path.exists(refSequence):
        raise RuntimeError("Reference genome 2bit not present at {}".format(refSequence))
    args.refSequence = refSequence

    job_args = (alnPslDict, seqTwoBitDict, geneCheckBedDict, args.gencodeAttributeMap,
                args.genomes, args.annotationBed, args.outDir, args.primaryKey, args.refGenome)
    failed = Stack(Target.makeTargetFn(build_analysis, args=job_args)).startJobTree(args)
    if failed != 0:
        raise RuntimeError("Got failed jobs")

    # Only reached when no job failed.
    merge_databases(args.outDir, args.mergedDb, args.genomes)
def main():
    """Check the sort options and launch the jobTree rooted at ``setup``.

    ``--fileToSort`` must name an existing file and ``--N`` (default 10000)
    must convert to a positive integer.  No positional arguments are allowed.

    Raises:
        RuntimeError: on a missing or non-existent file, a non-positive N, or
            unexpected positional arguments.
    """
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)

    parser.add_option("--fileToSort", dest="fileToSort", help="The file you wish to sort")
    parser.add_option("--N", dest="N",
                      help="The threshold below which a serial sort function is used to sort file. All lines must of length less than or equal to N or program will fail",
                      default=10000)

    options, args = parser.parse_args()

    if options.fileToSort is None:
        raise RuntimeError("No file to sort given")

    if not os.path.exists(options.fileToSort):
        raise RuntimeError("File to sort does not exist: %s" % options.fileToSort)

    # A single conversion serves both the range check and the target arguments.
    n = int(options.N)
    if n <= 0:
        raise RuntimeError("Invalid value of N: %s" % options.N)

    if len(args) != 0:
        raise RuntimeError("Unrecognised input arguments: %s" % " ".join(args))

    # Now we are ready to run
    # NOTE(review): startJobTree's return value is not checked in this script;
    # confirm whether that is intentional.
    Stack(Target.makeTargetFn(setup, (options.fileToSort, n))).startJobTree(options)
def main():
    """Parse arguments and run ``build_analyses`` (including the Augustus genePred) under jobTree.

    Raises:
        RuntimeError: if ``startJobTree`` returns a non-zero value.
    """
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()

    job_args = (
        args.refGenome,
        args.genome,
        args.annotationGp,
        args.psl,
        args.gp,
        args.augustusGp,
        args.fasta,
        args.refFasta,
        args.sizes,
        args.gencodeAttributes,
        args.outDir,
    )
    root = Target.makeTargetFn(build_analyses, args=job_args)
    failed = Stack(root).startJobTree(args)
    if failed != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Parse arguments and hand them, in a fixed order, to ``build_analyses`` under jobTree.

    Raises:
        RuntimeError: if ``startJobTree`` returns a non-zero value.
    """
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()

    # Attribute names in the positional order build_analyses is called with.
    ordered_fields = ("refGenome", "genome", "annotationGp", "psl", "gp", "fasta",
                      "refFasta", "sizes", "gencodeAttributes", "outDir")
    job_args = tuple(getattr(args, field) for field in ordered_fields)

    failed = Stack(Target.makeTargetFn(build_analyses, args=job_args)).startJobTree(args)
    if failed != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Derive the target genomes and msa_split options, then run ``dless_pipeline_wrapper``.

    The target genomes are everything ``extract_model_tree`` returns for the
    model minus the reference genome.

    Raises:
        RuntimeError: if ``startJobTree`` returns a non-zero value.
    """
    args = parse_args()

    model_genomes = extract_model_tree(args.model)
    args.target_genomes = model_genomes - {args.ref_genome}

    split_options = ["--windows", args.windows, "--between-blocks", args.between_blocks]
    args.msa_split_options = " ".join(split_options)

    root = Target.makeTargetFn(dless_pipeline_wrapper, args=(args,))
    failed = Stack(root).startJobTree(args)
    if failed != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Parse arguments, set the default cpu/memory on ``args`` and run ``wrapper`` under jobTree.

    ``args.defaultCpu`` is taken from ``args.num_threads`` and
    ``args.defaultMemory`` is set to ``8 * 1024 ** 3``; the root target is
    requested with the same two values.

    Raises:
        RuntimeError: if ``startJobTree`` returns a non-zero value.
    """
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    args.defaultCpu = args.num_threads
    args.defaultMemory = 8 * 1024 ** 3

    root = Target.makeTargetFn(wrapper, args=(args,), memory=args.defaultMemory,
                               cpu=args.defaultCpu)
    failed = Stack(root).startJobTree(args)
    if failed != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Parse arguments and run ``wrapper`` on the source dir, reference and output dir.

    Raises:
        RuntimeError: if ``startJobTree`` returns a non-zero value.
    """
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    job_args = (args.source_dir, args.reference, args.out_dir)
    root = Target.makeTargetFn(wrapper, args=job_args)
    failed = Stack(root).startJobTree(args)
    if failed != 0:
        raise RuntimeError("Got failed jobs")
def mapThenAnalyse(target, readFastaFile, referenceFastaFile, mapper, analyses, experimentDir): print "Experiment dir", experimentDir if not os.path.exists(experimentDir): os.mkdir(experimentDir) target.logToMaster("Creating experiment dir: %s" % experimentDir) else: target.logToMaster("Experiment dir already exists: %s" % experimentDir) samFile = os.path.join(experimentDir, "mapping.sam") if not os.path.exists(samFile) or isNewer(readFastaFile, samFile) or isNewer(referenceFastaFile, samFile): target.addChildTarget(mapper(readFastaFile, referenceFastaFile, samFile)) target.setFollowOnTarget(Target.makeTargetFn(runAnalyses, args=(readFastaFile, referenceFastaFile, samFile, analyses, experimentDir)))
def main():
    """Parse arguments and run ``buildAnalyses`` with the penalty, graph and k-mer settings.

    Raises:
        RuntimeError: if ``startJobTree`` returns a non-zero value.
    """
    parser = buildParser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    job_args = (
        args.output,
        args.breakpoint_penalty,
        args.data_penalty,
        args.tightness_penalty,
        args.graph,
        args.kmer_size,
        args.save_intermediate,
    )
    root = Target.makeTargetFn(buildAnalyses, args=job_args)
    failed = Stack(root).startJobTree(args)
    if failed != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Parse arguments and run ``buildAnalyses`` on the fastq list under jobTree.

    Raises:
        RuntimeError: if ``startJobTree`` returns a non-zero value.
    """
    parser = buildParser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    job_args = (args.output, args.fastq_list, args.save_intermediate)
    root = Target.makeTargetFn(buildAnalyses, args=job_args)
    failed = Stack(root).startJobTree(args)
    if failed != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Run ``wrapper`` under jobTree, then delete ``<outDir>/tmp``.

    The ``tmp`` directory is only removed when no job failed, because the
    failure check raises first.

    Raises:
        RuntimeError: if ``startJobTree`` returns a non-zero value.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--genomes", nargs="+", required=True)
    for required_flag in ("--refFasta", "--outDir", "--augustusStatsDir"):
        parser.add_argument(required_flag, required=True)
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()

    job_args = (args.genomes, args.refFasta, args.augustusStatsDir, args.outDir)
    failed = Stack(Target.makeTargetFn(wrapper, args=job_args)).startJobTree(args)
    if failed != 0:
        raise RuntimeError("Got failed jobs")

    tmp_dir = os.path.join(args.outDir, "tmp")
    shutil.rmtree(tmp_dir)
def main():
    """Fill in the target genomes and msa_split options, then run ``subset_hal_pipeline``.

    When no target genomes were given they default to the genomes from
    ``extract_model_tree`` minus the reference genome.

    Raises:
        RuntimeError: if ``startJobTree`` returns a non-zero value.
    """
    args = parse_args()

    if args.target_genomes is None:
        args.target_genomes = extract_model_tree(args.model) - {args.ref_genome}

    split_options = [
        "--windows", args.windows,
        "--between-blocks", args.between_blocks,
        "--min-informative", args.min_informative,
    ]
    args.msa_split_options = " ".join(split_options)

    root = Target.makeTargetFn(subset_hal_pipeline, args=(args,))
    failed = Stack(root).startJobTree(args)
    if failed != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Parse the five required arguments and run ``wrapper`` under jobTree.

    Raises:
        RuntimeError: if ``startJobTree`` returns a non-zero value.
    """
    parser = argparse.ArgumentParser()
    for required_flag in ("--inputGp", "--outputGtf", "--genome", "--chromSizes", "--fasta"):
        parser.add_argument(required_flag, required=True)
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()

    job_args = (args.inputGp, args.outputGtf, args.genome, args.chromSizes, args.fasta)
    root = Target.makeTargetFn(wrapper, args=job_args)
    failed = Stack(root).startJobTree(args)
    if failed != 0:
        raise RuntimeError("Got failed jobs")
def find_analyses(target, unmappedByReadType, outputDir):
    """Queue ``run_blast`` child targets for the unmapped reads, then a ``merge`` follow-on.

    ``unmappedByReadType`` maps each read type to a mapping of read name to
    sequence.  The reads of a read type are rendered as fasta records and
    submitted to ``run_blast`` with an output path in the target's global
    temp dir.  ``merge`` receives the output paths per read type together
    with ``outputDir``.
    """
    outfiles = dict()
    for readType in unmappedByReadType:
        reads = unmappedByReadType[readType]
        outfiles[readType] = list()
        records = list()
        last_index = len(reads) - 1
        for i, (name, sequence) in enumerate(reads.items()):
            records.append(">{}\n{}\n".format(name, sequence))
            # NOTE(review): records are flushed only at the last read, so each
            # read type is submitted as a single job.  An earlier
            # `i % 10 == 1200` test here could never be true; confirm whether
            # smaller batches were intended.
            if i == last_index:
                # The read type is part of the file name: the index alone
                # collides when two read types hold the same number of reads.
                tmpalign = os.path.join(target.getGlobalTempDir(),
                                        "{}_{}.txt".format(readType, i))
                outfiles[readType].append(tmpalign)
                target.addChildTarget(Target.makeTargetFn(run_blast, args=(records, tmpalign)))
                records = list()
    target.setFollowOnTargetFn(merge, args=(outfiles, outputDir))
def main():
    """Entry point: parse the options and start the ``buildAnalyses`` jobTree.

    ``buildAnalyses`` receives the output location, the fastq list and the
    save-intermediate flag.

    Raises:
        RuntimeError: if ``startJobTree`` returns a non-zero value.
    """
    parser = buildParser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    stack = Stack(Target.makeTargetFn(
        buildAnalyses, args=(args.output, args.fastq_list, args.save_intermediate)))
    failed = stack.startJobTree(args)
    if failed == 0:
        return
    raise RuntimeError("Got failed jobs")
def setupExperiments(target, readFastaFiles, referenceFastaFiles, mappers, analysers, outputDir):
    """Add one ``mapThenAnalyse`` child target per (read file, reference file, mapper) combination.

    The output directory is created when missing.  Each experiment gets its
    own directory under ``outputDir`` named
    ``experiment_<read file>_<reference file>_<mapper name>``, and is passed
    the ``analysers`` given to this function.
    """
    if not os.path.exists(outputDir):
        # If the output dir doesn't yet exist create it
        os.mkdir(outputDir)
        target.logToMaster("Creating output dir: %s" % outputDir)
    else:
        target.logToMaster("Root output dir already exists: %s" % outputDir)

    for readFastaFile in readFastaFiles:
        for referenceFastaFile in referenceFastaFiles:
            for mapper in mappers:
                experimentDir = os.path.join(
                    outputDir,
                    "experiment_%s_%s_%s" % (os.path.split(readFastaFile)[-1],
                                             os.path.split(referenceFastaFile)[-1],
                                             mapper.__name__))
                # Pass on the `analysers` argument; reading a module-level
                # `analyses` name here would ignore what the caller supplied.
                target.addChildTarget(Target.makeTargetFn(
                    mapThenAnalyse,
                    args=(readFastaFile, referenceFastaFile, mapper, analysers, experimentDir)))
def main():
    """Select twoD reads that mapped only as twoD and run ``find_analyses`` on them under jobTree.

    Three positional arguments are expected: the template, complement and
    twoD sam files, in that order.  A twoD read is analysed when it is mapped
    in the twoD file but its name is among neither the mapped template nor
    the mapped complement reads.  Fastq and reference files are read from
    fixed relative paths (``../readFastqFiles/...``, ``../referenceFastaFiles``)
    and output goes to ``muscle_compare_2d/output/``.

    Raises:
        RuntimeError: on a wrong argument count, missing template/complement
            folders, no reference fasta files, no reads to analyse, or a
            non-zero ``startJobTree`` result.
    """
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    outputDir = "muscle_compare_2d/output/"
    if not os.path.exists(outputDir):
        # Fill in the placeholder; the message was logged with a literal "{}".
        logger.info("Output dir {} does not exist. Creating.".format(outputDir))
        os.mkdir(outputDir)
    if len(os.listdir(outputDir)) > 0:
        logger.info("Output dir not empty.")

    if len(args) != 3:
        raise RuntimeError("Error: expected three arguments got %s arguments: %s" % (len(args), " ".join(args)))

    templateRecords = {x.qname for x in pysam.Samfile(args[0]) if not x.is_unmapped}
    complementRecords = {x.qname for x in pysam.Samfile(args[1]) if not x.is_unmapped}

    twodSamFile = pysam.Samfile(args[2])
    twodRecords = {x.qname: x for x in twodSamFile if not x.is_unmapped}

    recordsToAnalyze = dict()
    for name, record in twodRecords.items():
        if name not in templateRecords and name not in complementRecords:
            ref_name = twodSamFile.getrname(record.tid)
            # Reference interval covered by the alignment: [aend - alen, aend).
            ref_start, ref_stop = int(record.aend - record.alen), int(record.aend)
            recordsToAnalyze[name] = [ref_name, ref_start, ref_stop]

    if os.path.exists("../readFastqFiles/template/") and os.path.exists("../readFastqFiles/complement"):
        templateFastqFiles = [os.path.join("../readFastqFiles/template/", x)
                              for x in os.listdir("../readFastqFiles/template/")
                              if x.endswith(".fastq") or x.endswith(".fq")]
        complementFastqFiles = [os.path.join("../readFastqFiles/complement/", x)
                                for x in os.listdir("../readFastqFiles/complement/")
                                if x.endswith(".fastq") or x.endswith(".fq")]
    else:
        raise RuntimeError("Error: readFastqFiles does not contain template and/or complement folders")

    referenceFastaFiles = [os.path.join("../referenceFastaFiles", x)
                           for x in os.listdir("../referenceFastaFiles")
                           if x.endswith(".fa") or x.endswith(".fasta")]

    if len(referenceFastaFiles) > 0:
        # Key each reference by the first space-separated word of its fasta header.
        references = {y[0].split(" ")[0]: y[1] for x in referenceFastaFiles for y in fastaRead(x)}
    else:
        raise RuntimeError("Error: no reference fasta files")

    if len(recordsToAnalyze) == 0:
        raise RuntimeError("Error: none of the mappable twoD reads in this set did not map as template/complement.")

    logger.info("Starting to find analyses to run...")
    job_args = (recordsToAnalyze, templateFastqFiles, complementFastqFiles, references, outputDir)
    i = Stack(Target.makeTargetFn(find_analyses, args=job_args)).startJobTree(options)

    if i != 0:
        raise RuntimeError("Got {} failed jobs".format(i))
def main():
    """Command-line entry point for the margin variant caller jobTree.

    Usage: ``inputSamFile referenceFastaFile outputVcfFile [options]``.  With
    no command-line arguments at all the help text is printed and the
    process exits with status 0.

    Raises:
        RuntimeError: if the number of positional arguments is not three, or
            if the jobTree reports failed jobs.
    """
    # Parse the inputs args/options
    parser = OptionParser(usage="usage: inputSamFile referenceFastaFile outputVcfFile [options]",
                          version="%prog 0.1")

    # Options.  The --noMargin help text is built from adjacent literals; a
    # backslash continuation inside the string would pull the source
    # indentation into the displayed help.
    parser.add_option("--noMargin", dest="noMargin",
                      help="Do not marginalise over the read "
                           "alignments, rather use the input alignment to call the variants (this will be faster)",
                      default=False, action="store_true")
    parser.add_option("--alignmentModel",
                      default=os.path.join(pathToBaseNanoporeDir(), "src", "margin", "mappers", "last_hmm_20.txt"),
                      help="The model to use in realigning the reads to the reference.")
    parser.add_option("--errorModel",
                      default=os.path.join(pathToBaseNanoporeDir(), "src", "margin", "mappers", "last_hmm_20.txt"),
                      help="The model to use in calculating the difference between the predicted true reference and the reads.")
    parser.add_option("--maxAlignmentLengthPerJob", default=7000000,
                      help="Maximum total alignment length of alignments to include in one posterior prob calculation job.",
                      type=int)
    parser.add_option("--threshold", default=0.3,
                      help="The posterior probability threshold for a non-reference base above which to report a variant.",
                      type=float)

    # Add the jobTree options
    Stack.addJobTreeOptions(parser)

    # Parse the options/arguments
    options, args = parser.parse_args()

    # Setup logging
    setLoggingFromOptions(options)

    # Print help message if no input
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    # Exit if the arguments are not what we expect
    if len(args) != 3:
        raise RuntimeError("Expected three arguments, got: %s" % " ".join(args))

    # Single-argument print calls behave the same as the print statement.
    print(options.errorModel)
    print(options.threshold)

    # This line invokes jobTree
    i = Stack(Target.makeTargetFn(fn=marginCallerTargetFn,
                                  args=(args[0], args[1], args[2], options))).startJobTree(options)

    # The return value of the jobtree script is the number of failed jobs. If
    # we have any then report this.
    if i != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Collect the per-sample count files and run ``buildDictWrapper`` under jobTree.

    Every entry ``x`` of the data directory yields the pair
    ``[x, <data_dir>/x/x.Counts.fa]``.  The output directory is created when
    it does not exist.

    Raises:
        RuntimeError: if ``startJobTree`` returns a non-zero value.
    """
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    if not os.path.exists(args.out_dir):
        os.mkdir(args.out_dir)

    count_files = []
    for sample in os.listdir(args.data_dir):
        count_files.append([sample, os.path.join(args.data_dir, sample, sample + ".Counts.fa")])

    job_args = (count_files, args.out_dir, args.graph, args.new_graph)
    failed = Stack(Target.makeTargetFn(buildDictWrapper, args=job_args)).startJobTree(args)
    if failed != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Entry point: run ``wrapper`` under jobTree with a hard-coded list of biotypes.

    Raises:
        RuntimeError: if ``startJobTree`` returns a non-zero value.
    """
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()

    # Fixed set of biotypes; args.attributePath is passed on but not parsed here.
    biotypes = [
        "protein_coding",
        "miRNA",
        "snoRNA",
        "snRNA",
        "lincRNA",
        "processed_pseudogenes",
        "unprocessed_pseudogenes",
        "pseudogenes",
    ]

    stack = Stack(Target.makeTargetFn(wrapper, args=(
        args.comparativeAnnotationDir,
        args.attributePath,
        args.annotationGp,
        args.gencode,
        args.genomes,
        biotypes,
        args.outDir,
    )))
    failed = stack.startJobTree(args)
    if failed != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Entry point: read the genePred/GTF/genome arguments and start ``wrapper`` under jobTree.

    All of ``--inputGp``, ``--outputGtf``, ``--genome``, ``--chromSizes`` and
    ``--fasta`` are required.

    Raises:
        RuntimeError: if ``startJobTree`` returns a non-zero value.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--inputGp", required=True)
    parser.add_argument("--outputGtf", required=True)
    parser.add_argument("--genome", required=True)
    parser.add_argument("--chromSizes", required=True)
    parser.add_argument("--fasta", required=True)
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()

    stack = Stack(Target.makeTargetFn(
        wrapper,
        args=(args.inputGp, args.outputGtf, args.genome, args.chromSizes, args.fasta)))
    failed = stack.startJobTree(args)
    if failed == 0:
        return
    raise RuntimeError("Got failed jobs")
def main():
    """Validate the HAL labels, find the genomes shared by all HALs and run ``createHub``.

    Raises:
        ValueError: if the number of labels differs from the number of HALs,
            or if the HALs have no genome in common.
    """
    opts = parse_args()

    # Create labels for the HALs if none were provided
    if opts.labels is None:
        opts.labels = [os.path.basename(hal_path) for hal_path in opts.hals]
    label_count, hal_count = len(opts.labels), len(opts.hals)
    if label_count != hal_count:
        raise ValueError("%d labels were provided, but %d hals were provided." % (label_count, hal_count))

    # Ensure that the hals have some genomes in common, and take the common
    # genomes to display in the hub.
    genomes_per_hal = [getGenomesInHal(hal_path) for hal_path in opts.hals]
    genomes = reduce(lambda shared, current: shared.intersection(current), genomes_per_hal)
    if len(genomes) == 0:
        raise ValueError("No genomes in common between the HALs.")

    root = Target.makeTargetFn(createHub, (genomes, opts))
    Stack(root).startJobTree(opts)
def find_analyses(target, unmappedByReadType, outputDir):
    """Submit the unmapped reads of every read type to ``run_blast`` and merge the results.

    For each read type in ``unmappedByReadType`` the (name, sequence) pairs
    are turned into fasta records and handed to a ``run_blast`` child target
    that writes to a file in the target's global temp dir.  ``merge`` is set
    as the follow-on with the output files per read type and ``outputDir``.
    """
    outfiles = dict()
    for readType in unmappedByReadType:
        reads = unmappedByReadType[readType]
        outfiles[readType] = list()
        records = list()
        last_index = len(reads) - 1
        for i, (name, sequence) in enumerate(reads.items()):
            records.append(">{}\n{}\n".format(name, sequence))
            # NOTE(review): the batch is only submitted at the final read, so
            # there is one job per read type.  A former `i % 10 == 1200`
            # clause could never fire; confirm whether batching was intended.
            if i != last_index:
                continue
            # Including the read type keeps the path unique; the bare index
            # is identical for read types with equally many reads.
            tmpalign = os.path.join(target.getGlobalTempDir(),
                                    "{}_{}.txt".format(readType, i))
            outfiles[readType].append(tmpalign)
            target.addChildTarget(
                Target.makeTargetFn(run_blast, args=(records, tmpalign)))
            records = list()
    target.setFollowOnTargetFn(merge, args=(outfiles, outputDir))
def main():
    """Run the model on a local fastq (``--fastq``) or on a fastq list via ``buildAnalyses``.

    When ``args.fastq`` is set the root target is a ``ModelWrapperLocalFiles``
    instance; otherwise it is a ``buildAnalyses`` function target that
    receives ``args.fastq_list`` instead.

    Raises:
        RuntimeError: if ``startJobTree`` returns a non-zero value.
    """
    parser = buildParser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    if args.fastq is None:
        root = Target.makeTargetFn(buildAnalyses, args=(
            args.name, args.output, args.breakpoint_penalty, args.data_penalty,
            args.tightness_penalty, args.graph, args.fastq_list, args.save_intermediate))
    else:
        root = ModelWrapperLocalFiles(
            args.name, args.output, args.breakpoint_penalty, args.data_penalty,
            args.tightness_penalty, args.graph, args.fastq, args.save_intermediate)

    failed = Stack(root).startJobTree(args)
    if failed != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Entry point: check labels against HALs, intersect their genomes and start ``createHub``.

    Labels default to the base names of the HAL paths.

    Raises:
        ValueError: when labels and HALs differ in number, or when no genome
            is present in every HAL.
    """
    opts = parse_args()

    # Create labels for the HALs if none were provided
    if opts.labels is None:
        opts.labels = [os.path.basename(hal) for hal in opts.hals]

    if len(opts.labels) != len(opts.hals):
        message = "%d labels were provided, but %d hals were provided." % (
            len(opts.labels), len(opts.hals))
        raise ValueError(message)

    def keep_common(accumulated, next_genomes):
        # Narrow the running set to the genomes also found in the next HAL.
        return accumulated.intersection(next_genomes)

    # Ensure that the hals have some genomes in common, and take the common
    # genomes to display in the hub.
    genomess = [getGenomesInHal(hal) for hal in opts.hals]
    genomes = reduce(keep_common, genomess)
    if len(genomes) == 0:
        raise ValueError("No genomes in common between the HALs.")

    stack = Stack(Target.makeTargetFn(createHub, (genomes, opts)))
    stack.startJobTree(opts)
def main():
    """Entry point: pair each data-dir entry with its Counts.fa and start ``buildDictWrapper``.

    The output directory is created first when it is missing.

    Raises:
        RuntimeError: if ``startJobTree`` returns a non-zero value.
    """
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    out_dir_exists = os.path.exists(args.out_dir)
    if not out_dir_exists:
        os.mkdir(args.out_dir)

    # One [name, path] pair per entry of the data directory.
    entries = os.listdir(args.data_dir)
    count_files = [[entry, os.path.join(args.data_dir, entry, entry + ".Counts.fa")]
                   for entry in entries]

    root = Target.makeTargetFn(
        buildDictWrapper, args=(count_files, args.out_dir, args.graph, args.new_graph))
    failed = Stack(root).startJobTree(args)
    if failed == 0:
        return
    raise RuntimeError("Got failed jobs")
def main():
    """Prepare target genomes and tool option strings, then run ``phastcons_pipeline_wrapper``.

    The reference genome is always excluded from the target genomes, whether
    they were given explicitly or taken from ``extract_model_tree``.

    Raises:
        RuntimeError: if ``startJobTree`` returns a non-zero value.
    """
    args = parse_args()

    if args.target_genomes is None:
        candidates = extract_model_tree(args.model)
    else:
        candidates = set(args.target_genomes)
    args.target_genomes = candidates - {args.ref_genome}

    msa_split_parts = [
        "--windows", args.windows,
        "--between-blocks", args.between_blocks,
        "--min-informative", args.min_informative,
    ]
    args.msa_split_options = " ".join(msa_split_parts)

    phastcons_parts = [
        "--target-coverage", args.target_coverage,
        "--expected-length", args.expected_length,
    ]
    args.phastcons_options = " ".join(phastcons_parts)

    root = Target.makeTargetFn(phastcons_pipeline_wrapper, args=(args,))
    failed = Stack(root).startJobTree(args)
    if failed != 0:
        raise RuntimeError("Got failed jobs")
def setupExperiments(target, readFastqFiles, referenceFastaFiles, mappers, analysers, metaAnalyses, outputDir):
    """Schedule a ``mapThenAnalyse`` child per experiment and ``runMetaAnalyses`` as follow-on.

    ``readFastqFiles`` is iterated as ``(readType, files)`` pairs.  Each read
    type gets an ``analysis_<readType>`` directory under ``outputDir``
    (created when missing), and each (read file, reference file, mapper)
    combination becomes one experiment with its own directory name.  The
    list of experiment tuples is handed to ``runMetaAnalyses``.
    """
    experiments = []
    for readType, readTypeFastaFiles in readFastqFiles:
        outputBase = os.path.join(outputDir, "analysis_" + readType)
        if not os.path.exists(outputBase):
            os.mkdir(outputBase)
        for readFastqFile in readTypeFastaFiles:
            for referenceFastaFile in referenceFastaFiles:
                for mapper in mappers:
                    experimentDir = os.path.join(
                        outputBase,
                        "experiment_%s_%s_%s" % (os.path.split(readFastqFile)[-1],
                                                 os.path.split(referenceFastaFile)[-1],
                                                 mapper.__name__))
                    # Use the `analysers` argument; a module-level `analyses`
                    # name would bypass what the caller passed in.
                    experiment = (readFastqFile, readType, referenceFastaFile, mapper,
                                  analysers, experimentDir)
                    target.addChildTarget(
                        Target.makeTargetFn(mapThenAnalyse, args=experiment))
                    experiments.append(experiment)
    target.setFollowOnTargetFn(runMetaAnalyses, args=(metaAnalyses, outputDir, experiments))
def main():
    """Parse arguments and run ``align_augustus`` under jobTree.

    The output database path is ``--outDb`` (default
    "augustus_attributes.db") joined onto ``--outDir``.

    Raises:
        RuntimeError: if ``startJobTree`` returns a non-zero value.
    """
    parser = argparse.ArgumentParser()
    for required_flag in ("--genome", "--refTranscriptFasta", "--targetTranscriptFasta",
                          "--targetTranscriptFastaIndex", "--outDir"):
        parser.add_argument(required_flag, required=True)
    parser.add_argument("--outDb", default="augustus_attributes.db")
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()

    out_db = os.path.join(args.outDir, args.outDb)
    job_args = [args.genome, args.refTranscriptFasta, args.targetTranscriptFasta,
                args.targetTranscriptFastaIndex, out_db]

    root = Target.makeTargetFn(align_augustus, args=job_args)
    failed = Stack(root).startJobTree(args)
    if failed != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Entry point: bundle the ILP and path settings, load the cgquery pickle and start jobTree.

    ``build_analyses`` is run with the ``paths_tuple``, the ``ilp_tuple`` and
    the unpickled cgquery dict.  The output directory is created when it is
    missing.

    Raises:
        IOError: if the cgquery file cannot be opened or read.
        RuntimeError: if ``startJobTree`` returns a non-zero value.
    """
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    ilp_config = ilp_tuple(args.breakpoint_penalty, args.data_penalty, args.expected_value_penalty,
                           args.trash_penalty, args.kmer_size)
    paths = paths_tuple(args.out_dir, args.aln_index, args.whitelist, args.masked_ref, args.unmasked_ref,
                        args.bad_kmers, args.normalizing, args.key_file)

    try:
        # The context manager closes the file; a bare open() left it dangling.
        # NOTE(review): unpickling can run arbitrary code, so the cgquery
        # file must come from a trusted source.
        with open(args.cgquery_file) as pickled:
            cgquery_dict = pickle.load(pickled)
    except IOError:
        raise IOError("Cgquery dict does not exist.")

    if not os.path.exists(paths.out_dir):
        os.makedirs(paths.out_dir)

    i = Stack(Target.makeTargetFn(build_analyses, args=(paths, ilp_config, cgquery_dict))).startJobTree(args)
    if i != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Command-line entry point for the expectation-maximisation trials.

    Accepts only options (no positional arguments), logs the chosen inputs
    and runs expectationMaximisationTrials as the root jobTree target.

    Raises:
        RuntimeError: if positional arguments are given or startJobTree
            reports a non-zero result.
    """
    # Assemble the parser: script options, EM options, then jobTree options.
    parser = OptionParser(usage="usage: workingDir [options]",
                          version="%prog 0.1")
    options = Options()
    parser.add_option(
        "--sequences", dest="sequences",
        help="Quoted list of fasta files containing sequences")
    parser.add_option("--alignments", dest="alignments", help="Cigar file ")
    addExpectationMaximisationOptions(parser, options)
    Stack.addJobTreeOptions(parser)

    options, positional = parser.parse_args()
    setLoggingFromOptions(options)

    if positional:
        raise RuntimeError("Expected no arguments, got %s arguments: %s" %
                           (len(positional), " ".join(positional)))

    # Record what we were asked to do.
    summary = (options.sequences, options.alignments, options.outputModel,
               options.iterations)
    logger.info("Got '%s' sequences, '%s' alignments file, '%s' output model and '%s' iterations of training" % summary)

    # Hand control to jobTree.
    root_target = Target.makeTargetFn(
        expectationMaximisationTrials,
        args=(options.sequences, options.alignments, options.outputModel,
              options))
    failed_jobs = Stack(root_target).startJobTree(options)
    if failed_jobs != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Parse options and run the model either on one fastq or on a fastq list.

    When ``args.fastq`` is set, a ModelWrapperLocalFiles target is the root
    of the jobTree; otherwise buildAnalyses is run over ``args.fastq_list``.

    Raises:
        RuntimeError: if startJobTree reports a non-zero result.
    """
    parser = buildParser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    if args.fastq is None:
        # No single fastq given: fan out over the list of fastq files.
        root_target = Target.makeTargetFn(
            buildAnalyses,
            args=(args.name, args.output, args.breakpoint_penalty,
                  args.data_penalty, args.tightness_penalty, args.graph,
                  args.fastq_list, args.save_intermediate))
    else:
        root_target = ModelWrapperLocalFiles(
            args.name, args.output, args.breakpoint_penalty,
            args.data_penalty, args.tightness_penalty, args.graph,
            args.fastq, args.save_intermediate)

    failed_jobs = Stack(root_target).startJobTree(args)
    if failed_jobs != 0:
        raise RuntimeError("Got failed jobs")
# NOTE(review): the next line is the tail of a definition whose header lies
# outside this view. It is reproduced verbatim, with its original line
# structure lost, and must be re-indented against the missing header.
for l in r: l = l.split() tot += int(l[-1]) - int(l[-2]) outf.write('\t'.join( map(str, [chrom, start, stop, format_ratio(tot, length)])) + '\n')


def cat_results(target, args, paths):
    """
    Concatenates final scores output into one bed file.

    Writes every entry of `paths` to a file-of-filenames in the global temp
    dir, concatenates the listed files into a temporary file with
    `cat | xargs cat`, and finally renames that file to `args.out_bed`.
    """
    # File-of-filenames: one input path per line.
    fofn = os.path.join(target.getGlobalTempDir(), 'fofn')
    with open(fofn, 'w') as outf:
        for p in paths:
            outf.write(p + '\n')
    tmp_p = os.path.join(target.getGlobalTempDir(),
                         os.path.basename(args.out_bed) + '.tmp')
    # xargs hands at most 50 paths to each `cat` invocation.
    # NOTE(review): paths are interpolated into the shell command unquoted,
    # and xargs splits on whitespace, so paths containing spaces or shell
    # metacharacters would break this; confirm inputs never contain them.
    cat_cmd = 'cat {} | xargs -n 50 cat > {}'.format(fofn, tmp_p)
    system(cat_cmd)
    # NOTE(review): os.rename fails when source and destination are on
    # different filesystems; confirm the global temp dir shares a filesystem
    # with args.out_bed, or consider shutil.move.
    os.rename(tmp_p, args.out_bed)


if __name__ == '__main__':
    # NOTE(review): presumably re-imports this module's names under the
    # package path so jobTree can resolve the target functions; confirm.
    from phast.find_single_copy_regions import *
    args = parse_args()
    s = Stack(Target.makeTargetFn(single_copy_wrapper, args=(args, )))
    i = s.startJobTree(args)
    # A non-zero result from startJobTree is treated as failure.
    if i != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Blast the reads no combined analysis mapped and summarise the hits.

    Steps:
      1. Collect the processed read fastq files per read type, the reference
         fasta file names, and the ``mapping.sam`` path of every
         (read file, reference, combined analysis) experiment.
      2. Record which (read name, fastq file) pairs are mapped in any of
         those sam files, and keep the sequence of every read that is not.
      3. Run find_analyses over the unmapped reads through jobTree.
      4. Per read type, read ``<readType>_blast_out.txt`` from the output
         directory, then write a fasta of reads without hits, a tab-separated
         hit report and a counts file, and call the barplot R script.

    Raises:
        RuntimeError: if jobTree reports failed jobs.
    """
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    outputDir = "blast_combined/output/"
    if not os.path.exists(outputDir):
        # The placeholder was previously never filled in.
        logger.info("Output dir {} does not exist. Creating.".format(outputDir))
        os.mkdir(outputDir)
    if os.listdir(outputDir):
        logger.info("Output dir not empty.")

    # find all read fastq files, load into a dict by read type
    readFastqDir = "../output/processedReadFastqFiles/"
    readFastqFiles = dict()
    for readType in readTypes:
        readFastqFiles[readType] = [
            os.path.join(readFastqDir, readType, x)
            for x in os.listdir(os.path.join(readFastqDir, readType))
            if x.endswith((".fq", ".fastq"))
        ]

    # find all reference fasta files (file names only, not full paths)
    referenceFastaFiles = [x for x in os.listdir("../referenceFastaFiles")
                           if x.endswith((".fasta", ".fa"))]

    # find all sam files that were analyzed using combinedAnalyses
    samFiles = {}
    for readType in readTypes:
        samFiles[readType] = [
            (readFastqFile,
             os.path.join("../output", "analysis_" + readType,
                          "experiment_" + os.path.basename(readFastqFile) +
                          "_" + referenceFastaFile + "_" + analysis,
                          "mapping.sam"))
            for readFastqFile, referenceFastaFile, analysis in product(
                readFastqFiles[readType], referenceFastaFiles,
                combinedAnalyses)
        ]

    # (read name, fastq file name) pairs mapped in at least one experiment
    mappedByReadType = defaultdict(set)
    for readType in readTypes:
        for readFastqFileFullPath, samFile in samFiles[readType]:
            readFastqFile = os.path.basename(readFastqFileFullPath)
            mappedByReadType[readType].update(
                (x.qname, readFastqFile)
                for x in pysam.Samfile(samFile) if not x.is_unmapped)

    # sequences of the reads that were not mapped in any experiment
    unmappedByReadType = defaultdict(dict)
    for readType in readTypes:
        for readFastqFileFullPath, samFile in samFiles[readType]:
            readFastqFile = os.path.basename(readFastqFileFullPath)
            for name, seq, qual in fastqRead(readFastqFileFullPath):
                # Keep only the first space-separated token of the header.
                name = name.split(" ")[0]
                if (name, readFastqFile) not in mappedByReadType[readType]:
                    unmappedByReadType[readType][(name, readFastqFile)] = seq

    i = Stack(Target.makeTargetFn(
        find_analyses,
        args=(unmappedByReadType, outputDir))).startJobTree(options)
    if i != 0:
        raise RuntimeError("Got {} failed jobs".format(i))

    for readType in readTypes:
        # build a counter of blast hits and set of read names that did not map
        blast_hits, no_hits = Counter(), set()
        blast_out_path = os.path.join(outputDir, readType + "_blast_out.txt")
        with open(blast_out_path) as blast_in:
            for query, result in parse_blast(blast_in):
                if result is None:
                    no_hits.add(query)
                else:
                    # count number of times each hit was seen
                    blast_hits[tuple(result)] += 1

        # write the unmapped reads without any blast hit to a fasta file
        no_hits_path = os.path.join(outputDir, readType + "_no_hits.fasta")
        with open(no_hits_path, "w") as outf:
            for (name, readFastqFile), seq in unmappedByReadType[readType].items():
                if name in no_hits:
                    outf.write(">{}\n{}\n".format(name, seq))

        # write the blast report, most frequent hit first
        report_path = os.path.join(outputDir, readType + "_blast_report.txt")
        with open(report_path, "w") as blast_out:
            blast_out.write("gi|##|gb|##|\tSpecies\tseqID\tCount\n")
            for result, count in sorted(blast_hits.items(),
                                        key=lambda x: -int(x[-1])):
                blast_out.write("{}\t{}\n".format("\t".join(result), count))

        # calculate the counts and make a barplot
        blast_count = sum(blast_hits.values())
        unmapped_count = len(unmappedByReadType[readType]) - blast_count
        mapped_count = len(mappedByReadType[readType])
        with open(os.path.join(outputDir, readType + "percents.txt"), "w") as outf:
            outf.write("\n".join(
                map(str, [blast_count, unmapped_count, mapped_count])))
        system("Rscript blast_combined/barplot_blast.R {} {} {} {} {}".format(
            blast_count, unmapped_count, mapped_count, readType,
            os.path.join(outputDir, readType + "_blast_barplot.pdf")))
# NOTE(review): the next line is the tail of a definition whose header lies
# outside this view. It is reproduced verbatim, with its original line
# structure lost, and must be re-indented against the missing header.
chrom, start, stop)) continue test_ancestral_nodes(target, region_specific_conserved, accelerated_genomes, maf_path, region_bed, outf_handle)


def cat_results(target, args, paths):
    """
    Concatenates final phastcons output into one gff file.

    Writes every entry of `paths` to a file-of-filenames in the global temp
    dir, concatenates the listed files into a temporary file with
    `cat | xargs cat`, and finally renames that file to `args.out_bed`.

    NOTE(review): the summary above says "gff" but the destination attribute
    is named `out_bed`; confirm which format is actually produced.
    """
    # File-of-filenames: one input path per line.
    fofn = os.path.join(target.getGlobalTempDir(), 'fofn')
    with open(fofn, 'w') as outf:
        for p in paths:
            outf.write(p + '\n')
    tmp_p = os.path.join(target.getGlobalTempDir(),
                         os.path.basename(args.out_bed) + '.tmp')
    # xargs hands at most 50 paths to each `cat` invocation.
    # NOTE(review): paths are interpolated into the shell command unquoted,
    # and xargs splits on whitespace, so paths containing spaces or shell
    # metacharacters would break this; confirm inputs never contain them.
    cat_cmd = 'cat {} | xargs -n 50 cat > {}'.format(fofn, tmp_p)
    system(cat_cmd)
    # NOTE(review): os.rename fails when source and destination are on
    # different filesystems; confirm the global temp dir shares a filesystem
    # with args.out_bed, or consider shutil.move.
    os.rename(tmp_p, args.out_bed)


if __name__ == '__main__':
    # NOTE(review): presumably re-imports this module's names under the
    # package path so jobTree can resolve the target functions; confirm.
    from phast.run_acceleration_tests import *
    args = parse_args()
    setLoggingFromOptions(args)
    s = Stack(Target.makeTargetFn(extract_maf_wrapper, args=(args, )))
    i = s.startJobTree(args)
    # A non-zero result from startJobTree is treated as failure.
    if i != 0:
        raise RuntimeError("Got failed jobs")
def main(args):
    """Parse `args` and run the pipeline target through jobTree.

    Args:
        args: argument list handed to parseArgs.

    Raises:
        RuntimeError: if startJobTree reports a non-zero result.
    """
    opts = parseArgs(args)
    i = Stack(Target.makeTargetFn(pipeline, args=[opts])).startJobTree(opts)
    # Previously the result was discarded, so failed jobs went unnoticed;
    # report them the same way the other jobTree entry points do.
    if i != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Blast the reads no combined analysis mapped and summarise the hits.

    Steps:
      1. Collect the processed read fastq files per read type, the reference
         fasta file names, and the ``mapping.sam`` path of every
         (read file, reference, combined analysis) experiment.
      2. Record which (read name, fastq file) pairs are mapped in any of
         those sam files, and keep the sequence of every read that is not.
      3. Run find_analyses over the unmapped reads through jobTree.
      4. Per read type, read ``<readType>_blast_out.txt`` from the output
         directory, then write a fasta of reads without hits, a tab-separated
         hit report and a counts file, and call the barplot R script.

    Raises:
        RuntimeError: if jobTree reports failed jobs.
    """
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    outputDir = "blast_combined/output/"
    if not os.path.exists(outputDir):
        # The placeholder was previously never filled in.
        logger.info("Output dir {} does not exist. Creating.".format(outputDir))
        os.mkdir(outputDir)
    if os.listdir(outputDir):
        logger.info("Output dir not empty.")

    # find all read fastq files, load into a dict by read type
    readFastqDir = "../output/processedReadFastqFiles/"
    readFastqFiles = dict()
    for readType in readTypes:
        readFastqFiles[readType] = [
            os.path.join(readFastqDir, readType, x)
            for x in os.listdir(os.path.join(readFastqDir, readType))
            if x.endswith((".fq", ".fastq"))
        ]

    # find all reference fasta files (file names only, not full paths)
    referenceFastaFiles = [
        x for x in os.listdir("../referenceFastaFiles")
        if x.endswith((".fasta", ".fa"))
    ]

    # find all sam files that were analyzed using combinedAnalyses
    samFiles = {}
    for readType in readTypes:
        samFiles[readType] = [
            (readFastqFile,
             os.path.join(
                 "../output", "analysis_" + readType,
                 "experiment_" + os.path.basename(readFastqFile) + "_" +
                 referenceFastaFile + "_" + analysis, "mapping.sam"))
            for readFastqFile, referenceFastaFile, analysis in product(
                readFastqFiles[readType], referenceFastaFiles,
                combinedAnalyses)
        ]

    # (read name, fastq file name) pairs mapped in at least one experiment
    mappedByReadType = defaultdict(set)
    for readType in readTypes:
        for readFastqFileFullPath, samFile in samFiles[readType]:
            readFastqFile = os.path.basename(readFastqFileFullPath)
            mappedByReadType[readType].update(
                (x.qname, readFastqFile)
                for x in pysam.Samfile(samFile) if not x.is_unmapped)

    # sequences of the reads that were not mapped in any experiment
    unmappedByReadType = defaultdict(dict)
    for readType in readTypes:
        for readFastqFileFullPath, samFile in samFiles[readType]:
            readFastqFile = os.path.basename(readFastqFileFullPath)
            for name, seq, qual in fastqRead(readFastqFileFullPath):
                # Keep only the first space-separated token of the header.
                name = name.split(" ")[0]
                if (name, readFastqFile) not in mappedByReadType[readType]:
                    unmappedByReadType[readType][(name, readFastqFile)] = seq

    i = Stack(
        Target.makeTargetFn(find_analyses,
                            args=(unmappedByReadType,
                                  outputDir))).startJobTree(options)
    if i != 0:
        raise RuntimeError("Got {} failed jobs".format(i))

    for readType in readTypes:
        # build a counter of blast hits and set of read names that did not map
        blast_hits, no_hits = Counter(), set()
        blast_out_path = os.path.join(outputDir, readType + "_blast_out.txt")
        with open(blast_out_path) as blast_in:
            for query, result in parse_blast(blast_in):
                if result is None:
                    no_hits.add(query)
                else:
                    # count number of times each hit was seen
                    blast_hits[tuple(result)] += 1

        # write the unmapped reads without any blast hit to a fasta file
        no_hits_path = os.path.join(outputDir, readType + "_no_hits.fasta")
        with open(no_hits_path, "w") as outf:
            for (name, readFastqFile), seq in unmappedByReadType[readType].items():
                if name in no_hits:
                    outf.write(">{}\n{}\n".format(name, seq))

        # write the blast report, most frequent hit first
        report_path = os.path.join(outputDir, readType + "_blast_report.txt")
        with open(report_path, "w") as blast_out:
            blast_out.write("gi|##|gb|##|\tSpecies\tseqID\tCount\n")
            for result, count in sorted(blast_hits.items(),
                                        key=lambda x: -int(x[-1])):
                blast_out.write("{}\t{}\n".format("\t".join(result), count))

        # calculate the counts and make a barplot
        blast_count = sum(blast_hits.values())
        unmapped_count = len(unmappedByReadType[readType]) - blast_count
        mapped_count = len(mappedByReadType[readType])
        with open(os.path.join(outputDir, readType + "percents.txt"), "w") as outf:
            outf.write("\n".join(
                map(str, [blast_count, unmapped_count, mapped_count])))
        system("Rscript blast_combined/barplot_blast.R {} {} {} {} {}".format(
            blast_count, unmapped_count, mapped_count, readType,
            os.path.join(outputDir, readType + "_blast_barplot.pdf")))
def main():
    """Prepare the working directory and launch the mapping experiments.

    Expects exactly one positional argument, the working directory, which
    must contain ``readFastqFiles/<readType>/`` sub-directories and a
    ``referenceFastaFiles/`` directory. Read fastq and reference fasta
    files are passed through the name-uniquifying helpers into
    ``<workingDir>/output`` and setupExperiments is run via jobTree.

    Raises:
        RuntimeError: on a wrong number of positional arguments or when
            startJobTree reports a non-zero result.
    """
    # Parse the input args/options
    parser = OptionParser(usage="usage: workingDir [options]",
                          version="%prog 0.1")
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    if len(args) != 1:
        raise RuntimeError("Expected one argument, got %s arguments: %s" %
                           (len(args), " ".join(args)))
    workingDir = args[0]

    # (read sampling and reference mutation steps are currently disabled)

    # Create (if necessary) the output dir
    outputDir = os.path.join(workingDir, "output")
    if os.path.exists(outputDir):
        logger.info("Root output dir already exists: %s" % outputDir)
    else:
        logger.info("Creating output dir: %s" % outputDir)
        os.mkdir(outputDir)

    # Uniquify the sequence names of the input read fastq files, one
    # processed sub-directory per read type.
    processedFastqFiles = os.path.join(outputDir, "processedReadFastqFiles")
    if not os.path.exists(processedFastqFiles):
        os.mkdir(processedFastqFiles)
    fastqParentDir = os.path.join(workingDir, "readFastqFiles")
    candidatePaths = [os.path.join(fastqParentDir, entry)
                      for entry in os.listdir(fastqParentDir)]
    readTypeDirs = [path for path in candidatePaths if os.path.isdir(path)]
    readFastqFiles = []
    for fastqSubDir in readTypeDirs:
        readType = os.path.basename(fastqSubDir)
        processedSubDir = os.path.join(processedFastqFiles, readType)
        if not os.path.exists(processedSubDir):
            os.mkdir(processedSubDir)
        rawSubDir = os.path.join(workingDir, "readFastqFiles", readType)
        uniquifiedFastqs = []
        for fileName in os.listdir(rawSubDir):
            if fileName.endswith(".fq") or fileName.endswith(".fastq"):
                uniquifiedFastqs.append(
                    makeFastqSequenceNamesUnique(
                        os.path.join(rawSubDir, fileName),
                        os.path.join(processedSubDir, fileName)))
        readFastqFiles.append([readType, uniquifiedFastqs])

    # Uniquify the sequence names of the input reference fasta files
    processedFastaFiles = os.path.join(outputDir,
                                       "processedReferenceFastaFiles")
    if not os.path.exists(processedFastaFiles):
        os.mkdir(processedFastaFiles)
    rawFastaDir = os.path.join(workingDir, "referenceFastaFiles")
    referenceFastaFiles = []
    for fileName in os.listdir(rawFastaDir):
        if fileName.endswith(".fa") or fileName.endswith(".fasta"):
            referenceFastaFiles.append(
                makeFastaSequenceNamesUnique(
                    os.path.join(rawFastaDir, fileName),
                    os.path.join(processedFastaFiles, fileName)))

    # Log the inputs
    logger.info("Using the following working directory: %s" % workingDir)
    logger.info("Using the following output directory: %s" % outputDir)
    for readType, readTypeFastqFiles in readFastqFiles:
        logger.info("Got the follow read type: %s" % readType)
        for readFastqFile in readTypeFastqFiles:
            logger.info("Got the following read fastq file: %s" %
                        readFastqFile)
    for referenceFastaFile in referenceFastaFiles:
        logger.info("Got the following reference fasta files: %s" %
                    referenceFastaFile)

    # Hand control to jobTree
    rootTarget = Target.makeTargetFn(
        setupExperiments,
        args=(readFastqFiles, referenceFastaFiles, mappers, analyses,
              metaAnalyses, outputDir))
    failedJobs = Stack(rootTarget).startJobTree(options)
    if failedJobs != 0:
        raise RuntimeError("Got failed jobs")