Example #1
def find_analyses(target, recordsToAnalyze, templateFastqFiles,
                  complementFastqFiles, references, outputDir):
    """takes a set of records to analyze and finds the corresponding sequences and creates alignment targets"""
    files = {"template": [], "complement": []}

    logger.info("Finding template analyses")
    for fastqFile in templateFastqFiles:
        for name, seq, qual in fastqRead(fastqFile):
            if name in recordsToAnalyze:
                outfile = os.path.join(target.getGlobalTempDir(),
                                       "template_" + name)
                files["template"].append(outfile)
                ref_name, ref_start, ref_stop = recordsToAnalyze[name]
                ref_seq = references[ref_name][ref_start:ref_stop]
                analysis = [name, seq, ref_name, ref_seq, outfile]
                target.addChildTarget(
                    Target.makeTargetFn(analyze, args=analysis))

    logger.info("Finding complement analyses")
    for fastqFile in complementFastqFiles:
        for name, seq, qual in fastqRead(fastqFile):
            if name in recordsToAnalyze:
                outfile = os.path.join(target.getGlobalTempDir(),
                                       "complement_" + name)
                files["complement"].append(outfile)
                ref_name, ref_start, ref_stop = recordsToAnalyze[name]
                ref_seq = references[ref_name][ref_start:ref_stop]
                analysis = [name, seq, ref_name, ref_seq, outfile]
                target.addChildTarget(
                    Target.makeTargetFn(analyze, args=analysis))

    target.setFollowOnTargetFn(merge, args=(files, outputDir))
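The pattern above is jobTree's standard fan-out/fan-in: every matching read becomes a child job, and the follow-on target fires once all children have finished. Below is a minimal sketch of the same shape, assuming the classic jobTree import paths are available; do_work and combine are hypothetical stand-ins for the example's analyze and merge.

import os
from jobTree.scriptTree.target import Target

def do_work(target, item, outfile):
    # Child job: each call runs as an independent job in the tree.
    with open(outfile, "w") as f:
        f.write("%s\n" % (item,))

def combine(target, files, outputDir):
    # Follow-on job: runs only after every child above has finished.
    with open(os.path.join(outputDir, "combined.txt"), "w") as out:
        for path in files:
            out.write(open(path).read())

def fan_out(target, items, outputDir):
    files = []
    for n, item in enumerate(items):
        outfile = os.path.join(target.getGlobalTempDir(), "result_%d" % n)
        files.append(outfile)
        target.addChildTarget(Target.makeTargetFn(do_work, args=(item, outfile)))
    # Scheduled after all children complete; receives the collected paths.
    target.setFollowOnTargetFn(combine, args=(files, outputDir))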
Example #2
def find_analyses(target, recordsToAnalyze, templateFastqFiles, complementFastqFiles, references, outputDir):
    """takes a set of records to analyze and finds the corresponding sequences and creates alignment targets"""
    files = {"template":[], "complement":[]}

    logger.info("Finding template analyses")
    for fastqFile in templateFastqFiles:
        for name, seq, qual in fastqRead(fastqFile):
            if name in recordsToAnalyze:
                outfile = os.path.join(target.getGlobalTempDir(), "template_" + name)
                files["template"].append(outfile)
                ref_name, ref_start, ref_stop = recordsToAnalyze[name]
                ref_seq = references[ref_name][ref_start:ref_stop]
                analysis = [name, seq, ref_name, ref_seq, outfile]
                target.addChildTarget(Target.makeTargetFn(analyze, args=analysis))

    logger.info("Finding complement analyses")
    for fastqFile in complementFastqFiles:
        for name, seq, qual in fastqRead(fastqFile):
            if name in recordsToAnalyze:
                outfile = os.path.join(target.getGlobalTempDir(), "complement_" + name)
                files["complement"].append(outfile)
                ref_name, ref_start, ref_stop = recordsToAnalyze[name]
                ref_seq = references[ref_name][ref_start:ref_stop]
                analysis = [name, seq, ref_name, ref_seq, outfile]
                target.addChildTarget(Target.makeTargetFn(analyze, args=analysis))

    target.setFollowOnTargetFn(merge, args=(files, outputDir))
Example #3
def main():
    #Parse the inputs args/options
    parser = OptionParser(usage="usage: workingDir [options]", version="%prog 0.1")
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)
    
    if len(args) != 1:
        raise RuntimeError("Expected one argument, got %s arguments: %s" % (len(args), " ".join(args)))
    workingDir = args[0]
    
    #Assign the input files
    readFastqFiles = [os.path.join(workingDir, "readFastqFiles", i)
                      for i in os.listdir(os.path.join(workingDir, "readFastqFiles"))
                      if ".fq" in i or ".fastq" in i]
    referenceFastaFiles = [os.path.join(workingDir, "referenceFastaFiles", i)
                           for i in os.listdir(os.path.join(workingDir, "referenceFastaFiles"))
                           if ".fa" in i or ".fasta" in i]
    outputDir = os.path.join(workingDir, "output")
    
    #Log the inputs
    logger.info("Using the following working directory: %s" % workingDir)
    logger.info("Using the following output directory: %s" % outputDir)
    for readFastqFile in readFastqFiles:
        logger.info("Got the following read fastq file: %s" % readFastqFile)
    for referenceFastaFile in referenceFastaFiles:
        logger.info("Got the following reference fasta files: %s" % referenceFastaFile)
    
    #This line invokes jobTree  
    i = Stack(Target.makeTargetFn(setupExperiments, args=(readFastqFiles, referenceFastaFiles, mappers, analyses, outputDir))).startJobTree(options) 
    
    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #4
def chainNetStartup(opts):
    "entry to start jobtree"
    # FIXME: should generate an exception
    target = Target.makeTargetFn(
        chainNetTarget,
        (opts.hal, opts.queryGenome, opts.queryTwoBit, opts.targetGenome,
         opts.targetTwoBit, opts.chainFile, opts.netFile, opts.synChainFile,
         opts.synNetFile))
    failures = Stack(target).startJobTree(opts)
    if failures != 0:
        raise Exception("Error: " + str(failures) + " jobs failed")
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--genome", required=True)
    parser.add_argument("--database", required=True)
    parser.add_argument("--hintsDir", required=True)
    parser.add_argument("--fasta", required=True)
    parser.add_argument("--filterTissues", nargs="+")
    parser.add_argument("--filterCenters", nargs="+")
    bamfiles = parser.add_mutually_exclusive_group(required=True)
    bamfiles.add_argument("--bamFiles", nargs="+", help="bamfiles being used", dest="bams")
    bamfiles.add_argument("--bamFofn", help="File containing list of bamfiles", dest="bams")
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    # below is an ugly hack to filter out tissues/centers by checking for the words in the file path
    if not isinstance(args.bams, list):
        if not os.path.exists(args.bams):
            raise RuntimeError("ERROR: bamFofn does not exist.")
        bams = {x.rstrip() for x in open(args.bams)}
        args.bams = bams
    else:
        args.bams = set(args.bams)
    for to_remove_list in [args.filterTissues, args.filterCenters]:
        if isinstance(to_remove_list, list):
            to_remove = set()
            for x in to_remove_list:
                for b in args.bams:
                    if x in b:
                        to_remove.add(b)
            args.bams -= to_remove
    s = Stack(Target.makeTargetFn(main_hints_fn, memory=8 * 1024 ** 3,
                                  args=[args.bams, args.database, args.genome, args.fasta, args.hintsDir]))
    i = s.startJobTree(args)
    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #6
def main():
    args = parse_args()
    i = Stack(
        Target.makeTargetFn(build_analyses, memory=8 * (1024**3),
                            args=[args])).startJobTree(args)
    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #7
def main():
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    if args.mode == "reference":
        s = Stack(Target.makeTargetFn(main_ref_fn, args=[args.comparativeAnnotationDir, args.gencode, args.genome,
                                                         args.outDir, args.filterChroms]))
    elif args.mode == "transMap":
        s = Stack(Target.makeTargetFn(main_fn, args=[args.comparativeAnnotationDir, args.gencode, args.genome,
                                                     args.refGenome, args.outDir, args.filterChroms]))
    else:
        s = Stack(Target.makeTargetFn(main_augustus_fn, args=[args.comparativeAnnotationDir, args.gencode, args.genome,
                                                              args.outDir, args.filterChroms]))
    i = s.startJobTree(args)
    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #8
def mapThenAnalyse(target, readFastqFile, readType, referenceFastaFile, mapper,
                   analyses, experimentDir):
    if not os.path.exists(experimentDir):
        os.mkdir(experimentDir)
        target.logToMaster("Creating experiment dir: %s" % experimentDir)
    else:
        target.logToMaster("Experiment dir already exists: %s" % experimentDir)
    samFile = os.path.join(experimentDir, "mapping.sam")
    hmmFileToTrain = os.path.join(experimentDir, "hmm.txt")
    remapped = False
    if not os.path.exists(samFile):
        target.logToMaster(
            "Starting mapper %s for reference file %s and read file %s" %
            (mapper.__name__, referenceFastaFile, readFastqFile))
        target.addChildTarget(
            mapper(readFastqFile, readType, referenceFastaFile, samFile,
                   hmmFileToTrain))
        remapped = True
    else:
        target.logToMaster(
            "Mapper %s for reference file %s and read file %s is already complete"
            % (mapper.__name__, referenceFastaFile, readFastqFile))
    target.setFollowOnTarget(
        Target.makeTargetFn(runAnalyses,
                            args=(readFastqFile, readType, referenceFastaFile,
                                  samFile, analyses, experimentDir, remapped,
                                  mapper)))
Example #9
def main():
    #Parse the inputs args/options
    parser = OptionParser(usage="usage: workingDir [options]",
                          version="%prog 0.1")
    options = Options()
    parser.add_option("--sequences",
                      dest="sequences",
                      help="Quoted list of fasta files containing sequences")
    parser.add_option("--alignments", dest="alignments", help="Cigar file ")
    addExpectationMaximisationOptions(parser, options)

    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    if len(args) != 0:
        raise RuntimeError("Expected no arguments, got %s arguments: %s" %
                           (len(args), " ".join(args)))

    #Log the inputs
    logger.info(
        "Got '%s' sequences, '%s' alignments file, '%s' output model and '%s' iterations of training"
        % (options.sequences, options.alignments, options.outputModel,
           options.iterations))

    #This line invokes jobTree
    i = Stack(
        Target.makeTargetFn(expectationMaximisationTrials,
                            args=(options.sequences, options.alignments,
                                  options.outputModel,
                                  options))).startJobTree(options)

    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #10
def main():
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    # biotypes = get_all_biotypes(args.attributePath)
    biotypes = [
        "protein_coding",
        "miRNA",
        "snoRNA",
        "snRNA",
        "lincRNA",
        "processed_pseudogenes",
        "unprocessed_pseudogenes",
        "pseudogenes",
    ]
    job_args = (
        args.comparativeAnnotationDir,
        args.attributePath,
        args.annotationGp,
        args.gencode,
        args.genomes,
        biotypes,
        args.outDir,
    )
    i = Stack(Target.makeTargetFn(wrapper, args=job_args)).startJobTree(args)
    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #11
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--genome", required=True)
    parser.add_argument("--refGenome", required=True)
    parser.add_argument("--refTranscriptFasta", required=True)
    parser.add_argument("--targetGenomeFasta", required=True)
    parser.add_argument("--outDb", default="cgp_cds_metrics.db")
    parser.add_argument("--compAnnPath", required=True)
    gp_group = parser.add_mutually_exclusive_group(required=True)
    gp_group.add_argument("--cgpGp")
    gp_group.add_argument("--consensusGp")
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    out_db = os.path.join(args.compAnnPath, args.outDb)
    if args.cgpGp is not None:
        gp = args.cgpGp
        mode = "cgp"
        chunk_size = 15  # smaller chunk size because we will do more alignments per transcript
    else:
        gp = args.consensusGp
        mode = "consensus"
        chunk_size = 40
    s = Stack(
        Target.makeTargetFn(align_gp,
                            args=[
                                args.genome, args.refGenome,
                                args.refTranscriptFasta,
                                args.targetGenomeFasta, gp, mode, out_db,
                                args.compAnnPath, chunk_size
                            ]))
    i = s.startJobTree(args)
    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #12
def main():
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    ilp_config = ilp_tuple(args.breakpoint_penalty, args.data_penalty,
                           args.expected_value_penalty, args.trash_penalty,
                           args.kmer_size)
    paths = paths_tuple(args.out_dir, args.aln_index, args.whitelist,
                        args.masked_ref, args.unmasked_ref, args.bad_kmers,
                        args.normalizing, args.key_file)
    try:
        cgquery_dict = pickle.load(open(args.cgquery_file))
    except IOError:
        raise IOError("Cgquery dict does not exist.")

    if not os.path.exists(paths.out_dir):
        os.makedirs(paths.out_dir)

    i = Stack(
        Target.makeTargetFn(build_analyses,
                            args=(paths, ilp_config,
                                  cgquery_dict))).startJobTree(args)

    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #13
def main():
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)

    parser.add_option("--fileToSort",
                      dest="fileToSort",
                      help="The file you wish to sort")

    parser.add_option(
        "--N",
        dest="N",
        help="The threshold below which a serial sort function is used to sort the file. All lines must be of length less than or equal to N or the program will fail",
        default=10000)

    options, args = parser.parse_args()

    if options.fileToSort is None:
        raise RuntimeError("No file to sort given")

    if not os.path.exists(options.fileToSort):
        raise RuntimeError("File to sort does not exist: %s" %
                           options.fileToSort)

    if int(options.N) <= 0:
        raise RuntimeError("Invalid value of N: %s" % options.N)

    if len(args) != 0:
        raise RuntimeError("Unrecognised input arguments: %s" % " ".join(args))

    #Now we are ready to run
    i = Stack(Target.makeTargetFn(
        setup, (options.fileToSort, int(options.N)))).startJobTree(options)
    if i != 0:
        raise RuntimeError("Got failed jobs")
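Like the other drivers here, this script is launched with the options that Stack.addJobTreeOptions registers on the parser; a typical invocation (the script name is hypothetical) would be: python sort.py --fileToSort lines.txt --N 10000 --jobTree ./jobTree, where --jobTree names the state directory jobTree creates for the run.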
Example #14
def main():
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    if not os.path.exists(args.outDir):
        os.mkdir(args.outDir)

    if args.overwriteDb is True:
        if os.path.exists(args.mergedDb):
            os.remove(args.mergedDb)
        for g in args.genomes:
            if os.path.exists(os.path.join(args.outDir, g + ".db")):
                os.remove(os.path.join(args.outDir, g + ".db"))

    logger.info("Building paths to the required files")
    alnPslDict = parse_dir(args.genomes, args.dataDir, alignment_ext)
    seqTwoBitDict = parse_dir(args.genomes, args.dataDir, sequence_ext)
    geneCheckBedDict = parse_dir(args.genomes, args.dataDir, gene_check_ext)
    #geneCheckBedDetailsDict = parse_dir(args.genomes, args.geneCheckDir, gene_check_details_ext)

    refSequence = os.path.join(args.dataDir, args.refGenome + ".2bit")
    if not os.path.exists(refSequence):
        raise RuntimeError("Reference genome 2bit not present at {}".format(refSequence))
    args.refSequence = refSequence

    i = Stack(Target.makeTargetFn(build_analysis, args=(alnPslDict, seqTwoBitDict, geneCheckBedDict, 
            args.gencodeAttributeMap, args.genomes, args.annotationBed, args.outDir, args.primaryKey, 
            args.refGenome))).startJobTree(args)

    if i != 0:
        raise RuntimeError("Got failed jobs")

    merge_databases(args.outDir, args.mergedDb, args.genomes)
Example #15
def main():
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)
    
    parser.add_option("--fileToSort", dest="fileToSort",
                      help="The file you wish to sort")
    
    parser.add_option("--N", dest="N",
                      help="The threshold below which a serial sort function is used to sort file. All lines must of length less than or equal to N or program will fail", 
                      default=10000)
    
    options, args = parser.parse_args()
    
    if options.fileToSort is None:
        raise RuntimeError("No file to sort given")

    if not os.path.exists(options.fileToSort):
        raise RuntimeError("File to sort does not exist: %s" % options.fileToSort)
    
    if int(options.N) <= 0:
        raise RuntimeError("Invalid value of N: %s" % options.N)
    
    if len(args) != 0:
        raise RuntimeError("Unrecognised input arguments: %s" % " ".join(args))
    
    #Now we are ready to run
    i = Stack(Target.makeTargetFn(setup, (options.fileToSort, int(options.N)))).startJobTree(options)
    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #16
def main():
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    i = Stack(Target.makeTargetFn(build_analyses, args=(args.refGenome, args.genome, args.annotationGp, args.psl,
                                                        args.gp, args.augustusGp, args.fasta, args.refFasta, args.sizes,
                                                        args.gencodeAttributes, args.outDir))).startJobTree(args)
    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #17
def main():
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    i = Stack(Target.makeTargetFn(build_analyses, args=(args.refGenome, args.genome, args.annotationGp, args.psl,
                                                        args.gp, args.fasta, args.refFasta, args.sizes,
                                                        args.gencodeAttributes, args.outDir))).startJobTree(args)
    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #18
def main():
    args = parse_args()
    args.target_genomes = extract_model_tree(args.model) - set(
        [args.ref_genome])
    args.msa_split_options = " ".join(
        ['--windows', args.windows, '--between-blocks', args.between_blocks])
    s = Stack(Target.makeTargetFn(dless_pipeline_wrapper, args=(args, )))
    i = s.startJobTree(args)
    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #19
def main():
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    args.defaultCpu = args.num_threads
    args.defaultMemory = 8 * 1024 ** 3
    i = Stack(Target.makeTargetFn(wrapper, args=(args,), memory=args.defaultMemory, cpu=args.defaultCpu)).startJobTree(args)
    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #20
def main():
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    i = Stack(Target.makeTargetFn(wrapper, args=(args.source_dir, args.reference, args.out_dir))).startJobTree(args)

    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #21
def mapThenAnalyse(target, readFastaFile, referenceFastaFile, mapper, analyses, experimentDir):
    print "Experiment dir", experimentDir
    if not os.path.exists(experimentDir):
        os.mkdir(experimentDir)
        target.logToMaster("Creating experiment dir: %s" % experimentDir)
    else:
        target.logToMaster("Experiment dir already exists: %s" % experimentDir)
    samFile = os.path.join(experimentDir, "mapping.sam")
    if not os.path.exists(samFile) or isNewer(readFastaFile, samFile) or isNewer(referenceFastaFile, samFile):
        target.addChildTarget(mapper(readFastaFile, referenceFastaFile, samFile))
    target.setFollowOnTarget(Target.makeTargetFn(runAnalyses, args=(readFastaFile, referenceFastaFile, samFile, analyses, experimentDir))) 
Example #22
def main():
    parser = buildParser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    i = Stack(Target.makeTargetFn(buildAnalyses, args=(
        args.output, args.breakpoint_penalty, args.data_penalty, args.tightness_penalty, args.graph, args.kmer_size, args.save_intermediate))).startJobTree(
        args)

    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #23
def main():
    parser = buildParser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    i = Stack(
        Target.makeTargetFn(buildAnalyses, args=(args.output, args.fastq_list, args.save_intermediate))
    ).startJobTree(args)

    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #24
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--genomes", nargs="+", required=True)
    parser.add_argument("--refFasta", required=True)
    parser.add_argument("--outDir", required=True)
    parser.add_argument("--augustusStatsDir", required=True)
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    i = Stack(Target.makeTargetFn(wrapper, args=(args.genomes, args.refFasta, args.augustusStatsDir,
                                                 args.outDir))).startJobTree(args)
    if i != 0:
        raise RuntimeError("Got failed jobs")
    shutil.rmtree(os.path.join(args.outDir, "tmp"))
Example #25
def main():
    args = parse_args()
    if args.target_genomes is None:
        args.target_genomes = extract_model_tree(args.model) - set(
            [args.ref_genome])
    args.msa_split_options = ' '.join([
        '--windows', args.windows, '--between-blocks', args.between_blocks,
        '--min-informative', args.min_informative
    ])
    s = Stack(Target.makeTargetFn(subset_hal_pipeline, args=(args, )))
    i = s.startJobTree(args)
    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #26
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--inputGp", required=True)
    parser.add_argument("--outputGtf", required=True)
    parser.add_argument("--genome", required=True)
    parser.add_argument("--chromSizes", required=True)
    parser.add_argument("--fasta", required=True)
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    i = Stack(Target.makeTargetFn(wrapper, args=(args.inputGp, args.outputGtf, args.genome,
                                                 args.chromSizes, args.fasta))).startJobTree(args)
    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #27
def find_analyses(target, unmappedByReadType, outputDir):
    outfiles = dict()
    for readType in unmappedByReadType:
        outfiles[readType] = list()
        records = list()
        for (name, sequence), i in izip(unmappedByReadType[readType].iteritems(), xrange(len(unmappedByReadType[readType]))):
                records.append(">{}\n{}\n".format(name, sequence))
                if i % 10 == 1200 or i == len(unmappedByReadType[readType]) - 1:
                    tmpalign = os.path.join(target.getGlobalTempDir(), str(i) + ".txt")
                    outfiles[readType].append(tmpalign)
                    target.addChildTarget(Target.makeTargetFn(run_blast, args=(records, tmpalign)))
                    records = list()
    target.setFollowOnTargetFn(merge, args=(outfiles, outputDir))
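Note the batching test above: i % 10 is always between 0 and 9, so i % 10 == 1200 can never be true and, as written, each read type is submitted as one big batch on its final record. A hedged sketch of a flush-every-N rewrite (the batch size and the every-N intent are assumptions; run_blast and merge are the example's own functions and are not defined here):

import os
from jobTree.scriptTree.target import Target

def find_analyses_batched(target, unmappedByReadType, outputDir, batch_size=1000):
    outfiles = dict()
    for readType, unmapped in unmappedByReadType.iteritems():
        outfiles[readType] = list()
        records = list()
        for i, (name, sequence) in enumerate(unmapped.iteritems()):
            records.append(">{}\n{}\n".format(name, sequence))
            # Flush a BLAST job whenever the batch fills, and on the last record.
            if len(records) == batch_size or i == len(unmapped) - 1:
                tmpalign = os.path.join(target.getGlobalTempDir(), str(i) + ".txt")
                outfiles[readType].append(tmpalign)
                target.addChildTarget(
                    Target.makeTargetFn(run_blast, args=(records, tmpalign)))
                records = list()
    target.setFollowOnTargetFn(merge, args=(outfiles, outputDir))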
Example #28
def main():
    parser = buildParser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    i = Stack(
        Target.makeTargetFn(buildAnalyses,
                            args=(args.output, args.fastq_list,
                                  args.save_intermediate))).startJobTree(args)

    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #29
def setupExperiments(target, readFastaFiles, referenceFastaFiles, mappers, analysers, outputDir):
    if not os.path.exists(outputDir): #If the output dir doesn't yet exist create it
        os.mkdir(outputDir)
        target.logToMaster("Creating output dir: %s" % outputDir)
    else:
        target.logToMaster("Root output dir already exists: %s" % outputDir)
    for readFastaFile in readFastaFiles:
        for referenceFastaFile in referenceFastaFiles:
            for mapper in mappers:
                # note: passes the module-level 'analyses', not the 'analysers' parameter
                target.addChildTarget(Target.makeTargetFn(
                    mapThenAnalyse,
                    args=(readFastaFile, referenceFastaFile, mapper, analyses,
                          os.path.join(outputDir, "experiment_%s_%s_%s" %
                                       (os.path.split(readFastaFile)[-1],
                                        os.path.split(referenceFastaFile)[-1],
                                        mapper.__name__)))))
Example #30
def main():
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)
    
    outputDir = "muscle_compare_2d/output/"

    if not os.path.exists(outputDir):
        logger.info("Output dir {} does not exist. Creating.")
        os.mkdir(outputDir)
    if len(os.listdir(outputDir)) > 0:
        logger.info("Output dir not empty.")

    if len(args) != 3:
        raise RuntimeError("Error: expected three arguments got %s arguments: %s" % (len(args), " ".join(args)))

    templateRecords = {x.qname for x in pysam.Samfile(args[0]) if not x.is_unmapped}
    complementRecords = {x.qname for x in pysam.Samfile(args[1]) if not x.is_unmapped}
    
    twodSamFile = pysam.Samfile(args[2])
    twodRecords = {x.qname : x for x in twodSamFile if not x.is_unmapped}

    recordsToAnalyze = dict()
    for name, record in twodRecords.iteritems():
        if name not in templateRecords and name not in complementRecords:
            ref_name = twodSamFile.getrname(record.tid)
            ref_start, ref_stop = int(record.aend - record.alen), int(record.aend)
            recordsToAnalyze[name] = [ref_name, ref_start, ref_stop]
    if os.path.exists("../readFastqFiles/template/") and os.path.exists("../readFastqFiles/complement"):
        templateFastqFiles = [os.path.join("../readFastqFiles/template/", x) for x in os.listdir("../readFastqFiles/template/") if x.endswith(".fastq") or x.endswith(".fq")]
        complementFastqFiles = [os.path.join("../readFastqFiles/complement/", x) for x in os.listdir("../readFastqFiles/complement/") if x.endswith(".fastq") or x.endswith(".fq")]
    else:
        raise RuntimeError("Error: readFastqFiles does not contain template and/or complement folders")

    referenceFastaFiles = [os.path.join("../referenceFastaFiles", x) for x in os.listdir("../referenceFastaFiles") if x.endswith(".fa") or x.endswith(".fasta")]
    
    if len(referenceFastaFiles) > 0:
        references = { y[0].split(" ")[0] : y[1] for x in referenceFastaFiles for y in fastaRead(x) }
    else:
        raise RuntimeError("Error: no reference fasta files")

    if len(recordsToAnalyze) == 0:
        raise RuntimeError("Error: none of the mappable twoD reads in this set did not map as template/complement.")

    logger.info("Starting to find analyses to run...")
    args = (recordsToAnalyze, templateFastqFiles, complementFastqFiles, references, outputDir)
    i = Stack(Target.makeTargetFn(find_analyses, args=args)).startJobTree(options) 

    if i != 0:
        raise RuntimeError("Got {} failed jobs".format(i))
Example #31
def main():
    #Parse the inputs args/options
    parser = OptionParser(usage="usage: inputSamFile referenceFastaFile outputVcfFile [options]", 
                          version="%prog 0.1")
    
    #Options
    parser.add_option("--noMargin", dest="noMargin", help="Do not marginalise over the read \
    alignments, rather use the input alignment to call the variants (this will be faster)", 
                      default=False, action="store_true")
    parser.add_option("--alignmentModel", default=os.path.join(pathToBaseNanoporeDir(), 
                                                          "src", "margin", "mappers", "last_hmm_20.txt"), 
                     help="The model to use in realigning the reads to the reference.")
    parser.add_option("--errorModel", default=os.path.join(pathToBaseNanoporeDir(), 
                                                          "src", "margin", "mappers", "last_hmm_20.txt"), 
                     help="The model to use in calculating the difference between the predicted true reference and the reads.")
    parser.add_option("--maxAlignmentLengthPerJob", default=7000000, 
                     help="Maximum total alignment length of alignments to include in one posterior prob calculation job.", 
                     type=int)
    parser.add_option("--threshold", default=0.3, 
                     help="The posterior probability threshold for a non-reference base above which to report a variant.", 
                     type=float)
    
    #Add the jobTree options
    Stack.addJobTreeOptions(parser)
    
    #Parse the options/arguments
    options, args = parser.parse_args()
    
    #Setup logging
    setLoggingFromOptions(options)
    
    #Print help message if no input
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)
    
    #Exit if the arguments are not what we expect
    if len(args) != 3:
        raise RuntimeError("Expected three arguments, got: %s" % " ".join(args))
    
    print options.errorModel
    print options.threshold
    
    #This line invokes jobTree  
    i = Stack(Target.makeTargetFn(fn=marginCallerTargetFn, args=(args[0], args[1], args[2], options))).startJobTree(options) 
        
    #The return value of the jobtree script is the number of failed jobs. If we have any then
    #report this.       
    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #32
def main():
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    if not os.path.exists(args.out_dir):
        os.mkdir(args.out_dir)

    count_files = [[x, os.path.join(args.data_dir, x, x + ".Counts.fa")] for x in os.listdir(args.data_dir)]

    i = Stack(Target.makeTargetFn(buildDictWrapper, args=(count_files, args.out_dir, args.graph, args.new_graph))).startJobTree(args)

    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #33
def main():
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    #biotypes = get_all_biotypes(args.attributePath)
    biotypes = [
        "protein_coding", "miRNA", "snoRNA", "snRNA", "lincRNA",
        "processed_pseudogenes", "unprocessed_pseudogenes", "pseudogenes"
    ]
    job_args = (args.comparativeAnnotationDir, args.attributePath,
                args.annotationGp, args.gencode, args.genomes, biotypes,
                args.outDir)
    i = Stack(Target.makeTargetFn(wrapper, args=job_args)).startJobTree(args)
    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #34
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--inputGp", required=True)
    parser.add_argument("--outputGtf", required=True)
    parser.add_argument("--genome", required=True)
    parser.add_argument("--chromSizes", required=True)
    parser.add_argument("--fasta", required=True)
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    i = Stack(
        Target.makeTargetFn(wrapper,
                            args=(args.inputGp, args.outputGtf, args.genome,
                                  args.chromSizes,
                                  args.fasta))).startJobTree(args)
    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #35
def main():
    opts = parse_args()
    # Create labels for the HALs if none were provided
    if opts.labels is None:
        opts.labels = [os.path.basename(hal) for hal in opts.hals]
    if len(opts.labels) != len(opts.hals):
        raise ValueError("%d labels were provided, but %d hals were provided." % (len(opts.labels), len(opts.hals)))

    # Ensure that the hals have some genomes in common, and take the
    # common genomes to display in the hub.
    genomess = [getGenomesInHal(hal) for hal in opts.hals]
    genomes = reduce(lambda a, i: a.intersection(i), genomess)
    if len(genomes) == 0:
        raise ValueError("No genomes in common between the HALs.")

    Stack(Target.makeTargetFn(createHub, (genomes, opts))).startJobTree(opts)
Example #36
def find_analyses(target, unmappedByReadType, outputDir):
    outfiles = dict()
    for readType in unmappedByReadType:
        outfiles[readType] = list()
        records = list()
        for (name,
             sequence), i in izip(unmappedByReadType[readType].iteritems(),
                                  xrange(len(unmappedByReadType[readType]))):
            records.append(">{}\n{}\n".format(name, sequence))
            # NOTE: i % 10 is always 0-9, so the first test never fires; only
            # the final-record condition actually submits a batch.
            if i % 10 == 1200 or i == len(unmappedByReadType[readType]) - 1:
                tmpalign = os.path.join(target.getGlobalTempDir(),
                                        str(i) + ".txt")
                outfiles[readType].append(tmpalign)
                target.addChildTarget(
                    Target.makeTargetFn(run_blast, args=(records, tmpalign)))
                records = list()
    target.setFollowOnTargetFn(merge, args=(outfiles, outputDir))
Example #37
def main():
    parser = buildParser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    if args.fastq is not None:
        i = Stack(ModelWrapperLocalFiles(args.name, args.output, args.breakpoint_penalty, args.data_penalty,
                                         args.tightness_penalty, args.graph, args.fastq,
                                         args.save_intermediate)).startJobTree(args)
    else:
        i = Stack(Target.makeTargetFn(buildAnalyses, args=(
            args.name, args.output, args.breakpoint_penalty, args.data_penalty, args.tightness_penalty, args.graph,
            args.fastq_list, args.save_intermediate))).startJobTree(args)

    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #38
def main():
    opts = parse_args()
    # Create labels for the HALs if none were provided
    if opts.labels is None:
        opts.labels = [os.path.basename(hal) for hal in opts.hals]
    if len(opts.labels) != len(opts.hals):
        raise ValueError(
            "%d labels were provided, but %d hals were provided." %
            (len(opts.labels), len(opts.hals)))

    # Ensure that the hals have some genomes in common, and take the
    # common genomes to display in the hub.
    genomess = [getGenomesInHal(hal) for hal in opts.hals]
    genomes = reduce(lambda a, i: a.intersection(i), genomess)
    if len(genomes) == 0:
        raise ValueError("No genomes in common between the HALs.")

    Stack(Target.makeTargetFn(createHub, (genomes, opts))).startJobTree(opts)
Example #39
def main():
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    if not os.path.exists(args.out_dir):
        os.mkdir(args.out_dir)

    count_files = [[x, os.path.join(args.data_dir, x, x + ".Counts.fa")]
                   for x in os.listdir(args.data_dir)]

    i = Stack(
        Target.makeTargetFn(buildDictWrapper,
                            args=(count_files, args.out_dir, args.graph,
                                  args.new_graph))).startJobTree(args)

    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #40
def main():
    args = parse_args()
    if args.target_genomes is None:
        args.target_genomes = extract_model_tree(args.model) - set(
            [args.ref_genome])
    else:
        args.target_genomes = set(args.target_genomes) - set([args.ref_genome])
    args.msa_split_options = " ".join([
        '--windows', args.windows, '--between-blocks', args.between_blocks,
        '--min-informative', args.min_informative
    ])
    args.phastcons_options = " ".join([
        '--target-coverage', args.target_coverage, '--expected-length',
        args.expected_length
    ])
    s = Stack(Target.makeTargetFn(phastcons_pipeline_wrapper, args=(args, )))
    i = s.startJobTree(args)
    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #41
def setupExperiments(target, readFastqFiles, referenceFastaFiles, mappers,
                     analysers, metaAnalyses, outputDir):
    experiments = []
    for readType, readTypeFastaFiles in readFastqFiles:
        outputBase = os.path.join(outputDir, "analysis_" + readType)
        if not os.path.exists(outputBase):
            os.mkdir(outputBase)
        for readFastqFile in readTypeFastaFiles:
            for referenceFastaFile in referenceFastaFiles:
                for mapper in mappers:
                    experimentDir = os.path.join(
                        outputBase, "experiment_%s_%s_%s" %
                        (os.path.split(readFastqFile)[-1],
                         os.path.split(referenceFastaFile)[-1],
                         mapper.__name__))
                    # note: uses the module-level 'analyses', not the 'analysers' parameter
                    experiment = (readFastqFile, readType, referenceFastaFile,
                                  mapper, analyses, experimentDir)
                    target.addChildTarget(
                        Target.makeTargetFn(mapThenAnalyse, args=experiment))
                    experiments.append(experiment)
    target.setFollowOnTargetFn(runMetaAnalyses,
                               args=(metaAnalyses, outputDir, experiments))
Example #42
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--genome", required=True)
    parser.add_argument("--refTranscriptFasta", required=True)
    parser.add_argument("--targetTranscriptFasta", required=True)
    parser.add_argument("--targetTranscriptFastaIndex", required=True)
    parser.add_argument("--outDir", required=True)
    parser.add_argument("--outDb", default="augustus_attributes.db")
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    out_db = os.path.join(args.outDir, args.outDb)
    i = Stack(
        Target.makeTargetFn(align_augustus,
                            args=[
                                args.genome, args.refTranscriptFasta,
                                args.targetTranscriptFasta,
                                args.targetTranscriptFastaIndex, out_db
                            ])).startJobTree(args)
    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #43
def main():
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    ilp_config = ilp_tuple(args.breakpoint_penalty, args.data_penalty, args.expected_value_penalty, args.trash_penalty,
                           args.kmer_size)
    paths = paths_tuple(args.out_dir, args.aln_index, args.whitelist, args.masked_ref, args.unmasked_ref,
                        args.bad_kmers, args.normalizing, args.key_file)
    try:
        cgquery_dict = pickle.load(open(args.cgquery_file))
    except IOError:
        raise IOError("Cgquery dict does not exist.")

    if not os.path.exists(paths.out_dir):
        os.makedirs(paths.out_dir)

    i = Stack(Target.makeTargetFn(build_analyses, args=(paths, ilp_config, cgquery_dict))).startJobTree(args)

    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #44
def main():
    #Parse the inputs args/options
    parser = OptionParser(usage="usage: workingDir [options]", version="%prog 0.1")
    options = Options()
    parser.add_option("--sequences", dest="sequences", help="Quoted list of fasta files containing sequences")
    parser.add_option("--alignments", dest="alignments", help="Cigar file ")
    addExpectationMaximisationOptions(parser, options)
    
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)
    
    if len(args) != 0:
        raise RuntimeError("Expected no arguments, got %s arguments: %s" % (len(args), " ".join(args)))
    
    #Log the inputs
    logger.info("Got '%s' sequences, '%s' alignments file, '%s' output model and '%s' iterations of training" % (options.sequences, options.alignments, options.outputModel, options.iterations))

    #This line invokes jobTree  
    i = Stack(Target.makeTargetFn(expectationMaximisationTrials, args=(options.sequences, options.alignments, options.outputModel, options))).startJobTree(options) 
    
    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #45
def main():
    parser = buildParser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    if args.fastq is not None:
        i = Stack(
            ModelWrapperLocalFiles(args.name, args.output,
                                   args.breakpoint_penalty, args.data_penalty,
                                   args.tightness_penalty, args.graph,
                                   args.fastq,
                                   args.save_intermediate)).startJobTree(args)
    else:
        i = Stack(
            Target.makeTargetFn(
                buildAnalyses,
                args=(args.name, args.output, args.breakpoint_penalty,
                      args.data_penalty, args.tightness_penalty, args.graph,
                      args.fastq_list,
                      args.save_intermediate))).startJobTree(args)

    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #46
            for l in r:
                l = l.split()
                tot += int(l[-1]) - int(l[-2])
            outf.write('\t'.join(
                map(str, [chrom, start, stop,
                          format_ratio(tot, length)])) + '\n')


def cat_results(target, args, paths):
    """
    Concatenates final scores output into one bed file.
    """
    fofn = os.path.join(target.getGlobalTempDir(), 'fofn')
    with open(fofn, 'w') as outf:
        for p in paths:
            outf.write(p + '\n')
    tmp_p = os.path.join(target.getGlobalTempDir(),
                         os.path.basename(args.out_bed) + '.tmp')
    cat_cmd = 'cat {} | xargs -n 50 cat > {}'.format(fofn, tmp_p)
    system(cat_cmd)
    os.rename(tmp_p, args.out_bed)


if __name__ == '__main__':
    from phast.find_single_copy_regions import *
    args = parse_args()
    s = Stack(Target.makeTargetFn(single_copy_wrapper, args=(args, )))
    i = s.startJobTree(args)
    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #47
def main():
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)
    
    outputDir = "blast_combined/output/"

    if not os.path.exists(outputDir):
        logger.info("Output dir {} does not exist. Creating.")
        os.mkdir(outputDir)
    if len(os.listdir(outputDir)) > 0:
        logger.info("Output dir not empty.")

    #find all read fastq files, load into a dict by read type
    readFastqFiles = dict()
    for readType in readTypes:
        readFastqFiles[readType] = [os.path.join("../output/processedReadFastqFiles/", readType, x) for x in os.listdir(os.path.join("../output/processedReadFastqFiles/", readType)) if x.endswith(".fq") or x.endswith(".fastq")]
    
    #find all reference fasta files
    referenceFastaFiles = [x for x in os.listdir("../referenceFastaFiles") if x.endswith(".fasta") or x.endswith(".fa")]

    #find all sam files that were analyzed using combinedAnalyses
    samFiles = {}
    for readType in readTypes:
        samFiles[readType] = [(readFastqFile, os.path.join("../output", "analysis_" + readType, "experiment_" + os.path.basename(readFastqFile) + "_" + referenceFastaFile + "_" + analysis, "mapping.sam")) for readFastqFile, referenceFastaFile, analysis in product(readFastqFiles[readType], referenceFastaFiles, combinedAnalyses)]

    mappedByReadType = defaultdict(set)
    for readType in readTypes:
        for readFastqFileFullPath, samFile in samFiles[readType]:
            readFastqFile = os.path.basename(readFastqFileFullPath)
            mappedNames = {(x.qname, readFastqFile) for x in pysam.Samfile(samFile) if not x.is_unmapped}
            mappedByReadType[readType] = mappedByReadType[readType].union(mappedNames)

    unmappedByReadType = defaultdict(dict)
    for readType in readTypes:
        for readFastqFileFullPath, samFile in samFiles[readType]:
            readFastqFile = os.path.basename(readFastqFileFullPath)
            for name, seq, qual in fastqRead(readFastqFileFullPath):
                name = name.split(" ")[0]
                if (name, readFastqFile) not in mappedByReadType[readType]:
                    unmappedByReadType[readType][(name, readFastqFile)] = seq
        

    i = Stack(Target.makeTargetFn(find_analyses, args=(unmappedByReadType, outputDir))).startJobTree(options) 

    if i != 0:
        raise RuntimeError("Got {} failed jobs".format(i))

    for readType in readTypes:
        #build a counter of blast hits and set of read names that did not map
        blast_hits, no_hits = Counter(), set()
        for query, result in parse_blast(open(os.path.join(outputDir, readType + "_blast_out.txt"))):
            if result is None:
                no_hits.add(query)
            else:
                blast_hits[tuple(result)] += 1 #count number of times each hit was seen
        #write the unmapped hits to a fasta file
        outf = open(os.path.join(outputDir, readType + "_no_hits.fasta"), "w")
        for (name, readFastqFile), seq in unmappedByReadType[readType].iteritems():
            if name in no_hits:
                outf.write(">{}\n{}\n".format(name, seq))
        outf.close()
        #write the blast report
        blast_out = open(os.path.join(outputDir, readType + "_blast_report.txt"), "w")
        blast_out.write("gi|##|gb|##|\tSpecies\tseqID\tCount\n") #header to output
        for result, count in sorted(blast_hits.items(), key = lambda x: -int(x[-1])):
            blast_out.write("{}\t{}\n".format("\t".join(result), count))
        blast_out.close()
        #calculate percents and make a barplot
        blast_count = sum(blast_hits.values())
        unmapped_count = len(unmappedByReadType[readType]) - sum(blast_hits.values())
        mapped_count = len(mappedByReadType[readType])
        
        #blast_percent = 1.0 * sum(blast_hits.values()) / (len(mappedByReadType[readType]) + len(unmappedByReadType[readType]))
        #unmapped_percent = (1.0 * len(unmappedByReadType[readType]) - sum(blast_hits.values())) / (len(mappedByReadType[readType]) + len(unmappedByReadType[readType]))
        #mapped_percent = 1.0 * len(mappedByReadType[readType]) / (len(mappedByReadType[readType]) + len(unmappedByReadType[readType]))
        outf = open(os.path.join(outputDir, readType + "percents.txt"),"w")
        outf.write("\n".join(map(str,[blast_count, unmapped_count, mapped_count])))
        outf.close()
        #system("Rscript blast_combined/barplot_blast.R {} {} {} {} {}".format(blast_percent, unmapped_percent, mapped_percent, readType, os.path.join(outputDir, readType + "_blast_barplot.pdf")))
        system("Rscript blast_combined/barplot_blast.R {} {} {} {} {}".format(blast_count, unmapped_count, mapped_count, readType, os.path.join(outputDir, readType + "_blast_barplot.pdf")))
Example #48
                        chrom, start, stop))
                continue
            test_ancestral_nodes(target, region_specific_conserved,
                                 accelerated_genomes, maf_path, region_bed,
                                 outf_handle)


def cat_results(target, args, paths):
    """
    Concatenates final phastcons output into one gff file.
    """
    fofn = os.path.join(target.getGlobalTempDir(), 'fofn')
    with open(fofn, 'w') as outf:
        for p in paths:
            outf.write(p + '\n')
    tmp_p = os.path.join(target.getGlobalTempDir(),
                         os.path.basename(args.out_bed) + '.tmp')
    cat_cmd = 'cat {} | xargs -n 50 cat > {}'.format(fofn, tmp_p)
    system(cat_cmd)
    os.rename(tmp_p, args.out_bed)


if __name__ == '__main__':
    from phast.run_acceleration_tests import *
    args = parse_args()
    setLoggingFromOptions(args)
    s = Stack(Target.makeTargetFn(extract_maf_wrapper, args=(args, )))
    i = s.startJobTree(args)
    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #49
def main(args):
    opts = parseArgs(args)
    Stack(Target.makeTargetFn(pipeline, args=[opts])).startJobTree(opts)
Example #50
def main():
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    outputDir = "blast_combined/output/"

    if not os.path.exists(outputDir):
        logger.info("Output dir {} does not exist. Creating.")
        os.mkdir(outputDir)
    if len(os.listdir(outputDir)) > 0:
        logger.info("Output dir not empty.")

    #find all read fastq files, load into a dict by read type
    readFastqFiles = dict()
    for readType in readTypes:
        readFastqFiles[readType] = [
            os.path.join("../output/processedReadFastqFiles/", readType, x)
            for x in os.listdir(
                os.path.join("../output/processedReadFastqFiles/", readType))
            if x.endswith(".fq") or x.endswith(".fastq")
        ]

    #find all reference fasta files
    referenceFastaFiles = [
        x for x in os.listdir("../referenceFastaFiles")
        if x.endswith(".fasta") or x.endswith(".fa")
    ]

    #find all sam files that were analyzed using combinedAnalyses
    samFiles = {}
    for readType in readTypes:
        samFiles[readType] = [
            (readFastqFile,
             os.path.join(
                 "../output", "analysis_" + readType,
                 "experiment_" + os.path.basename(readFastqFile) + "_" +
                 referenceFastaFile + "_" + analysis, "mapping.sam"))
            for readFastqFile, referenceFastaFile, analysis in product(
                readFastqFiles[readType], referenceFastaFiles,
                combinedAnalyses)
        ]

    mappedByReadType = defaultdict(set)
    for readType in readTypes:
        for readFastqFileFullPath, samFile in samFiles[readType]:
            readFastqFile = os.path.basename(readFastqFileFullPath)
            mappedNames = {(x.qname, readFastqFile)
                           for x in pysam.Samfile(samFile)
                           if not x.is_unmapped}
            mappedByReadType[readType] = mappedByReadType[readType].union(
                mappedNames)

    unmappedByReadType = defaultdict(dict)
    for readType in readTypes:
        for readFastqFileFullPath, samFile in samFiles[readType]:
            readFastqFile = os.path.basename(readFastqFileFullPath)
            for name, seq, qual in fastqRead(readFastqFileFullPath):
                name = name.split(" ")[0]
                if (name, readFastqFile) not in mappedByReadType[readType]:
                    unmappedByReadType[readType][(name, readFastqFile)] = seq

    i = Stack(
        Target.makeTargetFn(find_analyses,
                            args=(unmappedByReadType,
                                  outputDir))).startJobTree(options)

    if i != 0:
        raise RuntimeError("Got {} failed jobs".format(i))

    for readType in readTypes:
        #build a counter of blast hits and set of read names that did not map
        blast_hits, no_hits = Counter(), set()
        for query, result in parse_blast(
                open(os.path.join(outputDir, readType + "_blast_out.txt"))):
            if result is None:
                no_hits.add(query)
            else:
                blast_hits[tuple(
                    result)] += 1  #count number of times each hit was seen
        #write the unmapped hits to a fasta file
        outf = open(os.path.join(outputDir, readType + "_no_hits.fasta"), "w")
        for (name,
             readFastqFile), seq in unmappedByReadType[readType].iteritems():
            if name in no_hits:
                outf.write(">{}\n{}\n".format(name, seq))
        outf.close()
        #write the blast report
        blast_out = open(
            os.path.join(outputDir, readType + "_blast_report.txt"), "w")
        blast_out.write(
            "gi|##|gb|##|\tSpecies\tseqID\tCount\n")  #header to output
        for result, count in sorted(blast_hits.items(),
                                    key=lambda x: -int(x[-1])):
            blast_out.write("{}\t{}\n".format("\t".join(result), count))
        blast_out.close()
        #calculate percents and make a barplot
        blast_count = sum(blast_hits.values())
        unmapped_count = len(unmappedByReadType[readType]) - sum(
            blast_hits.values())
        mapped_count = len(mappedByReadType[readType])

        #blast_percent = 1.0 * sum(blast_hits.values()) / (len(mappedByReadType[readType]) + len(unmappedByReadType[readType]))
        #unmapped_percent = (1.0 * len(unmappedByReadType[readType]) - sum(blast_hits.values())) / (len(mappedByReadType[readType]) + len(unmappedByReadType[readType]))
        #mapped_percent = 1.0 * len(mappedByReadType[readType]) / (len(mappedByReadType[readType]) + len(unmappedByReadType[readType]))
        outf = open(os.path.join(outputDir, readType + "percents.txt"), "w")
        outf.write("\n".join(
            map(str, [blast_count, unmapped_count, mapped_count])))
        outf.close()
        #system("Rscript blast_combined/barplot_blast.R {} {} {} {} {}".format(blast_percent, unmapped_percent, mapped_percent, readType, os.path.join(outputDir, readType + "_blast_barplot.pdf")))
        system("Rscript blast_combined/barplot_blast.R {} {} {} {} {}".format(
            blast_count, unmapped_count, mapped_count, readType,
            os.path.join(outputDir, readType + "_blast_barplot.pdf")))
Example #51
def main():
    #Parse the inputs args/options
    parser = OptionParser(usage="usage: workingDir [options]",
                          version="%prog 0.1")
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    if len(args) != 1:
        raise RuntimeError("Expected one argument, got %s arguments: %s" %
                           (len(args), " ".join(args)))
    workingDir = args[0]

    # call read sampler script; samples 75, 50, and 25% reads
    #SampleReads(workingDir)

    #Create (if necessary) the output dir
    outputDir = os.path.join(workingDir, "output")
    if not os.path.exists(outputDir):
        logger.info("Creating output dir: %s" % outputDir)
        os.mkdir(outputDir)
    else:
        logger.info("Root output dir already exists: %s" % outputDir)

    #Assign/process (uniquify the names of) the input read fastq files
    processedFastqFiles = os.path.join(outputDir, "processedReadFastqFiles")
    if not os.path.exists(processedFastqFiles):
        os.mkdir(processedFastqFiles)

    fastqParentDir = os.path.join(workingDir, "readFastqFiles")
    readFastqFiles = list()
    for fastqSubDir in filter(
            os.path.isdir,
        [os.path.join(fastqParentDir, x) for x in os.listdir(fastqParentDir)]):
        readType = os.path.basename(fastqSubDir)
        if not os.path.exists(
                os.path.join(processedFastqFiles,
                             os.path.basename(fastqSubDir))):
            os.mkdir(os.path.join(processedFastqFiles, readType))
        readFastqFiles.append([
            readType,
            [
                makeFastqSequenceNamesUnique(
                    os.path.join(workingDir, "readFastqFiles", readType, i),
                    os.path.join(processedFastqFiles, readType, i))
                for i in os.listdir(
                    os.path.join(workingDir, "readFastqFiles", readType))
                if (".fq" in i and i[-3:] == '.fq') or (
                    ".fastq" in i and i[-6:] == '.fastq')
            ]
        ])

    #Assign/process (uniquify the names of) the input reference fasta files
    processedFastaFiles = os.path.join(outputDir,
                                       "processedReferenceFastaFiles")
    if not os.path.exists(processedFastaFiles):
        os.mkdir(processedFastaFiles)
    referenceFastaFiles = [
        makeFastaSequenceNamesUnique(
            os.path.join(workingDir, "referenceFastaFiles", i),
            os.path.join(processedFastaFiles, i))
        for i in os.listdir(os.path.join(workingDir, "referenceFastaFiles"))
        if (".fa" in i and i[-3:] == '.fa') or (
            ".fasta" in i and i[-6:] == '.fasta')
    ]

    # call reference mutator script; introduces 1%, and 5% mutations (No nucleotide bias used for now)
    #referenceFastaFiles = mutateReferenceSequences(referenceFastaFiles)

    #Log the inputs
    logger.info("Using the following working directory: %s" % workingDir)
    logger.info("Using the following output directory: %s" % outputDir)
    for readType, readTypeFastqFiles in readFastqFiles:
        logger.info("Got the follow read type: %s" % readType)
        for readFastqFile in readTypeFastqFiles:
            logger.info("Got the following read fastq file: %s" %
                        readFastqFile)
    for referenceFastaFile in referenceFastaFiles:
        logger.info("Got the following reference fasta files: %s" %
                    referenceFastaFile)

    #This line invokes jobTree
    i = Stack(
        Target.makeTargetFn(setupExperiments,
                            args=(readFastqFiles, referenceFastaFiles, mappers,
                                  analyses, metaAnalyses,
                                  outputDir))).startJobTree(options)

    if i != 0:
        raise RuntimeError("Got failed jobs")