Example #1
 def startJobTree(self, options):
     """Runs jobtree using the given options (see Stack.getDefaultOptions
     and Stack.addJobTreeOptions).
     """
     setLoggingFromOptions(options)
     options.jobTree = os.path.abspath(options.jobTree)
     if os.path.isdir(options.jobTree):
         config, batchSystem = reloadJobTree(options.jobTree)
     else:
         config, batchSystem = createJobTree(options)
         #Setup first job.
         command = self.makeRunnable(options.jobTree)
         memory = self.getMemory()
         cpu = self.getCpu()
         time = self.getRunTime()
         if memory != sys.maxint:
             if cpu != sys.maxint:
                 createFirstJob(command, config, memory=memory, cpu=cpu, time=time)
             else:
                 createFirstJob(command, config, memory=memory, time=time)
         else:
             if cpu != sys.maxint:
                 createFirstJob(command, config, cpu=cpu, time=time)
             else:
                 createFirstJob(command, config, time=time)
     loadEnvironment(config)
     return mainLoop(config, batchSystem)
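All of the examples that follow wrap this entry point in the same pattern: register the jobTree flags with Stack.addJobTreeOptions, parse the options, then hand a first Target to Stack(...).startJobTree(options), which returns the number of failed jobs. A minimal sketch, assuming the conventional jobTree-era import paths (verify against your checkout) and a hypothetical Target subclass named MyFirstTarget:

from optparse import OptionParser
from sonLib.bioio import setLoggingFromOptions
from jobTree.scriptTree.stack import Stack

def run():
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)  # registers --jobTree and related flags
    options, args = parser.parse_args()
    setLoggingFromOptions(options)
    # startJobTree returns the number of failed jobs; non-zero means failure
    failed = Stack(MyFirstTarget()).startJobTree(options)  # MyFirstTarget is hypothetical
    if failed != 0:
        raise RuntimeError("Got %s failed jobs" % failed)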
Example #2
def main():
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)
    parser.add_option("--sleepTime", dest="sleepTime", type="int",
                     help="sleep [default=5] seconds", default=5)
    parser.add_option("--tree", dest="tree",
                      help="tree [balanced|comb|star|fly]", default="comb")
    parser.add_option("--size", dest="size", type="int",
                      help="tree size (for comb or star) [default=10]", 
                      default=10) 
    parser.add_option("--cpusPerJob", dest="cpusPerJob",
                      help="Cpus per job", default="1")
        
    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    startTime = datetime.datetime.now()

    if options.tree == "star":
        tree = starTree(options.size)
    elif options.tree == "balanced":
        tree = balancedTree()
    elif options.tree == "fly":
        tree = flyTree()
    else:
        tree = combTree(options.size)
    
    baseTarget = FirstJob(tree, "Anc00", options.sleepTime, startTime, int(options.cpusPerJob))
    Stack(baseTarget).startJobTree(options)
    
    if options.logFile is not None:
        checkLog(options)
Example #3
def main():
    #Parse the inputs args/options
    parser = OptionParser(usage="usage: workingDir [options]", version="%prog 0.1")
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)
    
    if len(args) != 1:
        raise RuntimeError("Expected one argument, got %s arguments: %s" % (len(args), " ".join(args)))
    workingDir = args[0]
    
    #Assign the input files
    readFastqFiles = [ os.path.join(workingDir, "readFastqFiles", i) for i in os.listdir(os.path.join(workingDir, "readFastqFiles")) if ".fq" in i or ".fastq" in i ]
    referenceFastaFiles = [ os.path.join(workingDir, "referenceFastaFiles", i) for i in os.listdir(os.path.join(workingDir, "referenceFastaFiles")) if ".fa" in i or ".fasta" in i ] 
    outputDir = os.path.join(workingDir, "output")
    
    #Log the inputs
    logger.info("Using the following working directory: %s" % workingDir)
    logger.info("Using the following output directory: %s" % outputDir)
    for readFastqFile in readFastqFiles:
        logger.info("Got the following read fastq file: %s" % readFastqFile)
    for referenceFastaFile in referenceFastaFiles:
        logger.info("Got the following reference fasta files: %s" % referenceFastaFile)
    
    #This line invokes jobTree  
    i = Stack(Target.makeTargetFn(setupExperiments, args=(readFastqFiles, referenceFastaFiles, mappers, analyses, outputDir))).startJobTree(options) 
    
    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #4
def main():
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    if not os.path.exists(args.outDir):
        os.mkdir(args.outDir)

    if args.overwriteDb is True:
        if os.path.exists(args.mergedDb):
            os.remove(args.mergedDb)
        for g in args.genomes:
            if os.path.exists(os.path.join(args.outDir, g + ".db")):
                os.remove(os.path.join(args.outDir, g + ".db"))

    logger.info("Building paths to the required files")
    alnPslDict = parse_dir(args.genomes, args.dataDir, alignment_ext)
    seqTwoBitDict = parse_dir(args.genomes, args.dataDir, sequence_ext)
    geneCheckBedDict = parse_dir(args.genomes, args.dataDir, gene_check_ext)
    #geneCheckBedDetailsDict = parse_dir(args.genomes, args.geneCheckDir, gene_check_details_ext)

    refSequence = os.path.join(args.dataDir, args.refGenome + ".2bit")
    if not os.path.exists(refSequence):
        raise RuntimeError("Reference genome 2bit not present at {}".format(refSequence))
    args.refSequence = refSequence

    i = Stack(Target.makeTargetFn(build_analysis, args=(alnPslDict, seqTwoBitDict, geneCheckBedDict, 
            args.gencodeAttributeMap, args.genomes, args.annotationBed, args.outDir, args.primaryKey, 
            args.refGenome))).startJobTree(args)

    if i != 0:
        raise RuntimeError("Got failed jobs")

    merge_databases(args.outDir, args.mergedDb, args.genomes)
Example #5
def main():
    usage = "usage: %prog [options] <multicactus project>"
    description = "Progressive version of cactus_workflow"
    parser = OptionParser(usage=usage, description=description)
    Stack.addJobTreeOptions(parser)
    addCactusWorkflowOptions(parser)
    
    parser.add_option("--nonRecursive", dest="nonRecursive", action="store_true",
                      help="Only process given event (not children) [default=False]", 
                      default=False)
    
    parser.add_option("--event", dest="event", 
                      help="Target event to process [default=root]", default=None)
    
    parser.add_option("--overwrite", dest="overwrite", action="store_true",
                      help="Recompute and overwrite output files if they exist [default=False]",
                      default=False)
    
    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    if len(args) != 1:
        parser.print_help()
        raise RuntimeError("Unrecognised input arguments: %s" % " ".join(args))

    Stack(RunCactusPreprocessorThenProgressiveDown(options, args)).startJobTree(options)
Example #6
def main():
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    ilp_config = ilp_tuple(args.breakpoint_penalty, args.data_penalty,
                           args.expected_value_penalty, args.trash_penalty,
                           args.kmer_size)
    paths = paths_tuple(args.out_dir, args.aln_index, args.whitelist,
                        args.masked_ref, args.unmasked_ref, args.bad_kmers,
                        args.normalizing, args.key_file)
    try:
        cgquery_dict = pickle.load(open(args.cgquery_file))
    except IOError:
        raise IOError("Cgquery dict does not exist.")

    if not os.path.exists(paths.out_dir):
        os.makedirs(paths.out_dir)

    i = Stack(
        Target.makeTargetFn(build_analyses,
                            args=(paths, ilp_config,
                                  cgquery_dict))).startJobTree(args)

    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #7
def main():
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    args.defaultCpu = args.num_threads
    args.defaultMemory = 8 * 1024 ** 3
    i = Stack(Target.makeTargetFn(wrapper, args=(args,), memory=args.defaultMemory, cpu=args.defaultCpu)).startJobTree(args)
    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #8
def main():
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    i = Stack(Target.makeTargetFn(wrapper, args=(args.source_dir, args.reference, args.out_dir))).startJobTree(args)

    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #9
def main():
    parser = buildParser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    i = Stack(
        Target.makeTargetFn(buildAnalyses, args=(args.output, args.fastq_list, args.save_intermediate))
    ).startJobTree(args)

    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #10
def main():
    parser = buildParser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    i = Stack(Target.makeTargetFn(buildAnalyses, args=(
        args.output, args.breakpoint_penalty, args.data_penalty, args.tightness_penalty, args.graph, args.kmer_size, args.save_intermediate))).startJobTree(
        args)

    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #11
def main():
    parser = buildParser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    i = Stack(
        Target.makeTargetFn(buildAnalyses,
                            args=(args.output, args.fastq_list,
                                  args.save_intermediate))).startJobTree(args)

    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #12
def main():
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)
    
    outputDir = "muscle_compare_2d/output/"

    if not os.path.exists(outputDir):
        logger.info("Output dir {} does not exist. Creating.")
        os.mkdir(outputDir)
    if len(os.listdir(outputDir)) > 0:
        logger.info("Output dir not empty.")

    if len(args) != 3:
        raise RuntimeError("Error: expected three arguments got %s arguments: %s" % (len(args), " ".join(args)))

    templateRecords = {x.qname for x in pysam.Samfile(args[0]) if not x.is_unmapped}
    complementRecords = {x.qname for x in pysam.Samfile(args[1]) if not x.is_unmapped}
    
    twodSamFile = pysam.Samfile(args[2])
    twodRecords = {x.qname : x for x in twodSamFile if not x.is_unmapped}

    recordsToAnalyze = dict()
    for name, record in twodRecords.iteritems():
        if name not in templateRecords and name not in complementRecords:
            ref_name = twodSamFile.getrname(record.tid)
            ref_start, ref_stop = int(record.aend - record.alen), int(record.aend)
            recordsToAnalyze[name] = [ref_name, ref_start, ref_stop]
    if os.path.exists("../readFastqFiles/template/") and os.path.exists("../readFastqFiles/complement"):
        templateFastqFiles = [os.path.join("../readFastqFiles/template/", x) for x in os.listdir("../readFastqFiles/template/") if x.endswith(".fastq") or x.endswith(".fq")]
        complementFastqFiles = [os.path.join("../readFastqFiles/complement/", x) for x in os.listdir("../readFastqFiles/complement/") if x.endswith(".fastq") or x.endswith(".fq")]
    else:
        raise RuntimeError("Error: readFastqFiles does not contain template and/or complement folders")

    referenceFastaFiles = [os.path.join("../referenceFastaFiles", x) for x in os.listdir("../referenceFastaFiles") if x.endswith(".fa") or x.endswith(".fasta")]
    
    if len(referenceFastaFiles) > 0:
        references = { y[0].split(" ")[0] : y[1] for x in referenceFastaFiles for y in fastaRead(x) }
    else:
        raise RuntimeError("Error: no reference fasta files")

    if len(recordsToAnalyze) == 0:
        raise RuntimeError("Error: none of the mappable twoD reads in this set did not map as template/complement.")

    logger.info("Starting to find analyses to run...")
    args = (recordsToAnalyze, templateFastqFiles, complementFastqFiles, references, outputDir)
    i = Stack(Target.makeTargetFn(find_analyses, args=args)).startJobTree(options) 

    if i != 0:
        raise RuntimeError("Got {} failed jobs".format(i))
Example #13
def main():
    #Parse the inputs args/options
    parser = OptionParser(usage="usage: inputSamFile referenceFastaFile outputVcfFile [options]", 
                          version="%prog 0.1")
    
    #Options
    parser.add_option("--noMargin", dest="noMargin", help="Do not marginalise over the read \
    alignments, rather use the input alignment to call the variants (this will be faster)", 
                      default=False, action="store_true")
    parser.add_option("--alignmentModel", default=os.path.join(pathToBaseNanoporeDir(), 
                                                          "src", "margin", "mappers", "last_hmm_20.txt"), 
                     help="The model to use in realigning the reads to the reference.")
    parser.add_option("--errorModel", default=os.path.join(pathToBaseNanoporeDir(), 
                                                          "src", "margin", "mappers", "last_hmm_20.txt"), 
                     help="The model to use in calculating the difference between the predicted true reference and the reads.")
    parser.add_option("--maxAlignmentLengthPerJob", default=7000000, 
                     help="Maximum total alignment length of alignments to include in one posterior prob calculation job.", 
                     type=int)
    parser.add_option("--threshold", default=0.3, 
                     help="The posterior probability threshold for a non-reference base above which to report a variant.", 
                     type=float)
    
    #Add the jobTree options
    Stack.addJobTreeOptions(parser)
    
    #Parse the options/arguments
    options, args = parser.parse_args()
    
    #Setup logging
    setLoggingFromOptions(options)
    
    #Print help message if no input
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)
    
    #Exit if the arguments are not what we expect
    if len(args) != 3:
        raise RuntimeError("Expected three arguments, got: %s" % " ".join(args))
    
    print options.errorModel
    print options.threshold
    
    #This line invokes jobTree  
    i = Stack(Target.makeTargetFn(fn=marginCallerTargetFn, args=(args[0], args[1], args[2], options))).startJobTree(options) 
        
    #The return value of the jobtree script is the number of failed jobs. If we have any then
    #report this.       
    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #14
def runJobTreeScript(options):
    """Builds the basic job tree, or takes an existing one
    and runs the job tree master script.
    """
    setLoggingFromOptions(options)
    assert options.jobTree is not None #We need a job tree, or a place to create one
    if os.path.isdir(options.jobTree):
        config, batchSystem = reloadJobTree(options.jobTree)
    else:
        assert options.command is not None
        config, batchSystem = createJobTree(options)
        #Setup first job.
        createFirstJob(options.command, config)
    loadEnvironment(config)
    return mainLoop(config, batchSystem)
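runJobTreeScript needs an options object carrying at least jobTree (the directory holding the job tree) and, when that directory does not yet exist, command (the shell command for the first job). A hedged sketch of a caller built on the same optparse helpers, where runCommandOnJobTree is a hypothetical wrapper:

from optparse import OptionParser

def runCommandOnJobTree(shellCommand, jobTreeDir):
    # Hypothetical helper: builds the minimal options runJobTreeScript expects
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)     # adds --jobTree among other flags
    options, _ = parser.parse_args([])  # take the defaults; no CLI parsing here
    options.jobTree = jobTreeDir        # an existing tree is reloaded, otherwise one is created
    options.command = shellCommand      # becomes the first job of a fresh tree
    return runJobTreeScript(options)    # returns the number of failed jobs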
Example #15
def main():
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    if not os.path.exists(args.out_dir):
        os.mkdir(args.out_dir)

    count_files = [[x, os.path.join(args.data_dir, x, x + ".Counts.fa")] for x in os.listdir(args.data_dir)]

    i = Stack(Target.makeTargetFn(buildDictWrapper, args=(count_files, args.out_dir, args.graph, args.new_graph))).startJobTree(args)

    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #16
def main():
    parser = buildParser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    if args.fastq is not None:
        i = Stack(ModelWrapperLocalFiles(args.name, args.output, args.breakpoint_penalty, args.data_penalty,
                                         args.tightness_penalty, args.graph, args.fastq,
                                         args.save_intermediate)).startJobTree(args)
    else:
        i = Stack(Target.makeTargetFn(buildAnalyses, args=(
            args.name, args.output, args.breakpoint_penalty, args.data_penalty, args.tightness_penalty, args.graph,
            args.fastq_list, args.save_intermediate))).startJobTree(args)

    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #17
def main():
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    if not os.path.exists(args.out_dir):
        os.mkdir(args.out_dir)

    count_files = [[x, os.path.join(args.data_dir, x, x + ".Counts.fa")]
                   for x in os.listdir(args.data_dir)]

    i = Stack(
        Target.makeTargetFn(buildDictWrapper,
                            args=(count_files, args.out_dir, args.graph,
                                  args.new_graph))).startJobTree(args)

    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #18
def main():
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)
    parser.add_option("--sleepTime",
                      dest="sleepTime",
                      type="int",
                      help="sleep [default=5] seconds",
                      default=5)
    parser.add_option("--tree",
                      dest="tree",
                      help="tree [balanced|comb|star|fly]",
                      default="comb")
    parser.add_option("--size",
                      dest="size",
                      type="int",
                      help="tree size (for comb or star) [default=10]",
                      default=10)
    parser.add_option("--cpusPerJob",
                      dest="cpusPerJob",
                      help="Cpus per job",
                      default="1")

    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    startTime = datetime.datetime.now()

    if options.tree == "star":
        tree = starTree(options.size)
    elif options.tree == "balanced":
        tree = balancedTree()
    elif options.tree == "fly":
        tree = flyTree()
    else:
        tree = combTree(options.size)

    baseTarget = FirstJob(tree, "Anc00", options.sleepTime, startTime,
                          int(options.cpusPerJob))
    Stack(baseTarget).startJobTree(options)

    if options.logFile is not None:
        checkLog(options)
Example #19
def main():
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)
    parser.add_option("--sleepTime", dest="sleepTime", type="int",
                     help="sleep [default=5] seconds", default="5")
    parser.add_option("--tree", dest="tree",
                      help="tree [balanced|comb|star|fly]", default="comb")
    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    startTime = datetime.datetime.now()

    tree = combTree()
    if options.tree == "star":
        tree = starTree()
    elif options.tree == "balanced":
        tree = balancedTree()
    elif options.tree == "fly":
        tree = flyTree()
    
    baseTarget = FirstJob(tree, "Anc00", options.sleepTime, startTime)
    Stack(baseTarget).startJobTree(options)
Example #20
def main():
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    ilp_config = ilp_tuple(args.breakpoint_penalty, args.data_penalty, args.expected_value_penalty, args.trash_penalty,
                           args.kmer_size)
    paths = paths_tuple(args.out_dir, args.aln_index, args.whitelist, args.masked_ref, args.unmasked_ref,
                        args.bad_kmers, args.normalizing, args.key_file)
    try:
        cgquery_dict = pickle.load(open(args.cgquery_file))
    except IOError:
        raise IOError("Cgquery dict does not exist.")

    if not os.path.exists(paths.out_dir):
        os.makedirs(paths.out_dir)

    i = Stack(Target.makeTargetFn(build_analyses, args=(paths, ilp_config, cgquery_dict))).startJobTree(args)

    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #21
def main():
    parser = buildParser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    if args.fastq is not None:
        i = Stack(
            ModelWrapperLocalFiles(args.name, args.output,
                                   args.breakpoint_penalty, args.data_penalty,
                                   args.tightness_penalty, args.graph,
                                   args.fastq,
                                   args.save_intermediate)).startJobTree(args)
    else:
        i = Stack(
            Target.makeTargetFn(
                buildAnalyses,
                args=(args.name, args.output, args.breakpoint_penalty,
                      args.data_penalty, args.tightness_penalty, args.graph,
                      args.fastq_list,
                      args.save_intermediate))).startJobTree(args)

    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #22
def main():
    #Parse the inputs args/options
    parser = OptionParser(usage="usage: samFile, readFastqFile, referenceFastaFile [options]", 
                          version="%prog 0.1")
    
    #Options
    parser.add_option("--readIdentity", dest="readIdentity", 
                      help="Print readIdentity of alignments", 
                      default=False, action="store_true")
    
    parser.add_option("--alignmentIdentity", dest="alignmentIdentity", 
                      help="Print alignmentIdentity", 
                      default=False, action="store_true")
    
    parser.add_option("--readCoverage", dest="readCoverage", 
                      help="Print read coverage of alignments", 
                      default=False, action="store_true")
    
    parser.add_option("--mismatchesPerAlignedBase", dest="mismatchesPerAlignedBase", 
                      help="Print mismatches per aligned base", 
                      default=False, action="store_true")
    
    parser.add_option("--deletionsPerReadBase", dest="deletionsPerReadBase", 
                      help="Print deletions per base of alignments", 
                      default=False, action="store_true")
    
    parser.add_option("--insertionsPerReadBase", dest="insertionsPerReadBase", 
                      help="Print insertions per base of alignments", 
                      default=False, action="store_true")
    
    parser.add_option("--readLength", dest="readLength", 
                      help="Print read lengths of aligned reads", 
                      default=False, action="store_true")

    parser.add_option("--localAlignment", dest="localAlignment", 
                      help="Ignore unaligned prefix and suffix of each read in making calculation", 
                      default=False, action="store_true")
    
    parser.add_option("--printValuePerReadAlignment", dest="printValuePerReadAlignment", 
                      help="Prints the value of statistics for each read alignment", 
                      default=False, action="store_true")
    
    parser.add_option("--noStats", dest="noStats", 
                      help="Do not print stats (avg, median, min, max, mode) of desired statistic", 
                      default=False, action="store_true")
    
    addLoggingOptions(parser)
    
    #Parse the options/arguments
    options, args = parser.parse_args()
    
    #Setup logging
    setLoggingFromOptions(options)
    
    #Print help message if no input
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)
    
    #Exit if the arguments are not what we expect
    if len(args) != 3:
        raise RuntimeError("Expected three arguments, got: %s" % " ".join(args))
    
    #Now do the stats calculation
    samFile, readFastqFile, referenceFastaFile = args
    
    readAlignmentStats = ReadAlignmentStats.getReadAlignmentStats(samFile, readFastqFile, 
                                             referenceFastaFile, globalAlignment=not options.localAlignment)
    
    def report(values, statisticName):
        if not options.noStats:
            print "Average" + statisticName, numpy.average(values)
            print "Median" + statisticName, numpy.median(values)
            print "Min" + statisticName, min(values)
            print "Max" + statisticName, max(values)
        if options.printValuePerReadAlignment:
            print "Values" + statisticName, "\t".join(map(str, values))
    
    if options.readIdentity:
        report(map(lambda rAS : rAS.readIdentity(), readAlignmentStats), "ReadIdentity")
    
    if options.alignmentIdentity:
        report(map(lambda rAS : rAS.alignmentIdentity(), readAlignmentStats), "AlignmentIdentity")
    
    if options.readCoverage:
        report(map(lambda rAS : rAS.readCoverage(), readAlignmentStats), "ReadCoverage")
    
    if options.mismatchesPerAlignedBase:
        report(map(lambda rAS : rAS.mismatchesPerAlignedBase(), readAlignmentStats), "MismatchesPerAlignedBase")
    
    if options.deletionsPerReadBase:
        report(map(lambda rAS : rAS.deletionsPerReadBase(), readAlignmentStats), "DeletionsPerReadBase")
    
    if options.insertionsPerReadBase:
        report(map(lambda rAS : rAS.insertionsPerReadBase(), readAlignmentStats), "InsertionsPerReadBase")

    if options.readLength:
        report(map(lambda rAS : rAS.readLength(), readAlignmentStats), "ReadLength")
Example #23
def main():
    #Parse the inputs args/options
    parser = OptionParser(
        usage="usage: samFile, readFastqFile, referenceFastaFile [options]",
        version="%prog 0.1")

    #Options
    parser.add_option("--identity",
                      dest="identity",
                      help="Print identity of alignments",
                      default=False,
                      action="store_true")

    parser.add_option("--readCoverage",
                      dest="readCoverage",
                      help="Print read coverage of alignments",
                      default=False,
                      action="store_true")

    parser.add_option("--mismatchesPerAlignedBase",
                      dest="mismatchesPerAlignedBase",
                      help="Print mismatches per aligned base",
                      default=False,
                      action="store_true")

    parser.add_option("--deletionsPerReadBase",
                      dest="deletionsPerReadBase",
                      help="Print deletions per base of alignments",
                      default=False,
                      action="store_true")

    parser.add_option("--insertionsPerReadBase",
                      dest="insertionsPerReadBase",
                      help="Print insertions per base of alignments",
                      default=False,
                      action="store_true")

    parser.add_option(
        "--localAlignment",
        dest="localAlignment",
        help=
        "Ignore unaligned prefix and suffix of each read in making calculation",
        default=False,
        action="store_true")

    parser.add_option(
        "--printValuePerReadAlignment",
        dest="printValuePerReadAlignment",
        help="Prints the value of statistics for each read alignment",
        default=False,
        action="store_true")

    parser.add_option(
        "--noStats",
        dest="noStats",
        help=
        "Do not print stats (avg, median, min, max, mode) of desired statistic",
        default=False,
        action="store_true")

    parser.add_option(
        "--printAlignmentData",
        dest="printAlignmentData",
        help=
        "Print all stats for each read alignment in tabular format; include unaligned with --includeUnaligned",
        default=False,
        action="store_true")

    parser.add_option(
        "--includeUnaligned",
        dest="includeUnaligned",
        help="Includes unaligned reads when printing alignment data",
        default=False,
        action="store_true")

    addLoggingOptions(parser)

    #Parse the options/arguments
    options, args = parser.parse_args()

    #Setup logging
    setLoggingFromOptions(options)

    #Print help message if no input
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    #Exit if the arguments are not what we expect
    if len(args) != 3:
        raise RuntimeError("Expected three arguments, got: %s" %
                           " ".join(args))

    #Now do the stats calculation
    samFile, readFastqFile, referenceFastaFile = args

    readAlignmentStats = ReadAlignmentStats.getReadAlignmentStats(
        samFile,
        readFastqFile,
        referenceFastaFile,
        globalAlignment=not options.localAlignment,
        includeUnaligned=options.includeUnaligned)

    def report(values, statisticName):
        if not options.noStats:
            print "Average" + statisticName, numpy.average(values)
            print "Median" + statisticName, numpy.median(values)
            print "Min" + statisticName, min(values)
            print "Max" + statisticName, max(values)
        if options.printValuePerReadAlignment:
            print "Values" + statisticName, "\t".join(map(str, values))

    def report_alignment_data():
        name = map(lambda rAS: rAS.readName(), readAlignmentStats)
        ref_id = map(lambda rAS: rAS.referenceID(), readAlignmentStats)
        read_type = map(lambda rAS: rAS.readType(), readAlignmentStats)
        length = map(lambda rAS: rAS.readLength(), readAlignmentStats)
        identity = map(lambda rAS: rAS.identity(), readAlignmentStats)
        read_coverage = map(lambda rAS: rAS.readCoverage(), readAlignmentStats)
        ref_coverage = map(lambda rAS: rAS.referenceCoverage(),
                           readAlignmentStats)
        mismatch = map(lambda rAS: rAS.mismatchesPerAlignedBase(),
                       readAlignmentStats)
        insertion = map(lambda rAS: rAS.insertionsPerReadBase(),
                        readAlignmentStats)
        deletion = map(lambda rAS: rAS.deletionsPerReadBase(),
                       readAlignmentStats)
        mean_quality = map(lambda rAS: rAS.readMeanQuality(),
                           readAlignmentStats)
        aligned = map(lambda rAS: rAS.isAligned(), readAlignmentStats)
        aligned_length = map(lambda rAS: rAS.alignedReadLength(),
                             readAlignmentStats)
        ref_c_content = map(lambda rAS: rAS.getRefCContent(),
                            readAlignmentStats)
        ref_gc_content = map(lambda rAS: rAS.getRefGcContent(),
                             readAlignmentStats)

        print "\t".join(["Name", "ReferenceID", "ReadType", "Length", "Aligned", \
                        "AlignedLength", "Identity", "ReadCoverage", \
                        "ReferenceCoverage", "MismatchPerBase", \
                        "InsertionPerBase", "DeletionPerBase", "MeanQuality",
                        "RefCContent", "RefGcContent"])

        for read in zip(name, ref_id, read_type, length, aligned, aligned_length, \
                        identity, read_coverage, ref_coverage, mismatch, insertion,\
                        deletion, mean_quality, ref_c_content, ref_gc_content):
            print "\t".join(map(str, read))

    if options.printAlignmentData:
        report_alignment_data()

    else:
        if options.identity:
            report(map(lambda rAS: rAS.identity(), readAlignmentStats),
                   "Identity")

        if options.readCoverage:
            report(map(lambda rAS: rAS.readCoverage(), readAlignmentStats),
                   "ReadCoverage")

        if options.mismatchesPerAlignedBase:
            report(
                map(lambda rAS: rAS.mismatchesPerAlignedBase(),
                    readAlignmentStats), "MismatchesPerAlignedBase")

        if options.deletionsPerReadBase:
            report(
                map(lambda rAS: rAS.deletionsPerReadBase(),
                    readAlignmentStats), "DeletionsPerReadBase")

        if options.insertionsPerReadBase:
            report(
                map(lambda rAS: rAS.insertionsPerReadBase(),
                    readAlignmentStats), "InsertionsPerReadBase")
Example #24
def main():
    #Parse the inputs args/options
    parser = OptionParser(usage="usage: inputFastqFile referenceFastaFile outputSamFile [options]", 
                          version="%prog 0.1")
    
    #Options
    parser.add_option("--em", dest="em", 
                      help="Run expectation maximisation (EM)",
                      default=False, action="store_true")
    ##Most people would not want to use the following, but I put them here for debug purposes
    parser.add_option("--bwa", dest="bwa", help="Use BWA instead of LAST", 
                      default=False, action="store_true")
    parser.add_option("--graphmap", dest="graphmap", help="Use GraphMap instead of LAST", 
                      default=False, action="store_true")
    parser.add_option("--graphmapanchor", dest="graphmapanchor", help="Use GraphMap with anchor alignment instead of LAST", 
                      default=False, action="store_true")
    parser.add_option("--noRealign", dest="noRealign", help="Don't run any realignment step", 
                      default=False, action="store_true")
    parser.add_option("--noChain", dest="noChain", help="Don't run any chaining step", 
                      default=False, action="store_true")
    parser.add_option("--gapGamma", dest="gapGamma", help="Set the gap gamma for the AMAP function", 
                      default=0.5, type=float)
    parser.add_option("--matchGamma", dest="matchGamma", help="Set the match gamma for the AMAP function", 
                      default=0.0, type=float)
    
    #Add the cPecan expectation maximisation options
    options = cPecan.cPecanEm.Options()
    options.inputModel = os.path.join(pathToBaseNanoporeDir(), "src", "margin", "mappers", "last_hmm_20.txt")
    options.modelType="fiveStateAsymmetric" #"threeStateAsymmetric"
    options.optionsToRealign="--diagonalExpansion=10 --splitMatrixBiggerThanThis=300" 
    options.randomStart = True
    options.trials = 3
    options.outputTrialHmms = True
    options.iterations = 100
    options.maxAlignmentLengthPerJob=700000
    options.maxAlignmentLengthToSample = 50000000
    #options.outputXMLModelFile = outputModel + ".xml"
    #options.updateTheBand = True
    #options.useDefaultModelAsStart = True
    #options.setJukesCantorStartingEmissions=0.3
    options.trainEmissions=True
    #options.tieEmissions = True
    addExpectationMaximisationOptions(parser, options)
    
    #Add the jobTree options
    Stack.addJobTreeOptions(parser)
    
    #Parse the options/arguments
    options, args = parser.parse_args()
    
    #Setup logging
    setLoggingFromOptions(options)
    
    #Print help message if no input
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)
    
    #Exit if the arguments are not what we expect
    if len(args) != 3:
        raise RuntimeError("Expected three arguments, got: %s" % " ".join(args))
    
    #Set the mapper
    if options.noRealign:
        if options.noChain: # i.e. --noChain --noRealign
            mapper = Last
            if options.bwa:
                mapper = Bwa
            if options.graphmap:
                mapper = GraphMap
            if options.graphmapanchor:
                mapper = GraphMapAnchor
        else: # i.e. --noRealign
            mapper = LastChain
            if options.bwa:
                mapper = BwaChain
            if options.graphmap:
                mapper = GraphMapChain
            if options.graphmapanchor:
                mapper = GraphMapAnchorChain
    else:
        mapper = LastRealign
        if options.bwa:
            mapper = BwaRealign
        if options.graphmap:
            mapper = GraphMapRealign
        if options.graphmapanchor:
            mapper = GraphMapAnchorRealign
    
    #This line invokes jobTree  
    i = Stack(mapper(readFastqFile=args[0], referenceFastaFile=args[1], outputSamFile=args[2], 
                     options=options)).startJobTree(options) 
        
    #The return value of the jobtree script is the number of failed jobs. If we have any then
    #report this.       
    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #25
def main():
    #Parse the inputs args/options
    parser = OptionParser(usage="usage: samFile, readFastqFile, referenceFastaFile [options]", 
                          version="%prog 0.1")
    
    #Options
    parser.add_option("--identity", dest="identity", 
                      help="Print identity of alignments", 
                      default=False, action="store_true")
    
    parser.add_option("--readCoverage", dest="readCoverage", 
                      help="Print read coverage of alignments", 
                      default=False, action="store_true")
    
    parser.add_option("--mismatchesPerAlignedBase", dest="mismatchesPerAlignedBase", 
                      help="Print mismatches per aligned base", 
                      default=False, action="store_true")
    
    parser.add_option("--deletionsPerReadBase", dest="deletionsPerReadBase", 
                      help="Print deletions per base of alignments", 
                      default=False, action="store_true")
    
    parser.add_option("--insertionsPerReadBase", dest="insertionsPerReadBase", 
                      help="Print insertions per base of alignments", 
                      default=False, action="store_true")
    
    parser.add_option("--localAlignment", dest="localAlignment", 
                      help="Ignore unaligned prefix and suffix of each read in making calculation", 
                      default=False, action="store_true")
    
    parser.add_option("--printValuePerReadAlignment", dest="printValuePerReadAlignment", 
                      help="Prints the value of statistics for each read alignment", 
                      default=False, action="store_true")
    
    parser.add_option("--noStats", dest="noStats", 
                      help="Do not print stats (avg, median, min, max, mode) of desired statistic", 
                      default=False, action="store_true")
    
    addLoggingOptions(parser)
    
    #Parse the options/arguments
    options, args = parser.parse_args()
    
    #Setup logging
    setLoggingFromOptions(options)
    
    #Print help message if no input
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)
    
    #Exit if the arguments are not what we expect
    if len(args) != 3:
        raise RuntimeError("Expected three arguments, got: %s" % " ".join(args))
    
    #Now do the stats calculation
    samFile, readFastqFile, referenceFastaFile = args
    
    readAlignmentStats = ReadAlignmentStats.getReadAlignmentStats(samFile, readFastqFile, 
                                             referenceFastaFile, globalAlignment=not options.localAlignment)
    
    def report(values, statisticName):
        if not options.noStats:
            print "Average" + statisticName, numpy.average(values)
            print "Median" + statisticName, numpy.median(values)
            print "Min" + statisticName, min(values)
            print "Max" + statisticName, max(values)
        if options.printValuePerReadAlignment:
            print "Values" + statisticName, "\t".join(map(str, values))
    
    if options.identity:
        report(map(lambda rAS : rAS.identity(), readAlignmentStats), "Identity")
    
    if options.readCoverage:
        report(map(lambda rAS : rAS.readCoverage(), readAlignmentStats), "ReadCoverage")
    
    if options.mismatchesPerAlignedBase:
        report(map(lambda rAS : rAS.mismatchesPerAlignedBase(), readAlignmentStats), "MismatchesPerAlignedBase")
    
    if options.deletionsPerReadBase:
        report(map(lambda rAS : rAS.deletionsPerReadBase(), readAlignmentStats), "DeletionsPerReadBase")
    
    if options.insertionsPerReadBase:
        report(map(lambda rAS : rAS.insertionsPerReadBase(), readAlignmentStats), "InsertionsPerReadBase")
Example #26
def main():
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    outputDir = "muscle_compare_2d/output/"

    if not os.path.exists(outputDir):
        logger.info("Output dir {} does not exist. Creating.")
        os.mkdir(outputDir)
    if len(os.listdir(outputDir)) > 0:
        logger.info("Output dir not empty.")

    if len(args) != 3:
        raise RuntimeError(
            "Error: expected three arguments got %s arguments: %s" %
            (len(args), " ".join(args)))

    templateRecords = {
        x.qname
        for x in pysam.Samfile(args[0]) if not x.is_unmapped
    }
    complementRecords = {
        x.qname
        for x in pysam.Samfile(args[1]) if not x.is_unmapped
    }

    twodSamFile = pysam.Samfile(args[2])
    twodRecords = {x.qname: x for x in twodSamFile if not x.is_unmapped}

    recordsToAnalyze = dict()
    for name, record in twodRecords.iteritems():
        if name not in templateRecords and name not in complementRecords:
            ref_name = twodSamFile.getrname(record.tid)
            ref_start, ref_stop = int(record.aend - record.alen), int(
                record.aend)
            recordsToAnalyze[name] = [ref_name, ref_start, ref_stop]
    if os.path.exists("../readFastqFiles/template/") and os.path.exists(
            "../readFastqFiles/complement"):
        templateFastqFiles = [
            os.path.join("../readFastqFiles/template/", x)
            for x in os.listdir("../readFastqFiles/template/")
            if x.endswith(".fastq") or x.endswith(".fq")
        ]
        complementFastqFiles = [
            os.path.join("../readFastqFiles/complement/", x)
            for x in os.listdir("../readFastqFiles/complement/")
            if x.endswith(".fastq") or x.endswith(".fq")
        ]
    else:
        raise RuntimeError(
            "Error: readFastqFiles does not contain template and/or complement folders"
        )

    referenceFastaFiles = [
        os.path.join("../referenceFastaFiles", x)
        for x in os.listdir("../referenceFastaFiles")
        if x.endswith(".fa") or x.endswith(".fasta")
    ]

    if len(referenceFastaFiles) > 0:
        references = {
            y[0].split(" ")[0]: y[1]
            for x in referenceFastaFiles for y in fastaRead(x)
        }
    else:
        raise RuntimeError("Error: no reference fasta files")

    if len(recordsToAnalyze) == 0:
        raise RuntimeError(
            "Error: all of the mappable twoD reads in this set also mapped as template/complement."
        )

    logger.info("Starting to find analyses to run...")
    args = (recordsToAnalyze, templateFastqFiles, complementFastqFiles,
            references, outputDir)
    i = Stack(Target.makeTargetFn(find_analyses,
                                  args=args)).startJobTree(options)

    if i != 0:
        raise RuntimeError("Got {} failed jobs".format(i))
Example #27
def main():
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    outputDir = "blast_combined/output/"

    if not os.path.exists(outputDir):
        logger.info("Output dir {} does not exist. Creating.")
        os.mkdir(outputDir)
    if len(os.listdir(outputDir)) > 0:
        logger.info("Output dir not empty.")

    #find all read fastq files, load into a dict by read type
    readFastqFiles = dict()
    for readType in readTypes:
        readFastqFiles[readType] = [
            os.path.join("../output/processedReadFastqFiles/", readType, x)
            for x in os.listdir(
                os.path.join("../output/processedReadFastqFiles/", readType))
            if x.endswith(".fq") or x.endswith(".fastq")
        ]

    #find all reference fasta files
    referenceFastaFiles = [
        x for x in os.listdir("../referenceFastaFiles")
        if x.endswith(".fasta") or x.endswith(".fa")
    ]

    #find all sam files that were analyzed using combinedAnalyses
    samFiles = {}
    for readType in readTypes:
        samFiles[readType] = [
            (readFastqFile,
             os.path.join(
                 "../output", "analysis_" + readType,
                 "experiment_" + os.path.basename(readFastqFile) + "_" +
                 referenceFastaFile + "_" + analysis, "mapping.sam"))
            for readFastqFile, referenceFastaFile, analysis in product(
                readFastqFiles[readType], referenceFastaFiles,
                combinedAnalyses)
        ]

    mappedByReadType = defaultdict(set)
    for readType in readTypes:
        for readFastqFileFullPath, samFile in samFiles[readType]:
            readFastqFile = os.path.basename(readFastqFileFullPath)
            mappedNames = {(x.qname, readFastqFile)
                           for x in pysam.Samfile(samFile)
                           if not x.is_unmapped}
            mappedByReadType[readType] = mappedByReadType[readType].union(
                mappedNames)

    unmappedByReadType = defaultdict(dict)
    for readType in readTypes:
        for readFastqFileFullPath, samFile in samFiles[readType]:
            readFastqFile = os.path.basename(readFastqFileFullPath)
            for name, seq, qual in fastqRead(readFastqFileFullPath):
                name = name.split(" ")[0]
                if (name, readFastqFile) not in mappedByReadType[readType]:
                    unmappedByReadType[readType][(name, readFastqFile)] = seq

    i = Stack(
        Target.makeTargetFn(find_analyses,
                            args=(unmappedByReadType,
                                  outputDir))).startJobTree(options)

    if i != 0:
        raise RuntimeError("Got {} failed jobs".format(i))

    for readType in readTypes:
        #build a counter of blast hits and set of read names that did not map
        blast_hits, no_hits = Counter(), set()
        for query, result in parse_blast(
                open(os.path.join(outputDir, readType + "_blast_out.txt"))):
            if result is None:
                no_hits.add(query)
            else:
                blast_hits[tuple(
                    result)] += 1  #count number of times each hit was seen
        #write the unmapped hits to a fasta file
        outf = open(os.path.join(outputDir, readType + "_no_hits.fasta"), "w")
        for (name,
             readFastqFile), seq in unmappedByReadType[readType].iteritems():
            if name in no_hits:
                outf.write(">{}\n{}\n".format(name, seq))
        outf.close()
        #write the blast report
        blast_out = open(
            os.path.join(outputDir, readType + "_blast_report.txt"), "w")
        blast_out.write(
            "gi|##|gb|##|\tSpecies\tseqID\tCount\n")  #header to output
        for result, count in sorted(blast_hits.items(),
                                    key=lambda x: -int(x[-1])):
            blast_out.write("{}\t{}\n".format("\t".join(result), count))
        blast_out.close()
        #calculate percents and make a barplot
        blast_count = sum(blast_hits.values())
        unmapped_count = len(unmappedByReadType[readType]) - sum(
            blast_hits.values())
        mapped_count = len(mappedByReadType[readType])

        #blast_percent = 1.0 * sum(blast_hits.values()) / (len(mappedByReadType[readType]) + len(unmappedByReadType[readType]))
        #unmapped_percent = (1.0 * len(unmappedByReadType[readType]) - sum(blast_hits.values())) / (len(mappedByReadType[readType]) + len(unmappedByReadType[readType]))
        #mapped_percent = 1.0 * len(mappedByReadType[readType]) / (len(mappedByReadType[readType]) + len(unmappedByReadType[readType]))
        outf = open(os.path.join(outputDir, readType + "percents.txt"), "w")
        outf.write("\n".join(
            map(str, [blast_count, unmapped_count, mapped_count])))
        outf.close()
        #system("Rscript blast_combined/barplot_blast.R {} {} {} {} {}".format(blast_percent, unmapped_percent, mapped_percent, readType, os.path.join(outputDir, readType + "_blast_barplot.pdf")))
        system("Rscript blast_combined/barplot_blast.R {} {} {} {} {}".format(
            blast_count, unmapped_count, mapped_count, readType,
            os.path.join(outputDir, readType + "_blast_barplot.pdf")))
Example #28
def main():
    #Parse the inputs args/options
    parser = OptionParser(usage="usage: inputFastqFile referenceFastaFile outputSamFile [options]", 
                          version="%prog 0.1")
    
    #Options
    parser.add_option("--em", dest="em", 
                      help="Run expectation maximisation (EM)",
                      default=False, action="store_true")
    ##Most people would not want to use the following, but I put them here for debug purposes
    parser.add_option("--bwa", dest="bwa", help="Use BWA instead of LAST", 
                      default=False, action="store_true")
    parser.add_option("--noRealign", dest="noRealign", help="Don't run any realignment step", 
                      default=False, action="store_true")
    parser.add_option("--noChain", dest="noChain", help="Don't run any chaining step", 
                      default=False, action="store_true")
    parser.add_option("--gapGamma", dest="gapGamma", help="Set the gap gamma for the AMAP function", 
                      default=0.5, type=float)
    parser.add_option("--matchGamma", dest="matchGamma", help="Set the match gamma for the AMAP function", 
                      default=0.0, type=float)
    
    #Add the cPecan expectation maximisation options
    options = cPecan.cPecanEm.Options()
    options.inputModel = os.path.join(pathToBaseNanoporeDir(), "src", "margin", "mappers", "last_hmm_20.txt")
    options.modelType="fiveStateAsymmetric" #"threeStateAsymmetric"
    options.optionsToRealign="--diagonalExpansion=10 --splitMatrixBiggerThanThis=300" 
    options.randomStart = True
    options.trials = 3
    options.outputTrialHmms = True
    options.iterations = 100
    options.maxAlignmentLengthPerJob=700000
    options.maxAlignmentLengthToSample = 50000000
    #options.outputXMLModelFile = outputModel + ".xml"
    #options.updateTheBand = True
    #options.useDefaultModelAsStart = True
    #options.setJukesCantorStartingEmissions=0.3
    options.trainEmissions=True
    #options.tieEmissions = True
    addExpectationMaximisationOptions(parser, options)
    
    #Add the jobTree options
    Stack.addJobTreeOptions(parser)
    
    #Parse the options/arguments
    options, args = parser.parse_args()
    
    #Setup logging
    setLoggingFromOptions(options)
    
    #Print help message if no input
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)
    
    #Exit if the arguments are not what we expect
    if len(args) != 3:
        raise RuntimeError("Expected three arguments, got: %s" % " ".join(args))
    
    #Set the mapper
    if options.noRealign:
        if options.noChain: # i.e. --noChain --noRealign
            mapper = Bwa if options.bwa else Last
        else: # i.e. --noRealign
            mapper = BwaChain if options.bwa else LastChain
    else:
        mapper = BwaRealign if options.bwa else LastRealign
    
    #This line invokes jobTree  
    i = Stack(mapper(readFastqFile=args[0], referenceFastaFile=args[1], outputSamFile=args[2], 
                     options=options)).startJobTree(options) 
        
    #The return value of the jobtree script is the number of failed jobs. If we have any then
    #report this.       
    if i != 0:
        raise RuntimeError("Got failed jobs")
Example #29
def main():
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)
    
    outputDir = "blast_combined/output/"

    if not os.path.exists(outputDir):
        logger.info("Output dir {} does not exist. Creating.")
        os.mkdir(outputDir)
    if len(os.listdir(outputDir)) > 0:
        logger.info("Output dir not empty.")

    #find all read fastq files, load into a dict by read type
    readFastqFiles = dict()
    for readType in readTypes:
        readFastqFiles[readType] = [os.path.join("../output/processedReadFastqFiles/", readType, x) for x in os.listdir(os.path.join("../output/processedReadFastqFiles/", readType)) if x.endswith(".fq") or x.endswith(".fastq")]
    
    #find all reference fasta files
    referenceFastaFiles = [x for x in os.listdir("../referenceFastaFiles") if x.endswith(".fasta") or x.endswith(".fa")]

    #find all sam files that were analyzed using combinedAnalyses
    samFiles = {}
    for readType in readTypes:
        samFiles[readType] = [(readFastqFile, os.path.join("../output", "analysis_" + readType, "experiment_" + os.path.basename(readFastqFile) + "_" + referenceFastaFile + "_" + analysis, "mapping.sam")) for readFastqFile, referenceFastaFile, analysis in product(readFastqFiles[readType], referenceFastaFiles, combinedAnalyses)]

    mappedByReadType = defaultdict(set)
    for readType in readTypes:
        for readFastqFileFullPath, samFile in samFiles[readType]:
            readFastqFile = os.path.basename(readFastqFileFullPath)
            mappedNames = {(x.qname, readFastqFile) for x in pysam.Samfile(samFile) if not x.is_unmapped}
            mappedByReadType[readType] = mappedByReadType[readType].union(mappedNames)

    unmappedByReadType = defaultdict(dict)
    for readType in readTypes:
        for readFastqFileFullPath, samFile in samFiles[readType]:
            readFastqFile = os.path.basename(readFastqFileFullPath)
            for name, seq, qual in fastqRead(readFastqFileFullPath):
                name = name.split(" ")[0]
                if (name, readFastqFile) not in mappedByReadType[readType]:
                    unmappedByReadType[readType][(name, readFastqFile)] = seq
        

    i = Stack(Target.makeTargetFn(find_analyses, args=(unmappedByReadType, outputDir))).startJobTree(options) 

    if i != 0:
        raise RuntimeError("Got {} failed jobs".format(i))

    for readType in readTypes:
        #build a counter of blast hits and set of read names that did not map
        blast_hits, no_hits = Counter(), set()
        for query, result in parse_blast(open(os.path.join(outputDir, readType + "_blast_out.txt"))):
            if result is None:
                no_hits.add(query)
            else:
                blast_hits[tuple(result)] += 1 #count number of times each hit was seen
        #write the unmapped hits to a fasta file
        outf = open(os.path.join(outputDir, readType + "_no_hits.fasta"), "w")
        for (name, readFastqFile), seq in unmappedByReadType[readType].iteritems():
            if name in no_hits:
                outf.write(">{}\n{}\n".format(name, seq))
        outf.close()
        #write the blast report
        blast_out = open(os.path.join(outputDir, readType + "_blast_report.txt"), "w")
        blast_out.write("gi|##|gb|##|\tSpecies\tseqID\tCount\n") #header to output
        for result, count in sorted(blast_hits.items(), key = lambda x: -int(x[-1])):
            blast_out.write("{}\t{}\n".format("\t".join(result), count))
        blast_out.close()
        #calculate percents and make a barplot
        blast_count =  sum(blast_hits.values())
        unmapped_count = len(unmappedByReadType[readType]) - sum(blast_hits.values())
        mapped_count = len(mappedByReadType[readType])
        
        #blast_percent = 1.0 * sum(blast_hits.values()) / (len(mappedByReadType[readType]) + len(unmappedByReadType[readType]))
        #unmapped_percent = (1.0 * len(unmappedByReadType[readType]) - sum(blast_hits.values())) / (len(mappedByReadType[readType]) + len(unmappedByReadType[readType]))
        #mapped_percent = 1.0 * len(mappedByReadType[readType]) / (len(mappedByReadType[readType]) + len(unmappedByReadType[readType]))
        outf = open(os.path.join(outputDir, readType + "percents.txt"),"w")
        outf.write("\n".join(map(str,[blast_count, unmapped_count, mapped_count])))
        outf.close()
        #system("Rscript blast_combined/barplot_blast.R {} {} {} {} {}".format(blast_percent, unmapped_percent, mapped_percent, readType, os.path.join(outputDir, readType + "_blast_barplot.pdf")))
        system("Rscript blast_combined/barplot_blast.R {} {} {} {} {}".format(blast_count, unmapped_count, mapped_count, readType, os.path.join(outputDir, readType + "_blast_barplot.pdf")))
Example #30
def main():
    #Parse the inputs args/options
    parser = OptionParser(usage="usage: workingDir [options]",
                          version="%prog 0.1")
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    if len(args) != 1:
        raise RuntimeError("Expected one argument, got %s arguments: %s" %
                           (len(args), " ".join(args)))
    workingDir = args[0]

    # call read sampler script; samples 75, 50, and 25% reads
    #SampleReads(workingDir)

    #Create (if necessary) the output dir
    outputDir = os.path.join(workingDir, "output")
    if not os.path.exists(outputDir):
        logger.info("Creating output dir: %s" % outputDir)
        os.mkdir(outputDir)
    else:
        logger.info("Root output dir already exists: %s" % outputDir)

    #Assign/process (uniquify the names of) the input read fastq files
    processedFastqFiles = os.path.join(outputDir, "processedReadFastqFiles")
    if not os.path.exists(processedFastqFiles):
        os.mkdir(processedFastqFiles)

    fastqParentDir = os.path.join(workingDir, "readFastqFiles")
    readFastqFiles = list()
    for fastqSubDir in filter(
            os.path.isdir,
        [os.path.join(fastqParentDir, x) for x in os.listdir(fastqParentDir)]):
        readType = os.path.basename(fastqSubDir)
        if not os.path.exists(
                os.path.join(processedFastqFiles,
                             os.path.basename(fastqSubDir))):
            os.mkdir(os.path.join(processedFastqFiles, readType))
        readFastqFiles.append([
            readType,
            [
                makeFastqSequenceNamesUnique(
                    os.path.join(workingDir, "readFastqFiles", readType, i),
                    os.path.join(processedFastqFiles, readType, i))
                for i in os.listdir(
                    os.path.join(workingDir, "readFastqFiles", readType))
                if (".fq" in i and i[-3:] == '.fq') or (
                    ".fastq" in i and i[-6:] == '.fastq')
            ]
        ])

    #Assign/process (uniquify the names of) the input reference fasta files
    processedFastaFiles = os.path.join(outputDir,
                                       "processedReferenceFastaFiles")
    if not os.path.exists(processedFastaFiles):
        os.mkdir(processedFastaFiles)
    referenceFastaFiles = [
        makeFastaSequenceNamesUnique(
            os.path.join(workingDir, "referenceFastaFiles", i),
            os.path.join(processedFastaFiles, i))
        for i in os.listdir(os.path.join(workingDir, "referenceFastaFiles"))
        if (".fa" in i and i[-3:] == '.fa') or (
            ".fasta" in i and i[-6:] == '.fasta')
    ]

    # call reference mutator script; introduces 1%, and 5% mutations (No nucleotide bias used for now)
    #referenceFastaFiles = mutateReferenceSequences(referenceFastaFiles)

    #Log the inputs
    logger.info("Using the following working directory: %s" % workingDir)
    logger.info("Using the following output directory: %s" % outputDir)
    for readType, readTypeFastqFiles in readFastqFiles:
        logger.info("Got the follow read type: %s" % readType)
        for readFastqFile in readTypeFastqFiles:
            logger.info("Got the following read fastq file: %s" %
                        readFastqFile)
    for referenceFastaFile in referenceFastaFiles:
        logger.info("Got the following reference fasta files: %s" %
                    referenceFastaFile)

    #This line invokes jobTree
    i = Stack(
        Target.makeTargetFn(setupExperiments,
                            args=(readFastqFiles, referenceFastaFiles, mappers,
                                  analyses, metaAnalyses,
                                  outputDir))).startJobTree(options)

    if i != 0:
        raise RuntimeError("Got failed jobs")