Example no. 1
0
def main(argv=None):  # IGNORE:C0111
    '''Command-line entry point for dispatching a full analysis run.

    Parses options, discovers read/BAM inputs, submits trimming, alignment
    and per-sample analysis jobs through ``dispatcher``, then queues a final
    job that combines all per-sample XML results.

    :param argv: optional list of extra arguments; appended to ``sys.argv``
                 when given, otherwise ``sys.argv`` is parsed as-is.
    :return: 0 on success or keyboard interrupt, 2 on unhandled error.
    '''

    if argv is None:
        argv = sys.argv
    else:
        # NOTE(review): mutates the global sys.argv rather than parsing the
        # given list in isolation -- argparse below always reads sys.argv.
        sys.argv.extend(argv)

    program_name = os.path.basename(sys.argv[0])
    program_version = "v%s" % __version__
    program_build_date = str(__updated__)
    program_version_message = '%%(prog)s %s (%s)' % (program_version,
                                                     program_build_date)
    # The short description is the second line of the module docstring,
    # whether this module is the executing script or an import.
    if __name__ == '__main__':
        program_shortdesc = __import__('__main__').__doc__.split("\n")[1]
    else:
        program_shortdesc = __doc__.split("\n")[1]
    #program_shortdesc = __import__('__main__').__doc__.split("\n")[1]
    program_license = '''%s

  Created by TGen North on %s.
  Copyright 2015 TGen North. All rights reserved.

  Available for academic and research use only under a license
  from The Translational Genomics Research Institute (TGen)
  that is free for non-commercial use.

  Distributed on an "AS IS" basis without warranties
  or conditions of any kind, either express or implied.

USAGE
''' % (program_shortdesc, str(__date__))

    try:
        # Setup argument parser
        parser = argparse.ArgumentParser(
            description=program_license,
            formatter_class=argparse.RawDescriptionHelpFormatter)
        required_group = parser.add_argument_group("required arguments")
        required_group.add_argument("-n",
                                    "--name",
                                    required=True,
                                    help="name for this run. [REQUIRED]")
        required_group.add_argument(
            "-j",
            "--json",
            required=True,
            help="JSON file of assay descriptions. [REQUIRED]")
        optional_group = parser.add_argument_group("optional arguments")
        # Reads and BAMs are alternative inputs: trimming/alignment only
        # applies to the read-directory path.
        reads_bams_group = optional_group.add_mutually_exclusive_group()
        reads_bams_group.add_argument(
            "-r",
            "--read-dir",
            dest="rdir",
            metavar="DIR",
            help="directory of read files to analyze.")
        reads_bams_group.add_argument(
            "--bam-dir",
            dest="bdir",
            metavar="DIR",
            help="directory of bam files to analyze.")
        optional_group.add_argument(
            "-o",
            "--out-dir",
            dest="odir",
            metavar="DIR",
            help="directory to write output files to. [default: `pwd`]")
        optional_group.add_argument(
            "-s",
            "--submitter",
            dest="job_manager",
            default="PBS",
            help=
            "cluster job submitter to use (PBS, SLURM, SGE, none). [default: PBS]"
        )
        optional_group.add_argument(
            "--submitter-args",
            dest="sargs",
            metavar="ARGS",
            help=
            "additional arguments to pass to the job submitter, enclosed in \"\"."
        )
        optional_group.add_argument(
            "--smor",
            action="store_true",
            default=False,
            help=
            "perform SMOR analysis with overlapping reads. [default: False]")
        trim_group = parser.add_argument_group("read trimming options")
        # --trim / --no-trim both write args.trim; trimming defaults to on.
        on_off_group = trim_group.add_mutually_exclusive_group()
        on_off_group.add_argument(
            "--trim",
            action="store_true",
            default=True,
            help="perform adapter trimming on reads. [default: True]")
        on_off_group.add_argument("--no-trim",
                                  dest="trim",
                                  action="store_false",
                                  help="do not perform adapter trimming.")
        trim_group.add_argument(
            "--adapter-sequences",
            dest="adapters",
            default="/scratch/dlemmer/ASAP/illumina_adapters_all.fasta",
            help=
            "location of the adapter sequence file to use for trimming. [default: /scratch/dlemmer/ASAP/illumina_adapters_all.fasta]"
        )
        # nargs="?" with const: bare -q enables quality trimming with the
        # default trimmomatic window; a value customizes it; omitted = off.
        trim_group.add_argument(
            "-q",
            "--qual",
            nargs="?",
            const="SLIDINGWINDOW:5:20",
            help=
            "perform quality trimming [default: False], optional parameter can be used to customize quality trimming parameters to trimmomatic. [default: SLIDINGWINDOW:5:20]"
        )
        trim_group.add_argument(
            "-m",
            "--minlen",
            metavar="LEN",
            default=80,
            type=int,
            help="minimum read length to keep after trimming. [default: 80]")
        align_group = parser.add_argument_group("read mapping options")
        align_group.add_argument(
            "-a",
            "--aligner",
            default="bowtie2",
            help=
            "aligner to use for read mapping, supports bowtie2, novoalign, and bwa. [default: bowtie2]"
        )
        align_group.add_argument(
            "--aligner-args",
            dest="aargs",
            metavar="ARGS",
            default='',
            help=
            "additional arguments to pass to the aligner, enclosed in \"\".")
        align_group.add_argument(
            "-d",
            "--depth",
            default=100,
            type=int,
            help=
            "minimum read depth required to consider a position covered. [default: 100]"
        )
        align_group.add_argument(
            "-b",
            "--breadth",
            default=0.8,
            type=float,
            help=
            "minimum breadth of coverage required to consider an amplicon as present. [default: 0.8]"
        )
        align_group.add_argument(
            "-p",
            "--proportion",
            default=0.1,
            type=float,
            help=
            "minimum proportion required to call a SNP at a given position. [default: 0.1]"
        )
        align_group.add_argument(
            "-i",
            "--identity",
            dest="percid",
            default=0,
            type=float,
            help=
            "minimum percent identity required to align a read to a reference amplicon sequence. [default: 0]"
        )
        parser.add_argument("-V",
                            "--version",
                            action="version",
                            version=program_version_message)

        # Process arguments
        args = parser.parse_args()

        run_name = args.name
        json_fp = dispatcher.expandPath(args.json)
        read_dir = args.rdir
        bam_dir = args.bdir
        out_dir = args.odir
        trim = args.trim
        qual = args.qual
        minlen = args.minlen
        aligner = args.aligner
        aligner_args = args.aargs
        depth = args.depth
        breadth = args.breadth
        proportion = args.proportion
        percid = args.percid
        adapters = dispatcher.expandPath(args.adapters)
        # Job submission settings are module-level state on dispatcher.
        dispatcher.job_manager = args.job_manager.upper()
        dispatcher.job_manager_args = args.sargs
        smor = args.smor

        # Default output -- and input, when neither directory was given --
        # to the current working directory.
        if not out_dir:
            out_dir = os.getcwd()
        if not (read_dir or bam_dir):
            read_dir = os.getcwd()

        # Normalize all user-supplied paths to absolute form.
        out_dir = dispatcher.expandPath(out_dir)
        if read_dir:
            read_dir = dispatcher.expandPath(read_dir)
        if bam_dir:
            bam_dir = dispatcher.expandPath(bam_dir)

        # Refuse to clobber an existing output directory without explicit
        # interactive confirmation (any answer starting with y/Y continues).
        if os.path.exists(out_dir):
            response = input(
                "\nOutput folder %s already exists!\nFiles in it may be overwritten!\nShould we continue anyway [N]? "
                % out_dir)
            if not re.match('^[Yy]', response):
                print("Operation cancelled!")
                quit()
        else:
            os.makedirs(out_dir)

        # Log everything (DEBUG and up) to asap.log in the output directory,
        # truncating any log from a previous run (filemode='w').
        logfile = os.path.join(out_dir, "asap.log")
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s %(levelname)-8s %(message)s',
                            datefmt='%m/%d/%Y %H:%M:%S',
                            filename=logfile,
                            filemode='w')

        logging.info(
            "Combining reads in %s and JSON file: %s for run: %s. Trim=%s Qual=%s"
            % (read_dir, json_fp, run_name, trim, qual))

        assay_list = assayInfo.parseJSON(json_fp)

        bam_list = []
        output_files = []
        final_jobs = []
        # Per-sample XML results are written under <out_dir>/xml.
        xml_dir = os.path.join(out_dir, "xml")
        if not os.path.exists(xml_dir):
            os.makedirs(xml_dir)

        # Pre-existing BAMs skip the trim/align stages entirely.
        if bam_dir:
            bam_list = dispatcher.findBams(bam_dir)

        if read_dir:
            # Build a combined reference FASTA from all assays and index it
            # for the chosen aligner.
            reference = assayInfo.generateReference(assay_list)
            ref_fasta = os.path.join(out_dir, "reference.fasta")
            reference.write(ref_fasta, 'fasta')
            index_job = dispatcher.indexFasta(ref_fasta, aligner)

            read_list = dispatcher.findReads(read_dir)
            for read in read_list:
                if (not read.reads):
                    #TODO: write out appropriate xml for samples with empty read files so they show up in results
                    continue
                if trim:
                    # Trim first; the alignment job is chained on the trim
                    # job id so it waits for trimming to finish.
                    trimmed_reads = dispatcher.trimAdapters(*read,
                                                            outdir=out_dir,
                                                            adapters=adapters,
                                                            quality=qual,
                                                            minlen=minlen)
                    (bam_file, job_id) = dispatcher.alignReadsToReference(
                        trimmed_reads.sample,
                        trimmed_reads.reads,
                        ref_fasta,
                        out_dir,
                        jobid=trimmed_reads.jobid,
                        aligner=aligner,
                        args=aligner_args)
                else:
                    # No trimming: align raw reads once the reference index
                    # job has completed.
                    (bam_file, job_id) = dispatcher.alignReadsToReference(
                        read.sample,
                        read.reads,
                        ref_fasta,
                        out_dir,
                        jobid=index_job,
                        aligner=aligner,
                        args=aligner_args)
                bam_list.append((read.sample, bam_file, job_id))

        # One analysis job per BAM; each waits on its producing job and
        # writes one XML file into xml_dir.
        for sample, bam, job in bam_list:
            (xml_file, job_id) = dispatcher.processBam(sample, json_fp, bam,
                                                       xml_dir, job, depth,
                                                       breadth, proportion,
                                                       percid, smor)
            output_files.append(xml_file)
            final_jobs.append(job_id)

        # The final job merges all per-sample XML once every analysis job
        # in final_jobs has finished.
        (final_output,
         job) = dispatcher.combineOutputFiles(run_name, xml_dir, out_dir,
                                              final_jobs)
        print(
            "All jobs are submitted, the final job id is: %s. Output will be in %s when ready."
            % (job, final_output))

        return 0
    except KeyboardInterrupt:
        ### handle keyboard interrupt ###
        return 0
    except Exception as e:
        # Re-raise in debug/test runs; otherwise report briefly on stderr
        # and exit with a non-zero status.
        if DEBUG or TESTRUN:
            raise (e)
        indent = len(program_name) * " "
        sys.stderr.write(program_name + ": " + repr(e) + "\n")
        sys.stderr.write(indent + "  for help use --help")
        return 2
Example no. 2
0
def main(argv=None): # IGNORE:C0111
    '''Command-line entry point for analyzing a single BAM file.

    Parses options, reads assay definitions from a JSON file, inspects the
    BAM with pysam, and writes per-assay/per-amplicon results (coverage,
    SNPs, ROIs, significance flags) as an XML document.

    :param argv: optional list of extra arguments; appended to ``sys.argv``
                 when given, otherwise ``sys.argv`` is parsed as-is.
    :return: 0 on success or keyboard interrupt, 2 on unhandled error.
    '''

    if argv is None:
        argv = sys.argv
    else:
        # NOTE(review): mutates the global sys.argv rather than parsing the
        # given list in isolation -- argparse below always reads sys.argv.
        sys.argv.extend(argv)

    program_name = os.path.basename(sys.argv[0])
    program_version = "v%s" % __version__
    program_build_date = str(__updated__)
    program_version_message = '%%(prog)s %s (%s)' % (program_version, program_build_date)
    # The short description is the second line of the module docstring,
    # whether this module is the executing script or an import.
    if __name__ == '__main__':
        program_shortdesc = __import__('__main__').__doc__.split("\n")[1]
    else:
        program_shortdesc = __doc__.split("\n")[1]
    #program_shortdesc = __import__('__main__').__doc__.split("\n")[1]
    program_license = '''%s

  Created by TGen North on %s.
  Copyright 2015 TGen North. All rights reserved.

  Available for academic and research use only under a license
  from The Translational Genomics Research Institute (TGen)
  that is free for non-commercial use.

  Distributed on an "AS IS" basis without warranties
  or conditions of any kind, either express or implied.

USAGE
''' % (program_shortdesc, str(__date__))

    try:
        # Setup argument parser
        parser = argparse.ArgumentParser(description=program_license, formatter_class=argparse.RawDescriptionHelpFormatter)
        required_group = parser.add_argument_group("required arguments")
        required_group.add_argument("-j", "--json", metavar="FILE", required=True, help="JSON file of assay descriptions. [REQUIRED]")
        required_group.add_argument("-b", "--bam", metavar="FILE", required=True, help="BAM file to analyze. [REQUIRED]")
        #required_group.add_argument("-r", "--ref", metavar="FILE", required=True, help="reference fasta file, should already be indexed. [REQUIRED]")
        #parser.add_argument("-o", "--out-dir", dest="odir", metavar="DIR", help="directory to write output files to. [default: `pwd`]")
        required_group.add_argument("-o", "--out", metavar="FILE", required=True, help="XML file to write output to. [REQUIRED]")
        #parser.add_argument("-n", "--name", help="sample name, if not provided it will be derived from BAM file")
        parser.add_argument("-d", "--depth", default=100, type=int, help="minimum read depth required to consider a position covered. [default: 100]")
        parser.add_argument("--breadth", default=0.8, type=float, help="minimum breadth of coverage required to consider an amplicon as present. [default: 0.8]")
        parser.add_argument("-p", "--proportion", default=0.1, type=float, help="minimum proportion required to call a SNP at a given position. [default: 0.1]")
        parser.add_argument("-V", "--version", action="version", version=program_version_message)

        # Process arguments
        args = parser.parse_args()

        json_fp = args.json
        bam_fp = args.bam
        out_fp = args.out
        depth = args.depth
        breadth = args.breadth
        proportion = args.proportion
        #ref_fp = args.ref
        #out_dir = args.odir
        #if not out_dir:
        #    out_dir = os.getcwd()

        #out_dir = dispatcher.expandPath(out_dir)
        #if not os.path.exists(out_dir):
        #    os.makedirs(out_dir)

        assay_list = assayInfo.parseJSON(json_fp)
        samdata = pysam.AlignmentFile(bam_fp, "rb")
        #reference = pysam.FastaFile(ref_fp)

        # Sample name: read-group ID from the BAM header when present,
        # otherwise the BAM filename without its extension.
        sample_dict = {}
        if 'RG' in samdata.header :
            sample_dict['name'] = samdata.header['RG'][0]['ID']
        else:
            sample_dict['name'] = os.path.splitext(os.path.basename(bam_fp))[0]
        sample_dict['mapped_reads'] = str(samdata.mapped)
        sample_dict['unmapped_reads'] = str(samdata.unmapped)
        sample_dict['unassigned_reads'] = str(samdata.nocoordinate)
        sample_node = ElementTree.Element("sample", sample_dict)

        #out_fp = os.path.join(out_dir, sample_dict['name']+".xml")

        # One <assay> node per assay, one <amplicon> node per amplicon.
        for assay in assay_list:
            assay_dict = {}
            assay_dict['name'] = assay.name
            assay_dict['type'] = assay.assay_type
            assay_node = ElementTree.SubElement(sample_node, "assay", assay_dict)
            ref_name = assay.name
            reverse_comp = assay.target.reverse_comp
            for amplicon in assay.target.amplicons:
                # BAM reference name: assay name plus variant suffix, if any.
                ref_name = assay.name + "_%s" % amplicon.variant_name if amplicon.variant_name else assay.name
                amplicon_dict = {}
                amplicon_dict['reads'] = str(samdata.count(ref_name))
                if amplicon.variant_name:
                    amplicon_dict['variant'] = amplicon.variant_name
                amplicon_node = ElementTree.SubElement(assay_node, "amplicon", amplicon_dict)
                if samdata.count(ref_name) == 0:
                    # No reads at all: flag the amplicon and report every
                    # resistance this amplicon/its SNPs/ROIs could have
                    # called as indeterminate.
                    significance_node = ElementTree.SubElement(amplicon_node, "significance", {"flag":"no coverage"})
                    #Check for indeterminate resistances
                    resistances = set()
                    if amplicon.significance and amplicon.significance.resistance:
                        resistances.add(amplicon.significance.resistance)
                    for snp in amplicon.SNPs:
                        if snp.significance.resistance:
                            resistances.add(snp.significance.resistance)
                    for roi in amplicon.ROIs:
                        if roi.significance.resistance:
                            resistances.add(roi.significance.resistance)
                    if resistances:
                        significance_node.set("resistance", ",".join(resistances))
                else:
                    if amplicon.significance or samdata.count(ref_name) < depth:
                        significance_node = ElementTree.SubElement(amplicon_node, "significance")
                        if amplicon.significance:
                            significance_node.text = amplicon.significance.message
                            if amplicon.significance.resistance:
                                significance_node.set("resistance", amplicon.significance.resistance)
                        if samdata.count(ref_name) < depth:
                            # Some reads, but fewer than the depth filter:
                            # flag low coverage and list indeterminate
                            # resistances, as in the no-coverage case.
                            significance_node.set("flag", "low coverage")
                            #Check for indeterminate resistances
                            resistances = set()
                            if amplicon.significance and amplicon.significance.resistance:
                                resistances.add(amplicon.significance.resistance)
                            for snp in amplicon.SNPs:
                                if snp.significance.resistance:
                                    resistances.add(snp.significance.resistance)
                            for roi in amplicon.ROIs:
                                if roi.significance.resistance:
                                    resistances.add(roi.significance.resistance)
                            if resistances:
                                significance_node.set("resistance", ",".join(resistances))

                    # max_depth raised so deep amplicon pileups aren't capped.
                    pileup = samdata.pileup(ref_name, max_depth=1000000)
                    amplicon_data = _process_pileup(pileup, amplicon, depth, proportion)
                    # breadth is a fraction (0-1); amplicon_data reports a
                    # percentage, hence the *100.
                    if float(amplicon_data['breadth']) < breadth*100:
                        significance_node = amplicon_node.find("significance")
                        if significance_node is None:
                            significance_node = ElementTree.SubElement(amplicon_node, "significance")
                        if not significance_node.get("flag"):
                            significance_node.set("flag", "insufficient breadth of coverage")
                    for snp in amplicon_data['SNPs']:
                        _add_snp_node(amplicon_node, snp)
                        # This would be helpful, but count_coverage is broken in python3
                        #print(samdata.count_coverage(ref_name, snp.position-1, snp.position))
                    # SNPs are emitted as child nodes above; the remaining
                    # entries are written as amplicon parameters.
                    del amplicon_data['SNPs']
                    _write_parameters(amplicon_node, amplicon_data)

                    for roi in amplicon.ROIs:
                        roi_dict = _process_roi(roi, samdata, ref_name, reverse_comp)
                        _add_roi_node(amplicon_node, roi, roi_dict, depth, proportion)

        samdata.close()
        _write_xml(sample_node, out_fp)

        return 0
    except KeyboardInterrupt:
        ### handle keyboard interrupt ###
        return 0
    except Exception as e:
        # Re-raise in debug/test runs; otherwise report briefly on stderr
        # and exit with a non-zero status.
        if DEBUG or TESTRUN:
            raise(e)
        indent = len(program_name) * " "
        sys.stderr.write(program_name + ": " + repr(e) + "\n")
        sys.stderr.write(indent + "  for help use --help")
        return 2
Example no. 3
0
def main(argv=None):  # IGNORE:C0111
    '''Command-line entry point for analyzing a single BAM file.

    Extended variant: adds percent-identity read filtering (``-i``) and
    SMOR analysis (``-s``) on top of the per-assay/per-amplicon XML report
    (coverage, SNPs, ROIs, significance flags).

    :param argv: optional list of extra arguments; appended to ``sys.argv``
                 when given, otherwise ``sys.argv`` is parsed as-is.
    :return: 0 on success or keyboard interrupt, 2 on unhandled error.
    '''

    if argv is None:
        argv = sys.argv
    else:
        # NOTE(review): mutates the global sys.argv rather than parsing the
        # given list in isolation -- argparse below always reads sys.argv.
        sys.argv.extend(argv)

    program_name = os.path.basename(sys.argv[0])
    program_version = "v%s" % __version__
    program_build_date = str(__updated__)
    program_version_message = '%%(prog)s %s (%s)' % (program_version,
                                                     program_build_date)
    # The short description is the second line of the module docstring,
    # whether this module is the executing script or an import.
    if __name__ == '__main__':
        program_shortdesc = __import__('__main__').__doc__.split("\n")[1]
    else:
        program_shortdesc = __doc__.split("\n")[1]
    #program_shortdesc = __import__('__main__').__doc__.split("\n")[1]
    program_license = '''%s

  Created by TGen North on %s.
  Copyright 2015 TGen North. All rights reserved.

  Available for academic and research use only under a license
  from The Translational Genomics Research Institute (TGen)
  that is free for non-commercial use.

  Distributed on an "AS IS" basis without warranties
  or conditions of any kind, either express or implied.

USAGE
''' % (program_shortdesc, str(__date__))

    try:
        # Setup argument parser
        parser = argparse.ArgumentParser(
            description=program_license,
            formatter_class=argparse.RawDescriptionHelpFormatter)
        required_group = parser.add_argument_group("required arguments")
        required_group.add_argument(
            "-j",
            "--json",
            metavar="FILE",
            required=True,
            help="JSON file of assay descriptions. [REQUIRED]")
        required_group.add_argument("-b",
                                    "--bam",
                                    metavar="FILE",
                                    required=True,
                                    help="BAM file to analyze. [REQUIRED]")
        #required_group.add_argument("-r", "--ref", metavar="FILE", required=True, help="reference fasta file, should already be indexed. [REQUIRED]")
        #parser.add_argument("-o", "--out-dir", dest="odir", metavar="DIR", help="directory to write output files to. [default: `pwd`]")
        required_group.add_argument(
            "-o",
            "--out",
            metavar="FILE",
            required=True,
            help="XML file to write output to. [REQUIRED]")
        #parser.add_argument("-n", "--name", help="sample name, if not provided it will be derived from BAM file")
        parser.add_argument(
            "-d",
            "--depth",
            default=100,
            type=int,
            help=
            "minimum read depth required to consider a position covered. [default: 100]"
        )
        parser.add_argument(
            "--breadth",
            default=0.8,
            type=float,
            help=
            "minimum breadth of coverage required to consider an amplicon as present. [default: 0.8]"
        )
        parser.add_argument(
            "-p",
            "--proportion",
            default=0.1,
            type=float,
            help=
            "minimum proportion required to call a SNP at a given position. [default: 0.1]"
        )
        parser.add_argument(
            "-i",
            "--identity",
            dest="percid",
            default=0,
            type=float,
            help=
            "minimum percent identity required to align a read to a reference amplicon sequence. [default: 0]"
        )
        parser.add_argument(
            "-s",
            "--smor",
            action="store_true",
            default=False,
            help=
            "perform SMOR analysis with overlapping reads. [default: False]")
        parser.add_argument("-V",
                            "--version",
                            action="version",
                            version=program_version_message)

        # Process arguments
        args = parser.parse_args()

        json_fp = args.json
        bam_fp = args.bam
        out_fp = args.out
        depth = args.depth
        breadth = args.breadth
        proportion = args.proportion
        percid = args.percid
        smor = args.smor
        #ref_fp = args.ref
        #out_dir = args.odir
        #if not out_dir:
        #    out_dir = os.getcwd()

        #out_dir = dispatcher.expandPath(out_dir)
        #if not os.path.exists(out_dir):
        #    os.makedirs(out_dir)

        assay_list = assayInfo.parseJSON(json_fp)
        samdata = pysam.AlignmentFile(bam_fp, "rb")
        #reference = pysam.FastaFile(ref_fp)

        # Sample name: read-group ID from the BAM header when present,
        # otherwise the BAM filename without its extension. The filter
        # settings are also recorded on the <sample> node.
        sample_dict = {}
        if 'RG' in samdata.header:
            sample_dict['name'] = samdata.header['RG'][0]['ID']
        else:
            sample_dict['name'] = os.path.splitext(os.path.basename(bam_fp))[0]
        sample_dict['mapped_reads'] = str(samdata.mapped)
        sample_dict['unmapped_reads'] = str(samdata.unmapped)
        sample_dict['unassigned_reads'] = str(samdata.nocoordinate)
        sample_dict['depth_filter'] = str(depth)
        sample_dict['proportion_filter'] = str(proportion)
        sample_dict['breadth_filter'] = str(breadth)
        if percid:
            sample_dict['identity_filter'] = str(percid)
        sample_dict['json_file'] = json_fp
        sample_dict['bam_file'] = bam_fp
        sample_node = ElementTree.Element("sample", sample_dict)

        #out_fp = os.path.join(out_dir, sample_dict['name']+".xml")

        # One <assay> node per assay, one <amplicon> node per amplicon.
        for assay in assay_list:
            assay_dict = {}
            assay_dict['name'] = assay.name
            assay_dict['type'] = assay.assay_type
            assay_dict['function'] = assay.target.function
            assay_dict['gene'] = assay.target.gene_name or ""
            assay_node = ElementTree.SubElement(sample_node, "assay",
                                                assay_dict)
            ref_name = assay.name
            reverse_comp = assay.target.reverse_comp
            for amplicon in assay.target.amplicons:
                temp_file = None
                # BAM reference name: assay name plus variant suffix, if any.
                ref_name = assay.name + "_%s" % amplicon.variant_name if amplicon.variant_name else assay.name
                amplicon_dict = {}
                if percid:
                    # Identity filtering rewrites the passing reads to a
                    # temporary BAM; samdata is rebound to it for the rest
                    # of this amplicon and cleaned up at the loop's end.
                    (temp_file, discarded_reads) = _verify_percent_identity(
                        samdata, ref_name, amplicon, percid)
                    samdata = pysam.AlignmentFile(temp_file, "rb")
                    amplicon_dict['discarded_reads'] = str(discarded_reads)
                elif samdata.closed:
                    # A previous amplicon's temp BAM was closed; reopen the
                    # original input.
                    samdata = pysam.AlignmentFile(bam_fp, "rb")
                amplicon_dict['reads'] = str(samdata.count(ref_name))
                if amplicon.variant_name:
                    amplicon_dict['variant'] = amplicon.variant_name
                amplicon_node = ElementTree.SubElement(assay_node, "amplicon",
                                                       amplicon_dict)
                if samdata.count(ref_name) == 0:
                    # No reads at all: flag the amplicon and report every
                    # resistance this amplicon/its SNPs/ROIs could have
                    # called as indeterminate.
                    significance_node = ElementTree.SubElement(
                        amplicon_node, "significance", {"flag": "no coverage"})
                    #Check for indeterminate resistances
                    resistances = set()
                    if amplicon.significance and amplicon.significance.resistance:
                        resistances.add(amplicon.significance.resistance)
                    for snp in amplicon.SNPs:
                        if snp.significance.resistance:
                            resistances.add(snp.significance.resistance)
                    for roi in amplicon.ROIs:
                        if roi.significance.resistance:
                            resistances.add(roi.significance.resistance)
                    if resistances:
                        significance_node.set("resistance",
                                              ",".join(resistances))
                else:
                    if amplicon.significance or samdata.count(
                            ref_name) < depth:
                        significance_node = ElementTree.SubElement(
                            amplicon_node, "significance")
                        if amplicon.significance:
                            significance_node.text = amplicon.significance.message
                            if amplicon.significance.resistance:
                                significance_node.set(
                                    "resistance",
                                    amplicon.significance.resistance)
                        if samdata.count(ref_name) < depth:
                            # Some reads, but fewer than the depth filter:
                            # flag low coverage and list indeterminate
                            # resistances, as in the no-coverage case.
                            significance_node.set("flag", "low coverage")
                            #Check for indeterminate resistances
                            resistances = set()
                            if amplicon.significance and amplicon.significance.resistance:
                                resistances.add(
                                    amplicon.significance.resistance)
                            for snp in amplicon.SNPs:
                                if snp.significance.resistance:
                                    resistances.add(
                                        snp.significance.resistance)
                            for roi in amplicon.ROIs:
                                if roi.significance.resistance:
                                    resistances.add(
                                        roi.significance.resistance)
                            if resistances:
                                significance_node.set("resistance",
                                                      ",".join(resistances))

                    # max_depth raised so deep amplicon pileups aren't capped.
                    pileup = samdata.pileup(ref_name, max_depth=1000000)
                    if smor:
                        amplicon_data = _process_pileup_SMOR(
                            pileup, amplicon, depth, proportion)
                    else:
                        amplicon_data = _process_pileup(
                            pileup, amplicon, depth, proportion)
                    # breadth is a fraction (0-1); amplicon_data reports a
                    # percentage, hence the * 100.
                    if float(amplicon_data['breadth']) < breadth * 100:
                        significance_node = amplicon_node.find("significance")
                        if significance_node is None:
                            significance_node = ElementTree.SubElement(
                                amplicon_node, "significance")
                        if not significance_node.get("flag"):
                            significance_node.set(
                                "flag", "insufficient breadth of coverage")
                    for snp in amplicon_data['SNPs']:
                        _add_snp_node(amplicon_node, snp)
                        # This would be helpful, but count_coverage is broken in python3
                        #print(samdata.count_coverage(ref_name, snp.position-1, snp.position))
                    # SNPs are emitted as child nodes above; the remaining
                    # entries are written as amplicon parameters.
                    del amplicon_data['SNPs']
                    _write_parameters(amplicon_node, amplicon_data)

                    for roi in amplicon.ROIs:
                        if smor:
                            roi_dict = _process_roi_SMOR(
                                roi, samdata, ref_name, reverse_comp)
                        else:
                            roi_dict = _process_roi(roi, samdata, ref_name,
                                                    reverse_comp)
                        _add_roi_node(amplicon_node, roi, roi_dict, depth,
                                      proportion)

                # Remove the per-amplicon identity-filtered BAM and its index.
                if temp_file:
                    samdata.close()
                    os.remove(temp_file)
                    os.remove(temp_file + ".bai")

        # NOTE(review): AlignmentFile.is_open() is not available in all pysam
        # releases (elsewhere this file uses the .closed attribute) -- verify
        # against the pinned pysam version.
        if samdata.is_open():
            samdata.close()
        _write_xml(sample_node, out_fp)

        return 0
    except KeyboardInterrupt:
        ### handle keyboard interrupt ###
        return 0
    except Exception as e:
        # Re-raise in debug/test runs; otherwise report briefly on stderr
        # and exit with a non-zero status.
        if DEBUG or TESTRUN:
            raise (e)
        indent = len(program_name) * " "
        sys.stderr.write(program_name + ": " + repr(e) + "\n")
        sys.stderr.write(indent + "  for help use --help")
        return 2
Example no. 4
0
def main(argv=None):  # IGNORE:C0111
    '''Command line entry point.

    Parses arguments, prepares the output directory and logging, dispatches
    read trimming/alignment and per-sample BAM-processing jobs, then submits
    a final job that combines all per-sample XML output files.

    Returns 0 on success or keyboard interrupt, 1 if the user declines to
    overwrite an existing output directory, and 2 on unexpected errors
    (unless DEBUG or TESTRUN is set, in which case the error is re-raised).
    '''
    if argv is None:
        argv = sys.argv
    else:
        sys.argv.extend(argv)

    program_name = os.path.basename(sys.argv[0])
    program_version = "v%s" % __version__
    program_build_date = str(__updated__)
    program_version_message = '%%(prog)s %s (%s)' % (program_version,
                                                     program_build_date)
    # Pull the one-line description from the module docstring, whether this
    # module is executed as a script or imported.
    if __name__ == '__main__':
        program_shortdesc = __import__('__main__').__doc__.split("\n")[1]
    else:
        program_shortdesc = __doc__.split("\n")[1]
    program_license = '''%s

  Created by TGen North on %s.
  Copyright 2015 TGen North. All rights reserved.

  Available for academic and research use only under a license
  from The Translational Genomics Research Institute (TGen)
  that is free for non-commercial use.

  Distributed on an "AS IS" basis without warranties
  or conditions of any kind, either express or implied.

USAGE
''' % (program_shortdesc, str(__date__))

    try:
        # Setup argument parser and process arguments
        parser = _build_arg_parser(program_license, program_version_message)
        args = parser.parse_args()

        run_name = args.name
        json_fp = dispatcher.expandPath(args.json)
        read_dir = args.rdir
        bam_dir = args.bdir
        out_dir = args.odir
        trim = args.trim
        qual = args.qual
        minlen = args.minlen
        aligner = args.aligner
        aligner_args = args.aargs
        depth = args.depth
        breadth = args.breadth
        proportion = args.proportion
        adapters = dispatcher.expandPath(args.adapters)

        # Default output -- and, absent a BAM directory, input reads -- to the
        # current working directory.
        if not out_dir:
            out_dir = os.getcwd()
        if not (read_dir or bam_dir):
            read_dir = os.getcwd()

        out_dir = dispatcher.expandPath(out_dir)
        if read_dir:
            read_dir = dispatcher.expandPath(read_dir)
        if bam_dir:
            bam_dir = dispatcher.expandPath(bam_dir)

        if os.path.exists(out_dir):
            response = input(
                "\nOutput folder %s already exists!\nFiles in it may be overwritten!\nShould we continue anyway [N]? " % out_dir)
            if not re.match('^[Yy]', response):
                print("Operation cancelled!")
                # Explicit nonzero return instead of quit(): quit() is only
                # injected by the `site` module and is not guaranteed to
                # exist (e.g. when running under `python -S`).
                return 1
        else:
            os.makedirs(out_dir)

        logfile = os.path.join(out_dir, "asap.log")
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s %(levelname)-8s %(message)s',
                            datefmt='%m/%d/%Y %H:%M:%S',
                            filename=logfile,
                            filemode='w')

        # Lazy %-style args: the message is only formatted if actually emitted.
        logging.info(
            "Combining reads in %s and JSON file: %s for run: %s. Trim=%s Qual=%s",
            read_dir, json_fp, run_name, trim, qual)

        assay_list = assayInfo.parseJSON(json_fp)

        bam_list = []
        output_files = []
        final_jobs = []
        xml_dir = os.path.join(out_dir, "xml")
        if not os.path.exists(xml_dir):
            os.makedirs(xml_dir)

        if bam_dir:
            bam_list = dispatcher.findBams(bam_dir)

        if read_dir:
            # Build a combined reference from all assays and index it once;
            # every alignment job depends on the index job.
            reference = assayInfo.generateReference(assay_list)
            ref_fasta = os.path.join(out_dir, "reference.fasta")
            reference.write(ref_fasta, 'fasta')
            index_job = dispatcher.indexFasta(ref_fasta, aligner)

            read_list = dispatcher.findReads(read_dir)
            for read in read_list:
                if not read.reads:
                    # TODO: write out appropriate xml for samples with empty
                    # read files so they show up in results
                    continue
                if trim:
                    trimmed_reads = dispatcher.trimAdapters(
                        *read, outdir=out_dir, adapters=adapters,
                        quality=qual, minlen=minlen)
                    (bam_file, job_id) = dispatcher.alignReadsToReference(
                        trimmed_reads.sample, trimmed_reads.reads, ref_fasta,
                        out_dir, jobid=trimmed_reads.jobid, aligner=aligner,
                        args=aligner_args)
                else:
                    (bam_file, job_id) = dispatcher.alignReadsToReference(
                        read.sample, read.reads, ref_fasta, out_dir,
                        jobid=index_job, aligner=aligner, args=aligner_args)
                bam_list.append((read.sample, bam_file, job_id))

        # One XML-producing processing job per BAM; the combine job waits on
        # all of them.
        for sample, bam, job in bam_list:
            (xml_file, job_id) = dispatcher.processBam(
                sample, json_fp, bam, xml_dir, job, depth, breadth, proportion)
            output_files.append(xml_file)
            final_jobs.append(job_id)

        (final_output, job) = dispatcher.combineOutputFiles(
            run_name, xml_dir, out_dir, final_jobs)
        print("All jobs are submitted, the final job id is: %s. Output will be in %s when ready." % (job, final_output))

        return 0
    except KeyboardInterrupt:
        # Treat Ctrl-C as a clean, user-requested stop.
        return 0
    except Exception as e:
        if DEBUG or TESTRUN:
            raise  # bare raise preserves the original traceback
        indent = len(program_name) * " "
        sys.stderr.write(program_name + ": " + repr(e) + "\n")
        sys.stderr.write(indent + "  for help use --help")
        return 2


def _build_arg_parser(program_license, program_version_message):
    '''Build the ArgumentParser for this tool.

    All option names, destinations, defaults, and help strings are unchanged
    from the original inline construction.
    '''
    parser = argparse.ArgumentParser(
        description=program_license,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    required_group = parser.add_argument_group("required arguments")
    required_group.add_argument("-n", "--name", required=True,
                                help="name for this run. [REQUIRED]")
    required_group.add_argument("-j", "--json", required=True,
                                help="JSON file of assay descriptions. [REQUIRED]")
    optional_group = parser.add_argument_group("optional arguments")
    # Reads and BAMs are alternative inputs; only one directory may be given.
    reads_bams_group = optional_group.add_mutually_exclusive_group()
    reads_bams_group.add_argument("-r", "--read-dir", dest="rdir", metavar="DIR",
                                  help="directory of read files to analyze.")
    reads_bams_group.add_argument("-b", "--bam-dir", dest="bdir", metavar="DIR",
                                  help="directory of bam files to analyze.")
    optional_group.add_argument("-o", "--out-dir", dest="odir", metavar="DIR",
                                help="directory to write output files to. [default: `pwd`]")
    trim_group = parser.add_argument_group("read trimming options")
    # --trim/--no-trim share the "trim" destination (default True).
    on_off_group = trim_group.add_mutually_exclusive_group()
    on_off_group.add_argument("--trim", action="store_true", default=True,
                              help="perform adapter trimming on reads. [default: True]")
    on_off_group.add_argument("--no-trim", dest="trim", action="store_false",
                              help="do not perform adapter trimming.")
    trim_group.add_argument("-s", "--adapter-sequences", dest="adapters",
                            default="/scratch/dlemmer/ASAP/illumina_adapters_all.fasta",
                            help="location of the adapter sequence file to use for trimming. [default: /scratch/dlemmer/ASAP/illumina_adapters_all.fasta]")
    trim_group.add_argument("-q", "--qual", nargs="?", const="SLIDINGWINDOW:5:20",
                            help="perform quality trimming [default: False], optional parameter can be used to customize quality trimming parameters to trimmomatic. [default: SLIDINGWINDOW:5:20]")
    trim_group.add_argument("-m", "--minlen", metavar="LEN", default=80, type=int,
                            help="minimum read length to keep after trimming. [default: 80]")
    align_group = parser.add_argument_group("read mapping options")
    align_group.add_argument("-a", "--aligner", default="bowtie2",
                             help="aligner to use for read mapping, supports bowtie2, novoalign, and bwa. [default: bowtie2]")
    align_group.add_argument("--aligner-args", dest="aargs", metavar="ARGS", default='',
                             help="additional arguments to pass to the aligner, enclosed in \"\".")
    align_group.add_argument("-d", "--depth", default=100, type=int,
                             help="minimum read depth required to consider a position covered. [default: 100]")
    align_group.add_argument("--breadth", default=0.8, type=float,
                             help="minimum breadth of coverage required to consider an amplicon as present. [default: 0.8]")
    align_group.add_argument("-p", "--proportion", default=0.1, type=float,
                             help="minimum proportion required to call a SNP at a given position. [default: 0.1]")
    parser.add_argument("-V", "--version", action="version",
                        version=program_version_message)
    return parser