def main():
    """Symlink paired fastq files into one output directory per sample.

    Collects every ``*.fq.gz`` file in ``args.input_dir``, sorts the paths,
    and walks them in pairs via ``pairwise`` (defined elsewhere in this file;
    presumably yields consecutive (read 1, read 2) tuples -- confirm).  For
    each pair the sample name is taken from the part of the file name in
    front of the ``.N.N.fq.gz``-style suffix, a directory of that name is
    created under ``args.output_dir``, and both files are symlinked into it.

    Raises:
        IOError: if a file name does not match the expected pattern, or the
            two files of a pair yield different sample names.
        OSError: from ``os.mkdir`` / ``os.symlink`` (e.g. the sample
            directory already exists).
    """
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    log.info("Output files are in {}".format(args.output_dir))
    reads = sorted(glob.glob("{}/*.fq.gz".format(args.input_dir)))
    # Raw string so the \d escapes reach the regex engine unchanged; the
    # pattern itself is exactly the one used before.  Compiled once because
    # it is applied to every file.
    name_pattern = re.compile(r"(.*).\d.\d.fq.gz")
    for sample in pairwise(reads):
        pair_names = []
        for fastq in (sample[0], sample[1]):
            filename = os.path.basename(fastq)
            match = name_pattern.search(filename)
            if match is None:
                # Fail with a readable message instead of an AttributeError
                # on None.
                raise IOError(
                    "Cannot parse a sample name from {}".format(filename))
            pair_names.append(match.group(1))
        # Raise explicitly: an assert would be stripped under ``python -O``
        # and would surface as AssertionError rather than IOError.
        if pair_names[0] != pair_names[1]:
            raise IOError(
                "Input fastq names do not match (or reads are not paired)")
        sample_name = pair_names[0]
        log.info("Creating output and symlinking {}".format(sample_name))
        outdir = os.path.join(args.output_dir, sample_name)
        os.mkdir(outdir)
        for fastq in sample:
            os.symlink(fastq, os.path.join(outdir, os.path.basename(fastq)))
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
Exemplo n.º 2
0
def main():
    """Index the inputs, run the GATK steps, and write the PASS-only VCF.

    Steps, in order: samtools index of the BAM and faidx of the reference,
    picard reference dictionary, then the gatk helpers for merged intervals,
    realignment, SNP calling, indel calling and variant filtration.  Finally
    the filtered VCF is copied to ``<filtered name>.PASSING.vcf`` keeping
    header lines (starting with ``#``) and records whose 7th tab-separated
    column equals ``PASS``.
    """
    args = get_args()
    log, script_name = setup_logging(args)

    # index the BAM and the reference, then build the picard dictionary
    ref_dir, ref_file = os.path.split(args.input_reference)
    samtools.index(log, args.input_bam)
    samtools.faidx(log, args.input_reference)
    picard.create_reference_dict(
        log, ref_file, ref_dir, args.input_reference
    )

    # GATK: intervals -> realignment -> SNP/indel calls -> filtration
    merged_intervals = gatk.get_merged_intervals(
        log, args.input_reference, args.input_bam, args.cores,
        args.output_dir
    )
    realigned = gatk.realign_bam(
        log, args.input_reference, args.input_bam, merged_intervals,
        args.output_dir
    )
    snps_vcf = gatk.call_snps(
        log, args.input_reference, realigned, args.cores, args.output_dir
    )
    indels_vcf = gatk.call_indels(
        log, args.input_reference, realigned, args.cores, args.output_dir
    )
    filtered_vcf = gatk.variant_filtration(
        log, args.input_reference, realigned, snps_vcf, indels_vcf,
        args.output_dir
    )

    # keep the header plus every record flagged PASS
    log.info("Creating a file of PASSING SNP calls")
    passing_vcf = os.path.splitext(filtered_vcf)[0] + ".PASSING.vcf"
    with open(filtered_vcf, 'r') as vcf_in, open(passing_vcf, 'w') as vcf_out:
        for record in vcf_in:
            if (record.startswith("#")
                    or record.strip().split("\t")[6] == "PASS"):
                vcf_out.write(record)

    # closing banner
    log.info(" Completed {} ".format(script_name).center(65, "="))
Exemplo n.º 3
0
def main():
    """Run the GATK coverage step for the input BAM.

    Obtains an interval file for ``args.input_reference`` from
    ``get_interval_file`` and passes it, with the reference, the BAM and the
    output directory, to ``gatk.coverage``.
    """
    args = get_args()
    log, script_name = setup_logging(args)

    # the interval file is derived from the reference before coverage runs
    intervals = get_interval_file(log, args.input_reference)
    gatk.coverage(
        log,
        args.input_reference,
        args.input_bam,
        intervals,
        args.output_dir,
    )

    # closing banner
    log.info(" Completed {} ".format(script_name).center(65, "="))
def main():
    """Merge every BAM found for the input directory into one.

    The BAM list comes from ``get_all_bams(args.input_dir,
    args.follow_links)``; the sample label is the last path component of
    ``args.input_dir``.  Both are handed to ``picard.merge_many_bams``
    together with ``args.output_dir``.
    """
    args = get_args()
    log, script_name = setup_logging(args)

    bam_files = get_all_bams(args.input_dir, args.follow_links)
    sample_label = os.path.basename(args.input_dir)
    picard.merge_many_bams(log, sample_label, bam_files, args.output_dir)

    # closing banner
    log.info(" Completed {} ".format(script_name).center(65, "="))
def main():
    """Log a summary of filter statuses and PASS SNP counts for a VCF.

    Reads ``args.input_vcf`` line by line, skipping header lines (those
    starting with ``#``).  For every record it tallies the value of the 7th
    tab-separated column (the FILTER field in VCF) and, when that value is
    ``PASS``, tallies the first column (CHROM in VCF; treated here as the
    locus name).  It then logs one line per status followed by the total,
    mean, 95% confidence interval, max and min of PASS SNPs per locus.  If
    ``args.output_file`` is set, the per-locus counts are also written there
    as ``locus,count`` CSV.

    Raises:
        IndexError: if a record has fewer than 7 tab-separated columns.
    """
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    log.info("{}".format('-' * 65))
    cnt_loci = Counter()
    cnt_status = Counter()
    with open(args.input_vcf, 'r') as infile:
        for line in infile:
            if line.startswith("#"):
                continue
            ls = line.strip().split("\t")
            cnt_status[ls[6]] += 1
            if ls[6] == "PASS":
                cnt_loci[ls[0]] += 1
    # items()/values()/sorted() behave the same on Python 2 and 3, unlike
    # iteritems() and keys().sort().
    snps_per_locus = numpy.array(list(cnt_loci.values()))
    for status in sorted(cnt_status):
        log.info("{0: <6}\tSNPs with {1}".format(cnt_status[status], status))
    log.info("{}".format('-' * 65))
    log.info("Total count of PASS loci\t = {}".format(len(snps_per_locus)))
    log.info("Total count of PASS SNPs\t = {}".format(sum(snps_per_locus)))
    if len(snps_per_locus) > 0:
        log.info("Mean PASS SNPs per locus\t = {}".format(round(numpy.mean(snps_per_locus), 2)))
        # 1.96 * standard error of the mean (sample std, ddof=1); this is
        # nan when only a single locus is present.
        confidence_interval = 1.96 * (numpy.std(snps_per_locus, ddof=1) / numpy.sqrt(len(snps_per_locus)))
        log.info("95 CI PASS SNPs per locus\t = {}".format(round(confidence_interval, 2)))
        # Max must report the maximum and Min the minimum; the two calls
        # were previously swapped.
        log.info("Max PASS SNPs per locus\t = {}".format(numpy.max(snps_per_locus)))
        log.info("Min PASS SNPs per locus\t = {}".format(numpy.min(snps_per_locus)))
    else:
        # numpy.max/min raise ValueError on an empty array, and the mean and
        # CI are undefined, so report the situation instead of crashing.
        log.info("No PASS SNPs found; skipping per-locus statistics")
    if args.output_file is not None:
        with open(args.output_file, "w") as outfile:
            outfile.write("locus,count\n")
            for locus, cnt in cnt_loci.items():
                outfile.write("{},{}\n".format(locus, cnt))
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
Exemplo n.º 6
0
def main():
    """Align each configured individual's reads with BWA and tidy the BAMs.

    Reads the config file named by ``args.config``, asks ``get_input_data``
    for the reference and the list of (sample, directory) individuals, and
    for every individual:

    * creates ``<args.output>/<sample>``;
    * aligns paired reads (r1 + r2) and/or singleton reads, using the
      BWA-MEM helpers when ``args.bwa_mem`` is set and the other bwa
      helpers otherwise;
    * cleans each resulting BAM and adds read-group header info using the
      sample's entry in the ``flowcell`` config section;
    * merges the paired and singleton BAMs when both exist.

    Raises:
        IOError: if neither a paired nor a singleton BAM was produced for a
            sample.
    """
    # get args and options
    args = get_args()
    # setup logging
    log, script_name = setup_logging(args)
    log.info(" Starting {} ".format(script_name).center(65, "="))

    # get the config file data (optionxform = str keeps option names
    # case-sensitive)
    conf = ConfigParser.ConfigParser(allow_no_value=True)
    conf.optionxform = str
    conf.read(args.config)

    # get the input data
    log.info("Getting input filenames and creating output directories")
    reference, individuals = get_input_data(log, conf, args.output)
    flowcells = dict(conf.items("flowcell"))
    if args.bwa_mem:
        log.info("You are running BWA-MEM")

    for sample, reads_dir in individuals:
        paired_bam = False
        single_bam = False
        # pretty print taxon status
        log.info(" Processing {} ".format(sample).center(65, "-"))
        # make a directory for sample-specific assemblies
        sample_dir = os.path.join(args.output, sample)
        os.makedirs(sample_dir)
        # determine how many files we're dealing with
        fastq = get_input_files(reads_dir, args.subfolder, log)

        if fastq.r1 and fastq.r2:
            # bwa align r1 and r2
            align_pe = bwa.mem_pe_align if args.bwa_mem else bwa.pe_align
            paired_bam = align_pe(
                log, sample, sample_dir, reference, args.cores,
                fastq.r1, fastq.r2
            )
            # clean the bam up (MAPq 0 and trim overlapping reads)
            paired_bam = picard.clean_up_bam(
                log, sample, sample_dir, paired_bam, "pe"
            )
            # tag with the sample's flowcell id
            paired_bam = picard.add_rg_header_info(
                log, sample, sample_dir, flowcells[sample], paired_bam, "pe"
            )
            # NOTE: duplicates are deliberately not removed here; they are
            # expected and were filtered before alignment (after demuxing).

        if fastq.singleton:
            # bwa align singleton reads
            align_se = bwa.mem_se_align if args.bwa_mem else bwa.se_align
            single_bam = align_se(
                log, sample, sample_dir, reference, args.cores,
                fastq.singleton
            )
            # clean the bam up (MAPq 0 and trim overlapping reads)
            single_bam = picard.clean_up_bam(
                log, sample, sample_dir, single_bam, "se"
            )
            # tag with the sample's flowcell id
            single_bam = picard.add_rg_header_info(
                log, sample, sample_dir, flowcells[sample], single_bam, "se"
            )
            # NOTE: duplicates are deliberately not removed here; they are
            # expected and were filtered before alignment (after demuxing).

        # merge when both BAMs exist, otherwise fall back to the singleton
        if single_bam:
            if paired_bam:
                paired_bam = picard.merge_two_bams(
                    log, sample, sample_dir, paired_bam, single_bam
                )
            else:
                paired_bam = single_bam
        if not paired_bam:
            raise IOError("There is no BAM file.  Check bwa log files for problems.")

    # end
    log.info(" Completed {} ".format(script_name).center(65, "="))