def main():
    """Create one directory per sample and symlink its paired fastq files into it.

    Globs ``*.fq.gz`` in ``args.input_dir``, sorts the paths, groups them with
    ``pairwise`` and, for each group, derives a sample name from the first two
    file names.  A directory named after the sample is created under
    ``args.output_dir`` and every file of the group is symlinked into it.

    Raises:
        IOError: if a fastq file name does not match the expected pattern, or
            if the two files of a pair yield different sample names.
        OSError: propagated from ``os.mkdir`` / ``os.symlink`` (for example
            when the sample directory already exists).
    """
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    log.info("Output files are in {}".format(args.output_dir))
    reads = sorted(glob.glob("{}/*.fq.gz".format(args.input_dir)))
    # Raw string so that ``\d`` is unambiguously a regex digit class and not a
    # string escape.  The unescaped dots (match any character) are kept exactly
    # as before because the separator used in the read names is not known here.
    name_pattern = r"(.*).\d.\d.fq.gz"
    for sample in pairwise(reads):
        sample_names = []
        for fastq in (sample[0], sample[1]):
            filename = os.path.basename(fastq)
            match = re.search(name_pattern, filename)
            if match is None:
                # fail with a readable message instead of an AttributeError on None
                raise IOError(
                    "Cannot parse a sample name from fastq name {}".format(filename))
            sample_names.append(match.groups()[0])
        read_1_sample_name, read_2_sample_name = sample_names
        # An explicit raise (not ``assert``) so the check survives ``python -O``.
        if read_1_sample_name != read_2_sample_name:
            raise IOError(
                "Input fastq names do not match (or reads are not paired)")
        sample_name = read_1_sample_name
        log.info("Creating output and symlinking {}".format(sample_name))
        outdir = os.path.join(args.output_dir, sample_name)
        os.mkdir(outdir)
        for fastq in sample:
            os.symlink(fastq, os.path.join(outdir, os.path.basename(fastq)))
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
    """Index the inputs, run the GATK steps and write a PASS-only variant file.

    The BAM and reference are indexed through ``samtools``, a reference
    dictionary is created through ``picard``, and the ``gatk`` helpers are then
    called in order: merged intervals, realignment, SNP calling, indel calling
    and variant filtration.  Finally the filtered VCF is copied to a sibling
    file ending in ``.PASSING.vcf`` that keeps every header line plus the
    records whose seventh tab-separated field equals ``PASS`` (presumably the
    VCF FILTER column).
    """
    args = get_args()
    # logging comes first so every later step can report through `log`
    log, my_name = setup_logging(args)
    ref_path = args.input_reference
    bam_path = args.input_bam
    out_dir = args.output_dir
    reference_dir, reference = os.path.split(ref_path)
    # samtools indexes for the alignment and the reference
    samtools.index(log, bam_path)
    samtools.faidx(log, ref_path)
    # picard reference dictionary
    picard.create_reference_dict(log, reference, reference_dir, ref_path)
    # GATK steps, each consuming the output of the previous ones
    intervals = gatk.get_merged_intervals(
        log, ref_path, bam_path, args.cores, out_dir)
    realigned_bam = gatk.realign_bam(
        log, ref_path, bam_path, intervals, out_dir)
    raw_snps_vcf = gatk.call_snps(
        log, ref_path, realigned_bam, args.cores, out_dir)
    raw_indels_vcf = gatk.call_indels(
        log, ref_path, realigned_bam, args.cores, out_dir)
    filtered_variants_vcf = gatk.variant_filtration(
        log, ref_path, realigned_bam, raw_snps_vcf, raw_indels_vcf, out_dir)
    # keep header lines and PASS records only
    log.info("Creating a file of PASSING SNP calls")
    stem = os.path.splitext(filtered_variants_vcf)[0]
    passing = stem + ".PASSING.vcf"
    with open(filtered_variants_vcf, 'r') as infile, open(passing, 'w') as outfile:
        for record in infile:
            # header lines short-circuit, so only data lines are split
            if record.startswith("#") or record.strip().split("\t")[6] == "PASS":
                outfile.write(record)
    # end
    banner = " Completed {} ".format(my_name)
    log.info(banner.center(65, "="))
def main():
    """Compute coverage for the input BAM through ``gatk.coverage``.

    The interval file is obtained from ``get_interval_file`` for the input
    reference and handed to ``gatk.coverage`` together with the reference, the
    BAM and the output directory.
    """
    args = get_args()
    # logging must exist before any helper is called
    log, my_name = setup_logging(args)
    intervals = get_interval_file(log, args.input_reference)
    # coverage over the intervals derived from the reference
    gatk.coverage(
        log,
        args.input_reference,
        args.input_bam,
        intervals,
        args.output_dir,
    )
    # closing banner
    log.info(" Completed {} ".format(my_name).center(65, "="))
def main():
    """Merge every BAM found for the input directory into one through picard.

    The BAM list comes from ``get_all_bams`` (honouring ``args.follow_links``)
    and the sample name is the basename of ``args.input_dir``.
    """
    args = get_args()
    # logging must exist before any helper is called
    log, my_name = setup_logging(args)
    bam_files = get_all_bams(args.input_dir, args.follow_links)
    sample_name = os.path.basename(args.input_dir)
    picard.merge_many_bams(log, sample_name, bam_files, args.output_dir)
    # closing banner
    log.info(" Completed {} ".format(my_name).center(65, "="))
def main():
    """Log a summary of filter statuses and per-locus PASS SNP counts for a VCF.

    Every non-header line of ``args.input_vcf`` is split on tabs.  The seventh
    field is tallied as the record's status and, for ``PASS`` records, the
    first field is tallied as the locus (presumably the VCF FILTER and CHROM
    columns respectively).  The per-status counts and summary statistics of
    the PASS counts per locus are written to the log.  When
    ``args.output_file`` is given, a ``locus,count`` CSV of the PASS counts is
    written as well.
    """
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    log.info("{}".format('-' * 65))
    cnt_loci = Counter()
    cnt_status = Counter()
    with open(args.input_vcf, 'r') as infile:
        for line in infile:
            if line.startswith("#"):
                continue
            ls = line.strip().split("\t")
            cnt_status[ls[6]] += 1
            if ls[6] == "PASS":
                cnt_loci[ls[0]] += 1
    # values()/items()/sorted() behave the same on Python 2 and Python 3,
    # unlike iteritems() and sorting the result of keys() in place
    snps_per_locus = numpy.array(list(cnt_loci.values()))
    for status in sorted(cnt_status):
        log.info("{0: <6}\tSNPs with {1}".format(cnt_status[status], status))
    log.info("{}".format('-' * 65))
    log.info("Total count of PASS loci\t = {}".format(len(snps_per_locus)))
    log.info("Total count of PASS SNPs\t = {}".format(sum(snps_per_locus)))
    log.info("Mean PASS SNPs per locus\t = {}".format(round(numpy.mean(snps_per_locus), 2)))
    # 1.96 * standard error of the mean, using the sample standard deviation
    confidence_interval = 1.96 * (numpy.std(snps_per_locus, ddof=1) / numpy.sqrt(len(snps_per_locus)))
    log.info("95 CI PASS SNPs per locus\t = {}".format(round(confidence_interval, 2)))
    # the "Max" label now reports the maximum and "Min" the minimum
    # (the two calls were swapped before)
    log.info("Max PASS SNPs per locus\t = {}".format(numpy.max(snps_per_locus)))
    log.info("Min PASS SNPs per locus\t = {}".format(numpy.min(snps_per_locus)))
    if args.output_file is not None:
        with open(args.output_file, "w") as outfile:
            outfile.write("locus,count\n")
            for locus, cnt in cnt_loci.items():
                outfile.write("{},{}\n".format(locus, cnt))
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
    """Align every individual's reads with BWA and post-process the resulting BAMs.

    The config file named by ``args.config`` supplies the input data (through
    ``get_input_data``) and a ``flowcell`` section mapping sample names to
    flowcell ids.  For each individual a sample directory is created under
    ``args.output``.  Paired reads and singleton reads are each aligned
    (BWA-MEM when ``args.bwa_mem`` is set), cleaned up and given read-group
    header information through ``picard``.  When both BAMs exist they are
    merged.

    Raises:
        IOError: if neither a paired-end nor a singleton BAM was produced for
            an individual.
    """
    # command-line arguments and options
    args = get_args()
    # logging
    log, my_name = setup_logging(args)
    log.info(" Starting {} ".format(my_name).center(65, "="))
    # config file; option names keep their case because optionxform is str
    conf = ConfigParser.ConfigParser(allow_no_value=True)
    conf.optionxform = str
    conf.read(args.config)
    # input data
    log.info("Getting input filenames and creating output directories")
    reference, individuals = get_input_data(log, conf, args.output)
    flowcells = dict(conf.items("flowcell"))
    if args.bwa_mem:
        log.info("You are running BWA-MEM")
    for sample, reads_dir in individuals:
        pe_bam = False
        se_bam = False
        # pretty print taxon status
        log.info(" Processing {} ".format(sample).center(65, "-"))
        # directory for the sample-specific output
        sample_dir = os.path.join(args.output, sample)
        os.makedirs(sample_dir)
        # find out which read files are available
        fastq = get_input_files(reads_dir, args.subfolder, log)
        if fastq.r1 and fastq.r2:
            # paired-end alignment of r1 and r2
            align_pe = bwa.mem_pe_align if args.bwa_mem else bwa.pe_align
            pe_bam = align_pe(
                log, sample, sample_dir, reference, args.cores, fastq.r1, fastq.r2)
            # clean the bam up (MAPq 0 and trim overlapping reads)
            pe_bam = picard.clean_up_bam(log, sample, sample_dir, pe_bam, "pe")
            # flowcell id for the read-group header
            flowcell_id = flowcells[sample]
            pe_bam = picard.add_rg_header_info(
                log, sample, sample_dir, flowcell_id, pe_bam, "pe")
            # NOTE: duplicates are not removed because they are expected and
            # have been filtered prior to alignment (after demuxing)
        if fastq.singleton:
            # single-end alignment of the singleton reads
            align_se = bwa.mem_se_align if args.bwa_mem else bwa.se_align
            se_bam = align_se(
                log, sample, sample_dir, reference, args.cores, fastq.singleton)
            # clean the bam up (MAPq 0 and trim overlapping reads)
            se_bam = picard.clean_up_bam(log, sample, sample_dir, se_bam, "se")
            # flowcell id for the read-group header
            flowcell_id = flowcells[sample]
            se_bam = picard.add_rg_header_info(
                log, sample, sample_dir, flowcell_id, se_bam, "se")
            # NOTE: duplicates are not removed because they are expected and
            # have been filtered prior to alignment (after demuxing)
        # merge when both BAMs exist, otherwise keep whichever one there is
        if se_bam:
            if pe_bam:
                final_bam = picard.merge_two_bams(
                    log, sample, sample_dir, pe_bam, se_bam)
            else:
                final_bam = se_bam
        else:
            final_bam = pe_bam
        if not final_bam:
            raise IOError("There is no BAM file. Check bwa log files for problems.")
    # end
    log.info(" Completed {} ".format(my_name).center(65, "="))