def fastq_file(s):
    """Parse a FASTQ argument holding one path or two comma-separated paths.

    A value without a comma is padded with an empty second field, so the
    result always has two slots (the second is ``''`` when only one file
    was given).

    :param s: a single path, or two paths joined by a comma.
    :return: list of two items; every non-empty item has been passed
        through ``file_path``, empty items are returned unchanged.

    Logs an error and exits with status 1 when the value does not split
    into exactly two comma-separated fields.
    """
    s = s if ',' in s else f'{s},'
    paths = s.split(',')
    # Count the comma-separated fields; the length of the raw string says
    # nothing about how many files were supplied.
    if len(paths) != 2:
        logger.error(
            f'Invalid FASTQ file(s) specified, {len(paths)} files were given while only accepts 1 or 2 files.'
        )
        sys.exit(1)
    return [file_path(p) if p else p for p in paths]
def rescue_ratio(bed, txt):
    """Write the larger-over-smaller ratio of two peak counts to ``txt``.

    The first count comes from ``bed``; the second from the pseudo
    reproducible peaks BED located under the module-level ``tmp``
    directory and named after the module-level ``key``. When the smaller
    count is zero the ratio is recorded as 0 and an error is logged.
    """
    bed_count = count_lines(bed)
    pseudo_bed = os.path.join(
        tmp, f'{key}.ip.pseudo.01.vs.{key}.ip.pseudo.02.reproducible.peaks.bed')
    other_count = count_lines(pseudo_bed)

    larger = max(other_count, bed_count)
    smaller = min(other_count, bed_count)
    try:
        ratio = larger / smaller
    except ZeroDivisionError:
        ratio = 0
        logger.error(f'No peaks found in reproducible peaks or pseudo reproducible peaks, return ratio 0.')

    with open(txt, 'w') as handle:
        handle.write(f'{ratio}\n')
def rescue_ratio(inputs, outputs):
    """Compute the rescue ratio and write it to ``outputs``.

    For every pair of entries in ``SAMPLES`` the IP BAMs and the input BAMs
    are each merged, sorted and split into two pseudo BAMs; ``clipper_peaks``
    is called on every IP pseudo BAM and ``peak`` is then run on the
    collected files. The ratio is the larger over the smaller of the line
    counts of the resulting pseudo reproducible BED and of
    ``{ECLIP}/<names joined by .vs.>.reproducible.peaks.bed``.

    The ratio is written to ``outputs`` and to the same path with the
    ``{ECLIP}/`` prefix replaced by ``rescue/``. ``inputs`` is not read.
    """

    def prepare_pseudo_bam(bam1, bam2, basename):
        """Merge two BAMs, sort the result and split it into two pseudo BAMs."""
        pseudo_bam = f'{basename}.bam'
        # The unsorted merge goes to a temporary file that is removed after sorting.
        tmp_pseudo_bam = pseudo_bam.replace('.bam', '.tmp.bam')
        cmd = f'samtools merge {tmp_pseudo_bam} {bam1} {bam2}'
        cmder.run(cmd, msg=f'Merging {bam1} and {bam2} ...')
        cmder.run(
            f'samtools sort -@ {options.cores} -m 2G -o {pseudo_bam} {tmp_pseudo_bam}'
        )
        cmder.run(f'rm {tmp_pseudo_bam}')
        # bam1/bam2 are rebound here to whatever split_bam returns.
        bam1, bam2 = split_bam(pseudo_bam, f'{basename}.pseudo.01.bam', f'{basename}.pseudo.02.bam')
        return bam1, bam2

    pseudo_ip_bams, pseudo_input_bams, pseudo_peak_beds = [], [], []
    # NOTE(review): the enumerate index `i` is never used.
    for i, (sample1, sample2) in enumerate(itertools.combinations(SAMPLES, 2), start=1):
        pseudo_ip_bam = prepare_pseudo_bam(
            sample1.ip_read.bam, sample2.ip_read.bam,
            f'rescue/{sample1.ip_read.name}.{sample2.ip_read.name}')
        pseudo_ip_bams.extend(pseudo_ip_bam)
        pseudo_input_bam = prepare_pseudo_bam(
            sample1.input_read.bam, sample2.input_read.bam,
            f'rescue/{sample1.input_read.name}.{sample2.input_read.name}')
        pseudo_input_bams.extend(pseudo_input_bam)
        pseudo_peak_beds.extend([clipper_peaks(bam) for bam in pseudo_ip_bam])
    # First use of `key`: IP read names joined with '.' to name the pseudo BED.
    key = ".".join([sample.ip_read.name for sample in SAMPLES])
    pseudo_reproducible_bed = f'rescue/{key}.pseudo.01.vs.{key}.pseudo.02.reproducible.peaks.bed'
    peak(pseudo_ip_bams, pseudo_input_bams, pseudo_peak_beds, pseudo_reproducible_bed, 'rescue',
         cwd=options.outdir)
    pseudo_count = count_lines(pseudo_reproducible_bed)
    # `key` is rebound: IP read names joined with '.vs.' to locate the actual BED.
    key = ".vs.".join([sample.ip_read.name for sample in SAMPLES])
    count = count_lines(f'{ECLIP}/{key}.reproducible.peaks.bed')
    try:
        ratio = max(count, pseudo_count) / min(count, pseudo_count)
    except ZeroDivisionError:
        # The smaller of the two counts is zero; record 0 instead of failing.
        ratio = 0
        logger.error(
            f'No peaks found in reproducible peaks or pseudo reproducible peaks, return ratio 0.'
        )
    # The same value is written twice: to `outputs` and to its 'rescue/' counterpart.
    with open(outputs, 'w') as o1, open(outputs.replace(f'{ECLIP}/', 'rescue/'), 'w') as o2:
        o1.write(f'{ratio}\n')
        o2.write(f'{ratio}\n')
def rescue_ratio(bed, txt):
    """Write the ratio of line counts of the two reproducible-peak BEDs to ``txt``.

    The two files are found by globbing ``*.reproducible.peaks.bed`` under
    the module-level ``tmp`` directory. Exactly two matches are expected;
    any other number raises ``ValueError`` from the unpacking. ``bed`` is
    not read by this function.

    The ratio is ``count(first) / count(second)`` with the files taken in
    sorted name order. When the second count is zero the ratio is recorded
    as 0 and an error is logged.
    """
    # glob.glob returns matches in arbitrary, filesystem-dependent order;
    # sort so that numerator and denominator are the same on every run.
    bed1, bed2 = sorted(glob.glob(os.path.join(tmp, '*.reproducible.peaks.bed')))
    count1, count2 = count_lines(bed1), count_lines(bed2)
    try:
        ratio = count1 / count2
    except ZeroDivisionError:
        ratio = 0
        logger.error(
            'No peaks found in one of the split reproducible peaks, return ratio 0.'
        )
    with open(txt, 'w') as o:
        o.write(f'{ratio}\n')
def rescue_ratio(inputs, txt):
    """Compute the rescue ratio from pooled pseudo replicates and write it to ``txt``.

    With a single entry in ``SAMPLES`` a warning is logged, the ``rescue``
    directory is removed and ``''`` is returned. Otherwise the IP BAMs and
    the input BAMs are each merged and split into as many pseudo BAMs as
    there were originals, peaks are called on them, and the larger-over-
    smaller ratio of the actual and pseudo reproducible peak counts is
    written to ``txt`` (0 when the smaller count is zero). ``inputs`` is
    not read.
    """
    if len(SAMPLES) == 1:
        logger.warning(
            'No enough samples (n = 1 < 2) to calculate rescue ratio!')
        shutil.rmtree('rescue')
        return ''

    def pool_and_split(bams, merged_name, split_prefix):
        # Merge, split into len(bams) parts, then drop the merged intermediate.
        merged = merge_bam(bams, os.path.join('rescue', merged_name))
        parts = split_bam(merged, os.path.join('rescue', split_prefix), len(bams))
        os.unlink(merged)
        return parts

    ip_bams = [s.ip_bam for s in SAMPLES]
    input_bams = [s.input_bam for s in SAMPLES]
    ip_pseudo_bams = pool_and_split(ip_bams, 'ip.pseudo.bam', 'ip.pseudo.')
    input_pseudo_bams = pool_and_split(input_bams, 'input.pseudo.bam', 'input.pseudo.')

    pseudo_peak_beds = []
    for pseudo in ip_pseudo_bams:
        clusters = pseudo.replace('.bam', '.peak.clusters.bed')
        pseudo_peak_beds.append(clipper_peaks(pseudo, clusters))

    pseudo_stems = [os.path.basename(pseudo).replace('.bam', '') for pseudo in ip_pseudo_bams]
    pseudo_label = ".vs.".join(pseudo_stems)
    pseudo_peak_bed = os.path.join('rescue', f'{pseudo_label}.reproducible.peaks.bed')
    peak(ip_pseudo_bams, input_pseudo_bams, pseudo_peak_beds, pseudo_peak_bed, 'rescue')
    pseudo_count = count_lines(pseudo_peak_bed)

    actual_label = ".vs.".join([f'{name}.ip' for name in options.names])
    actual_count = count_lines(f'{actual_label}.reproducible.peaks.bed')

    larger = max(actual_count, pseudo_count)
    smaller = min(actual_count, pseudo_count)
    try:
        ratio = larger / smaller
    except ZeroDivisionError:
        ratio = 0
        logger.error(
            f'No peaks found in reproducible peaks or pseudo reproducible peaks, return ratio 0.'
        )

    with open(txt, 'w') as handle:
        handle.write(f'{ratio}\n')
def consistency_ratio(inputs, txt):
    """Compute the self-consistency ratio from split replicates and write it to ``txt``.

    With a single entry in ``SAMPLES`` a warning is logged, the
    ``consistency`` directory is removed and ``''`` is returned. Otherwise
    every sample's IP and input BAM is split in two, peaks are called on
    each IP half, ``peak`` is run once per half across all samples, and
    ``count(first half) / count(second half)`` is written to ``txt``
    (0 when the second count is zero). ``inputs`` is not read.
    """
    if len(SAMPLES) == 1:
        logger.warning(
            'No enough samples (n = 1 < 2) to calculate self-consistency ratio!'
        )
        shutil.rmtree('consistency')
        return ''

    # Index 0 collects the first halves, index 1 the second halves.
    ip_halves, input_halves, bed_halves = ([], []), ([], []), ([], [])
    for sample in SAMPLES:
        first_ip, second_ip = split_bam(
            sample.ip_bam, os.path.join('consistency', f'{sample.name}.ip.split.'), 2)
        ip_halves[0].append(first_ip)
        ip_halves[1].append(second_ip)

        first_input, second_input = split_bam(
            sample.input_bam, os.path.join('consistency', f'{sample.name}.input.split.'), 2)
        input_halves[0].append(first_input)
        input_halves[1].append(second_input)

        for half, ip_part in enumerate((first_ip, second_ip)):
            clusters = ip_part.replace('.bam', '.peak.clusters.bed')
            bed_halves[half].append(clipper_peaks(ip_part, clusters))

    split_peak_beds = []
    for half_ips, half_inputs, half_beds in zip(ip_halves, input_halves, bed_halves):
        stems = [os.path.basename(part).replace('.bam', '') for part in half_ips]
        label = ".vs.".join(stems)
        reproducible = os.path.join('consistency', f'{label}.reproducible.peaks.bed')
        peak(half_ips, half_inputs, half_beds, reproducible, 'consistency')
        split_peak_beds.append(reproducible)

    count1, count2 = [count_lines(path) for path in split_peak_beds]
    try:
        ratio = count1 / count2
    except ZeroDivisionError:
        ratio = 0
        logger.error(
            f'No peaks found in one of the split reproducible peaks, return ratio 0.'
        )

    with open(txt, 'w') as handle:
        handle.write(f'{ratio}\n')
def make_bigwig_files(bam, bigwig):
    """Create plus- and minus-strand BigWig files for ``bam``.

    ``bigwig`` is the plus-strand output path; the minus-strand path is
    derived by replacing ``.plus.bw`` with ``.minus.bw``. Coverage is scaled
    by one million over the mapped read count (halved when the module-level
    ``TYPE`` is ``'paired'``). When that count is zero an empty file is
    written to ``bigwig`` and the function returns early.

    :return: ``bigwig``.
    """

    def bam_to_bigwig(bam, scale, strand, bw):
        # bedGraph -> sorted bedGraph -> BigWig, then remove both intermediates.
        bedgraph = bw.replace('.bw', '.bg')
        sorted_bedgraph = bw.replace('.bw', '.sort.bg')
        cmder.run(
            f'genomeCoverageBed -ibam {bam} -bg -scale {scale} -strand {strand} -du -split > {bedgraph}'
        )
        cmder.run(f'bedSort {bedgraph} {sorted_bedgraph}')
        cmder.run(
            f'bedGraphToBigWig {sorted_bedgraph} {options.genome}/chrNameLength.txt {bw}'
        )
        cmder.run(f'rm {bedgraph} {sorted_bedgraph}')

    message = f'Make BigWig files for {bam} ...'
    start_time = time.perf_counter()
    logger.info(message)

    pos_bw = bigwig
    neg_bw = bigwig.replace('.plus.bw', '.minus.bw')

    with pysam.AlignmentFile(bam, 'rb') as sam:
        total_reads = sam.mapped
    if TYPE == 'paired':
        total_reads = total_reads / 2

    try:
        scale = 1000000.0 / total_reads
    except ZeroDivisionError:
        logger.error(
            f'No reads was found in BAM {bam}, empty BigWig file was created.')
        with open(bigwig, 'w') as handle:
            handle.write('')
        return bigwig

    # Each entry is (scale, strand, output); the strand-to-file mapping is
    # swapped for anything other than single-end data.
    if TYPE == 'single':
        jobs = [(scale, '+', pos_bw), (-1 * scale, '-', neg_bw)]
    else:
        jobs = [(-1 * scale, '-', pos_bw), (scale, '+', neg_bw)]
    for job_scale, strand, target in jobs:
        bam_to_bigwig(bam, job_scale, strand, target)

    run_time = int(time.perf_counter() - start_time)
    message = message.replace(
        ' ...', f' completed in [{str(datetime.timedelta(seconds=run_time))}].')
    logger.info(message)
    return bigwig
def pureclip(bam, bed):
    """Run pureCLIP for the sample whose ``cross_bed`` equals ``bed``.

    The IP and input BAMs are looked up in ``SAMPLES`` (first match wins;
    ``IndexError`` if there is none); ``bam`` itself is not read. The first
    three ``@SQ`` reference names of the IP BAM header are passed via
    ``-iv``. Any exception raised while running the command is logged and
    swallowed.
    """
    matches = [[sample.ip_bam, sample.input_bam]
               for sample in SAMPLES if sample.cross_bed == bed]
    ip_bam, input_bam = matches[0]

    header = cmder.run(f'samtools view -H {ip_bam}').stdout.read()
    sequence_names = [
        record.split()[1].replace('SN:', '')
        for record in header.splitlines()
        if record.startswith('@SQ')
    ]
    refs = ';'.join(sequence_names[:3])

    regions_bed = bed.replace('.crosslink.sites.bed', '.binding.regions.bed')
    log_file = bed.replace('.crosslink.sites.bed', '.pureclip.log')

    cmd = ['pureclip']
    cmd += ['-i', ip_bam, '-bai', f'{ip_bam}.bai']
    cmd += ['-g', f'{options.genome}/genome.fa']
    cmd += ['-nt', options.cpus]
    cmd += ['-ibam', input_bam, '-ibai', f'{input_bam}.bai']
    cmd += ['-iv', f'"{refs};"']
    cmd += ['-o', bed, '-or', regions_bed]
    cmd += ['>', log_file]

    try:
        cmder.run(
            cmd,
            msg=f'Calling peaks from {ip_bam} and {input_bam} using pureCLIP ...')
    except Exception as error:
        logger.error(f'Running pureclip failed: {error}.')
def consistency_ratio(inputs, outputs):
    """Compute the self-consistency ratio and write it to ``outputs``.

    Every sample's IP and input BAM is split in two, peaks are called on
    both IP halves and ``peak`` produces a reproducible peaks BED per
    sample. The ratio is the first sample's line count over the second
    sample's (0 when the latter is zero). It is written to ``outputs`` and
    to the same path with the ``{ECLIP}/`` prefix replaced by
    ``consistency/``. ``inputs`` is not read.
    """

    def split_halves(read):
        # Split one read's BAM into its .split.01 / .split.02 halves.
        return split_bam(
            read.bam,
            f'consistency/{read.name}.split.01.bam',
            f'consistency/{read.name}.split.02.bam')

    peak_counts = []
    for sample in SAMPLES:
        ip_halves = split_halves(sample.ip_read)
        input_halves = split_halves(sample.input_read)
        half_beds = [clipper_peaks(ip_halves[0]), clipper_peaks(ip_halves[1])]
        reproducible = (
            f'consistency/{sample.ip_read.name}.split.01.vs.'
            f'{sample.ip_read.name}.split.02.reproducible.peaks.bed'
        )
        peak(ip_halves, input_halves, half_beds, reproducible, 'consistency',
             cwd=options.outdir)
        peak_counts.append(count_lines(reproducible))

    try:
        ratio = peak_counts[0] / peak_counts[1]
    except ZeroDivisionError:
        ratio = 0
        logger.error(
            f'No peaks found in one of the split reproducible peaks, return ratio 0.'
        )

    mirror = outputs.replace(f'{ECLIP}/', 'consistency/')
    with open(outputs, 'w') as primary, open(mirror, 'w') as secondary:
        primary.write(f'{ratio}\n')
        secondary.write(f'{ratio}\n')
# Command-line interface of the `eclip` program.
parser = argparse.ArgumentParser(description=__doc__, prog='eclip')
parser.add_argument('--names', nargs='+', required=True, help='Shortnames for each sample, e.g., rep1, rep2.')
parser.add_argument('--wd', required=True, help='Path to the work directory that contains eCLIP analysis results.')
parser.add_argument('--species', help="Species name (short name code) the dataset associated with, e.g., hg19, mm10.", default='hg19')
parser.add_argument('--l2fc', type=int, help="Only consider peaks at or above this log2 fold change cutoff.", default=3)
parser.add_argument('--l10p', type=int, help="Only consider peaks at or above this log10 p value cutoff.", default=3)
parser.add_argument('--cpus', type=int, help='Maximum number of CPU cores can be used for your job.', default=16)
parser.add_argument('--dry_run', action='store_true', help='Print out steps and files involved in each step without actually running the pipeline.')

options = parser.parse_args()
# NOTE(review): a failed chdir is only logged; execution continues in the
# original working directory.
try:
    os.chdir(options.wd)
except OSError as e:
    logger.error(e)

# Directory (relative to the current working directory) for rescue-ratio files.
tmp = 'rescue'
if not os.path.isdir(tmp):
    os.mkdir(tmp)

# Expected BAM file names, derived from the sample short names.
ip_bams = [f'{name}.ip.bam' for name in options.names]
input_bams = [f'{name}.input.bam' for name in options.names]

# For every pair of samples: 'name1.name2' -> (ip1, ip2, input1, input2) BAM names.
files = {}
for name1, name2 in itertools.combinations(options.names, 2):
    files[f'{name1}.{name2}'] = (f'{name1}.ip.bam', f'{name2}.ip.bam',
                                 f'{name1}.input.bam', f'{name2}.input.bam')
# All sample names joined with '.'.
key = '.'.join(options.names)


def merge_bam(bam1, bam2, bam):
    if os.path.isfile(bam):
        logger.info(f'BAM file {bam} already exist.')
    else:
def dir_path(p):
    """Return ``p`` when it is an existing directory.

    Otherwise log an error and exit the process with status 1.
    """
    if os.path.isdir(p):
        return p
    logger.error(f'Path "{p}" may not be a directory or does not exist.')
    sys.exit(1)
def file_path(p):
    """Return ``p`` when it is an existing regular file.

    Otherwise log an error and exit the process with status 1.
    """
    if os.path.isfile(p):
        return p
    logger.error(f'File "{p}" may not be a file or does not exist.')
    sys.exit(1)
parser.add_argument( '--hold_submit', action='store_true', help= 'Generate the submit script but hold it without submitting to the job scheduler.' ) args = parser.parse_args() outdir = args.outdir or os.getcwd() dir_path(outdir) os.chdir(outdir) fastq, adapters_fasta, name = args.fastq, args.adapters_fasta, args.name rtag, gtag = args.repeat_label, args.genome_label if len(fastq) != len(adapters_fasta): logger.error('Number of items for fastq and adapters_fasta are not equal.') sys.exit(1) if name: if not len(args.fastq) != len(args.name): logger.error('Number of items for fastq and name are not equal.') sys.exit(1) else: name = [basename(n) for n in fastq] name = [os.path.join(outdir, n) for n in name] fastq_to_name = {fq: n for fq, n in zip(fastq, name)} fastq_to_adapters = {n: adapter for n, adapter in zip(name, adapters_fasta)} @task(inputs=fastq, outputs=lambda i: f'{fastq_to_name[i]}.umi.fastq.gz', cmd=['eclip_umi_extract', 'input', '-o', 'output'],
parser.add_argument(
    '--dry_run',
    action='store_true',
    help='Print out steps and files involved in each step without actually running the pipeline.',
)

# Reference point for run-time reporting.
START_TIME = time.perf_counter()
options = parser.parse_args()

# Output directory defaults to the current directory and is created on demand.
options.outdir = options.outdir or os.getcwd()
if not os.path.isdir(options.outdir):
    try:
        os.mkdir(options.outdir)
    except OSError as error:
        logger.error(f'Create outdir failed: {error}.')
        sys.exit(1)
os.chdir(options.outdir)

# Fall back to the built-in locations for anything not given on the command line.
adapters = '/storage/vannostrand/software/eclip/data/se.adapters.fasta'
options.adapters_fasta = options.adapters_fasta or adapters
file_path(options.adapters_fasta)

options.repeat = (
    options.repeat
    or '/storage/vannostrand/reference_data/hg19/repbase_v2_star_index'
)
dir_path(options.repeat)

options.genome = (
    options.genome
    or '/storage/vannostrand/reference_data/hg19/genome_star_index'
)
# Reference point for run-time reporting.
START_TIME = time.perf_counter()
options = parser.parse_args()

# Work inside the output directory (defaults to the current directory).
options.outdir = options.outdir or os.getcwd()
dir_path(options.outdir)
os.chdir(options.outdir)

ips, inputs, names = options.ip_fastqs, options.input_fastqs, options.labels

# Every IP FASTQ needs a label.
if len(ips) != len(names):
    logger.error('Number of items in ip_fastqs and names are not equal.')
    sys.exit(1)

# Either one input per IP, or exactly one input overall.
if len(ips) == len(inputs):
    input_type = 'single-input'
else:
    input_type = 'multiple-inputs'
    if len(inputs) != 1:
        logger.error('Wrong number of input_fastqs were provided.')
        sys.exit(1)


class Read:
    """A sequencing read described by its FASTQ file(s), name and type."""

    def __init__(self, fastq1, fastq2, read_name, read_type):
        self.fastq1 = fastq1
        self.fastq2 = fastq2
        self.read_name = read_name
        self.read_type = read_type
        # Without a name, key on the first FASTQ path minus its extension.
        self.key = read_name or fastq1.replace('.fastq.gz', '').replace('.fq.gz', '')
        # A read is paired when a second FASTQ was supplied.
        self.paired = bool(self.fastq2)
# Command-line interface of the `se_fastq_to_bam` program.
parser = argparse.ArgumentParser(description=__doc__, prog='se_fastq_to_bam')
parser.add_argument('--fastq', required=True, help='Path to a UMI extracted FASTQ file.', type=file_path)
parser.add_argument('--bam', required=True, help='Path to the output BAM file (must ends with .bam).')
parser.add_argument('--adapters_fasta',
                    help="Path to the fasta file contains adapters and their sequences (for "
                         "single-end dataset only.",
                    required=True, type=file_path)
parser.add_argument('--genome', help="Path to STAR reference genome index directory.", type=dir_path)
parser.add_argument('--repeat', help="Path to STAR repeat elements index directory.", type=dir_path)
parser.add_argument('--cpus', type=int, help='Maximum number of CPU cores can be used for your job.', default=16)
parser.add_argument('--dry_run', action='store_true',
                    help='Print out steps and files involved in each step without actually running the pipeline.')

args = parser.parse_args()

fastq, bam = args.fastq, args.bam
if not bam.endswith('.bam'):
    logger.error(f'Output BAM file "{bam}" does not end with .bam extension.')
    sys.exit(1)

# This parser defines no --name option, so read it defensively and fall back
# to the BAM path without its extension.
name = getattr(args, 'name', None) or bam.replace('.bam', '')

# The output directory is the BAM's directory (current directory for a bare
# file name) and must already exist.
outdir = os.path.dirname(bam) or os.getcwd()
if not os.path.isdir(outdir):
    logger.error(f'Cane not set "{outdir}" as output directory.')
    sys.exit(1)
# Change into the directory validated above; this script parses its
# arguments into `args`, there is no `options` namespace here.
os.chdir(outdir)
# NOTE(review): a trailing ip_fastqs/input_fastqs/labels length check was
# dropped: it read an `options` namespace and arguments that this parser
# does not define. Confirm nothing later in the file relies on
# `ips`, `inputs` or `names`.