from typing import List, Union

import hailtop.batch as hb

# bytes_to_gb is a small helper, defined elsewhere in the package, that
# returns a file's size in gigabytes.


def imputation(b: hb.batch.Batch,
               vcf: str = None,
               vcf_filename_no_ext: str = None,
               ref: hb.ResourceGroup = None,
               ref_size: Union[int, float] = None,
               region: str = None,
               chromosome: str = None,
               cpu: int = 8,
               memory: str = 'highmem',
               img: str = 'docker.io/lindonkambule/gwaspy:v1',
               threads: int = 7,
               out_dir: str = None):
    # in_vcf = b.read_input(vcf)
    # the input chunk is expected to have a .csi index sitting next to it
    in_vcf = b.read_input_group(**{'bcf': vcf,
                                   'bcf.csi': f'{vcf}.csi'})
    vcf_size = bytes_to_gb(vcf)

    output_file_name = vcf_filename_no_ext + '.imputed.bcf'
    file_dir = vcf_filename_no_ext.split('.')[0]

    # reserve room for the reference panel plus ~4x the input chunk
    disk_size = ref_size + (vcf_size * 4)
    map_file = f'/shapeit4/maps/b38/{chromosome}.b38.gmap.gz'

    impute = b.new_job(name=output_file_name)
    impute.cpu(cpu)
    impute.memory(memory)
    impute.storage(f'{disk_size}Gi')
    impute.image(img)

    cmd = f'''
        impute5_1.1.5_static \
            --h {ref.bcf} \
            --m {map_file} \
            --g {in_vcf.bcf} \
            --r {region} \
            --out-gp-field \
            --o {output_file_name} \
            --threads {threads}
    '''

    impute.command(cmd)
    # index file to use when merging
    impute.command(f'bcftools index {output_file_name}')

    impute.command(f'mv {output_file_name} {impute.ofile}')
    impute.command(f'mv {output_file_name}.csi {impute.ind}')
    b.write_output(impute.ofile,
                   f'{out_dir}/GWASpy/Imputation/{file_dir}/imputed_chunks/{output_file_name}')
    b.write_output(impute.ind,
                   f'{out_dir}/GWASpy/Imputation/{file_dir}/imputed_chunks/{output_file_name}.csi')
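# --- Example (not part of the original module) -----------------------------
# A minimal sketch of how `imputation` might be driven. The billing project,
# bucket paths, reference panel, region, and panel size below are hypothetical
# placeholders; the phased input chunk and the reference panel are assumed to
# ship with .csi indices, as the job expects.
def _example_imputation_run():
    backend = hb.ServiceBackend(billing_project='my-billing-project',  # placeholder
                                remote_tmpdir='gs://my-bucket/tmp')    # placeholder
    batch = hb.Batch(backend=backend, name='impute5-chr22')

    # expose the indexed reference panel as a resource group with 'bcf' and
    # 'bcf.csi' members, matching the ref.bcf access in the command above
    ref_panel = batch.read_input_group(
        **{'bcf': 'gs://my-bucket/ref_panel/ref.chr22.bcf',
           'bcf.csi': 'gs://my-bucket/ref_panel/ref.chr22.bcf.csi'})

    imputation(b=batch,
               vcf='gs://my-bucket/phased/mydata.chr22.chunk1.bcf',
               vcf_filename_no_ext='mydata.chr22.chunk1',
               ref=ref_panel,
               ref_size=2.5,  # reference panel size in GB (illustrative)
               region='chr22:16050075-20000000',
               chromosome='chr22',
               out_dir='gs://my-bucket/results')
    batch.run()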
def concat_vcfs(b: hb.batch.Batch,
                vcf_basename: str = None,
                vcfs_to_merge: List = None,
                output_type: str = 'vcf',
                chrom: str = None,
                cpu: int = 16,
                memory: str = 'standard',
                docker_img: str = 'docker.io/lindonkambule/gwaspy:v1',
                out_dir: str = None):
    out_type = 'b' if output_type == 'bcf' else 'z'
    vcfs_sizes_sum = 0
    merge_vcf_i = ''

    out_filename = f'{vcf_basename}.{chrom}.merged.bcf' if output_type == 'bcf' else \
        f'{vcf_basename}.{chrom}.merged.vcf.gz'
    out_index_name = f'{vcf_basename}.{chrom}.merged.bcf.csi' if output_type == 'bcf' else \
        f'{vcf_basename}.{chrom}.merged.vcf.gz.csi'

    for line in vcfs_to_merge:
        vcfs_sizes_sum += 2 + bytes_to_gb(line)

    disk_size = int(round(10 + (2 * vcfs_sizes_sum)))
    threads = cpu - 1

    concat = b.new_job(name=f'concat-{vcf_basename}')
    concat.memory(memory)
    concat.storage(f'{disk_size}Gi')
    concat.image(docker_img)
    concat.cpu(cpu)

    # localise each chunk together with its .csi index, and build the
    # space-separated file list for bcftools concat
    for line in vcfs_to_merge:
        input_vcf = b.read_input_group(vcf=line,
                                       ind=f'{line}.csi')
        merge_vcf_i += f'{input_vcf.vcf} '

    cmd = f'''
        bcftools concat \
            --no-version \
            --output-type {out_type} \
            --output {out_filename} \
            --threads {threads} \
            {merge_vcf_i}
    '''

    concat.command(cmd)
    # index the merged output
    concat.command(f'bcftools index --force {out_filename}')

    concat.command(f'mv {out_filename} {concat.ofile}')
    concat.command(f'mv {out_index_name} {concat.idx}')
    b.write_output(concat.ofile,
                   f'{out_dir}/GWASpy/{vcf_basename}/Imputation/imputed_merged/{out_filename}')
    b.write_output(concat.idx,
                   f'{out_dir}/GWASpy/{vcf_basename}/Imputation/imputed_merged/{out_index_name}')
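# --- Example (not part of the original module) -----------------------------
# Hypothetical follow-up to the imputation jobs above: concatenate the
# per-chunk imputed BCFs for one chromosome. The chunk paths are placeholders;
# each chunk is assumed to have the .csi index the imputation job wrote
# alongside it.
def _example_concat_imputed_chunks(batch: hb.batch.Batch):
    imputed_chunks = [
        'gs://my-bucket/results/GWASpy/Imputation/mydata/imputed_chunks/mydata.chr22.chunk1.imputed.bcf',
        'gs://my-bucket/results/GWASpy/Imputation/mydata/imputed_chunks/mydata.chr22.chunk2.imputed.bcf',
    ]
    concat_vcfs(b=batch,
                vcf_basename='mydata',
                vcfs_to_merge=imputed_chunks,
                output_type='bcf',
                chrom='chr22',
                out_dir='gs://my-bucket/results')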
def cram_to_bam(b: hb.batch.Batch,
                input_cram_file: str = None,
                ref_fasta: str = None,
                ref_dict: str = None,
                ref_ind: str = None,
                bam_out_name: str = None,
                memory: int = 15,
                samtools_image: str = None,
                out_dir: str = None):
    docker_image = samtools_image if samtools_image else 'gcr.io/genomics-tools/samtools'
    out_bam_name = bam_out_name + '.bam'

    # a BAM is assumed to be roughly 2.5x the size of its CRAM
    output_bam_size: float = bytes_to_gb(input_cram_file) / 0.40
    ref_size: float = bytes_to_gb(ref_fasta) + bytes_to_gb(ref_ind)
    disk_size: int = round(bytes_to_gb(input_cram_file) + output_bam_size + ref_size) + 25

    job_memory = str(memory) + 'Gi'
    job_storage = str(disk_size) + 'Gi'

    crams_to_bams = b.new_job(name=out_bam_name)

    in_cram = b.read_input(input_cram_file)
    fasta = b.read_input_group(**{'fasta': ref_fasta,
                                  'fasta.fai': ref_ind,
                                  'dict': ref_dict})

    crams_to_bams.memory(job_memory)
    crams_to_bams.image(docker_image)
    crams_to_bams.storage(job_storage)

    crams_to_bams.command(f'samtools view -b -T {fasta.fasta} -o {out_bam_name} {in_cram}')
    crams_to_bams.command(f'samtools index {out_bam_name}')

    crams_to_bams.command(f'mv {out_bam_name} {crams_to_bams.bamout}')
    crams_to_bams.command(f'mv {out_bam_name}.bai {crams_to_bams.bamind}')
    b.write_output(crams_to_bams.bamout, f'{out_dir}/BAMS/{out_bam_name}')
    b.write_output(crams_to_bams.bamind, f'{out_dir}/BAMS/{out_bam_name}.bai')

    return crams_to_bams
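# --- Example (not part of the original module) -----------------------------
# Hypothetical CRAM-to-BAM conversion; the CRAM and GRCh38 reference paths
# are placeholders. Since `cram_to_bam` returns the job, the result can be
# used as a dependency for downstream jobs in the same batch.
def _example_cram_to_bam(batch: hb.batch.Batch):
    bam_job = cram_to_bam(b=batch,
                          input_cram_file='gs://my-bucket/crams/NA12878.cram',
                          ref_fasta='gs://my-bucket/ref/Homo_sapiens_assembly38.fasta',
                          ref_dict='gs://my-bucket/ref/Homo_sapiens_assembly38.dict',
                          ref_ind='gs://my-bucket/ref/Homo_sapiens_assembly38.fasta.fai',
                          bam_out_name='NA12878',
                          out_dir='gs://my-bucket/results')
    return bam_job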
# phasing variant of concat_vcfs; presumably lives in a separate module from
# the imputation variant above, since the two share a name
def concat_vcfs(b: hb.batch.Batch,
                vcf_basename: str = None,
                vcfs_to_merge: List = None,
                output_type: str = 'bcf',
                software: str = None,
                chrom: str = None,
                docker_img: str = 'docker.io/lindonkambule/gwaspy:v1',
                cpu: int = 8,
                out_dir: str = None):
    out_type = 'b' if output_type == 'bcf' else 'z'
    threads = cpu - 1
    vcfs_sizes_sum = 0
    merge_vcf_i = ''

    out_filename = f'{vcf_basename}.{chrom}.phased.{software}.bcf' if output_type == 'bcf' else \
        f'{vcf_basename}.{chrom}.phased.{software}.vcf.gz'
    out_index_name = f'{vcf_basename}.{chrom}.phased.{software}.bcf.csi' if output_type == 'bcf' else \
        f'{vcf_basename}.{chrom}.phased.{software}.vcf.gz.csi'

    for line in vcfs_to_merge:
        vcfs_sizes_sum += 1 + bytes_to_gb(line)

    mem = 'highmem' if vcfs_sizes_sum > 2 else 'standard'
    disk_size = 10 + vcfs_sizes_sum

    concat = b.new_job(name=f'concat-{vcf_basename}')
    concat.memory(mem)
    concat.storage(f'{disk_size}Gi')
    concat.image(docker_img)
    concat.cpu(cpu)

    # localise each chunk together with its .csi index, and build the
    # space-separated file list for bcftools concat
    for line in vcfs_to_merge:
        input_vcf = b.read_input_group(vcf=line,
                                       ind=f'{line}.csi')
        merge_vcf_i += f'{input_vcf.vcf} '

    # --ligate stitches the phased chunks back together at their overlaps
    cmd = f'''
        bcftools concat \
            --no-version \
            --output-type {out_type} \
            --output {out_filename} \
            --threads {threads} \
            --ligate \
            {merge_vcf_i}
    '''

    concat.command(cmd)
    # index the merged output
    concat.command(f'bcftools index {out_filename}')

    concat.command(f'mv {out_filename} {concat.ofile}')
    concat.command(f'mv {out_index_name} {concat.idx}')
    b.write_output(concat.ofile,
                   f'{out_dir}/GWASpy/{vcf_basename}/Phasing/phased_merged/{out_filename}')
    b.write_output(concat.idx,
                   f'{out_dir}/GWASpy/{vcf_basename}/Phasing/phased_merged/{out_index_name}')
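# --- Example (not part of the original module) -----------------------------
# Hypothetical ligation of phased chunks. Unlike the imputation variant, this
# concat passes --ligate, so the chunks must overlap at shared phased sites.
# The paths and the software name are placeholders.
def _example_concat_phased_chunks(batch: hb.batch.Batch):
    phased_chunks = [
        'gs://my-bucket/results/GWASpy/Phasing/phased_chunks/mydata.chr22.chunk1.phased.shapeit.bcf',
        'gs://my-bucket/results/GWASpy/Phasing/phased_chunks/mydata.chr22.chunk2.phased.shapeit.bcf',
    ]
    concat_vcfs(b=batch,
                vcf_basename='mydata',
                vcfs_to_merge=phased_chunks,
                output_type='bcf',
                software='shapeit',
                chrom='chr22',
                out_dir='gs://my-bucket/results')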