Example #1
File: impute_vcf.py  Project: atgu/GWASpy
def imputation(b: hb.batch.Batch,
               vcf: str = None,
               vcf_filename_no_ext: str = None,
               ref: hb.ResourceGroup = None,
               ref_size: Union[int, float] = None,
               region: str = None,
               chromosome: str = None,
               cpu: int = 8,
               memory: str = 'highmem',
               img: str = 'docker.io/lindonkambule/gwaspy:v1',
               threads: int = 7,
               out_dir: str = None):

    # in_vcf = b.read_input(vcf)
    in_vcf = b.read_input_group(**{'bcf': vcf, 'bcf.csi': f'{vcf}.csi'})
    vcf_size = bytes_to_gb(vcf)

    output_file_name = vcf_filename_no_ext + '.imputed.bcf'
    file_dir = vcf_filename_no_ext.split('.')[0]

    disk_size = ref_size + (vcf_size * 4)

    map_file = f'/shapeit4/maps/b38/{chromosome}.b38.gmap.gz'

    impute = b.new_job(name=output_file_name)
    impute.cpu(cpu)
    impute.memory(memory)
    impute.storage(f'{disk_size}Gi')
    impute.image(img)

    cmd = f'''
        impute5_1.1.5_static \
            --h {ref.bcf} \
            --m {map_file} \
            --g {in_vcf.bcf} \
            --r {region} \
            --out-gp-field \
            --o {output_file_name} \
            --threads {threads}
    '''

    impute.command(cmd)
    # index file to use when merging
    impute.command(f'bcftools index {output_file_name}')

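    # move the results into Batch-managed resource files so write_output can copy them to out_dir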
    impute.command(f'mv {output_file_name} {impute.ofile}')
    impute.command(f'mv {output_file_name}.csi {impute.ind}')
    b.write_output(
        impute.ofile,
        f'{out_dir}/GWASpy/Imputation/{file_dir}/imputed_chunks/{output_file_name}'
    )
    b.write_output(
        impute.ind,
        f'{out_dir}/GWASpy/Imputation/{file_dir}/imputed_chunks/{output_file_name}.csi'
    )
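
A minimal calling sketch (all bucket paths are placeholders; in GWASpy the reference-panel ResourceGroup, chunk regions and sizes come from the surrounding imputation pipeline):

import hailtop.batch as hb

backend = hb.ServiceBackend(billing_project='my-project', remote_tmpdir='gs://my-bucket/tmp')
b = hb.Batch(name='impute-chr22', backend=backend)

# reference panel for the chromosome being imputed
ref_panel = b.read_input_group(**{'bcf': 'gs://my-bucket/ref/chr22.ref.bcf',
                                  'bcf.csi': 'gs://my-bucket/ref/chr22.ref.bcf.csi'})

imputation(b=b,
           vcf='gs://my-bucket/phased/mydata.chr22.phased.bcf',
           vcf_filename_no_ext='mydata.chr22.phased',
           ref=ref_panel,
           ref_size=bytes_to_gb('gs://my-bucket/ref/chr22.ref.bcf'),
           region='chr22:16000000-20000000',
           chromosome='chr22',
           out_dir='gs://my-bucket/results')

b.run()
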
Example #2
def concat_vcfs(b: hb.batch.Batch,
                vcf_basename: str = None,
                vcfs_to_merge: List = None,
                output_type: str = 'vcf',
                chrom: str = None,
                cpu: int = 16,
                memory: str = 'standard',
                docker_img: str = 'docker.io/lindonkambule/gwaspy:v1',
                out_dir: str = None):

    global index_cmd

    out_type = 'b' if output_type == 'bcf' else 'z'
    vcfs_sizes_sum = 0
    merge_vcf_i = ''

    out_filename = f'{vcf_basename}.{chrom}.merged.bcf' if output_type == 'bcf' else \
        f'{vcf_basename}.{chrom}.merged.vcf.gz'
    out_index_name = f'{vcf_basename}.{chrom}.merged.bcf.csi' if output_type == 'bcf' else \
        f'{vcf_basename}.{chrom}.merged.vcf.gz.csi'

    for line in vcfs_to_merge:
        vcfs_sizes_sum += 2 + bytes_to_gb(line)

    disk_size = int(round(10 + (2 * vcfs_sizes_sum)))
    threads = cpu - 1

    concat = b.new_job(name=f'concat-{vcf_basename}')
    concat.memory(memory)
    concat.storage(f'{disk_size}Gi')
    concat.image(docker_img)
    concat.cpu(cpu)

    for line in vcfs_to_merge:
        input_vcf = b.read_input_group(vcf=line,
                                       ind=f'{line}.csi')
        merge_vcf_i += f'{input_vcf.vcf} '

    cmd = f'''
        bcftools concat \
            --no-version \
            --output-type {out_type} \
            --output {out_filename} \
            --threads {threads} \
            {merge_vcf_i}
    '''

    concat.command(cmd)
    # index the merged output
    concat.command(f'bcftools index --force {out_filename}')

    concat.command(f'mv {out_filename} {concat.ofile}')
    concat.command(f'mv {out_index_name} {concat.idx}')
    b.write_output(concat.ofile, f'{out_dir}/GWASpy/{vcf_basename}/Imputation/imputed_merged/{out_filename}')
    b.write_output(concat.idx, f'{out_dir}/GWASpy/{vcf_basename}/Imputation/imputed_merged/{out_index_name}')
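
A sketch of the merge step, reusing the Batch b from the sketch after Example #1 and assuming the imputed chunks (each with a .csi index next to it) were written by Example #1 to placeholder paths:

imputed_chunks = [
    'gs://my-bucket/results/GWASpy/Imputation/mydata/imputed_chunks/mydata.chr22.chunk1.imputed.bcf',
    'gs://my-bucket/results/GWASpy/Imputation/mydata/imputed_chunks/mydata.chr22.chunk2.imputed.bcf',
]

concat_vcfs(b=b,
            vcf_basename='mydata',
            vcfs_to_merge=imputed_chunks,
            output_type='bcf',
            chrom='chr22',
            out_dir='gs://my-bucket/results')
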
Example #3
def cram_to_bam(b: hb.batch.Batch,
                input_cram_file: str = None,
                ref_fasta: str = None,
                ref_dict: str = None,
                ref_ind: str = None,
                bam_out_name: str = None,
                memory: int = 15,
                samtools_image: str = None,
                out_dir: str = None):
    docker_image = samtools_image if samtools_image else 'gcr.io/genomics-tools/samtools'

    out_bam_name = bam_out_name + '.bam'

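    # assume the output BAM is roughly 2.5x the CRAM size (a CRAM is typically ~40% of the equivalent BAM)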
    output_bam_size: float = bytes_to_gb(input_cram_file) / 0.40
    ref_size: float = bytes_to_gb(ref_fasta) + bytes_to_gb(ref_ind)
    disk_size: int = round(
        bytes_to_gb(input_cram_file) + output_bam_size + ref_size) + 25

    job_memory = str(memory) + 'Gi'
    job_storage = str(disk_size) + 'Gi'

    crams_to_bams = b.new_job(name=out_bam_name)
    in_cram = b.read_input(input_cram_file)
    fasta = b.read_input_group(**{
        'fasta': ref_fasta,
        'fasta.fai': ref_ind,
        'dict': ref_dict
    })

    crams_to_bams.memory(job_memory)
    crams_to_bams.image(docker_image)
    crams_to_bams.storage(job_storage)
    crams_to_bams.command(
        f'samtools view -b -T {fasta.fasta} -o {out_bam_name} {in_cram}')
    crams_to_bams.command(f'samtools index {out_bam_name}')
    crams_to_bams.command(f'mv {out_bam_name} {crams_to_bams.bamout}')
    crams_to_bams.command(f'mv {out_bam_name}.bai {crams_to_bams.bamind}')
    b.write_output(crams_to_bams.bamout, f'{out_dir}/BAMS/{out_bam_name}')
    b.write_output(crams_to_bams.bamind, f'{out_dir}/BAMS/{out_bam_name}.bai')

    return crams_to_bams
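
Unlike the other examples, this function returns the job, so a caller can hang downstream work on it; a hypothetical sketch with placeholder paths:

bam_job = cram_to_bam(b=b,
                      input_cram_file='gs://my-bucket/crams/sample1.cram',
                      ref_fasta='gs://my-bucket/ref/Homo_sapiens_assembly38.fasta',
                      ref_dict='gs://my-bucket/ref/Homo_sapiens_assembly38.dict',
                      ref_ind='gs://my-bucket/ref/Homo_sapiens_assembly38.fasta.fai',
                      bam_out_name='sample1',
                      out_dir='gs://my-bucket/results')

downstream = b.new_job(name='use-bam')
downstream.depends_on(bam_job)  # runs only after the CRAM-to-BAM job finishes
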
Example #4
def concat_vcfs(b: hb.batch.Batch,
                vcf_basename: str = None,
                vcfs_to_merge: List = None,
                output_type: str = 'bcf',
                software: str = None,
                chrom: str = None,
                docker_img: str = 'docker.io/lindonkambule/gwaspy:v1',
                cpu: int = 8,
                out_dir: str = None):

    global index_cmd

    out_type = 'b' if output_type == 'bcf' else 'z'
    threads = cpu - 1
    vcfs_sizes_sum = 0
    merge_vcf_i = ''

    out_filename = f'{vcf_basename}.{chrom}.phased.{software}.bcf' if output_type == 'bcf' else \
        f'{vcf_basename}.{chrom}.phased.{software}.vcf.gz'
    out_index_name = f'{vcf_basename}.{chrom}.phased.{software}.bcf.csi' if output_type == 'bcf' else \
        f'{vcf_basename}.{chrom}.phased.{software}.vcf.gz.csi'

    for line in vcfs_to_merge:
        vcfs_sizes_sum += 1 + bytes_to_gb(line)

    mem = 'highmem' if vcfs_sizes_sum > 2 else 'standard'
    disk_size = 10 + vcfs_sizes_sum

    concat = b.new_job(name=f'concat-{vcf_basename}')
    concat.memory(mem)
    concat.storage(f'{disk_size}Gi')
    concat.image(docker_img)
    concat.cpu(cpu)

    for line in vcfs_to_merge:
        input_vcf = b.read_input_group(vcf=line, ind=f'{line}.csi')
        merge_vcf_i += f'{input_vcf.vcf} '

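    # --ligate joins the phased chunks at their overlapping sites into a single phased file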
    cmd = f'''
        bcftools concat \
            --no-version \
            --output-type {out_type} \
            --output {out_filename} \
            --threads {threads} \
            --ligate \
            {merge_vcf_i}
    '''

    concat.command(cmd)
    # index the merged output
    concat.command(f'bcftools index {out_filename}')

    concat.command(f'mv {out_filename} {concat.ofile}')
    concat.command(f'mv {out_index_name} {concat.idx}')
    b.write_output(
        concat.ofile,
        f'{out_dir}/GWASpy/{vcf_basename}/Phasing/phased_merged/{out_filename}'
    )
    b.write_output(
        concat.idx,
        f'{out_dir}/GWASpy/{vcf_basename}/Phasing/phased_merged/{out_index_name}'
    )
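
Compared with Example #2, this variant ligates phased chunks instead of simply concatenating imputed ones, switches to highmem memory only when the summed input size crosses a small threshold, and writes the merged result under the Phasing/phased_merged output tree.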