Пример #1
0
def move_file_to(f_in: str, f_out: str, cd: str):
    """Re-write ``f_in`` as a bgzipped file ``f_out`` with bcftools and index it.

    The ``bcftools view -Oz -o`` command is executed through the shell with
    ``cd`` as its working directory, so relative ``f_in``/``f_out`` paths are
    resolved against ``cd``.

    :param f_in: input file handed to ``bcftools view``
    :param f_out: output file name given to ``-o``
    :param cd: path to out directory (working directory of the command)
    :return: whatever ``check_file_creation`` reports for ``f_out`` in ``cd``
    """
    cmd = ' '.join(['bcftools view', '-Oz -o', f_out, f_in])
    print(cmd)
    subprocess.run(cmd, shell=True, cwd=cd)
    pybcf.index(f_out, cd)

    # The output is produced relative to `cd`, not to the interpreter's
    # current directory, so its creation has to be verified in `cd`.
    return check_file_creation(cd, f_out)
Пример #2
0
 def split_file(self, base_name_out: str):
     """Write reference and target populations to files.

     For each population the output name is ``<prefix>.<base_name_out>``;
     the file is produced by ``pybcf.sampling`` from ``self.filein`` and then
     passed to ``pybcf.index``, both with ``self.wd`` as working directory.

     :param base_name_out: common suffix of the two output file names
     """
     self._split_samples()

     # Reference panel first ...
     ref_file = self.ref_prefix + '.' + base_name_out
     pybcf.sampling(self.filein, ref_file, self.ref_pan, wd=self.wd)
     pybcf.index(ref_file, wd=self.wd)

     # ... then the study population
     study_file = self.stu_prefix + '.' + base_name_out
     pybcf.sampling(self.filein, study_file, self.stu_pop, wd=self.wd)
     pybcf.index(study_file, wd=self.wd)
Пример #3
0
def partition_imp(tpl: Tuple[str, Dict[str, str]],
                  total_ref: bool = False) -> None:
    """Sample the file to impute out of ``dic['gz']`` and index it.

    :param tpl: pair ``(folder, dic)``; ``folder`` is the working directory
        and ``dic`` maps the keys ``'gz'`` and ``'imp'`` to file names.
        ``dic`` must also expose a ``name`` attribute (read only when
        ``total_ref`` is true), so it is not a plain ``dict``.
    :param total_ref: when true and ``dic.name`` is not ``'raw'``, the samples
        of the sampled file are renamed with ``pybcf.rename_samples`` (called
        with ``'_IMP'``) and the result is bgzipped again.
    :return: None
    """
    print('\n\nREF/IMP SAMPLING'.ljust(80, '.'))
    folder, dic = tpl
    imp_file = dic['imp']

    # Drop any previous output and its index before sampling again
    delete_file(os.path.join(folder, imp_file))
    delete_file(os.path.join(folder, imp_file + '.csi'))
    pybcf.sampling(dic['gz'], imp_file,
                   '{}/ALL.chr20.snps.impID.txt'.format(prm.WD + '/gt'),
                   folder)

    if total_ref and dic.name != 'raw':
        # Name without its last 3 characters (presumably the '.gz' suffix)
        uncompressed = imp_file[:-3]
        pybcf.rename_samples(imp_file, uncompressed, folder, '_IMP')
        pybcf.bgzip(uncompressed, imp_file, folder)
        # The intermediate file lives in `folder` like every other file
        # handled here, so it must be removed from there and not from the
        # interpreter's current directory.
        delete_file(os.path.join(folder, uncompressed))

    pybcf.index(imp_file, folder)
Пример #4
0
def all_snps_all_samples(dic: Dict[str, str], cd: str) -> str:
    """Copy ``dic['imp']`` into ``<cd>/all_snps_all_samples`` and index it.

    The copy is made with ``bcftools view -Oz -o`` executed in ``cd``.

    :param dic: mapping providing the file name under the key ``'imp'``
    :param cd: directory containing ``dic['imp']``
    :return: path of the output directory
    """
    path_out = os.path.join(cd, 'all_snps_all_samples')
    # Create the directory where the output is actually written: a path
    # relative to the interpreter's cwd only matches when cwd == cd.
    mkdir(path_out)
    cmd = ' '.join([
        'bcftools view',
        '-Oz -o',
        os.path.join(path_out, dic['imp']),
        dic['imp']
    ])

    subprocess.run(cmd, shell=True, cwd=cd)

    pybcf.index(dic['imp'], path_out)

    return path_out
Пример #5
0
def keep_single_sample(dic: Dict[str, str], cd: str, sample_name: str) -> str:
    """Extract one sample of ``dic['imp']`` into ``<cd>/keeponly_<sample_name>``.

    The subset is made with ``bcftools view -s <sample_name> -Oz -o`` executed
    in ``cd``; the output keeps the file name ``dic['imp']`` and is indexed.

    :param dic: mapping providing the file name under the key ``'imp'``
    :param cd: directory containing ``dic['imp']``
    :param sample_name: sample passed to the ``-s`` option
    :return: path of the output directory
    """
    path_out = os.path.join(cd, 'keeponly_{}'.format(sample_name))
    # Create the directory where the output is actually written: a path
    # relative to the interpreter's cwd only matches when cwd == cd.
    mkdir(path_out)
    cmd = ' '.join([
        'bcftools view -s {}'.format(sample_name),
        '-Oz -o',
        os.path.join(path_out, dic['imp']),
        dic['imp']
    ])

    subprocess.run(cmd, shell=True, cwd=cd)

    pybcf.index(dic['imp'], path_out)

    return path_out
Пример #6
0
def switch_off_markers(dic: Dict[str, str], cd: str, rm_snp: str) -> str:
    """Remove one marker from ``dic['imp']`` to simulate it as ungenotyped.

    The filtered file is written to ``<cd>/rm_20:<rm_snp>`` with
    ``bcftools view -e POS=<rm_snp> -Oz -o`` executed in ``cd``, then indexed.

    :param dic: mapping providing the file name under the key ``'imp'``
    :param cd: directory containing ``dic['imp']``
    :param rm_snp: position used in the exclusion expression ``POS=<rm_snp>``
    :return: path of the output directory
    """
    path_out = os.path.join(cd, 'rm_20:{}'.format(rm_snp))
    # Create the directory where the output is actually written: a path
    # relative to the interpreter's cwd only matches when cwd == cd.
    mkdir(path_out)
    cmd = ' '.join([
        'bcftools view -e POS={}'.format(rm_snp),
        '-Oz -o',
        os.path.join(path_out, dic['imp']),
        dic['imp']
    ])

    subprocess.run(cmd, shell=True, cwd=cd)

    pybcf.index(dic['imp'], path_out)

    return path_out
Пример #7
0
 def write(self) -> None:
     """Write the pooling simulation result to a bgzipped, indexed file.

     The header of ``self.vcf_in`` is first written to ``self.path_out``
     through ``pysam.VariantFile``, the lines in ``self.data`` are appended
     as plain text, then the file is compressed to ``self.path_out + '.gz'``,
     indexed, and the uncompressed text file is removed.
     """
     # Prepare header and records (presumably they refresh the header of
     # self.vcf_in and self.data -- confirm in the helpers)
     self._new_header()
     self._new_data()

     # Step 1: text-like file containing the header only
     meta_msg = '\r\nWriting metadata in {}'.format(self.path_out)
     print(meta_msg.ljust(80, '.'))
     header_writer = pysam.VariantFile(self.path_out,
                                       'w',
                                       header=self.vcf_in.header)
     header_writer.close()

     # Step 2: append the data lines below the header
     data_msg = '\r\nWriting data in {}'.format(self.path_out)
     print(data_msg.ljust(80, '.'))
     with open(self.path_out, 'a') as text_out:
         text_out.writelines(self.data)
     done_msg = 'Writing data in {}: Done'.format(self.path_out)
     print(done_msg.rjust(80, '.'))

     # Step 3: compress, index, and drop the uncompressed file
     compressed = self.path_out + '.gz'
     pybcf.bgzip(self.path_out, compressed, wd=self.wd)
     pybcf.index(compressed, wd=self.wd)
     os.remove(self.path_out)
Пример #8
0
def merge_files(pattern: str, f_out: str, cd: str):
    """Merge the files matching ``pattern`` into ``f_out`` with bcftools.

    The process changes its current directory to
    ``<cd>/single_samples_merged`` (created through ``mkdir``) and stays
    there; ``bcftools merge`` is run through the shell in that directory and
    the result is indexed.

    :param pattern: input file(s) inserted verbatim in the shell command
    :param f_out: name of the merged output file
    :param cd: parent directory of ``single_samples_merged``
    :return: result of ``check_file_creation`` for ``f_out`` in the current
        directory
    """
    merged_dir = os.path.join(cd, 'single_samples_merged')
    mkdir(merged_dir)
    os.chdir(merged_dir)

    merge_cmd = ' '.join(('bcftools merge', pattern, '-Oz -o', f_out))
    print(merge_cmd)
    subprocess.run(merge_cmd, shell=True, cwd=os.getcwd())
    pybcf.index(f_out, os.getcwd())

    return check_file_creation(os.getcwd(), f_out)
Пример #9
0
def beagle_imputing(dic_study: dict, dicref: dict, cd: str) -> bool:
    """Run the second Beagle round (imputation) on the study file.

    The process changes its current directory to ``cd``, removes a previous
    ``<b2>.vcf.gz``, runs Beagle with ``impute=true`` on ``<cfgt>.vcf.gz``
    against the reference ``<b1r>.vcf.gz`` and indexes the output.

    :param dic_study: mapping providing the keys ``'b2'`` and ``'cfgt'``
    :param dicref: mapping providing the key ``'b1r'``
    :param cd: working directory
    :return: result of ``check_file_creation`` for ``<b2>.vcf.gz`` in ``cd``
    """
    print('\n\nBEAGLE (ROUND#2)'.ljust(80, '.'))
    os.chdir(cd)
    imputed_gz = dic_study['b2'] + '.vcf.gz'
    delete_file(imputed_gz)

    # -Xss5m option: fix StackOverFlow error of Java
    java_call = 'java -Xss5m -jar {}'.format(prm.BEAGLE_JAR)
    beagle_args = [
        java_call,
        'gt=' + dic_study['cfgt'] + '.vcf.gz',
        'ref={}'.format(os.path.join(cd, dicref['b1r'] + '.vcf.gz')),
        'impute=true',
        'gprobs=true',
        'out=' + dic_study['b2'],
    ]

    subprocess.run(' '.join(beagle_args), shell=True, cwd=cd)
    pybcf.index(imputed_gz, cd)

    return check_file_creation(cd, imputed_gz)
Пример #10
0
def conform_gt(dic: dict, dicraw: dict, cd: str) -> bool:
    """Run conform-gt on the phased file, falling back to a plain copy.

    The process changes its current directory to ``cd``. If the tool does not
    produce ``<cfgt>.vcf.gz``, the phased file ``<b1>.vcf.gz`` is copied to
    that name instead. The resulting file is then indexed.

    :param dic: mapping providing the keys ``'b1'`` and ``'cfgt'``
    :param dicraw: mapping providing the key ``'b1r'`` (reference file)
    :param cd: working directory
    :return: result of ``check_file_creation`` for ``<cfgt>.vcf.gz`` in ``cd``
    """
    print('\n\nCONFORM-GT'.ljust(80, '.'))
    # GT for reference files, even when working with GLs
    os.chdir(cd)
    print(os.getcwd())
    conformed_gz = dic['cfgt'] + '.vcf.gz'
    delete_file(conformed_gz)

    cfgt = ' '.join([
        'java -jar {}'.format(prm.CFGT_JAR),
        '{}='.format('gt') + dic['b1'] + '.vcf.gz', 'chrom=20:60343-62965354',
        'ref={}'.format(os.path.join(cd, dicraw['b1r'] + '.vcf.gz')),
        'out=' + dic['cfgt']
    ])
    subprocess.run(cfgt, shell=True, cwd=cd)
    # Explicit check instead of assert/except AssertionError: assertions are
    # removed under `python -O`, which would silently disable the fallback.
    if not os.path.exists(conformed_gz):
        # if duplicated markers, just copy phased file
        shutil.copy(dic['b1'] + '.vcf.gz', conformed_gz)
    pybcf.index(conformed_gz, cd)

    return check_file_creation(cd, conformed_gz)
Пример #11
0
def bgzip_working_files(dic: Dict[str, str], path_gt_files: str,
                        path_gl_files: str, cd: str) -> None:
    """
    Bgzip, sort and index files created from pooling-decoding (text files vcf formatted).

    The GT variant of the files (names with '.gl' replaced by '.gt') is always
    processed in ``path_gt_files``; the GL files are processed in ``cd`` only
    when ``prm.GTGL == 'GL'``.
    :param dic: mapping providing the file names under the keys 'vcf' and 'gz'
    :param path_gt_files: directory of the GT files
    :param path_gl_files: directory of the uncompressed GL file
    :param cd: working directory used for the GL files
    :return: None
    """
    print('\n\nBGZIP in {}'.format(path_gt_files).ljust(80, '.'))
    # GT format is handled in any case: enables to start imputing with GT or GL indifferently
    print('{} compressed to {}'.format(os.path.join(path_gt_files, dic['vcf']),
                                       dic['gz']))

    gt_vcf = dic['vcf'].replace('.gl', '.gt')
    gt_gz = dic['gz'].replace('.gl', '.gt')
    # bgzip the file in the corresponding GT folder for missing values
    pybcf.bgzip(os.path.join(path_gt_files, gt_vcf),
                os.path.join(path_gt_files, gt_gz),
                path_gt_files)
    # Remove the stale index to avoid the warning
    # "[W::hts_idx_load2] The index file is older than the data file" when sorting
    delete_file(os.path.join(path_gt_files, gt_gz + '.csi'))
    pybcf.sort(gt_gz, path_gt_files)
    pybcf.index(gt_gz, path_gt_files)

    if prm.GTGL != 'GL':
        return

    print('\n\nBGZIP in {}'.format(os.getcwd()).ljust(80, '.'))
    gl_vcf = os.path.join(path_gl_files, dic['vcf'])
    gl_gz = dic['gz']
    print('{} compressed to {}'.format(gl_vcf, gl_gz))
    # bgzip the file in the corresponding GL folder for unknown genotypes
    pybcf.bgzip(gl_vcf, gl_gz, cd)
    delete_file(gl_gz + '.csi')
    pybcf.sort(gl_gz, cd)
    pybcf.index(gl_gz, cd)
Пример #12
0
def reformat_fields(dic_study: dict, cd: str) -> bool:
    """Rewrite the DS/GP header declarations and derive a FORMAT-stripped file.

    The process changes its current directory to ``cd``. First
    ``<b2>.vcf.gz`` is piped through two ``sed`` substitutions that redeclare
    the DS and GP FORMAT fields as ``Number=1``/``Number=3`` of type String,
    giving ``<corr>.vcf.gz``. Then ``bcftools annotate -x 'FORMAT'`` is run
    on that file and its output is bgzipped into ``<gtonly>.vcf.gz``.
    Both outputs are indexed.

    :param dic_study: mapping providing the keys ``'b2'``, ``'corr'`` and
        ``'gtonly'``
    :param cd: working directory
    :return: result of ``check_file_creation`` for ``<gtonly>.vcf.gz``
    """
    print('\n\nREFORMATTING GP AND DS FIELDS'.ljust(80, '.'))
    os.chdir(cd)
    corrected_gz = dic_study['corr'] + '.vcf.gz'
    delete_file(corrected_gz)

    # Stage 1: patch the header declarations of DS and GP
    header_pipeline = (
        "bcftools view {}.vcf.gz".format(dic_study['b2']),
        "| sed 's/##FORMAT=<ID=DS,Number=A,Type=Float/##FORMAT=<ID=DS,Number=1,Type=String/'",
        "| sed 's/##FORMAT=<ID=GP,Number=G,Type=Float/##FORMAT=<ID=GP,Number=3,Type=String/'",
        "| bcftools view -Oz -o {}.vcf.gz".format(dic_study['corr']),
    )
    subprocess.run(' '.join(header_pipeline), shell=True, cwd=cd)
    pybcf.index(corrected_gz, cd)

    # Stage 2: strip the FORMAT annotations from the corrected file
    gtonly_gz = dic_study['gtonly'] + '.vcf.gz'
    strip_pipeline = (
        "bcftools annotate -x 'FORMAT'",
        corrected_gz,
        "| bgzip >",
        gtonly_gz,
    )
    subprocess.run(' '.join(strip_pipeline), shell=True, cwd=cd)
    pybcf.index(gtonly_gz, cd)

    return check_file_creation(cd, gtonly_gz)
Пример #13
0
def concat_files(flist: list, f_out: str, dic: dict, cd: str):
    """Extract one marker per input file, then concatenate them with the rest.

    For every path in ``flist`` a ``20:<digits>`` position is read from the
    path itself and a single-marker file ``marker_<pos>.vcf.gz`` is extracted
    from ``<gtonly>.vcf.gz`` in the directory of that path. A core file
    without those positions is then written in ``<cd>/ko_markers_merged`` and
    concatenated with the files of ``flist`` into ``f_out``.

    The process's current directory is changed several times and is left at
    ``<cd>/ko_markers_merged`` on return.

    :param flist: file paths, each expected to contain ``20:<position>``
    :param f_out: name of the concatenated output file
    :param dic: mapping providing the key ``'gtonly'``
    :param cd: base directory against which the paths are resolved
    :return: result of ``check_file_creation`` for ``f_out`` in the current
        directory
    """
    # Matches "20:" followed by digits
    rechr20pos = re.compile(r'(20\:)(\d+)')
    poslist: list = []
    for f in flist:
        dirpath = os.path.dirname(f)
        os.chdir(os.path.join(cd, dirpath))
        # [0] is the whole match, e.g. "20:12345"; a path without such a
        # pattern makes re.search return None and this line raise TypeError
        pos = re.search(rechr20pos, f)[0]
        poslist.append(pos)

        delete_file(f + '.csi')
        variant = ' '.join([
            'bcftools view',
            '-i POS={}'.format(pos[3:]),  # trim 20: from the regex
            '-Oz -o',
            'marker_{}.vcf.gz'.format(pos),
            dic['gtonly'] + '.vcf.gz'
        ])
        print(variant)
        subprocess.run(variant, shell=True, cwd=os.getcwd())
        # NOTE(review): this indexes `f`, not the marker_<pos>.vcf.gz written
        # just above, and `f` is resolved from a directory that already
        # includes its dirname -- confirm this is intended
        pybcf.index(f, os.getcwd())

    mkdir(os.path.join(cd, 'ko_markers_merged'))
    os.chdir(os.path.join(cd, 'ko_markers_merged'))

    # Core file: the positions collected above are passed to -t with a '^'
    # prefix. NOTE(review): the input path is built from the dirname of
    # flist[0] and resolved from ko_markers_merged -- verify against callers
    f_init = ' '.join([
        'bcftools view', '-t ^' + ','.join(poslist), '-Oz -o',
        'ko_markers_core.vcf.gz',
        os.path.join(os.path.dirname(flist[0]), dic['gtonly'] + '.vcf.gz')
    ])
    print(f_init)
    subprocess.run(f_init, shell=True, cwd=os.getcwd())
    pybcf.index('ko_markers_core.vcf.gz', os.getcwd())

    # Concatenate the core file with every file of flist
    files = ' '.join(flist)
    cmd = ' '.join([
        'bcftools concat -a', '-Oz -o', f_out, 'ko_markers_core.vcf.gz', files
    ])
    print(cmd)
    subprocess.run(cmd, shell=True, cwd=os.getcwd())
    pybcf.index(f_out, os.getcwd())

    return check_file_creation(os.getcwd(), f_out)
Пример #14
0
def beagle_phasing(dic: dict, path_gt_files: str, cd: str) -> None:
    """Run the first Beagle round (``impute=false``) and index its output.

    The process changes its current directory to ``cd``.

    * ``dic.name == 'raw'``: Beagle is run on both the file to impute and the
      reference file (GT versions located in ``path_gt_files``), producing
      ``<b1i>.vcf.gz`` and ``<b1r>.vcf.gz``.
    * otherwise: when ``prm.GTGL == 'GL'`` a first run with ``gtgl=`` writes
      ``temp.b1.vcf.gz``, which feeds the ``gt=`` run; else the ``gt=`` run
      reads ``dic['imp']`` directly. The result is ``<b1>.vcf.gz``.

    :param dic: mapping of file names (keys ``'imp'``, ``'ref'``, ``'b1'``,
        ``'b1i'``, ``'b1r'`` depending on the branch) exposing a ``name``
        attribute
    :param path_gt_files: directory of the GT files used in the raw branch
    :param cd: working directory
    :return: None
    """
    print('\n\nBEAGLE ROUND#1'.ljust(80, '.'))
    os.chdir(cd)
    print('Directory: ', cd)

    if dic.name == 'raw':
        delete_file(dic['b1r'] + '.vcf.gz')
        delete_file(dic['b1i'] + '.vcf.gz')

        bgl1gt = ' '.join([
            'java -Xss5m -jar {}'.format(prm.BEAGLE_JAR),
            # -Xmx4000m replaced by -Xss5m because of Exception in thread "main"
            # java.lang.StackOverflowError at dag.MergeableDag.similar(MergeableDag.java:374)
            '{}='.format('gt') +
            os.path.join(path_gt_files, dic['imp'].replace('.gl', '.gt')),
            'impute=false',
            'gprobs=true',
            'out=' + dic['b1i'],
            # The first run is sent to the background so both runs overlap
            '&',
            'java -Xss5m -jar {}'.format(prm.BEAGLE_JAR),
            '{}='.format('gt') +
            os.path.join(path_gt_files, dic['ref'].replace('.gl', '.gt')),
            'impute=false',
            'gprobs=true',
            'out=' + dic['b1r'],
            # Without an explicit wait the shell exits as soon as the second
            # (foreground) run ends, and the backgrounded output could be
            # indexed below while it is still being written.
            ';',
            'wait',
        ])

        subprocess.run(bgl1gt, shell=True, cwd=cd)
        pybcf.index(dic['b1r'] + '.vcf.gz', cd)
        pybcf.index(dic['b1i'] + '.vcf.gz', cd)
        delete_file('temp.' + dic['imp'])
        delete_file('temp.' + dic['ref'])

    else:  # dic.name == 'pooled' or 'missing'
        delete_file(dic['b1'] + '.vcf.gz')

        # GL -> GT conversion run, only executed when prm.GTGL == 'GL'
        bgl1gtgl = ' '.join([
            'java -Xss5m -jar {}'.format(prm.BEAGLE_JAR),
            '{}='.format('gtgl') + dic['imp'],
            'impute=false',
            'gprobs=true',
            'out=' + 'temp.b1',
        ])

        # GT run: reads the temporary file of the GL run when there is one
        bgl1gt = ' '.join([
            'java -Xss5m -jar {}'.format(prm.BEAGLE_JAR),
            '{}='.format('gt') +
            '{}'.format('temp.b1.vcf.gz' if prm.GTGL == 'GL' else dic['imp']),
            'impute=false',
            'gprobs=true',
            'out=' + dic['b1'],
        ])

        if prm.GTGL == 'GL':
            subprocess.run(bgl1gtgl, shell=True, cwd=cd)
        subprocess.run(bgl1gt, shell=True, cwd=cd)
        pybcf.index(dic['b1'] + '.vcf.gz', cd)
        delete_file('temp.b1' + '.vcf.gz')
Пример #15
0
def partition_ref(dic: dict, path: str) -> None:
    """Sample the reference file out of the GT version of ``dic['gz']``.

    Both file names are taken from ``dic`` with '.gl' replaced by '.gt'; the
    sample list is ``ALL.chr20.snps.refID.txt`` under ``prm.WD + '/gt'``.
    The sampled file is then indexed.

    :param dic: mapping providing the keys ``'gz'`` and ``'ref'``
    :param path: working directory passed to the ``pybcf`` calls
    :return: None
    """
    source_gz = dic['gz'].replace('.gl', '.gt')
    reference_gz = dic['ref'].replace('.gl', '.gt')
    ids_file = '{}/ALL.chr20.snps.refID.txt'.format(prm.WD + '/gt')

    pybcf.sampling(source_gz, reference_gz, ids_file, path)
    pybcf.index(reference_gz, path)