Python is_mnp 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: hail

메소드/함수: is_mnp

hotexamples.com의 예제 수: 2

Python is_mnp - 예제 2개를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 Python hail.is_mnp의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

예제 #1

파일 보기

파일: RF_ancestry_AKT_overlap_updated_labels_August2020.py 프로젝트: wtsi-hgi/exomeQC-hail-gnomad

    mt = mt.annotate_cols(superpopulation=cohorts_pop[mt.s].superpopulation)
    mt = mt.annotate_cols(gVCF=cohorts_pop[mt.s].gVCF_ID)
    # done the above on pca_RF jupyter notebook
    # mt = hl.read_matrix_table(
    #    f"{temp_dir}/ddd-elgh-ukbb/Sanger_cohorts_chr1-20-XY_new_cohorts.mt")
    #mt = hl.split_multi_hts(    mt, keep_star=False, left_aligned=False)
    mt.write(
        f"{tmp_dir}/ddd-elgh-ukbb/Sanger_chr1-20-XY_new_cohorts_split_multi_pops.mt",
        overwrite=True)
    # filter matrixtable
    logger.info("wrote mt ")
    # filter mt
    # mt = hl.read_matrix_table(
    #    f"{temp_dir}/ddd-elgh-ukbb/Sanger_cohorts_chr1-20-XY_new_cohorts_split_multi.mt")
    mt = mt.filter_rows(hl.is_snp(mt.alleles[0], mt.alleles[1]))
    mt = mt.filter_rows(~hl.is_mnp(mt.alleles[0], mt.alleles[1]))
    mt = mt.filter_rows(~hl.is_indel(mt.alleles[0], mt.alleles[1]))
    mt = mt.filter_rows(~hl.is_complex(mt.alleles[0], mt.alleles[1]))
    mt_vqc = hl.variant_qc(mt, name='variant_QC_Hail')
    # (mt_vqc.variant_QC_Hail.p_value_hwe >= 10 ** -6) & not to use this according to hcm.
    mt_vqc_filtered = mt_vqc.filter_rows(
        (mt_vqc.variant_QC_Hail.call_rate >= 0.99)
        & (mt_vqc.variant_QC_Hail.AF[1] >= 0.05)
        & (mt_vqc.variant_QC_Hail.AF[1] <= 0.95))
    mt_vqc_filtered = mt_vqc_filtered.filter_rows(hl.is_defined(
        bed_to_exclude_pca[mt_vqc_filtered.locus]),
                                                  keep=False)
    # overlap AKT dataset:

    mt_1kg_chr1_chr20 = hl.read_matrix_table(
        f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ancestry_work/1000g_chr1_20_AKT_overlap.mt"

예제 #2

파일 보기

파일: 3.population_PCA_prediction_io.py 프로젝트: wtsi-hgi/exomeQC-hail-gnomad

def main(args):
    """Run the ancestry PCA and population-assignment pipeline.

    Steps, in order:
      1. Annotate the samples of ``args.matrixtable`` with cohort and
         population labels and write the annotated matrix table.
      2. Keep SNP rows with call rate >= 0.99 and 0.05 <= AF[1] <= 0.95 that
         fall outside the excluded BED intervals and whose locus is present
         in the AKT overlap dataset.
      3. LD-prune, restrict to autosomes and write the pruned matrix table.
      4. Compute a 10-component HWE-normalised PCA and write the scores,
         loadings and eigenvalues.
      5. Assign populations from the PC scores with
         ``assign_population_pcs`` and write/export the result.

    Args:
        args: Object providing ``matrixtable`` (path of the input matrix
            table) and ``output_dir`` (prefix under which every output is
            written, inside a ``ddd-elgh-ukbb`` sub-directory).

    Note:
        ``hl``, ``logger``, ``assign_population_pcs`` and the input locations
        ``locations_exclude_from_pca``, ``cohorts_populations`` and
        ``AKT_overlap`` are not defined in this function; they are expected
        to be available at module level.
    """
    out_dir = f"{args.output_dir}/ddd-elgh-ukbb"

    # ---- inputs ---------------------------------------------------------
    bed_to_exclude_pca = hl.import_bed(locations_exclude_from_pca,
                                       reference_genome='GRCh38')
    cohorts_pop = hl.import_table(cohorts_populations,
                                  delimiter="\t").key_by('s')
    # Keyed by locus so it can be joined against the QC-filtered rows below.
    overlap_1kg_AKT = hl.import_matrix_table(AKT_overlap).key_rows_by("locus")

    # ---- annotate samples with cohort / population labels ---------------
    mt = hl.read_matrix_table(args.matrixtable)
    sample_info = cohorts_pop[mt.s]
    mt = mt.annotate_cols(
        cohort=sample_info.cohort,
        original_pop=sample_info.known_population,
        known_pop=sample_info.known_population_updated,
        gVCF=sample_info.gVCF_ID)
    annotated_path = f"{out_dir}/Sanger_chr1-20-XY_new_cohorts_split_multi_pops.mt"
    mt.write(annotated_path, overwrite=True)
    logger.info("wrote mt ")
    # Hail pipelines are lazy: continue from the table just written so the
    # annotation joins are not recomputed by every downstream action.
    mt = hl.read_matrix_table(annotated_path)

    # ---- variant filtering ----------------------------------------------
    # Keep rows whose first two alleles form a SNP.  The negated predicates
    # are retained from the original pipeline as an explicit guard against
    # MNPs, indels and complex variants.
    ref, alt = mt.alleles[0], mt.alleles[1]
    mt = mt.filter_rows(
        hl.is_snp(ref, alt)
        & ~hl.is_mnp(ref, alt)
        & ~hl.is_indel(ref, alt)
        & ~hl.is_complex(ref, alt))

    mt_vqc = hl.variant_qc(mt, name='variant_QC_Hail')
    vqc = mt_vqc.variant_QC_Hail
    # NOTE: an HWE filter (p_value_hwe >= 10 ** -6) is deliberately not
    # applied here ("not to use this according to hcm").
    mt_vqc_filtered = mt_vqc.filter_rows(
        (vqc.call_rate >= 0.99)
        & (vqc.AF[1] >= 0.05)
        & (vqc.AF[1] <= 0.95))
    # Drop every locus covered by the exclusion BED.
    mt_vqc_filtered = mt_vqc_filtered.filter_rows(
        hl.is_defined(bed_to_exclude_pca[mt_vqc_filtered.locus]),
        keep=False)
    # Keep only loci that are also present in the AKT overlap dataset.
    mt_vqc_filtered = mt_vqc_filtered.filter_rows(
        hl.is_defined(overlap_1kg_AKT.rows()[mt_vqc_filtered.locus]))
    logger.info("done filtering writing mt")

    # ---- LD pruning, autosomes only -------------------------------------
    pruned_ht = hl.ld_prune(mt_vqc_filtered.GT, r2=0.2, bp_window_size=500000)
    pruned_mt = mt_vqc_filtered.filter_rows(
        hl.is_defined(pruned_ht[mt_vqc_filtered.row_key]))
    pruned_mt = pruned_mt.filter_rows(pruned_mt.locus.in_autosome())

    pruned_path = f"{out_dir}/chr1_chr20_ldpruned_updated.mt"
    pruned_mt.write(pruned_path, overwrite=True)
    # Read the pruned table back so the PCA below works from the stored
    # result instead of re-running the QC and filtering steps above.
    pruned_mt = hl.read_matrix_table(pruned_path)

    # ---- PCA ------------------------------------------------------------
    logger.info("run_pca_with_relateds")
    pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
        pruned_mt.GT, k=10, compute_loadings=True)
    pca_scores = pca_scores.annotate(
        known_pop=pruned_mt.cols()[pca_scores.s].known_pop)
    pca_scores.write(f"{out_dir}/pca_scores_after_pruning.ht",
                     overwrite=True)
    pca_loadings.write(f"{out_dir}/pca_loadings_after_pruning.ht",
                       overwrite=True)
    with open(f"{out_dir}/pca_evals_after_pruning.txt", 'w') as f:
        # One eigenvalue per line; writing them back to back with no
        # separator would make the file impossible to parse.
        f.writelines(f"{val}\n" for val in pca_evals)

    # ---- population assignment ------------------------------------------
    logger.info("assign population pcs")
    pop_ht, pop_clf = assign_population_pcs(pca_scores,
                                            pca_scores.scores,
                                            known_col="known_pop",
                                            n_estimators=100,
                                            prop_train=0.8,
                                            min_prob=0.5)
    pop_ht.write(f"{out_dir}/pop_assignments_updated.ht", overwrite=True)
    pop_ht.export(f"{out_dir}/pop_assignments_updated.tsv.gz")