def step0_init_and_run_vep(hc, vds, args):
    """Pipeline step 0: read in the input callset, validate it, remap/subset
    samples, attach global metadata, optionally run VEP, and write the result
    out as a VDS (and optionally a VCF).

    Returns the (hc, vds) pair so later steps can continue from it. Skipped
    entirely when args.start_with_step indicates a later resume point.
    """
    if args.start_with_step > 0:
        return hc, vds

    logger.info(
        "\n\n=============================== pipeline - step 0 - run vep ===============================")

    # normalize once; also reused below to detect an in-place output path
    input_path = args.input_dataset.rstrip("/")

    vds = read_in_dataset(
        hc,
        input_path=input_path,
        dataset_type=args.dataset_type,
        filter_interval=args.filter_interval,
        skip_summary=False,
        num_partitions=args.cpu_limit,
        not_gatk_genotypes=args.not_gatk_genotypes,
    )
    validate_dataset(hc, vds, args)

    vds = remap_samples(hc, vds, args)
    vds = subset_samples(hc, vds, args)
    vds = add_global_metadata(vds, args)

    if not args.skip_vep:
        vds = run_vep(vds, genome_version=args.genome_version, block_size=args.vep_block_size)
        # record which gencode release the VEP annotations correspond to
        gencode_version = "19" if args.genome_version == "37" else "25"
        vds = vds.annotate_global_expr('global.gencodeVersion = "{}"'.format(gencode_version))

    # avoid clobbering the input when the output path is the input path itself
    if args.step0_output_vds != input_path and not args.skip_writing_intermediate_vds:
        write_vds(vds, args.step0_output_vds)

    if args.export_vcf:
        logger.info("Writing out to VCF...")
        vds.export_vcf(args.step0_output_vcf, overwrite=True)

    # step 0 finished, so, if an error occurs and it goes to retry, start with the next step
    args.start_with_step = 1

    return hc, vds
"Elasticsearch index name. If specified, only this index will be updated.") p.add_argument( "--dataset-path", help="(optional) Path of variant callset. If not specified, the original " "vcf/vds path from which the data was loaded will be used.") p.add_argument("--genome-version", help="Genome build: 37 or 38", choices=["37", "38"]) p.add_argument( "--all", help="Update all elasticsearch indices. This option is mutually-exclusive " "with --index-name, --dataset-path, and --genome-version.", action="store_true") args = p.parse_args() hc = create_hail_context() if args.download_latest_clinvar_vcf: for genome_version in ["37", "38"]: vds = download_and_import_latest_clinvar_vcf(hc, genome_version) write_vds(vds, CLINVAR_VDS_PATH.format(genome_version=genome_version)) if args.index_name and not args.all: update_dataset(hc, args.index_name, args) elif args.all: update_all_datasets(hc, args) else: p.exit("ERROR: must specify either --index-name or --all")
# Load the callset: read an existing VDS directly, otherwise import from VCF.
if args.dataset_path.endswith(".vds"):
    vds = hc.read(args.dataset_path)
else:
    vds = hc.import_vcf(args.dataset_path, force_bgz=True, min_partitions=10000)

# Choose the interval to keep: the whole requested chromosome, or a small
# build-specific default region on chrX.
if args.chrom:
    interval_string = '%s:1-500000000' % str(args.chrom)
elif args.genome_version == "37":
    interval_string = 'X:31224000-31228000'
elif args.genome_version == "38":
    interval_string = 'X:31205883-31209883'
else:
    p.error("Unexpected genome version: " + str(args.genome_version))
interval = hail.Interval.parse(interval_string)

vds = vds.filter_intervals(interval)

print("\n==> split_multi")
# preserve the original alt alleles before split_multi rewrites them
vds = vds.annotate_variants_expr("va.originalAltAlleles=%s" % get_expr_for_orig_alt_alleles_set())
vds = vds.split_multi()

print("")
pprint(vds.variant_schema)
print("\n==> summary: %s" % str(vds.summarize()))

write_vds(vds, output_path)
def step3_add_reference_datasets(hc, vds, args):
    """Pipeline step 3: annotate the callset with variant-level reference datasets
    (dbNSFP, CADD, 1kg, ExAC, TopMed, MPC, gnomAD, Eigen, PrimateAI, SpliceAI,
    clinvar, HGMD), each gated by its own --exclude-* flag.

    Returns the (possibly recreated) hail context and the annotated VDS.
    Skipped entirely when args.start_with_step/stop_after_step exclude step 3.
    """
    if args.start_with_step > 3 or args.stop_after_step < 3:
        return hc, vds

    logger.info(
        "\n\n=============================== pipeline - step 3 - add reference datasets ===============================")

    # Restart hail and re-read the step-1 output unless we're operating on an
    # in-memory vds carried over from the previous step.
    if vds is None or not args.skip_writing_intermediate_vds:
        stop_hail_context(hc)
        hc = create_hail_context()
        vds = read_in_dataset(hc, args.step1_output_vds, dataset_type=args.dataset_type, skip_summary=True)

    if not args.only_export_to_elasticsearch_at_the_end:
        vds = compute_minimal_schema(vds, args.dataset_type)

    if args.dataset_type == "VARIANTS":
        # annotate with the combined reference data file which was generated using
        # ../download_and_create_reference_datasets/v01/hail_scripts/combine_all_variant_level_reference_data.py
        # and contains all these annotations in one .vds
        if not (args.exclude_dbnsfp or args.exclude_cadd or args.exclude_1kg or args.exclude_exac or
                args.exclude_topmed or args.exclude_mpc or args.exclude_gnomad or args.exclude_eigen
                or args.exclude_primate_ai or args.exclude_splice_ai):

            logger.info("\n==> add combined variant-level reference data")
            vds = add_combined_reference_data_to_vds(hc, vds, args.genome_version, subset=args.filter_interval)

        else:
            # annotate with each reference data file - one-by-one
            if not args.skip_annotations and not args.exclude_dbnsfp:
                logger.info("\n==> add dbnsfp")
                vds = add_dbnsfp_to_vds(hc, vds, args.genome_version, root="va.dbnsfp", subset=args.filter_interval)

            if not args.skip_annotations and not args.exclude_cadd:
                logger.info("\n==> add cadd")
                vds = add_cadd_to_vds(hc, vds, args.genome_version, root="va.cadd", subset=args.filter_interval)

            if not args.skip_annotations and not args.exclude_1kg:
                logger.info("\n==> add 1kg")
                vds = add_1kg_phase3_to_vds(hc, vds, args.genome_version, root="va.g1k", subset=args.filter_interval)

            if not args.skip_annotations and not args.exclude_exac:
                logger.info("\n==> add exac")
                vds = add_exac_to_vds(hc, vds, args.genome_version, root="va.exac", subset=args.filter_interval)

            if not args.skip_annotations and not args.exclude_topmed:
                logger.info("\n==> add topmed")
                vds = add_topmed_to_vds(hc, vds, args.genome_version, root="va.topmed", subset=args.filter_interval)

            if not args.skip_annotations and not args.exclude_mpc:
                logger.info("\n==> add mpc")
                vds = add_mpc_to_vds(hc, vds, args.genome_version, root="va.mpc", subset=args.filter_interval)

            # gnomad exomes and genomes share one --exclude-gnomad flag
            if not args.skip_annotations and not args.exclude_gnomad:
                logger.info("\n==> add gnomad exomes")
                vds = add_gnomad_to_vds(hc, vds, args.genome_version, exomes_or_genomes="exomes", root="va.gnomad_exomes", subset=args.filter_interval)

            if not args.skip_annotations and not args.exclude_gnomad:
                logger.info("\n==> add gnomad genomes")
                vds = add_gnomad_to_vds(hc, vds, args.genome_version, exomes_or_genomes="genomes", root="va.gnomad_genomes", subset=args.filter_interval)

            if not args.skip_annotations and not args.exclude_eigen:
                logger.info("\n==> add eigen")
                vds = add_eigen_to_vds(hc, vds, args.genome_version, root="va.eigen", subset=args.filter_interval)

            # NOTE(review): unlike the other datasets above, primate_ai and
            # splice_ai are not gated on args.skip_annotations — confirm this
            # asymmetry is intentional.
            if not args.exclude_primate_ai:
                logger.info("\n==> add primate_ai")
                vds = add_primate_ai_to_vds(hc, vds, args.genome_version, root="va.primate_ai", subset=args.filter_interval)

            if not args.exclude_splice_ai:
                logger.info("\n==> add splice_ai")
                vds = add_splice_ai_to_vds(hc, vds, args.genome_version, root="va.splice_ai", subset=args.filter_interval)

        # clinvar/hgmd update frequently, so they are added even when the
        # combined reference data file was used above
        if not args.skip_annotations and not args.exclude_clinvar:
            logger.info("\n==> add clinvar")
            vds = add_clinvar_to_vds(hc, vds, args.genome_version, root="va.clinvar", subset=args.filter_interval)

        if not args.skip_annotations and not args.exclude_hgmd:
            logger.info("\n==> add hgmd")
            vds = add_hgmd_to_vds(hc, vds, args.genome_version, root="va.hgmd", subset=args.filter_interval)

    if not args.is_running_locally and not args.skip_writing_intermediate_vds:
        write_vds(vds, args.step3_output_vds)

    # step 3 finished, so, if an error occurs and it goes to retry, start with the next step
    args.start_with_step = 4

    return hc, vds
def step1_compute_derived_fields(hc, vds, args):
    """Pipeline step 1: compute derived variant annotations (ids, positions,
    sorted/worst VEP transcript consequences, gene id sets, FAF, ...) and then
    prune the variant schema down to the whitelisted top-level / info fields.

    Returns the (possibly recreated) hail context and the annotated VDS.
    Skipped entirely when args.start_with_step/stop_after_step exclude step 1.
    """
    if args.start_with_step > 1 or args.stop_after_step < 1:
        return hc, vds

    logger.info(
        "\n\n=============================== pipeline - step 1 - compute derived fields ===============================")

    # Restart hail and re-read the step-0 output unless an in-memory vds was
    # carried over from the previous step.
    if vds is None or not args.skip_writing_intermediate_vds:
        stop_hail_context(hc)
        hc = create_hail_context()
        vds = read_in_dataset(hc, args.step0_output_vds, dataset_type=args.dataset_type, skip_summary=True, num_partitions=args.cpu_limit)

    # These expressions are independent of each other, so they can be applied
    # in a single annotate_variants_expr() pass.
    parallel_computed_annotation_exprs = [
        "va.docId = %s" % get_expr_for_variant_id(512),
        "va.variantId = %s" % get_expr_for_variant_id(),
        "va.variantType= %s" % get_expr_for_variant_type(),
        "va.contig = %s" % get_expr_for_contig(),
        "va.pos = %s" % get_expr_for_start_pos(),
        "va.start = %s" % get_expr_for_start_pos(),
        "va.end = %s" % get_expr_for_end_pos(),
        "va.ref = %s" % get_expr_for_ref_allele(),
        "va.alt = %s" % get_expr_for_alt_allele(),
        "va.xpos = %s" % get_expr_for_xpos(pos_field="start"),
        "va.xstart = %s" % get_expr_for_xpos(pos_field="start"),
        "va.sortedTranscriptConsequences = %s" % get_expr_for_vep_sorted_transcript_consequences_array(
            vep_root="va.vep",
            include_coding_annotations=True,
            add_transcript_rank=bool(args.use_nested_objects_for_vep)),
    ]

    if args.dataset_type == "VARIANTS":
        FAF_CONFIDENCE_INTERVAL = 0.95  # based on https://macarthurlab.slack.com/archives/C027LHMPP/p1528132141000430

        parallel_computed_annotation_exprs += [
            "va.FAF = %s" % get_expr_for_filtering_allele_frequency("va.info.AC[va.aIndex - 1]", "va.info.AN", FAF_CONFIDENCE_INTERVAL),
        ]

    # These expressions read fields computed above (e.g. va.end,
    # va.sortedTranscriptConsequences), so each must be applied in its own
    # annotate_variants_expr() pass, in order.
    serial_computed_annotation_exprs = [
        "va.xstop = %s" % get_expr_for_xpos(field_prefix="va.", pos_field="end"),
        "va.transcriptIds = %s" % get_expr_for_vep_transcript_ids_set(
            vep_transcript_consequences_root="va.sortedTranscriptConsequences"),
        "va.domains = %s" % get_expr_for_vep_protein_domains_set(
            vep_transcript_consequences_root="va.sortedTranscriptConsequences"),
        "va.transcriptConsequenceTerms = %s" % get_expr_for_vep_consequence_terms_set(
            vep_transcript_consequences_root="va.sortedTranscriptConsequences"),
        "va.mainTranscript = %s" % get_expr_for_worst_transcript_consequence_annotations_struct(
            "va.sortedTranscriptConsequences"),
        "va.geneIds = %s" % get_expr_for_vep_gene_ids_set(
            vep_transcript_consequences_root="va.sortedTranscriptConsequences"),
        "va.codingGeneIds = %s" % get_expr_for_vep_gene_ids_set(
            vep_transcript_consequences_root="va.sortedTranscriptConsequences", only_coding_genes=True),
    ]

    # serial_computed_annotation_exprs += [
    #    "va.sortedTranscriptConsequences = va.sortedTranscriptConsequences.map(c => drop(c, amino_acids, biotype))"
    #]

    # When not exporting nested VEP objects, serialize the consequences array
    # to a JSON string.
    if not bool(args.use_nested_objects_for_vep):
        serial_computed_annotation_exprs += [
            "va.sortedTranscriptConsequences = json(va.sortedTranscriptConsequences)"
        ]

    vds = vds.annotate_variants_expr(parallel_computed_annotation_exprs)
    for expr in serial_computed_annotation_exprs:
        vds = vds.annotate_variants_expr(expr)

    pprint(vds.variant_schema)

    # Whitelist of fields to keep; entries prefixed with "---" appear to be
    # disabled/skipped by convert_vds_schema_string_to_annotate_variants_expr
    # — TODO confirm against that helper's implementation.
    INPUT_SCHEMA = {}
    if args.dataset_type == "VARIANTS":
        INPUT_SCHEMA["top_level_fields"] = """
            docId: String,
            variantId: String,
            originalAltAlleles: Set[String],
            contig: String,
            start: Int,
            pos: Int,
            end: Int,
            ref: String,
            alt: String,
            xpos: Long,
            xstart: Long,
            xstop: Long,
            rsid: String,
            --- qual: Double,
            filters: Set[String],
            aIndex: Int,
            geneIds: Set[String],
            transcriptIds: Set[String],
            codingGeneIds: Set[String],
            domains: Set[String],
            transcriptConsequenceTerms: Set[String],
            sortedTranscriptConsequences: String,
            mainTranscript: Struct,
        """

        # NOTE(review): both branches below currently declare identical
        # info_fields — confirm whether the not_gatk_genotypes variant was
        # meant to differ.
        if args.not_gatk_genotypes:
            INPUT_SCHEMA["info_fields"] = """
                AC: Array[Int],
                AF: Array[Double],
                AN: Int,
                --- BaseQRankSum: Double,
                --- ClippingRankSum: Double,
                --- DP: Int,
                --- FS: Double,
                --- InbreedingCoeff: Double,
                --- MQ: Double,
                --- MQRankSum: Double,
                --- QD: Double,
                --- ReadPosRankSum: Double,
                --- VQSLOD: Double,
                --- culprit: String,
            """
        else:
            INPUT_SCHEMA["info_fields"] = """
                AC: Array[Int],
                AF: Array[Double],
                AN: Int,
                --- BaseQRankSum: Double,
                --- ClippingRankSum: Double,
                --- DP: Int,
                --- FS: Double,
                --- InbreedingCoeff: Double,
                --- MQ: Double,
                --- MQRankSum: Double,
                --- QD: Double,
                --- ReadPosRankSum: Double,
                --- VQSLOD: Double,
                --- culprit: String,
            """
    elif args.dataset_type == "SV":
        INPUT_SCHEMA["top_level_fields"] = """
            docId: String,
            variantId: String,
            contig: String,
            start: Int,
            pos: Int,
            end: Int,
            ref: String,
            alt: String,
            xpos: Long,
            xstart: Long,
            xstop: Long,
            rsid: String,
            --- qual: Double,
            filters: Set[String],
            aIndex: Int,
            geneIds: Set[String],
            transcriptIds: Set[String],
            codingGeneIds: Set[String],
            domains: Set[String],
            transcriptConsequenceTerms: Set[String],
            sortedTranscriptConsequences: String,
            mainTranscript: Struct,
        """

        # END=100371979;SVTYPE=DEL;SVLEN=-70;CIGAR=1M70D GT:FT:GQ:PL:PR:SR
        INPUT_SCHEMA["info_fields"] = """
            IMPRECISE: Boolean,
            SVTYPE: String,
            SVLEN: Int,
            END: Int,
            --- OCC: Int,
            --- FRQ: Double,
        """
    else:
        raise ValueError("Unexpected dataset_type: %s" % args.dataset_type)

    if args.exclude_vcf_info_field:
        INPUT_SCHEMA["info_fields"] = ""

    # Copy the whitelisted fields under va.clean, then replace va with it.
    expr = convert_vds_schema_string_to_annotate_variants_expr(root="va.clean", **INPUT_SCHEMA)
    vds = vds.annotate_variants_expr(expr=expr)
    vds = vds.annotate_variants_expr("va = va.clean")

    if not args.skip_writing_intermediate_vds:
        write_vds(vds, args.step1_output_vds)

    # step 1 finished, so, if an error occurs and it goes to retry, start with the next step
    args.start_with_step = 2

    return hc, vds
        .format(**locals()) for subpop in subpoulations
        # NOTE(review): "subpoulations" looks like a typo for "subpopulations";
        # it is defined above, outside this view — confirm the actual name.
])

# AF_POPMAX_OR_GLOBAL = max of the global AF and every subpopulation AF
# (subpopulation_exprs is built by the comprehension above).
vds = vds.annotate_variants_expr(
    "va.info.AF_POPMAX_OR_GLOBAL = [ va.info.AF[va.aIndex-1], {subpopulation_exprs} ].max()"
    .format(**locals()))

# Copy the useful top-level and info fields under va.clean ...
top_fields_expr = convert_vds_schema_string_to_annotate_variants_expr(
    root="va.clean",
    other_source_fields=USEFUL_TOP_LEVEL_FIELDS,
    other_source_root="va",
)
info_fields_expr = convert_vds_schema_string_to_annotate_variants_expr(
    root="va.clean.info",
    other_source_fields=USEFUL_INFO_FIELDS,
    other_source_root="va.info",
)

# either expr may be empty (e.g. when a field list is blank), so only include
# the non-empty ones
expr = []
if top_fields_expr:
    expr.append(top_fields_expr)
if info_fields_expr:
    expr.append(info_fields_expr)
vds = vds.annotate_variants_expr(expr=expr)

# ... then replace va with the pruned copy
vds = vds.annotate_variants_expr("va = va.clean")

pprint(vds.variant_schema)

write_vds(vds, GNOMAD_SEQR_VDS_PATHS[label])
else:
    raise ValueError("Invalid genome_version: " + str(genome_version))

# Import the dbNSFP table for the selected version, drop/rename columns per
# the DBNSFP_FIELDS config, and key it by variant.
kt = (
    hail_context
    .import_table(
        DBNSFP_FIELDS[dbnsfp_version]["source_path"],
        types=DBNSFP_FIELDS[dbnsfp_version]["field_types"],
        missing='.',  # '.' marks missing values in the dbNSFP source files
        min_partitions=10000)
    .drop(DBNSFP_FIELDS[dbnsfp_version]["fields_to_drop"])
    .rename(DBNSFP_FIELDS[dbnsfp_version]["rename_fields"])
    .filter("ref==alt", keep=False)  # discard degenerate rows where ref equals alt
    .annotate("variant=Variant(chr, pos, ref, alt)")
    .key_by('variant')
    .drop(["chr", "pos", "ref", "alt"])  # now redundant with the variant key
)

# create sites-only VDS
dbnsfp_vds = VariantDataset.from_table(kt)

output_path = DBNSFP_FIELDS[dbnsfp_version]["output_path"]

# record provenance (source file and dbNSFP version) in the VDS globals
dbnsfp_vds = dbnsfp_vds.annotate_global_expr('global.sourceFilePath = "{}"'.format(DBNSFP_FIELDS[dbnsfp_version]["source_path"]))
dbnsfp_vds = dbnsfp_vds.annotate_global_expr('global.version = "{}"'.format(dbnsfp_version))

write_vds(dbnsfp_vds, output_path)

pprint(dbnsfp_vds.variant_schema)
# Make sure there are enough partitions for downstream parallelism.
if vds.num_partitions() < 50:
    print("Repartitioning")
    vds = vds.repartition(10000)

# save alt alleles before calling split_multi
vds = vds.annotate_variants_expr("va.originalAltAlleles=%s" % get_expr_for_orig_alt_alleles_set())
vds = vds.split_multi()
#vds = vds.filter_alleles('v.altAlleles[aIndex-1].isStar()', keep=False)

# Keep the whole genome by default, or just the user-requested subset.
filter_interval = args.subset if args.subset else "1-MT"
logger.info("\n==> set filter interval to: %s" % (filter_interval, ))
vds = vds.filter_intervals(hail.Interval.parse(filter_interval))

summary = vds.summarize()
pprint.pprint(summary)
if summary.variants == 0:
    p.error("0 variants in VDS. Make sure chromosome names don't contain 'chr'")

vds = run_vep(vds, genome_version=args.genome_version, block_size=args.block_size)

write_vds(vds, args.output_vds)
pprint.pprint(vds.variant_schema)