# Read the input VDS and add per-variant id / coordinate / allele annotations.

# Strips every ".vds" occurrence from the path, not only a trailing suffix.
# NOTE(review): confirm no input path contains ".vds" in a parent directory name.
input_vds_path_prefix = input_vds_path.replace(".vds", "")

logger.info("\n==> create HailContext")
hc = hail.HailContext(log="/hail.log")  # log path handed to HailContext

logger.info("\n==> import vds: " + input_vds_path)
vds = hc.read(input_vds_path)

# Every expression in this list is handed to annotate_variants_expr in a
# single call (see below).
# NOTE(review): the "parallel" name suggests the expressions must not depend
# on one another's outputs -- confirm against the Hail version in use.
parallel_computed_annotation_exprs = [
    "va.variantId = %s" % get_expr_for_variant_id(),
    "va.contig = %s" % get_expr_for_contig(),
    "va.start = %s" % get_expr_for_start_pos(),
    "va.pos = %s" % get_expr_for_start_pos(),  # same expression as va.start
    "va.end = %s" % get_expr_for_end_pos(),
    "va.ref = %s" % get_expr_for_ref_allele(),
    "va.alt = %s" % get_expr_for_alt_allele(),
    "va.xpos = %s" % get_expr_for_xpos(pos_field="start"),
    "va.xstart = %s" % get_expr_for_xpos(pos_field="start"),  # same expression as va.xpos
]

# Applied in a second annotate_variants_expr call, after the list above.
# va.xstop is built with field_prefix="va." and pos_field="end".
# NOTE(review): presumably this reads the va.end written by the first call,
# which would be why it cannot share that call -- confirm in get_expr_for_xpos.
serial_computed_annotation_exprs = [
    "va.xstop = %s" % get_expr_for_xpos(field_prefix="va.", pos_field="end"),
]

# Order matters: the first call must complete before the second is applied.
vds = vds.annotate_variants_expr(parallel_computed_annotation_exprs)
vds = vds.annotate_variants_expr(serial_computed_annotation_exprs)

# apply schema to dataset
INPUT_SCHEMA = {
obs_exp: Double, """

# Set va.originalAltAlleles from the expression string returned by
# get_expr_for_orig_alt_alleles_set().
vds = vds.annotate_variants_expr("va.originalAltAlleles=%s" % get_expr_for_orig_alt_alleles_set())

# NOTE(review): presumably adds the MPC_INFO_FIELDS under va.info for the
# selected genome version -- confirm in add_mpc_to_vds.
vds = add_mpc_to_vds(hc, vds, args.genome_version, root="va.info", info_fields=MPC_INFO_FIELDS)

# Print the variant schema as it stands after the two steps above.
pprint(vds.variant_schema)

# Each expression gets its own annotate_variants_expr call, in list order.
for expr in vds_computed_annotations_exprs:
    vds = vds.annotate_variants_expr(expr)

# Turn GNOMAD_SCHEMA into the variant-expression argument for make_table,
# then build the key table; the second make_table argument is an empty list.
kt_variant_expr = convert_vds_schema_string_to_vds_make_table_arg(**GNOMAD_SCHEMA)

# print kt_variant_expr

kt = vds.make_table(kt_variant_expr, [])

# pprint(kt.schema)

# Add position columns to the key table. field_prefix="" is passed to the
# helpers here, whereas no "va." prefix appears in the expressions below.
# NOTE(review): presumably the expressions then reference table columns by
# their bare names -- confirm in the get_expr_for_* helpers.
kt = kt.annotate("pos = start")
kt = kt.annotate("stop = %s" % get_expr_for_end_pos(field_prefix="", pos_field="start", ref_field="ref"))
kt = kt.annotate("xpos = %s" % get_expr_for_xpos(field_prefix="", pos_field="start"))
kt = kt.annotate("xstart = %s" % get_expr_for_xpos(field_prefix="", pos_field="start"))
# pos_field="stop" names the column added by the "stop = ..." annotate above,
# so this call has to stay after it.
kt = kt.annotate("xstop = %s" % get_expr_for_xpos(field_prefix="", pos_field="stop"))

# flatten and prune mainTranscript
transcript_annotations_to_keep = [
    "amino_acids",
    "biotype",
    "canonical",
    "cdna_start",
    "cdna_end",
    "codons",
    #"distance",
    "domains",
    "exon",