def compute_minimal_schema(vds, analysis_type):
    """Reduce variant annotations to the minimal schema for this analysis type.

    Adds a docId annotation, copies only the whitelisted top-level fields into
    va.clean via a generated annotation expression, then replaces va with
    va.clean so every other annotation is discarded.

    Raises:
        ValueError: if analysis_type is not one of the known types.
    """
    # docId is a (length-limited) variant-id string used as the document key.
    vds = vds.annotate_variants_expr([
        "va.docId = %s" % get_expr_for_variant_id(512),
    ])

    # Fields to retain, per analysis type; everything else is dropped below.
    if analysis_type == "GATK_VARIANTS":
        top_level_fields = """
            docId: String,
            wasSplit: Boolean,
            aIndex: Int,
        """
    elif analysis_type in ["MANTA_SVS", "JULIA_SVS"]:
        top_level_fields = """
            docId: String,
        """
    else:
        raise ValueError("Unexpected analysis_type: %s" % analysis_type)

    minimal_schema = {
        "top_level_fields": top_level_fields,
        "info_fields": "",
    }

    # Copy the selected fields under va.clean, then promote va.clean to va.
    copy_expr = convert_vds_schema_string_to_annotate_variants_expr(
        root="va.clean", **minimal_schema)
    vds = vds.annotate_variants_expr(expr=copy_expr)
    return vds.annotate_variants_expr("va = va.clean")
args = p.parse_args()

# Validate and normalize the input path: must be a .vds directory.
input_vds_path = str(args.input_vds).rstrip("/")
if not input_vds_path.endswith(".vds"):
    p.error("Input must be a .vds")

# Strip only the trailing ".vds" extension. The previous
# str.replace(".vds", "") removed *every* occurrence of ".vds" in the path
# (e.g. "/data/my.vds.dir/sample.vds" -> "/data/my.dir/sample"), which could
# corrupt derived output paths.
input_vds_path_prefix = input_vds_path[:-len(".vds")]

logger.info("\n==> create HailContext")
hc = hail.HailContext(log="/hail.log")

logger.info("\n==> import vds: " + input_vds_path)
vds = hc.read(input_vds_path)

# Per-allele annotations computable in a single parallel pass.
# NOTE(review): start/pos and xpos/xstart are each the same value exported
# under two field names — presumably both names are consumed downstream.
parallel_computed_annotation_exprs = [
    "va.variantId = %s" % get_expr_for_variant_id(),
    "va.contig = %s" % get_expr_for_contig(),
    "va.start = %s" % get_expr_for_start_pos(),
    "va.pos = %s" % get_expr_for_start_pos(),
    "va.end = %s" % get_expr_for_end_pos(),
    "va.ref = %s" % get_expr_for_ref_allele(),
    "va.alt = %s" % get_expr_for_alt_allele(),
    "va.xpos = %s" % get_expr_for_xpos(pos_field="start"),
    "va.xstart = %s" % get_expr_for_xpos(pos_field="start"),
]

# xstop reads the va.end field produced by the list above (field_prefix="va."),
# so it must be applied in a separate, later pass.
serial_computed_annotation_exprs = [
    "va.xstop = %s" % get_expr_for_xpos(field_prefix="va.", pos_field="end"),
]
# Debug: dump the schemas of the three key-tables prior to export.
pprint(kt_pop.schema)
pprint(kt_annotations.schema)
pprint(kt_rare_variants.schema)

# Target elasticsearch node (hard-coded internal cluster address).
ES_HOST_IP = '10.4.0.13'
ES_HOST_PORT = 9200

print("======== Export to elasticsearch ======")
es = ElasticsearchClient(
    host=ES_HOST_IP,
    port=ES_HOST_PORT,
)

# String/numeric fields derived from the Variant key 'v'.
annotation_expressions = [
    'variant_id = %s' % get_expr_for_variant_id(),
    'chrom = %s' % get_expr_for_contig(),
    'pos = %s' % get_expr_for_start_pos(),
    "xpos = %s" % get_expr_for_xpos(field_prefix="", pos_field="pos"),
]

for expression in annotation_expressions:
    kt_rare_variants = kt_rare_variants.annotate(expression)
# The Variant object itself is not exportable; drop it now that the derived
# fields above have been materialized.
kt_rare_variants = kt_rare_variants.drop(['v'])

kt_annotations = kt_annotations.annotate('variantId = %s' % get_expr_for_variant_id()).drop(['v'])

# NOTE(review): the loop above adds 'variant_id' (snake_case) to
# kt_rare_variants, yet the join key below is 'variantId' (camelCase) —
# confirm kt_rare_variants already carries a 'variantId' column from earlier
# processing, otherwise this key_by will fail at runtime.
kt_rare_variants = kt_rare_variants.key_by('variantId').join(kt_annotations.key_by('variantId'))

pprint(kt_rare_variants.schema)
    # (continuation of a column-rename map whose opening lies above this
    # chunk: maps human-readable result-table headers to exportable names)
    'N analysis groups': 'n_analysis_groups',
    'Analysis group': 'analysis_group',
    'AC case': 'ac_case',
    'AC ctrl': 'ac_ctrl',
    'AN case': 'an_case',
    'AN ctrl': 'an_ctrl',
    'AF case': 'af_case',
    'AF ctrl': 'af_ctrl',
    'Estimate': 'est',
    'SE': 'se',
    'P-value': 'p',
    'Comment': 'comment',
}

# String/numeric fields derived from the Variant key 'v'.
annotation_expressions = [
    'variant_id = %s' % get_expr_for_variant_id(),
    'contig = %s' % get_expr_for_contig(),
    'pos = %s' % get_expr_for_start_pos(),
    "xpos = %s" % get_expr_for_xpos(field_prefix="", pos_field="pos"),
]

kt_variant_annotation = kt_variant_annotation.rename(column_map)
# Parse the string variant column back into a Variant object so both tables
# can be keyed and joined on 'v' below.
kt_variant_annotation = kt_variant_annotation.annotate('v = Variant(v)')

kt_variant_results = kt_variant_results.rename(column_map)
for expression in annotation_expressions:
    kt_variant_results = kt_variant_results.annotate(expression)

kt_variants = kt_variant_results.key_by('v').join(kt_variant_annotation.key_by('v'))
# The derived fields (variant_id, contig, pos, xpos) carry the locus info;
# drop the raw Variant object before export.
kt_variants = kt_variants.drop(['v'])
logger.info(
    "=============================== pipeline - step 1 ==============================="
)
logger.info(
    "Read in data, compute various derived fields, export to elasticsearch"
)

logger.info("\n==> Re-create HailContext")
hc = hail.HailContext(log="/hail.log")

vds = read_in_dataset(vep_output_vds, args.analysis_type, filter_interval)

# add computed annotations
logger.info("\n==> Adding computed annotations")
# Per-allele fields computable in one parallel annotate_variants_expr pass.
# NOTE(review): start/pos and xpos/xstart are each the same value exported
# under two field names — presumably both names are consumed downstream.
parallel_computed_annotation_exprs = [
    "va.docId = %s" % get_expr_for_variant_id(512),
    "va.variantId = %s" % get_expr_for_variant_id(),
    "va.contig = %s" % get_expr_for_contig(),
    "va.start = %s" % get_expr_for_start_pos(),
    "va.pos = %s" % get_expr_for_start_pos(),
    "va.end = %s" % get_expr_for_end_pos(),
    "va.ref = %s" % get_expr_for_ref_allele(),
    "va.alt = %s" % get_expr_for_alt_allele(),
    # compute AC, Het, Hom, Hemi, AN
    "va.xpos = %s" % get_expr_for_xpos(pos_field="start"),
    "va.xstart = %s" % get_expr_for_xpos(pos_field="start"),
    "va.geneIds = %s" % get_expr_for_vep_gene_ids_set(vep_root="va.vep"),
    "va.codingGeneIds = %s" % get_expr_for_vep_gene_ids_set(
        vep_root="va.vep", only_coding_genes=True),
    # (this list continues past the end of the visible chunk)
    "va.transcriptIds = %s" %
        Hemi_AFR: Array[Int],
        Hemi_AMR: Array[Int],
        Hemi: Array[Int],
        Hemi_ASJ: Array[Int],
        Hemi_OTH: Array[Int],
        Hemi_FIN: Array[Int],
        Hemi_EAS: Array[Int],
    """
}
# NOTE(review): the schema string above (whose opening lies before this
# chunk) mixes population-suffixed Hemi fields with a bare "Hemi" — looks
# intentional (overall + per-population counts) but worth confirming.

# Derived per-variant annotations applied one expression at a time by the
# loop at the bottom of this chunk. Ordering matters within this list:
# va.mainTranscript reads va.sortedTranscriptConsequences, and only
# afterwards is that field re-serialized to a JSON string.
# NOTE(review): va.joinKey and va.variantId are the same expression — the
# duplicate presumably exists because joinKey is consumed as a join column.
vds_computed_annotations_exprs = [
    "va.contig = %s" % get_expr_for_contig(),
    "va.start = %s" % get_expr_for_start_pos(),
    "va.ref = %s" % get_expr_for_ref_allele(),
    "va.alt = %s" % get_expr_for_alt_allele(),
    "va.joinKey = %s" % get_expr_for_variant_id(),
    "va.variantId = %s" % get_expr_for_variant_id(),
    "va.originalAltAlleles = %s" % get_expr_for_orig_alt_alleles_set(),
    "va.geneIds = %s" % get_expr_for_vep_gene_ids_set(),
    "va.transcriptIds = %s" % get_expr_for_vep_transcript_ids_set(),
    "va.transcriptConsequenceTerms = %s" % get_expr_for_vep_consequence_terms_set(),
    "va.sortedTranscriptConsequences = %s" % get_expr_for_vep_sorted_transcript_consequences_array(),
    "va.mainTranscript = %s" % get_expr_for_worst_transcript_consequence_annotations_struct("va.sortedTranscriptConsequences"),
    "va.sortedTranscriptConsequences = json(va.sortedTranscriptConsequences)",
]

print("======== Exomes: KT Schema ========")
# Record the pre-split alt alleles before split_multi() rewrites them.
exomes_vds = exomes_vds.annotate_variants_expr("va.exomes.originalAltAlleles=%s" % get_expr_for_orig_alt_alleles_set())
exomes_vds = exomes_vds.split_multi()
# (the loop body continues past the end of the visible chunk)
for expr in vds_computed_annotations_exprs: