Exemplo n.º 1
0
def compute_minimal_schema(vds, analysis_type):
    """Reduce the dataset's variant annotations to a small set of fields.

    A ``va.docId`` annotation is added first. An expression built by
    ``convert_vds_schema_string_to_annotate_variants_expr`` with root
    ``va.clean`` is then applied for the fields selected by
    ``analysis_type``, and finally ``va`` is replaced by ``va.clean``.

    Args:
        vds: dataset object exposing ``annotate_variants_expr``.
        analysis_type: one of "GATK_VARIANTS", "MANTA_SVS" or "JULIA_SVS".

    Returns:
        The dataset returned by the final ``annotate_variants_expr`` call.

    Raises:
        ValueError: if ``analysis_type`` is not one of the values above.
            Note that the docId annotation call has already been issued
            by the time this is raised.
    """
    # The docId expression comes from the variant-id helper called with 512
    # (presumably a maximum length -- confirm against the helper).
    doc_id_expr = "va.docId = %s" % get_expr_for_variant_id(512)
    vds = vds.annotate_variants_expr([doc_id_expr])

    # Choose the top-level fields to keep; no info fields are kept for any
    # supported analysis type.
    info_fields = ""
    if analysis_type == "GATK_VARIANTS":
        top_level_fields = """
            docId: String,
            wasSplit: Boolean,
            aIndex: Int,
        """
    elif analysis_type in ("MANTA_SVS", "JULIA_SVS"):
        top_level_fields = """
            docId: String,
        """
    else:
        raise ValueError("Unexpected analysis_type: %s" % analysis_type)

    clean_expr = convert_vds_schema_string_to_annotate_variants_expr(
        root="va.clean",
        top_level_fields=top_level_fields,
        info_fields=info_fields,
    )
    vds = vds.annotate_variants_expr(expr=clean_expr)

    # Swap the full annotation struct for the cleaned-up one.
    return vds.annotate_variants_expr("va = va.clean")
Exemplo n.º 2
0
# Parse the command line and validate the input dataset path.
args = p.parse_args()

# Drop any trailing "/" so the extension check sees the directory name itself.
input_vds_path = str(args.input_vds).rstrip("/")
if not input_vds_path.endswith(".vds"):
    # Assumes p.error() aborts the script, as argparse's ArgumentParser.error does.
    p.error("Input must be a .vds")

# Strip only the trailing ".vds" extension. The path is known to end with it
# here; str.replace(".vds", "") would also delete any ".vds" that occurs
# earlier in the path (e.g. in a parent directory name).
input_vds_path_prefix = input_vds_path[:-len(".vds")]

logger.info("\n==> create HailContext")
hc = hail.HailContext(log="/hail.log")

logger.info("\n==> import vds: " + input_vds_path)
vds = hc.read(input_vds_path)

# Annotation expressions of the form "va.<field> = <expr>"; each right-hand
# side is produced by one of the get_expr_for_* helpers.
parallel_computed_annotation_exprs = [
    "va.variantId = %s" % get_expr_for_variant_id(),
    
    "va.contig = %s" % get_expr_for_contig(),
    "va.start = %s" % get_expr_for_start_pos(),
    # va.pos is given the same expression as va.start
    "va.pos = %s" % get_expr_for_start_pos(),
    "va.end = %s" % get_expr_for_end_pos(),
    "va.ref = %s" % get_expr_for_ref_allele(),
    "va.alt = %s" % get_expr_for_alt_allele(),
    
    # va.xpos and va.xstart are given the same expression
    "va.xpos = %s" % get_expr_for_xpos(pos_field="start"),
    "va.xstart = %s" % get_expr_for_xpos(pos_field="start"),
]

# Kept in a separate list: va.xstop is built with pos_field="end" and an
# explicit "va." field prefix -- presumably it reads va.end and therefore has
# to be applied after the list above; TODO confirm against the helper.
serial_computed_annotation_exprs = [
    "va.xstop = %s" % get_expr_for_xpos(field_prefix="va.", pos_field="end"),
]
# Print the schemas of the three key tables before they are modified.
pprint(kt_pop.schema)
pprint(kt_annotations.schema)
pprint(kt_rare_variants.schema)

# Elasticsearch endpoint (hard-coded address and port).
ES_HOST_IP = '10.4.0.13'
ES_HOST_PORT = 9200

print("======== Export to elasticsearch ======")
es = ElasticsearchClient(
    host=ES_HOST_IP,
    port=ES_HOST_PORT,
)

# Column expressions for the rare-variants table. These use bare column names
# (no "va." prefix), and xpos is built with an empty field prefix from "pos".
annotation_expressions = [
    'variant_id = %s' % get_expr_for_variant_id(),
    'chrom = %s' % get_expr_for_contig(),
    'pos = %s' % get_expr_for_start_pos(),
    "xpos = %s" % get_expr_for_xpos(field_prefix="", pos_field="pos"),
]

# Apply the expressions one at a time, in list order (xpos is listed after pos).
for expression in annotation_expressions:
    kt_rare_variants = kt_rare_variants.annotate(expression)

# The 'v' column is dropped only after the expressions above have been applied.
kt_rare_variants = kt_rare_variants.drop(['v'])

# Give the annotations table a 'variantId' column, then drop its 'v' column.
kt_annotations = kt_annotations.annotate('variantId = %s' % get_expr_for_variant_id()).drop(['v'])

# Join the two tables on 'variantId'.
# NOTE(review): the expressions above add 'variant_id' (snake case) to
# kt_rare_variants, but the join keys on 'variantId' -- confirm that
# kt_rare_variants already carries a 'variantId' column.
kt_rare_variants = kt_rare_variants.key_by('variantId').join(kt_annotations.key_by('variantId'))

# Print the schema of the joined table.
pprint(kt_rare_variants.schema)
Exemplo n.º 4
0
  'N analysis groups': 'n_analysis_groups',
  'Analysis group': 'analysis_group',
  'AC case': 'ac_case',
  'AC ctrl': 'ac_ctrl',
  'AN case': 'an_case',
  'AN ctrl': 'an_ctrl',
  'AF case': 'af_case',
  'AF ctrl': 'af_ctrl',
  'Estimate': 'est',
  'SE': 'se',
  'P-value': 'p',
  'Comment': 'comment',
}

# Column expressions added to the results table below. These use bare column
# names, and xpos is built with an empty field prefix from the "pos" column.
annotation_expressions = [
    'variant_id = %s' % get_expr_for_variant_id(),
    'contig = %s' % get_expr_for_contig(),
    'pos = %s' % get_expr_for_start_pos(),
    "xpos = %s" % get_expr_for_xpos(field_prefix="", pos_field="pos"),
]

# Rename the annotation table's columns via column_map, then rebuild its 'v'
# column with Variant(v).
kt_variant_annotation = kt_variant_annotation.rename(column_map)
kt_variant_annotation = kt_variant_annotation.annotate('v = Variant(v)')

# Rename the results table's columns the same way and add the computed
# columns one at a time, in list order (xpos is listed after pos).
kt_variant_results = kt_variant_results.rename(column_map)
for expression in annotation_expressions:
    kt_variant_results = kt_variant_results.annotate(expression)

# Join results with annotations on 'v'; 'v' is dropped from the joined table.
kt_variants = kt_variant_results.key_by('v').join(kt_variant_annotation.key_by('v'))
kt_variants = kt_variants.drop(['v'])
Exemplo n.º 5
0
    logger.info(
        "=============================== pipeline - step 1 ==============================="
    )
    logger.info(
        "Read in data, compute various derived fields, export to elasticsearch"
    )

    logger.info("\n==> Re-create HailContext")
    hc = hail.HailContext(log="/hail.log")

    vds = read_in_dataset(vep_output_vds, args.analysis_type, filter_interval)

    # add computed annotations
    logger.info("\n==> Adding computed annotations")
    parallel_computed_annotation_exprs = [
        "va.docId = %s" % get_expr_for_variant_id(512),
        "va.variantId = %s" % get_expr_for_variant_id(),
        "va.contig = %s" % get_expr_for_contig(),
        "va.start = %s" % get_expr_for_start_pos(),
        "va.pos = %s" % get_expr_for_start_pos(),
        "va.end = %s" % get_expr_for_end_pos(),
        "va.ref = %s" % get_expr_for_ref_allele(),
        "va.alt = %s" % get_expr_for_alt_allele(),

        # compute AC, Het, Hom, Hemi, AN
        "va.xpos = %s" % get_expr_for_xpos(pos_field="start"),
        "va.xstart = %s" % get_expr_for_xpos(pos_field="start"),
        "va.geneIds = %s" % get_expr_for_vep_gene_ids_set(vep_root="va.vep"),
        "va.codingGeneIds = %s" % get_expr_for_vep_gene_ids_set(
            vep_root="va.vep", only_coding_genes=True),
        "va.transcriptIds = %s" %
Exemplo n.º 6
0
        Hemi_AFR: Array[Int],
        Hemi_AMR: Array[Int],
        Hemi: Array[Int],
        Hemi_ASJ: Array[Int],
        Hemi_OTH: Array[Int],
        Hemi_FIN: Array[Int],
        Hemi_EAS: Array[Int],
    """
}

# Annotation expressions of the form "va.<field> = <expr>". The order of the
# last three entries matters (see the comments on them).
vds_computed_annotations_exprs = [
    "va.contig = %s" % get_expr_for_contig(),
    "va.start = %s" % get_expr_for_start_pos(),
    "va.ref = %s" % get_expr_for_ref_allele(),
    "va.alt = %s" % get_expr_for_alt_allele(),
    # va.joinKey and va.variantId are given the same expression
    "va.joinKey = %s" % get_expr_for_variant_id(),
    "va.variantId = %s" % get_expr_for_variant_id(),
    "va.originalAltAlleles = %s" % get_expr_for_orig_alt_alleles_set(),
    "va.geneIds = %s" % get_expr_for_vep_gene_ids_set(),
    "va.transcriptIds = %s" % get_expr_for_vep_transcript_ids_set(),
    "va.transcriptConsequenceTerms = %s" % get_expr_for_vep_consequence_terms_set(),
    "va.sortedTranscriptConsequences = %s" % get_expr_for_vep_sorted_transcript_consequences_array(),
    # built from va.sortedTranscriptConsequences, so it must come after the
    # entry above and before that field is overwritten below
    "va.mainTranscript = %s" % get_expr_for_worst_transcript_consequence_annotations_struct("va.sortedTranscriptConsequences"),
    # finally overwrite the field with the result of json(...) applied to it
    "va.sortedTranscriptConsequences = json(va.sortedTranscriptConsequences)",
]


print("======== Exomes: KT Schema ========")
# Store the original alt alleles under va.exomes.originalAltAlleles before
# calling split_multi() on the dataset.
exomes_vds = exomes_vds.annotate_variants_expr("va.exomes.originalAltAlleles=%s" % get_expr_for_orig_alt_alleles_set())
exomes_vds = exomes_vds.split_multi()
for expr in vds_computed_annotations_exprs: