def test_liftover():
    '''
    The test data was prepared as follows:
     * We loaded all intervals from hg17-to-hg18.
     * We then picked positions from the genome as follows: for each interval we
       picked the first, the last, the first-1, last+1, and first+4 positions.
       From the resulting ~40k points we chose 10000 random sites. We converted
       those via UCSC-hosted Web "liftOver" tool and wrote down the results.
    The test results are in data/hg17ToHg18.testpoints.txt.gz. Just in case we
    also saved the corresponding over.chain file.
    '''
    lo = LiftOver(os.path.join(DATA_DIR, 'hg17ToHg18.over.chain.gz'))
    testdata_file = os.path.join(DATA_DIR, 'hg17ToHg18.testpoints.txt.gz')
    test_counter = 0
    with gzip.open(testdata_file) as f:
        for ln in f:
            # Fix: gzip.open yields bytes on Python 3; without decoding,
            # ln.split('\t') raises TypeError and t_chr == '-' never matches.
            ln = ln.decode('ascii')
            s_chr, s_pos, t_chr, t_pos = ln.split('\t')
            result = lo.convert_coordinate(s_chr, int(s_pos))
            if t_chr == '-':
                # '-' marks points that UCSC liftOver could not convert.
                assert len(result) == 0
            else:
                assert len(result) == 1
                res_chr = result[0][0]
                res_pos = result[0][1]
                assert res_chr == t_chr
                assert res_pos == int(t_pos)
            test_counter += 1
    # Sanity check that the whole fixture was consumed.
    assert test_counter == 10000
def test_liftover_2():
    '''
    Check that liftover can open files given both as strings and file objects.
    '''
    chain_path = os.path.join(DATA_DIR, 'hg17ToHg18.over.chain.gz')

    # Case 1: pass the chain file as a filename string.
    lo = LiftOver(chain_path)
    assert len(lo.chain_file.chain_index) > 22

    # Case 2: pass an already-opened file object instead.
    lo = LiftOver(gzip.open(chain_path))
    assert len(lo.chain_file.chain_index) > 22
def _liftover_grch38_to_grch37():
    """Lazily build and cache the hg38 -> hg19 LiftOver converter.

    On the first successful call the converter is stored in the module-level
    LIFTOVER_GRCH38_TO_GRCH37 global; subsequent calls return the cached
    instance. If construction fails (e.g. the chain file cannot be fetched),
    a warning is logged and None is returned, so a later call will retry.
    """
    global LIFTOVER_GRCH38_TO_GRCH37
    if not LIFTOVER_GRCH38_TO_GRCH37:
        try:
            LIFTOVER_GRCH38_TO_GRCH37 = LiftOver('hg38', 'hg19')
        except Exception as e:
            # Fix: logger.warn is a deprecated alias — use logger.warning.
            logger.warning('WARNING: Unable to set up liftover. {}'.format(e))
    return LIFTOVER_GRCH38_TO_GRCH37
def test_liftover():
    '''
    The test data was prepared as follows:
     * We loaded all intervals from hg17-to-hg18.
     * We then picked positions from the genome as follows: for each interval we
       picked the first, the last, the first-1, last+1, and first+4 positions.
       From the resulting ~40k points we chose 10000 random sites. We converted
       those via UCSC-hosted Web "liftOver" tool and wrote down the results.
    The test results are in data/hg17ToHg18.testpoints.txt.gz. Just in case we
    also saved the corresponding over.chain file.
    '''
    lo = LiftOver(os.path.join(DATA_DIR, 'hg17ToHg18.over.chain.gz'))
    testdata_file = os.path.join(DATA_DIR, 'hg17ToHg18.testpoints.txt.gz')

    def check_result(result, expected_chr, expected_pos):
        # Compare one conversion against the expected UCSC liftOver output;
        # '-' marks a point that liftOver could not convert.
        if expected_chr == '-':
            assert len(result) == 0
        else:
            assert len(result) == 1
            assert result[0][0] == expected_chr
            assert result[0][1] == int(expected_pos)

    test_counter = 0
    f = gzip.open(
        testdata_file)  # no "with" here because we want to support Python 2.6
    for ln in f:
        ln = ln.decode('ascii')
        s_chr, s_pos, t_chr, t_pos = ln.split('\t')
        check_result(lo.convert_coordinate(s_chr, int(s_pos)), t_chr, t_pos)
        # Check that we can provide chromosome as a bytes object and
        # everything will work still
        if sys.version_info >= (3, 0):
            check_result(
                lo.convert_coordinate(s_chr.encode('ascii'), int(s_pos)),
                t_chr, t_pos)
        test_counter += 1
    assert test_counter == 10000
def test_liftover():
    '''
    The test data was prepared as follows:
     * We loaded all intervals from hg17-to-hg18.
     * We then picked positions from the genome as follows: for each interval we
       picked the first, the last, the first-1, last+1, and first+4 positions.
       From the resulting ~40k points we chose 10000 random sites. We converted
       those via UCSC-hosted Web "liftOver" tool and wrote down the results.
    The test results are in data/hg17ToHg18.testpoints.txt.gz. Just in case we
    also saved the corresponding over.chain file.
    '''
    lo = LiftOver(os.path.join(DATA_DIR, 'hg17ToHg18.over.chain.gz'))
    testdata_file = os.path.join(DATA_DIR, 'hg17ToHg18.testpoints.txt.gz')
    n_checked = 0
    # No "with" statement here because we want to support Python 2.6.
    handle = gzip.open(testdata_file)
    for raw_line in handle:
        src_chrom, src_pos, exp_chrom, exp_pos = \
            raw_line.decode('ascii').split('\t')
        # On Python 3 also verify that the chromosome may be given as bytes;
        # the str form is always checked first, matching the original order.
        chrom_args = [src_chrom]
        if sys.version_info >= (3, 0):
            chrom_args.append(src_chrom.encode('ascii'))
        for chrom_arg in chrom_args:
            result = lo.convert_coordinate(chrom_arg, int(src_pos))
            if exp_chrom == '-':
                # '-' marks a point UCSC liftOver could not convert.
                assert len(result) == 0
            else:
                assert len(result) == 1
                assert result[0][0] == exp_chrom
                assert result[0][1] == int(exp_pos)
        n_checked += 1
    assert n_checked == 10000
def test_issue_2_3_4():
    '''
    Check the correctness of coordinate conversion for issue 2/3/4.
    NB: We are using the "live" hg38ToHg19.over.chain.gz file, hence if it
    happens to change later on, the test may start failing. Just in case we
    have the original cached in the data directory as well.
    '''
    lo = LiftOver('hg38', 'hg19')
    test_input = os.path.join(DATA_DIR, 'hg38ToHg19.testinput.txt')
    test_output = os.path.join(DATA_DIR, 'hg38ToHg19.testoutput.txt')
    # Fix: close the fixture files deterministically instead of leaking the
    # handles (the original called open() inline and never closed them).
    # Each line is BED-like whitespace-separated text; column 3 is a point
    # name used as the key, columns 0/1/5 are chrom, position, and strand.
    with open(test_input) as f:
        test_input = dict([(ln[3], (ln[0], int(ln[1]), ln[5].strip()))
                           for ln in [line.split() for line in f]])
    with open(test_output) as f:
        test_output = dict([(ln[3], (ln[0], int(ln[1]), ln[5].strip()))
                            for ln in [line.split() for line in f]])
    for k in test_input:
        res = lo.convert_coordinate(*test_input[k])
        if k not in test_output:
            # Points absent from the output file were not liftable.
            assert len(res) == 0
        else:
            assert len(res) == 1 and res[0][0:3] == test_output[k]
class ElasticsearchDatastore(datastore.Datastore):
    """Variant datastore backed by Elasticsearch.

    Queries an Elasticsearch index of annotated variants and converts hits
    into Variant objects. Results are optionally cached in Redis (when
    REDIS_SERVICE_HOSTNAME is configured). Coordinates can be lifted over
    between GRCh37 and GRCh38 via pyliftover.

    NOTE(review): this code uses Python 2 builtins (`basestring`, `long`,
    `map` treated as returning a list) — it is not Python 3 compatible as-is.
    """

    def __init__(self, annotator):
        """Set up the Elasticsearch client and (best-effort) the Redis cache.

        :param annotator: annotator object stored for later use
            (only assigned here; its interface is not visible in this file).
        """
        # LiftOver converters are created lazily in get_elasticsearch_variants.
        self.liftover_grch38_to_grch37 = None
        self.liftover_grch37_to_grch38 = None
        self._annotator = annotator
        self._es_client = elasticsearch.Elasticsearch(
            host=settings.ELASTICSEARCH_SERVICE_HOSTNAME)
        # Redis is optional: if the host is unset or unreachable, fall back
        # to no caching rather than failing.
        self._redis_client = None
        if settings.REDIS_SERVICE_HOSTNAME:
            try:
                self._redis_client = redis.StrictRedis(
                    host=settings.REDIS_SERVICE_HOSTNAME,
                    socket_connect_timeout=3)
                self._redis_client.ping()
            except redis.exceptions.TimeoutError as e:
                logger.warn("Unable to connect to redis host: {}".format(
                    settings.REDIS_SERVICE_HOSTNAME) + str(e))
                self._redis_client = None

    def get_elasticsearch_variants(
            self,
            project_id,
            family_id=None,
            variant_filter=None,
            genotype_filter=None,
            variant_id_filter=None,
            quality_filter=None,
            indivs_to_consider=None,
            include_all_consequences=False,
            user=None,
            max_results_limit=settings.VARIANT_QUERY_RESULTS_LIMIT,
    ):
        """Query Elasticsearch for variants in a project (or one family).

        Builds an elasticsearch_dsl query from the given filters, scans the
        matching index, and converts each hit into the legacy xbrowse variant
        dict format, returning a list of Variant objects. Results are cached
        in Redis keyed on the full filter combination.

        :param project_id: project to search
        :param family_id: if given, restrict to this family's index/individuals
        :param variant_filter: filter object; translated via _make_db_query
        :param genotype_filter: per-individual genotype constraints
        :param variant_id_filter: list of "chrom-pos-ref-alt" variant ids
        :param quality_filter: dict with optional 'min_ab' (percent),
            'min_gq', 'vcf_filter' keys
        :param indivs_to_consider: individuals that must have a non-ref
            genotype; defaults to genotype_filter keys
        :param include_all_consequences: currently only part of the cache key
            (the source-field exclusion that used it is commented out)
        :param user: staff users get the internal annotation groups map and
            HGMD fields
        :param max_results_limit: raise if more than this many hits match
        :raises Exception: when the query matches too many variants
        :raises ValueError: on unexpected clinvar/hgmd filter values or
            invalid num_alt genotype values
        """
        from xbrowse_server.base.models import Individual
        from xbrowse_server.mall import get_reference

        # Cache key covers every argument that affects the result set.
        cache_key = "Variants___%s___%s___%s" % (project_id, family_id,
                                                 json.dumps([
                                                     variant_filter.toJSON()
                                                     if variant_filter else None,
                                                     genotype_filter,
                                                     quality_filter,
                                                     variant_id_filter,
                                                     indivs_to_consider,
                                                     include_all_consequences,
                                                 ]))
        cached_results = self._redis_client and self._redis_client.get(
            cache_key)
        if cached_results is not None:
            variant_results = json.loads(cached_results)
            return [
                Variant.fromJSON(variant_json)
                for variant_json in variant_results
            ]

        if indivs_to_consider is None:
            if genotype_filter:
                indivs_to_consider = genotype_filter.keys()
            else:
                indivs_to_consider = []

        # Collect individual ids either for the one family or the whole project.
        if family_id is not None:
            family_individual_ids = [
                i.indiv_id for i in Individual.objects.filter(
                    family__family_id=family_id).only("indiv_id")
            ]
        else:
            family_individual_ids = [
                i.indiv_id for i in Individual.objects.filter(
                    family__project__project_id=project_id).only("indiv_id")
            ]

        from xbrowse_server.base.models import Project, Family
        from pyliftover.liftover import LiftOver

        query_json = self._make_db_query(genotype_filter, variant_filter)

        # Lazily set up liftover; requires network access to fetch chain files.
        # The 37->38 direction is deliberately disabled (kept as None).
        try:
            if self.liftover_grch38_to_grch37 is None:
                self.liftover_grch38_to_grch37 = LiftOver('hg38', 'hg19')

            if self.liftover_grch37_to_grch38 is None:
                self.liftover_grch37_to_grch38 = None  # LiftOver('hg19', 'hg38')
        except Exception as e:
            logger.info(
                "WARNING: Unable to set up liftover. Is there a working internet connection? "
                + str(e))

        # Resolve which Elasticsearch index (prefix) to search.
        if family_id is None:
            project = Project.objects.get(project_id=project_id)
            elasticsearch_index = project.get_elasticsearch_index()
            logger.info("Searching in project elasticsearch index: " +
                        str(elasticsearch_index))
        else:
            family = Family.objects.get(project__project_id=project_id,
                                        family_id=family_id)
            elasticsearch_index = family.get_elasticsearch_index()
            project = family.project
            logger.info("Searching in family elasticsearch index: " +
                        str(elasticsearch_index))

        if family_id is not None and len(family_individual_ids) > 0:
            # figure out which index to use
            # TODO add caching

            matching_indices = []
            mapping = self._es_client.indices.get_mapping(
                str(elasticsearch_index) + "*")

            # An index matches if it has genotype fields for the family's
            # first individual.
            if family_individual_ids:
                indiv_id = _encode_name(family_individual_ids[0])
                for index_name, index_mapping in mapping.items():
                    if indiv_id + "_num_alt" in index_mapping["mappings"][
                            "variant"]["properties"]:
                        matching_indices.append(index_name)

            if not matching_indices:
                if not family_individual_ids:
                    logger.error("no individuals found for family %s" %
                                 (family_id))
                elif not mapping:
                    logger.error(
                        "no es mapping found for found with prefix %s" %
                        (elasticsearch_index))
                else:
                    logger.error("%s not found in %s:\n%s" %
                                 (indiv_id, elasticsearch_index,
                                  pformat(index_mapping["mappings"]["variant"]
                                          ["properties"])))
            else:
                logger.info("matching indices: " + str(elasticsearch_index))
                elasticsearch_index = ",".join(matching_indices)

        s = elasticsearch_dsl.Search(using=self._es_client,
                                     index=str(elasticsearch_index) +
                                     "*")  #",".join(indices))

        # OR together all requested variant ids.
        if variant_id_filter is not None:
            variant_id_filter_term = None
            for variant_id in variant_id_filter:
                q_obj = Q('term', **{"variantId": variant_id})
                if variant_id_filter_term is None:
                    variant_id_filter_term = q_obj
                else:
                    variant_id_filter_term |= q_obj
            s = s.filter(variant_id_filter_term)

        # Require at least one considered individual to carry an alt allele.
        if indivs_to_consider:
            atleast_one_nonref_genotype_filter = None
            for sample_id in indivs_to_consider:
                encoded_sample_id = _encode_name(sample_id)
                q = Q('range', **{encoded_sample_id + "_num_alt": {'gte': 1}})
                if atleast_one_nonref_genotype_filter is None:
                    atleast_one_nonref_genotype_filter = q
                else:
                    atleast_one_nonref_genotype_filter |= q

            s = s.filter(atleast_one_nonref_genotype_filter)

        # Per-sample quality constraints (allele balance, GQ, VCF FILTER).
        if quality_filter is not None and indivs_to_consider:
            #'vcf_filter': u'pass', u'min_ab': 17, u'min_gq': 46
            min_ab = quality_filter.get('min_ab')
            if min_ab is not None:
                min_ab /= 100.0  # convert to fraction
            min_gq = quality_filter.get('min_gq')
            vcf_filter = quality_filter.get('vcf_filter')
            for sample_id in indivs_to_consider:
                encoded_sample_id = _encode_name(sample_id)

                #'vcf_filter': u'pass', u'min_ab': 17, u'min_gq': 46
                if min_ab:
                    # AB is only meaningful for hets (num_alt == 1), so allow
                    # anything that is not het OR passes the AB threshold.
                    s = s.filter(
                        ~Q('term', **{encoded_sample_id + "_num_alt": 1})
                        | Q('range', **
                            {encoded_sample_id + "_ab": {
                                'gte': min_ab
                            }}))
                    #logger.info("### ADDED FILTER: " + str({encoded_sample_id+"_ab": {'gte': min_ab}}))
                if min_gq:
                    s = s.filter(
                        'range', **{encoded_sample_id + "_gq": {
                            'gte': min_gq
                        }})
                    #logger.info("### ADDED FILTER: " + str({encoded_sample_id+"_gq": {'gte': min_gq}}))
                if vcf_filter is not None:
                    # A missing 'filters' field means the site passed.
                    s = s.filter(~Q('exists', field='filters'))
                    #logger.info("### ADDED FILTER: " + str(~Q('exists', field='filters')))

        # parse variant query
        annotation_groups_map = ANNOTATION_GROUPS_MAP_INTERNAL if user and user.is_staff else ANNOTATION_GROUPS_MAP

        # Translate each _make_db_query key into Elasticsearch filters.
        for key, value in query_json.items():
            if key == 'db_tags':
                so_annotations = query_json.get('db_tags', {}).get('$in', [])
                # handle clinvar filters
                selected_so_annotations_set = set(so_annotations)

                all_clinvar_filters_set = set(
                    annotation_groups_map.get("clinvar",
                                              {}).get("children", []))
                selected_clinvar_filters_set = all_clinvar_filters_set & selected_so_annotations_set

                all_hgmd_filters_set = set(
                    annotation_groups_map.get("hgmd", {}).get("children", []))
                selected_hgmd_filters_set = all_hgmd_filters_set & selected_so_annotations_set

                # Everything that is not a clinvar/hgmd pseudo-annotation is a
                # plain VEP consequence term.
                vep_consequences = list(selected_so_annotations_set -
                                        selected_clinvar_filters_set -
                                        selected_hgmd_filters_set)
                consequences_filter = Q(
                    "terms", transcriptConsequenceTerms=vep_consequences)

                if selected_clinvar_filters_set:
                    clinvar_clinical_significance_terms = set()
                    for clinvar_filter in selected_clinvar_filters_set:
                        # translate selected filters to the corresponding clinvar clinical consequence terms
                        if clinvar_filter == "pathogenic":
                            clinvar_clinical_significance_terms.update(
                                ["Pathogenic", "Pathogenic/Likely_pathogenic"])
                        elif clinvar_filter == "likely_pathogenic":
                            clinvar_clinical_significance_terms.update([
                                "Likely_pathogenic",
                                "Pathogenic/Likely_pathogenic"
                            ])
                        elif clinvar_filter == "benign":
                            clinvar_clinical_significance_terms.update(
                                ["Benign", "Benign/Likely_benign"])
                        elif clinvar_filter == "likely_benign":
                            clinvar_clinical_significance_terms.update(
                                ["Likely_benign", "Benign/Likely_benign"])
                        elif clinvar_filter == "vus_or_conflicting":
                            clinvar_clinical_significance_terms.update([
                                "Conflicting_interpretations_of_pathogenicity",
                                "Uncertain_significance", "not_provided",
                                "other"
                            ])
                        else:
                            raise ValueError("Unexpected clinvar filter: " +
                                             str(clinvar_filter))

                    consequences_filter = consequences_filter | Q(
                        "terms",
                        clinvar_clinical_significance=list(
                            clinvar_clinical_significance_terms))

                if selected_hgmd_filters_set:
                    hgmd_class = set()
                    for hgmd_filter in selected_hgmd_filters_set:
                        # translate selected filters to the corresponding hgmd clinical consequence terms
                        if hgmd_filter == "disease_causing":
                            hgmd_class.update(["DM"])
                        elif hgmd_filter == "likely_disease_causing":
                            hgmd_class.update(["DM?"])
                        elif hgmd_filter == "hgmd_other":
                            hgmd_class.update(["DP", "DFP", "FP", "FTV"])
                        else:
                            raise ValueError("Unexpected hgmd filter: " +
                                             str(hgmd_filter))

                    consequences_filter = consequences_filter | Q(
                        "terms", hgmd_class=list(hgmd_class))

                if 'intergenic_variant' in vep_consequences:
                    # for many intergenic variants VEP doesn't add any annotations, so if user selected 'intergenic_variant', also match variants where transcriptConsequenceTerms is emtpy
                    consequences_filter = consequences_filter | ~Q(
                        'exists', field='transcriptConsequenceTerms')

                s = s.filter(consequences_filter)
                #logger.info("==> transcriptConsequenceTerms: %s" % str(vep_consequences))

            if key.startswith("genotypes"):
                # Key format: "genotypes.<sample_id>.<field>".
                sample_id = ".".join(key.split(".")[1:-1])
                encoded_sample_id = _encode_name(sample_id)
                genotype_filter = value
                #logger.info("==> genotype filter: " + str(genotype_filter))
                if type(genotype_filter) == int or type(
                        genotype_filter) == basestring:
                    #logger.info("==> genotypes: %s" % str({encoded_sample_id+"_num_alt": genotype_filter}))
                    s = s.filter(
                        'term',
                        **{encoded_sample_id + "_num_alt": genotype_filter})

                elif '$gte' in genotype_filter:
                    # Mongo-style operators -> ES range keywords ($gte -> gte).
                    genotype_filter = {
                        k.replace("$", ""): v
                        for k, v in genotype_filter.items()
                    }
                    s = s.filter(
                        'range',
                        **{encoded_sample_id + "_num_alt": genotype_filter})
                    #logger.info("==> genotypes: %s" % str({encoded_sample_id+"_num_alt": genotype_filter}))
                elif "$in" in genotype_filter:
                    num_alt_values = genotype_filter['$in']
                    q = Q(
                        'term',
                        **{encoded_sample_id + "_num_alt": num_alt_values[0]})
                    #logger.info("==> genotypes: %s" % str({encoded_sample_id+"_num_alt": num_alt_values[0]}))
                    for num_alt_value in num_alt_values[1:]:
                        q = q | Q(
                            'term', **
                            {encoded_sample_id + "_num_alt": num_alt_value})
                        #logger.info("==> genotypes: %s" % str({encoded_sample_id+"_num_alt": num_alt_value}))
                    s = s.filter(q)

            if key == "db_gene_ids":
                db_gene_ids = query_json.get('db_gene_ids', {})

                exclude_genes = db_gene_ids.get('$nin', [])
                gene_ids = exclude_genes or db_gene_ids.get('$in', [])

                if exclude_genes:
                    s = s.exclude("terms", geneIds=gene_ids)
                else:
                    s = s.filter("terms", geneIds=gene_ids)
                #logger.info("==> %s %s" % ("exclude" if exclude_genes else "include", "geneIds: " + str(gene_ids)))

            if key == "$or" and type(value) == list:
                # Genomic region filters, expressed as xpos ranges.
                q_terms = None
                for region_filter in value:
                    xpos_filters = region_filter.get("$and", {})

                    # for example: $or : [{'$and': [{'xpos': {'$gte': 12345}}, {'xpos': {'$lte': 54321}}]}]
                    xpos_filters_dict = {}
                    for xpos_filter in xpos_filters:
                        xpos_filter_setting = xpos_filter[
                            "xpos"]  # for example {'$gte': 12345} or {'$lte': 54321}
                        xpos_filters_dict.update(xpos_filter_setting)

                    xpos_filter_setting = {
                        k.replace("$", ""): v
                        for k, v in xpos_filters_dict.items()
                    }
                    q = Q('range', **{"xpos": xpos_filter_setting})
                    if q_terms is None:
                        q_terms = q
                    else:
                        q_terms |= q
                if q_terms is not None:
                    s = s.filter(q_terms)

                #logger.info("==> xpos range: " + str({"xpos": xpos_filter_setting}))

            # Allele-frequency thresholds; a variant also matches when the
            # frequency field is absent (treated as 0).
            af_key_map = {
                "db_freqs.AF": "AF",
                "db_freqs.1kg_wgs_phase3": "g1k_POPMAX_AF",
                "db_freqs.exac_v3": "exac_AF_POPMAX",
                "db_freqs.topmed": "topmed_AF",
                "db_freqs.gnomad_exomes": "gnomad_exomes_AF_POPMAX",
                "db_freqs.gnomad_genomes": "gnomad_genomes_AF_POPMAX",
                "db_freqs.gnomad-exomes2": "gnomad_exomes_AF_POPMAX",
                "db_freqs.gnomad-genomes2": "gnomad_genomes_AF_POPMAX",
            }

            if key in af_key_map:
                filter_key = af_key_map[key]
                af_filter_setting = {
                    k.replace("$", ""): v
                    for k, v in value.items()
                }
                s = s.filter(
                    Q('range', **{filter_key: af_filter_setting})
                    | ~Q('exists', field=filter_key))
                #logger.info("==> %s: %s" % (filter_key, af_filter_setting))

            # Allele-count thresholds, same missing-field semantics as above.
            ac_key_map = {
                "db_acs.AF": "AC",
                "db_acs.1kg_wgs_phase3": "g1k_AC",
                "db_acs.exac_v3": "exac_AC",
                "db_acs.topmed": "topmed_AC",
                "db_acs.gnomad_exomes": "gnomad_exomes_AC",
                "db_acs.gnomad_genomes": "gnomad_genomes_AC",
                "db_acs.gnomad-exomes2": "gnomad_exomes_AC",
                "db_acs.gnomad-genomes2": "gnomad_genomes_AC",
            }

            if key in ac_key_map:
                filter_key = ac_key_map[key]
                ac_filter_setting = {
                    k.replace("$", ""): v
                    for k, v in value.items()
                }
                s = s.filter(
                    Q('range', **{filter_key: ac_filter_setting})
                    | ~Q('exists', field=filter_key))

            # Hemizygous-count thresholds.
            hemi_key_map = {
                "db_hemi.exac_v3": "exac_AC_Hemi",
                "db_hemi.gnomad_exomes": "gnomad_exomes_Hemi",
                "db_hemi.gnomad_genomes": "gnomad_genomes_Hemi",
                "db_hemi.gnomad-exomes2": "gnomad_exomes_Hemi",
                "db_hemi.gnomad-genomes2": "gnomad_genomes_Hemi",
            }

            if key in hemi_key_map:
                filter_key = hemi_key_map[key]
                hemi_filter_setting = {
                    k.replace("$", ""): v
                    for k, v in value.items()
                }
                s = s.filter(
                    Q('range', **{filter_key: hemi_filter_setting})
                    | ~Q('exists', field=filter_key))

            # Homozygous-count thresholds.
            hom_key_map = {
                "db_hom.exac_v3": "exac_AC_Hom",
                "db_hom.gnomad_exomes": "gnomad_exomes_Hom",
                "db_hom.gnomad_genomes": "gnomad_genomes_Hom",
                "db_hom.gnomad-exomes2": "gnomad_exomes_Hom",
                "db_hom.gnomad-genomes2": "gnomad_genomes_Hom",
            }

            if key in hom_key_map:
                filter_key = hom_key_map[key]
                hom_filter_setting = {
                    k.replace("$", ""): v
                    for k, v in value.items()
                }
                s = s.filter(
                    Q('range', **{filter_key: hom_filter_setting})
                    | ~Q('exists', field=filter_key))

            #s = s.sort("xpos")

        #logger.info("=====")
        #logger.info("FULL QUERY OBJ: " + pformat(s.__dict__))
        #logger.info("FILTERS: " + pformat(s.to_dict()))
        # https://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.scan
        start = time.time()
        # Request one more than the limit so an over-limit result is detectable.
        s = s.params(size=max_results_limit + 1)
        #if not include_all_consequences:
        #    s = s.source(exclude=["sortedTranscriptConsequences"])
        response = s.execute()
        logger.info("=====")

        logger.info("TOTAL: %s. Query took %s seconds" %
                    (response.hits.total, time.time() - start))

        if response.hits.total > max_results_limit + 1:
            raise Exception(
                "This search matched too many variants. Please set additional filters and try again."
            )

        #print(pformat(response.to_dict()))

        project = Project.objects.get(project_id=project_id)
        #gene_list_map = project.get_gene_list_map()

        reference = get_reference()

        #for i, hit in enumerate(response.hits):
        variant_results = []
        # Iterate all hits with the scan helper (not just the first page).
        for i, hit in enumerate(s.scan()):  # preserve_order=True
            #logger.info("HIT %s: %s %s %s" % (i, hit["variantId"], hit["geneIds"], pformat(hit.__dict__)))
            #print("HIT %s: %s" % (i, pformat(hit.to_dict())))
            filters = ",".join(hit["filters"] or []) if "filters" in hit else ""

            # Build per-individual genotype records from the flattened
            # "<sample>_num_alt" / "_ab" / "_gq" / "_dp" hit fields.
            genotypes = {}
            all_num_alt = []
            for individual_id in family_individual_ids:
                encoded_individual_id = _encode_name(individual_id)

                # -1 means the field was absent (no genotype for this sample).
                num_alt = int(hit["%s_num_alt" % encoded_individual_id]) if (
                    "%s_num_alt" % encoded_individual_id) in hit else -1
                if num_alt is not None:
                    all_num_alt.append(num_alt)

                alleles = []
                if num_alt == 0:
                    alleles = [hit["ref"], hit["ref"]]
                elif num_alt == 1:
                    alleles = [hit["ref"], hit["alt"]]
                elif num_alt == 2:
                    alleles = [hit["alt"], hit["alt"]]
                elif num_alt == -1 or num_alt == None:
                    alleles = []
                else:
                    raise ValueError("Invalid num_alt: " + str(num_alt))

                genotypes[individual_id] = {
                    'ab':
                    hit["%s_ab" % encoded_individual_id] if
                    ("%s_ab" % encoded_individual_id) in hit else None,
                    'alleles':
                    map(str, alleles),
                    'extras': {
                        # NOTE(review): 'ad' reads the _ab field while testing
                        # for _ad — looks like a copy/paste slip; confirm.
                        'ad':
                        hit["%s_ab" % encoded_individual_id] if
                        ("%s_ad" % encoded_individual_id) in hit else None,
                        'dp':
                        hit["%s_dp" % encoded_individual_id] if
                        ("%s_dp" % encoded_individual_id) in hit else None,
                        #'pl': '',
                    },
                    'filter':
                    filters or "pass",
                    'gq':
                    hit["%s_gq" % encoded_individual_id] if
                    ("%s_gq" % encoded_individual_id in hit
                     and hit["%s_gq" % encoded_individual_id] is not None) else
                    '',
                    'num_alt':
                    num_alt,
                }

            # Skip hits where no family member carries an alt allele.
            if all([num_alt <= 0 for num_alt in all_num_alt]):
                #logger.info("Filtered out due to genotype: " + str(genotypes))
                #print("Filtered all_num_alt <= 0 - Result %s: GRCh38: %s:%s, cadd: %s %s - %s" % (i, hit["contig"], hit["start"], hit["cadd_PHRED"] if "cadd_PHRED" in hit else "", hit["transcriptConsequenceTerms"], all_num_alt))
                continue

            vep_annotation = json.loads(
                str(hit['sortedTranscriptConsequences'])
            ) if 'sortedTranscriptConsequences' in hit else None

            # Compute the "other genome build" coordinate string; the native
            # build simply reuses the variantId.
            if project.genome_version == GENOME_VERSION_GRCh37:
                grch38_coord = None
                if self.liftover_grch37_to_grch38:
                    grch38_coord = self.liftover_grch37_to_grch38.convert_coordinate(
                        "chr%s" % hit["contig"].replace("chr", ""),
                        int(hit["start"]))
                    if grch38_coord and grch38_coord[0]:
                        grch38_coord = "%s-%s-%s-%s " % (
                            grch38_coord[0][0], grch38_coord[0][1], hit["ref"],
                            hit["alt"])
                    else:
                        grch38_coord = None
            else:
                grch38_coord = hit["variantId"]

            if project.genome_version == GENOME_VERSION_GRCh38:
                grch37_coord = None
                if self.liftover_grch38_to_grch37:
                    grch37_coord = self.liftover_grch38_to_grch37.convert_coordinate(
                        "chr%s" % hit["contig"].replace("chr", ""),
                        int(hit["start"]))
                    if grch37_coord and grch37_coord[0]:
                        grch37_coord = "%s-%s-%s-%s " % (
                            grch37_coord[0][0], grch37_coord[0][1], hit["ref"],
                            hit["alt"])
                    else:
                        grch37_coord = None
            else:
                grch37_coord = hit["variantId"]

            # Assemble the legacy xbrowse variant dict from the hit fields.
            result = {
                #u'_id': ObjectId('596d2207ff66f729285ca588'),
                'alt':
                str(hit["alt"]) if "alt" in hit else None,
                'annotation': {
                    'fathmm':
                    fathmm_map.get(hit["dbnsfp_FATHMM_pred"].split(';')[0])
                    if "dbnsfp_FATHMM_pred" in hit
                    and hit["dbnsfp_FATHMM_pred"] else None,
                    'muttaster':
                    muttaster_map.get(
                        hit["dbnsfp_MutationTaster_pred"].split(';')[0])
                    if "dbnsfp_MutationTaster_pred" in hit
                    and hit["dbnsfp_MutationTaster_pred"] else None,
                    'polyphen':
                    polyphen_map.get(
                        hit["dbnsfp_Polyphen2_HVAR_pred"].split(';')[0])
                    if "dbnsfp_Polyphen2_HVAR_pred" in hit
                    and hit["dbnsfp_Polyphen2_HVAR_pred"] else None,
                    'sift':
                    sift_map.get(hit["dbnsfp_SIFT_pred"].split(';')[0])
                    if "dbnsfp_SIFT_pred" in hit and hit["dbnsfp_SIFT_pred"]
                    else None,
                    'GERP_RS':
                    hit["dbnsfp_GERP_RS"] if "dbnsfp_GERP_RS" in hit else None,
                    'phastCons100way_vertebrate':
                    hit["dbnsfp_phastCons100way_vertebrate"]
                    if "dbnsfp_phastCons100way_vertebrate" in hit else None,
                    'cadd_phred':
                    hit["cadd_PHRED"] if "cadd_PHRED" in hit else None,
                    'dann_score':
                    hit["dbnsfp_DANN_score"]
                    if "dbnsfp_DANN_score" in hit else None,
                    'revel_score':
                    hit["dbnsfp_REVEL_score"]
                    if "dbnsfp_REVEL_score" in hit else None,
                    'eigen_phred':
                    hit["eigen_Eigen_phred"] if "eigen_Eigen_phred" in hit else
                    (hit["dbnsfp_Eigen_phred"]
                     if "dbnsfp_Eigen_phred" in hit else None),
                    'mpc_score':
                    hit["mpc_MPC"] if "mpc_MPC" in hit else None,
                    'annotation_tags':
                    list(hit["transcriptConsequenceTerms"] or [])
                    if "transcriptConsequenceTerms" in hit else None,
                    'coding_gene_ids':
                    list(hit['codingGeneIds'] or []),
                    'gene_ids':
                    list(hit['geneIds'] or []),
                    'vep_annotation':
                    vep_annotation,
                    'vep_group':
                    str(hit['mainTranscript_major_consequence'] or ""),
                    'vep_consequence':
                    str(hit['mainTranscript_major_consequence'] or ""),
                    'main_transcript': {
                        k.replace('mainTranscript_', ''): hit[k]
                        for k in dir(hit) if k.startswith('mainTranscript_')
                    },
                    'worst_vep_annotation_index':
                    0,
                    'worst_vep_index_per_gene': {
                        str(hit['mainTranscript_gene_id']): 0
                    },
                },
                'chr':
                hit["contig"],
                'coding_gene_ids':
                list(hit['codingGeneIds'] or []),
                'gene_ids':
                list(hit['geneIds'] or []),
                'coverage': {
                    'gnomad_exome_coverage':
                    float(hit["gnomad_exome_coverage"] or -1)
                    if "gnomad_exome_coverage" in hit else -1,
                    'gnomad_genome_coverage':
                    float(hit["gnomad_genome_coverage"] or -1)
                    if "gnomad_genome_coverage" in hit else -1,
                },
                'pop_counts': {
                    'AC':
                    int(hit['AC'] or 0) if 'AC' in hit else None,
                    'AN':
                    int(hit['AN'] or 0) if 'AN' in hit else None,
                    '1kg_AC':
                    int(hit['g1k_AC'] or 0) if 'g1k_AC' in hit else None,
                    '1kg_AN':
                    int(hit['g1k_AN'] or 0) if 'g1k_AN' in hit else None,
                    # NOTE(review): reads "exac_AC_Adj" but tests for
                    # "exac_Adj_AC" — field names disagree; confirm which is
                    # correct in the index mapping.
                    'exac_v3_AC':
                    int(hit["exac_AC_Adj"] or 0)
                    if "exac_Adj_AC" in hit else None,
                    'exac_v3_Het':
                    int(hit["exac_AC_Het"] or 0)
                    if "exac_AC_Het" in hit else None,
                    'exac_v3_Hom':
                    int(hit["exac_AC_Hom"] or 0)
                    if "exac_AC_Hom" in hit else None,
                    'exac_v3_Hemi':
                    int(hit["exac_AC_Hemi"] or 0)
                    if "exac_AC_Hemi" in hit else None,
                    'gnomad_exomes_AC':
                    int(hit["gnomad_exomes_AC"] or 0)
                    if "gnomad_exomes_AC" in hit else None,
                    'gnomad_exomes_Hom':
                    int(hit["gnomad_exomes_Hom"] or 0)
                    if "gnomad_exomes_Hom" in hit else None,
                    'gnomad_exomes_Hemi':
                    int(hit["gnomad_exomes_Hemi"] or 0)
                    if "gnomad_exomes_Hemi" in hit else None,
                    'gnomad_exomes_AN':
                    int(hit["gnomad_exomes_AN"] or 0)
                    if "gnomad_exomes_AN" in hit else None,
                    'gnomad_genomes_AC':
                    int(hit["gnomad_genomes_AC"] or 0)
                    if "gnomad_genomes_AC" in hit else None,
                    'gnomad_genomes_Hom':
                    int(hit["gnomad_genomes_Hom"] or 0)
                    if "gnomad_genomes_Hom" in hit else None,
                    'gnomad_genomes_Hemi':
                    int(hit["gnomad_genomes_Hemi"] or 0)
                    if "gnomad_genomes_Hemi" in hit else None,
                    'gnomad_genomes_AN':
                    int(hit["gnomad_genomes_AN"] or 0)
                    if "gnomad_genomes_AN" in hit else None,
                    'topmed_AC':
                    float(hit["topmed_AC"] or 0)
                    if "topmed_AC" in hit else None,
                    'topmed_Het':
                    float(hit["topmed_Het"] or 0)
                    if "topmed_Het" in hit else None,
                    'topmed_Hom':
                    float(hit["topmed_Hom"] or 0)
                    if "topmed_Hom" in hit else None,
                    'topmed_AN':
                    float(hit["topmed_AN"] or 0)
                    if "topmed_AN" in hit else None,
                },
                'db_freqs': {
                    'AF':
                    float(hit["AF"] or 0.0) if "AF" in hit else None,
                    '1kg_wgs_AF':
                    float(hit["g1k_AF"] or 0.0) if "g1k_AF" in hit else None,
                    '1kg_wgs_popmax_AF':
                    float(hit["g1k_POPMAX_AF"] or 0.0)
                    if "g1k_POPMAX_AF" in hit else None,
                    # Fall back to computing AF from adjusted AC/AN when the
                    # precomputed exac_AF field is absent.
                    'exac_v3_AF':
                    float(hit["exac_AF"] or 0.0) if "exac_AF" in hit else
                    (hit["exac_AC_Adj"] / float(hit["exac_AN_Adj"])
                     if "exac_AC_Adj" in hit and "exac_AN_Adj" in hit
                     and int(hit["exac_AN_Adj"] or 0) > 0 else None),
                    'exac_v3_popmax_AF':
                    float(hit["exac_AF_POPMAX"] or 0.0)
                    if "exac_AF_POPMAX" in hit else None,
                    'gnomad_exomes_AF':
                    float(hit["gnomad_exomes_AF"] or 0.0)
                    if "gnomad_exomes_AF" in hit else None,
                    'gnomad_exomes_popmax_AF':
                    float(hit["gnomad_exomes_AF_POPMAX"] or 0.0)
                    if "gnomad_exomes_AF_POPMAX" in hit else None,
                    'gnomad_genomes_AF':
                    float(hit["gnomad_genomes_AF"] or 0.0)
                    if "gnomad_genomes_AF" in hit else None,
                    'gnomad_genomes_popmax_AF':
                    float(hit["gnomad_genomes_AF_POPMAX"] or 0.0)
                    if "gnomad_genomes_AF_POPMAX" in hit else None,
                    'topmed_AF':
                    float(hit["topmed_AF"] or 0.0)
                    if "topmed_AF" in hit else None,
                },
                #'popmax_populations': {
                #    'exac_popmax': hit["exac_POPMAX"] or None,
                #    'gnomad_exomes_popmax': hit["gnomad_exomes_POPMAX"] or None,
                #    'gnomad_genomes_popmax': hit["gnomad_genomes_POPMAX"] or None,
                #},
                'db_gene_ids':
                list((hit["geneIds"] or []) if "geneIds" in hit else []),
                'db_tags':
                str(hit["transcriptConsequenceTerms"] or "")
                if "transcriptConsequenceTerms" in hit else None,
                'extras': {
                    'clinvar_variant_id':
                    hit['clinvar_variation_id']
                    if 'clinvar_variation_id' in hit
                    and hit['clinvar_variation_id'] else None,
                    'clinvar_allele_id':
                    hit['clinvar_allele_id'] if 'clinvar_allele_id' in hit
                    and hit['clinvar_allele_id'] else None,
                    'clinvar_clinsig':
                    hit['clinvar_clinical_significance'].lower() if
                    ('clinvar_clinical_significance' in hit)
                    and hit['clinvar_clinical_significance'] else None,
                    # HGMD data is restricted to staff users.
                    'hgmd_class':
                    hit['hgmd_class'] if 'hgmd_class' in hit and user
                    and user.is_staff else None,
                    'hgmd_accession':
                    hit['hgmd_accession']
                    if 'hgmd_accession' in hit else None,
                    'genome_version':
                    project.genome_version,
                    'grch37_coords':
                    grch37_coord,
                    'grch38_coords':
                    grch38_coord,
                    'alt_allele_pos':
                    0,
                    'orig_alt_alleles':
                    map(str,
                        [a.split("-")[-1] for a in hit["originalAltAlleles"]])
                    if "originalAltAlleles" in hit else None
                },
                'genotypes':
                genotypes,
                'pos':
                long(hit['start']),
                'pos_end':
                str(hit['end']),
                'ref':
                str(hit['ref']),
                'vartype':
                'snp' if len(hit['ref']) == len(hit['alt']) else "indel",
                'vcf_id':
                None,
                'xpos':
                long(hit["xpos"]),
                'xposx':
                long(hit["xpos"]),
            }

            result["annotation"]["freqs"] = result["db_freqs"]
            result["annotation"]["pop_counts"] = result["pop_counts"]
            result["annotation"]["db"] = "elasticsearch"

            result["extras"][
                "svlen"] = hit["SVLEN"] if "SVLEN" in hit else None
            result["extras"][
                "svtype"] = hit["SVTYPE"] if "SVTYPE" in hit else None

            logger.info(
                "Result %s: GRCh37: %s GRCh38: %s:, cadd: %s %s - gene ids: %s, coding gene_ids: %s"
                % (i, grch37_coord, grch38_coord,
                   hit["cadd_PHRED"] if "cadd_PHRED" in hit else "",
                   hit["transcriptConsequenceTerms"], result["gene_ids"],
                   result["coding_gene_ids"]))

            result["extras"]["project_id"] = project_id
            result["extras"]["family_id"] = family_id

            # add gene info
            gene_names = {}
            if vep_annotation is not None:
                gene_names = {
                    vep_anno["gene_id"]: vep_anno.get("gene_symbol")
                    for vep_anno in vep_annotation
                    if vep_anno.get("gene_symbol")
                }
            result["extras"]["gene_names"] = gene_names

            # Attach gene summaries, preferring coding genes; failures here
            # are logged but never abort the whole result set.
            try:
                genes = {}
                for gene_id in result["coding_gene_ids"]:
                    if gene_id:
                        genes[gene_id] = reference.get_gene_summary(
                            gene_id) or {}

                if not genes:
                    for gene_id in result["gene_ids"]:
                        if gene_id:
                            genes[gene_id] = reference.get_gene_summary(
                                gene_id) or {}

                #if not genes:
                #    genes = {vep_anno["gene_id"]: {"symbol": vep_anno["gene_symbol"]} for vep_anno in vep_annotation}

                result["extras"]["genes"] = genes
            except Exception as e:
                exc_type, exc_obj, exc_tb = sys.exc_info()
                logger.warn(
                    "WARNING: got unexpected error in add_gene_names_to_variants: %s : line %s"
                    % (e, exc_tb.tb_lineno))

            variant_results.append(result)

        logger.info("Finished returning the %s variants: %s seconds" %
                    (response.hits.total, time.time() - start))

        if self._redis_client:
            self._redis_client.set(cache_key, json.dumps(variant_results))

        return [
            Variant.fromJSON(variant_json)
            for variant_json in variant_results
        ]

    def get_variants(self,
                     project_id,
                     family_id,
                     genotype_filter=None,
                     variant_filter=None,
                     quality_filter=None,
                     indivs_to_consider=None,
                     user=None):
        """Yield variants for one family (generator wrapper around
        get_elasticsearch_variants)."""
        for variant in self.get_elasticsearch_variants(
                project_id,
                family_id,
                variant_filter=variant_filter,
                genotype_filter=genotype_filter,
                quality_filter=quality_filter,
                indivs_to_consider=indivs_to_consider,
                user=user,
        ):
            yield variant

    def get_variants_in_gene(self,
                             project_id,
                             family_id,
                             gene_id,
                             genotype_filter=None,
                             variant_filter=None):
        """Not implemented for this datastore — always raises ValueError."""
        if variant_filter is None:
            modified_variant_filter = VariantFilter()
        else:
            modified_variant_filter = copy.deepcopy(variant_filter)
        modified_variant_filter.add_gene(gene_id)

        #db_query = self._make_db_query(genotype_filter, modified_variant_filter, user=None)
        raise ValueError("Not Implemented")

    def get_single_variant(self,
                           project_id,
                           family_id,
                           xpos,
                           ref,
                           alt,
                           user=None):
        """Look up one variant by (xpos, ref, alt) within a family.

        Returns the Variant, or None if not found.
        :raises ValueError: if more than one record matches the variant id.
        """
        chrom, pos = get_chr_pos(xpos)

        variant_id = "%s-%s-%s-%s" % (chrom, pos, ref, alt)
        results = list(
            self.get_elasticsearch_variants(project_id,
                                            family_id=family_id,
                                            variant_id_filter=[variant_id],
                                            user=user,
                                            include_all_consequences=True))
        if not results:
            return None

        if len(results) > 1:
            raise ValueError(
                "Multiple variant records found for project: %s family: %s %s-%s-%s-%s: \n %s"
                % (project_id, family_id, chrom, pos, ref, alt, "\n".join(
                    [pformat(v.toJSON()) for v in results])))

        variant = results[0]

        return variant

    def get_multiple_variants(self,
                              project_id,
                              family_id,
                              xpos_ref_alt_tuples,
                              user=None):
        """
        Get one or more specific variants in a family
        Variant should be identifiable by xpos, ref, and alt

        Note that ref and alt are just strings from the VCF
        (for now)
        """
        variant_ids = []
        for xpos, ref, alt in xpos_ref_alt_tuples:
            chrom, pos = get_chr_pos(xpos)
            variant_ids.append("%s-%s-%s-%s" % (chrom, pos, ref, alt))

        results = self.get_elasticsearch_variants(
            project_id,
            family_id=family_id,
            variant_id_filter=variant_ids,
            user=user)

        # make sure all variants in xpos_ref_alt_tuples were retrieved and are in the same order.
        # Return None for tuples that weren't found in ES.
        results_by_xpos_ref_alt = {}
        for r in results:
            results_by_xpos_ref_alt[(r.xpos, r.ref, r.alt)] = r

        # create a list that's the same length as the input list of xpos_ref_alt_tuples, putting None for
        # xpos-ref-alt's that weren't found in the elasticsearch index
        results = [results_by_xpos_ref_alt.get(t) for t in xpos_ref_alt_tuples]
        return results

    def get_variants_cohort(self, project_id, cohort_id, variant_filter=None):
        """Not supported by the Elasticsearch datastore."""
        raise ValueError("Not implemented")

    def get_single_variant_cohort(self, project_id, cohort_id, xpos, ref,
                                  alt):
        """Not supported by the Elasticsearch datastore."""
        raise ValueError("Not implemented")

    def get_project_variants_in_gene(self,
                                     project_id,
                                     gene_id,
                                     variant_filter=None,
                                     user=None):
        """Return all project variants in a gene (limit raised to 9999)."""
        if variant_filter is None:
            modified_variant_filter = VariantFilter()
        else:
            modified_variant_filter = copy.deepcopy(variant_filter)
        modified_variant_filter.add_gene(gene_id)

        variants = [
            variant for variant in self.get_elasticsearch_variants(
                project_id,
                variant_filter=modified_variant_filter,
                user=user,
                max_results_limit=9999)
        ]

        return variants

    def _make_db_query(self, genotype_filter=None, variant_filter=None):
        """
        Caller specifies filters to get_variants, but they are evaluated later.
        Here, we just inspect those filters and see what heuristics we can
        apply to avoid a full table scan,
        Query here must return a superset of the true get_variants results

        Note that the full annotation isn't stored, so use the fields
        added by _add_index_fields_to_variant
        """
        db_query = {}

        # genotype filter
        if genotype_filter is not None:
            _add_genotype_filter_to_variant_query(db_query, genotype_filter)

        if variant_filter:
            logger.info(pformat(variant_filter.toJSON()))

            if variant_filter.locations:
                location_ranges = []
                for i, location in enumerate(variant_filter.locations):
                    # A location is either a "chrom:start-end" string or an
                    # already-parsed (xstart, xend) tuple; strings are parsed
                    # and written back into the filter in place.
                    if isinstance(location, basestring):
                        chrom, pos_range = location.split(":")
                        start, end = pos_range.split("-")
                        xstart = genomeloc.get_xpos(chrom, int(start))
                        xend = genomeloc.get_xpos(chrom, int(end))
                        variant_filter.locations[i] = (xstart, xend)
                    else:
                        xstart, xend = location

                    location_ranges.append({
                        '$and': [{
                            'xpos': {
                                '$gte': xstart
                            }
                        }, {
                            'xpos': {
                                '$lte': xend
                            }
                        }]
                    })

                db_query['$or'] = location_ranges

            if variant_filter.so_annotations:
                db_query['db_tags'] = {'$in': variant_filter.so_annotations}
            if variant_filter.genes:
                if getattr(variant_filter, 'exclude_genes'):
                    db_query['db_gene_ids'] = {'$nin': variant_filter.genes}
                else:
                    db_query['db_gene_ids'] = {'$in': variant_filter.genes}
            if variant_filter.ref_freqs:
                for population, freq in variant_filter.ref_freqs:
                    #if population in self._annotator.reference_population_slugs:
                    db_query['db_freqs.' + population] = {'$lte': freq}
            if variant_filter.ref_acs:
                for population, ac in variant_filter.ref_acs:
                    db_query['db_acs.' + population] = {'$lte': ac}
            if variant_filter.ref_hom_hemi:
                # One threshold constrains both the hom and hemi counts.
                for population, count in variant_filter.ref_hom_hemi:
                    db_query['db_hemi.' + population] = {'$lte': count}
                    db_query['db_hom.' + population] = {'$lte': count}

        return db_query

    def family_exists(self, project_id, family_id):
        """Return True if the family exists and has variant data loaded."""
        from xbrowse_server.base.models import Family
        family = Family.objects.get(project__project_id=project_id,
                                    family_id=family_id)
        return family.has_variant_data()

    def get_family_status(self, project_id, family_id):
        """Return 'loaded' or 'not_loaded' depending on family_exists()."""
        if self.family_exists(project_id, family_id):
            return 'loaded'
        else:
            return 'not_loaded'

    def project_collection_is_loaded(self, project):
        """Returns true if the project collection is fully loaded (this is
        the collection that stores the project-wide set of variants used for
        gene search)."""
        return project.get_elasticsearch_index() is not None
def handle(self, *args, **options): """transfer project""" project_arg = options['project'] elasticsearch_index = options['es_index'] project = Project.objects.get( Q(name=project_arg) | Q(guid=project_arg)) logger.info('Updating project genome version for {}'.format( project.name)) # Validate the provided index logger.info('Validating es index {}'.format(elasticsearch_index)) sample_ids, index_metadata = get_elasticsearch_index_samples( elasticsearch_index) validate_index_metadata(index_metadata, project, elasticsearch_index, genome_version=GENOME_VERSION_GRCh38) sample_type = index_metadata['sampleType'] dataset_path = index_metadata['sourceFilePath'] matched_sample_id_to_sample_record = match_sample_ids_to_sample_records( project=project, sample_ids=sample_ids, sample_type=sample_type, dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS, elasticsearch_index=elasticsearch_index, sample_id_to_individual_id_mapping={}, ) unmatched_samples = set(sample_ids) - set( matched_sample_id_to_sample_record.keys()) if len(unmatched_samples) > 0: raise CommandError( 'Matches not found for ES sample ids: {}.'.format( ', '.join(unmatched_samples))) prefetch_related_objects(matched_sample_id_to_sample_record.values(), 'individual__family') included_families = { sample.individual.family for sample in matched_sample_id_to_sample_record.values() } missing_individuals = Individual.objects.filter( family__in=included_families, sample__is_active=True, sample__dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS, ).exclude(sample__in=matched_sample_id_to_sample_record.values() ).select_related('family') missing_family_individuals = defaultdict(list) for individual in missing_individuals: missing_family_individuals[individual.family].append(individual) if missing_family_individuals: raise CommandError( 'The following families are included in the callset but are missing some family members: {}.' 
.format(', '.join([ '{} ({})'.format( family.family_id, ', '.join([i.individual_id for i in missing_indivs])) for family, missing_indivs in missing_family_individuals.items() ]))) # Get and clean up expected saved variants saved_variant_models_by_guid = { v.guid: v for v in SavedVariant.objects.filter(family__project=project) } deleted_no_tags = set() for guid, variant in saved_variant_models_by_guid.items(): if not (variant.varianttag_set.count() or variant.variantnote_set.count()): deleted_no_tags.add(guid) if deleted_no_tags: if raw_input( 'Do you want to delete the following {} saved variants with no tags (y/n)?: {} ' .format(len(deleted_no_tags), ', '.join(deleted_no_tags))) == 'y': for guid in deleted_no_tags: saved_variant_models_by_guid.pop(guid).delete() logger.info('Deleted {} variants'.format(len(deleted_no_tags))) expected_families = { sv.family for sv in saved_variant_models_by_guid.values() } missing_families = expected_families - included_families if missing_families: raise CommandError( 'The following families have saved variants but are missing from the callset: {}.' .format(', '.join([f.family_id for f in missing_families]))) # Lift-over saved variants _update_variant_samples(matched_sample_id_to_sample_record, elasticsearch_index, dataset_path) saved_variants = get_json_for_saved_variants( saved_variant_models_by_guid.values(), add_details=True) saved_variants_to_lift = [ v for v in saved_variants if v['genomeVersion'] != GENOME_VERSION_GRCh38 ] num_already_lifted = len(saved_variants) - len(saved_variants_to_lift) if num_already_lifted: if raw_input( 'Found {} saved variants already on Hg38. Continue with liftover (y/n)? ' .format(num_already_lifted)) != 'y': raise CommandError( 'Error: found {} saved variants already on Hg38'.format( num_already_lifted)) logger.info( 'Lifting over {} variants (skipping {} that are already lifted)'. 
format(len(saved_variants_to_lift), num_already_lifted)) liftover_to_38 = LiftOver('hg19', 'hg38') hg37_to_hg38_xpos = {} lift_failed = {} for v in saved_variants_to_lift: if not (hg37_to_hg38_xpos.get(v['xpos']) or v['xpos'] in lift_failed): hg38_coord = liftover_to_38.convert_coordinate( 'chr{}'.format(v['chrom'].lstrip('chr')), int(v['pos'])) if hg38_coord and hg38_coord[0]: hg37_to_hg38_xpos[v['xpos']] = get_xpos( hg38_coord[0][0], hg38_coord[0][1]) else: lift_failed[v['xpos']] = v if lift_failed: if raw_input( 'Unable to lift over the following {} coordinates. Continue with update (y/n)?: {} ' .format( len(lift_failed), ', '.join([ '{}:{}-{}-{} ({})'.format( v['chrom'], v['pos'], v['ref'], v['alt'], ', '.join(v['familyGuids'])) for v in lift_failed.values() ]))) != 'y': raise CommandError( 'Error: unable to lift over {} variants'.format( len(lift_failed))) saved_variants_map = defaultdict(list) for v in saved_variants_to_lift: if hg37_to_hg38_xpos.get(v['xpos']): variant_model = saved_variant_models_by_guid[v['variantGuid']] saved_variants_map[(hg37_to_hg38_xpos[v['xpos']], v['ref'], v['alt'])].append(variant_model) es_variants = get_es_variants_for_variant_tuples( expected_families, saved_variants_map.keys()) missing_variants = set( saved_variants_map.keys()) - {(v['xpos'], v['ref'], v['alt']) for v in es_variants} if missing_variants: missing_variant_strings = [] for xpos, ref, alt in missing_variants: var_id = '{}-{}-{}'.format(xpos, ref, alt) for v in saved_variants_map[(xpos, ref, alt)]: tags = v.varianttag_set.all() notes = v.variantnote_set.all() missing_variant_strings.append( '{var_id} {family_id}: {tags} ({guid})'.format( var_id=var_id, family_id=v.family.family_id, guid=v.guid, tags=', '.join([ tag.variant_tag_type.name for tag in tags ]) if tags else 'No Tags; {}'.format('; '.join( [note.note for note in notes])))) if raw_input( 'Unable to find the following {} variants in the index. 
Continue with update (y/n)?:\n{}\n' .format(len(missing_variants), '\n'.join(missing_variant_strings))) != 'y': raise CommandError( 'Error: unable to find {} lifted-over variants'.format( len(missing_variants))) logger.info('Successfully lifted over {} variants'.format( len(es_variants))) # Update saved variants missing_family_count = 0 for var in es_variants: saved_variant_models = saved_variants_map[(var['xpos'], var['ref'], var['alt'])] missing_saved_variants = [ v for v in saved_variant_models if v.family.guid not in var['familyGuids'] ] if missing_saved_variants: variant_id = '{}-{}-{}-{}'.format(var['chrom'], var['pos'], var['ref'], var['alt']) if raw_input( ('Variant {} (hg37: {}) not find for expected families {}. Continue with update (y/n)? ' .format( variant_id, missing_saved_variants[0].xpos, ', '.join([ '{} ({})'.format(v.family.guid, v.guid) for v in missing_saved_variants ])))) == 'y': var = get_single_es_variant( [v.family for v in saved_variant_models], variant_id, return_all_queried_families=True) missing_family_count += len(missing_saved_variants) else: raise CommandError( 'Error: unable to find family data for lifted over variant' ) for saved_variant in saved_variant_models: saved_variant.xpos_start = var['xpos'] saved_variant.saved_variant_json = var saved_variant.save() logger.info('Successfully updated {} variants'.format( len(es_variants))) # Update project and sample data update_model_from_json(project, {'genome_version': GENOME_VERSION_GRCh38}) reset_cached_search_results(project) logger.info('---Done---') logger.info( 'Succesfully lifted over {} variants. Skipped {} failed variants. Family data not updated for {} variants' .format(len(es_variants), len(missing_variants) + len(lift_failed), missing_family_count))
    def get_elasticsearch_variants(
            self,
            project_id,
            family_id=None,
            variant_filter=None,
            genotype_filter=None,
            variant_id_filter=None,
            quality_filter=None,
            indivs_to_consider=None,
            include_all_consequences=False,
            user=None,
            max_results_limit=settings.VARIANT_QUERY_RESULTS_LIMIT,
        ):
        """Run a variant search against Elasticsearch and return Variant objects.

        Builds an elasticsearch-dsl query from the Mongo-style query produced by
        _make_db_query (genotype, annotation, gene, region, and frequency
        filters), executes it, and converts each hit into the legacy Variant
        JSON structure. Results are cached in redis (when configured) keyed on
        the full set of filter arguments.

        Raises Exception when more than max_results_limit variants match.
        """
        from xbrowse_server.base.models import Project, Family, Individual
        from seqr.models import Sample
        from xbrowse_server.mall import get_reference
        from pyliftover.liftover import LiftOver

        # Cache key covers every argument that changes the result set.
        cache_key = "Variants___%s___%s___%s" % (
            project_id,
            family_id,
            json.dumps([
                variant_filter.toJSON() if variant_filter else None,
                genotype_filter,
                quality_filter,
                variant_id_filter,
                indivs_to_consider,
                include_all_consequences,
            ])
        )
        cached_results = self._redis_client and self._redis_client.get(cache_key)
        if cached_results is not None:
            variant_results = json.loads(cached_results)
            return [Variant.fromJSON(variant_json) for variant_json in variant_results]

        # Resolve the index to search: project-wide or family-specific.
        if family_id is None:
            project = Project.objects.get(project_id=project_id)
            elasticsearch_index = project.get_elasticsearch_index()
            logger.info("Searching in project elasticsearch index: " + str(elasticsearch_index))
        else:
            family = Family.objects.get(project__project_id=project_id, family_id=family_id)
            elasticsearch_index = family.get_elasticsearch_index()
            project = family.project
            logger.info("Searching in family elasticsearch index: " + str(elasticsearch_index))

        if indivs_to_consider is None and genotype_filter and not family_id:
            indivs_to_consider = genotype_filter.keys()

        individuals = Individual.objects.filter(family__project__project_id=project_id).only("indiv_id", "seqr_individual")
        if indivs_to_consider:
            individuals = individuals.filter(indiv_id__in=indivs_to_consider)
        if family_id is not None:
            individuals = individuals.filter(family__family_id=family_id)
        if not indivs_to_consider:
            indivs_to_consider = [i.indiv_id for i in individuals]
        prefetch_related_objects(individuals, "seqr_individual")

        # An index arg may be a comma-separated list with trailing wildcards.
        es_indices = [index.rstrip('*') for index in elasticsearch_index.split(',')]

        samples = Sample.objects.filter(
            individual__in=[i.seqr_individual for i in individuals if i.seqr_individual],
            dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS,
            sample_status=Sample.SAMPLE_STATUS_LOADED,
            elasticsearch_index__startswith=es_indices[0],
            loaded_date__isnull=False,
        ).order_by('-loaded_date')
        prefetch_related_objects(samples, "individual")

        # Map legacy individual ids to the sample ids used as ES field prefixes;
        # falls back to the individual id itself when no sample matches.
        family_individual_ids_to_sample_ids = {}
        for i in individuals:
            indiv_id = i.indiv_id
            sample_id = None
            if i.seqr_individual:
                sample_id = next((
                    sample.sample_id for sample in samples
                    if sample.individual == i.seqr_individual and sample.elasticsearch_index.startswith(tuple(es_indices))
                ), None)
            family_individual_ids_to_sample_ids[indiv_id] = sample_id or indiv_id

        query_json = self._make_db_query(genotype_filter, variant_filter)

        try:
            # Lazily construct liftover objects (downloads chain files on first use).
            if self.liftover_grch38_to_grch37 is None:
                self.liftover_grch38_to_grch37 = LiftOver('hg38', 'hg19')

            if self.liftover_grch37_to_grch38 is None:
                # 37->38 liftover is deliberately disabled here.
                self.liftover_grch37_to_grch38 = None  # LiftOver('hg19', 'hg38')
        except Exception as e:
            logger.info("WARNING: Unable to set up liftover. Is there a working internet connection? " + str(e))

        mapping = self._es_client.indices.get_mapping(str(elasticsearch_index) + "*")
        index_fields = {}
        is_nested = False
        if elasticsearch_index in mapping and 'join_field' in mapping[elasticsearch_index]["mappings"]["variant"]["properties"]:
            # Nested indices are not sharded so all samples are in the single index
            logger.info("matching indices: " + str(elasticsearch_index))
            is_nested = True
        elif family_id is not None and len(family_individual_ids_to_sample_ids) > 0:
            # figure out which index to use
            # TODO add caching
            matching_indices = []

            # Find the first index whose mapping contains a per-sample num_alt
            # field for any of this family's samples.
            for raw_sample_id in family_individual_ids_to_sample_ids.values():
                sample_id = _encode_name(raw_sample_id)
                for index_name, index_mapping in mapping.items():
                    if sample_id + "_num_alt" in index_mapping["mappings"]["variant"]["properties"]:
                        matching_indices.append(index_name)
                        index_fields.update(index_mapping["mappings"]["variant"]["properties"])

                if len(matching_indices) > 0:
                    break

            if not matching_indices:
                # NOTE(review): the first branch can never fire here — this elif
                # is only reached when family_individual_ids_to_sample_ids is
                # non-empty. Also, indiv_id/index_mapping in the final log line
                # leak from the loops above and may be unbound if mapping was
                # empty — confirm before relying on this error path.
                if family_id is not None and not family_individual_ids_to_sample_ids:
                    logger.error("no individuals found for family %s" % (family_id))
                elif not mapping:
                    logger.error("no es mapping found for found with prefix %s" % (elasticsearch_index))
                else:
                    logger.error("%s not found in %s:\n%s" % (indiv_id, elasticsearch_index, pformat(index_mapping["mappings"]["variant"]["properties"])))
            else:
                elasticsearch_index = ",".join(matching_indices)
                logger.info("matching indices: " + str(elasticsearch_index))
        else:
            elasticsearch_index = str(elasticsearch_index) + "*"

        if not index_fields:
            for index_mapping in mapping.values():
                index_fields.update(index_mapping["mappings"]["variant"]["properties"])

        s = elasticsearch_dsl.Search(using=self._es_client, index=elasticsearch_index)  # ",".join(indices))

        # OR together all requested variant ids.
        if variant_id_filter is not None:
            variant_id_filter_term = None
            for variant_id in variant_id_filter:
                q_obj = Q('term', **{"variantId": variant_id})
                if variant_id_filter_term is None:
                    variant_id_filter_term = q_obj
                else:
                    variant_id_filter_term |= q_obj
            s = s.filter(variant_id_filter_term)

        # Translate Mongo-style genotype clauses (int / {'$gte':..} / {'$in':..})
        # into (es_op, value) pairs keyed by sample id.
        genotype_filters = {}
        for key, value in query_json.items():
            if key.startswith("genotypes"):
                indiv_id = ".".join(key.split(".")[1:-1])
                sample_id = family_individual_ids_to_sample_ids.get(indiv_id) or indiv_id
                genotype_filter = value
                if type(genotype_filter) == int or type(genotype_filter) == basestring:
                    genotype_filters[sample_id] = [('term', genotype_filter)]
                elif '$gte' in genotype_filter:
                    genotype_filter = {k.replace("$", ""): v for k, v in genotype_filter.items()}
                    genotype_filters[sample_id] = [('range', genotype_filter)]
                elif "$in" in genotype_filter:
                    num_alt_values = genotype_filter['$in']
                    genotype_filters[sample_id] = [('term', num_alt_value) for num_alt_value in num_alt_values]

        sample_ids = [family_individual_ids_to_sample_ids.get(indiv_id) or indiv_id
                      for indiv_id in (indivs_to_consider or [])]

        min_ab = None
        min_gq = None
        if quality_filter is not None and indivs_to_consider:
            min_ab = quality_filter.get('min_ab')
            if min_ab is not None:
                min_ab /= 100.0  # convert to fraction
            min_gq = quality_filter.get('min_gq')
            vcf_filter = quality_filter.get('vcf_filter')
            if vcf_filter is not None:
                # Only keep variants with no FILTER entries (i.e. PASS).
                s = s.filter(~Q('exists', field='filters'))

        if is_nested:
            # Parent/child (join-field) index layout: genotypes are child docs.
            quality_q = Q()
            if min_ab or min_gq:
                if min_ab is not None:
                    # AB only relevant for hets
                    quality_q &= Q(~Q('term', num_alt=1) | Q('range', ab={'gte': min_ab}))
                if min_gq is not None:
                    quality_q &= Q('range', gq={'gte': min_gq})

            if genotype_filters:
                # Return inner hits for all requested samples, even those without a specified genotype
                genotype_sample_ids = sample_ids or genotype_filters.keys()
                genotype_q = None
                for sample_id in genotype_sample_ids:
                    sample_q = Q(Q('term', sample_id=sample_id) & quality_q)
                    if genotype_filters.get(sample_id):
                        q = None
                        for (op, val) in genotype_filters[sample_id]:
                            if q:
                                q |= Q(op, num_alt=val)
                            else:
                                q = Q(op, num_alt=val)
                        sample_q &= q
                    if not genotype_q:
                        genotype_q = sample_q
                    else:
                        genotype_q |= sample_q
                genotype_kwargs = {'query': genotype_q, 'min_children': len(genotype_sample_ids)}
            elif sample_ids:
                # Subquery for child docs with the requested sample IDs and quality metrics
                sample_id_q = Q('terms', sample_id=sample_ids) & quality_q
                # Only return variants where at least one of the requested samples has an alt allele
                s = s.filter(Q('has_child', type='genotype', query=(Q(Q('range', num_alt={'gte': 1}) & sample_id_q))))
                # Return inner hits for all the requested samples regardless of genotype
                genotype_kwargs = {'query': sample_id_q, 'min_children': len(sample_ids)}
            else:
                # Return all inner hits for the variant
                # This case is only used by gene search, which also does not use quality filters
                genotype_kwargs = {'query': Q()}

            s = s.filter(Q(
                'has_child', type='genotype',
                inner_hits={'size': genotype_kwargs.get('min_children', MAX_INNER_HITS)}, **genotype_kwargs))
        else:
            # Flat index layout: per-sample fields like "<sample>_num_alt" on the
            # variant doc itself.
            for sample_id, queries in genotype_filters.items():
                encoded_sample_id = _encode_name(sample_id)
                q = Q(queries[0][0], **{encoded_sample_id + "_num_alt": queries[0][1]})
                for (op, val) in queries[1:]:
                    q = q | Q(op, **{encoded_sample_id + "_num_alt": val})
                s = s.filter(q)

            if sample_ids:
                # Require at least one non-ref genotype among the requested samples.
                atleast_one_nonref_genotype_filter = None
                for sample_id in sample_ids:
                    encoded_sample_id = _encode_name(sample_id)
                    q = Q('range', **{encoded_sample_id + "_num_alt": {'gte': 1}})
                    if atleast_one_nonref_genotype_filter is None:
                        atleast_one_nonref_genotype_filter = q
                    else:
                        atleast_one_nonref_genotype_filter |= q
                s = s.filter(atleast_one_nonref_genotype_filter)

            if min_ab or min_gq:
                for sample_id in sample_ids:
                    encoded_sample_id = _encode_name(sample_id)
                    if min_ab:
                        # AB filter only applies to het calls (num_alt == 1).
                        s = s.filter(
                            ~Q('term', **{encoded_sample_id + "_num_alt": 1}) |
                            Q('range', **{encoded_sample_id + "_ab": {'gte': min_ab}}))
                        #logger.info("### ADDED FILTER: " + str({encoded_sample_id + "_ab": {'gte': min_ab}}))
                    if min_gq:
                        s = s.filter('range', **{encoded_sample_id + "_gq": {'gte': min_gq}})
                        #logger.info("### ADDED FILTER: " + str({encoded_sample_id + "_gq": {'gte': min_gq}}))

        # parse variant query
        annotation_groups_map = ANNOTATION_GROUPS_MAP_INTERNAL if user and user.is_staff else ANNOTATION_GROUPS_MAP

        for key, value in query_json.items():
            if key == 'db_tags':
                so_annotations = query_json.get('db_tags', {}).get('$in', [])

                # handle clinvar filters
                selected_so_annotations_set = set(so_annotations)

                all_clinvar_filters_set = set(annotation_groups_map.get("clinvar", {}).get("children", []))
                selected_clinvar_filters_set = all_clinvar_filters_set & selected_so_annotations_set

                all_hgmd_filters_set = set(annotation_groups_map.get("hgmd", {}).get("children", []))
                selected_hgmd_filters_set = all_hgmd_filters_set & selected_so_annotations_set

                # VEP terms are whatever remains after removing pseudo-terms
                # handled by the clinvar/hgmd branches below.
                vep_consequences = list(selected_so_annotations_set - selected_clinvar_filters_set - selected_hgmd_filters_set)
                consequences_filter = Q("terms", transcriptConsequenceTerms=vep_consequences)

                if selected_clinvar_filters_set:
                    clinvar_clinical_significance_terms = set()
                    for clinvar_filter in selected_clinvar_filters_set:
                        # translate selected filters to the corresponding clinvar clinical consequence terms
                        if clinvar_filter == "pathogenic":
                            clinvar_clinical_significance_terms.update(["Pathogenic", "Pathogenic/Likely_pathogenic"])
                        elif clinvar_filter == "likely_pathogenic":
                            clinvar_clinical_significance_terms.update(["Likely_pathogenic", "Pathogenic/Likely_pathogenic"])
                        elif clinvar_filter == "benign":
                            clinvar_clinical_significance_terms.update(["Benign", "Benign/Likely_benign"])
                        elif clinvar_filter == "likely_benign":
                            clinvar_clinical_significance_terms.update(["Likely_benign", "Benign/Likely_benign"])
                        elif clinvar_filter == "vus_or_conflicting":
                            clinvar_clinical_significance_terms.update([
                                "Conflicting_interpretations_of_pathogenicity",
                                "Uncertain_significance",
                                "not_provided",
                                "other"])
                        else:
                            raise ValueError("Unexpected clinvar filter: " + str(clinvar_filter))

                    consequences_filter = consequences_filter | Q(
                        "terms", clinvar_clinical_significance=list(clinvar_clinical_significance_terms))

                if selected_hgmd_filters_set:
                    hgmd_class = set()
                    for hgmd_filter in selected_hgmd_filters_set:
                        # translate selected filters to the corresponding hgmd clinical consequence terms
                        if hgmd_filter == "disease_causing":
                            hgmd_class.update(["DM"])
                        elif hgmd_filter == "likely_disease_causing":
                            hgmd_class.update(["DM?"])
                        elif hgmd_filter == "hgmd_other":
                            hgmd_class.update(["DP", "DFP", "FP", "FTV"])
                        else:
                            raise ValueError("Unexpected hgmd filter: " + str(hgmd_filter))

                    consequences_filter = consequences_filter | Q("terms", hgmd_class=list(hgmd_class))

                if 'intergenic_variant' in vep_consequences:
                    # for many intergenic variants VEP doesn't add any annotations, so if user selected
                    # 'intergenic_variant', also match variants where transcriptConsequenceTerms is emtpy
                    consequences_filter = consequences_filter | ~Q('exists', field='transcriptConsequenceTerms')

                s = s.filter(consequences_filter)
                #logger.info("==> transcriptConsequenceTerms: %s" % str(vep_consequences))

            if key.startswith("genotypes"):
                continue

            if key == "db_gene_ids":
                db_gene_ids = query_json.get('db_gene_ids', {})

                exclude_genes = db_gene_ids.get('$nin', [])

                gene_ids = exclude_genes or db_gene_ids.get('$in', [])

                if exclude_genes:
                    s = s.exclude("terms", geneIds=gene_ids)
                else:
                    s = s.filter("terms", geneIds=gene_ids)
                #logger.info("==> %s %s" % ("exclude" if exclude_genes else "include", "geneIds: " + str(gene_ids)))

            if key == "$or" and type(value) == list:
                # Region filters: OR of inclusive xpos ranges.
                q_terms = None
                for region_filter in value:
                    xpos_filters = region_filter.get("$and", {})

                    # for example: $or : [{'$and': [{'xpos': {'$gte': 12345}}, {'xpos': {'$lte': 54321}}]}]
                    xpos_filters_dict = {}
                    for xpos_filter in xpos_filters:
                        xpos_filter_setting = xpos_filter["xpos"]  # for example {'$gte': 12345} or {'$lte': 54321}
                        xpos_filters_dict.update(xpos_filter_setting)

                    xpos_filter_setting = {k.replace("$", ""): v for k, v in xpos_filters_dict.items()}
                    q = Q('range', **{"xpos": xpos_filter_setting})
                    if q_terms is None:
                        q_terms = q
                    else:
                        q_terms |= q
                if q_terms is not None:
                    s = s.filter(q_terms)

                #logger.info("==> xpos range: " + str({"xpos": xpos_filter_setting}))

            # Map Mongo-style population frequency keys to the ES field name(s)
            # that hold the equivalent value.
            af_key_map = {
                "db_freqs.AF": ["AF"],
                "db_freqs.1kg_wgs_phase3": ["g1k_POPMAX_AF"],
                "db_freqs.exac_v3": ["exac_AF_POPMAX"],
                "db_freqs.topmed": ["topmed_AF"],
                "db_freqs.gnomad_exomes": ["gnomad_exomes_AF_POPMAX", "gnomad_exomes_AF_POPMAX_OR_GLOBAL"],
                "db_freqs.gnomad_genomes": ["gnomad_genomes_AF_POPMAX", "gnomad_genomes_AF_POPMAX_OR_GLOBAL"],
                "db_freqs.gnomad-exomes2": ["gnomad_exomes_AF_POPMAX", "gnomad_exomes_AF_POPMAX_OR_GLOBAL"],
                "db_freqs.gnomad-genomes2": ["gnomad_genomes_AF_POPMAX", "gnomad_genomes_AF_POPMAX_OR_GLOBAL"],
            }

            if key in af_key_map:
                for filter_key in af_key_map[key]:
                    af_filter_setting = {k.replace("$", ""): v for k, v in value.items()}
                    # Missing field counts as passing the frequency threshold.
                    s = s.filter(Q('range', **{filter_key: af_filter_setting}) | ~Q('exists', field=filter_key))
                #logger.info("==> %s: %s" % (filter_key, af_filter_setting))

            ac_key_map = {
                "db_acs.AF": "AC",
                "db_acs.1kg_wgs_phase3": "g1k_AC",
                "db_acs.exac_v3": "exac_AC",
                "db_acs.topmed": "topmed_AC",
                "db_acs.gnomad_exomes": "gnomad_exomes_AC",
                "db_acs.gnomad_genomes": "gnomad_genomes_AC",
                "db_acs.gnomad-exomes2": "gnomad_exomes_AC",
                "db_acs.gnomad-genomes2": "gnomad_genomes_AC",
            }

            if key in ac_key_map:
                filter_key = ac_key_map[key]
                ac_filter_setting = {k.replace("$", ""): v for k, v in value.items()}
                s = s.filter(Q('range', **{filter_key: ac_filter_setting}) | ~Q('exists', field=filter_key))

            hemi_key_map = {
                "db_hemi.exac_v3": "exac_AC_Hemi",
                "db_hemi.gnomad_exomes": "gnomad_exomes_Hemi",
                "db_hemi.gnomad_genomes": "gnomad_genomes_Hemi",
                "db_hemi.gnomad-exomes2": "gnomad_exomes_Hemi",
                "db_hemi.gnomad-genomes2": "gnomad_genomes_Hemi",
            }

            if key in hemi_key_map:
                filter_key = hemi_key_map[key]
                hemi_filter_setting = {k.replace("$", ""): v for k, v in value.items()}
                s = s.filter(Q('range', **{filter_key: hemi_filter_setting}) | ~Q('exists', field=filter_key))

            hom_key_map = {
                "db_hom.exac_v3": "exac_AC_Hom",
                "db_hom.gnomad_exomes": "gnomad_exomes_Hom",
                "db_hom.gnomad_genomes": "gnomad_genomes_Hom",
                "db_hom.gnomad-exomes2": "gnomad_exomes_Hom",
                "db_hom.gnomad-genomes2": "gnomad_genomes_Hom",
            }

            if key in hom_key_map:
                filter_key = hom_key_map[key]
                hom_filter_setting = {k.replace("$", ""): v for k, v in value.items()}
                s = s.filter(Q('range', **{filter_key: hom_filter_setting}) | ~Q('exists', field=filter_key))

            #s = s.sort("xpos")

        #logger.info("=====")
        #logger.info("FULL QUERY OBJ: " + pformat(s.__dict__))
        #logger.info("FILTERS: " + pformat(s.to_dict()))

        # https://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.scan
        start = time.time()
        # Fetch one extra doc so an over-limit result set can be detected.
        s = s.params(size=max_results_limit + 1)
        #if not include_all_consequences:
        #    s = s.source(exclude=["sortedTranscriptConsequences"])
        response = s.execute()
        logger.info("=====")
        logger.info("TOTAL: %s. Query took %s seconds" % (response.hits.total, time.time() - start))

        if response.hits.total > max_results_limit + 1:
            raise Exception("This search matched too many variants. Please set additional filters and try again.")

        #print(pformat(response.to_dict()))

        project = Project.objects.get(project_id=project_id)
        #gene_list_map = project.get_gene_list_map()

        reference = get_reference()

        #for i, hit in enumerate(response.hits):
        variant_results = []
        for i, hit in enumerate(response):  # preserve_order=True
            #logger.info("HIT %s: %s %s %s" % (i, hit["variantId"], hit["geneIds"], pformat(hit.__dict__)))
            #print("HIT %s: %s" % (i, pformat(hit.to_dict())))
            filters = ",".join(hit["filters"] or []) if "filters" in hit else ""

            genotypes = {}
            all_num_alt = []

            if is_nested:
                genotypes_by_sample_id = {gen_hit['sample_id']: gen_hit for gen_hit in hit.meta.inner_hits.genotype}

            for individual_id, sample_id in family_individual_ids_to_sample_ids.items():
                def _get_hit_field(field):
                    # Look up a genotype field either on the child doc (nested
                    # layout) or on the flat "<sample>_<field>" variant field.
                    if is_nested:
                        gen_hit = genotypes_by_sample_id.get(sample_id, {})
                        key = field
                    else:
                        gen_hit = hit
                        key = '{}_{}'.format(_encode_name(sample_id), field)
                    return gen_hit[key] if key in gen_hit else None

                num_alt = _get_hit_field('num_alt')
                if num_alt is None:
                    num_alt = -1
                all_num_alt.append(num_alt)

                alleles = []
                if num_alt == 0:
                    alleles = [hit["ref"], hit["ref"]]
                elif num_alt == 1:
                    alleles = [hit["ref"], hit["alt"]]
                elif num_alt == 2:
                    alleles = [hit["alt"], hit["alt"]]
                elif num_alt == -1 or num_alt == None:
                    alleles = []
                else:
                    raise ValueError("Invalid num_alt: " + str(num_alt))

                genotypes[individual_id] = {
                    'ab': _get_hit_field('ab'),
                    'alleles': map(str, alleles),
                    'extras': {
                        'ad': _get_hit_field('ad'),
                        'dp': _get_hit_field('dp'),
                        #'pl': '',
                    },
                    'filter': filters or "pass",
                    'gq': _get_hit_field('gq') or '',
                    'num_alt': num_alt,
                }

            vep_annotation = hit['sortedTranscriptConsequences'] if 'sortedTranscriptConsequences' in hit else None
            if vep_annotation is not None:
                if is_nested:
                    vep_annotation = [annot.to_dict() for annot in vep_annotation]
                else:
                    vep_annotation = json.loads(str(vep_annotation))

            gene_ids = list(hit['geneIds'] or [])
            # vep_annotation is sorted worst-first, so the first matching
            # annotation index is the worst consequence for that gene.
            worst_vep_index_per_gene = {
                gene_id: next((i for i, annot in enumerate(vep_annotation) if annot['gene_id'] == gene_id), None)
                for gene_id in gene_ids
            }

            if project.genome_version == GENOME_VERSION_GRCh37:
                grch38_coord = None
                if self.liftover_grch37_to_grch38:
                    grch38_coord = self.liftover_grch37_to_grch38.convert_coordinate(
                        "chr%s" % hit["contig"].replace("chr", ""), int(hit["start"]))
                    if grch38_coord and grch38_coord[0]:
                        grch38_coord = "%s-%s-%s-%s " % (grch38_coord[0][0], grch38_coord[0][1], hit["ref"], hit["alt"])
                    else:
                        grch38_coord = None
            else:
                grch38_coord = hit["variantId"]

            if project.genome_version == GENOME_VERSION_GRCh38:
                grch37_coord = None
                if self.liftover_grch38_to_grch37:
                    grch37_coord = self.liftover_grch38_to_grch37.convert_coordinate(
                        "chr%s" % hit["contig"].replace("chr", ""), int(hit["start"]))
                    if grch37_coord and grch37_coord[0]:
                        grch37_coord = "%s-%s-%s-%s " % (grch37_coord[0][0], grch37_coord[0][1], hit["ref"], hit["alt"])
                    else:
                        grch37_coord = None
            else:
                grch37_coord = hit["variantId"]

            # Output freq key -> ES field present in this index (None if absent).
            freq_fields = {
                'AF': "AF" if "AF" in index_fields else None,
                '1kg_wgs_AF': "g1k_AF" if "g1k_AF" in index_fields else None,
                '1kg_wgs_popmax_AF': "g1k_POPMAX_AF" if "g1k_POPMAX_AF" in index_fields else None,
                'exac_v3_AF': "exac_AF" if "exac_AF" in index_fields else None,
                'exac_v3_popmax_AF': "exac_AF_POPMAX" if "exac_AF_POPMAX" in index_fields else None,
                'gnomad_exomes_AF': "gnomad_exomes_AF" if "gnomad_exomes_AF" in index_fields else None,
                'gnomad_exomes_popmax_AF': "gnomad_exomes_AF_POPMAX_OR_GLOBAL" if "gnomad_exomes_AF_POPMAX_OR_GLOBAL" in index_fields else (
                    "gnomad_exomes_AF_POPMAX" if "gnomad_exomes_AF_POPMAX" in index_fields else None),
                'gnomad_genomes_AF': "gnomad_genomes_AF" if "gnomad_genomes_AF" in index_fields else None,
                'gnomad_genomes_popmax_AF': "gnomad_genomes_AF_POPMAX_OR_GLOBAL" if "gnomad_genomes_AF_POPMAX_OR_GLOBAL" in index_fields else (
                    "gnomad_genomes_AF_POPMAX" if "gnomad_genomes_AF_POPMAX" in index_fields else None),
                'topmed_AF': "topmed_AF" if "topmed_AF" in index_fields else None,
            }

            # Assemble the legacy Variant JSON structure from the ES hit.
            result = {
                #u'_id': ObjectId('596d2207ff66f729285ca588'),
                'alt': str(hit["alt"]) if "alt" in hit else None,
                'annotation': {
                    'fathmm': fathmm_map.get(hit["dbnsfp_FATHMM_pred"].split(';')[0]) if "dbnsfp_FATHMM_pred" in hit and hit["dbnsfp_FATHMM_pred"] else None,
                    'muttaster': muttaster_map.get(hit["dbnsfp_MutationTaster_pred"].split(';')[0]) if "dbnsfp_MutationTaster_pred" in hit and hit["dbnsfp_MutationTaster_pred"] else None,
                    'polyphen': polyphen_map.get(hit["dbnsfp_Polyphen2_HVAR_pred"].split(';')[0]) if "dbnsfp_Polyphen2_HVAR_pred" in hit and hit["dbnsfp_Polyphen2_HVAR_pred"] else None,
                    'sift': sift_map.get(hit["dbnsfp_SIFT_pred"].split(';')[0]) if "dbnsfp_SIFT_pred" in hit and hit["dbnsfp_SIFT_pred"] else None,
                    'metasvm': metasvm_map.get(hit["dbnsfp_MetaSVM_pred"].split(';')[0]) if "dbnsfp_MetaSVM_pred" in hit and hit["dbnsfp_MetaSVM_pred"] else None,
                    'GERP_RS': float(hit["dbnsfp_GERP_RS"]) if "dbnsfp_GERP_RS" in hit and hit["dbnsfp_GERP_RS"] else None,
                    'phastCons100way_vertebrate': float(hit["dbnsfp_phastCons100way_vertebrate"]) if "dbnsfp_phastCons100way_vertebrate" in hit and hit["dbnsfp_phastCons100way_vertebrate"] else None,
                    'cadd_phred': hit["cadd_PHRED"] if "cadd_PHRED" in hit else None,
                    'dann_score': hit["dbnsfp_DANN_score"] if "dbnsfp_DANN_score" in hit else None,
                    'revel_score': hit["dbnsfp_REVEL_score"] if "dbnsfp_REVEL_score" in hit else None,
                    'eigen_phred': hit["eigen_Eigen_phred"] if "eigen_Eigen_phred" in hit else (hit["dbnsfp_Eigen_phred"] if "dbnsfp_Eigen_phred" in hit else None),
                    'mpc_score': hit["mpc_MPC"] if "mpc_MPC" in hit else None,
                    'primate_ai_score': hit["primate_ai_score"] if "primate_ai_score" in hit else None,
                    'splice_ai_delta_score': hit["splice_ai_delta_score"] if "splice_ai_delta_score" in hit else None,
                    'rsid': hit["rsid"] if "rsid" in hit else None,
                    'annotation_tags': list(hit["transcriptConsequenceTerms"] or []) if "transcriptConsequenceTerms" in hit else None,
                    'coding_gene_ids': list(hit['codingGeneIds'] or []),
                    'gene_ids': list(hit['geneIds'] or []),
                    'vep_annotation': vep_annotation,
                    'vep_group': str(hit['mainTranscript_major_consequence'] or ""),
                    'vep_consequence': str(hit['mainTranscript_major_consequence'] or ""),
                    'main_transcript': {k.replace('mainTranscript_', ''): hit[k] for k in dir(hit) if k.startswith('mainTranscript_')},
                    'worst_vep_annotation_index': 0,
                    'worst_vep_index_per_gene': worst_vep_index_per_gene,
                },
                'chr': hit["contig"],
                'coding_gene_ids': list(hit['codingGeneIds'] or []),
                'gene_ids': gene_ids,
                'coverage': {
                    'gnomad_exome_coverage': float(hit["gnomad_exome_coverage"] or -1) if "gnomad_exome_coverage" in hit else -1,
                    'gnomad_genome_coverage': float(hit["gnomad_genome_coverage"] or -1) if "gnomad_genome_coverage" in hit else -1,
                },
                'pop_counts': {
                    'AC': int(hit['AC'] or 0) if 'AC' in hit else None,
                    'AN': int(hit['AN'] or 0) if 'AN' in hit else None,
                    'g1kAC': int(hit['g1k_AC'] or 0) if 'g1k_AC' in hit else None,
                    'g1kAN': int(hit['g1k_AN'] or 0) if 'g1k_AN' in hit else None,
                    'exac_v3_AC': int(hit["exac_AC_Adj"] or 0) if "exac_AC_Adj" in hit else None,
                    'exac_v3_Het': int(hit["exac_AC_Het"] or 0) if "exac_AC_Het" in hit else None,
                    'exac_v3_Hom': int(hit["exac_AC_Hom"] or 0) if "exac_AC_Hom" in hit else None,
                    'exac_v3_Hemi': int(hit["exac_AC_Hemi"] or 0) if "exac_AC_Hemi" in hit else None,
                    'exac_v3_AN': int(hit["exac_AN_Adj"] or 0) if "exac_AN_Adj" in hit else None,
                    'gnomad_exomes_AC': int(hit["gnomad_exomes_AC"] or 0) if "gnomad_exomes_AC" in hit else None,
                    'gnomad_exomes_Hom': int(hit["gnomad_exomes_Hom"] or 0) if "gnomad_exomes_Hom" in hit else None,
                    'gnomad_exomes_Hemi': int(hit["gnomad_exomes_Hemi"] or 0) if "gnomad_exomes_Hemi" in hit else None,
                    'gnomad_exomes_AN': int(hit["gnomad_exomes_AN"] or 0) if "gnomad_exomes_AN" in hit else None,
                    'gnomad_genomes_AC': int(hit["gnomad_genomes_AC"] or 0) if "gnomad_genomes_AC" in hit else None,
                    'gnomad_genomes_Hom': int(hit["gnomad_genomes_Hom"] or 0) if "gnomad_genomes_Hom" in hit else None,
                    'gnomad_genomes_Hemi': int(hit["gnomad_genomes_Hemi"] or 0) if "gnomad_genomes_Hemi" in hit else None,
                    'gnomad_genomes_AN': int(hit["gnomad_genomes_AN"] or 0) if "gnomad_genomes_AN" in hit else None,
                    'topmed_AC': float(hit["topmed_AC"] or 0) if "topmed_AC" in hit else None,
                    'topmed_Het': float(hit["topmed_Het"] or 0) if "topmed_Het" in hit else None,
                    'topmed_Hom': float(hit["topmed_Hom"] or 0) if "topmed_Hom" in hit else None,
                    'topmed_AN': float(hit["topmed_AN"] or 0) if "topmed_AN" in hit else None,
                },
                'db_freqs': {k: float(hit[v] or 0.0) if v in hit else (0.0 if v else None) for k, v in freq_fields.items()},
                #'popmax_populations': {
                #    'exac_popmax': hit["exac_POPMAX"] or None,
                #    'gnomad_exomes_popmax': hit["gnomad_exomes_POPMAX"] or None,
                #    'gnomad_genomes_popmax': hit["gnomad_genomes_POPMAX"] or None,
                #},
                'db_gene_ids': list((hit["geneIds"] or []) if "geneIds" in hit else []),
                'db_tags': str(hit["transcriptConsequenceTerms"] or "") if "transcriptConsequenceTerms" in hit else None,
                'extras': {
                    'clinvar_variant_id': hit['clinvar_variation_id'] if 'clinvar_variation_id' in hit and hit['clinvar_variation_id'] else None,
                    'clinvar_allele_id': hit['clinvar_allele_id'] if 'clinvar_allele_id' in hit and hit['clinvar_allele_id'] else None,
                    'clinvar_clinsig': hit['clinvar_clinical_significance'].lower() if ('clinvar_clinical_significance' in hit) and hit['clinvar_clinical_significance'] else None,
                    'clinvar_gold_stars': hit['clinvar_gold_stars'] if 'clinvar_gold_stars' in hit and hit['clinvar_gold_stars'] else None,
                    # hgmd data is restricted to staff users.
                    'hgmd_class': hit['hgmd_class'] if 'hgmd_class' in hit and user and user.is_staff else None,
                    'hgmd_accession': hit['hgmd_accession'] if 'hgmd_accession' in hit else None,
                    'genome_version': project.genome_version,
                    'grch37_coords': grch37_coord,
                    'grch38_coords': grch38_coord,
                    'alt_allele_pos': 0,
                    'orig_alt_alleles': map(str, [a.split("-")[-1] for a in hit["originalAltAlleles"]]) if "originalAltAlleles" in hit else None
                },
                'genotypes': genotypes,
                'pos': long(hit['start']),
                'pos_end': str(hit['end']),
                'ref': str(hit['ref']),
                'vartype': 'snp' if len(hit['ref']) == len(hit['alt']) else "indel",
                'vcf_id': None,
                'xpos': long(hit["xpos"]),
                'xposx': long(hit["xpos"]),
            }

            result["annotation"]["freqs"] = result["db_freqs"]
            result["annotation"]["pop_counts"] = result["pop_counts"]
            result["annotation"]["db"] = "elasticsearch"

            result["extras"]["svlen"] = hit["SVLEN"] if "SVLEN" in hit else None
            result["extras"]["svtype"] = hit["SVTYPE"] if "SVTYPE" in hit else None

            logger.info("Result %s: GRCh37: %s GRCh38: %s - gene ids: %s, coding gene_ids: %s" % (
                i, grch37_coord, grch38_coord, result["gene_ids"], result["coding_gene_ids"]))

            result["extras"]["project_id"] = project_id
            result["extras"]["family_id"] = family_id

            # add gene info
            gene_names = {}
            if vep_annotation is not None:
                gene_names = {vep_anno["gene_id"]: vep_anno.get("gene_symbol") for vep_anno in vep_annotation if vep_anno.get("gene_symbol")}
            result["extras"]["gene_names"] = gene_names

            try:
                genes = {}
                for gene_id in result["gene_ids"]:
                    if gene_id:
                        genes[gene_id] = reference.get_gene_summary(gene_id) or {}

                #if not genes:
                #    genes = {vep_anno["gene_id"]: {"symbol": vep_anno["gene_symbol"]} for vep_anno in vep_annotation}

                result["extras"]["genes"] = genes
            except Exception as e:
                # Gene summaries are best-effort: log and continue.
                exc_type, exc_obj, exc_tb = sys.exc_info()
                logger.warn("WARNING: got unexpected error in add_gene_names_to_variants: %s : line %s" % (e, exc_tb.tb_lineno))

            variant_results.append(result)

        logger.info("Finished returning the %s variants: %s seconds" % (response.hits.total, time.time() - start))

        if self._redis_client:
            self._redis_client.set(cache_key, json.dumps(variant_results))

        return [Variant.fromJSON(variant_json) for variant_json in variant_results]
# Test fixtures for the lift-to-hg38 management command tests.
PROJECT_NAME = '1kg project n\u00e5me with uni\u00e7\u00f8de'
PROJECT_GUID = 'R0001_1kg'
ELASTICSEARCH_INDEX = 'test_index'
INDEX_METADATA = {
    "gencodeVersion": "25",
    "hail_version": "0.2.24",
    "genomeVersion": "38",
    "sampleType": "WES",
    "sourceFilePath": "test_index_alias_1_path.vcf.gz",
}
SAMPLE_IDS = ["NA19679", "NA19675_1", "NA19678", "HG00731", "HG00732", "HG00733"]

# NOTE(review): constructed at import time — presumably downloads the chain
# file on first use; confirm this is intended in a test module.
liftover_to_38 = LiftOver('hg19', 'hg38')

# xpos (chrom * 1e9 + pos) -> canned liftover result, mimicking the
# [(chrom, pos), ...] shape returned by LiftOver.convert_coordinate.
LIFT_MAP = {
    21003343353: [('chr21', 3343400)],
    1248367227: [('chr1', 248203925)],
    1001562437: [('chr1', 1627057)],
    1001560662: [('chr1', 46394160)],
}


def mock_convert_coordinate(chrom, pos):
    # Stand-in for LiftOver.convert_coordinate: encode (chrom, pos) as an
    # xpos key and return the canned result. Unknown coordinates raise
    # KeyError, which surfaces unexpected lookups in tests.
    pos = int(chrom.replace('chr', '')) * int(1e9) + pos
    return (LIFT_MAP[pos])


@mock.patch('seqr.management.commands.lift_project_to_hg38.logger')
def handle(self, *args, **options):
    """Lift a project's saved variants from GRCh37 to GRCh38 and point it at a new ES index.

    Steps:
      1. Look up the project by name or guid and validate the target index metadata.
      2. Match index sample ids to existing sample records; fail on unmatched
         samples or families with missing loaded members.
      3. Interactively clean up saved variants with no family / no tags.
      4. Lift over each remaining hg19 variant to hg38 coordinates.
      5. Re-fetch the lifted variants from the new index and update the
         SavedVariant models, then flip the project's genome version.

    Raises Exception (with a descriptive message) on any validation failure;
    several steps prompt via raw_input before destructive actions.
    """
    project_arg = options['project']
    elasticsearch_index = options['es_index']
    project = Project.objects.get(Q(name=project_arg) | Q(guid=project_arg))
    logger.info('Updating project genome version for {}'.format(project.name))

    # Validate the provided index
    logger.info('Validating es index {}'.format(elasticsearch_index))
    sample_ids, index_metadata = get_elasticsearch_index_samples(elasticsearch_index)
    validate_index_metadata(index_metadata, project, elasticsearch_index, genome_version=GENOME_VERSION_GRCh38)
    sample_type = index_metadata['sampleType']
    dataset_path = index_metadata['sourceFilePath']

    matched_sample_id_to_sample_record = match_sample_ids_to_sample_records(
        project=project,
        sample_ids=sample_ids,
        sample_type=sample_type,
        dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS,
        elasticsearch_index=elasticsearch_index,
        sample_id_to_individual_id_mapping={},
    )

    # Every sample in the new index must map to an existing sample record.
    unmatched_samples = set(sample_ids) - set(matched_sample_id_to_sample_record.keys())
    if len(unmatched_samples) > 0:
        raise Exception('Matches not found for ES sample ids: {}.'.format(', '.join(unmatched_samples)))

    # Group matched individuals by family, and ensure no family in the callset
    # is missing a previously-loaded member.
    included_family_individuals = defaultdict(set)
    individual_guids_by_id = {}
    for sample in matched_sample_id_to_sample_record.values():
        included_family_individuals[sample.individual.family].add(sample.individual.individual_id)
        individual_guids_by_id[sample.individual.individual_id] = sample.individual.guid
    missing_family_individuals = []
    for family, individual_ids in included_family_individuals.items():
        missing_indivs = family.individual_set.filter(
            sample__sample_status=Sample.SAMPLE_STATUS_LOADED,
            sample__dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS
        ).exclude(individual_id__in=individual_ids)
        if missing_indivs:
            missing_family_individuals.append(
                '{} ({})'.format(family.family_id, ', '.join([i.individual_id for i in missing_indivs]))
            )
    if missing_family_individuals:
        raise Exception(
            'The following families are included in the callset but are missing some family members: {}.'.format(
                ', '.join(missing_family_individuals)
            ))

    # Get and clean up expected saved variants
    saved_variant_models_by_guid = {v.guid: v for v in SavedVariant.objects.filter(project=project)}
    deleted_no_family = set()
    deleted_no_tags = set()
    for guid, variant in saved_variant_models_by_guid.items():
        if not variant.family:
            deleted_no_family.add(guid)
        elif not (variant.varianttag_set.count() or variant.variantnote_set.count()):
            deleted_no_tags.add(guid)

    if deleted_no_family:
        if raw_input('Do you want to delete the following {} saved variants with no family (y/n)?: {} '.format(
                len(deleted_no_family), ', '.join(deleted_no_family))) == 'y':
            for guid in deleted_no_family:
                saved_variant_models_by_guid.pop(guid).delete()
            logger.info('Deleted {} variants'.format(len(deleted_no_family)))

    if deleted_no_tags:
        if raw_input('Do you want to delete the following {} saved variants with no tags (y/n)?: {} '.format(
                len(deleted_no_tags), ', '.join(deleted_no_tags))) == 'y':
            for guid in deleted_no_tags:
                saved_variant_models_by_guid.pop(guid).delete()
            logger.info('Deleted {} variants'.format(len(deleted_no_tags)))

    # Every family with a saved variant must be present in the new callset.
    expected_families = {sv.family for sv in saved_variant_models_by_guid.values()}
    missing_families = expected_families - set(included_family_individuals.keys())
    if missing_families:
        raise Exception(
            'The following families have saved variants but are missing from the callset: {}.'.format(
                ', '.join([f.family_id for f in missing_families])
            ))

    # Lift-over saved variants
    saved_variants = get_json_for_saved_variants(
        saved_variant_models_by_guid.values(), add_details=True, project=project,
        individual_guids_by_id=individual_guids_by_id)
    saved_variants_to_lift = [v for v in saved_variants if v['genomeVersion'] != GENOME_VERSION_GRCh38]

    num_already_lifted = len(saved_variants) - len(saved_variants_to_lift)
    if num_already_lifted:
        if raw_input('Found {} saved variants already on Hg38. Continue with liftover (y/n)?'.format(
                num_already_lifted)) != 'y':
            raise Exception('Error: found {} saved variants already on Hg38'.format(num_already_lifted))
    logger.info('Lifting over {} variants (skipping {} that are already lifted)'.format(
        len(saved_variants_to_lift), num_already_lifted))

    # Map each distinct hg19 xpos to its hg38 xpos; collect un-liftable ones.
    liftover_to_38 = LiftOver('hg19', 'hg38')
    hg37_to_hg38_xpos = {}
    lift_failed = set()
    for v in saved_variants_to_lift:
        if not (hg37_to_hg38_xpos.get(v['xpos']) or v['xpos'] in lift_failed):
            hg38_coord = liftover_to_38.convert_coordinate('chr{}'.format(v['chrom'].lstrip('chr')), int(v['pos']))
            if hg38_coord and hg38_coord[0]:
                hg37_to_hg38_xpos[v['xpos']] = get_xpos(hg38_coord[0][0], hg38_coord[0][1])
            else:
                lift_failed.add(v['xpos'])

    if lift_failed:
        # lift_failed holds numeric xpos values - stringify them before joining,
        # otherwise this error path itself raises a TypeError.
        raise Exception(
            'Unable to lift over the following {} coordinates: {}'.format(
                len(lift_failed), ', '.join([str(xpos) for xpos in lift_failed])))

    saved_variants_map = defaultdict(list)
    for v in saved_variants_to_lift:
        variant_model = saved_variant_models_by_guid[v['variantGuid']]
        saved_variants_map[(hg37_to_hg38_xpos[v['xpos']], v['ref'], v['alt'])].append(variant_model)

    es_variants = get_es_variants_for_variant_tuples(expected_families, saved_variants_map.keys())

    # Any lifted variant not found in the new index requires explicit confirmation.
    missing_variants = set(saved_variants_map.keys()) - {(v['xpos'], v['ref'], v['alt']) for v in es_variants}
    if missing_variants:
        missing_variant_strings = ['{}-{}-{} ({})'.format(
            xpos, ref, alt,
            ', '.join(['{}: {}'.format(v.family.family_id, v.guid) for v in saved_variants_map[(xpos, ref, alt)]]))
            for xpos, ref, alt in missing_variants]
        if raw_input('Unable to find the following {} variants in the index. Continue with update (y/n)?: {} '.format(
                len(missing_variants), ', '.join(missing_variant_strings))) != 'y':
            raise Exception('Error: unable to find {} lifted-over variants'.format(len(missing_variants)))

    logger.info('Successfully lifted over {} variants'.format(len(es_variants)))

    # Update saved variants
    for var in es_variants:
        saved_variant_models = saved_variants_map[(var['xpos'], var['ref'], var['alt'])]
        missing_families = [v.family.guid for v in saved_variant_models if v.family.guid not in var['familyGuids']]
        if missing_families:
            raise Exception('Error with variant {}:{}-{}-{} not find for expected families {}; found in families {}'.format(
                var['chrom'], var['pos'], var['ref'], var['alt'], ', '.join(missing_families),
                ', '.join(var['familyGuids'])
            ))
        for saved_variant in saved_variant_models:
            saved_variant.xpos_start = var['xpos']
            saved_variant.saved_variant_json = json.dumps(var)
            saved_variant.save()

    logger.info('Successfully updated {} variants'.format(len(es_variants)))

    # Update project and sample data
    update_model_from_json(project, {'genome_version': GENOME_VERSION_GRCh38, 'has_new_search': True})
    _update_samples(
        matched_sample_id_to_sample_record, elasticsearch_index=elasticsearch_index, dataset_path=dataset_path
    )
    update_xbrowse_vcfffiles(
        project, sample_type, elasticsearch_index, dataset_path, matched_sample_id_to_sample_record
    )

    reset_cached_search_results(project)

    logger.info('---Done---')
    logger.info('Succesfully lifted over {} variants. Skipped {} failed variants.'.format(
        len(es_variants), len(missing_variants)))