예제 #1
0
def test_liftover_2():
    '''
    Check that liftover can open files given both as strings and file objects.
    '''
    chain_path = os.path.join(DATA_DIR, 'hg17ToHg18.over.chain.gz')

    # Chain file given as a path string.
    lo = LiftOver(chain_path)
    assert len(lo.chain_file.chain_index) > 22

    # Chain file given as an already-open file object.
    lo = LiftOver(gzip.open(chain_path))
    assert len(lo.chain_file.chain_index) > 22
예제 #2
0
def _liftover_grch38_to_grch37():
    """Lazily construct and return the module-level hg38->hg19 LiftOver.

    The LiftOver object is created on first use and cached in the
    LIFTOVER_GRCH38_TO_GRCH37 global. If construction fails (e.g. the chain
    file cannot be downloaded), a warning is logged and the cached value
    (still falsy) is returned, so the next call will retry.
    """
    global LIFTOVER_GRCH38_TO_GRCH37
    if LIFTOVER_GRCH38_TO_GRCH37 is None:
        try:
            LIFTOVER_GRCH38_TO_GRCH37 = LiftOver('hg38', 'hg19')
        except Exception as e:
            # logger.warn is deprecated; logger.warning is the supported name.
            logger.warning('WARNING: Unable to set up liftover. {}'.format(e))
    return LIFTOVER_GRCH38_TO_GRCH37
예제 #3
0
def test_liftover():
    '''
    The test data was prepared as follows:
        * We loaded all intervals from hg17-to-hg18.
        * We then picked positions from the genome as follows:
            For each interval we picked the first, the last, the first-1, last+1, and first+4 positions.
            From the resulting ~40k points we chose 10000 random sites.
            We converted those via UCSC-hosted Web "liftOver" tool and wrote down the results.
     The test results are in data/hg17ToHg18.testpoints.txt.gz.
     Just in case we also saved the corresponding over.chain file.
    '''
    lo = LiftOver(os.path.join(DATA_DIR, 'hg17ToHg18.over.chain.gz'))
    testdata_file = os.path.join(DATA_DIR, 'hg17ToHg18.testpoints.txt.gz')

    def check_conversion(query_chr, query_pos, expected_chr, expected_pos):
        # Run one conversion and compare it with the UCSC-produced answer.
        # An expected chromosome of '-' means "no mapping exists".
        result = lo.convert_coordinate(query_chr, int(query_pos))
        if expected_chr == '-':
            assert len(result) == 0
        else:
            assert len(result) == 1
            assert result[0][0] == expected_chr
            assert result[0][1] == int(expected_pos)

    test_counter = 0
    # no "with" here because we want to support Python 2.6
    f = gzip.open(testdata_file)
    for raw_line in f:
        s_chr, s_pos, t_chr, t_pos = raw_line.decode('ascii').split('\t')
        check_conversion(s_chr, s_pos, t_chr, t_pos)

        # Check that we can provide chromosome as a bytes object and
        # everything will work still
        if sys.version_info >= (3, 0):
            check_conversion(s_chr.encode('ascii'), s_pos, t_chr, t_pos)

        test_counter += 1
    assert test_counter == 10000
예제 #4
0
def test_issue_2_3_4():
    '''
    Check the correctness of coordinate conversion for issue 2/3/4.

    NB: We are using the "live" hg38ToHg19.over.chain.gz file, hence if it happens to change later on,
    the test may start failing. Just in case we have the original cached in the data directory as well.
    '''
    lo = LiftOver('hg38', 'hg19')

    def _load_points(filename):
        # Parse whitespace-separated BED-like lines into
        # {name: (chromosome, position, strand)}.
        # Use "with" so the file handle is closed (the original leaked it).
        with open(os.path.join(DATA_DIR, filename)) as f:
            return {
                ln[3]: (ln[0], int(ln[1]), ln[5].strip())
                for ln in (line.split() for line in f)
            }

    test_input = _load_points('hg38ToHg19.testinput.txt')
    test_output = _load_points('hg38ToHg19.testoutput.txt')

    for k in test_input:
        res = lo.convert_coordinate(*test_input[k])
        if k not in test_output:
            # Points absent from the expected output must not lift over.
            assert len(res) == 0
        else:
            assert len(res) == 1 and res[0][0:3] == test_output[k]
예제 #5
0
    def get_elasticsearch_variants(
        self,
        project_id,
        family_id=None,
        variant_filter=None,
        genotype_filter=None,
        variant_id_filter=None,
        quality_filter=None,
        indivs_to_consider=None,
        include_all_consequences=False,
        user=None,
        max_results_limit=settings.VARIANT_QUERY_RESULTS_LIMIT,
    ):
        from xbrowse_server.base.models import Individual
        from xbrowse_server.mall import get_reference

        cache_key = "Variants___%s___%s___%s" % (project_id, family_id,
                                                 json.dumps([
                                                     variant_filter.toJSON() if
                                                     variant_filter else None,
                                                     genotype_filter,
                                                     quality_filter,
                                                     variant_id_filter,
                                                     indivs_to_consider,
                                                     include_all_consequences,
                                                 ]))

        cached_results = self._redis_client and self._redis_client.get(
            cache_key)
        if cached_results is not None:
            variant_results = json.loads(cached_results)
            return [
                Variant.fromJSON(variant_json)
                for variant_json in variant_results
            ]

        if indivs_to_consider is None:
            if genotype_filter:
                indivs_to_consider = genotype_filter.keys()
            else:
                indivs_to_consider = []

        if family_id is not None:
            family_individual_ids = [
                i.indiv_id for i in Individual.objects.filter(
                    family__family_id=family_id).only("indiv_id")
            ]
        else:
            family_individual_ids = [
                i.indiv_id for i in Individual.objects.filter(
                    family__project__project_id=project_id).only("indiv_id")
            ]

        from xbrowse_server.base.models import Project, Family
        from pyliftover.liftover import LiftOver

        query_json = self._make_db_query(genotype_filter, variant_filter)

        try:
            if self.liftover_grch38_to_grch37 is None:
                self.liftover_grch38_to_grch37 = LiftOver('hg38', 'hg19')

            if self.liftover_grch37_to_grch38 is None:
                self.liftover_grch37_to_grch38 = None  # LiftOver('hg19', 'hg38')
        except Exception as e:
            logger.info(
                "WARNING: Unable to set up liftover. Is there a working internet connection? "
                + str(e))

        if family_id is None:
            project = Project.objects.get(project_id=project_id)
            elasticsearch_index = project.get_elasticsearch_index()
            logger.info("Searching in project elasticsearch index: " +
                        str(elasticsearch_index))
        else:
            family = Family.objects.get(project__project_id=project_id,
                                        family_id=family_id)
            elasticsearch_index = family.get_elasticsearch_index()
            project = family.project
            logger.info("Searching in family elasticsearch index: " +
                        str(elasticsearch_index))

        if family_id is not None and len(family_individual_ids) > 0:
            # figure out which index to use
            # TODO add caching
            matching_indices = []
            mapping = self._es_client.indices.get_mapping(
                str(elasticsearch_index) + "*")

            if family_individual_ids:
                indiv_id = _encode_name(family_individual_ids[0])
                for index_name, index_mapping in mapping.items():
                    if indiv_id + "_num_alt" in index_mapping["mappings"][
                            "variant"]["properties"]:
                        matching_indices.append(index_name)

            if not matching_indices:
                if not family_individual_ids:
                    logger.error("no individuals found for family %s" %
                                 (family_id))
                elif not mapping:
                    logger.error(
                        "no es mapping found for found with prefix %s" %
                        (elasticsearch_index))
                else:
                    logger.error("%s not found in %s:\n%s" %
                                 (indiv_id, elasticsearch_index,
                                  pformat(index_mapping["mappings"]["variant"]
                                          ["properties"])))
            else:
                logger.info("matching indices: " + str(elasticsearch_index))
                elasticsearch_index = ",".join(matching_indices)

        s = elasticsearch_dsl.Search(using=self._es_client,
                                     index=str(elasticsearch_index) +
                                     "*")  #",".join(indices))

        if variant_id_filter is not None:
            variant_id_filter_term = None
            for variant_id in variant_id_filter:
                q_obj = Q('term', **{"variantId": variant_id})
                if variant_id_filter_term is None:
                    variant_id_filter_term = q_obj
                else:
                    variant_id_filter_term |= q_obj
            s = s.filter(variant_id_filter_term)

        if indivs_to_consider:
            atleast_one_nonref_genotype_filter = None
            for sample_id in indivs_to_consider:
                encoded_sample_id = _encode_name(sample_id)
                q = Q('range', **{encoded_sample_id + "_num_alt": {'gte': 1}})
                if atleast_one_nonref_genotype_filter is None:
                    atleast_one_nonref_genotype_filter = q
                else:
                    atleast_one_nonref_genotype_filter |= q

            s = s.filter(atleast_one_nonref_genotype_filter)

        if quality_filter is not None and indivs_to_consider:
            #'vcf_filter': u'pass', u'min_ab': 17, u'min_gq': 46
            min_ab = quality_filter.get('min_ab')
            if min_ab is not None:
                min_ab /= 100.0  # convert to fraction
            min_gq = quality_filter.get('min_gq')
            vcf_filter = quality_filter.get('vcf_filter')
            for sample_id in indivs_to_consider:
                encoded_sample_id = _encode_name(sample_id)

                #'vcf_filter': u'pass', u'min_ab': 17, u'min_gq': 46
                if min_ab:
                    s = s.filter(
                        ~Q('term', **{encoded_sample_id + "_num_alt": 1})
                        | Q('range', **
                            {encoded_sample_id + "_ab": {
                                'gte': min_ab
                            }}))
                    #logger.info("### ADDED FILTER: " + str({encoded_sample_id+"_ab": {'gte': min_ab}}))
                if min_gq:
                    s = s.filter(
                        'range',
                        **{encoded_sample_id + "_gq": {
                            'gte': min_gq
                        }})
                    #logger.info("### ADDED FILTER: " + str({encoded_sample_id+"_gq": {'gte': min_gq}}))
                if vcf_filter is not None:
                    s = s.filter(~Q('exists', field='filters'))
                    #logger.info("### ADDED FILTER: " + str(~Q('exists', field='filters')))

        # parse variant query
        annotation_groups_map = ANNOTATION_GROUPS_MAP_INTERNAL if user and user.is_staff else ANNOTATION_GROUPS_MAP

        for key, value in query_json.items():
            if key == 'db_tags':
                so_annotations = query_json.get('db_tags', {}).get('$in', [])

                # handle clinvar filters
                selected_so_annotations_set = set(so_annotations)

                all_clinvar_filters_set = set(
                    annotation_groups_map.get("clinvar",
                                              {}).get("children", []))
                selected_clinvar_filters_set = all_clinvar_filters_set & selected_so_annotations_set

                all_hgmd_filters_set = set(
                    annotation_groups_map.get("hgmd", {}).get("children", []))
                selected_hgmd_filters_set = all_hgmd_filters_set & selected_so_annotations_set

                vep_consequences = list(selected_so_annotations_set -
                                        selected_clinvar_filters_set -
                                        selected_hgmd_filters_set)
                consequences_filter = Q(
                    "terms", transcriptConsequenceTerms=vep_consequences)

                if selected_clinvar_filters_set:
                    clinvar_clinical_significance_terms = set()
                    for clinvar_filter in selected_clinvar_filters_set:
                        # translate selected filters to the corresponding clinvar clinical consequence terms
                        if clinvar_filter == "pathogenic":
                            clinvar_clinical_significance_terms.update(
                                ["Pathogenic", "Pathogenic/Likely_pathogenic"])
                        elif clinvar_filter == "likely_pathogenic":
                            clinvar_clinical_significance_terms.update([
                                "Likely_pathogenic",
                                "Pathogenic/Likely_pathogenic"
                            ])
                        elif clinvar_filter == "benign":
                            clinvar_clinical_significance_terms.update(
                                ["Benign", "Benign/Likely_benign"])
                        elif clinvar_filter == "likely_benign":
                            clinvar_clinical_significance_terms.update(
                                ["Likely_benign", "Benign/Likely_benign"])
                        elif clinvar_filter == "vus_or_conflicting":
                            clinvar_clinical_significance_terms.update([
                                "Conflicting_interpretations_of_pathogenicity",
                                "Uncertain_significance", "not_provided",
                                "other"
                            ])
                        else:
                            raise ValueError("Unexpected clinvar filter: " +
                                             str(clinvar_filter))

                    consequences_filter = consequences_filter | Q(
                        "terms",
                        clinvar_clinical_significance=list(
                            clinvar_clinical_significance_terms))

                if selected_hgmd_filters_set:
                    hgmd_class = set()
                    for hgmd_filter in selected_hgmd_filters_set:
                        # translate selected filters to the corresponding hgmd clinical consequence terms
                        if hgmd_filter == "disease_causing":
                            hgmd_class.update(["DM"])
                        elif hgmd_filter == "likely_disease_causing":
                            hgmd_class.update(["DM?"])
                        elif hgmd_filter == "hgmd_other":
                            hgmd_class.update(["DP", "DFP", "FP", "FTV"])
                        else:
                            raise ValueError("Unexpected hgmd filter: " +
                                             str(hgmd_filter))

                    consequences_filter = consequences_filter | Q(
                        "terms", hgmd_class=list(hgmd_class))

                if 'intergenic_variant' in vep_consequences:
                    # for many intergenic variants VEP doesn't add any annotations, so if user selected 'intergenic_variant', also match variants where transcriptConsequenceTerms is emtpy
                    consequences_filter = consequences_filter | ~Q(
                        'exists', field='transcriptConsequenceTerms')

                s = s.filter(consequences_filter)
                #logger.info("==> transcriptConsequenceTerms: %s" % str(vep_consequences))

            if key.startswith("genotypes"):
                sample_id = ".".join(key.split(".")[1:-1])
                encoded_sample_id = _encode_name(sample_id)
                genotype_filter = value
                #logger.info("==> genotype filter: " + str(genotype_filter))
                if type(genotype_filter) == int or type(
                        genotype_filter) == basestring:
                    #logger.info("==> genotypes: %s" % str({encoded_sample_id+"_num_alt": genotype_filter}))
                    s = s.filter(
                        'term',
                        **{encoded_sample_id + "_num_alt": genotype_filter})

                elif '$gte' in genotype_filter:
                    genotype_filter = {
                        k.replace("$", ""): v
                        for k, v in genotype_filter.items()
                    }
                    s = s.filter(
                        'range',
                        **{encoded_sample_id + "_num_alt": genotype_filter})
                    #logger.info("==> genotypes: %s" % str({encoded_sample_id+"_num_alt": genotype_filter}))
                elif "$in" in genotype_filter:
                    num_alt_values = genotype_filter['$in']
                    q = Q(
                        'term',
                        **{encoded_sample_id + "_num_alt": num_alt_values[0]})
                    #logger.info("==> genotypes: %s" % str({encoded_sample_id+"_num_alt": num_alt_values[0]}))
                    for num_alt_value in num_alt_values[1:]:
                        q = q | Q(
                            'term', **
                            {encoded_sample_id + "_num_alt": num_alt_value})
                        #logger.info("==> genotypes: %s" % str({encoded_sample_id+"_num_alt": num_alt_value}))
                    s = s.filter(q)

            if key == "db_gene_ids":
                db_gene_ids = query_json.get('db_gene_ids', {})

                exclude_genes = db_gene_ids.get('$nin', [])
                gene_ids = exclude_genes or db_gene_ids.get('$in', [])

                if exclude_genes:
                    s = s.exclude("terms", geneIds=gene_ids)
                else:
                    s = s.filter("terms", geneIds=gene_ids)
                #logger.info("==> %s %s" % ("exclude" if exclude_genes else "include", "geneIds: " + str(gene_ids)))

            if key == "$or" and type(value) == list:
                q_terms = None
                for region_filter in value:
                    xpos_filters = region_filter.get("$and", {})

                    # for example: $or : [{'$and': [{'xpos': {'$gte': 12345}}, {'xpos': {'$lte': 54321}}]}]
                    xpos_filters_dict = {}
                    for xpos_filter in xpos_filters:
                        xpos_filter_setting = xpos_filter[
                            "xpos"]  # for example {'$gte': 12345} or {'$lte': 54321}
                        xpos_filters_dict.update(xpos_filter_setting)

                    xpos_filter_setting = {
                        k.replace("$", ""): v
                        for k, v in xpos_filters_dict.items()
                    }
                    q = Q('range', **{"xpos": xpos_filter_setting})
                    if q_terms is None:
                        q_terms = q
                    else:
                        q_terms |= q
                if q_terms is not None:
                    s = s.filter(q_terms)

                #logger.info("==> xpos range: " + str({"xpos": xpos_filter_setting}))

            af_key_map = {
                "db_freqs.AF": "AF",
                "db_freqs.1kg_wgs_phase3": "g1k_POPMAX_AF",
                "db_freqs.exac_v3": "exac_AF_POPMAX",
                "db_freqs.topmed": "topmed_AF",
                "db_freqs.gnomad_exomes": "gnomad_exomes_AF_POPMAX",
                "db_freqs.gnomad_genomes": "gnomad_genomes_AF_POPMAX",
                "db_freqs.gnomad-exomes2": "gnomad_exomes_AF_POPMAX",
                "db_freqs.gnomad-genomes2": "gnomad_genomes_AF_POPMAX",
            }

            if key in af_key_map:
                filter_key = af_key_map[key]
                af_filter_setting = {
                    k.replace("$", ""): v
                    for k, v in value.items()
                }
                s = s.filter(
                    Q('range', **{filter_key: af_filter_setting})
                    | ~Q('exists', field=filter_key))
                #logger.info("==> %s: %s" % (filter_key, af_filter_setting))

            ac_key_map = {
                "db_acs.AF": "AC",
                "db_acs.1kg_wgs_phase3": "g1k_AC",
                "db_acs.exac_v3": "exac_AC",
                "db_acs.topmed": "topmed_AC",
                "db_acs.gnomad_exomes": "gnomad_exomes_AC",
                "db_acs.gnomad_genomes": "gnomad_genomes_AC",
                "db_acs.gnomad-exomes2": "gnomad_exomes_AC",
                "db_acs.gnomad-genomes2": "gnomad_genomes_AC",
            }

            if key in ac_key_map:
                filter_key = ac_key_map[key]
                ac_filter_setting = {
                    k.replace("$", ""): v
                    for k, v in value.items()
                }
                s = s.filter(
                    Q('range', **{filter_key: ac_filter_setting})
                    | ~Q('exists', field=filter_key))

            hemi_key_map = {
                "db_hemi.exac_v3": "exac_AC_Hemi",
                "db_hemi.gnomad_exomes": "gnomad_exomes_Hemi",
                "db_hemi.gnomad_genomes": "gnomad_genomes_Hemi",
                "db_hemi.gnomad-exomes2": "gnomad_exomes_Hemi",
                "db_hemi.gnomad-genomes2": "gnomad_genomes_Hemi",
            }

            if key in hemi_key_map:
                filter_key = hemi_key_map[key]
                hemi_filter_setting = {
                    k.replace("$", ""): v
                    for k, v in value.items()
                }
                s = s.filter(
                    Q('range', **{filter_key: hemi_filter_setting})
                    | ~Q('exists', field=filter_key))

            hom_key_map = {
                "db_hom.exac_v3": "exac_AC_Hom",
                "db_hom.gnomad_exomes": "gnomad_exomes_Hom",
                "db_hom.gnomad_genomes": "gnomad_genomes_Hom",
                "db_hom.gnomad-exomes2": "gnomad_exomes_Hom",
                "db_hom.gnomad-genomes2": "gnomad_genomes_Hom",
            }

            if key in hom_key_map:
                filter_key = hom_key_map[key]
                hom_filter_setting = {
                    k.replace("$", ""): v
                    for k, v in value.items()
                }
                s = s.filter(
                    Q('range', **{filter_key: hom_filter_setting})
                    | ~Q('exists', field=filter_key))

            #s = s.sort("xpos")

        #logger.info("=====")
        #logger.info("FULL QUERY OBJ: " + pformat(s.__dict__))
        #logger.info("FILTERS: " + pformat(s.to_dict()))

        # https://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.scan
        start = time.time()

        s = s.params(size=max_results_limit + 1)
        #if not include_all_consequences:
        #    s = s.source(exclude=["sortedTranscriptConsequences"])
        response = s.execute()
        logger.info("=====")

        logger.info("TOTAL: %s. Query took %s seconds" %
                    (response.hits.total, time.time() - start))

        if response.hits.total > max_results_limit + 1:
            raise Exception(
                "This search matched too many variants. Please set additional filters and try again."
            )

        #print(pformat(response.to_dict()))

        project = Project.objects.get(project_id=project_id)

        #gene_list_map = project.get_gene_list_map()

        reference = get_reference()

        #for i, hit in enumerate(response.hits):
        variant_results = []
        for i, hit in enumerate(s.scan()):  # preserve_order=True
            #logger.info("HIT %s: %s %s %s" % (i, hit["variantId"], hit["geneIds"], pformat(hit.__dict__)))
            #print("HIT %s: %s" % (i, pformat(hit.to_dict())))
            filters = ",".join(hit["filters"]
                               or []) if "filters" in hit else ""
            genotypes = {}
            all_num_alt = []
            for individual_id in family_individual_ids:
                encoded_individual_id = _encode_name(individual_id)
                num_alt = int(hit["%s_num_alt" % encoded_individual_id]) if (
                    "%s_num_alt" % encoded_individual_id) in hit else -1
                if num_alt is not None:
                    all_num_alt.append(num_alt)

                alleles = []
                if num_alt == 0:
                    alleles = [hit["ref"], hit["ref"]]
                elif num_alt == 1:
                    alleles = [hit["ref"], hit["alt"]]
                elif num_alt == 2:
                    alleles = [hit["alt"], hit["alt"]]
                elif num_alt == -1 or num_alt == None:
                    alleles = []
                else:
                    raise ValueError("Invalid num_alt: " + str(num_alt))

                genotypes[individual_id] = {
                    'ab':
                    hit["%s_ab" % encoded_individual_id] if
                    ("%s_ab" % encoded_individual_id) in hit else None,
                    'alleles':
                    map(str, alleles),
                    'extras': {
                        'ad':
                        hit["%s_ab" % encoded_individual_id] if
                        ("%s_ad" % encoded_individual_id) in hit else None,
                        'dp':
                        hit["%s_dp" % encoded_individual_id] if
                        ("%s_dp" % encoded_individual_id) in hit else None,
                        #'pl': '',
                    },
                    'filter':
                    filters or "pass",
                    'gq':
                    hit["%s_gq" % encoded_individual_id] if
                    ("%s_gq" % encoded_individual_id in hit
                     and hit["%s_gq" % encoded_individual_id] is not None) else
                    '',
                    'num_alt':
                    num_alt,
                }

            if all([num_alt <= 0 for num_alt in all_num_alt]):
                #logger.info("Filtered out due to genotype: " + str(genotypes))
                #print("Filtered all_num_alt <= 0 - Result %s: GRCh38: %s:%s,  cadd: %s  %s - %s" % (i, hit["contig"], hit["start"], hit["cadd_PHRED"] if "cadd_PHRED" in hit else "", hit["transcriptConsequenceTerms"], all_num_alt))
                continue

            vep_annotation = json.loads(
                str(hit['sortedTranscriptConsequences'])
            ) if 'sortedTranscriptConsequences' in hit else None

            if project.genome_version == GENOME_VERSION_GRCh37:
                grch38_coord = None
                if self.liftover_grch37_to_grch38:
                    grch38_coord = self.liftover_grch37_to_grch38.convert_coordinate(
                        "chr%s" % hit["contig"].replace("chr", ""),
                        int(hit["start"]))
                    if grch38_coord and grch38_coord[0]:
                        grch38_coord = "%s-%s-%s-%s " % (
                            grch38_coord[0][0], grch38_coord[0][1], hit["ref"],
                            hit["alt"])
                    else:
                        grch38_coord = None
            else:
                grch38_coord = hit["variantId"]

            if project.genome_version == GENOME_VERSION_GRCh38:
                grch37_coord = None
                if self.liftover_grch38_to_grch37:
                    grch37_coord = self.liftover_grch38_to_grch37.convert_coordinate(
                        "chr%s" % hit["contig"].replace("chr", ""),
                        int(hit["start"]))
                    if grch37_coord and grch37_coord[0]:
                        grch37_coord = "%s-%s-%s-%s " % (
                            grch37_coord[0][0], grch37_coord[0][1], hit["ref"],
                            hit["alt"])
                    else:
                        grch37_coord = None
            else:
                grch37_coord = hit["variantId"]

            result = {
                #u'_id': ObjectId('596d2207ff66f729285ca588'),
                'alt':
                str(hit["alt"]) if "alt" in hit else None,
                'annotation': {
                    'fathmm':
                    fathmm_map.get(hit["dbnsfp_FATHMM_pred"].split(';')[0])
                    if "dbnsfp_FATHMM_pred" in hit
                    and hit["dbnsfp_FATHMM_pred"] else None,
                    'muttaster':
                    muttaster_map.get(
                        hit["dbnsfp_MutationTaster_pred"].split(';')[0])
                    if "dbnsfp_MutationTaster_pred" in hit
                    and hit["dbnsfp_MutationTaster_pred"] else None,
                    'polyphen':
                    polyphen_map.get(
                        hit["dbnsfp_Polyphen2_HVAR_pred"].split(';')[0])
                    if "dbnsfp_Polyphen2_HVAR_pred" in hit
                    and hit["dbnsfp_Polyphen2_HVAR_pred"] else None,
                    'sift':
                    sift_map.get(hit["dbnsfp_SIFT_pred"].split(';')[0])
                    if "dbnsfp_SIFT_pred" in hit and hit["dbnsfp_SIFT_pred"]
                    else None,
                    'GERP_RS':
                    hit["dbnsfp_GERP_RS"] if "dbnsfp_GERP_RS" in hit else None,
                    'phastCons100way_vertebrate':
                    hit["dbnsfp_phastCons100way_vertebrate"]
                    if "dbnsfp_phastCons100way_vertebrate" in hit else None,
                    'cadd_phred':
                    hit["cadd_PHRED"] if "cadd_PHRED" in hit else None,
                    'dann_score':
                    hit["dbnsfp_DANN_score"]
                    if "dbnsfp_DANN_score" in hit else None,
                    'revel_score':
                    hit["dbnsfp_REVEL_score"]
                    if "dbnsfp_REVEL_score" in hit else None,
                    'eigen_phred':
                    hit["eigen_Eigen_phred"] if "eigen_Eigen_phred" in hit else
                    (hit["dbnsfp_Eigen_phred"]
                     if "dbnsfp_Eigen_phred" in hit else None),
                    'mpc_score':
                    hit["mpc_MPC"] if "mpc_MPC" in hit else None,
                    'annotation_tags':
                    list(hit["transcriptConsequenceTerms"] or [])
                    if "transcriptConsequenceTerms" in hit else None,
                    'coding_gene_ids':
                    list(hit['codingGeneIds'] or []),
                    'gene_ids':
                    list(hit['geneIds'] or []),
                    'vep_annotation':
                    vep_annotation,
                    'vep_group':
                    str(hit['mainTranscript_major_consequence'] or ""),
                    'vep_consequence':
                    str(hit['mainTranscript_major_consequence'] or ""),
                    'main_transcript': {
                        k.replace('mainTranscript_', ''): hit[k]
                        for k in dir(hit) if k.startswith('mainTranscript_')
                    },
                    'worst_vep_annotation_index':
                    0,
                    'worst_vep_index_per_gene': {
                        str(hit['mainTranscript_gene_id']): 0
                    },
                },
                'chr':
                hit["contig"],
                'coding_gene_ids':
                list(hit['codingGeneIds'] or []),
                'gene_ids':
                list(hit['geneIds'] or []),
                'coverage': {
                    'gnomad_exome_coverage':
                    float(hit["gnomad_exome_coverage"] or -1)
                    if "gnomad_exome_coverage" in hit else -1,
                    'gnomad_genome_coverage':
                    float(hit["gnomad_genome_coverage"] or -1)
                    if "gnomad_genome_coverage" in hit else -1,
                },
                'pop_counts': {
                    'AC':
                    int(hit['AC'] or 0) if 'AC' in hit else None,
                    'AN':
                    int(hit['AN'] or 0) if 'AN' in hit else None,
                    '1kg_AC':
                    int(hit['g1k_AC'] or 0) if 'g1k_AC' in hit else None,
                    '1kg_AN':
                    int(hit['g1k_AN'] or 0) if 'g1k_AN' in hit else None,
                    'exac_v3_AC':
                    int(hit["exac_AC_Adj"] or 0)
                    if "exac_Adj_AC" in hit else None,
                    'exac_v3_Het':
                    int(hit["exac_AC_Het"] or 0)
                    if "exac_AC_Het" in hit else None,
                    'exac_v3_Hom':
                    int(hit["exac_AC_Hom"] or 0)
                    if "exac_AC_Hom" in hit else None,
                    'exac_v3_Hemi':
                    int(hit["exac_AC_Hemi"] or 0)
                    if "exac_AC_Hemi" in hit else None,
                    'gnomad_exomes_AC':
                    int(hit["gnomad_exomes_AC"] or 0)
                    if "gnomad_exomes_AC" in hit else None,
                    'gnomad_exomes_Hom':
                    int(hit["gnomad_exomes_Hom"] or 0)
                    if "gnomad_exomes_Hom" in hit else None,
                    'gnomad_exomes_Hemi':
                    int(hit["gnomad_exomes_Hemi"] or 0)
                    if "gnomad_exomes_Hemi" in hit else None,
                    'gnomad_exomes_AN':
                    int(hit["gnomad_exomes_AN"] or 0)
                    if "gnomad_exomes_AN" in hit else None,
                    'gnomad_genomes_AC':
                    int(hit["gnomad_genomes_AC"] or 0)
                    if "gnomad_genomes_AC" in hit else None,
                    'gnomad_genomes_Hom':
                    int(hit["gnomad_genomes_Hom"] or 0)
                    if "gnomad_genomes_Hom" in hit else None,
                    'gnomad_genomes_Hemi':
                    int(hit["gnomad_genomes_Hemi"] or 0)
                    if "gnomad_genomes_Hemi" in hit else None,
                    'gnomad_genomes_AN':
                    int(hit["gnomad_genomes_AN"] or 0)
                    if "gnomad_genomes_AN" in hit else None,
                    'topmed_AC':
                    float(hit["topmed_AC"] or 0)
                    if "topmed_AC" in hit else None,
                    'topmed_Het':
                    float(hit["topmed_Het"] or 0)
                    if "topmed_Het" in hit else None,
                    'topmed_Hom':
                    float(hit["topmed_Hom"] or 0)
                    if "topmed_Hom" in hit else None,
                    'topmed_AN':
                    float(hit["topmed_AN"] or 0)
                    if "topmed_AN" in hit else None,
                },
                'db_freqs': {
                    'AF':
                    float(hit["AF"] or 0.0) if "AF" in hit else None,
                    '1kg_wgs_AF':
                    float(hit["g1k_AF"] or 0.0) if "g1k_AF" in hit else None,
                    '1kg_wgs_popmax_AF':
                    float(hit["g1k_POPMAX_AF"] or 0.0)
                    if "g1k_POPMAX_AF" in hit else None,
                    'exac_v3_AF':
                    float(hit["exac_AF"] or 0.0) if "exac_AF" in hit else
                    (hit["exac_AC_Adj"] / float(hit["exac_AN_Adj"])
                     if "exac_AC_Adj" in hit and "exac_AN_Adj" in hit
                     and int(hit["exac_AN_Adj"] or 0) > 0 else None),
                    'exac_v3_popmax_AF':
                    float(hit["exac_AF_POPMAX"] or 0.0)
                    if "exac_AF_POPMAX" in hit else None,
                    'gnomad_exomes_AF':
                    float(hit["gnomad_exomes_AF"] or 0.0)
                    if "gnomad_exomes_AF" in hit else None,
                    'gnomad_exomes_popmax_AF':
                    float(hit["gnomad_exomes_AF_POPMAX"] or 0.0)
                    if "gnomad_exomes_AF_POPMAX" in hit else None,
                    'gnomad_genomes_AF':
                    float(hit["gnomad_genomes_AF"] or 0.0)
                    if "gnomad_genomes_AF" in hit else None,
                    'gnomad_genomes_popmax_AF':
                    float(hit["gnomad_genomes_AF_POPMAX"] or 0.0)
                    if "gnomad_genomes_AF_POPMAX" in hit else None,
                    'topmed_AF':
                    float(hit["topmed_AF"] or 0.0)
                    if "topmed_AF" in hit else None,
                },
                #'popmax_populations': {
                #    'exac_popmax': hit["exac_POPMAX"] or None,
                #    'gnomad_exomes_popmax': hit["gnomad_exomes_POPMAX"] or None,
                #    'gnomad_genomes_popmax': hit["gnomad_genomes_POPMAX"] or None,
                #},
                'db_gene_ids':
                list((hit["geneIds"] or []) if "geneIds" in hit else []),
                'db_tags':
                str(hit["transcriptConsequenceTerms"] or "")
                if "transcriptConsequenceTerms" in hit else None,
                'extras': {
                    'clinvar_variant_id':
                    hit['clinvar_variation_id']
                    if 'clinvar_variation_id' in hit
                    and hit['clinvar_variation_id'] else None,
                    'clinvar_allele_id':
                    hit['clinvar_allele_id'] if 'clinvar_allele_id' in hit
                    and hit['clinvar_allele_id'] else None,
                    'clinvar_clinsig':
                    hit['clinvar_clinical_significance'].lower() if
                    ('clinvar_clinical_significance' in hit)
                    and hit['clinvar_clinical_significance'] else None,
                    'hgmd_class':
                    hit['hgmd_class'] if 'hgmd_class' in hit and user
                    and user.is_staff else None,
                    'hgmd_accession':
                    hit['hgmd_accession'] if 'hgmd_accession' in hit else None,
                    'genome_version':
                    project.genome_version,
                    'grch37_coords':
                    grch37_coord,
                    'grch38_coords':
                    grch38_coord,
                    'alt_allele_pos':
                    0,
                    'orig_alt_alleles':
                    map(str,
                        [a.split("-")[-1] for a in hit["originalAltAlleles"]])
                    if "originalAltAlleles" in hit else None
                },
                'genotypes':
                genotypes,
                'pos':
                long(hit['start']),
                'pos_end':
                str(hit['end']),
                'ref':
                str(hit['ref']),
                'vartype':
                'snp' if len(hit['ref']) == len(hit['alt']) else "indel",
                'vcf_id':
                None,
                'xpos':
                long(hit["xpos"]),
                'xposx':
                long(hit["xpos"]),
            }

            result["annotation"]["freqs"] = result["db_freqs"]
            result["annotation"]["pop_counts"] = result["pop_counts"]
            result["annotation"]["db"] = "elasticsearch"

            result["extras"][
                "svlen"] = hit["SVLEN"] if "SVLEN" in hit else None
            result["extras"][
                "svtype"] = hit["SVTYPE"] if "SVTYPE" in hit else None

            logger.info(
                "Result %s: GRCh37: %s GRCh38: %s:,  cadd: %s  %s - gene ids: %s, coding gene_ids: %s"
                % (i, grch37_coord, grch38_coord,
                   hit["cadd_PHRED"] if "cadd_PHRED" in hit else "",
                   hit["transcriptConsequenceTerms"], result["gene_ids"],
                   result["coding_gene_ids"]))

            result["extras"]["project_id"] = project_id
            result["extras"]["family_id"] = family_id

            # add gene info
            gene_names = {}
            if vep_annotation is not None:
                gene_names = {
                    vep_anno["gene_id"]: vep_anno.get("gene_symbol")
                    for vep_anno in vep_annotation
                    if vep_anno.get("gene_symbol")
                }
            result["extras"]["gene_names"] = gene_names

            try:
                genes = {}
                for gene_id in result["coding_gene_ids"]:
                    if gene_id:
                        genes[gene_id] = reference.get_gene_summary(
                            gene_id) or {}

                if not genes:
                    for gene_id in result["gene_ids"]:
                        if gene_id:
                            genes[gene_id] = reference.get_gene_summary(
                                gene_id) or {}

                #if not genes:
                #    genes =  {vep_anno["gene_id"]: {"symbol": vep_anno["gene_symbol"]} for vep_anno in vep_annotation}

                result["extras"]["genes"] = genes
            except Exception as e:
                exc_type, exc_obj, exc_tb = sys.exc_info()
                logger.warn(
                    "WARNING: got unexpected error in add_gene_names_to_variants: %s : line %s"
                    % (e, exc_tb.tb_lineno))

            variant_results.append(result)

        logger.info("Finished returning the %s variants: %s seconds" %
                    (response.hits.total, time.time() - start))

        if self._redis_client:
            self._redis_client.set(cache_key, json.dumps(variant_results))

        return [
            Variant.fromJSON(variant_json) for variant_json in variant_results
        ]
예제 #6
0
    def handle(self, *args, **options):
        """Lift a project's saved variants from GRCh37 over to GRCh38.

        Interactive management command (several raw_input confirmation
        prompts). High-level flow:
          1. Look the project up by name or guid and validate the provided
             GRCh38 elasticsearch index against it.
          2. Match index sample ids to Sample records; abort if any sample
             is unmatched or an included family is missing loaded members.
          3. Optionally delete saved variants that carry no tags or notes.
          4. Lift over coordinates with pyliftover (hg19 -> hg38) and
             re-fetch the variants from the new index.
          5. Rewrite the SavedVariant rows, flip the project genome
             version, and clear cached search results.

        Raises CommandError on validation failure or a declined prompt.
        """
        project_arg = options['project']
        elasticsearch_index = options['es_index']

        # Project may be referenced by either its display name or its guid.
        project = Project.objects.get(
            Q(name=project_arg) | Q(guid=project_arg))
        logger.info('Updating project genome version for {}'.format(
            project.name))

        # Validate the provided index
        logger.info('Validating es index {}'.format(elasticsearch_index))
        sample_ids, index_metadata = get_elasticsearch_index_samples(
            elasticsearch_index)
        validate_index_metadata(index_metadata,
                                project,
                                elasticsearch_index,
                                genome_version=GENOME_VERSION_GRCh38)
        sample_type = index_metadata['sampleType']
        dataset_path = index_metadata['sourceFilePath']

        matched_sample_id_to_sample_record = match_sample_ids_to_sample_records(
            project=project,
            sample_ids=sample_ids,
            sample_type=sample_type,
            dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS,
            elasticsearch_index=elasticsearch_index,
            sample_id_to_individual_id_mapping={},
        )

        # Every sample in the new index must already have a seqr record.
        unmatched_samples = set(sample_ids) - set(
            matched_sample_id_to_sample_record.keys())
        if len(unmatched_samples) > 0:
            raise CommandError(
                'Matches not found for ES sample ids: {}.'.format(
                    ', '.join(unmatched_samples)))

        prefetch_related_objects(matched_sample_id_to_sample_record.values(),
                                 'individual__family')
        included_families = {
            sample.individual.family
            for sample in matched_sample_id_to_sample_record.values()
        }
        # Find family members with active variant-call samples that are NOT
        # present in the new callset - lifting would strand their data.
        missing_individuals = Individual.objects.filter(
            family__in=included_families,
            sample__is_active=True,
            sample__dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS,
        ).exclude(sample__in=matched_sample_id_to_sample_record.values()
                  ).select_related('family')
        missing_family_individuals = defaultdict(list)
        for individual in missing_individuals:
            missing_family_individuals[individual.family].append(individual)

        if missing_family_individuals:
            raise CommandError(
                'The following families are included in the callset but are missing some family members: {}.'
                .format(', '.join([
                    '{} ({})'.format(
                        family.family_id,
                        ', '.join([i.individual_id for i in missing_indivs]))
                    for family, missing_indivs in
                    missing_family_individuals.items()
                ])))

        # Get and clean up expected saved variants
        saved_variant_models_by_guid = {
            v.guid: v
            for v in SavedVariant.objects.filter(family__project=project)
        }
        deleted_no_tags = set()
        for guid, variant in saved_variant_models_by_guid.items():
            if not (variant.varianttag_set.count()
                    or variant.variantnote_set.count()):
                deleted_no_tags.add(guid)

        if deleted_no_tags:
            if raw_input(
                    'Do you want to delete the following {} saved variants with no tags (y/n)?: {} '
                    .format(len(deleted_no_tags),
                            ', '.join(deleted_no_tags))) == 'y':
                for guid in deleted_no_tags:
                    saved_variant_models_by_guid.pop(guid).delete()
                logger.info('Deleted {} variants'.format(len(deleted_no_tags)))

        expected_families = {
            sv.family
            for sv in saved_variant_models_by_guid.values()
        }
        missing_families = expected_families - included_families
        if missing_families:
            raise CommandError(
                'The following families have saved variants but are missing from the callset: {}.'
                .format(', '.join([f.family_id for f in missing_families])))

        # Lift-over saved variants
        _update_variant_samples(matched_sample_id_to_sample_record,
                                elasticsearch_index, dataset_path)
        saved_variants = get_json_for_saved_variants(
            saved_variant_models_by_guid.values(), add_details=True)
        saved_variants_to_lift = [
            v for v in saved_variants
            if v['genomeVersion'] != GENOME_VERSION_GRCh38
        ]

        num_already_lifted = len(saved_variants) - len(saved_variants_to_lift)
        if num_already_lifted:
            if raw_input(
                    'Found {} saved variants already on Hg38. Continue with liftover (y/n)? '
                    .format(num_already_lifted)) != 'y':
                raise CommandError(
                    'Error: found {} saved variants already on Hg38'.format(
                        num_already_lifted))
        logger.info(
            'Lifting over {} variants (skipping {} that are already lifted)'.
            format(len(saved_variants_to_lift), num_already_lifted))

        liftover_to_38 = LiftOver('hg19', 'hg38')
        # Cache per-xpos results so each distinct coordinate is lifted once,
        # even when several saved variants share a position.
        hg37_to_hg38_xpos = {}
        lift_failed = {}
        for v in saved_variants_to_lift:
            if not (hg37_to_hg38_xpos.get(v['xpos'])
                    or v['xpos'] in lift_failed):
                hg38_coord = liftover_to_38.convert_coordinate(
                    'chr{}'.format(v['chrom'].lstrip('chr')), int(v['pos']))
                if hg38_coord and hg38_coord[0]:
                    hg37_to_hg38_xpos[v['xpos']] = get_xpos(
                        hg38_coord[0][0], hg38_coord[0][1])
                else:
                    lift_failed[v['xpos']] = v

        if lift_failed:
            if raw_input(
                    'Unable to lift over the following {} coordinates. Continue with update (y/n)?: {} '
                    .format(
                        len(lift_failed), ', '.join([
                            '{}:{}-{}-{} ({})'.format(
                                v['chrom'], v['pos'], v['ref'], v['alt'],
                                ', '.join(v['familyGuids']))
                            for v in lift_failed.values()
                        ]))) != 'y':
                raise CommandError(
                    'Error: unable to lift over {} variants'.format(
                        len(lift_failed)))

        # Group the saved-variant models by their lifted (xpos, ref, alt)
        # key so they can be matched against hits from the new index.
        saved_variants_map = defaultdict(list)
        for v in saved_variants_to_lift:
            if hg37_to_hg38_xpos.get(v['xpos']):
                variant_model = saved_variant_models_by_guid[v['variantGuid']]
                saved_variants_map[(hg37_to_hg38_xpos[v['xpos']], v['ref'],
                                    v['alt'])].append(variant_model)

        es_variants = get_es_variants_for_variant_tuples(
            expected_families, saved_variants_map.keys())

        missing_variants = set(
            saved_variants_map.keys()) - {(v['xpos'], v['ref'], v['alt'])
                                          for v in es_variants}
        if missing_variants:
            missing_variant_strings = []
            for xpos, ref, alt in missing_variants:
                var_id = '{}-{}-{}'.format(xpos, ref, alt)
                for v in saved_variants_map[(xpos, ref, alt)]:
                    tags = v.varianttag_set.all()
                    notes = v.variantnote_set.all()
                    missing_variant_strings.append(
                        '{var_id} {family_id}: {tags} ({guid})'.format(
                            var_id=var_id,
                            family_id=v.family.family_id,
                            guid=v.guid,
                            tags=', '.join([
                                tag.variant_tag_type.name for tag in tags
                            ]) if tags else 'No Tags; {}'.format('; '.join(
                                [note.note for note in notes]))))
            if raw_input(
                    'Unable to find the following {} variants in the index. Continue with update (y/n)?:\n{}\n'
                    .format(len(missing_variants),
                            '\n'.join(missing_variant_strings))) != 'y':
                raise CommandError(
                    'Error: unable to find {} lifted-over variants'.format(
                        len(missing_variants)))

        logger.info('Successfully lifted over {} variants'.format(
            len(es_variants)))

        #  Update saved variants
        missing_family_count = 0
        for var in es_variants:
            saved_variant_models = saved_variants_map[(var['xpos'], var['ref'],
                                                       var['alt'])]
            missing_saved_variants = [
                v for v in saved_variant_models
                if v.family.guid not in var['familyGuids']
            ]
            if missing_saved_variants:
                variant_id = '{}-{}-{}-{}'.format(var['chrom'], var['pos'],
                                                  var['ref'], var['alt'])
                # NOTE(review): prompt wording "not find" looks like a typo
                # for "not found" - confirm before changing user-facing text.
                if raw_input(
                    ('Variant {} (hg37: {}) not find for expected families {}. Continue with update (y/n)? '
                     .format(
                         variant_id, missing_saved_variants[0].xpos,
                         ', '.join([
                             '{} ({})'.format(v.family.guid, v.guid)
                             for v in missing_saved_variants
                         ])))) == 'y':
                    # Re-query with return_all_queried_families so the stored
                    # JSON includes genotype data for every family.
                    var = get_single_es_variant(
                        [v.family for v in saved_variant_models],
                        variant_id,
                        return_all_queried_families=True)
                    missing_family_count += len(missing_saved_variants)
                else:
                    raise CommandError(
                        'Error: unable to find family data for lifted over variant'
                    )
            for saved_variant in saved_variant_models:
                saved_variant.xpos_start = var['xpos']
                saved_variant.saved_variant_json = var
                saved_variant.save()

        logger.info('Successfully updated {} variants'.format(
            len(es_variants)))

        # Update project and sample data
        update_model_from_json(project,
                               {'genome_version': GENOME_VERSION_GRCh38})

        reset_cached_search_results(project)

        logger.info('---Done---')
        logger.info(
            'Succesfully lifted over {} variants. Skipped {} failed variants. Family data not updated for {} variants'
            .format(len(es_variants),
                    len(missing_variants) + len(lift_failed),
                    missing_family_count))
# Test fixtures for the lift_project_to_hg38 command tests.
# Project name deliberately contains non-ASCII characters to exercise
# unicode handling in log/error messages.
PROJECT_NAME = '1kg project n\u00e5me with uni\u00e7\u00f8de'
PROJECT_GUID = 'R0001_1kg'
ELASTICSEARCH_INDEX = 'test_index'
# Minimal index metadata matching what validate_index_metadata expects
# for a GRCh38 WES callset.
INDEX_METADATA = {
    "gencodeVersion": "25",
    "hail_version": "0.2.24",
    "genomeVersion": "38",
    "sampleType": "WES",
    "sourceFilePath": "test_index_alias_1_path.vcf.gz",
}
SAMPLE_IDS = [
    "NA19679", "NA19675_1", "NA19678", "HG00731", "HG00732", "HG00733"
]

# Module-level LiftOver instance - presumably patched/replaced by the mock
# below in tests; building it here downloads/loads the hg19->hg38 chain.
liftover_to_38 = LiftOver('hg19', 'hg38')

# Canned liftover results, keyed by xpos (chromosome number * 1e9 + position).
LIFT_MAP = {
    21003343353: [('chr21', 3343400)],
    1248367227: [('chr1', 248203925)],
    1001562437: [('chr1', 1627057)],
    1001560662: [('chr1', 46394160)],
}


def mock_convert_coordinate(chrom, pos):
    """Stand-in for ``LiftOver.convert_coordinate`` backed by LIFT_MAP.

    Encodes (chrom, pos) as an xpos key and returns the canned hg38
    coordinate list. Raises KeyError for coordinates not in LIFT_MAP.
    """
    chrom_number = int(chrom.replace('chr', ''))
    xpos = chrom_number * int(1e9) + pos
    return LIFT_MAP[xpos]


@mock.patch('seqr.management.commands.lift_project_to_hg38.logger')
예제 #8
0
    def handle(self, *args, **options):
        """Lift a project's saved variants over to GRCh38 (legacy xbrowse flow).

        Validates the provided GRCh38 elasticsearch index against the
        project, matches its samples to seqr records, interactively cleans
        up orphaned/untagged saved variants, lifts coordinates hg19 -> hg38
        with pyliftover, re-fetches the variants from the new index, then
        rewrites the SavedVariant rows, project genome version, sample
        records and xbrowse VCF file entries.

        Raises Exception on any validation failure or declined prompt.
        """
        project_arg = options['project']
        elasticsearch_index = options['es_index']

        project = Project.objects.get(Q(name=project_arg) | Q(guid=project_arg))
        logger.info('Updating project genome version for {}'.format(project.name))

        # Validate the provided index
        logger.info('Validating es index {}'.format(elasticsearch_index))
        sample_ids, index_metadata = get_elasticsearch_index_samples(elasticsearch_index)
        validate_index_metadata(index_metadata, project, elasticsearch_index, genome_version=GENOME_VERSION_GRCh38)
        sample_type = index_metadata['sampleType']
        dataset_path = index_metadata['sourceFilePath']

        matched_sample_id_to_sample_record = match_sample_ids_to_sample_records(
            project=project,
            sample_ids=sample_ids,
            sample_type=sample_type,
            dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS,
            elasticsearch_index=elasticsearch_index,
            sample_id_to_individual_id_mapping={},
        )

        # Every sample in the new index must already have a seqr record.
        unmatched_samples = set(sample_ids) - set(matched_sample_id_to_sample_record.keys())
        if len(unmatched_samples) > 0:
            raise Exception('Matches not found for ES sample ids: {}.'.format(', '.join(unmatched_samples)))

        # Group matched individuals by family, then check each family for
        # loaded members that are absent from the new callset.
        included_family_individuals = defaultdict(set)
        individual_guids_by_id = {}
        for sample in matched_sample_id_to_sample_record.values():
            included_family_individuals[sample.individual.family].add(sample.individual.individual_id)
            individual_guids_by_id[sample.individual.individual_id] = sample.individual.guid
        missing_family_individuals = []
        for family, individual_ids in included_family_individuals.items():
            missing_indivs = family.individual_set.filter(
                sample__sample_status=Sample.SAMPLE_STATUS_LOADED,
                sample__dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS
            ).exclude(individual_id__in=individual_ids)
            if missing_indivs:
                missing_family_individuals.append(
                    '{} ({})'.format(family.family_id, ', '.join([i.individual_id for i in missing_indivs]))
                )
        if missing_family_individuals:
            raise Exception(
                'The following families are included in the callset but are missing some family members: {}.'.format(
                    ', '.join(missing_family_individuals)
                ))

        # Get and clean up expected saved variants
        saved_variant_models_by_guid = {v.guid: v for v in SavedVariant.objects.filter(project=project)}
        deleted_no_family = set()
        deleted_no_tags = set()
        for guid, variant in saved_variant_models_by_guid.items():
            if not variant.family:
                deleted_no_family.add(guid)
            elif not (variant.varianttag_set.count() or variant.variantnote_set.count()):
                deleted_no_tags.add(guid)

        if deleted_no_family:
            if raw_input('Do you want to delete the following {} saved variants with no family (y/n)?: {} '.format(
                    len(deleted_no_family), ', '.join(deleted_no_family))) == 'y':
                for guid in deleted_no_family:
                    saved_variant_models_by_guid.pop(guid).delete()
                logger.info('Deleted {} variants'.format(len(deleted_no_family)))

        if deleted_no_tags:
            if raw_input('Do you want to delete the following {} saved variants with no tags (y/n)?: {} '.format(
                    len(deleted_no_tags), ', '.join(deleted_no_tags))) == 'y':
                for guid in deleted_no_tags:
                    saved_variant_models_by_guid.pop(guid).delete()
                logger.info('Deleted {} variants'.format(len(deleted_no_tags)))

        expected_families = {sv.family for sv in saved_variant_models_by_guid.values()}
        missing_families = expected_families - set(included_family_individuals.keys())
        if missing_families:
            raise Exception(
                'The following families have saved variants but are missing from the callset: {}.'.format(
                    ', '.join([f.family_id for f in missing_families])
                ))

        # Lift-over saved variants
        saved_variants = get_json_for_saved_variants(
            saved_variant_models_by_guid.values(), add_details=True, project=project,
            individual_guids_by_id=individual_guids_by_id)
        saved_variants_to_lift = [v for v in saved_variants if v['genomeVersion'] != GENOME_VERSION_GRCh38]

        num_already_lifted = len(saved_variants) - len(saved_variants_to_lift)
        if num_already_lifted:
            if raw_input('Found {} saved variants already on Hg38. Continue with liftover (y/n)?'.format(num_already_lifted)) != 'y':
                raise Exception('Error: found {} saved variants already on Hg38'.format(num_already_lifted))
        logger.info('Lifting over {} variants (skipping {} that are already lifted)'.format(
            len(saved_variants_to_lift), num_already_lifted))

        liftover_to_38 = LiftOver('hg19', 'hg38')
        # Cache per-xpos results so each distinct coordinate is lifted once.
        hg37_to_hg38_xpos = {}
        lift_failed = set()
        for v in saved_variants_to_lift:
            if not (hg37_to_hg38_xpos.get(v['xpos']) or v['xpos'] in lift_failed):
                hg38_coord = liftover_to_38.convert_coordinate('chr{}'.format(v['chrom'].lstrip('chr')), int(v['pos']))
                if hg38_coord and hg38_coord[0]:
                    hg37_to_hg38_xpos[v['xpos']] = get_xpos(hg38_coord[0][0], hg38_coord[0][1])
                else:
                    lift_failed.add(v['xpos'])

        if lift_failed:
            # lift_failed holds integer xpos values; str.join raises
            # TypeError on ints, so stringify before joining.
            raise Exception(
                'Unable to lift over the following {} coordinates: {}'.format(
                    len(lift_failed), ', '.join([str(xpos) for xpos in lift_failed])))

        # Group the saved-variant models by their lifted (xpos, ref, alt) key.
        saved_variants_map = defaultdict(list)
        for v in saved_variants_to_lift:
            variant_model = saved_variant_models_by_guid[v['variantGuid']]
            saved_variants_map[(hg37_to_hg38_xpos[v['xpos']], v['ref'], v['alt'])].append(variant_model)

        es_variants = get_es_variants_for_variant_tuples(expected_families, saved_variants_map.keys())

        missing_variants = set(saved_variants_map.keys()) - {(v['xpos'], v['ref'], v['alt']) for v in es_variants}
        if missing_variants:
            missing_variant_strings = ['{}-{}-{} ({})'.format(
                xpos, ref, alt,
                ', '.join(['{}: {}'.format(v.family.family_id, v.guid) for v in saved_variants_map[(xpos, ref, alt)]]))
                for xpos, ref, alt in missing_variants]
            if raw_input('Unable to find the following {} variants in the index. Continue with update (y/n)?: {} '.format(
                    len(missing_variants), ', '.join(missing_variant_strings))) != 'y':
                raise Exception('Error: unable to find {} lifted-over variants'.format(len(missing_variants)))

        logger.info('Successfully lifted over {} variants'.format(len(es_variants)))

        #  Update saved variants
        for var in es_variants:
            saved_variant_models = saved_variants_map[(var['xpos'], var['ref'], var['alt'])]
            missing_families = [v.family.guid for v in saved_variant_models if v.family.guid not in var['familyGuids']]
            if missing_families:
                raise Exception('Error with variant {}:{}-{}-{} not find for expected families {}; found in families {}'.format(
                    var['chrom'], var['pos'], var['ref'], var['alt'], ', '.join(missing_families), ', '.join(var['familyGuids'])
                ))
            for saved_variant in saved_variant_models:
                saved_variant.xpos_start = var['xpos']
                saved_variant.saved_variant_json = json.dumps(var)
                saved_variant.save()

        logger.info('Successfully updated {} variants'.format(len(es_variants)))

        # Update project and sample data
        update_model_from_json(project, {'genome_version': GENOME_VERSION_GRCh38, 'has_new_search': True})
        _update_samples(
            matched_sample_id_to_sample_record, elasticsearch_index=elasticsearch_index, dataset_path=dataset_path
        )
        update_xbrowse_vcfffiles(
            project, sample_type, elasticsearch_index, dataset_path, matched_sample_id_to_sample_record
        )

        reset_cached_search_results(project)

        logger.info('---Done---')
        logger.info('Succesfully lifted over {} variants. Skipped {} failed variants.'.format(
            len(es_variants), len(missing_variants)))