def parse_locus_list_items(request_json):
    """Parse a raw locus-list string into genes, intervals, and invalid entries.

    Items may be comma/whitespace/tab separated and each one is either a
    genomic interval ("chrom:start-end" with an optional "%offset", or a
    tab-delimited "chrom<TAB>start<TAB>end"), an Ensembl gene id (ENSG...),
    or a gene symbol.

    Args:
        request_json: dict; reads the 'rawItems' key.

    Returns:
        (genes_by_id, intervals, invalid_items) — or (None, None, None) when
        no raw items were supplied.
    """
    raw_items = request_json.get('rawItems')
    if not raw_items:
        return None, None, None

    # Raw strings + compiled once, instead of re-parsing the pattern per item.
    interval_pattern = re.compile(r'(?P<chrom>\w+):(?P<start>\d+)-(?P<end>\d+)(%(?P<offset>(\d+)))?')
    # Tabs were replaced with a '<TAB>' sentinel below so tab-delimited
    # intervals survive the whitespace split as a single token.
    tab_interval_pattern = re.compile(r'(?P<chrom>\w+)<TAB>(?P<start>\d+)<TAB>(?P<end>\d+)')

    invalid_items = []
    intervals = []
    gene_ids = set()
    gene_symbols = set()
    for item in raw_items.replace(',', ' ').replace('\t', '<TAB>').split():
        interval_match = interval_pattern.match(item)
        if not interval_match:
            interval_match = tab_interval_pattern.match(item)
        if interval_match:
            interval = interval_match.groupdict()
            try:
                # NOTE(review): lstrip('chr') strips any leading c/h/r chars,
                # not the literal prefix 'chr'; fine for standard chromosome
                # names but worth confirming for unusual contigs.
                interval['chrom'] = interval['chrom'].lstrip('chr')
                interval['start'] = int(interval['start'])
                interval['end'] = int(interval['end'])
                if interval.get('offset'):
                    # Convert the percent offset to a fraction.
                    interval['offset'] = int(interval['offset']) / 100
                if interval['start'] > interval['end']:
                    raise ValueError
                # Validates the chromosome/position; raises on bad input.
                get_xpos(interval['chrom'], interval['start'])
                intervals.append(interval)
            except (KeyError, ValueError):
                invalid_items.append('chr{chrom}:{start}-{end}'.format(
                    chrom=interval.get('chrom'), start=interval.get('start'), end=interval.get('end')))
        elif item.upper().startswith('ENSG'):
            gene_ids.add(item.replace('<TAB>', ''))
        else:
            gene_symbols.add(item.replace('<TAB>', ''))

    gene_symbols_to_ids = get_gene_ids_for_gene_symbols(gene_symbols)
    invalid_items += [symbol for symbol in gene_symbols if not gene_symbols_to_ids.get(symbol)]
    # Take the first mapped gene id per symbol (renamed loop var: the original
    # shadowed the outer gene_ids set inside this comprehension).
    gene_ids.update({mapped_ids[0] for mapped_ids in gene_symbols_to_ids.values() if len(mapped_ids)})

    genes_by_id = get_genes(list(gene_ids)) if gene_ids else {}
    invalid_items += [gene_id for gene_id in gene_ids if not genes_by_id.get(gene_id)]

    return genes_by_id, intervals, invalid_items
def _get_saved_variants(variants, families):
    """Match searched variants against previously saved variants for the given families.

    Returns a tuple of:
        saved_variants_by_guid: {variantGuid: saved-variant JSON merged with the searched variant}
        variants_to_saved_variants: {variantId: {familyGuid: variantGuid}}
    """
    if not variants:
        return {}, {}

    prefetch_related_objects(families, 'project')
    # Families whose project is still on GRCh37 — their saved variants may be
    # stored under hg37 coordinates.
    hg37_family_guids = {family.guid for family in families if family.project.genome_version == GENOME_VERSION_GRCh37}

    # Build one OR'd query matching any searched variant in any of its families.
    variant_q = Q()
    variants_by_id = {}
    for variant in variants:
        variants_by_id[_get_variant_key(**variant)] = variant
        variant_q |= Q(xpos_start=variant['xpos'], ref=variant['ref'], alt=variant['alt'],
                       family__guid__in=variant['familyGuids'])
        # Also match under the lifted-over hg37 coordinates for hg37 families.
        if variant['liftedOverGenomeVersion'] == GENOME_VERSION_GRCh37 and hg37_family_guids:
            variant_hg37_families = [
                family_guid for family_guid in variant['familyGuids'] if family_guid in hg37_family_guids
            ]
            if variant_hg37_families:
                lifted_xpos = get_xpos(variant['liftedOverChrom'], variant['liftedOverPos'])
                variant_q |= Q(xpos_start=lifted_xpos, ref=variant['ref'], alt=variant['alt'],
                               family__guid__in=variant_hg37_families)
                # Index the same searched variant under its hg37 key too, so
                # saved variants found via the lifted coordinates resolve back.
                variants_by_id[_get_variant_key(
                    xpos=lifted_xpos, ref=variant['ref'], alt=variant['alt'],
                    genomeVersion=variant['liftedOverGenomeVersion'],
                )] = variant

    saved_variants = SavedVariant.objects.filter(variant_q)
    saved_variants_json = get_json_for_saved_variants(saved_variants, add_tags=True, add_details=True)

    saved_variants_by_guid = {}
    variants_to_saved_variants = {}
    for saved_variant in saved_variants_json:
        family_guids = saved_variant['familyGuids']
        # NOTE(review): direct indexing assumes every saved variant returned by
        # the query maps back to a searched variant — a KeyError here would
        # indicate an unmatched lifted-over variant; confirm this invariant.
        searched_variant = variants_by_id[_get_variant_key(**saved_variant)]
        saved_variant.update(searched_variant)
        # For saved variants only use family it was saved for, not all families in search
        saved_variant['familyGuids'] = family_guids
        saved_variants_by_guid[saved_variant['variantGuid']] = saved_variant
        if searched_variant['variantId'] not in variants_to_saved_variants:
            variants_to_saved_variants[searched_variant['variantId']] = {}
        for family_guid in family_guids:
            variants_to_saved_variants[searched_variant['variantId']][family_guid] = saved_variant['variantGuid']

    return saved_variants_by_guid, variants_to_saved_variants
def _parse_variant_items(search_json): raw_items = search_json.get('rawVariantItems') if not raw_items: return None, None, None invalid_items = [] variant_ids = [] rs_ids = [] for item in raw_items.replace(',', ' ').split(): if item.startswith('rs'): rs_ids.append(item) else: try: chrom, pos, _, _ = EsSearch.parse_variant_id(item) get_xpos(chrom, pos) variant_ids.append(item.lstrip('chr')) except (KeyError, ValueError): invalid_items.append(item) return rs_ids, variant_ids, invalid_items
def _get_saved_variants(variants, families, include_discovery_tags=False):
    """Match searched variants against saved variants, optionally attaching discovery tags.

    Returns a tuple of:
        json: the get_json_for_saved_variants_with_tags response (plus the
            discovery-tag response when include_discovery_tags is set), with
            each saved variant merged with its searched variant
        variants_to_saved_variants: {variantId: {familyGuid: variantGuid}}
    """
    variants = _flatten_variants(variants)

    prefetch_related_objects(families, 'project')
    # Families whose project is still on GRCh37 — their saved variants may be
    # stored under hg37 coordinates.
    hg37_family_guids = {family.guid for family in families if family.project.genome_version == GENOME_VERSION_GRCh37}

    # Build one OR'd query matching any searched variant in any of its families.
    variant_q = Q()
    variants_by_id = {}
    for variant in variants:
        variants_by_id[get_variant_key(**variant)] = variant
        variant_q |= Q(variant_id=variant['variantId'], family__guid__in=variant['familyGuids'])
        # Also match under the lifted-over hg37 coordinates for hg37 families.
        if variant['liftedOverGenomeVersion'] == GENOME_VERSION_GRCh37 and hg37_family_guids:
            variant_hg37_families = [family_guid for family_guid in variant['familyGuids'] if family_guid in hg37_family_guids]
            if variant_hg37_families:
                lifted_xpos = get_xpos(variant['liftedOverChrom'], variant['liftedOverPos'])
                variant_q |= Q(xpos=lifted_xpos, ref=variant['ref'], alt=variant['alt'],
                               family__guid__in=variant_hg37_families)
                # Index the searched variant under its hg37 key too, so saved
                # variants found via the lifted coordinates resolve back.
                variants_by_id[get_variant_key(
                    xpos=lifted_xpos, ref=variant['ref'], alt=variant['alt'],
                    genomeVersion=variant['liftedOverGenomeVersion'],
                )] = variant

    saved_variants = SavedVariant.objects.filter(variant_q)
    # NOTE: the local name `json` shadows the stdlib json module within this function.
    json = get_json_for_saved_variants_with_tags(saved_variants, add_details=True)

    discovery_tags = {}
    if include_discovery_tags:
        discovery_tags, discovery_response = get_json_for_discovery_tags(variants)
        json.update(discovery_response)

    variants_to_saved_variants = {}
    for saved_variant in json['savedVariantsByGuid'].values():
        family_guids = saved_variant['familyGuids']
        searched_variant = variants_by_id.get(get_variant_key(**saved_variant))
        if not searched_variant:
            # This can occur when an hg38 family has a saved variant that did not successfully lift from hg37
            continue
        saved_variant.update(searched_variant)
        # For saved variants only use family it was saved for, not all families in search
        saved_variant['familyGuids'] = family_guids
        json['savedVariantsByGuid'][saved_variant['variantGuid']] = saved_variant
        if searched_variant['variantId'] not in variants_to_saved_variants:
            variants_to_saved_variants[searched_variant['variantId']] = {}
        for family_guid in family_guids:
            variants_to_saved_variants[searched_variant['variantId']][family_guid] = saved_variant['variantGuid']

    # Attach discovery tags to the searched variants, skipping tags saved for
    # a family that was already part of this search.
    for variant_id, tags in discovery_tags.items():
        searched_variant = variants_by_id.get(variant_id)
        if searched_variant:
            if not searched_variant.get('discoveryTags'):
                searched_variant['discoveryTags'] = []
            searched_variant['discoveryTags'] += [
                tag for tag in tags if tag['savedVariant']['familyGuid'] not in searched_variant['familyGuids']]

    return json, variants_to_saved_variants
def _location_filter(genes, intervals, location_filter): q = None if intervals: q = _build_or_filter('range', [{ 'xpos': { 'gte': get_xpos(interval['chrom'], interval['start']), 'lte': get_xpos(interval['chrom'], interval['end']) } } for interval in intervals]) if genes: gene_q = Q('terms', geneIds=genes.keys()) if q: q |= gene_q else: q = gene_q if location_filter.get('excludeLocations'): return ~q else: return q
def _parse_list_items(request_json):
    """Parse pre-structured locus-list items into new/existing genes and intervals.

    Each requested item is one of: an existing interval (has
    'locusListIntervalGuid'), an existing gene (has 'geneId' and 'symbol'), a
    new gene by id (has 'geneId' only), a new gene by symbol (has 'symbol'
    only), or a new interval (has 'chrom'/'start'/'end').

    Args:
        request_json: dict; reads request_json['parsedItems']['items'].

    Returns:
        (new_genes, existing_gene_ids, new_intervals, existing_interval_guids,
        invalid_items)
    """
    requested_items = (request_json.get('parsedItems') or {}).get('items') or []

    existing_gene_ids = set()
    new_gene_symbols = set()
    new_gene_ids = set()
    existing_interval_guids = set()
    new_intervals = []
    invalid_items = []
    for item in requested_items:
        if item.get('locusListIntervalGuid'):
            existing_interval_guids.add(item.get('locusListIntervalGuid'))
        elif item.get('geneId'):
            if item.get('symbol'):
                existing_gene_ids.add(item.get('geneId'))
            else:
                new_gene_ids.add(item.get('geneId'))
        elif item.get('symbol'):
            new_gene_symbols.add(item.get('symbol'))
        else:
            try:
                item['start'] = int(item['start'])
                item['end'] = int(item['end'])
                if item['start'] > item['end']:
                    raise ValueError
                # Validates the chromosome/position; raises KeyError/ValueError
                # on bad input. ('start' is already an int at this point.)
                get_xpos(item['chrom'], item['start'])
                new_intervals.append(item)
            except (KeyError, ValueError):
                invalid_items.append('chr{chrom}:{start}-{end}'.format(
                    chrom=item.get('chrom', '?'), start=item.get('start', '?'), end=item.get('end', '?')
                ))

    gene_symbols_to_ids = get_gene_ids_for_gene_symbols(new_gene_symbols)
    # A symbol is invalid unless it maps to exactly one gene id (merges the
    # original "no match" and "ambiguous match" passes into one check).
    invalid_items += [
        symbol for symbol in new_gene_symbols if len(gene_symbols_to_ids.get(symbol) or []) != 1
    ]
    new_genes = get_genes(
        [gene_ids[0] for gene_ids in gene_symbols_to_ids.values() if len(gene_ids) == 1] + list(new_gene_ids),
        add_dbnsfp=True, add_omim=True, add_constraints=True)
    invalid_items += [gene_id for gene_id, gene in new_genes.items() if not gene]
    new_genes = {gene_id: gene for gene_id, gene in new_genes.items() if gene}

    return new_genes, existing_gene_ids, new_intervals, existing_interval_guids, invalid_items
def _get_parsed_variant_args(variant_json, family): if 'xpos' not in variant_json: variant_json['xpos'] = get_xpos(variant_json['chrom'], variant_json['pos']) xpos = variant_json['xpos'] ref = variant_json['ref'] alt = variant_json['alt'] var_length = variant_json['pos_end'] - variant_json['pos'] if 'pos_end' in variant_json else len(ref) - 1 return { 'xpos': xpos, 'xpos_start': xpos, 'xpos_end': xpos + var_length, 'ref': ref, 'alt': alt, 'family': family, }
def _get_parsed_variant_args(variant_json, family): if 'xpos' not in variant_json: variant_json['xpos'] = get_xpos(variant_json['chrom'], variant_json['pos']) xpos = variant_json['xpos'] ref = variant_json.get('ref') alt = variant_json.get('alt') var_length = variant_json['end'] - variant_json['pos'] if 'end' in variant_json else len(ref) - 1 return { 'xpos': xpos, 'xpos_end': xpos + var_length, 'ref': ref, 'alt': alt, 'family': family, 'variant_id': variant_json['variantId'] }
def create_saved_variant_handler(request):
    """API handler: save a variant for a family, with optional note or tags.

    Expects a JSON body containing the variant fields plus 'familyGuid' and
    optional non-variant keys (searchHash, tags, functionalData, notes, note,
    submitToClinvar). Returns a JSON response keyed by the new variant's guid.
    """
    variant_json = json.loads(request.body)
    family_guid = variant_json.pop('familyGuid')
    # Separate the non-variant metadata from the variant JSON that gets stored.
    non_variant_json = {
        k: variant_json.pop(k, None)
        for k in ['searchHash', 'tags', 'functionalData', 'notes', 'note', 'submitToClinvar']
    }

    family = Family.objects.get(guid=family_guid)
    # Raises if the requesting user may not view this project.
    check_permissions(family.project, request.user, CAN_VIEW)

    if 'xpos' not in variant_json:
        variant_json['xpos'] = get_xpos(variant_json['chrom'], variant_json['pos'])
    xpos = variant_json['xpos']
    ref = variant_json['ref']
    alt = variant_json['alt']
    # End coordinate from 'pos_end' when present, otherwise from the ref length.
    var_length = variant_json['pos_end'] - variant_json['pos'] if 'pos_end' in variant_json else len(ref) - 1

    saved_variant = SavedVariant.objects.create(
        xpos=xpos,
        xpos_start=xpos,
        xpos_end=xpos + var_length,
        ref=ref,
        alt=alt,
        family=family,
        saved_variant_json=variant_json)

    # A note takes precedence over tags; only one of the two is created.
    if non_variant_json.get('note'):
        _create_variant_note(saved_variant, non_variant_json, request.user)
    elif non_variant_json.get('tags'):
        _create_new_tags(saved_variant, non_variant_json, request.user)

    variant_json.update(get_json_for_saved_variant(saved_variant, add_tags=True))
    return create_json_response({
        'savedVariantsByGuid': {saved_variant.guid: variant_json},
    })
def test_get_xpos(self):
    """get_xpos encodes a chromosome + position as chrom_index * 1e9 + pos."""
    # Unknown chromosome or out-of-range position raises ValueError.
    for chrom, pos in [('chrUnknown', 1), ('chr22', 0), ('chr22', 1e9)]:
        self.assertRaises(ValueError, get_xpos, chrom, pos)

    # The same assertions as individual checks, driven by a (chrom, index) table.
    encoding_cases = [
        ('1', 1), ('chr1', 1),
        ('22', 22),
        ('X', 23), ('chrX', 23),
        ('Y', 24), ('chrY', 24),
        ('M', 25), ('chrM', 25),
    ]
    for chrom, index in encoding_cases:
        self.assertEqual(get_xpos(chrom, 10), index * 1e9 + 10)
def handle(self, *args, **options):
    """Transfer a project to GRCh38: validate the new ES index, clean up saved
    variants, lift their coordinates from hg19 to hg38, and update the project.

    Interactive: prompts via raw_input before deleting untagged variants and
    before continuing past lift-over/lookup failures.
    """
    project_arg = options['project']
    elasticsearch_index = options['es_index']

    project = Project.objects.get(Q(name=project_arg) | Q(guid=project_arg))
    logger.info('Updating project genome version for {}'.format(project.name))

    # Validate the provided index
    logger.info('Validating es index {}'.format(elasticsearch_index))
    sample_ids, index_metadata = get_elasticsearch_index_samples(elasticsearch_index)
    validate_index_metadata(index_metadata, project, elasticsearch_index, genome_version=GENOME_VERSION_GRCh38)
    sample_type = index_metadata['sampleType']
    dataset_path = index_metadata['sourceFilePath']

    matched_sample_id_to_sample_record = match_sample_ids_to_sample_records(
        project=project,
        sample_ids=sample_ids,
        sample_type=sample_type,
        dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS,
        elasticsearch_index=elasticsearch_index,
        sample_id_to_individual_id_mapping={},
    )

    # Every sample in the new index must map to a known sample record.
    unmatched_samples = set(sample_ids) - set(matched_sample_id_to_sample_record.keys())
    if len(unmatched_samples) > 0:
        raise CommandError('Matches not found for ES sample ids: {}.'.format(', '.join(unmatched_samples)))

    prefetch_related_objects(matched_sample_id_to_sample_record.values(), 'individual__family')
    included_families = {sample.individual.family for sample in matched_sample_id_to_sample_record.values()}

    # Find active-callset individuals in those families that are NOT in the new index.
    missing_individuals = Individual.objects.filter(
        family__in=included_families,
        sample__is_active=True,
        sample__dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS,
    ).exclude(sample__in=matched_sample_id_to_sample_record.values()).select_related('family')
    missing_family_individuals = defaultdict(list)
    for individual in missing_individuals:
        missing_family_individuals[individual.family].append(individual)

    if missing_family_individuals:
        raise CommandError(
            'The following families are included in the callset but are missing some family members: {}.'.format(
                ', '.join(['{} ({})'.format(
                    family.family_id, ', '.join([i.individual_id for i in missing_indivs]))
                    for family, missing_indivs in missing_family_individuals.items()])))

    # Get and clean up expected saved variants
    saved_variant_models_by_guid = {v.guid: v for v in SavedVariant.objects.filter(family__project=project)}
    deleted_no_tags = set()
    for guid, variant in saved_variant_models_by_guid.items():
        if not (variant.varianttag_set.count() or variant.variantnote_set.count()):
            deleted_no_tags.add(guid)

    # Optionally delete variants that carry no tags or notes before lifting over.
    if deleted_no_tags:
        if raw_input('Do you want to delete the following {} saved variants with no tags (y/n)?: {} '.format(
                len(deleted_no_tags), ', '.join(deleted_no_tags))) == 'y':
            for guid in deleted_no_tags:
                saved_variant_models_by_guid.pop(guid).delete()
            logger.info('Deleted {} variants'.format(len(deleted_no_tags)))

    expected_families = {sv.family for sv in saved_variant_models_by_guid.values()}
    missing_families = expected_families - included_families
    if missing_families:
        raise CommandError(
            'The following families have saved variants but are missing from the callset: {}.'.format(
                ', '.join([f.family_id for f in missing_families])))

    # Lift-over saved variants
    _update_variant_samples(matched_sample_id_to_sample_record, elasticsearch_index, dataset_path)
    saved_variants = get_json_for_saved_variants(saved_variant_models_by_guid.values(), add_details=True)
    saved_variants_to_lift = [v for v in saved_variants if v['genomeVersion'] != GENOME_VERSION_GRCh38]

    num_already_lifted = len(saved_variants) - len(saved_variants_to_lift)
    if num_already_lifted:
        if raw_input('Found {} saved variants already on Hg38. Continue with liftover (y/n)? '.format(
                num_already_lifted)) != 'y':
            raise CommandError('Error: found {} saved variants already on Hg38'.format(num_already_lifted))
    logger.info('Lifting over {} variants (skipping {} that are already lifted)'.format(
        len(saved_variants_to_lift), num_already_lifted))

    liftover_to_38 = LiftOver('hg19', 'hg38')
    hg37_to_hg38_xpos = {}
    lift_failed = {}
    for v in saved_variants_to_lift:
        # Skip coordinates already lifted (or already known to fail).
        if not (hg37_to_hg38_xpos.get(v['xpos']) or v['xpos'] in lift_failed):
            hg38_coord = liftover_to_38.convert_coordinate('chr{}'.format(v['chrom'].lstrip('chr')), int(v['pos']))
            if hg38_coord and hg38_coord[0]:
                hg37_to_hg38_xpos[v['xpos']] = get_xpos(hg38_coord[0][0], hg38_coord[0][1])
            else:
                lift_failed[v['xpos']] = v
    if lift_failed:
        if raw_input('Unable to lift over the following {} coordinates. Continue with update (y/n)?: {} '.format(
                len(lift_failed), ', '.join(['{}:{}-{}-{} ({})'.format(
                    v['chrom'], v['pos'], v['ref'], v['alt'], ', '.join(v['familyGuids']))
                    for v in lift_failed.values()]))) != 'y':
            raise CommandError('Error: unable to lift over {} variants'.format(len(lift_failed)))

    # Group the saved-variant models by their lifted (xpos, ref, alt) tuple.
    saved_variants_map = defaultdict(list)
    for v in saved_variants_to_lift:
        if hg37_to_hg38_xpos.get(v['xpos']):
            variant_model = saved_variant_models_by_guid[v['variantGuid']]
            saved_variants_map[(hg37_to_hg38_xpos[v['xpos']], v['ref'], v['alt'])].append(variant_model)

    es_variants = get_es_variants_for_variant_tuples(expected_families, saved_variants_map.keys())

    missing_variants = set(saved_variants_map.keys()) - {(v['xpos'], v['ref'], v['alt']) for v in es_variants}
    if missing_variants:
        missing_variant_strings = []
        for xpos, ref, alt in missing_variants:
            var_id = '{}-{}-{}'.format(xpos, ref, alt)
            for v in saved_variants_map[(xpos, ref, alt)]:
                tags = v.varianttag_set.all()
                notes = v.variantnote_set.all()
                missing_variant_strings.append('{var_id} {family_id}: {tags} ({guid})'.format(
                    var_id=var_id,
                    family_id=v.family.family_id,
                    guid=v.guid,
                    tags=', '.join([tag.variant_tag_type.name for tag in tags]) if tags else 'No Tags; {}'.format(
                        '; '.join([note.note for note in notes]))))
        if raw_input('Unable to find the following {} variants in the index. Continue with update (y/n)?:\n{}\n'.format(
                len(missing_variants), '\n'.join(missing_variant_strings))) != 'y':
            raise CommandError('Error: unable to find {} lifted-over variants'.format(len(missing_variants)))

    logger.info('Successfully lifted over {} variants'.format(len(es_variants)))

    # Update saved variants
    missing_family_count = 0
    for var in es_variants:
        saved_variant_models = saved_variants_map[(var['xpos'], var['ref'], var['alt'])]
        # Saved variants whose family is not in the ES result for this variant.
        missing_saved_variants = [v for v in saved_variant_models if v.family.guid not in var['familyGuids']]
        if missing_saved_variants:
            variant_id = '{}-{}-{}-{}'.format(var['chrom'], var['pos'], var['ref'], var['alt'])
            if raw_input(('Variant {} (hg37: {}) not find for expected families {}. Continue with update (y/n)? '.format(
                    variant_id, missing_saved_variants[0].xpos,
                    ', '.join(['{} ({})'.format(v.family.guid, v.guid) for v in missing_saved_variants])))) == 'y':
                # Re-query across all expected families, keeping every queried family.
                var = get_single_es_variant(
                    [v.family for v in saved_variant_models], variant_id, return_all_queried_families=True)
                missing_family_count += len(missing_saved_variants)
            else:
                raise CommandError('Error: unable to find family data for lifted over variant')
        for saved_variant in saved_variant_models:
            saved_variant.xpos_start = var['xpos']
            saved_variant.saved_variant_json = var
            saved_variant.save()

    logger.info('Successfully updated {} variants'.format(len(es_variants)))

    # Update project and sample data
    update_model_from_json(project, {'genome_version': GENOME_VERSION_GRCh38})
    reset_cached_search_results(project)

    logger.info('---Done---')
    logger.info('Succesfully lifted over {} variants. Skipped {} failed variants. Family data not updated for {} variants'.format(
        len(es_variants), len(missing_variants) + len(lift_failed), missing_family_count))
def _get_saved_variants(variants, families, include_discovery_tags=False):
    """Match searched variants against saved variants, optionally attaching
    CMG discovery tags saved in OTHER families.

    Returns a tuple of:
        json: the get_json_for_saved_variants_with_tags response (plus
            'familiesByGuid' when discovery tags were found), with each saved
            variant merged with its searched variant
        variants_to_saved_variants: {variantId: {familyGuid: variantGuid}}
    """
    if not variants:
        return {}, {}
    variants = _flatten_variants(variants)

    prefetch_related_objects(families, 'project')
    # Families whose project is still on GRCh37 — their saved variants may be
    # stored under hg37 coordinates.
    hg37_family_guids = {family.guid for family in families if family.project.genome_version == GENOME_VERSION_GRCh37}

    variant_q = Q()
    # Matches the same variants but in families OUTSIDE the search — used to
    # find discovery tags saved elsewhere.
    discovery_variant_q = Q()
    variants_by_id = {}
    for variant in variants:
        variants_by_id[_get_variant_key(**variant)] = variant
        variant_q |= Q(xpos_start=variant['xpos'], ref=variant['ref'], alt=variant['alt'],
                       family__guid__in=variant['familyGuids'])
        discovery_variant_q |= Q(
            Q(xpos_start=variant['xpos'], ref=variant['ref'], alt=variant['alt']) &
            ~Q(family__guid__in=variant['familyGuids']))
        # Also match under the lifted-over hg37 coordinates for hg37 families.
        if variant['liftedOverGenomeVersion'] == GENOME_VERSION_GRCh37 and hg37_family_guids:
            variant_hg37_families = [
                family_guid for family_guid in variant['familyGuids'] if family_guid in hg37_family_guids
            ]
            if variant_hg37_families:
                lifted_xpos = get_xpos(variant['liftedOverChrom'], variant['liftedOverPos'])
                variant_q |= Q(xpos_start=lifted_xpos, ref=variant['ref'], alt=variant['alt'],
                               family__guid__in=variant_hg37_families)
                # Index the searched variant under its hg37 key too, so saved
                # variants found via the lifted coordinates resolve back.
                variants_by_id[_get_variant_key(
                    xpos=lifted_xpos, ref=variant['ref'], alt=variant['alt'],
                    genomeVersion=variant['liftedOverGenomeVersion'],
                )] = variant

    saved_variants = SavedVariant.objects.filter(variant_q)
    # NOTE: the local name `json` shadows the stdlib json module within this function.
    json = get_json_for_saved_variants_with_tags(saved_variants, add_details=True)

    variants_to_saved_variants = {}
    for saved_variant in json['savedVariantsByGuid'].values():
        family_guids = saved_variant['familyGuids']
        searched_variant = variants_by_id.get(_get_variant_key(**saved_variant))
        if not searched_variant:
            # This can occur when an hg38 family has a saved variant that did not successfully lift from hg37
            continue
        saved_variant.update(searched_variant)
        # For saved variants only use family it was saved for, not all families in search
        saved_variant['familyGuids'] = family_guids
        json['savedVariantsByGuid'][saved_variant['variantGuid']] = saved_variant
        if searched_variant['variantId'] not in variants_to_saved_variants:
            variants_to_saved_variants[searched_variant['variantId']] = {}
        for family_guid in family_guids:
            variants_to_saved_variants[searched_variant['variantId']][family_guid] = saved_variant['variantGuid']

    if include_discovery_tags:
        discovery_tags = get_json_for_variant_tags(
            VariantTag.objects.filter(
                variant_tag_type__category='CMG Discovery Tags',
                saved_variants__in=SavedVariant.objects.filter(discovery_variant_q)),
            include_variant_details=True)
        if discovery_tags:
            # Collect all families referenced by the tags so their project
            # genome versions (and JSON) can be looked up in one query.
            family_guids = set()
            for tag in discovery_tags:
                for variant in tag['variants']:
                    family_guids.update(variant['familyGuids'])
            families_by_guid = {
                f.guid: f for f in Family.objects.filter(guid__in=family_guids).prefetch_related('project')
            }
            for tag in discovery_tags:
                for variant in tag.pop('variants'):
                    variant_family = families_by_guid[variant['familyGuids'][0]]
                    searched_variant = variants_by_id.get(
                        _get_variant_key(genomeVersion=variant_family.project.genome_version, **variant))
                    if searched_variant:
                        if not searched_variant.get('discoveryTags'):
                            searched_variant['discoveryTags'] = []
                        tag_json = {'savedVariant': {
                            'variantGuid': variant['variantGuid'],
                            'familyGuid': variant_family.guid,
                            'projectGuid': variant_family.project.guid,
                        }}
                        tag_json.update(tag)
                        searched_variant['discoveryTags'].append(tag_json)
            json['familiesByGuid'] = {
                f['familyGuid']: f for f in _get_json_for_families(families_by_guid.values())
            }

    return json, variants_to_saved_variants
def test_get_xpos(self):
    """get_xpos encodes a chromosome + position as chrom_index * 1e9 + pos,
    accepting names with or without the 'chr' prefix, and raises ValueError
    for unknown chromosomes or out-of-range positions."""
    self.assertRaises(ValueError, lambda: get_xpos('chrUnknown', 1))
    self.assertRaises(ValueError, lambda: get_xpos('chr22', 0))
    self.assertRaises(ValueError, lambda: get_xpos('chr22', 1e9))
    self.assertEqual(get_xpos('1', 10), 1e9 + 10)
    self.assertEqual(get_xpos('chr1', 10), 1e9 + 10)
    self.assertEqual(get_xpos('22', 10), 22*1e9 + 10)
    # X/Y/M map to indices 23/24/25.
    self.assertEqual(get_xpos('X', 10), 23*1e9 + 10)
    self.assertEqual(get_xpos('chrX', 10), 23*1e9 + 10)
    self.assertEqual(get_xpos('Y', 10), 24*1e9 + 10)
    self.assertEqual(get_xpos('chrY', 10), 24*1e9 + 10)
    self.assertEqual(get_xpos('M', 10), 25*1e9 + 10)
    self.assertEqual(get_xpos('chrM', 10), 25*1e9 + 10)
def handle(self, *args, **options):
    """Transfer a project to GRCh38: validate the new ES index, clean up saved
    variants, lift their coordinates from hg19 to hg38, and update the project
    and sample records.

    Interactive: prompts via raw_input before deleting orphaned/untagged
    variants and before continuing past lookup failures.
    """
    project_arg = options['project']
    elasticsearch_index = options['es_index']

    project = Project.objects.get(Q(name=project_arg) | Q(guid=project_arg))
    logger.info('Updating project genome version for {}'.format(project.name))

    # Validate the provided index
    logger.info('Validating es index {}'.format(elasticsearch_index))
    sample_ids, index_metadata = get_elasticsearch_index_samples(elasticsearch_index)
    validate_index_metadata(index_metadata, project, elasticsearch_index, genome_version=GENOME_VERSION_GRCh38)
    sample_type = index_metadata['sampleType']
    dataset_path = index_metadata['sourceFilePath']

    matched_sample_id_to_sample_record = match_sample_ids_to_sample_records(
        project=project,
        sample_ids=sample_ids,
        sample_type=sample_type,
        dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS,
        elasticsearch_index=elasticsearch_index,
        sample_id_to_individual_id_mapping={},
    )

    # Every sample in the new index must map to a known sample record.
    unmatched_samples = set(sample_ids) - set(matched_sample_id_to_sample_record.keys())
    if len(unmatched_samples) > 0:
        raise Exception('Matches not found for ES sample ids: {}.'.format(', '.join(unmatched_samples)))

    included_family_individuals = defaultdict(set)
    individual_guids_by_id = {}
    for sample in matched_sample_id_to_sample_record.values():
        included_family_individuals[sample.individual.family].add(sample.individual.individual_id)
        individual_guids_by_id[sample.individual.individual_id] = sample.individual.guid

    # Find loaded individuals in the included families that are NOT in the new index.
    missing_family_individuals = []
    for family, individual_ids in included_family_individuals.items():
        missing_indivs = family.individual_set.filter(
            sample__sample_status=Sample.SAMPLE_STATUS_LOADED,
            sample__dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS
        ).exclude(individual_id__in=individual_ids)
        if missing_indivs:
            missing_family_individuals.append(
                '{} ({})'.format(family.family_id, ', '.join([i.individual_id for i in missing_indivs]))
            )
    if missing_family_individuals:
        raise Exception(
            'The following families are included in the callset but are missing some family members: {}.'.format(
                ', '.join(missing_family_individuals)
            ))

    # Get and clean up expected saved variants
    saved_variant_models_by_guid = {v.guid: v for v in SavedVariant.objects.filter(project=project)}
    deleted_no_family = set()
    deleted_no_tags = set()
    for guid, variant in saved_variant_models_by_guid.items():
        if not variant.family:
            deleted_no_family.add(guid)
        elif not (variant.varianttag_set.count() or variant.variantnote_set.count()):
            deleted_no_tags.add(guid)

    # Optionally delete variants with no family, then variants with no tags/notes.
    if deleted_no_family:
        if raw_input('Do you want to delete the following {} saved variants with no family (y/n)?: {} '.format(
                len(deleted_no_family), ', '.join(deleted_no_family))) == 'y':
            for guid in deleted_no_family:
                saved_variant_models_by_guid.pop(guid).delete()
            logger.info('Deleted {} variants'.format(len(deleted_no_family)))

    if deleted_no_tags:
        if raw_input('Do you want to delete the following {} saved variants with no tags (y/n)?: {} '.format(
                len(deleted_no_tags), ', '.join(deleted_no_tags))) == 'y':
            for guid in deleted_no_tags:
                saved_variant_models_by_guid.pop(guid).delete()
            logger.info('Deleted {} variants'.format(len(deleted_no_tags)))

    expected_families = {sv.family for sv in saved_variant_models_by_guid.values()}
    missing_families = expected_families - set(included_family_individuals.keys())
    if missing_families:
        raise Exception(
            'The following families have saved variants but are missing from the callset: {}.'.format(
                ', '.join([f.family_id for f in missing_families])
            ))

    # Lift-over saved variants
    saved_variants = get_json_for_saved_variants(
        saved_variant_models_by_guid.values(), add_details=True, project=project,
        individual_guids_by_id=individual_guids_by_id)
    saved_variants_to_lift = [v for v in saved_variants if v['genomeVersion'] != GENOME_VERSION_GRCh38]

    num_already_lifted = len(saved_variants) - len(saved_variants_to_lift)
    if num_already_lifted:
        if raw_input('Found {} saved variants already on Hg38. Continue with liftover (y/n)?'.format(
                num_already_lifted)) != 'y':
            raise Exception('Error: found {} saved variants already on Hg38'.format(num_already_lifted))
    logger.info('Lifting over {} variants (skipping {} that are already lifted)'.format(
        len(saved_variants_to_lift), num_already_lifted))

    liftover_to_38 = LiftOver('hg19', 'hg38')
    hg37_to_hg38_xpos = {}
    lift_failed = set()
    for v in saved_variants_to_lift:
        # Skip coordinates already lifted (or already known to fail).
        if not (hg37_to_hg38_xpos.get(v['xpos']) or v['xpos'] in lift_failed):
            hg38_coord = liftover_to_38.convert_coordinate('chr{}'.format(v['chrom'].lstrip('chr')), int(v['pos']))
            if hg38_coord and hg38_coord[0]:
                hg37_to_hg38_xpos[v['xpos']] = get_xpos(hg38_coord[0][0], hg38_coord[0][1])
            else:
                lift_failed.add(v['xpos'])
    if lift_failed:
        # NOTE(review): lift_failed holds xpos values, which appear to be ints
        # elsewhere in this command — ', '.join would raise TypeError on ints;
        # confirm the element type.
        raise Exception(
            'Unable to lift over the following {} coordinates: {}'.format(len(lift_failed), ', '.join(lift_failed)))

    # Group the saved-variant models by their lifted (xpos, ref, alt) tuple.
    saved_variants_map = defaultdict(list)
    for v in saved_variants_to_lift:
        variant_model = saved_variant_models_by_guid[v['variantGuid']]
        saved_variants_map[(hg37_to_hg38_xpos[v['xpos']], v['ref'], v['alt'])].append(variant_model)

    es_variants = get_es_variants_for_variant_tuples(expected_families, saved_variants_map.keys())

    missing_variants = set(saved_variants_map.keys()) - {(v['xpos'], v['ref'], v['alt']) for v in es_variants}
    if missing_variants:
        missing_variant_strings = ['{}-{}-{} ({})'.format(
            xpos, ref, alt,
            ', '.join(['{}: {}'.format(v.family.family_id, v.guid) for v in saved_variants_map[(xpos, ref, alt)]]))
            for xpos, ref, alt in missing_variants]
        if raw_input('Unable to find the following {} variants in the index. Continue with update (y/n)?: {} '.format(
                len(missing_variants), ', '.join(missing_variant_strings))) != 'y':
            raise Exception('Error: unable to find {} lifted-over variants'.format(len(missing_variants)))

    logger.info('Successfully lifted over {} variants'.format(len(es_variants)))

    # Update saved variants
    for var in es_variants:
        saved_variant_models = saved_variants_map[(var['xpos'], var['ref'], var['alt'])]
        # Fail hard if any saved variant's family is absent from the ES result.
        missing_families = [v.family.guid for v in saved_variant_models if v.family.guid not in var['familyGuids']]
        if missing_families:
            raise Exception('Error with variant {}:{}-{}-{} not find for expected families {}; found in families {}'.format(
                var['chrom'], var['pos'], var['ref'], var['alt'],
                ', '.join(missing_families), ', '.join(var['familyGuids'])
            ))
        for saved_variant in saved_variant_models:
            saved_variant.xpos_start = var['xpos']
            saved_variant.saved_variant_json = json.dumps(var)
            saved_variant.save()

    logger.info('Successfully updated {} variants'.format(len(es_variants)))

    # Update project and sample data
    update_model_from_json(project, {'genome_version': GENOME_VERSION_GRCh38, 'has_new_search': True})
    _update_samples(
        matched_sample_id_to_sample_record, elasticsearch_index=elasticsearch_index, dataset_path=dataset_path
    )
    update_xbrowse_vcfffiles(
        project, sample_type, elasticsearch_index, dataset_path, matched_sample_id_to_sample_record
    )

    reset_cached_search_results(project)

    logger.info('---Done---')
    logger.info('Succesfully lifted over {} variants. Skipped {} failed variants.'.format(
        len(es_variants), len(missing_variants)))