Example #1
    def handle(self, *args, **options):
        """Reset cached search results for a project, or for all projects"""
        project_name = options['project']
        project = Project.objects.get(
            name=project_name) if project_name else None
        reset_cached_search_results(project=project)
        logger.info(u'Reset cached search results for {}'.format(
            project_name or 'all projects'))
Example #2
def update_saved_variant_json(request, project_guid):
    project = get_project_and_check_permissions(project_guid,
                                                request.user,
                                                can_edit=True)
    reset_cached_search_results(project)
    updated_saved_variant_guids = update_project_saved_variant_json(project)

    return create_json_response(
        {variant_guid: None
         for variant_guid in updated_saved_variant_guids})
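A minimal sketch of exercising this endpoint with Django's test client; the route and project GUID below are hypothetical, and a real request must be authenticated as a user with edit access, since get_project_and_check_permissions enforces can_edit:

from django.test import Client

client = Client()  # login omitted for brevity
response = client.post('/api/project/R0001_example/update_saved_variant_json')
# On success, the body maps each updated saved-variant GUID to null, e.g.
# {"SV0000001_example": null, "SV0000002_example": null}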
Example #3
def update_saved_variant_json(request, project_guid):
    project = get_project_and_check_permissions(project_guid, request.user, can_edit=True)
    reset_cached_search_results(project)
    try:
        updated_saved_variant_guids = update_project_saved_variant_json(project, user=request.user)
    except Exception as e:
        logger.error('Unable to reset saved variant json for {}: {}'.format(project_guid, e))
        updated_saved_variant_guids = []

    return create_json_response({variant_guid: None for variant_guid in updated_saved_variant_guids})
Example #4
    def handle(self, *args, **options):
        """transfer project"""
        project_arg = options['project']
        elasticsearch_index = options['es_index']

        project = Project.objects.get(
            Q(name=project_arg) | Q(guid=project_arg))
        logger.info('Updating project genome version for {}'.format(
            project.name))

        # Validate the provided index
        logger.info('Validating es index {}'.format(elasticsearch_index))
        sample_ids, index_metadata = get_elasticsearch_index_samples(
            elasticsearch_index)
        validate_index_metadata(index_metadata,
                                project,
                                elasticsearch_index,
                                genome_version=GENOME_VERSION_GRCh38)
        sample_type = index_metadata['sampleType']
        dataset_path = index_metadata['sourceFilePath']

        matched_sample_id_to_sample_record = match_sample_ids_to_sample_records(
            project=project,
            sample_ids=sample_ids,
            sample_type=sample_type,
            dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS,
            elasticsearch_index=elasticsearch_index,
            sample_id_to_individual_id_mapping={},
        )

        unmatched_samples = set(sample_ids) - set(
            matched_sample_id_to_sample_record.keys())
        if len(unmatched_samples) > 0:
            raise CommandError(
                'Matches not found for ES sample ids: {}.'.format(
                    ', '.join(unmatched_samples)))

        prefetch_related_objects(matched_sample_id_to_sample_record.values(),
                                 'individual__family')
        included_families = {
            sample.individual.family
            for sample in matched_sample_id_to_sample_record.values()
        }
        missing_individuals = Individual.objects.filter(
            family__in=included_families,
            sample__is_active=True,
            sample__dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS,
        ).exclude(sample__in=matched_sample_id_to_sample_record.values()
                  ).select_related('family')
        missing_family_individuals = defaultdict(list)
        for individual in missing_individuals:
            missing_family_individuals[individual.family].append(individual)

        if missing_family_individuals:
            raise CommandError(
                'The following families are included in the callset but are missing some family members: {}.'
                .format(', '.join([
                    '{} ({})'.format(
                        family.family_id,
                        ', '.join([i.individual_id for i in missing_indivs]))
                    for family, missing_indivs in
                    missing_family_individuals.items()
                ])))

        # Get and clean up expected saved variants
        saved_variant_models_by_guid = {
            v.guid: v
            for v in SavedVariant.objects.filter(family__project=project)
        }
        deleted_no_tags = set()
        for guid, variant in saved_variant_models_by_guid.items():
            if not (variant.varianttag_set.count()
                    or variant.variantnote_set.count()):
                deleted_no_tags.add(guid)

        if deleted_no_tags:
            if raw_input(
                    'Do you want to delete the following {} saved variants with no tags (y/n)?: {} '
                    .format(len(deleted_no_tags),
                            ', '.join(deleted_no_tags))) == 'y':
                for guid in deleted_no_tags:
                    saved_variant_models_by_guid.pop(guid).delete()
                logger.info('Deleted {} variants'.format(len(deleted_no_tags)))

        expected_families = {
            sv.family
            for sv in saved_variant_models_by_guid.values()
        }
        missing_families = expected_families - included_families
        if missing_families:
            raise CommandError(
                'The following families have saved variants but are missing from the callset: {}.'
                .format(', '.join([f.family_id for f in missing_families])))

        # Lift-over saved variants
        _update_variant_samples(matched_sample_id_to_sample_record,
                                elasticsearch_index, dataset_path)
        saved_variants = get_json_for_saved_variants(
            saved_variant_models_by_guid.values(), add_details=True)
        saved_variants_to_lift = [
            v for v in saved_variants
            if v['genomeVersion'] != GENOME_VERSION_GRCh38
        ]

        num_already_lifted = len(saved_variants) - len(saved_variants_to_lift)
        if num_already_lifted:
            if raw_input(
                    'Found {} saved variants already on Hg38. Continue with liftover (y/n)? '
                    .format(num_already_lifted)) != 'y':
                raise CommandError(
                    'Error: found {} saved variants already on Hg38'.format(
                        num_already_lifted))
        logger.info(
            'Lifting over {} variants (skipping {} that are already lifted)'.
            format(len(saved_variants_to_lift), num_already_lifted))

        liftover_to_38 = LiftOver('hg19', 'hg38')
        hg37_to_hg38_xpos = {}
        lift_failed = {}
        for v in saved_variants_to_lift:
            if not (hg37_to_hg38_xpos.get(v['xpos'])
                    or v['xpos'] in lift_failed):
                hg38_coord = liftover_to_38.convert_coordinate(
                    'chr{}'.format(v['chrom'].lstrip('chr')), int(v['pos']))
                if hg38_coord and hg38_coord[0]:
                    hg37_to_hg38_xpos[v['xpos']] = get_xpos(
                        hg38_coord[0][0], hg38_coord[0][1])
                else:
                    lift_failed[v['xpos']] = v

        if lift_failed:
            if raw_input(
                    'Unable to lift over the following {} coordinates. Continue with update (y/n)?: {} '
                    .format(
                        len(lift_failed), ', '.join([
                            '{}:{}-{}-{} ({})'.format(
                                v['chrom'], v['pos'], v['ref'], v['alt'],
                                ', '.join(v['familyGuids']))
                            for v in lift_failed.values()
                        ]))) != 'y':
                raise CommandError(
                    'Error: unable to lift over {} variants'.format(
                        len(lift_failed)))

        saved_variants_map = defaultdict(list)
        for v in saved_variants_to_lift:
            if hg37_to_hg38_xpos.get(v['xpos']):
                variant_model = saved_variant_models_by_guid[v['variantGuid']]
                saved_variants_map[(hg37_to_hg38_xpos[v['xpos']], v['ref'],
                                    v['alt'])].append(variant_model)

        es_variants = get_es_variants_for_variant_tuples(
            expected_families, saved_variants_map.keys())

        missing_variants = set(
            saved_variants_map.keys()) - {(v['xpos'], v['ref'], v['alt'])
                                          for v in es_variants}
        if missing_variants:
            missing_variant_strings = []
            for xpos, ref, alt in missing_variants:
                var_id = '{}-{}-{}'.format(xpos, ref, alt)
                for v in saved_variants_map[(xpos, ref, alt)]:
                    tags = v.varianttag_set.all()
                    notes = v.variantnote_set.all()
                    missing_variant_strings.append(
                        '{var_id} {family_id}: {tags} ({guid})'.format(
                            var_id=var_id,
                            family_id=v.family.family_id,
                            guid=v.guid,
                            tags=', '.join([
                                tag.variant_tag_type.name for tag in tags
                            ]) if tags else 'No Tags; {}'.format('; '.join(
                                [note.note for note in notes]))))
            if raw_input(
                    'Unable to find the following {} variants in the index. Continue with update (y/n)?:\n{}\n'
                    .format(len(missing_variants),
                            '\n'.join(missing_variant_strings))) != 'y':
                raise CommandError(
                    'Error: unable to find {} lifted-over variants'.format(
                        len(missing_variants)))

        logger.info('Successfully lifted over {} variants'.format(
            len(es_variants)))

        #  Update saved variants
        missing_family_count = 0
        for var in es_variants:
            saved_variant_models = saved_variants_map[(var['xpos'], var['ref'],
                                                       var['alt'])]
            missing_saved_variants = [
                v for v in saved_variant_models
                if v.family.guid not in var['familyGuids']
            ]
            if missing_saved_variants:
                variant_id = '{}-{}-{}-{}'.format(var['chrom'], var['pos'],
                                                  var['ref'], var['alt'])
                if raw_input(
                    ('Variant {} (hg37: {}) not found for expected families {}. Continue with update (y/n)? '
                     .format(
                         variant_id, missing_saved_variants[0].xpos,
                         ', '.join([
                             '{} ({})'.format(v.family.guid, v.guid)
                             for v in missing_saved_variants
                         ])))) == 'y':
                    var = get_single_es_variant(
                        [v.family for v in saved_variant_models],
                        variant_id,
                        return_all_queried_families=True)
                    missing_family_count += len(missing_saved_variants)
                else:
                    raise CommandError(
                        'Error: unable to find family data for lifted over variant'
                    )
            for saved_variant in saved_variant_models:
                saved_variant.xpos_start = var['xpos']
                saved_variant.saved_variant_json = var
                saved_variant.save()

        logger.info('Successfully updated {} variants'.format(
            len(es_variants)))

        # Update project and sample data
        update_model_from_json(project,
                               {'genome_version': GENOME_VERSION_GRCh38})

        reset_cached_search_results(project)

        logger.info('---Done---')
        logger.info(
            'Successfully lifted over {} variants. Skipped {} failed variants. Family data not updated for {} variants'
            .format(len(es_variants),
                    len(missing_variants) + len(lift_failed),
                    missing_family_count))
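The liftover loop above combines two building blocks: pyliftover's LiftOver and the xpos encoding that packs a chromosome and position into one sortable integer. A standalone sketch of both; the xpos formula is an assumption based on the common xbrowse/seqr convention (chromosome index * 10^9 + position), not something defined in this snippet:

from pyliftover import LiftOver  # pip install pyliftover

CHROMOSOMES = [str(c) for c in range(1, 23)] + ['X', 'Y', 'M']

def get_xpos_sketch(chrom, pos):
    # Assumed convention: chromosomes 1-22 keep their number; X=23, Y=24, M=25.
    chrom_index = CHROMOSOMES.index(chrom.lstrip('chr')) + 1
    return chrom_index * int(1e9) + int(pos)

liftover_to_38 = LiftOver('hg19', 'hg38')  # fetches the UCSC chain file
# convert_coordinate returns a list of (chrom, pos, strand, score) tuples,
# or an empty list (or None for an unknown chromosome) when liftover fails.
hg38_coord = liftover_to_38.convert_coordinate('chr1', 1000000)
if hg38_coord and hg38_coord[0]:
    hg38_xpos = get_xpos_sketch(hg38_coord[0][0], hg38_coord[0][1])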
Example #5
def add_dataset_handler(request, project_guid):
    """Create or update samples for the given dataset

    Args:
        request: Django request object
        project_guid (string): GUID of the project that should be updated

    HTTP POST
        Request body - should contain the following json structure:
        {
            'sampleType':  <"WGS", "WES", or "RNA"> (required)
            'datasetType': <"VARIANTS", or "ALIGN"> (required)
            'elasticsearchIndex': <String>
            'datasetPath': <String>
            'datasetName': <String>
            'ignoreExtraSamplesInCallset': <Boolean>
            'mappingFile': { 'uploadedFileId': <Id for temporary uploaded file> }
        }

        Response body - will contain the following structure:

    """

    logger.info("add_dataset_handler: " + str(request))

    project = get_project_and_check_permissions(project_guid, request.user, permission_level=CAN_EDIT)

    request_json = json.loads(request.body)

    logger.info("add_dataset_handler: received %s" % pformat(request_json))

    required_fields = ['sampleType', 'datasetType']
    if any(field not in request_json for field in required_fields):
        raise ValueError(
            "request must contain fields: {}".format(', '.join(required_fields)))

    sample_type = request_json['sampleType']
    dataset_type = request_json['datasetType']
    elasticsearch_index = request_json.get('elasticsearchIndex')
    if elasticsearch_index:
        elasticsearch_index = elasticsearch_index.strip()
    dataset_path = request_json.get('datasetPath')
    if dataset_path:
        dataset_path = dataset_path.strip()
    dataset_name = request_json.get('datasetName')
    if dataset_name:
        dataset_name = dataset_name.strip()

    ignore_extra_samples_in_callset = request_json.get('ignoreExtraSamplesInCallset')
    ignore_missing_family_members = request_json.get('ignoreMissingFamilyMembers')
    mapping_file_id = request_json.get('mappingFile', {}).get('uploadedFileId')
    mapping_file_path = request_json.get('mappingFilePath')

    try:
        updated_samples, created_sample_ids = add_dataset(
            project=project,
            sample_type=sample_type,
            dataset_type=dataset_type,
            elasticsearch_index=elasticsearch_index,
            dataset_path=dataset_path,
            dataset_name=dataset_name,
            max_edit_distance=0,
            ignore_extra_samples_in_callset=ignore_extra_samples_in_callset,
            ignore_missing_family_members=ignore_missing_family_members,
            mapping_file_path=mapping_file_path,
            mapping_file_id=mapping_file_id,
        )

        # update VCFFile records
        if updated_samples:
            if dataset_type == Sample.DATASET_TYPE_VARIANT_CALLS:
                reset_cached_search_results(project)

                base_project = BaseProject.objects.get(seqr_project=project)
                get_datastore(base_project).bust_project_cache(base_project.project_id)
                clear_project_results_cache(base_project.project_id)

                vcf_file = VCFFile.objects.filter(
                    project=base_project,
                    dataset_type=dataset_type,
                    sample_type=sample_type,
                    elasticsearch_index=elasticsearch_index).order_by('-pk').first()

                if not vcf_file:
                    vcf_file = VCFFile.objects.create(
                        project=base_project,
                        dataset_type=dataset_type,
                        sample_type=sample_type,
                        elasticsearch_index=elasticsearch_index,
                    )
                    logger.info("Created vcf file: " + str(vcf_file.__dict__))

                vcf_file.file_path = dataset_path or "{}.vcf.gz".format(elasticsearch_index)  # legacy VCFFile model requires non-empty vcf path
                vcf_file.loaded_date = next(iter(updated_samples)).loaded_date
                vcf_file.save()

                for indiv in [s.individual for s in updated_samples]:
                    for base_indiv in BaseIndividual.objects.filter(seqr_individual=indiv).only('id'):
                        base_indiv.vcf_files.add(vcf_file)

            elif dataset_type == Sample.DATASET_TYPE_READ_ALIGNMENTS:
                for sample in updated_samples:
                    for base_indiv in BaseIndividual.objects.filter(seqr_individual=sample.individual).only('id'):
                        base_indiv.bam_file_path = sample.dataset_file_path
                        base_indiv.save()

        updated_sample_json = get_json_for_samples(updated_samples, project_guid=project_guid)
        response = {
            'samplesByGuid': {s['sampleGuid']: s for s in updated_sample_json}
        }
        updated_individuals = {s['individualGuid'] for s in updated_sample_json if s['sampleId'] in created_sample_ids}
        if updated_individuals:
            individuals = Individual.objects.filter(guid__in=updated_individuals).prefetch_related('sample_set', 'family').only('guid')
            response['individualsByGuid'] = {
                ind.guid: {'sampleGuids': [s.guid for s in ind.sample_set.only('guid').all()]} for ind in individuals
            }

            for ind in individuals:
                family = ind.family
                if family.analysis_status == Family.ANALYSIS_STATUS_WAITING_FOR_DATA:
                    update_seqr_model(family, analysis_status=Family.ANALYSIS_STATUS_ANALYSIS_IN_PROGRESS)

        return create_json_response(response)
    except Exception as e:
        traceback.print_exc()
        return create_json_response({'errors': [e.message or str(e)]}, status=400)
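An illustrative request for this endpoint, assembled from the fields the docstring lists; every value is a placeholder and the route is hypothetical:

import json
from django.test import Client

payload = {
    'sampleType': 'WES',
    'datasetType': 'VARIANTS',
    'elasticsearchIndex': 'my_callset_index',
    'datasetPath': '/data/my_callset.vcf.gz',
    'datasetName': 'My callset',
    'ignoreExtraSamplesInCallset': True,
    # 'mappingFile': {'uploadedFileId': '<temporary upload id>'},
}
client = Client()  # authentication with CAN_EDIT access omitted for brevity
response = client.post('/api/project/R0001_example/add_dataset',
                       data=json.dumps(payload), content_type='application/json')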
Example #6
def add_variants_dataset_handler(request, project_guid):
    """Create or update samples for the given variant dataset

    Args:
        request: Django request object
        project_guid (string): GUID of the project that should be updated

    HTTP POST
        Request body - should contain the following json structure:
        {
            'elasticsearchIndex': <String> (required)
            'ignoreExtraSamplesInCallset': <Boolean>
            'mappingFilePath':  <String>
        }

        Response body - will contain the following structure:

    """

    project = get_project_and_check_permissions(project_guid, request.user, permission_level=CAN_EDIT)
    request_json = json.loads(request.body)

    try:
        if 'elasticsearchIndex' not in request_json:
            raise ValueError('"elasticsearchIndex" is required')
        elasticsearch_index = request_json['elasticsearchIndex'].strip()

        sample_ids, index_metadata = get_elasticsearch_index_samples(elasticsearch_index)
        validate_index_metadata(index_metadata, project, elasticsearch_index)
        sample_type = index_metadata['sampleType']
        dataset_path = index_metadata['sourceFilePath']

        sample_id_to_individual_id_mapping = load_mapping_file(
            request_json['mappingFilePath']) if request_json.get('mappingFilePath') else {}

        matched_sample_id_to_sample_record = match_sample_ids_to_sample_records(
            project=project,
            sample_ids=sample_ids,
            sample_type=sample_type,
            dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS,
            elasticsearch_index=elasticsearch_index,
            sample_id_to_individual_id_mapping=sample_id_to_individual_id_mapping,
        )

        unmatched_samples = set(sample_ids) - set(matched_sample_id_to_sample_record.keys())

        if request_json.get('ignoreExtraSamplesInCallset'):
            if len(matched_sample_id_to_sample_record) == 0:
                raise Exception(
                    "None of the individuals or samples in the project matched the {} expected sample id(s)".format(
                        len(sample_ids)
                    ))
        elif len(unmatched_samples) > 0:
            raise Exception(
                'Matches not found for ES sample ids: {}. Upload a mapping file for these samples, or select the "Ignore extra samples in callset" checkbox to ignore them.'.format(
                    ", ".join(unmatched_samples)))

        prefetch_related_objects(matched_sample_id_to_sample_record.values(), 'individual__family')
        included_families = {sample.individual.family for sample in matched_sample_id_to_sample_record.values()}

        missing_individuals = Individual.objects.filter(
            family__in=included_families,
            sample__is_active=True,
            sample__dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS,
        ).exclude(sample__in=matched_sample_id_to_sample_record.values()).select_related('family')
        missing_family_individuals = defaultdict(list)
        for individual in missing_individuals:
            missing_family_individuals[individual.family].append(individual)

        if missing_family_individuals:
            raise Exception(
                'The following families are included in the callset but are missing some family members: {}.'.format(
                    ', '.join(
                        ['{} ({})'.format(family.family_id, ', '.join([i.individual_id for i in missing_indivs]))
                         for family, missing_indivs in missing_family_individuals.items()]
                )))

        inactivate_sample_guids = _update_variant_samples(
            matched_sample_id_to_sample_record, elasticsearch_index=elasticsearch_index, dataset_path=dataset_path
        )

    except Exception as e:
        traceback.print_exc()
        return create_json_response({'errors': [e.message or str(e)]}, status=400)

    if not matched_sample_id_to_sample_record:
        return create_json_response({'samplesByGuid': {}})

    update_project_from_json(project, {'has_new_search': True})
    reset_cached_search_results(project)

    update_xbrowse_vcfffiles(
        project, sample_type, elasticsearch_index, dataset_path, matched_sample_id_to_sample_record
    )

    families_to_update = [
        family for family in included_families if family.analysis_status == Family.ANALYSIS_STATUS_WAITING_FOR_DATA]
    for family in families_to_update:
        update_model_from_json(family, {'analysis_status': Family.ANALYSIS_STATUS_ANALYSIS_IN_PROGRESS})

    response_json = _get_samples_json(matched_sample_id_to_sample_record, inactivate_sample_guids, project_guid)
    response_json['familiesByGuid'] = {family.guid: {'analysisStatus': Family.ANALYSIS_STATUS_ANALYSIS_IN_PROGRESS}
                                       for family in families_to_update}
    return create_json_response(response_json)
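On success this handler returns the updated samples plus the per-family analysis-status changes applied above. An illustrative response shape, with invented GUIDs and a placeholder for the Family.ANALYSIS_STATUS_ANALYSIS_IN_PROGRESS code (the exact sample payload comes from _get_samples_json, which is not shown here):

# Sketch of the JSON response, not actual output:
example_response = {
    'samplesByGuid': {'S000001_na12878': {'sampleGuid': 'S000001_na12878'}},
    'familiesByGuid': {'F000001_1': {'analysisStatus': '<in-progress code>'}},
}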
Example #7
def add_variants_dataset_handler(request, project_guid):
    """Create or update samples for the given variant dataset

    Args:
        request: Django request object
        project_guid (string): GUID of the project that should be updated

    HTTP POST
        Request body - should contain the following json structure:
        {
            'elasticsearchIndex': <String> (required)
            'ignoreExtraSamplesInCallset': <Boolean>
            'ignoreMissingFamilyMembers': <Boolean>
            'mappingFilePath':  <String>
        }

        Response body - will contain the following structure:

    """

    project = get_project_and_check_permissions(project_guid, request.user, permission_level=CAN_EDIT)
    request_json = json.loads(request.body)

    try:
        if 'elasticsearchIndex' not in request_json:
            raise ValueError('"elasticsearchIndex" is required')
        elasticsearch_index = request_json['elasticsearchIndex'].strip()

        sample_ids, index_metadata = get_elasticsearch_index_samples(elasticsearch_index)
        validate_index_metadata(index_metadata, project, elasticsearch_index)
        sample_type = index_metadata['sampleType']
        dataset_path = index_metadata['sourceFilePath']

        sample_id_to_individual_id_mapping = load_mapping_file(
            request_json['mappingFilePath']) if request_json.get('mappingFilePath') else {}

        matched_sample_id_to_sample_record = match_sample_ids_to_sample_records(
            project=project,
            sample_ids=sample_ids,
            sample_type=sample_type,
            dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS,
            elasticsearch_index=elasticsearch_index,
            sample_id_to_individual_id_mapping=sample_id_to_individual_id_mapping,
        )

        unmatched_samples = set(sample_ids) - set(matched_sample_id_to_sample_record.keys())

        if request_json.get('ignoreExtraSamplesInCallset'):
            if len(matched_sample_id_to_sample_record) == 0:
                raise Exception(
                    "None of the individuals or samples in the project matched the {} expected sample id(s)".format(
                        len(sample_ids)
                    ))
        elif len(unmatched_samples) > 0:
            raise Exception(
                'Matches not found for ES sample ids: {}. Upload a mapping file for these samples, or select the "Ignore extra samples in callset" checkbox to ignore them.'.format(
                    ", ".join(unmatched_samples)))

        if not request_json.get('ignoreMissingFamilyMembers'):
            included_family_individuals = defaultdict(set)
            for sample in matched_sample_id_to_sample_record.values():
                included_family_individuals[sample.individual.family].add(sample.individual.individual_id)
            missing_family_individuals = []
            for family, individual_ids in included_family_individuals.items():
                missing_indivs = family.individual_set.filter(
                    sample__sample_status=Sample.SAMPLE_STATUS_LOADED,
                    sample__dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS
                ).exclude(individual_id__in=individual_ids)
                if missing_indivs:
                    missing_family_individuals.append(
                        '{} ({})'.format(family.family_id, ', '.join([i.individual_id for i in missing_indivs]))
                    )
            if missing_family_individuals:
                raise Exception(
                    'The following families are included in the callset but are missing some family members: {}. This can lead to errors in variant search. If you still want to upload this callset, select the "Ignore missing family members" checkbox.'.format(
                        ', '.join(missing_family_individuals)
                    ))

        _update_samples(
            matched_sample_id_to_sample_record, elasticsearch_index=elasticsearch_index, dataset_path=dataset_path
        )

    except Exception as e:
        traceback.print_exc()
        return create_json_response({'errors': [e.message or str(e)]}, status=400)

    if not matched_sample_id_to_sample_record:
        return create_json_response({'samplesByGuid': {}})

    update_project_from_json(project, {'has_new_search': True})
    reset_cached_search_results(project)

    _deprecated_update_vcfffiles(
        project, sample_type, elasticsearch_index, dataset_path, matched_sample_id_to_sample_record
    )

    return create_json_response(_get_samples_json(matched_sample_id_to_sample_record, project_guid))
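This variant differs from Example #6 mainly in the ignoreMissingFamilyMembers escape hatch checked before _update_samples. A minimal payload sketch highlighting the flags the handler reads; the values and the optional mapping-file path are placeholders:

payload = {
    # Required; the handler strips surrounding whitespace.
    'elasticsearchIndex': 'my_callset_index',
    # When true, unmatched callset samples no longer fail the request.
    'ignoreExtraSamplesInCallset': False,
    # When true, the family-completeness check is skipped entirely.
    'ignoreMissingFamilyMembers': True,
    # 'mappingFilePath': '<optional sample-id-to-individual-id mapping file>',
}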
Example #8
    def handle(self, *args, **options):
        """transfer project"""
        project_arg = options['project']
        elasticsearch_index = options['es_index']

        project = Project.objects.get(Q(name=project_arg) | Q(guid=project_arg))
        logger.info('Updating project genome version for {}'.format(project.name))

        # Validate the provided index
        logger.info('Validating es index {}'.format(elasticsearch_index))
        sample_ids, index_metadata = get_elasticsearch_index_samples(elasticsearch_index)
        validate_index_metadata(index_metadata, project, elasticsearch_index, genome_version=GENOME_VERSION_GRCh38)
        sample_type = index_metadata['sampleType']
        dataset_path = index_metadata['sourceFilePath']

        matched_sample_id_to_sample_record = match_sample_ids_to_sample_records(
            project=project,
            sample_ids=sample_ids,
            sample_type=sample_type,
            dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS,
            elasticsearch_index=elasticsearch_index,
            sample_id_to_individual_id_mapping={},
        )

        unmatched_samples = set(sample_ids) - set(matched_sample_id_to_sample_record.keys())
        if len(unmatched_samples) > 0:
            raise Exception('Matches not found for ES sample ids: {}.'.format(', '.join(unmatched_samples)))

        included_family_individuals = defaultdict(set)
        individual_guids_by_id = {}
        for sample in matched_sample_id_to_sample_record.values():
            included_family_individuals[sample.individual.family].add(sample.individual.individual_id)
            individual_guids_by_id[sample.individual.individual_id] = sample.individual.guid
        missing_family_individuals = []
        for family, individual_ids in included_family_individuals.items():
            missing_indivs = family.individual_set.filter(
                sample__sample_status=Sample.SAMPLE_STATUS_LOADED,
                sample__dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS
            ).exclude(individual_id__in=individual_ids)
            if missing_indivs:
                missing_family_individuals.append(
                    '{} ({})'.format(family.family_id, ', '.join([i.individual_id for i in missing_indivs]))
                )
        if missing_family_individuals:
            raise Exception(
                'The following families are included in the callset but are missing some family members: {}.'.format(
                    ', '.join(missing_family_individuals)
                ))

        # Get and clean up expected saved variants
        saved_variant_models_by_guid = {v.guid: v for v in SavedVariant.objects.filter(project=project)}
        deleted_no_family = set()
        deleted_no_tags = set()
        for guid, variant in saved_variant_models_by_guid.items():
            if not variant.family:
                deleted_no_family.add(guid)
            elif not (variant.varianttag_set.count() or variant.variantnote_set.count()):
                deleted_no_tags.add(guid)

        if deleted_no_family:
            if raw_input('Do you want to delete the following {} saved variants with no family (y/n)?: {} '.format(
                    len(deleted_no_family), ', '.join(deleted_no_family))) == 'y':
                for guid in deleted_no_family:
                    saved_variant_models_by_guid.pop(guid).delete()
                logger.info('Deleted {} variants'.format(len(deleted_no_family)))

        if deleted_no_tags:
            if raw_input('Do you want to delete the following {} saved variants with no tags (y/n)?: {} '.format(
                    len(deleted_no_tags), ', '.join(deleted_no_tags))) == 'y':
                for guid in deleted_no_tags:
                    saved_variant_models_by_guid.pop(guid).delete()
                logger.info('Deleted {} variants'.format(len(deleted_no_tags)))

        expected_families = {sv.family for sv in saved_variant_models_by_guid.values()}
        missing_families = expected_families - set(included_family_individuals.keys())
        if missing_families:
            raise Exception(
                'The following families have saved variants but are missing from the callset: {}.'.format(
                    ', '.join([f.family_id for f in missing_families])
                ))

        # Lift-over saved variants
        saved_variants = get_json_for_saved_variants(
            saved_variant_models_by_guid.values(), add_details=True, project=project,
            individual_guids_by_id=individual_guids_by_id)
        saved_variants_to_lift = [v for v in saved_variants if v['genomeVersion'] != GENOME_VERSION_GRCh38]

        num_already_lifted = len(saved_variants) - len(saved_variants_to_lift)
        if num_already_lifted:
            if raw_input('Found {} saved variants already on Hg38. Continue with liftover (y/n)?'.format(num_already_lifted)) != 'y':
                raise Exception('Error: found {} saved variants already on Hg38'.format(num_already_lifted))
        logger.info('Lifting over {} variants (skipping {} that are already lifted)'.format(
            len(saved_variants_to_lift), num_already_lifted))

        liftover_to_38 = LiftOver('hg19', 'hg38')
        hg37_to_hg38_xpos = {}
        lift_failed = set()
        for v in saved_variants_to_lift:
            if not (hg37_to_hg38_xpos.get(v['xpos']) or v['xpos'] in lift_failed):
                hg38_coord = liftover_to_38.convert_coordinate('chr{}'.format(v['chrom'].lstrip('chr')), int(v['pos']))
                if hg38_coord and hg38_coord[0]:
                    hg37_to_hg38_xpos[v['xpos']] = get_xpos(hg38_coord[0][0], hg38_coord[0][1])
                else:
                    lift_failed.add(v['xpos'])

        if lift_failed:
            raise Exception(
                'Unable to lift over the following {} coordinates: {}'.format(
                    len(lift_failed), ', '.join([str(xpos) for xpos in lift_failed])))

        saved_variants_map = defaultdict(list)
        for v in saved_variants_to_lift:
            variant_model = saved_variant_models_by_guid[v['variantGuid']]
            saved_variants_map[(hg37_to_hg38_xpos[v['xpos']], v['ref'], v['alt'])].append(variant_model)

        es_variants = get_es_variants_for_variant_tuples(expected_families, saved_variants_map.keys())

        missing_variants = set(saved_variants_map.keys()) - {(v['xpos'], v['ref'], v['alt']) for v in es_variants}
        if missing_variants:
            missing_variant_strings = ['{}-{}-{} ({})'.format(
                xpos, ref, alt,
                ', '.join(['{}: {}'.format(v.family.family_id, v.guid) for v in saved_variants_map[(xpos, ref, alt)]]))
                for xpos, ref, alt in missing_variants]
            if raw_input('Unable to find the following {} variants in the index. Continue with update (y/n)?: {} '.format(
                    len(missing_variants), ', '.join(missing_variant_strings))) != 'y':
                raise Exception('Error: unable to find {} lifted-over variants'.format(len(missing_variants)))

        logger.info('Successfully lifted over {} variants'.format(len(es_variants)))

        #  Update saved variants
        for var in es_variants:
            saved_variant_models = saved_variants_map[(var['xpos'], var['ref'], var['alt'])]
            missing_families = [v.family.guid for v in saved_variant_models if v.family.guid not in var['familyGuids']]
            if missing_families:
                raise Exception('Error with variant {}:{}-{}-{} not found for expected families {}; found in families {}'.format(
                    var['chrom'], var['pos'], var['ref'], var['alt'], ', '.join(missing_families), ', '.join(var['familyGuids'])
                ))
            for saved_variant in saved_variant_models:
                saved_variant.xpos_start = var['xpos']
                saved_variant.saved_variant_json = json.dumps(var)
                saved_variant.save()

        logger.info('Successfully updated {} variants'.format(len(es_variants)))

        # Update project and sample data
        update_model_from_json(project, {'genome_version': GENOME_VERSION_GRCh38, 'has_new_search': True})
        _update_samples(
            matched_sample_id_to_sample_record, elasticsearch_index=elasticsearch_index, dataset_path=dataset_path
        )
        update_xbrowse_vcfffiles(
            project, sample_type, elasticsearch_index, dataset_path, matched_sample_id_to_sample_record
        )

        reset_cached_search_results(project)

        logger.info('---Done---')
        logger.info('Successfully lifted over {} variants. Skipped {} failed variants.'.format(
            len(es_variants), len(missing_variants)))