def _get_json_for_user(user, is_anvil=None, fields=None):
    """Serialize a Django User into a camelCased JSON-compatible dict.

    Args:
        user (object): Django user model (or the lazy wrapper used for request.user)
        is_anvil (bool): optional pre-computed AnVIL-auth flag, forwarded to computed fields
        fields (list): optional whitelist restricting which fields are serialized
    Returns:
        dict: json object
    """
    # request.user is a lazy wrapper that stores the real User object in ._wrapped
    if hasattr(user, '_wrapped'):
        user = user._wrapped

    if fields:
        model_fields = [f for f in fields if f in MODEL_USER_FIELDS]
        computed_fields = [f for f in fields if f in COMPUTED_USER_FIELDS]
    else:
        model_fields = MODEL_USER_FIELDS
        computed_fields = COMPUTED_USER_FIELDS

    user_json = {}
    # plain model attributes are read directly off the user object
    for field in model_fields:
        user_json[_to_camel_case(field)] = getattr(user, field)
    # computed fields are derived via the callables registered in COMPUTED_USER_FIELDS
    for field in computed_fields:
        user_json[_to_camel_case(field)] = COMPUTED_USER_FIELDS[field](user, is_anvil=is_anvil)
    return user_json
def _get_json_for_models(models, nested_fields=None, user=None, process_result=None, guid_key=None):
    """Returns an array JSON representations of the given models.

    Args:
        models (array): Array of django models (all assumed to be of the same model class)
        user (object): Django User object for determining whether to include restricted/internal-only fields
        nested_fields (array): Optional array of fields to get from the model that are nested on related objects.
            Each entry is a dict with 'fields' (attribute chain), optional 'key' (output key), and optional
            'value' (pre-computed value that skips the DB traversal entirely).
        process_result (lambda): Optional function to post-process a given model json
        guid_key (string): Optional key to use for the model's guid
    Returns:
        array: json objects
    """
    if not models:
        return []

    # all models are serialized with the field list of the first model's class
    model_class = type(models[0])
    fields = copy(model_class._meta.json_fields)
    # staff users additionally see internal-only fields, when the model declares any
    if user and user.is_staff:
        fields += getattr(model_class._meta, 'internal_json_fields', [])

    # prefetch the related-object chains that will actually be traversed (those without a
    # pre-computed 'value') to avoid N+1 queries in the per-model loop below
    for nested_field in nested_fields or []:
        if not nested_field.get('value'):
            prefetch_related_objects(models, '__'.join(nested_field['fields'][:-1]))

    results = []
    for model in models:
        result = {_to_camel_case(field): getattr(model, field) for field in fields}
        for nested_field in (nested_fields or []):
            field_value = nested_field.get('value')
            if not field_value:
                # walk the attribute chain, short-circuiting to None if any link is missing
                field_value = model
                for field in nested_field['fields']:
                    field_value = getattr(field_value, field) if field_value else None
            # output key defaults to the camelCased joined attribute chain
            result[nested_field.get('key', _to_camel_case('_'.join(nested_field['fields])))] = field_value
        if result.get('guid'):
            # e.g. Project -> 'projectGuid'; computed once and reused for subsequent models
            guid_key = guid_key or '{}{}Guid'.format(model_class.__name__[0].lower(), model_class.__name__[1:])
            result[guid_key] = result.pop('guid')
        if result.get('createdBy'):
            # createdBy is a User model; render it as a display name (or email as fallback)
            result['createdBy'] = result['createdBy'].get_full_name() or result['createdBy'].email
        if process_result:
            process_result(result, model)
        results.append(result)
    return results
def elasticsearch_status(request):
    """Return a JSON summary of the elasticsearch cluster: disk allocation, variant indices
    (with their metadata and the seqr projects that use them), and errors for indices that
    are referenced by active samples but no longer exist.
    """
    client = get_es_client()

    # per-node disk allocation from the `cat` API, with dotted column names camelCased
    disk_fields = ['node', 'shards', 'disk.avail', 'disk.used', 'disk.percent']
    disk_status = [{
        _to_camel_case(field.replace('.', '_')): disk[field] for field in disk_fields
    } for disk in client.cat.allocation(format="json", h=','.join(disk_fields))]

    # all indices except internal ones (dot-prefixed) and the operations log
    index_fields = ['index', 'docs.count', 'store.size', 'creation.date.string']
    indices = [{
        _to_camel_case(field.replace('.', '_')): index[field] for field in index_fields
    } for index in client.cat.indices(format="json", h=','.join(index_fields))
        if all(not index['index'].startswith(omit_prefix) for omit_prefix in ['.', 'index_operations_log'])]

    # map alias name -> list of concrete index names it points at
    aliases = defaultdict(list)
    for alias in client.cat.aliases(format="json", h='alias,index'):
        aliases[alias['alias']].append(alias['index'])

    index_metadata = get_index_metadata('_all', client, use_cache=False)

    active_samples = Sample.objects.filter(is_active=True).select_related('individual__family__project')

    # index name -> {project -> set of individual guids with an active sample in that index}
    seqr_index_projects = defaultdict(lambda: defaultdict(set))
    es_projects = set()
    for sample in active_samples:
        # a sample may reference several comma-separated indices
        for index_name in sample.elasticsearch_index.split(','):
            project = sample.individual.family.project
            es_projects.add(project)
            if index_name in aliases:
                # expand aliases to the concrete indices they resolve to
                for aliased_index_name in aliases[index_name]:
                    seqr_index_projects[aliased_index_name][project].add(sample.individual.guid)
            else:
                # stripped trailing '*' presumably matches wildcard index references -- TODO confirm
                seqr_index_projects[index_name.rstrip('*')][project].add(sample.individual.guid)

    for index in indices:
        index_name = index['index']
        index.update(index_metadata[index_name])
        projects_for_index = []
        # snapshot the keys with list() because matched entries are popped inside the loop;
        # whatever remains in seqr_index_projects afterwards has no live index
        for index_prefix in list(seqr_index_projects.keys()):
            if index_name.startswith(index_prefix):
                projects_for_index += list(seqr_index_projects.pop(index_prefix).keys())
        index['projects'] = [{'projectGuid': project.guid, 'projectName': project.name} for project in projects_for_index]

    # any index still present here is referenced by active samples but absent from ES
    errors = ['{} does not exist and is used by project(s) {}'.format(
        index, ', '.join(['{} ({} samples)'.format(p.name, len(indivs)) for p, indivs in project_individuals.items()])
    ) for index, project_individuals in seqr_index_projects.items() if project_individuals]

    return create_json_response({
        'indices': indices,
        'diskStats': disk_status,
        'elasticsearchHost': ELASTICSEARCH_SERVER,
        'errors': errors,
    })
def _get_es_meta(client, meta_type, fields, filter_rows=None):
    """Fetch rows from an elasticsearch `cat` API and camelCase the requested columns.

    Args:
        client: elasticsearch client
        meta_type (string): name of the `client.cat` method to call (e.g. 'indices', 'allocation')
        fields (list): column names to request and include in each returned row
        filter_rows (callable): optional predicate; rows for which it returns falsy are dropped
    Returns:
        list: one dict per kept row, with dotted column names converted to camelCase keys
    """
    cat_method = getattr(client.cat, meta_type)
    rows = []
    for raw_row in cat_method(format="json", h=','.join(fields)):
        if filter_rows is not None and not filter_rows(raw_row):
            continue
        rows.append({_to_camel_case(field.replace('.', '_')): raw_row[field] for field in fields})
    return rows
def _get_json_for_individuals(individuals, user=None, project_guid=None, family_guid=None, add_sample_guids_field=False, family_fields=None):
    """Returns a JSON representation for the given list of Individuals.

    Args:
        individuals (array): array of django models for the individual.
        user (object): Django User object for determining whether to include restricted/internal-only fields
        project_guid (string): An optional field to use as the projectGuid instead of querying the DB
        family_guid (boolean): An optional field to use as the familyGuid instead of querying the DB
        add_sample_guids_field (boolean): A flag to indicate whether sample ids should be added
        family_fields (array): optional extra family attributes to surface on each individual json
    Returns:
        array: array of json objects
    """

    def _get_case_review_status_modified_by(modified_by):
        # User objects become an email (or username fallback); anything else passes through unchanged
        return modified_by.email or modified_by.username if hasattr(modified_by, 'email') else modified_by

    def _load_phenotips_data(phenotips_data):
        # best-effort JSON parse: a malformed payload is logged and rendered as None rather than failing
        phenotips_json = None
        if phenotips_data:
            try:
                phenotips_json = json.loads(phenotips_data)
            except Exception as e:
                logger.error("Couldn't parse phenotips: {}".format(e))
        return phenotips_json

    def _process_result(result, individual):
        # replace the raw mother/father model references with flat guid/id fields
        mother = result.pop('mother', None)
        father = result.pop('father', None)
        result.update({
            'caseReviewStatusLastModifiedBy': _get_case_review_status_modified_by(result.get('caseReviewStatusLastModifiedBy')),
            'phenotipsData': _load_phenotips_data(result['phenotipsData']),
            'maternalGuid': mother.guid if mother else None,
            'paternalGuid': father.guid if father else None,
            'maternalId': mother.individual_id if mother else None,
            'paternalId': father.individual_id if father else None,
            'displayName': result['displayName'] or result['individualId'],
        })
        if add_sample_guids_field:
            result['sampleGuids'] = [s.guid for s in individual.sample_set.all()]

    # family/project guids are read via nested-field traversal unless pre-computed values were supplied
    nested_fields = [
        {'fields': ('family', 'guid'), 'value': family_guid},
        {'fields': ('family', 'project', 'guid'), 'key': 'projectGuid', 'value': project_guid},
    ]
    if family_fields:
        for field in family_fields:
            nested_fields.append({'fields': ('family', field), 'key': _to_camel_case(field)})

    # prefetch every relation touched during serialization to avoid N+1 queries
    prefetch_related_objects(individuals, 'family')
    prefetch_related_objects(individuals, 'mother')
    prefetch_related_objects(individuals, 'father')
    prefetch_related_objects(individuals, 'case_review_status_last_modified_by')
    if add_sample_guids_field:
        prefetch_related_objects(individuals, 'sample_set')

    return _get_json_for_models(individuals, nested_fields=nested_fields, user=user, process_result=_process_result)
def _get_json_for_individuals(individuals, user=None, project_guid=None, family_guid=None, add_sample_guids_field=False, family_fields=None):
    """Returns a JSON representation for the given list of Individuals.

    Args:
        individuals (array): array of django models for the individual.
        user (object): Django User object for determining whether to include restricted/internal-only fields
        project_guid (string): An optional field to use as the projectGuid instead of querying the DB
        family_guid (boolean): An optional field to use as the familyGuid instead of querying the DB
        add_sample_guids_field (boolean): A flag to indicate whether sample ids should be added
        family_fields (array): optional extra family attributes to surface on each individual json
    Returns:
        array: array of json objects
    """

    def _modified_by_display(modified_by):
        # User objects render as email (falling back to username); plain values pass through
        if hasattr(modified_by, 'email'):
            return modified_by.email or modified_by.username
        return modified_by

    def _parse_phenotips(phenotips_data):
        # best-effort parse: malformed payloads are logged and rendered as None
        if not phenotips_data:
            return None
        try:
            return json.loads(phenotips_data)
        except Exception as e:
            logger.error("Couldn't parse phenotips: {}".format(e))
            return None

    def _post_process(result, individual):
        # flatten parent model references into guid/id fields
        mother = result.pop('mother', None)
        father = result.pop('father', None)
        result['caseReviewStatusLastModifiedBy'] = _modified_by_display(result.get('caseReviewStatusLastModifiedBy'))
        result['phenotipsData'] = _parse_phenotips(result['phenotipsData'])
        result['maternalGuid'] = mother.guid if mother else None
        result['paternalGuid'] = father.guid if father else None
        result['maternalId'] = mother.individual_id if mother else None
        result['paternalId'] = father.individual_id if father else None
        result['displayName'] = result['displayName'] or result['individualId']
        if add_sample_guids_field:
            result['sampleGuids'] = [s.guid for s in individual.sample_set.all()]

    # family/project guids come from nested traversal unless explicit values were provided
    nested_fields = [
        {'fields': ('family', 'guid'), 'value': family_guid},
        {'fields': ('family', 'project', 'guid'), 'key': 'projectGuid', 'value': project_guid},
    ]
    for field in family_fields or []:
        nested_fields.append({'fields': ('family', field), 'key': _to_camel_case(field)})

    # prefetch relations read during serialization to avoid N+1 queries
    prefetch_related_objects(individuals, 'mother')
    prefetch_related_objects(individuals, 'father')
    prefetch_related_objects(individuals, 'case_review_status_last_modified_by')
    if add_sample_guids_field:
        prefetch_related_objects(individuals, 'sample_set')

    return _get_json_for_models(individuals, nested_fields=nested_fields, user=user, process_result=_post_process)
def _get_json_for_models(models, nested_fields=None, user=None, process_result=None, guid_key=None):
    """Serialize django models of a single class into camelCased JSON dicts.

    Args:
        models (array): Array of django models (all of the same model class)
        user (object): staff users additionally get the model's internal-only fields
        nested_fields (array): optional related-object fields; each entry has 'fields' (attribute
            chain), optional 'key' (output key), and optional 'value' (pre-computed value)
        process_result (lambda): optional per-model post-processing hook
        guid_key (string): optional key to use for the model's guid
    Returns:
        array: json objects
    """
    if not models:
        return []

    model_class = type(models[0])
    fields = copy(model_class._meta.json_fields)
    if user and user.is_staff:
        fields += getattr(model_class._meta, 'internal_json_fields', [])

    results = []
    for model in models:
        result = {}
        for field in fields:
            result[_to_camel_case(field)] = getattr(model, field)

        for nested_field in nested_fields or []:
            value = nested_field.get('value')
            if not value:
                # walk the attribute chain, collapsing to None if any link is missing
                value = model
                for attr in nested_field['fields']:
                    value = getattr(value, attr) if value else None
            key = nested_field.get('key', _to_camel_case('_'.join(nested_field['fields'])))
            result[key] = value

        if result.get('guid'):
            # default guid key, e.g. Project -> 'projectGuid'; computed once then reused
            if not guid_key:
                guid_key = '{}{}Guid'.format(model_class.__name__[0].lower(), model_class.__name__[1:])
            result[guid_key] = result.pop('guid')

        created_by = result.get('createdBy')
        if created_by:
            # createdBy is a User model; render a human-readable name with email fallback
            result['createdBy'] = created_by.get_full_name() or created_by.email

        if process_result:
            process_result(result, model)
        results.append(result)
    return results
def _add_sample_type_counts(cursor, projects_by_guid):
    """Retrieves per-project counts of loaded variant-call samples, grouped by sample type,
    and adds these to each project in the 'projects_by_guid' dictionary (as a
    'sampleTypeCounts' sub-dict mapping sample_type -> count of distinct individuals).

    Args:
        cursor: connected database cursor that can be used to execute SQL queries.
        projects_by_guid (dict): projects for which to add sample type counts; mutated in place.
    """
    if len(projects_by_guid) == 0:
        return {}

    # raw SQL to compute all counts in one query; interpolated values come from internal
    # guids and model constants, not user input
    sample_type_counts_query = """
      SELECT
        p.guid AS project_guid,
        s.sample_type AS sample_type,
        COUNT(distinct s.individual_id) AS num_samples
      FROM seqr_sample AS s
        JOIN seqr_individual AS i ON s.individual_id=i.id
        JOIN seqr_family AS f ON i.family_id=f.id
        JOIN seqr_project AS p ON f.project_id=p.id
      {projects_WHERE_clause}
        AND dataset_type='{variant_dataset_type}'
        AND sample_status='{loaded_sample_status}'
      GROUP BY p.guid, s.sample_type
    """.strip().format(
        projects_WHERE_clause=_to_WHERE_clause([guid for guid in projects_by_guid]),
        variant_dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS,
        loaded_sample_status=Sample.SAMPLE_STATUS_LOADED,
    )

    cursor.execute(sample_type_counts_query)

    columns = [_to_camel_case(col[0]) for col in cursor.description]
    for row in cursor.fetchall():
        record = dict(zip(columns, row))
        project_guid = record['projectGuid']
        sample_type = record['sampleType']
        num_samples = record['numSamples']
        if project_guid not in projects_by_guid:
            continue  # defensive programming
        if 'sampleTypeCounts' not in projects_by_guid[project_guid]:
            projects_by_guid[project_guid]['sampleTypeCounts'] = {}
        projects_by_guid[project_guid]['sampleTypeCounts'][sample_type] = num_samples
def get_json_for_sample_dict(sample_dict):
    """Convert a raw Sample row dict (with 'sample_'-prefixed keys) into camelCased JSON.

    Args:
        sample_dict (dict): dictionary representation for the Sample, with keys like
            'sample_<field>' plus 'project_guid' and 'individual_guid'.
    Returns:
        dict: json object
    """
    result = {}
    for field in Sample._meta.json_fields:
        result[_to_camel_case(field)] = sample_dict.get('sample_{}'.format(field))

    # the generic 'guid' entry is re-keyed, and the parent guids are carried over verbatim
    result['projectGuid'] = sample_dict['project_guid']
    result['individualGuid'] = sample_dict['individual_guid']
    result['sampleGuid'] = result.pop('guid')
    return result
def _add_sample_type_counts(cursor, projects_by_guid, user_is_staff=False):
    """Retrieves per-project sample counts, grouped by sample type, and adds these to each
    project in the 'projects_by_guid' dictionary (as a 'sampleTypeCounts' sub-dict mapping
    sample_type -> num_samples).

    Args:
        cursor: connected database cursor that can be used to execute SQL queries.
        projects_by_guid (dict): projects for which to add sample type counts; mutated in place.
        user_is_staff (bool): staff users can see all projects, so the per-project WHERE
            clause is omitted for them.
    """
    if len(projects_by_guid) == 0:
        return {}

    # restrict to the requested projects only for non-staff users
    projects_WHERE_clause = ""
    if not user_is_staff:
        projects_WHERE_clause = _to_WHERE_clause([guid for guid in projects_by_guid])

    # Explicit .format(...) instead of the previous `% locals()` interpolation, which
    # silently coupled the SQL template to local variable names (renaming a variable
    # would break the query at runtime). Interpolated guids are internal, not user input.
    sample_type_counts_query = """
      SELECT
        p.guid AS project_guid,
        s.sample_type AS sample_type,
        COUNT(*) AS num_samples
      FROM seqr_sample AS s
        JOIN seqr_individual AS i ON s.individual_id=i.id
        JOIN seqr_family AS f ON i.family_id=f.id
        JOIN seqr_project AS p ON f.project_id=p.id
      {projects_WHERE_clause}
      GROUP BY p.guid, s.sample_type
    """.strip().format(projects_WHERE_clause=projects_WHERE_clause)

    cursor.execute(sample_type_counts_query)

    columns = [_to_camel_case(col[0]) for col in cursor.description]
    for row in cursor.fetchall():
        record = dict(zip(columns, row))
        project_guid = record['projectGuid']
        sample_type = record['sampleType']
        num_samples = record['numSamples']
        if project_guid not in projects_by_guid:
            continue  # defensive: the query may return projects outside the requested set
        projects_by_guid[project_guid].setdefault('sampleTypeCounts', {})[sample_type] = num_samples
def _add_sample_type_counts(cursor, projects_by_guid):
    """Attach per-project counts of loaded variant-call samples, keyed by sample type.

    Args:
        cursor: connected database cursor used to run the aggregate query.
        projects_by_guid (dict): project json dicts keyed by guid; mutated in place by adding
            a 'sampleTypeCounts' sub-dict ({sample_type: count of distinct individuals}).
    """
    if len(projects_by_guid) == 0:
        return {}

    # one aggregate query for all projects; interpolated values are internal guids and
    # model constants, not user input
    where_clause = _to_WHERE_clause([guid for guid in projects_by_guid])
    sample_type_counts_query = """
      SELECT
        p.guid AS project_guid,
        s.sample_type AS sample_type,
        COUNT(distinct s.individual_id) AS num_samples
      FROM seqr_sample AS s
        JOIN seqr_individual AS i ON s.individual_id=i.id
        JOIN seqr_family AS f ON i.family_id=f.id
        JOIN seqr_project AS p ON f.project_id=p.id
      {projects_WHERE_clause}
        AND dataset_type='{variant_dataset_type}'
        AND sample_status='{loaded_sample_status}'
      GROUP BY p.guid, s.sample_type
    """.strip().format(
        projects_WHERE_clause=where_clause,
        variant_dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS,
        loaded_sample_status=Sample.SAMPLE_STATUS_LOADED,
    )

    cursor.execute(sample_type_counts_query)

    column_names = [_to_camel_case(col[0]) for col in cursor.description]
    for row in cursor.fetchall():
        record = dict(zip(column_names, row))
        guid = record['projectGuid']
        if guid not in projects_by_guid:
            # defensive programming: skip rows for projects outside the requested set
            continue
        counts = projects_by_guid[guid].setdefault('sampleTypeCounts', {})
        counts[record['sampleType']] = record['numSamples']
def _get_json_for_user(user):
    """Returns JSON representation of the given User object

    Args:
        user (object): Django user model
    Returns:
        dict: json object
    """
    # request.user is a lazy wrapper; unwrap to get the underlying Django User
    if hasattr(user, '_wrapped'):
        user = user._wrapped

    exported_fields = ['username', 'email', 'first_name', 'last_name', 'last_login', 'is_staff', 'date_joined', 'id']
    user_json = {}
    for field in exported_fields:
        user_json[_to_camel_case(field)] = getattr(user, field)
    user_json['displayName'] = user.get_full_name()
    return user_json
def _get_json_for_user(user):
    """Returns JSON representation of the given User object

    Args:
        user (object): Django user model
    Returns:
        dict: json object
    """
    if hasattr(user, '_wrapped'):
        # Django request.user stores the real User object in a ._wrapped attribute
        user = user._wrapped

    result = {
        _to_camel_case(attr): getattr(user, attr)
        for attr in ('username', 'email', 'first_name', 'last_name', 'last_login', 'is_staff', 'date_joined')
    }
    result['displayName'] = user.get_full_name()
    return result
def _get_json_for_individuals(individuals, user=None, project_guid=None, family_guid=None, add_sample_guids_field=False,
                              family_fields=None, skip_nested=False, add_hpo_details=False, is_analyst=None, has_case_review_perm=None):
    """Returns a JSON representation for the given list of Individuals.

    Args:
        individuals (array): array of django models for the individual.
        user (object): Django User object for determining whether to include restricted/internal-only fields
        project_guid (string): An optional field to use as the projectGuid instead of querying the DB
        family_guid (boolean): An optional field to use as the familyGuid instead of querying the DB
        add_sample_guids_field (boolean): A flag to indicate whether sample ids should be added
        family_fields (array): optional extra family attributes to surface on each individual json
        skip_nested (boolean): when true (and no project_guid), emit a flat 'family_id' field
            instead of traversing family/project relations
        add_hpo_details (boolean): when true, feature dicts are enriched with HPO category/label
        is_analyst (bool): optional pre-computed analyst flag, forwarded to _get_json_for_models
        has_case_review_perm (bool): controls which case-review fields are included
    Returns:
        array: array of json objects
    """
    if not individuals:
        return []

    def _get_case_review_status_modified_by(modified_by):
        # User objects become an email (or username fallback); plain values pass through unchanged
        return modified_by.email or modified_by.username if hasattr(modified_by, 'email') else modified_by

    def _process_result(result, individual):
        # flatten mother/father model references into guid/id fields
        mother = result.pop('mother', None)
        father = result.pop('father', None)
        result.update({
            'caseReviewStatusLastModifiedBy': _get_case_review_status_modified_by(result.get('caseReviewStatusLastModifiedBy')),
            'maternalGuid': mother.guid if mother else None,
            'paternalGuid': father.guid if father else None,
            'maternalId': mother.individual_id if mother else None,
            'paternalId': father.individual_id if father else None,
            'displayName': result['displayName'] or result['individualId'],
        })
        if add_sample_guids_field:
            result['sampleGuids'] = [s.guid for s in individual.sample_set.all()]
            result['igvSampleGuids'] = [s.guid for s in individual.igvsample_set.all()]

    # case-review fields depend on the caller's permissions for the individuals' project
    kwargs = {
        'additional_model_fields': _get_case_review_fields(individuals[0], has_case_review_perm, user, lambda indiv: indiv.family.project)
    }
    if project_guid or not skip_nested:
        # traverse family/project relations (unless explicit guid values short-circuit them)
        nested_fields = [
            {'fields': ('family', 'guid'), 'value': family_guid},
            {'fields': ('family', 'project', 'guid'), 'key': 'projectGuid', 'value': project_guid},
        ]
        if family_fields:
            for field in family_fields:
                nested_fields.append({'fields': ('family', field), 'key': _to_camel_case(field)})
        kwargs.update({'nested_fields': nested_fields})
    else:
        # flat serialization: expose the raw foreign key instead of nested family data
        kwargs['additional_model_fields'].append('family_id')

    if add_hpo_details:
        kwargs['additional_model_fields'] += [
            'features', 'absent_features', 'nonstandard_features', 'absent_nonstandard_features'
        ]

    # prefetch only the relations that will actually be read during serialization
    prefetch_related_objects(individuals, 'mother')
    prefetch_related_objects(individuals, 'father')
    if 'case_review_status_last_modified_by' in kwargs['additional_model_fields']:
        prefetch_related_objects(individuals, 'case_review_status_last_modified_by')
    if add_sample_guids_field:
        prefetch_related_objects(individuals, 'sample_set')
        prefetch_related_objects(individuals, 'igvsample_set')

    parsed_individuals = _get_json_for_models(individuals, user=user, is_analyst=is_analyst, process_result=_process_result, **kwargs)

    if add_hpo_details:
        # single bulk lookup of all referenced HPO terms, then annotate each feature in place
        all_hpo_ids = set()
        for i in parsed_individuals:
            all_hpo_ids.update([feature['id'] for feature in i.get('features') or []])
            all_hpo_ids.update([feature['id'] for feature in i.get('absentFeatures') or []])
        hpo_terms_by_id = {hpo.hpo_id: hpo for hpo in HumanPhenotypeOntology.objects.filter(hpo_id__in=all_hpo_ids)}
        for i in parsed_individuals:
            for feature in i.get('features') or []:
                hpo = hpo_terms_by_id.get(feature['id'])
                if hpo:
                    feature.update({'category': hpo.category_id, 'label': hpo.name})
            for feature in i.get('absentFeatures') or []:
                hpo = hpo_terms_by_id.get(feature['id'])
                if hpo:
                    feature.update({'category': hpo.category_id, 'label': hpo.name})

    return parsed_individuals
def _get_empty_json_for_model(model_class):
    """Return a camelCased json dict for the given model class with every field set to None."""
    empty = {}
    for field in model_class._meta.json_fields:
        empty[_to_camel_case(field)] = None
    return empty
'is_anvil': lambda user, is_anvil=None, **kwargs: is_anvil_authenticated(user) if is_anvil is None else is_anvil, 'display_name': lambda user, **kwargs: user.get_full_name(), 'is_analyst': lambda user, analyst_users=None, **kwargs: user in analyst_users if analyst_users is not None else user_is_analyst(user), 'is_data_manager': lambda user, **kwargs: user_is_data_manager(user), 'is_pm': lambda user, pm_users=None, **kwargs: user in pm_users if pm_users is not None else user_is_pm(user), } DEFAULT_USER = {_to_camel_case(field): '' for field in MAIN_USER_FIELDS} DEFAULT_USER.update( {_to_camel_case(field): val for field, val in BOOL_USER_FIELDS.items()}) DEFAULT_USER.update( {_to_camel_case(field): False for field in COMPUTED_USER_FIELDS.keys()}) def _get_json_for_user(user, is_anvil=None, fields=None, analyst_users=None, pm_users=None): """Returns JSON representation of the given User object
def _retrieve_projects_by_guid(cursor, projects_user_can_view, projects_user_can_edit):
    """Retrieves all relevant metadata for each project from the database, and returns a
    'projects_by_guid' dictionary.

    Args:
        cursor: connected database cursor that can be used to execute SQL queries.
        projects_user_can_view (list): list of Django Project objects for which the user has CAN_VIEW permissions.
        projects_user_can_edit (list): list of Django Project objects for which the user has CAN_EDIT permissions.
    Returns:
        Dictionary that maps each project's GUID to a dictionary of key-value pairs representing
        attributes of that project.
    """
    if len(projects_user_can_view) == 0:
        return {}

    # get all projects this user has permissions to view; guids are internal, not user input
    projects_WHERE_clause = _to_WHERE_clause([p.guid for p in projects_user_can_view])

    # use raw SQL to avoid making N+1 queries: each aggregate is a correlated subquery
    # evaluated per project row
    num_families_subquery = """
      SELECT count(*) FROM seqr_family WHERE project_id=p.id
    """.strip()
    num_variant_tags_subquery = """
      SELECT count(*) FROM seqr_varianttag AS v
        JOIN seqr_savedvariant AS s ON v.saved_variant_id=s.id
      WHERE project_id=p.id
    """.strip()
    num_individuals_subquery = """
      SELECT count(*) FROM seqr_individual AS i
        JOIN seqr_family AS f ON i.family_id=f.id
      WHERE f.project_id=p.id
    """.strip()

    project_fields = ', '.join(Project._meta.json_fields)
    projects_query = """
      SELECT
        guid AS project_guid,
        {project_fields},
        ({num_variant_tags_subquery}) AS num_variant_tags,
        ({num_families_subquery}) AS num_families,
        ({num_individuals_subquery}) AS num_individuals
      FROM seqr_project AS p
      {projects_WHERE_clause}
    """.strip().format(
        project_fields=project_fields,
        num_variant_tags_subquery=num_variant_tags_subquery,
        num_families_subquery=num_families_subquery,
        num_individuals_subquery=num_individuals_subquery,
        projects_WHERE_clause=projects_WHERE_clause
    )
    cursor.execute(projects_query)

    # camelCase the SQL column names and key the resulting rows by project guid
    columns = [_to_camel_case(col[0]) for col in cursor.description]
    projects_by_guid = {
        r['projectGuid']: r for r in (dict(zip(columns, row)) for row in cursor.fetchall())
    }

    # mark all projects where this user has edit permissions
    for project in projects_user_can_edit:
        projects_by_guid[project.guid]['canEdit'] = True

    return projects_by_guid
def _gene_json(gene):
    """CamelCase a raw gene dict (including its nested phenotype info) and attach parsed constraints.

    NOTE(review): sets 'constraints' on the passed-in dict before re-keying, so the caller's
    dict is mutated even though a new dict is returned.
    """
    gene['constraints'] = _parse_gene_constraints(gene)
    camel_cased = {}
    for key, value in gene.items():
        camel_cased[_to_camel_case(key)] = value
    phenotype_info = camel_cased.get('phenotypeInfo', {})
    camel_cased['phenotypeInfo'] = {_to_camel_case(key): value for key, value in phenotype_info.items()}
    return camel_cased
def elasticsearch_status(request):
    """Return a JSON summary of the elasticsearch cluster: disk allocation, variant indices
    (with their mapping metadata and the seqr projects that use them), legacy mongo-backed
    projects, and errors for indices referenced by loaded samples that no longer exist.
    """
    client = get_es_client()

    # per-node disk allocation from the `cat` API, with dotted column names camelCased
    disk_fields = ['node', 'disk.avail', 'disk.used', 'disk.percent']
    disk_status = [{
        _to_camel_case(field.replace('.', '_')): disk[field] for field in disk_fields
    } for disk in client.cat.allocation(format="json", h=','.join(disk_fields))]

    index_fields = ['index', 'docs.count', 'store.size', 'creation.date.string']
    indices = [{
        _to_camel_case(field.replace('.', '_')): index[field] for field in index_fields
    } for index in client.cat.indices(format="json", h=','.join(index_fields))
        if index['index'] not in ['.kibana', 'index_operations_log']]

    # map alias name -> list of concrete index names it points at
    aliases = defaultdict(list)
    for alias in client.cat.aliases(format="json", h='alias,index'):
        aliases[alias['alias']].append(alias['index'])

    mappings = Index('_all', using=client).get_mapping(doc_type='variant')

    latest_loaded_samples = get_latest_loaded_samples()
    prefetch_related_objects(latest_loaded_samples, 'individual__family__project')

    # index name -> {project -> set of individual guids with a loaded sample in that index}
    seqr_index_projects = defaultdict(lambda: defaultdict(set))
    es_projects = set()
    for sample in latest_loaded_samples:
        # a sample may reference several comma-separated indices
        for index_name in sample.elasticsearch_index.split(','):
            project = sample.individual.family.project
            es_projects.add(project)
            if index_name in aliases:
                for aliased_index_name in aliases[index_name]:
                    seqr_index_projects[aliased_index_name][project].add(sample.individual.guid)
            else:
                # stripped trailing '*' presumably matches wildcard index references -- TODO confirm
                seqr_index_projects[index_name.rstrip('*')][project].add(sample.individual.guid)

    for index in indices:
        index_name = index['index']
        index_mapping = mappings[index_name]['mappings']['variant']
        index.update(index_mapping.get('_meta', {}))
        index['hasNestedGenotypes'] = 'samples_num_alt_1' in index_mapping['properties']

        projects_for_index = []
        # BUGFIX: snapshot the keys with list() before iterating -- entries are popped
        # inside the loop, and mutating a dict while iterating its live key view raises
        # "RuntimeError: dictionary changed size during iteration" on python 3. The keys
        # of the popped dict are also materialized with list() for py3 compatibility.
        for index_prefix in list(seqr_index_projects.keys()):
            if index_name.startswith(index_prefix):
                projects_for_index += list(seqr_index_projects.pop(index_prefix).keys())
        index['projects'] = [{
            'projectGuid': project.guid, 'projectName': project.name
        } for project in projects_for_index]

    # any entry still present here is referenced by loaded samples but absent from ES
    errors = ['{} does not exist and is used by project(s) {}'.format(
        index, ', '.join(['{} ({} samples)'.format(p.name, len(indivs)) for p, indivs in project_individuals.items()])
    ) for index, project_individuals in seqr_index_projects.items() if project_individuals]

    # TODO remove once all projects are switched off of mongo
    all_mongo_samples = Sample.objects.filter(
        dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS,
        sample_status=Sample.SAMPLE_STATUS_LOADED,
        elasticsearch_index__isnull=True,
    ).exclude(individual__family__project__in=es_projects).prefetch_related(
        'individual', 'individual__family__project')

    # keep only each individual's most recently loaded sample
    mongo_sample_individual_max_loaded_date = {
        agg['individual__guid']: agg['max_loaded_date']
        for agg in all_mongo_samples.values('individual__guid').annotate(max_loaded_date=Max('loaded_date'))
    }
    mongo_project_samples = defaultdict(set)
    for s in all_mongo_samples:
        if s.loaded_date == mongo_sample_individual_max_loaded_date[s.individual.guid]:
            mongo_project_samples[s.individual.family.project].add(s.dataset_file_path)
    mongo_projects = [{
        'projectGuid': project.guid, 'projectName': project.name, 'sourceFilePaths': sample_file_paths
    } for project, sample_file_paths in mongo_project_samples.items()]

    return create_json_response({
        'indices': indices,
        'diskStats': disk_status,
        'elasticsearchHost': ELASTICSEARCH_SERVER,
        'mongoProjects': mongo_projects,
        'errors': errors,
    })
def _retrieve_projects_by_guid(cursor, projects_user_can_view, projects_user_can_edit):
    """Retrieves all relevant metadata for each project from the database, and returns a
    'projects_by_guid' dictionary.

    Args:
        cursor: connected database cursor that can be used to execute SQL queries.
        projects_user_can_view (list): list of Django Project objects for which the user has CAN_VIEW permissions.
        projects_user_can_edit (list): list of Django Project objects for which the user has CAN_EDIT permissions.
    Returns:
        Dictionary that maps each project's GUID to a dictionary of key-value pairs representing
        attributes of that project.
    """
    if len(projects_user_can_view) == 0:
        return {}

    # get all projects this user has permissions to view; guids are internal, not user input
    projects_WHERE_clause = _to_WHERE_clause([p.guid for p in projects_user_can_view])

    # use raw SQL to avoid making N+1 queries: each aggregate is a correlated subquery
    # evaluated per project row
    num_families_subquery = """
      SELECT count(*) FROM seqr_family WHERE project_id=p.id
    """.strip()
    num_variant_tags_subquery = """
      SELECT count(*) FROM seqr_varianttag AS v
        JOIN seqr_varianttagtype AS t ON v.variant_tag_type_id=t.id
      WHERE project_id=p.id
    """.strip()
    num_individuals_subquery = """
      SELECT count(*) FROM seqr_individual AS i
        JOIN seqr_family AS f ON i.family_id=f.id
      WHERE f.project_id=p.id
    """.strip()

    project_fields = ', '.join(Project._meta.json_fields)
    projects_query = """
      SELECT
        guid AS project_guid,
        {project_fields},
        ({num_variant_tags_subquery}) AS num_variant_tags,
        ({num_families_subquery}) AS num_families,
        ({num_individuals_subquery}) AS num_individuals
      FROM seqr_project AS p
      {projects_WHERE_clause}
    """.strip().format(
        project_fields=project_fields,
        num_variant_tags_subquery=num_variant_tags_subquery,
        num_families_subquery=num_families_subquery,
        num_individuals_subquery=num_individuals_subquery,
        projects_WHERE_clause=projects_WHERE_clause
    )
    cursor.execute(projects_query)

    # camelCase the SQL column names and key the resulting rows by project guid
    columns = [_to_camel_case(col[0]) for col in cursor.description]
    projects_by_guid = {
        r['projectGuid']: r for r in (dict(zip(columns, row)) for row in cursor.fetchall())
    }

    # mark all projects where this user has edit permissions
    for project in projects_user_can_edit:
        projects_by_guid[project.guid]['canEdit'] = True

    return projects_by_guid
def _parse_hit(self, raw_hit):
    """Convert a raw elasticsearch variant hit into a json-serializable result dict.

    Args:
        raw_hit: elasticsearch-dsl hit object for a single variant document.
    Returns:
        dict: variant representation including family guids, genotypes, populations,
            predictions, transcripts, GRCh37 liftover coordinates and sort metadata.
    """
    hit = {k: raw_hit[k] for k in QUERY_FIELD_NAMES if k in raw_hit}

    index_name = raw_hit.meta.index
    index_family_samples = self.samples_by_family_index[index_name]

    if hasattr(raw_hit.meta, 'matched_queries'):
        family_guids = list(raw_hit.meta.matched_queries)
    else:
        # Searches for all inheritance and all families do not filter on inheritance so there are no matched_queries
        alt_allele_samples = set()
        for alt_samples_field in HAS_ALT_FIELD_KEYS:
            alt_allele_samples.update(hit[alt_samples_field])
        family_guids = [
            family_guid for family_guid, samples_by_id in index_family_samples.items()
            if any(sample_id in alt_allele_samples for sample_id in samples_by_id.keys())
        ]

    # map individual guid -> genotype fields, restricted to samples known for each family
    genotypes = {}
    for family_guid in family_guids:
        samples_by_id = index_family_samples[family_guid]
        genotypes.update({
            samples_by_id[genotype_hit['sample_id']].individual.guid: _get_field_values(genotype_hit, GENOTYPE_FIELDS_CONFIG)
            for genotype_hit in hit[GENOTYPES_FIELD_KEY] if genotype_hit['sample_id'] in samples_by_id
        })

    genome_version = self.index_metadata[index_name].get('genomeVersion')
    lifted_over_genome_version = None
    lifted_over_chrom = None
    lifted_over_pos = None
    liftover_grch38_to_grch37 = _liftover_grch38_to_grch37()
    # Fix: removed a redundant nested "if liftover_grch38_to_grch37:" check — the
    # enclosing condition already guarantees it is truthy.
    if liftover_grch38_to_grch37 and genome_version == GENOME_VERSION_GRCh38:
        grch37_coord = liftover_grch38_to_grch37.convert_coordinate(
            'chr{}'.format(hit['contig'].lstrip('chr')), int(hit['start'])
        )
        if grch37_coord and grch37_coord[0]:
            lifted_over_genome_version = GENOME_VERSION_GRCh37
            lifted_over_chrom = grch37_coord[0][0].lstrip('chr')
            lifted_over_pos = grch37_coord[0][1]

    populations = {
        population: _get_field_values(
            hit, POPULATION_RESPONSE_FIELD_CONFIGS, format_response_key=lambda key: key.lower(),
            lookup_field_prefix=population,
            existing_fields=self.index_metadata[index_name]['fields'],
            get_addl_fields=lambda field, field_config: [pop_config.get(field)] + [
                '{}_{}'.format(population, custom_field) for custom_field in field_config.get('fields', [])
            ],
        )
        for population, pop_config in POPULATIONS.items()
    }

    # transcripts are pre-sorted in the index; group them by gene for the response
    sorted_transcripts = [
        {_to_camel_case(k): v for k, v in transcript.to_dict().items()}
        for transcript in hit[SORTED_TRANSCRIPTS_FIELD_KEY] or []
    ]
    transcripts = defaultdict(list)
    for transcript in sorted_transcripts:
        transcripts[transcript['geneId']].append(transcript)

    result = _get_field_values(hit, CORE_FIELDS_CONFIG, format_response_key=str)
    result.update({
        field_name: _get_field_values(hit, fields, lookup_field_prefix=field_name)
        for field_name, fields in NESTED_FIELDS.items()
    })
    if hasattr(raw_hit.meta, 'sort'):
        result['_sort'] = [_parse_es_sort(sort, self._sort[i]) for i, sort in enumerate(raw_hit.meta.sort)]

    result.update({
        'familyGuids': sorted(family_guids),
        'genotypes': genotypes,
        'genomeVersion': genome_version,
        'liftedOverGenomeVersion': lifted_over_genome_version,
        'liftedOverChrom': lifted_over_chrom,
        'liftedOverPos': lifted_over_pos,
        'mainTranscript': sorted_transcripts[0] if len(sorted_transcripts) else {},
        'populations': populations,
        'predictions': _get_field_values(
            hit, PREDICTION_FIELDS_CONFIG, format_response_key=lambda key: key.split('_')[1].lower()
        ),
        'transcripts': transcripts,
    })
    return result
def _get_empty_json_for_model(model_class): return {_to_camel_case(field): None for field in model_class._meta.json_fields}
def elasticsearch_status(request):
    """Return a json summary of the elasticsearch cluster: disk usage, variant indices
    with their associated seqr projects, legacy mongo-backed projects, and any indices
    that are referenced by loaded samples but no longer exist.

    Args:
        request: Django request object (unused beyond routing/auth).
    Returns:
        JsonResponse with 'indices', 'diskStats', 'elasticsearchHost', 'mongoProjects'
        and 'errors' keys.
    """
    client = get_es_client()

    # cluster-level disk allocation per node
    disk_fields = ['node', 'disk.avail', 'disk.used', 'disk.percent']
    disk_status = [{
        _to_camel_case(field.replace('.', '_')): disk[field] for field in disk_fields
    } for disk in client.cat.allocation(format="json", h=','.join(disk_fields))]

    index_fields = ['index', 'docs.count', 'store.size', 'creation.date.string']
    indices = [{
        _to_camel_case(field.replace('.', '_')): index[field] for field in index_fields
    } for index in client.cat.indices(format="json", h=','.join(index_fields))
        if index['index'] not in ['.kibana', 'index_operations_log']]

    aliases = defaultdict(list)
    for alias in client.cat.aliases(format="json", h='alias,index'):
        aliases[alias['alias']].append(alias['index'])

    mappings = Index('_all', using=client).get_mapping(doc_type='variant')

    # map each index (resolving aliases and wildcard suffixes) to the projects/individuals loaded into it
    latest_loaded_samples = get_latest_loaded_samples()
    prefetch_related_objects(latest_loaded_samples, 'individual__family__project')
    seqr_index_projects = defaultdict(lambda: defaultdict(set))
    es_projects = set()
    for sample in latest_loaded_samples:
        for index_name in sample.elasticsearch_index.split(','):
            project = sample.individual.family.project
            es_projects.add(project)
            if index_name in aliases:
                for aliased_index_name in aliases[index_name]:
                    seqr_index_projects[aliased_index_name][project].add(sample.individual.guid)
            else:
                seqr_index_projects[index_name.rstrip('*')][project].add(sample.individual.guid)

    for index in indices:
        index_name = index['index']
        index_mapping = mappings[index_name]['mappings']['variant']
        index.update(index_mapping.get('_meta', {}))
        index['hasNestedGenotypes'] = 'samples_num_alt_1' in index_mapping['properties']

        projects_for_index = []
        # Fix: iterate over a snapshot of the keys because seqr_index_projects is mutated
        # (via .pop) inside the loop — popping while iterating the dict itself raises
        # RuntimeError on Python 3.
        for index_prefix in list(seqr_index_projects.keys()):
            if index_name.startswith(index_prefix):
                projects_for_index += seqr_index_projects.pop(index_prefix).keys()
        index['projects'] = [
            {'projectGuid': project.guid, 'projectName': project.name} for project in projects_for_index]

    # anything left in seqr_index_projects refers to an index that no longer exists
    errors = ['{} does not exist and is used by project(s) {}'.format(
        index, ', '.join(['{} ({} samples)'.format(p.name, len(indivs)) for p, indivs in project_individuals.items()])
    ) for index, project_individuals in seqr_index_projects.items() if project_individuals]

    # TODO remove once all projects are switched off of mongo
    all_mongo_samples = Sample.objects.filter(
        dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS,
        sample_status=Sample.SAMPLE_STATUS_LOADED,
        elasticsearch_index__isnull=True,
    ).exclude(individual__family__project__in=es_projects).prefetch_related('individual', 'individual__family__project')
    # keep only each individual's most recently loaded sample
    mongo_sample_individual_max_loaded_date = {
        agg['individual__guid']: agg['max_loaded_date']
        for agg in all_mongo_samples.values('individual__guid').annotate(max_loaded_date=Max('loaded_date'))
    }
    mongo_project_samples = defaultdict(set)
    for s in all_mongo_samples:
        if s.loaded_date == mongo_sample_individual_max_loaded_date[s.individual.guid]:
            mongo_project_samples[s.individual.family.project].add(s.dataset_file_path)
    mongo_projects = [
        {'projectGuid': project.guid, 'projectName': project.name, 'sourceFilePaths': sample_file_paths}
        for project, sample_file_paths in mongo_project_samples.items()]

    return create_json_response({
        'indices': indices,
        'diskStats': disk_status,
        'elasticsearchHost': ELASTICSEARCH_SERVER,
        'mongoProjects': mongo_projects,
        'errors': errors,
    })
def _retrieve_sample_batches_by_guid(cursor, projects_by_guid, user_is_staff=False):
    """Retrieves sample batches from the database, and returns a 'sample_batches_by_guid'
    dictionary, while also adding a 'sampleBatchGuids' attribute to each project dict in
    'projects_by_guid'

    Args:
        cursor: connected database cursor that can be used to execute SQL queries.
        projects_by_guid: Dictionary that maps each project's GUID to a dictionary of
            key-value pairs representing attributes of that project. Mutated in place:
            each project dict gains a 'sampleBatchGuids' list.
        user_is_staff (bool): if True, skip the per-project permission filter and return
            sample batches across all projects.
    Returns:
        Dictionary that maps each sample batch's GUID to a dictionary of key-value pairs
        representing attributes of this sample batch.
    """
    if len(projects_by_guid) == 0:
        return {}

    projects_WHERE_clause = ""
    if not user_is_staff:
        projects_WHERE_clause = _to_WHERE_clause([guid for guid in projects_by_guid])

    num_samples_subquery = """
      SELECT COUNT(*) FROM seqr_sample AS subquery_s
        WHERE subquery_s.sample_batch_id=sb.id
    """

    # Fix: build the query with explicit .format() named arguments instead of the
    # original "% locals()", which interpolated every local name implicitly and would
    # break if the SQL text ever contained a literal '%'.
    sample_batch_query = """
      SELECT
        p.guid AS project_guid,
        sb.guid AS sample_batch_guid,
        sb.id AS sample_batch_id,
        sb.sample_type AS sample_type,
        ({num_samples_subquery}) AS num_samples
      FROM seqr_samplebatch AS sb
        JOIN seqr_sample AS s ON sb.id=s.sample_batch_id
        JOIN seqr_individual_samples AS iss ON iss.sample_id=s.id
        JOIN seqr_individual AS i ON iss.individual_id=i.id
        JOIN seqr_family AS f ON i.family_id=f.id
        JOIN seqr_project AS p ON f.project_id=p.id
      {projects_WHERE_clause}
      GROUP BY p.guid, sb.guid, sb.id, sb.sample_type
    """.strip().format(
        num_samples_subquery=num_samples_subquery,
        projects_WHERE_clause=projects_WHERE_clause,
    )
    # TODO retrieve sample batches based on sample batch permissions instead of going by project permissions

    cursor.execute(sample_batch_query)

    columns = [_to_camel_case(col[0]) for col in cursor.description]
    sample_batches_by_guid = {}
    for row in cursor.fetchall():
        sample_batch_project_record = dict(zip(columns, row))
        sample_batch_guid = sample_batch_project_record['sampleBatchGuid']
        project_guid = sample_batch_project_record['projectGuid']
        # the project guid is only needed to link the batch back to its project dict
        del sample_batch_project_record['projectGuid']

        sample_batches_by_guid[sample_batch_guid] = sample_batch_project_record

        project_record = projects_by_guid[project_guid]
        if 'sampleBatchGuids' not in project_record:
            project_record['sampleBatchGuids'] = []
        project_record['sampleBatchGuids'].append(sample_batch_guid)

    return sample_batches_by_guid
def _get_json_for_record(record, fields): return {_to_camel_case(field[0]): record.get(field[1]) for field in fields}
def _parse_es_hit(raw_hit, family_samples_by_id):
    """Convert a raw elasticsearch variant hit into a json-serializable result dict.

    Args:
        raw_hit: elasticsearch-dsl hit object for a single variant document.
        family_samples_by_id (dict): maps family guid -> {sample_id: Sample} for the
            families in the search.
    Returns:
        dict: variant representation including project/family guids, genotypes,
            populations, predictions, transcripts and GRCh37 liftover coordinates.
    """
    hit = {k: raw_hit[k] for k in QUERY_FIELD_NAMES if k in raw_hit}

    # map individual guid -> genotype fields, restricted to samples known for each family
    genotypes = {}
    family_guids = list(raw_hit.meta.matched_queries)
    for family_guid in family_guids:
        samples_by_id = family_samples_by_id[family_guid]
        genotypes.update({
            samples_by_id[genotype_hit['sample_id']].individual.guid: _get_field_values(genotype_hit, GENOTYPE_FIELDS_CONFIG)
            for genotype_hit in hit[GENOTYPES_FIELD_KEY] if genotype_hit['sample_id'] in samples_by_id
        })

    # TODO better handling for multi-project searches
    # Fix: dict.values() returns a non-indexable view on Python 3, so materialize it
    # before taking the first sample (behavior unchanged on Python 2).
    project = list(family_samples_by_id[family_guids[0]].values())[0].individual.family.project
    genome_version = project.genome_version

    lifted_over_genome_version = None
    lifted_over_chrom = None
    lifted_over_pos = None
    liftover_grch38_to_grch37 = _liftover_grch38_to_grch37()
    # Fix: removed a redundant nested "if liftover_grch38_to_grch37:" check — the
    # enclosing condition already guarantees it is truthy.
    if liftover_grch38_to_grch37 and genome_version == GENOME_VERSION_GRCh38:
        grch37_coord = liftover_grch38_to_grch37.convert_coordinate(
            'chr{}'.format(hit['contig'].lstrip('chr')), int(hit['start']))
        if grch37_coord and grch37_coord[0]:
            # Fix: lifted_over_genome_version was initialized but never assigned, so the
            # response reported liftedOverGenomeVersion=None even when liftedOverChrom/Pos
            # were populated; set it to GRCh37 on success, matching _parse_hit.
            lifted_over_genome_version = GENOME_VERSION_GRCh37
            lifted_over_chrom = grch37_coord[0][0].lstrip('chr')
            lifted_over_pos = grch37_coord[0][1]

    populations = {
        population: _get_field_values(
            hit, POPULATION_RESPONSE_FIELD_CONFIGS, format_response_key=lambda key: key.lower(),
            lookup_field_prefix=population,
            get_addl_fields=lambda field, field_config: [pop_config.get(field)] + [
                '{}_{}'.format(population, custom_field) for custom_field in field_config.get('fields', [])
            ],
        )
        for population, pop_config in POPULATIONS.items()
    }

    # transcripts are pre-sorted in the index; group them by gene for the response
    sorted_transcripts = [
        {_to_camel_case(k): v for k, v in transcript.to_dict().items()}
        for transcript in hit[SORTED_TRANSCRIPTS_FIELD_KEY] or []
    ]
    transcripts = defaultdict(list)
    for transcript in sorted_transcripts:
        transcripts[transcript['geneId']].append(transcript)

    result = _get_field_values(hit, CORE_FIELDS_CONFIG, format_response_key=str)
    result.update({
        field_name: _get_field_values(hit, fields, lookup_field_prefix=field_name)
        for field_name, fields in NESTED_FIELDS.items()
    })
    if hasattr(raw_hit.meta, 'sort'):
        result['_sort'] = [_parse_es_sort(sort) for sort in raw_hit.meta.sort]

    result.update({
        'projectGuid': project.guid,
        'familyGuids': family_guids,
        'genotypes': genotypes,
        'genomeVersion': genome_version,
        'liftedOverGenomeVersion': lifted_over_genome_version,
        'liftedOverChrom': lifted_over_chrom,
        'liftedOverPos': lifted_over_pos,
        'mainTranscript': sorted_transcripts[0] if len(sorted_transcripts) else {},
        'populations': populations,
        'predictions': _get_field_values(
            hit, PREDICTION_FIELDS_CONFIG, format_response_key=lambda key: key.split('_')[1].lower()),
        'transcripts': transcripts,
    })
    return result