def receive_igv_table_handler(request, project_guid):
    """Handler for bulk upload of IGV alignment file paths for a project.

    Parses an uploaded 2-column (individual_id, file_path) table, validates that
    every individual exists in the project, and returns the per-individual path
    updates without saving anything to the database.

    Args:
        request (object): Django request object
        project_guid (string): project GUID
    """
    project = get_project_and_check_permissions(project_guid, request.user, can_edit=True)
    info = []

    def _process_alignment_records(rows, **kwargs):
        # Every row must be exactly (individual_id, file_path).
        invalid_row = next((row for row in rows if len(row) != 2), None)
        if invalid_row:
            raise ValueError("Must contain 2 columns: " + ', '.join(invalid_row))
        return {row[0]: row[1] for row in rows}

    try:
        uploaded_file_id, filename, individual_dataset_mapping = save_uploaded_file(
            request, process_records=_process_alignment_records)

        matched_individuals = Individual.objects.filter(
            family__project=project, individual_id__in=individual_dataset_mapping.keys())
        unmatched_individuals = set(individual_dataset_mapping.keys()) - {
            i.individual_id for i in matched_individuals
        }
        # Idiomatic truthiness check instead of len(...) > 0; caught by the
        # except below and surfaced to the client as a 400.
        if unmatched_individuals:
            raise Exception(
                'The following Individual IDs do not exist: {}'.format(
                    ", ".join(unmatched_individuals)))

        info.append('Parsed {} rows from {}'.format(
            len(individual_dataset_mapping), filename))

        existing_samples = IgvSample.objects.select_related(
            'individual').filter(individual__in=matched_individuals)
        # Skip individuals whose stored file path already matches the upload.
        unchanged_individual_ids = {
            s.individual.individual_id
            for s in existing_samples
            if individual_dataset_mapping[s.individual.individual_id] == s.file_path
        }
        if unchanged_individual_ids:
            info.append('No change detected for {} individuals'.format(
                len(unchanged_individual_ids)))

        updates_by_individual_guid = {
            i.guid: individual_dataset_mapping[i.individual_id]
            for i in matched_individuals
            if i.individual_id not in unchanged_individual_ids
        }
    except Exception as e:
        traceback.print_exc()
        return create_json_response({'errors': [str(e)]}, status=400)

    response = {
        'updatesByIndividualGuid': updates_by_individual_guid,
        'uploadedFileId': uploaded_file_id,
        'errors': [],
        'info': info,
    }
    return create_json_response(response)
def receive_hpo_table_handler(request, project_guid):
    """Handler for bulk update of hpo terms. This handler parses the records, but
    doesn't save them in the database. Instead, it saves them to a temporary file
    and sends a 'uploadedFileId' representing this file back to the client.

    Args:
        request (object): Django request object
        project_guid (string): project GUID
    """
    project = get_project_and_check_permissions(project_guid, request.user)

    def process_records(json_records, filename=''):
        # Validate parsed records; hard errors abort the upload, warnings are
        # passed through alongside the records.
        records, errors, warnings = _process_hpo_records(json_records, filename, project)
        if errors:
            raise ErrorsWarningsException(errors, warnings)
        return records, warnings

    try:
        uploaded_file_id, _, (json_records, warnings) = save_uploaded_file(
            request, process_records=process_records)
    except ErrorsWarningsException as e:
        return create_json_response(
            {'errors': e.errors, 'warnings': e.warnings}, status=400, reason=e.errors)
    except Exception as e:
        # BUG FIX: exceptions have no .message attribute in python 3 — the old
        # `e.message or str(e)` raised AttributeError inside the handler.
        return create_json_response(
            {'errors': [str(e)], 'warnings': []}, status=400, reason=str(e))

    response = {
        'uploadedFileId': uploaded_file_id,
        'errors': [],
        'warnings': warnings,
        'info': ['{} individuals will be updated'.format(len(json_records))],
    }
    return create_json_response(response)
def receive_individuals_table_handler(request, project_guid):
    """Handler for the initial upload of an Excel or .tsv table of individuals.

    This handler parses the records, but doesn't save them in the database.
    Instead, it saves them to a temporary file and sends a 'uploadedFileId'
    representing this file back to the client. If/when the client then wants to
    'apply' this table, it can send the uploadedFileId to the
    save_individuals_table(..) handler to actually save the data in the database.

    Args:
        request (object): Django request object
        project_guid (string): project GUID
    """
    project = get_project_and_check_permissions(project_guid, request.user)

    def parse_file(filename, stream):
        pedigree_records, errors, warnings = parse_pedigree_table(
            filename, stream, user=request.user, project=project)
        if errors:
            raise ErrorsWarningsException(errors, warnings)
        return pedigree_records

    try:
        uploaded_file_id, filename, json_records = save_uploaded_file(request, parse_file)
    except ErrorsWarningsException as e:
        return create_json_response(
            {'errors': e.errors, 'warnings': e.warnings}, status=400, reason=e.errors)
    except Exception as e:
        # BUG FIX: exceptions have no .message attribute in python 3.
        return create_json_response(
            {'errors': [str(e)], 'warnings': []}, status=400, reason=str(e))

    # send back some stats
    family_ids = set(r['familyId'] for r in json_records)  # hoisted: built once, used twice
    num_families = len(family_ids)
    num_individuals = len(set(r['individualId'] for r in json_records))
    num_families_to_create = len([
        family_id for family_id in family_ids
        if not Family.objects.filter(family_id=family_id, project=project)])
    num_individuals_to_create = len(set(
        r['individualId'] for r in json_records if not Individual.objects.filter(
            individual_id=r['individualId'], family__family_id=r['familyId'],
            family__project=project)))

    info = [
        # BUG FIX: template said "(unknown)" while an unused filename= kwarg was
        # passed — restore the {filename} placeholder.
        "{num_families} families, {num_individuals} individuals parsed from {filename}".format(
            num_families=num_families, num_individuals=num_individuals, filename=filename
        ),
        "%d new families, %d new individuals will be added to the project" % (
            num_families_to_create, num_individuals_to_create),
        "%d existing individuals will be updated" % (
            num_individuals - num_individuals_to_create),
    ]

    response = {
        'uploadedFileId': uploaded_file_id,
        'errors': [],
        'warnings': [],
        'info': info,
    }
    logger.info(response)
    return create_json_response(response)
def receive_hpo_table_handler(request, project_guid):
    """Handler for bulk update of hpo terms. This handler parses the records, but
    doesn't save them in the database. Instead, it saves them to a temporary file
    and sends a 'uploadedFileId' representing this file back to the client.

    Args:
        request (object): Django request object
        project_guid (string): project GUID
    """
    project = get_project_and_check_permissions(project_guid, request.user)

    try:
        uploaded_file_id, _, json_records = save_uploaded_file(
            request, process_records=_process_hpo_records)
    except Exception as e:
        # BUG FIX: exceptions have no .message attribute in python 3 — the old
        # `e.message or str(e)` raised AttributeError inside the handler.
        return create_json_response(
            {'errors': [str(e)], 'warnings': []}, status=400, reason=str(e))

    updates_by_individual_guid = {}
    missing_individuals = []
    unchanged_individuals = []
    all_hpo_terms = set()
    for record in json_records:
        family_id = record.get(FAMILY_ID_COLUMN, None)
        individual_id = record.get(INDIVIDUAL_ID_COLUMN)
        # Match either the bare id or the "<family>_<individual>" composite id.
        individual_q = Individual.objects.filter(
            individual_id__in=[individual_id, '{}_{}'.format(family_id, individual_id)],
            family__project=project,
        )
        if family_id:
            individual_q = individual_q.filter(family__family_id=family_id)
        individual = individual_q.first()
        if individual:
            features = record.get(FEATURES_COLUMN) or []
            # Skip individuals whose stored phenotips features already match.
            if individual.phenotips_data and features and \
                    _feature_set(features) == _feature_set(
                        json.loads(individual.phenotips_data).get('features', [])):
                unchanged_individuals.append(individual_id)
            else:
                all_hpo_terms.update([feature['id'] for feature in features])
                updates_by_individual_guid[individual.guid] = features
        else:
            missing_individuals.append(individual_id)

    if not updates_by_individual_guid:
        return create_json_response({
            'errors': [
                'Unable to find individuals to update for any of the {total} parsed individuals.{missing}{unchanged}'.format(
                    total=len(missing_individuals) + len(unchanged_individuals),
                    missing=' No matching ids found for {} individuals'.format(
                        len(missing_individuals)) if missing_individuals else '',
                    unchanged=' No changes detected for {} individuals'.format(
                        len(unchanged_individuals)) if unchanged_individuals else '',
                )
            ],
            'warnings': []
        }, status=400, reason='Unable to find any matching individuals')

    info = ['{} individuals will be updated'.format(len(updates_by_individual_guid))]
    warnings = []
    if missing_individuals:
        warnings.append(
            'Unable to find matching ids for {} individuals. The following entries will not be updated: {}'.format(
                len(missing_individuals), ', '.join(missing_individuals)))
    if unchanged_individuals:
        warnings.append(
            'No changes detected for {} individuals. The following entries will not be updated: {}'.format(
                len(unchanged_individuals), ', '.join(unchanged_individuals)))

    # Annotate each feature with category/label from seqr's HPO reference data;
    # unknown terms are kept but flagged with a warning.
    hpo_terms = {
        hpo.hpo_id: hpo
        for hpo in HumanPhenotypeOntology.objects.filter(hpo_id__in=all_hpo_terms)
    }
    invalid_hpo_terms = set()
    for features in updates_by_individual_guid.values():
        for feature in features:
            hpo_data = hpo_terms.get(feature['id'])
            if hpo_data:
                feature['category'] = hpo_data.category_id
                feature['label'] = hpo_data.name
            else:
                invalid_hpo_terms.add(feature['id'])
    if invalid_hpo_terms:
        warnings.append(
            "The following HPO terms were not found in seqr's HPO data, and while they will be added they may be incorrect: {}".format(
                ', '.join(invalid_hpo_terms)))

    response = {
        'updatesByIndividualGuid': updates_by_individual_guid,
        'uploadedFileId': uploaded_file_id,
        'errors': [],
        'warnings': warnings,
        'info': info,
    }
    return create_json_response(response)
def receive_igv_table_handler(request, project_guid):
    """Handler for bulk upload of IGV alignment files for a project.

    Parses an uploaded (individual_id, file_path[, sample_id]) table — allowing
    multiple rows per individual — validates that every individual exists in the
    project, and returns the row-level updates without saving anything.

    Args:
        request (object): Django request object
        project_guid (string): project GUID
    """
    project = get_project_and_check_permissions(project_guid, request.user, can_edit=True)
    info = []

    def _process_alignment_records(rows, **kwargs):
        # Rows are (individual_id, file_path) with an optional third sample_id column.
        invalid_row = next((row for row in rows if not 2 <= len(row) <= 3), None)
        if invalid_row:
            raise ValueError("Must contain 2 or 3 columns: " + ', '.join(invalid_row))
        parsed_records = defaultdict(list)
        for row in rows:
            parsed_records[row[0]].append({
                'filePath': row[1],
                'sampleId': row[2] if len(row) > 2 else None,
            })
        return parsed_records

    try:
        uploaded_file_id, filename, individual_dataset_mapping = save_uploaded_file(
            request, process_records=_process_alignment_records)

        matched_individuals = Individual.objects.filter(
            family__project=project, individual_id__in=individual_dataset_mapping.keys())
        unmatched_individuals = set(individual_dataset_mapping.keys()) - {
            i.individual_id for i in matched_individuals
        }
        # Idiomatic truthiness check instead of len(...) > 0.
        if unmatched_individuals:
            raise Exception(
                'The following Individual IDs do not exist: {}'.format(
                    ", ".join(unmatched_individuals)))

        # Generator expression avoids materializing a throwaway list in sum().
        info.append('Parsed {} rows in {} individuals from {}'.format(
            sum(len(rows) for rows in individual_dataset_mapping.values()),
            len(individual_dataset_mapping), filename))

        # Index existing sample file paths per individual to detect no-op rows.
        existing_sample_files = defaultdict(set)
        for sample in IgvSample.objects.select_related('individual').filter(
                individual__in=matched_individuals):
            existing_sample_files[sample.individual.individual_id].add(sample.file_path)

        unchanged_rows = set()
        for individual_id, updates in individual_dataset_mapping.items():
            unchanged_rows.update([
                (individual_id, update['filePath']) for update in updates
                if update['filePath'] in existing_sample_files[individual_id]
            ])
        if unchanged_rows:
            info.append('No change detected for {} rows'.format(len(unchanged_rows)))

        all_updates = []
        for i in matched_individuals:
            all_updates += [
                dict(individualGuid=i.guid, **update)
                for update in individual_dataset_mapping[i.individual_id]
                if (i.individual_id, update['filePath']) not in unchanged_rows
            ]
    except Exception as e:
        return create_json_response({'errors': [str(e)]}, status=400)

    response = {
        'updates': all_updates,
        'uploadedFileId': uploaded_file_id,
        'errors': [],
        'info': info,
    }
    return create_json_response(response)
def receive_individuals_table_handler(request, project_guid):
    """Handler for the initial upload of an Excel or .tsv table of individuals.

    This handler parses the records, but doesn't save them in the database.
    Instead, it saves them to a temporary file and sends a 'uploadedFileId'
    representing this file back to the client. If/when the client then wants to
    'apply' this table, it can send the uploadedFileId to the
    save_individuals_table(..) handler to actually save the data in the database.

    Args:
        request (object): Django request object
        project_guid (string): project GUID
    """
    project = get_project_and_check_permissions(project_guid, request.user)

    def process_records(json_records, filename='ped_file'):
        pedigree_records, errors, warnings = parse_pedigree_table(
            json_records, filename, user=request.user, project=project)
        if errors:
            raise ErrorsWarningsException(errors, warnings)
        return pedigree_records

    try:
        uploaded_file_id, filename, json_records = save_uploaded_file(
            request, process_records=process_records)
    except ErrorsWarningsException as e:
        return create_json_response(
            {'errors': e.errors, 'warnings': e.warnings}, status=400, reason=e.errors)
    except Exception as e:
        # BUG FIX: exceptions have no .message attribute in python 3.
        return create_json_response(
            {'errors': [str(e)], 'warnings': []}, status=400, reason=str(e))

    # send back some stats
    # Group (id, is_previous_id) pairs by family so existence checks can be
    # batched per family.
    individual_ids_by_family = defaultdict(list)
    for r in json_records:
        if r.get(JsonConstants.PREVIOUS_INDIVIDUAL_ID_COLUMN):
            individual_ids_by_family[r[JsonConstants.FAMILY_ID_COLUMN]].append(
                (r[JsonConstants.PREVIOUS_INDIVIDUAL_ID_COLUMN], True)
            )
        else:
            individual_ids_by_family[r[JsonConstants.FAMILY_ID_COLUMN]].append(
                (r[JsonConstants.INDIVIDUAL_ID_COLUMN], False)
            )

    # Generator expression avoids materializing a throwaway list in sum().
    num_individuals = sum(len(indiv_ids) for indiv_ids in individual_ids_by_family.values())
    num_existing_individuals = 0
    missing_prev_ids = []
    for family_id, indiv_ids in individual_ids_by_family.items():
        existing_individuals = {i.individual_id for i in Individual.objects.filter(
            individual_id__in=[indiv_id for (indiv_id, _) in indiv_ids],
            family__family_id=family_id, family__project=project
        ).only('individual_id')}
        num_existing_individuals += len(existing_individuals)
        # A "previous id" that doesn't already exist is an input error.
        missing_prev_ids += [
            indiv_id for (indiv_id, is_previous) in indiv_ids
            if is_previous and indiv_id not in existing_individuals]
    num_individuals_to_create = num_individuals - num_existing_individuals
    if missing_prev_ids:
        return create_json_response(
            {'errors': [
                'Could not find individuals with the following previous IDs: {}'.format(
                    ', '.join(missing_prev_ids))
            ], 'warnings': []},
            status=400, reason='Invalid input')

    family_ids = set(r[JsonConstants.FAMILY_ID_COLUMN] for r in json_records)
    num_families = len(family_ids)
    num_existing_families = Family.objects.filter(
        family_id__in=family_ids, project=project).count()
    num_families_to_create = num_families - num_existing_families

    info = [
        # BUG FIX: template said "(unknown)" while an unused filename= kwarg was
        # passed — restore the {filename} placeholder.
        "{num_families} families, {num_individuals} individuals parsed from {filename}".format(
            num_families=num_families, num_individuals=num_individuals, filename=filename
        ),
        "{} new families, {} new individuals will be added to the project".format(
            num_families_to_create, num_individuals_to_create),
        "{} existing individuals will be updated".format(num_existing_individuals),
    ]

    response = {
        'uploadedFileId': uploaded_file_id,
        'errors': [],
        'warnings': [],
        'info': info,
    }
    logger.info(response)
    return create_json_response(response)
def receive_alignment_table_handler(request, project_guid):
    """Create or update samples for the given dataset

    Args:
        request: Django request object
        project_guid (string): GUID of the project that should be updated

    HTTP POST
        Request body - should contain the following json structure:
        {
            'sampleType':  <"WGS", "WES", or "RNA"> (required)
            'datasetType': <"VARIANTS", or "ALIGN"> (required)
            'elasticsearchIndex': <String>
            'datasetPath': <String>
            'datasetName': <String>
            'ignoreExtraSamplesInCallset': <Boolean>
            'mappingFile': { 'uploadedFileId': <Id for temporary uploaded file> }
        }

        Response body - will contain the following structure:
    """
    project = get_project_and_check_permissions(
        project_guid, request.user, permission_level=CAN_EDIT)
    info = []

    def _process_alignment_records(rows, **kwargs):
        # Every row must be exactly (individual_id, file_path).
        invalid_row = next((row for row in rows if len(row) != 2), None)
        if invalid_row:
            raise ValueError("Must contain 2 columns: " + ', '.join(invalid_row))
        return {row[0]: row[1] for row in rows}

    try:
        uploaded_file_id, filename, individual_dataset_mapping = save_uploaded_file(
            request, process_records=_process_alignment_records)

        matched_individuals = Individual.objects.filter(
            family__project=project, individual_id__in=individual_dataset_mapping.keys())
        unmatched_individuals = set(individual_dataset_mapping.keys()) - {
            i.individual_id for i in matched_individuals}
        # Idiomatic truthiness check instead of len(...) > 0.
        if unmatched_individuals:
            raise Exception('The following Individual IDs do not exist: {}'.format(
                ", ".join(unmatched_individuals)))

        info.append('Parsed {} rows from {}'.format(
            len(individual_dataset_mapping), filename))

        existing_samples = Sample.objects.select_related('individual').filter(
            individual__in=matched_individuals,
            dataset_type=Sample.DATASET_TYPE_READ_ALIGNMENTS,
            is_active=True,
        )
        # Skip individuals whose stored dataset path already matches the upload.
        unchanged_individual_ids = {
            s.individual.individual_id for s in existing_samples
            if individual_dataset_mapping[s.individual.individual_id] == s.dataset_file_path}
        if unchanged_individual_ids:
            info.append('No change detected for {} individuals'.format(
                len(unchanged_individual_ids)))

        updates_by_individual_guid = {
            i.guid: individual_dataset_mapping[i.individual_id]
            for i in matched_individuals
            if i.individual_id not in unchanged_individual_ids}
    except Exception as e:
        traceback.print_exc()
        # BUG FIX: exceptions have no .message attribute in python 3 — the old
        # `e.message or str(e)` raised AttributeError inside the handler.
        return create_json_response({'errors': [str(e)]}, status=400)

    response = {
        'updatesByIndividualGuid': updates_by_individual_guid,
        'uploadedFileId': uploaded_file_id,
        'errors': [],
        'info': info,
    }
    return create_json_response(response)
def receive_families_table_handler(request, project_guid):
    """Handler for the initial upload of an Excel or .tsv table of families. This handler
    parses the records, but doesn't save them in the database. Instead, it saves them to
    a temporary file and sends a 'uploadedFileId' representing this file back to the client.

    Args:
        request (object): Django request object
        project_guid (string): project GUID
    """
    project = get_project_and_check_permissions(project_guid, request.user)

    def _process_records(records, filename=''):
        # Map known logical columns to their index by fuzzy header matching.
        column_map = {}
        for i, field in enumerate(records[0]):
            key = field.lower()
            if 'family' in key:
                if 'prev' in key:
                    column_map[PREVIOUS_FAMILY_ID_FIELD] = i
                else:
                    column_map[FAMILY_ID_FIELD] = i
            elif 'display' in key:
                column_map['displayName'] = i
            elif 'description' in key:
                column_map['description'] = i
            elif 'phenotype' in key:
                column_map['codedPhenotype'] = i
        if FAMILY_ID_FIELD not in column_map:
            raise ValueError('Invalid header, missing family id column')
        # index may be an int or an iterable of candidate indices (first
        # truthy value wins) — presumably for merged columns; keep as-is.
        return [{
            column: row[index] if isinstance(index, int) else next(
                (row[i] for i in index if row[i]), None)
            for column, index in column_map.items()
        } for row in records[1:]]

    try:
        uploaded_file_id, filename, json_records = save_uploaded_file(
            request, process_records=_process_records)
    except Exception as e:
        return create_json_response({
            'errors': [str(e)],
            'warnings': []
        }, status=400, reason=str(e))

    # Every "previous family id" referenced must already exist in the project.
    prev_fam_ids = {
        r[PREVIOUS_FAMILY_ID_FIELD] for r in json_records
        if r.get(PREVIOUS_FAMILY_ID_FIELD)
    }
    existing_prev_fam_ids = {
        f.family_id for f in Family.objects.filter(
            family_id__in=prev_fam_ids, project=project).only('family_id')
    }
    if len(prev_fam_ids) != len(existing_prev_fam_ids):
        missing_prev_ids = [
            family_id for family_id in prev_fam_ids
            if family_id not in existing_prev_fam_ids
        ]
        return create_json_response(
            {
                'errors': [
                    'Could not find families with the following previous IDs: {}'.format(
                        ', '.join(missing_prev_ids))
                ],
                'warnings': []
            },
            status=400, reason='Invalid input')

    fam_ids = {
        r[FAMILY_ID_FIELD] for r in json_records
        if not r.get(PREVIOUS_FAMILY_ID_FIELD)
    }
    num_families_to_update = len(prev_fam_ids) + Family.objects.filter(
        family_id__in=fam_ids, project=project).count()
    num_families = len(json_records)
    num_families_to_create = num_families - num_families_to_update

    info = [
        # BUG FIX: template said "(unknown)" while an unused filename= kwarg was
        # passed — restore the {filename} placeholder.
        "{num_families} families parsed from {filename}".format(
            num_families=num_families, filename=filename),
        "{} new families will be added, {} existing families will be updated".format(
            num_families_to_create, num_families_to_update),
    ]

    return create_json_response({
        'uploadedFileId': uploaded_file_id,
        'errors': [],
        'warnings': [],
        'info': info,
    })
def receive_hpo_table_handler(request, project_guid):
    """Handler for bulk update of hpo terms. This handler parses the records, but
    doesn't save them in the database. Instead, it saves them to a temporary file
    and sends a 'uploadedFileId' representing this file back to the client.

    Args:
        request (object): Django request object
        project_guid (string): project GUID
    """
    project = get_project_and_check_permissions(project_guid, request.user)

    try:
        uploaded_file_id, _, json_records = save_uploaded_file(
            request, process_records=_process_hpo_records)
    except Exception as e:
        # BUG FIX: exceptions have no .message attribute in python 3 — the old
        # `e.message or str(e)` raised AttributeError inside the handler.
        return create_json_response(
            {'errors': [str(e)], 'warnings': []}, status=400, reason=str(e))

    updates_by_individual_guid = {}
    missing_individuals = []
    unchanged_individuals = []
    all_hpo_terms = set()
    for record in json_records:
        family_id = record.get(FAMILY_ID_COLUMN, None)
        individual_id = record.get(INDIVIDUAL_ID_COLUMN)
        # Match either the bare id or the "<family>_<individual>" composite id.
        individual_q = Individual.objects.filter(
            individual_id__in=[individual_id, '{}_{}'.format(family_id, individual_id)],
            family__project=project,
        )
        if family_id:
            individual_q = individual_q.filter(family__family_id=family_id)
        individual = individual_q.first()
        if individual:
            features = record.get(FEATURES_COLUMN) or []
            # Skip individuals whose stored phenotips features already match.
            if individual.phenotips_data and features and \
                    _feature_set(features) == _feature_set(
                        json.loads(individual.phenotips_data).get('features', [])):
                unchanged_individuals.append(individual_id)
            else:
                all_hpo_terms.update([feature['id'] for feature in features])
                updates_by_individual_guid[individual.guid] = features
        else:
            missing_individuals.append(individual_id)

    if not updates_by_individual_guid:
        return create_json_response({
            'errors': ['Unable to find individuals to update for any of the {total} parsed individuals.{missing}{unchanged}'.format(
                total=len(missing_individuals) + len(unchanged_individuals),
                missing=' No matching ids found for {} individuals'.format(
                    len(missing_individuals)) if missing_individuals else '',
                unchanged=' No changes detected for {} individuals'.format(
                    len(unchanged_individuals)) if unchanged_individuals else '',
            )],
            'warnings': []
        }, status=400, reason='Unable to find any matching individuals')

    # Annotate features with category/label from seqr's HPO reference data;
    # this variant rejects the whole upload on any unknown HPO term.
    hpo_terms = {
        hpo.hpo_id: hpo
        for hpo in HumanPhenotypeOntology.objects.filter(hpo_id__in=all_hpo_terms)
    }
    invalid_hpo_terms = set()
    for features in updates_by_individual_guid.values():
        for feature in features:
            hpo_data = hpo_terms.get(feature['id'])
            if hpo_data:
                feature['category'] = hpo_data.category_id
                feature['label'] = hpo_data.name
            else:
                invalid_hpo_terms.add(feature['id'])
    if invalid_hpo_terms:
        return create_json_response({
            'errors': [
                "The following HPO terms were not found in seqr's HPO data: {}".format(
                    ', '.join(invalid_hpo_terms))
            ],
            'warnings': []
        }, status=400, reason='Invalid HPO terms')

    info = ['{} individuals will be updated'.format(len(updates_by_individual_guid))]
    warnings = []
    if missing_individuals:
        warnings.append(
            'Unable to find matching ids for {} individuals. The following entries will not be updated: {}'.format(
                len(missing_individuals), ', '.join(missing_individuals)
            ))
    if unchanged_individuals:
        warnings.append(
            'No changes detected for {} individuals. The following entries will not be updated: {}'.format(
                len(unchanged_individuals), ', '.join(unchanged_individuals)
            ))

    response = {
        'updatesByIndividualGuid': updates_by_individual_guid,
        'uploadedFileId': uploaded_file_id,
        'errors': [],
        'warnings': warnings,
        'info': info,
    }
    return create_json_response(response)
def receive_families_table_handler(request, project_guid):
    """Handler for the initial upload of an Excel or .tsv table of families. This handler
    parses the records, but doesn't save them in the database. Instead, it saves them to
    a temporary file and sends a 'uploadedFileId' representing this file back to the client.

    Args:
        request (object): Django request object
        project_guid (string): project GUID
    """
    project = get_project_and_check_permissions(project_guid, request.user)

    def _process_records(records, filename=''):
        # Map known logical columns to their index by fuzzy header matching.
        column_map = {}
        for i, field in enumerate(records[0]):
            key = field.lower()
            if 'family' in key:
                if 'prev' in key:
                    column_map[PREVIOUS_FAMILY_ID_FIELD] = i
                else:
                    column_map[FAMILY_ID_FIELD] = i
            elif 'display' in key:
                column_map['displayName'] = i
            elif 'description' in key:
                column_map['description'] = i
            elif 'phenotype' in key:
                column_map['codedPhenotype'] = i
        if FAMILY_ID_FIELD not in column_map:
            raise ValueError('Invalid header, missing family id column')
        # index may be an int or an iterable of candidate indices (first
        # truthy value wins) — presumably for merged columns; keep as-is.
        return [{
            column: row[index] if isinstance(index, int) else next(
                (row[i] for i in index if row[i]), None)
            for column, index in column_map.items()
        } for row in records[1:]]

    try:
        uploaded_file_id, filename, json_records = save_uploaded_file(
            request, process_records=_process_records)
    except Exception as e:
        # BUG FIX: exceptions have no .message attribute in python 3 — the old
        # `e.message or str(e)` raised AttributeError inside the handler.
        return create_json_response(
            {'errors': [str(e)], 'warnings': []}, status=400, reason=str(e))

    # Every "previous family id" referenced must already exist in the project.
    prev_fam_ids = {
        r[PREVIOUS_FAMILY_ID_FIELD] for r in json_records
        if r.get(PREVIOUS_FAMILY_ID_FIELD)}
    existing_prev_fam_ids = {
        f.family_id for f in Family.objects.filter(
            family_id__in=prev_fam_ids, project=project).only('family_id')}
    if len(prev_fam_ids) != len(existing_prev_fam_ids):
        missing_prev_ids = [
            family_id for family_id in prev_fam_ids
            if family_id not in existing_prev_fam_ids]
        return create_json_response(
            {'errors': [
                'Could not find families with the following previous IDs: {}'.format(
                    ', '.join(missing_prev_ids))
            ], 'warnings': []},
            status=400, reason='Invalid input')

    fam_ids = {
        r[FAMILY_ID_FIELD] for r in json_records
        if not r.get(PREVIOUS_FAMILY_ID_FIELD)}
    num_families_to_update = len(prev_fam_ids) + Family.objects.filter(
        family_id__in=fam_ids, project=project).count()
    num_families = len(json_records)
    num_families_to_create = num_families - num_families_to_update

    info = [
        # BUG FIX: template said "(unknown)" while an unused filename= kwarg was
        # passed — restore the {filename} placeholder.
        "{num_families} families parsed from {filename}".format(
            num_families=num_families, filename=filename),
        "{} new families will be added, {} existing families will be updated".format(
            num_families_to_create, num_families_to_update),
    ]

    return create_json_response({
        'uploadedFileId': uploaded_file_id,
        'errors': [],
        'warnings': [],
        'info': info,
    })