def handle(self, *args, **options):
    samples = (IgvSample.objects.filter(
        individual__family__project__name__in=args
    ) if args else IgvSample.objects.all()).filter(
        file_path__startswith='gs://'
    ).prefetch_related('individual', 'individual__family__project')

    missing_counter = collections.defaultdict(int)
    guids_of_samples_with_missing_file = set()
    for sample in tqdm.tqdm(samples, unit=" samples"):
        if not does_file_exist(sample.file_path):
            individual_id = sample.individual.individual_id
            project = sample.individual.family.project.name
            missing_counter[project] += 1
            logger.info('Individual: {} file not found: {}'.format(individual_id, sample.file_path))
            if not options.get('dry_run'):
                guids_of_samples_with_missing_file.add(sample.guid)

    if len(guids_of_samples_with_missing_file) > 0:
        IgvSample.bulk_update(
            user=None,
            update_json={'file_path': ''},
            guid__in=guids_of_samples_with_missing_file)

    logger.info('---- DONE ----')
    logger.info('Checked {} samples'.format(len(samples)))
    if missing_counter:
        logger.info('{} files not found:'.format(sum(missing_counter.values())))
        for project_name, c in sorted(missing_counter.items(), key=lambda t: -t[1]):
            logger.info('  {} in {}'.format(c, project_name))

def _validate_vcf(vcf_path, sample_type=None, genome_version=None):
    if not vcf_path or not isinstance(vcf_path, str):
        raise ValueError("Invalid vcf_path arg: %(vcf_path)s" % locals())

    if not does_file_exist(vcf_path):
        raise ValueError("%(vcf_path)s not found" % locals())

    header_line = None
    for i, line in enumerate(file_iter(vcf_path)):
        if line.startswith("#CHROM"):
            header_line = line
            break
        if not line.startswith("#"):
            break
        if i > 20000:
            break  # there's no way the header is this long

    if not header_line:
        raise ValueError("Unexpected VCF header. #CHROM not found before line: " + line)

    # TODO if annotating using gcloud, check whether dataproc has access to file
    # TODO check header, sample_type, genome_version

    header_fields = header_line.strip().split('\t')
    sample_ids = header_fields[9:]

    return sample_ids

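# Usage sketch (illustrative, not from the source): the path below is a placeholder and the
# sample_type/genome_version values are assumed; _validate_vcf returns the sample ids taken
# from columns 10+ of the #CHROM header line.
#
#     sample_ids = _validate_vcf('gs://example-bucket/callset.vcf.gz', sample_type='WES', genome_version='37')
#     # e.g. sample_ids == ['SAMPLE-1', 'SAMPLE-2', ...]
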
def fetch_igv_track(request, project_guid, igv_track_path):
    get_project_and_check_permissions(project_guid, request.user)

    if igv_track_path.endswith('.bam.bai') and not does_file_exist(igv_track_path):
        igv_track_path = igv_track_path.replace('.bam.bai', '.bai')

    return _stream_file(request, igv_track_path)

def _validate_dataset_path(dataset_path):
    try:
        dataset_file = does_file_exist(dataset_path)
        if dataset_file is None:
            raise Exception('"{}" not found'.format(dataset_path))
        # check that dataset_path is accessible
        dataset_file_stats = get_file_stats(dataset_path)
        if dataset_file_stats is None:
            raise Exception('Unable to access "{}"'.format(dataset_path))
    except Exception as e:
        raise Exception("Dataset path error: " + str(e))

def _validate_dataset_path(dataset_path):
    try:
        dataset_file = file_utils.does_file_exist(dataset_path)
        if dataset_file is None:
            raise Exception('"{}" not found'.format(dataset_path))
        # check that dataset_path is accessible
        dataset_file_stats = file_utils.get_file_stats(dataset_path)
        if dataset_file_stats is None:
            raise Exception('Unable to access "{}"'.format(dataset_path))
    except Exception as e:
        raise Exception("Dataset path error: " + str(e))

def update_individual_igv_sample(request, individual_guid):
    individual = Individual.objects.get(guid=individual_guid)
    project = individual.family.project
    check_project_permissions(project, request.user, can_edit=True)

    request_json = json.loads(request.body)

    try:
        file_path = request_json.get('filePath')
        if not file_path:
            raise ValueError('request must contain fields: filePath')

        suffix = '.'.join(file_path.split('.')[1:])
        sample_type = SAMPLE_TYPE_MAP.get(suffix)
        if not sample_type:
            raise Exception('Invalid file extension for "{}" - valid extensions are {}'.format(
                file_path, ', '.join(SAMPLE_TYPE_MAP.keys())))

        if not does_file_exist(file_path):
            raise Exception('Error accessing "{}"'.format(file_path))

        sample, created = get_or_create_model_from_json(
            IgvSample,
            create_json={'individual': individual, 'sample_type': sample_type},
            update_json={'file_path': file_path, 'sample_id': request_json.get('sampleId')},
            user=request.user)

        response = {
            'igvSamplesByGuid': {
                sample.guid: get_json_for_sample(sample, individual_guid=individual_guid, project_guid=project.guid)
            },
        }
        if created:
            response['individualsByGuid'] = {
                individual.guid: {
                    'igvSampleGuids': [s.guid for s in individual.igvsample_set.all()]
                },
            }
        return create_json_response(response)
    except Exception as e:
        error = str(e)
        return create_json_response({'error': error}, status=400, reason=error)

def run_vep(self, input_vcf_path, output_vds_path):
    """Run VEP on the dataset. Assumes the dataproc cluster already exists."""
    if not does_file_exist(input_vcf_path):
        raise ValueError("%(input_vcf_path)s not found" % locals())

    script_path = os.path.join(BASE_DIR, "seqr/pipelines/hail/run_vep.py")
    script_args = [
        input_vcf_path,
        output_vds_path,
    ]

    self.hail_runner.run_hail(script_path, *script_args)

def validate_dataset(project, sample_type, analysis_type, genome_version, dataset_path,
                     max_edit_distance=0, dataset_id=None):
    """Validates the given dataset.

    Args:
        project (object):
        sample_type (string):
        analysis_type (string):
        genome_version (string):
        dataset_path (string):
        max_edit_distance (int):
        dataset_id (string):
    Return:
        (errors, warnings, info) tuple
    """
    #elasticsearch_host = options["elasticsearch_host"]
    #elasticsearch_index = options["elasticsearch_index"]
    #is_loaded = options["is_loaded"]

    # check args
    errors = []
    warnings = []
    info = []

    # basic file path checks
    if analysis_type == Dataset.ANALYSIS_TYPE_VARIANT_CALLS:
        if not dataset_path.endswith(".vcf.gz") and not dataset_path.endswith(".vds"):
            errors.append("Dataset path must end with .vcf.gz or .vds")
    elif analysis_type == Dataset.ANALYSIS_TYPE_ALIGNMENT:
        if not any([dataset_path.endswith(suffix) for suffix in ('.txt', '.tsv', '.xls', '.xlsx')]):
            errors.append("BAM / CRAM table must have a .txt or .xls extension")
    else:
        errors.append("dataset type not supported: %s" % (analysis_type, ))

    if errors:
        return errors, warnings, info

    # check that the dataset file exists and is accessible
    try:
        dataset_file = does_file_exist(dataset_path)
        if dataset_file is None:
            errors.append("Unable to access %s" % (dataset_path, ))
        else:
            dataset_file_stats = get_file_stats(dataset_path)
            if dataset_file_stats is None:
                errors.append("Unable to access %s" % (dataset_path, ))
    except Exception as e:
        errors.append("dataset path error: " + str(e))

    if errors:
        return errors, warnings, info

    # validate dataset contents
    if analysis_type == Dataset.ANALYSIS_TYPE_VARIANT_CALLS:
        # validate VCF and get sample ids
        try:
            sample_ids = _validate_vcf(dataset_path, sample_type=sample_type, genome_version=genome_version)
        except ValueError as e:
            errors.append(str(e))
            return errors, warnings, info

        matched_sample_id_to_sample_record = match_sample_ids_to_sample_records(
            project,
            sample_ids=sample_ids,
            sample_type=sample_type,
            max_edit_distance=max_edit_distance,
        )

        if len(matched_sample_id_to_sample_record) == 0:
            all_vcf_sample_id_count = len(sample_ids)
            all_project_sample_id_count = len(Sample.objects.filter(
                individual__family__project=project, sample_type=sample_type))
            errors.append(
                "None of the individuals or samples in the project matched the "
                "%(all_vcf_sample_id_count)s sample id(s) in the VCF" % locals())
            return errors, warnings, info

        # if a Dataset record exists, retrieve it and check whether it has already been loaded
        try:
            dataset = get_dataset(
                project=project,
                analysis_type=analysis_type,
                genome_version=genome_version,
                source_file_path=dataset_path,
                #elasticsearch_host=elasticsearch_host,
                #elasticsearch_index=elasticsearch_index,
                #is_loaded=is_loaded,
            )
        except ObjectDoesNotExist:
            logger.warning("No existing dataset found")
            return errors, warnings, info

        # check if all VCF samples loaded already - TODO update this?
        vcf_sample_ids = set(matched_sample_id_to_sample_record.keys())
        existing_sample_ids = set([s.sample_id for s in dataset.samples.all()])
        if dataset.is_loaded and len(vcf_sample_ids - existing_sample_ids) == 0:
            info.append("All %s samples in this VCF have already been loaded" % len(vcf_sample_ids))
            return errors, warnings, info
        elif not dataset.is_loaded:
            info.append("Dataset not loaded. Loading...")
        elif len(vcf_sample_ids - existing_sample_ids) != 0:
            info.append("Data will be loaded for these samples: %s" % (vcf_sample_ids - existing_sample_ids, ))

    return errors, warnings, info

def add_dataset(project, sample_type, analysis_type, genome_version, dataset_path,
                max_edit_distance=0, dataset_id=None, name=None, description=None,
                ignore_extra_samples_in_callset=False):
    """Validates and adds the given dataset.

    Args:
        project (object):
        sample_type (string):
        analysis_type (string):
        genome_version (string):
        dataset_path (string):
        max_edit_distance (int):
        dataset_id (string):
        ignore_extra_samples_in_callset (bool):
    Return:
        (errors, warnings, info) tuple
    """
    #elasticsearch_host = options["elasticsearch_host"]
    #elasticsearch_index = options["elasticsearch_index"]
    #is_loaded = options["is_loaded"]

    # check args
    errors = []
    warnings = []
    info = []

    # basic file path checks
    if analysis_type == Dataset.ANALYSIS_TYPE_VARIANT_CALLS:
        if not dataset_path.endswith(".vcf.gz") and not dataset_path.endswith(".vds"):
            errors.append("Dataset path must end with .vcf.gz or .vds")
    elif analysis_type == Dataset.ANALYSIS_TYPE_ALIGNMENT:
        if not any([dataset_path.endswith(suffix) for suffix in ('.txt', '.tsv', '.xls', '.xlsx')]):
            errors.append("BAM / CRAM table must have a .txt or .xls extension")
    else:
        errors.append("dataset type not supported: %s" % (analysis_type, ))

    if errors:
        return errors, warnings, info

    # check that the dataset file exists and is accessible
    try:
        dataset_file = does_file_exist(dataset_path)
        if dataset_file is None:
            errors.append("Unable to access %s" % (dataset_path, ))
        else:
            dataset_file_stats = get_file_stats(dataset_path)
            if dataset_file_stats is None:
                errors.append("Unable to access %s" % (dataset_path, ))
    except Exception as e:
        errors.append("dataset path error: " + str(e))

    if errors:
        return errors, warnings, info

    # validate dataset contents
    if analysis_type == Dataset.ANALYSIS_TYPE_VARIANT_CALLS:
        # validate VCF and get sample ids
        try:
            all_vcf_sample_ids = _validate_vcf(dataset_path, sample_type=sample_type, genome_version=genome_version)
        except ValueError as e:
            errors.append(str(e))
            return errors, warnings, info

        matched_sample_id_to_sample_record = match_sample_ids_to_sample_records(
            project,
            sample_ids=all_vcf_sample_ids,
            sample_type=sample_type,
            max_edit_distance=max_edit_distance,
            create_sample_records=True,
        )

        if not ignore_extra_samples_in_callset and len(matched_sample_id_to_sample_record) < len(all_vcf_sample_ids):
            errors.append(
                "Matches not found for VCF sample ids: " + ", ".join(
                    set(all_vcf_sample_ids) - set(matched_sample_id_to_sample_record.keys())) +
                ". Select the 'Ignore extra samples in callset' checkbox to ignore this.")

        if len(matched_sample_id_to_sample_record) == 0:
            all_vcf_sample_id_count = len(all_vcf_sample_ids)
            errors.append(
                "None of the individuals or samples in the project matched the "
                "%(all_vcf_sample_id_count)s sample id(s) in the VCF" % locals())

        if errors:
            return errors, warnings, info

        # retrieve or create Dataset record and link it to sample(s)
        dataset = get_or_create_elasticsearch_dataset(
            project=project,
            analysis_type=analysis_type,
            genome_version=genome_version,
            source_file_path=dataset_path,
            elasticsearch_index=dataset_id,
        )

        if dataset_id is not None:
            dataset.is_loaded = True
            dataset.loaded_date = timezone.now()

        dataset.name = name
        dataset.description = description
        dataset.save()

        link_dataset_to_sample_records(dataset, matched_sample_id_to_sample_record.values())

        # check if all VCF samples loaded already - TODO update this?
        vcf_sample_ids = set(matched_sample_id_to_sample_record.keys())
        existing_sample_ids = set([s.sample_id for s in dataset.samples.all()])
        if dataset.is_loaded and len(vcf_sample_ids - existing_sample_ids) == 0:
            info.append("All %s samples in this VCF have already been loaded" % len(vcf_sample_ids))
            return errors, warnings, info
        elif not dataset.is_loaded:
            info.append("Dataset not loaded. Loading...")
        elif len(vcf_sample_ids - existing_sample_ids) != 0:
            info.append("Data will be loaded for these samples: %s" % (vcf_sample_ids - existing_sample_ids, ))

    return errors, warnings, info

def create_project_from_workspace(request, namespace, name):
    """
    Create a project when a cooperator requests to load data from an AnVIL workspace.

    :param request: Django request object
    :param namespace: The namespace (or the billing account) of the workspace
    :param name: The name of the workspace. It will also be used as the project name
    :return the projectsByGuid with the new project json
    """
    # Validate that the current user has logged in through google and has sufficient permissions
    workspace_meta = check_workspace_perm(
        request.user, CAN_EDIT, namespace, name, can_share=True, meta_fields=['workspace.bucketName'])

    projects = Project.objects.filter(workspace_namespace=namespace, workspace_name=name)
    if projects:
        error = 'Project "{}" for workspace "{}/{}" exists.'.format(projects.first().name, namespace, name)
        return create_json_response({'error': error}, status=400, reason=error)

    # Validate all the user inputs from the post body
    request_json = json.loads(request.body)

    missing_fields = [
        field for field in ['genomeVersion', 'uploadedFileId', 'dataPath'] if not request_json.get(field)]
    if missing_fields:
        error = 'Field(s) "{}" are required'.format(', '.join(missing_fields))
        return create_json_response({'error': error}, status=400, reason=error)

    if not request_json.get('agreeSeqrAccess'):
        error = 'Must agree to grant seqr access to the data in the associated workspace.'
        return create_json_response({'error': error}, status=400, reason=error)

    # Add the seqr service account to the corresponding AnVIL workspace
    added_account_to_workspace = add_service_account(request.user, namespace, name)
    if added_account_to_workspace:
        _wait_for_service_account_access(request.user, namespace, name)

    # Validate the data path
    bucket_name = workspace_meta['workspace']['bucketName']
    data_path = 'gs://{bucket}/{path}'.format(bucket=bucket_name.rstrip('/'), path=request_json['dataPath'].lstrip('/'))
    if not does_file_exist(data_path):
        error = 'Data file or path {} is not found.'.format(request_json['dataPath'])
        return create_json_response({'error': error}, status=400, reason=error)

    # Parse families/individuals in the uploaded pedigree file
    json_records = load_uploaded_file(request_json['uploadedFileId'])
    pedigree_records, errors, ped_warnings = parse_pedigree_table(
        json_records, 'uploaded pedigree file', user=request.user)
    errors += ped_warnings
    if errors:
        return create_json_response({'errors': errors}, status=400)

    # Create a new Project in seqr
    project_args = {
        'name': name,
        'genome_version': request_json['genomeVersion'],
        'description': request_json.get('description', ''),
        'workspace_namespace': namespace,
        'workspace_name': name,
    }
    project = create_model_from_json(Project, project_args, user=request.user)

    # Add families and individuals according to the uploaded individual records
    _, updated_individuals = add_or_update_individuals_and_families(
        project, individual_records=pedigree_records, user=request.user
    )

    # Send an email to all seqr data managers
    try:
        _send_load_data_email(project, updated_individuals, data_path, request.user)
    except Exception as ee:
        message = 'Exception while sending email to user {}. {}'.format(request.user, str(ee))
        logger.error(message)

    return create_json_response({'projectGuid': project.guid})

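# Illustrative request body for create_project_from_workspace (values below are assumed
# placeholders; the field names come from the checks in the view above):
#
#     {
#         "genomeVersion": "38",
#         "uploadedFileId": "<id returned when the pedigree file was uploaded>",
#         "dataPath": "/callsets/my_callset.vcf.gz",
#         "agreeSeqrAccess": true,
#         "description": "optional project description"
#     }
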
def handle(self, *args, **options):
    analysis_type = Dataset.ANALYSIS_TYPE_VARIANT_CALLS

    # parse and validate args
    sample_type = options["sample_type"]
    genome_version = options["genome_version"]
    validate_only = options["validate_only"]
    remap_sample_ids = options["remap_sample_ids"]
    max_edit_distance = options["max_edit_distance_for_id_match"]
    pedigree_file_path = options["pedigree_file"]
    export_pedigree_file_template = options["export_pedigree_file_template"]
    project_guid = options["project_id"]
    vcf_path = options["vcf_path"]
    elasticsearch_index = options["elasticsearch_index"]
    is_loaded = options["is_loaded"]

    # look up project id and validate other args
    try:
        project = Project.objects.get(guid=project_guid)
    except ObjectDoesNotExist:
        raise CommandError("Invalid project id: %(project_guid)s" % locals())

    #if project.genome_version != genome_version:
    #    raise CommandError("Genome version %s doesn't match the project's genome version which is %s" % (genome_version, project.genome_version))

    if pedigree_file_path and not os.path.isfile(pedigree_file_path):
        raise CommandError("Can't open pedigree file: %(pedigree_file_path)s" % locals())

    # parse the pedigree file if specified
    if pedigree_file_path:
        input_stream = file_iter(pedigree_file_path)
        json_records, errors, warnings = parse_pedigree_table(pedigree_file_path, input_stream)

        if errors:
            for message in errors:
                logger.error(message)
            raise CommandError("Unable to parse %(pedigree_file_path)s" % locals())

        if warnings:
            for message in warnings:
                logger.warning(message)

        if not validate_only:
            add_or_update_individuals_and_families(project, json_records)

    # validate VCF and get sample ids
    vcf_sample_ids = _validate_vcf(vcf_path, sample_type=sample_type, genome_version=genome_version)

    if remap_sample_ids:
        if not does_file_exist(remap_sample_ids):
            raise ValueError("File not found: " + remap_sample_ids)

        id_mapping = {}
        for line in file_iter(remap_sample_ids):
            fields = line.strip().split("\t")
            if len(fields) != 2:
                raise ValueError("Must contain 2 columns: " + str(fields))
            id_mapping[fields[0]] = fields[1]

        remapped_vcf_sample_ids = []
        for sample_id in vcf_sample_ids:
            if sample_id in id_mapping:
                remapped_vcf_sample_ids.append(id_mapping[sample_id])
                print("Remapped %s to %s" % (sample_id, id_mapping[sample_id]))
            else:
                remapped_vcf_sample_ids.append(sample_id)
                print("No sample id mapping for %s" % sample_id)

        vcf_sample_ids = remapped_vcf_sample_ids

    vcf_sample_ids_to_sample_records = match_sample_ids_to_sample_records(
        project,
        sample_ids=vcf_sample_ids,
        sample_type=sample_type,
        max_edit_distance=max_edit_distance,
        create_sample_records=not validate_only,
    )

    if export_pedigree_file_template:
        with open(export_pedigree_file_template, "w") as out_f:
            out_f.write("#%s\n" % ("\t".join(
                ['family_id', 'individual_id', 'paternal_id', 'maternal_id', 'sex', 'affected_status'],)))
            for vcf_sample_id in vcf_sample_ids:
                if vcf_sample_id in vcf_sample_ids_to_sample_records:
                    continue

                family_id = individual_id = vcf_sample_id
                out_f.write("%s\n" % ("\t".join([family_id, individual_id, '', '', '', ''],)))
        logger.info("Wrote out %(export_pedigree_file_template)s. Exiting..." % locals())
        return

    if len(vcf_sample_ids_to_sample_records) == 0:
        all_vcf_sample_id_count = len(vcf_sample_ids)
        all_project_sample_id_count = len(Sample.objects.filter(
            individual__family__project=project, sample_type=sample_type))
        logger.info("None of the individuals or samples in the project matched the %(all_vcf_sample_id_count)s sample id(s) in the VCF" % locals())
        return

    # retrieve or create Dataset record and link it to sample(s)
    dataset = get_or_create_elasticsearch_dataset(
        project=project,
        analysis_type=analysis_type,
        genome_version=genome_version,
        source_file_path=vcf_path,
        elasticsearch_index=elasticsearch_index,
        is_loaded=is_loaded,
    )

    if is_loaded and not dataset.loaded_date:
        dataset.loaded_date = timezone.now()
        dataset.save()

    link_dataset_to_sample_records(dataset, vcf_sample_ids_to_sample_records.values())

    # check if all VCF samples loaded already
    vcf_sample_ids = set(vcf_sample_ids_to_sample_records.keys())
    existing_sample_ids = set([s.sample_id for s in dataset.samples.all()])
    if dataset.is_loaded and len(vcf_sample_ids - existing_sample_ids) == 0:
        logger.info("All %s samples in this VCF have already been loaded" % len(vcf_sample_ids))
        return
    elif not dataset.is_loaded:
        logger.info("Dataset not loaded. %s Loading..." % (is_loaded,))
    elif len(vcf_sample_ids - existing_sample_ids) != 0:
        logger.info("Dataset is loaded but these samples aren't included in the dataset: %s" % (
            vcf_sample_ids - existing_sample_ids, ))

    logger.info("done")

def validate_alignment_dataset_path(dataset_path):
    if not does_file_exist(dataset_path):
        raise Exception('Error accessing "{}"'.format(dataset_path))

def upload_qc_pipeline_output(request):
    file_path = json.loads(request.body)['file'].strip()
    if not does_file_exist(file_path, user=request.user):
        return create_json_response(
            {'errors': ['File not found: {}'.format(file_path)]}, status=400)

    raw_records = parse_file(file_path, file_iter(file_path, user=request.user))
    json_records = [dict(zip(raw_records[0], row)) for row in raw_records[1:]]

    try:
        dataset_type, data_type, records_by_sample_id = _parse_raw_qc_records(json_records)
    except ValueError as e:
        return create_json_response({'errors': [str(e)]}, status=400, reason=str(e))

    info_message = 'Parsed {} {} samples'.format(
        len(json_records), 'SV' if dataset_type == Sample.DATASET_TYPE_SV_CALLS else data_type)
    logger.info(info_message, request.user)
    info = [info_message]
    warnings = []

    samples = Sample.objects.filter(
        sample_id__in=records_by_sample_id.keys(),
        sample_type=Sample.SAMPLE_TYPE_WES if data_type == 'exome' else Sample.SAMPLE_TYPE_WGS,
        dataset_type=dataset_type,
    ).exclude(
        individual__family__project__name__in=EXCLUDE_PROJECTS
    ).exclude(individual__family__project__projectcategory__name=EXCLUDE_PROJECT_CATEGORY)

    sample_individuals = {
        agg['sample_id']: agg['individuals'] for agg in
        samples.values('sample_id').annotate(individuals=ArrayAgg('individual_id', distinct=True))
    }

    sample_individual_max_loaded_date = {
        agg['individual_id']: agg['max_loaded_date'] for agg in
        samples.values('individual_id').annotate(max_loaded_date=Max('loaded_date'))
    }
    individual_latest_sample_id = {
        s.individual_id: s.sample_id for s in samples
        if s.loaded_date == sample_individual_max_loaded_date.get(s.individual_id)
    }

    for sample_id, record in records_by_sample_id.items():
        record['individual_ids'] = list({
            individual_id for individual_id in sample_individuals.get(sample_id, [])
            if individual_latest_sample_id[individual_id] == sample_id
        })

    missing_sample_ids = {
        sample_id for sample_id, record in records_by_sample_id.items() if not record['individual_ids']
    }
    if missing_sample_ids:
        individuals = Individual.objects.filter(
            individual_id__in=missing_sample_ids
        ).exclude(
            family__project__name__in=EXCLUDE_PROJECTS
        ).exclude(
            family__project__projectcategory__name=EXCLUDE_PROJECT_CATEGORY
        ).filter(
            sample__sample_type=Sample.SAMPLE_TYPE_WES if data_type == 'exome' else Sample.SAMPLE_TYPE_WGS
        ).distinct()
        individual_db_ids_by_id = defaultdict(list)
        for individual in individuals:
            individual_db_ids_by_id[individual.individual_id].append(individual.id)
        for sample_id, record in records_by_sample_id.items():
            if not record['individual_ids'] and len(individual_db_ids_by_id[sample_id]) >= 1:
                record['individual_ids'] = individual_db_ids_by_id[sample_id]
                missing_sample_ids.remove(sample_id)

    multi_individual_samples = {
        sample_id: len(record['individual_ids']) for sample_id, record in records_by_sample_id.items()
        if len(record['individual_ids']) > 1
    }
    if multi_individual_samples:
        logger.warning('Found {} multi-individual samples from qc output'.format(
            len(multi_individual_samples)), request.user)
        warnings.append('The following {} samples were added to multiple individuals: {}'.format(
            len(multi_individual_samples), ', '.join(sorted([
                '{} ({})'.format(sample_id, count) for sample_id, count in multi_individual_samples.items()
            ]))))

    if missing_sample_ids:
        logger.warning('Missing {} samples from qc output'.format(len(missing_sample_ids)), request.user)
        warnings.append('The following {} samples were skipped: {}'.format(
            len(missing_sample_ids), ', '.join(sorted(list(missing_sample_ids)))))

    records_with_individuals = [
        record for sample_id, record in records_by_sample_id.items()
        if sample_id not in missing_sample_ids
    ]

    if dataset_type == Sample.DATASET_TYPE_SV_CALLS:
        _update_individuals_sv_qc(records_with_individuals, request.user)
    else:
        _update_individuals_variant_qc(records_with_individuals, data_type, warnings, request.user)

    message = 'Found and updated matching seqr individuals for {} samples'.format(
        len(json_records) - len(missing_sample_ids))
    info.append(message)

    return create_json_response({
        'errors': [],
        'warnings': warnings,
        'info': info,
    })