Example #1
    def handle(self, *args, **options):
        samples = (IgvSample.objects.filter(
            individual__family__project__name__in=args
        ) if args else IgvSample.objects.all()).filter(
            file_path__startswith='gs://'
        ).prefetch_related('individual', 'individual__family__project')

        missing_counter = collections.defaultdict(int)
        guids_of_samples_with_missing_file = set()
        for sample in tqdm.tqdm(samples, unit=" samples"):
            if not does_file_exist(sample.file_path):
                individual_id = sample.individual.individual_id
                project = sample.individual.family.project.name
                missing_counter[project] += 1
                logger.info('Individual: {}  file not found: {}'.format(individual_id, sample.file_path))
                if not options.get('dry_run'):
                    guids_of_samples_with_missing_file.add(sample.guid)

        if len(guids_of_samples_with_missing_file) > 0:
            IgvSample.bulk_update(user=None, update_json={'file_path': ''}, guid__in=guids_of_samples_with_missing_file)

        logger.info('---- DONE ----')
        logger.info('Checked {} samples'.format(len(samples)))
        if missing_counter:
            logger.info('{} files not found:'.format(sum(missing_counter.values())))
            for project_name, c in sorted(missing_counter.items(), key=lambda t: -t[1]):
                logger.info('   {} in {}'.format(c, project_name))
Example #2
def _validate_vcf(vcf_path, sample_type=None, genome_version=None):
    if not vcf_path or not isinstance(vcf_path, basestring):
        raise ValueError("Invalid vcf_path arg: %(vcf_path)s" % locals())

    if not does_file_exist(vcf_path):
        raise ValueError("%(vcf_path)s not found" % locals())

    header_line = None
    for i, line in enumerate(file_iter(vcf_path)):
        if line.startswith("#CHROM"):
            header_line = line
            break
        if not line.startswith("#"):
            break

        if i > 20000:
            break  # there's no way the header is this long

    if not header_line:
        raise ValueError(
            "Unexpected VCF header. #CHROM not found before line: " + line)

    # TODO if annotating using gcloud, check whether dataproc has access to file

    # TODO check header, sample_type, genome_version
    header_fields = header_line.strip().split('\t')
    sample_ids = header_fields[9:]

    return sample_ids
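
The header_fields[9:] slice above relies on the fixed VCF column layout: the nine fixed columns (#CHROM through FORMAT) occupy indices 0-8, and one genotype column per sample follows. A minimal standalone sketch, with hypothetical sample names, of what the slice returns:

# Minimal sketch (not part of the source): why header_fields[9:] yields the sample IDs.
header_line = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA12878\tNA12891\n"
header_fields = header_line.strip().split('\t')
assert header_fields[9:] == ['NA12878', 'NA12891']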
Example #3
def fetch_igv_track(request, project_guid, igv_track_path):

    get_project_and_check_permissions(project_guid, request.user)

    if igv_track_path.endswith(
            '.bam.bai') and not does_file_exist(igv_track_path):
        igv_track_path = igv_track_path.replace('.bam.bai', '.bai')

    return _stream_file(request, igv_track_path)
Example #4
def _validate_dataset_path(dataset_path):
    try:
        dataset_file = does_file_exist(dataset_path)
        if dataset_file is None:
            raise Exception('"{}" not found'.format(dataset_path))
        # check that dataset_path is accessible
        dataset_file_stats = get_file_stats(dataset_path)
        if dataset_file_stats is None:
            raise Exception('Unable to access "{}"'.format(dataset_path))
    except Exception as e:
        raise Exception("Dataset path error: " + str(e))
Example #5
def _validate_dataset_path(dataset_path):
    try:
        dataset_file = file_utils.does_file_exist(dataset_path)
        if dataset_file is None:
            raise Exception('"{}" not found'.format(dataset_path))
        # check that dataset_path is accessible
        dataset_file_stats = file_utils.get_file_stats(dataset_path)
        if dataset_file_stats is None:
            raise Exception('Unable to access "{}"'.format(dataset_path))
    except Exception as e:
        raise Exception("Dataset path error: " + str(e))
Example #6
def update_individual_igv_sample(request, individual_guid):
    individual = Individual.objects.get(guid=individual_guid)
    project = individual.family.project
    check_project_permissions(project, request.user, can_edit=True)

    request_json = json.loads(request.body)

    try:
        file_path = request_json.get('filePath')
        if not file_path:
            raise ValueError('request must contain fields: filePath')

        suffix = '.'.join(file_path.split('.')[1:])
        sample_type = SAMPLE_TYPE_MAP.get(suffix)
        if not sample_type:
            raise Exception(
                'Invalid file extension for "{}" - valid extensions are {}'.
                format(file_path, ', '.join(SAMPLE_TYPE_MAP.keys())))
        if not does_file_exist(file_path):
            raise Exception('Error accessing "{}"'.format(file_path))

        sample, created = get_or_create_model_from_json(
            IgvSample,
            create_json={
                'individual': individual,
                'sample_type': sample_type
            },
            update_json={
                'file_path': file_path,
                'sample_id': request_json.get('sampleId')
            },
            user=request.user)

        response = {
            'igvSamplesByGuid': {
                sample.guid:
                get_json_for_sample(sample,
                                    individual_guid=individual_guid,
                                    project_guid=project.guid)
            }
        }
        if created:
            response['individualsByGuid'] = {
                individual.guid: {
                    'igvSampleGuids':
                    [s.guid for s in individual.igvsample_set.all()]
                }
            }
        return create_json_response(response)
    except Exception as e:
        error = str(e)
        return create_json_response({'error': error}, status=400, reason=error)
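
The sample type above is inferred from everything after the first dot in the path and looked up in SAMPLE_TYPE_MAP. A rough sketch of that lookup with a stand-in map (the real SAMPLE_TYPE_MAP lives in seqr and its keys and values may differ):

# Hypothetical stand-in for SAMPLE_TYPE_MAP; the real mapping in seqr may differ.
SAMPLE_TYPE_MAP = {'cram': 'alignment', 'bam': 'alignment', 'bigWig': 'coverage'}

file_path = 'gs://my-bucket/sample_1.cram'
suffix = '.'.join(file_path.split('.')[1:])  # everything after the first dot ('cram' here);
                                             # a multi-part extension like 'bed.gz' would stay whole
print(SAMPLE_TYPE_MAP.get(suffix))           # -> 'alignment'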
Example #7
    def run_vep(self, input_vcf_path, output_vds_path):
        """Run VEP on the dataset. Assumes the dataproc cluster already exists."""

        if not does_file_exist(input_vcf_path):
            raise ValueError("%(input_vcf_path)s not foud" % locals())

        script_path = os.path.join(BASE_DIR, "seqr/pipelines/hail/run_vep.py")
        script_args = [
            input_vcf_path,
            output_vds_path,
        ]

        self.hail_runner.run_hail(script_path, *script_args)
Example #8
def validate_dataset(project,
                     sample_type,
                     analysis_type,
                     genome_version,
                     dataset_path,
                     max_edit_distance=0,
                     dataset_id=None):
    """Validates the given dataset.
    Args:
        project (object):
        sample_type (string):
        analysis_type (string):
        genome_version (string):
        dataset_path (string):
        max_edit_distance (int):
        dataset_id (string):
    Return:
        (errors, warnings, info) tuple

    Dataset.ANALYSIS_TYPE_VARIANT_CALLS
    """
    #elasticsearch_host = options["elasticsearch_host"]
    #elasticsearch_index = options["elasticsearch_index"]
    #is_loaded = options["is_loaded"]

    # check args
    errors = []
    warnings = []
    info = []

    # basic file path checks
    if analysis_type == Dataset.ANALYSIS_TYPE_VARIANT_CALLS:
        if not dataset_path.endswith(".vcf.gz") and not dataset_path.endswith(
                ".vds"):
            errors.append("Dataset path must end with .vcf.gz or .vds")
    elif analysis_type == Dataset.ANALYSIS_TYPE_ALIGNMENT:
        if not any([
                dataset_path.endswith(suffix)
                for suffix in ('.txt', '.tsv', '.xls', '.xlsx')
        ]):
            errors.append(
                "BAM / CRAM table must have a .txt, .tsv, .xls, or .xlsx extension")
    else:
        errors.append("dataset type not supported: %s" % (analysis_type, ))

    if errors:
        return errors, warnings, info

    # check that dataset file exists
    try:
        dataset_file = does_file_exist(dataset_path)
        if dataset_file is None:
            errors.append("Unable to access %s" % (dataset_path, ))
        else:
            # check that dataset_path is accessible
            dataset_file_stats = get_file_stats(dataset_path)
            if dataset_file_stats is None:
                errors.append("Unable to access %s" % (dataset_path, ))
    except Exception as e:
        errors.append("dataset path error: " + str(e))

    if errors:
        return errors, warnings, info

    # validate dataset contents
    if analysis_type == Dataset.ANALYSIS_TYPE_VARIANT_CALLS:
        # validate VCF and get sample ids
        try:
            sample_ids = _validate_vcf(dataset_path,
                                       sample_type=sample_type,
                                       genome_version=genome_version)
        except ValueError as e:
            errors.append(str(e))
            return errors, warnings, info

        matched_sample_id_to_sample_record = match_sample_ids_to_sample_records(
            project,
            sample_ids=sample_ids,
            sample_type=sample_type,
            max_edit_distance=max_edit_distance,
        )

        if len(matched_sample_id_to_sample_record) == 0:
            all_vcf_sample_id_count = len(sample_ids)
            all_project_sample_id_count = len(
                Sample.objects.filter(individual__family__project=project,
                                      sample_type=sample_type))
            errors.append(
                "None of the individuals or samples in the project matched the %(all_vcf_sample_id_count)s sample id(s) in the VCF"
                % locals())
            return errors, warnings, info

        # if Dataset record exists, retrieve it and check if it's already been loaded previously
        try:
            dataset = get_dataset(
                project=project,
                analysis_type=analysis_type,
                genome_version=genome_version,
                source_file_path=dataset_path,
                #elasticsearch_host=elasticsearch_host,
                #elasticsearch_index=elasticsearch_index,
                #is_loaded=is_loaded,
            )
        except ObjectDoesNotExist:
            logger.warning("No existing dataset found")
            return errors, warnings, info

        # check if all VCF samples loaded already - TODO update this?
        vcf_sample_ids = set(matched_sample_id_to_sample_record.keys())
        existing_sample_ids = set([s.sample_id for s in dataset.samples.all()])
        if dataset.is_loaded and len(vcf_sample_ids -
                                     existing_sample_ids) == 0:
            info.append("All %s samples in this VCF have already been loaded" %
                        len(vcf_sample_ids))
            return errors, warnings, info
        elif not dataset.is_loaded:
            info.append("Dataset not loaded. Loading...")
        elif len(vcf_sample_ids - existing_sample_ids) != 0:
            info.append("Data will be loaded for these samples: %s" %
                        (vcf_sample_ids - existing_sample_ids, ))

    return errors, warnings, info
Example #9
def add_dataset(project,
                sample_type,
                analysis_type,
                genome_version,
                dataset_path,
                max_edit_distance=0,
                dataset_id=None,
                name=None,
                description=None,
                ignore_extra_samples_in_callset=False):
    """Validates the given dataset.
    Args:
        project (object):
        sample_type (string):
        analysis_type (string):
        genome_version (string):
        dataset_path (string):
        max_edit_distance (int):
        dataset_id (string):
        ignore_extra_samples_in_callset (bool):
    Return:
        (errors, warnings, info) tuple

    Dataset.ANALYSIS_TYPE_VARIANT_CALLS
    """
    #elasticsearch_host = options["elasticsearch_host"]
    #elasticsearch_index = options["elasticsearch_index"]
    #is_loaded = options["is_loaded"]

    # check args
    errors = []
    warnings = []
    info = []

    # basic file path checks
    if analysis_type == Dataset.ANALYSIS_TYPE_VARIANT_CALLS:
        if not dataset_path.endswith(".vcf.gz") and not dataset_path.endswith(
                ".vds"):
            errors.append("Dataset path must end with .vcf.gz or .vds")
    elif analysis_type == Dataset.ANALYSIS_TYPE_ALIGNMENT:
        if not any([
                dataset_path.endswith(suffix)
                for suffix in ('.txt', '.tsv', '.xls', '.xlsx')
        ]):
            errors.append(
                "BAM / CRAM table must have a .txt, .tsv, .xls, or .xlsx extension")
    else:
        errors.append("dataset type not supported: %s" % (analysis_type, ))

    if errors:
        return errors, warnings, info

    # check that dataset file exists
    try:
        dataset_file = does_file_exist(dataset_path)
        if dataset_file is None:
            errors.append("Unable to access %s" % (dataset_path, ))
        else:
            # check that dataset_path is accessible
            dataset_file_stats = get_file_stats(dataset_path)
            if dataset_file_stats is None:
                errors.append("Unable to access %s" % (dataset_path, ))
    except Exception as e:
        errors.append("dataset path error: " + str(e))

    if errors:
        return errors, warnings, info

    # validate dataset contents
    if analysis_type == Dataset.ANALYSIS_TYPE_VARIANT_CALLS:
        # validate VCF and get sample ids
        try:
            all_vcf_sample_ids = _validate_vcf(dataset_path,
                                               sample_type=sample_type,
                                               genome_version=genome_version)
        except ValueError as e:
            errors.append(str(e))
            return errors, warnings, info

        matched_sample_id_to_sample_record = match_sample_ids_to_sample_records(
            project,
            sample_ids=all_vcf_sample_ids,
            sample_type=sample_type,
            max_edit_distance=max_edit_distance,
            create_sample_records=True,
        )

        if not ignore_extra_samples_in_callset and len(
                matched_sample_id_to_sample_record) < len(all_vcf_sample_ids):
            errors.append(
                "Matches not found for VCF sample ids: " + ", ".join(
                    set(all_vcf_sample_ids) -
                    set(matched_sample_id_to_sample_record.keys())) +
                ". Select the 'Ignore extra samples in callset' checkbox to ignore this."
            )

        if len(matched_sample_id_to_sample_record) == 0:
            all_vcf_sample_id_count = len(all_vcf_sample_ids)
            errors.append(
                "None of the individuals or samples in the project matched the %(all_vcf_sample_id_count)s sample id(s) in the VCF"
                % locals())
            return errors, warnings, info

        # retrieve or create Dataset record and link it to sample(s)
        dataset = get_or_create_elasticsearch_dataset(
            project=project,
            analysis_type=analysis_type,
            genome_version=genome_version,
            source_file_path=dataset_path,
            elasticsearch_index=dataset_id,
        )

        if dataset_id is not None:
            dataset.is_loaded = True
            dataset.loaded_date = timezone.now()

        dataset.name = name
        dataset.description = description
        dataset.save()

        link_dataset_to_sample_records(
            dataset, matched_sample_id_to_sample_record.values())

        # check if all VCF samples loaded already - TODO update this?
        vcf_sample_ids = set(matched_sample_id_to_sample_record.keys())
        existing_sample_ids = set([s.sample_id for s in dataset.samples.all()])
        if dataset.is_loaded and len(vcf_sample_ids -
                                     existing_sample_ids) == 0:
            info.append("All %s samples in this VCF have already been loaded" %
                        len(vcf_sample_ids))
            return errors, warnings, info
        elif not dataset.is_loaded:
            info.append("Dataset not loaded. Loading...")
        elif len(vcf_sample_ids - existing_sample_ids) != 0:
            info.append("Data will be loaded for these samples: %s" %
                        (vcf_sample_ids - existing_sample_ids, ))

    return errors, warnings, info
Example #10
def create_project_from_workspace(request, namespace, name):
    """
    Create a project when a cooperator requests to load data from an AnVIL workspace.

    :param request: Django request object
    :param namespace: The namespace (or the billing account) of the workspace
    :param name: The name of the workspace. It is also used as the project name
    :return the projectsByGuid with the new project json

    """
    # Validate that the current user has logged in through google and has sufficient permissions
    workspace_meta = check_workspace_perm(request.user, CAN_EDIT, namespace, name, can_share=True, meta_fields=['workspace.bucketName'])

    projects = Project.objects.filter(workspace_namespace=namespace, workspace_name=name)
    if projects:
        error = 'Project "{}" for workspace "{}/{}" exists.'.format(projects.first().name, namespace, name)
        return create_json_response({'error': error}, status=400, reason=error)

    # Validate all the user inputs from the post body
    request_json = json.loads(request.body)

    missing_fields = [field for field in ['genomeVersion', 'uploadedFileId', 'dataPath'] if not request_json.get(field)]
    if missing_fields:
        error = 'Field(s) "{}" are required'.format(', '.join(missing_fields))
        return create_json_response({'error': error}, status=400, reason=error)

    if not request_json.get('agreeSeqrAccess'):
        error = 'Must agree to grant seqr access to the data in the associated workspace.'
        return create_json_response({'error': error}, status=400, reason=error)

    # Add the seqr service account to the corresponding AnVIL workspace
    added_account_to_workspace = add_service_account(request.user, namespace, name)
    if added_account_to_workspace:
        _wait_for_service_account_access(request.user, namespace, name)

    # Validate the data path
    bucket_name = workspace_meta['workspace']['bucketName']
    data_path = 'gs://{bucket}/{path}'.format(bucket=bucket_name.rstrip('/'), path=request_json['dataPath'].lstrip('/'))
    if not does_file_exist(data_path):
        error = 'Data file or path {} is not found.'.format(request_json['dataPath'])
        return create_json_response({'error': error}, status=400, reason=error)

    # Parse families/individuals in the uploaded pedigree file
    json_records = load_uploaded_file(request_json['uploadedFileId'])
    pedigree_records, errors, ped_warnings = parse_pedigree_table(json_records, 'uploaded pedigree file', user=request.user)
    errors += ped_warnings
    if errors:
        return create_json_response({'errors': errors}, status=400)

    # Create a new Project in seqr
    project_args = {
        'name': name,
        'genome_version': request_json['genomeVersion'],
        'description': request_json.get('description', ''),
        'workspace_namespace': namespace,
        'workspace_name': name,
    }

    project = create_model_from_json(Project, project_args, user=request.user)

    # add families and individuals according to the uploaded individual records
    _, updated_individuals = add_or_update_individuals_and_families(
        project, individual_records=pedigree_records, user=request.user
    )

    # Send an email to all seqr data managers
    try:
        _send_load_data_email(project, updated_individuals, data_path, request.user)
    except Exception as ee:
        message = 'Exception while sending email to user {}. {}'.format(request.user, str(ee))
        logger.error(message)

    return create_json_response({'projectGuid':  project.guid})
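
The data_path built above normalizes slashes before the does_file_exist check. A small sketch of that join with made-up bucket and path values:

# Sketch of the gs:// path join above (bucket name and path are made up).
# rstrip/lstrip avoid doubled or missing slashes at the join point.
bucket_name = 'fc-secure-example-bucket/'
uploaded_data_path = '/callsets/final.vcf.gz'
data_path = 'gs://{bucket}/{path}'.format(
    bucket=bucket_name.rstrip('/'), path=uploaded_data_path.lstrip('/'))
assert data_path == 'gs://fc-secure-example-bucket/callsets/final.vcf.gz'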
Example #11
File: add_vcf.py  Project: tianyunwang/seqr
    def handle(self, *args, **options):

        analysis_type = Dataset.ANALYSIS_TYPE_VARIANT_CALLS

        # parse and validate args
        sample_type = options["sample_type"]
        genome_version = options["genome_version"]
        validate_only = options["validate_only"]
        remap_sample_ids = options["remap_sample_ids"]
        max_edit_distance = options["max_edit_distance_for_id_match"]
        pedigree_file_path = options["pedigree_file"]
        export_pedigree_file_template = options["export_pedigree_file_template"]
        project_guid = options["project_id"]
        vcf_path = options["vcf_path"]
        elasticsearch_index = options["elasticsearch_index"]
        is_loaded = options["is_loaded"]

        # look up project id and validate other args
        try:
            project = Project.objects.get(guid=project_guid)
        except ObjectDoesNotExist:
            raise CommandError("Invalid project id: %(project_guid)s" % locals())

        #if project.genome_version != genome_version:
        #    raise CommandError("Genome version %s doesn't match the project's genome version which is %s" % (genome_version, project.genome_version))

        if pedigree_file_path and not os.path.isfile(pedigree_file_path):
            raise CommandError("Can't open pedigree file: %(pedigree_file_path)s" % locals())

        # parse the pedigree file if specified
        if pedigree_file_path:

            input_stream = file_iter(pedigree_file_path)
            json_records, errors, warnings = parse_pedigree_table(pedigree_file_path, input_stream)

            if errors:
                for message in errors:
                    logger.error(message)
                raise CommandError("Unable to parse %(pedigree_file_path)s" % locals())

            if warnings:
                for message in warnings:
                    logger.warn(message)

            if not validate_only:
                add_or_update_individuals_and_families(project, json_records)

        # validate VCF and get sample ids
        vcf_sample_ids = _validate_vcf(vcf_path, sample_type=sample_type, genome_version=genome_version)

        if remap_sample_ids:
            if not does_file_exist(remap_sample_ids):
                raise ValueError("File not found: " + remap_sample_ids)

            id_mapping = {}
            for line in file_iter(remap_sample_ids):
                fields = line.strip().split("\t")
                if len(fields) != 2:
                    raise ValueError("Must contain 2 columns: " + str(fields))
                id_mapping[fields[0]] = fields[1]

            remapped_vcf_sample_ids = []
            for sample_id in vcf_sample_ids:
                if sample_id in id_mapping:
                    remapped_vcf_sample_ids.append(id_mapping[sample_id])
                    print("Remapped %s to %s" % (sample_id, id_mapping[sample_id]))
                else:
                    remapped_vcf_sample_ids.append(sample_id)
                    print("No sample id mapping for %s" % sample_id)
                    
            vcf_sample_ids = remapped_vcf_sample_ids

        vcf_sample_ids_to_sample_records = match_sample_ids_to_sample_records(
            project,
            sample_ids=vcf_sample_ids,
            sample_type=sample_type,
            max_edit_distance=max_edit_distance,
            create_sample_records=not validate_only,
        )

        if export_pedigree_file_template:
            with open(export_pedigree_file_template, "w") as out_f:
                out_f.write("#%s\n" % ("\t".join(['family_id', 'individual_id', 'paternal_id', 'maternal_id', 'sex', 'affected_status'],)))
                for vcf_sample_id in vcf_sample_ids:
                    if vcf_sample_id in vcf_sample_ids_to_sample_records:
                        continue

                    family_id = individual_id = vcf_sample_id
                    out_f.write("%s\n" % ("\t".join([family_id, individual_id, '', '', '', ''],)))
            logger.info("Wrote out %(export_pedigree_file_template)s. Exiting..." % locals())
            return

        if len(vcf_sample_ids_to_sample_records) == 0:
            all_vcf_sample_id_count = len(vcf_sample_ids)
            all_project_sample_id_count = len(Sample.objects.filter(individual__family__project=project, sample_type=sample_type))
            logger.info("None of the individuals or samples in the project matched the %(all_vcf_sample_id_count)s sample id(s) in the VCF" % locals())
            return

        # retrieve or create Dataset record and link it to sample(s)
        dataset = get_or_create_elasticsearch_dataset(
            project=project,
            analysis_type=analysis_type,
            genome_version=genome_version,
            source_file_path=vcf_path,
            elasticsearch_index=elasticsearch_index,
            is_loaded=is_loaded,
        )

        if is_loaded and not dataset.loaded_date:
            dataset.loaded_date=timezone.now()
            dataset.save()

        link_dataset_to_sample_records(dataset, vcf_sample_ids_to_sample_records.values())

        # check if all VCF samples loaded already
        vcf_sample_ids = set(vcf_sample_ids_to_sample_records.keys())
        existing_sample_ids = set([s.sample_id for s in dataset.samples.all()])
        if dataset.is_loaded and len(vcf_sample_ids - existing_sample_ids) == 0:
            logger.info("All %s samples in this VCF have already been loaded" % len(vcf_sample_ids))
            return
        elif not dataset.is_loaded:
            logger.info("Dataset not loaded. %s Loading..." % (is_loaded,))
        elif len(vcf_sample_ids - existing_sample_ids) != 0:
            logger.info("Dataset is loaded but these samples aren't included in the dataset: %s" % (vcf_sample_ids - existing_sample_ids, ))

        logger.info("done")
Example #12
def validate_alignment_dataset_path(dataset_path):
    if not does_file_exist(dataset_path):
        raise Exception('Error accessing "{}"'.format(dataset_path))
Example #13
def upload_qc_pipeline_output(request):
    file_path = json.loads(request.body)['file'].strip()
    if not does_file_exist(file_path, user=request.user):
        return create_json_response(
            {'errors': ['File not found: {}'.format(file_path)]}, status=400)
    raw_records = parse_file(file_path, file_iter(file_path,
                                                  user=request.user))

    json_records = [dict(zip(raw_records[0], row)) for row in raw_records[1:]]

    try:
        dataset_type, data_type, records_by_sample_id = _parse_raw_qc_records(
            json_records)
    except ValueError as e:
        return create_json_response({'errors': [str(e)]},
                                    status=400,
                                    reason=str(e))

    info_message = 'Parsed {} {} samples'.format(
        len(json_records),
        'SV' if dataset_type == Sample.DATASET_TYPE_SV_CALLS else data_type)
    logger.info(info_message, request.user)
    info = [info_message]
    warnings = []

    samples = Sample.objects.filter(
        sample_id__in=records_by_sample_id.keys(),
        sample_type=Sample.SAMPLE_TYPE_WES
        if data_type == 'exome' else Sample.SAMPLE_TYPE_WGS,
        dataset_type=dataset_type,
    ).exclude(individual__family__project__name__in=EXCLUDE_PROJECTS).exclude(
        individual__family__project__projectcategory__name=
        EXCLUDE_PROJECT_CATEGORY)

    sample_individuals = {
        agg['sample_id']: agg['individuals']
        for agg in samples.values('sample_id').annotate(
            individuals=ArrayAgg('individual_id', distinct=True))
    }

    sample_individual_max_loaded_date = {
        agg['individual_id']: agg['max_loaded_date']
        for agg in samples.values('individual_id').annotate(
            max_loaded_date=Max('loaded_date'))
    }
    individual_latest_sample_id = {
        s.individual_id: s.sample_id
        for s in samples if s.loaded_date ==
        sample_individual_max_loaded_date.get(s.individual_id)
    }

    for sample_id, record in records_by_sample_id.items():
        record['individual_ids'] = list({
            individual_id
            for individual_id in sample_individuals.get(sample_id, [])
            if individual_latest_sample_id[individual_id] == sample_id
        })

    missing_sample_ids = {
        sample_id
        for sample_id, record in records_by_sample_id.items()
        if not record['individual_ids']
    }
    if missing_sample_ids:
        individuals = Individual.objects.filter(
            individual_id__in=missing_sample_ids
        ).exclude(family__project__name__in=EXCLUDE_PROJECTS).exclude(
            family__project__projectcategory__name=EXCLUDE_PROJECT_CATEGORY
        ).filter(sample__sample_type=Sample.SAMPLE_TYPE_WES if data_type ==
                 'exome' else Sample.SAMPLE_TYPE_WGS).distinct()
        individual_db_ids_by_id = defaultdict(list)
        for individual in individuals:
            individual_db_ids_by_id[individual.individual_id].append(
                individual.id)
        for sample_id, record in records_by_sample_id.items():
            if not record['individual_ids'] and len(
                    individual_db_ids_by_id[sample_id]) >= 1:
                record['individual_ids'] = individual_db_ids_by_id[sample_id]
                missing_sample_ids.remove(sample_id)

    multi_individual_samples = {
        sample_id: len(record['individual_ids'])
        for sample_id, record in records_by_sample_id.items()
        if len(record['individual_ids']) > 1
    }
    if multi_individual_samples:
        logger.warning(
            'Found {} multi-individual samples from qc output'.format(
                len(multi_individual_samples)), request.user)
        warnings.append(
            'The following {} samples were added to multiple individuals: {}'.
            format(
                len(multi_individual_samples), ', '.join(
                    sorted([
                        '{} ({})'.format(sample_id, count) for sample_id, count
                        in multi_individual_samples.items()
                    ]))))

    if missing_sample_ids:
        logger.warning(
            'Missing {} samples from qc output'.format(
                len(missing_sample_ids)), request.user)
        warnings.append('The following {} samples were skipped: {}'.format(
            len(missing_sample_ids),
            ', '.join(sorted(list(missing_sample_ids)))))

    records_with_individuals = [
        record for sample_id, record in records_by_sample_id.items()
        if sample_id not in missing_sample_ids
    ]

    if dataset_type == Sample.DATASET_TYPE_SV_CALLS:
        _update_individuals_sv_qc(records_with_individuals, request.user)
    else:
        _update_individuals_variant_qc(records_with_individuals, data_type,
                                       warnings, request.user)

    message = 'Found and updated matching seqr individuals for {} samples'.format(
        len(json_records) - len(missing_sample_ids))
    info.append(message)

    return create_json_response({
        'errors': [],
        'warnings': warnings,
        'info': info,
    })
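
The Max('loaded_date') aggregation above keeps a sample id for an individual only when it belongs to that individual's most recently loaded sample. A synthetic-data sketch (not from the source) of the same selection in plain Python:

# Synthetic-data sketch of the "latest sample per individual" selection used above.
from collections import namedtuple
from datetime import date

S = namedtuple('S', 'individual_id sample_id loaded_date')
samples = [
    S('I1', 'SM-OLD', date(2020, 1, 1)),
    S('I1', 'SM-NEW', date(2021, 6, 1)),
    S('I2', 'SM-X', date(2021, 3, 1)),
]

# per-individual max loaded_date (what the Max('loaded_date') annotation computes)
max_loaded_date = {}
for s in samples:
    if s.individual_id not in max_loaded_date or s.loaded_date > max_loaded_date[s.individual_id]:
        max_loaded_date[s.individual_id] = s.loaded_date

# keep a sample id only if it is the individual's most recently loaded sample
individual_latest_sample_id = {
    s.individual_id: s.sample_id
    for s in samples if s.loaded_date == max_loaded_date[s.individual_id]
}
assert individual_latest_sample_id == {'I1': 'SM-NEW', 'I2': 'SM-X'}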