Example #1
def _stream_file(request, path):
    # based on https://gist.github.com/dcwatson/cb5d8157a8fa5a4a046e
    content_type = 'application/octet-stream'
    range_header = request.META.get('HTTP_RANGE', None)
    if range_header:
        range_match = re.compile(r'bytes\s*=\s*(\d+)\s*-\s*(\d*)',
                                 re.I).match(range_header)
        first_byte, last_byte = range_match.groups()
        first_byte = int(first_byte) if first_byte else 0
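        # NOTE: a malformed header leaves range_match as None, and an
        # open-ended range like "bytes=100-" leaves last_byte empty, so
        # int(last_byte) below would raise; serving such ranges needs the
        # total file size, which this view never looks up.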
        last_byte = int(last_byte)
        length = last_byte - first_byte + 1
        resp = StreamingHttpResponse(file_iter(path,
                                               byte_range=(first_byte,
                                                           last_byte),
                                               raw_content=True,
                                               user=request.user),
                                     status=206,
                                     content_type=content_type)
        resp['Content-Length'] = str(length)
        # "/*" per RFC 7233: the complete length is not known here
        resp['Content-Range'] = 'bytes %s-%s/*' % (first_byte, last_byte)
    else:
        resp = StreamingHttpResponse(file_iter(path,
                                               raw_content=True,
                                               user=request.user),
                                     content_type=content_type)
    resp['Accept-Ranges'] = 'bytes'
    return resp
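
`file_iter` itself is not shown in any of these excerpts. As a point of reference only, a minimal sketch of such a helper for plain local files might look like the code below; the `byte_range`, `raw_content`, and `user` parameters are inferred from the call sites above, and the real seqr helper presumably also handles remote paths and access checks:

def file_iter(file_path, byte_range=None, raw_content=False, user=None):
    # Minimal sketch (an assumption, not the real seqr implementation):
    # yield raw byte chunks or text lines from a local file.
    mode = 'rb' if raw_content else 'r'
    with open(file_path, mode) as f:
        if byte_range:
            first_byte, last_byte = byte_range  # inclusive bounds
            f.seek(first_byte)
            remaining = last_byte - first_byte + 1
            while remaining > 0:
                chunk = f.read(min(8192, remaining))
                if not chunk:
                    break
                remaining -= len(chunk)
                yield chunk
        elif raw_content:
            for chunk in iter(lambda: f.read(8192), b''):
                yield chunk
        else:
            for line in f:
                yield line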
Example #2
def _validate_vcf(vcf_path, sample_type=None, genome_version=None):
    if not vcf_path or not isinstance(vcf_path, basestring):
        raise ValueError("Invalid vcf_path arg: %(vcf_path)s" % locals())

    if not does_file_exist(vcf_path):
        raise ValueError("%(vcf_path)s not found" % locals())

    header_line = None
    line = None  # guards the error message below if the file is empty
    for i, line in enumerate(file_iter(vcf_path)):
        if line.startswith("#CHROM"):
            header_line = line
            break
        if line.startswith("#"):
            continue
        else:
            break

        if i > 20000:
            break  # there's no way header is this long

    if not header_line:
        raise ValueError(
            "Unexpected VCF header. #CHROM not found before line: {}".format(line))

    # TODO if annotating using gcloud, check whether dataproc has access to file

    # TODO check header, sample_type, genome_version
    header_fields = header_line.strip().split('\t')
    sample_ids = header_fields[9:]

    return sample_ids
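
The `header_fields[9:]` slice works because the VCF spec fixes nine mandatory columns (`#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT`) before the per-sample genotype columns. A small self-contained illustration (the sample names are made up):

header_line = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA12878\tNA12891\tNA12892\n"
header_fields = header_line.strip().split('\t')
assert header_fields[9:] == ['NA12878', 'NA12891', 'NA12892']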
Example #3
    def handle(self, *args, **options):

        # parse and validate args
        validate_only = options["validate_only"]
        project_guid = options["project_id"]
        pedigree_file_path = options["pedigree_file"]

        # look up project id and validate other args
        try:
            project = Project.objects.get(guid=project_guid)
        except ObjectDoesNotExist:
            raise CommandError("Invalid project id: %(project_guid)s" %
                               locals())

        if pedigree_file_path and not os.path.isfile(pedigree_file_path):
            raise CommandError("Can't open pedigree file: %(pedigree_file_path)s" %
                               locals())

        # parse the pedigree file if specified
        input_stream = file_iter(pedigree_file_path)
        json_records, errors, warnings = parse_pedigree_table(
            pedigree_file_path, input_stream)

        if errors:
            for message in errors:
                logger.error(message)
            raise CommandError("Unable to parse %(pedigree_file_path)s" %
                               locals())

        if warnings:
            for message in warnings:
                logger.warning(message)

        if not validate_only:
            add_or_update_individuals_and_families(project, json_records)
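
Assuming this `handle` belongs to a standard Django management command (the command name below is hypothetical; the real name comes from the module's file name), it could be exercised from code or tests via `call_command`:

from django.core.management import call_command

call_command(
    'add_individuals',               # hypothetical command name
    project_id='R0001_my_project',   # made-up project GUID
    pedigree_file='/path/to/pedigree.tsv',
    validate_only=True,
)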
Example #4
def _load_mapping_file(mapping_file_id, mapping_file_path):
    id_mapping = {}
    file_content = []
    if mapping_file_id:
        file_content = load_uploaded_file(mapping_file_id)
    elif mapping_file_path:
        file_content = parse_file(mapping_file_path,
                                  file_iter(mapping_file_path))
    for line in file_content:
        if len(line) != 2:
            raise ValueError("Must contain 2 columns: " + ', '.join(line))
        id_mapping[line[0]] = line[1]
    return id_mapping
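
For illustration, the two-column file this function expects maps old ids to new ids, one pair per row; `parse_file` presumably yields each row as a list of fields, so the loop above behaves like this sketch:

file_content = [['SAMPLE-001', 'NA12878'], ['SAMPLE-002', 'NA12891']]  # made-up rows
id_mapping = {line[0]: line[1] for line in file_content}
assert id_mapping == {'SAMPLE-001': 'NA12878', 'SAMPLE-002': 'NA12891'}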
Example #5
def add_individuals_from_pedigree_file(project, pedigree_file_path, validate_only=False):
    if pedigree_file_path and not os.path.isfile(pedigree_file_path):
        raise CommandError("Can't open pedigree file: %(pedigree_file_path)s" % locals())

    # parse the pedigree file if specified
    input_stream = file_iter(pedigree_file_path)
    json_records, errors, warnings = parse_pedigree_table(pedigree_file_path, input_stream)

    if errors:
        for message in errors:
            logger.error(message)
        raise CommandError("Unable to parse %(pedigree_file_path)s" % locals())

    if warnings:
        for message in warnings:
            logger.warning(message)

    if not validate_only:
        add_or_update_individuals_and_families(project, json_records)
Example #6
def _validate_vcf(vcf_path):
    header_line = None
    line = None  # guards the error message below if the file is empty
    for line in file_iter(vcf_path):
        if line.startswith("#CHROM"):
            header_line = line
            break
        if line.startswith("#"):
            continue
        else:
            break

    if not header_line:
        raise Exception("Unexpected VCF header. #CHROM not found before line: {}".format(line))

    header_fields = header_line.strip().split('\t')
    sample_ids = header_fields[9:]

    if not sample_ids:
        raise Exception('No samples found in VCF "{}"'.format(vcf_path))
Example #7
def _validate_vcf(vcf_path):
    header_line = None
    line = None  # guards the error message below if the file is empty
    for line in file_utils.file_iter(vcf_path):
        if line.startswith("#CHROM"):
            header_line = line
            break
        if line.startswith("#"):
            continue
        else:
            break

    if not header_line:
        raise Exception("Unexpected VCF header. #CHROM not found before line: {}".format(line))

    header_fields = header_line.strip().split('\t')
    sample_ids = header_fields[9:]

    if not sample_ids:
        raise Exception('No samples found in VCF "{}"'.format(vcf_path))
Example #8
def upload_qc_pipeline_output(request):
    file_path = json.loads(request.body)['file']
    raw_records = parse_file(file_path, file_iter(file_path))

    json_records = [dict(zip(raw_records[0], row)) for row in raw_records[1:]]

    missing_columns = [
        field for field in [
            'seqr_id', 'data_type', 'filter_flags', 'qc_metrics_filters',
            'qc_pop'
        ] if field not in json_records[0]
    ]
    if missing_columns:
        message = 'The following required columns are missing: {}'.format(
            ', '.join(missing_columns))
        return create_json_response({'errors': [message]},
                                    status=400,
                                    reason=message)

    dataset_types = {
        record['data_type'].lower()
        for record in json_records if record['data_type'].lower() != 'n/a'
    }
    if len(dataset_types) == 0:
        message = 'No dataset type detected'
        return create_json_response({'errors': [message]},
                                    status=400,
                                    reason=message)
    elif len(dataset_types) > 1:
        message = 'Multiple dataset types detected: {}'.format(
            ', '.join(dataset_types))
        return create_json_response({'errors': [message]},
                                    status=400,
                                    reason=message)
    elif list(dataset_types)[0] not in DATASET_TYPE_MAP:
        message = 'Unexpected dataset type detected: "{}" (should be "exome" or "genome")'.format(
            list(dataset_types)[0])
        return create_json_response({'errors': [message]},
                                    status=400,
                                    reason=message)

    dataset_type = DATASET_TYPE_MAP[list(dataset_types)[0]]

    info_message = 'Parsed {} {} samples'.format(len(json_records),
                                                 dataset_type)
    logger.info(info_message)
    info = [info_message]
    warnings = []

    sample_ids = {record['seqr_id'] for record in json_records}
    samples = Sample.objects.filter(
        sample_id__in=sample_ids,
        sample_type=Sample.SAMPLE_TYPE_WES
        if dataset_type == 'exome' else Sample.SAMPLE_TYPE_WGS,
    ).exclude(individual__family__project__name__in=EXCLUDE_PROJECTS).exclude(
        individual__family__project__projectcategory__name=
        EXCLUDE_PROJECT_CATEGORY)

    sample_individuals = {
        agg['sample_id']: agg['individuals']
        for agg in samples.values('sample_id').annotate(
            individuals=ArrayAgg('individual_id', distinct=True))
    }

    sample_individual_max_loaded_date = {
        agg['individual_id']: agg['max_loaded_date']
        for agg in samples.values('individual_id').annotate(
            max_loaded_date=Max('loaded_date'))
    }
    individual_latest_sample_id = {
        s.individual_id: s.sample_id
        for s in samples if s.loaded_date ==
        sample_individual_max_loaded_date.get(s.individual_id)
    }

    for record in json_records:
        record['individual_ids'] = list({
            individual_id
            for individual_id in sample_individuals.get(record['seqr_id'], [])
            if individual_latest_sample_id[individual_id] == record['seqr_id']
        })

    missing_sample_ids = {
        record['seqr_id']
        for record in json_records if not record['individual_ids']
    }
    if missing_sample_ids:
        individuals = Individual.objects.filter(
            individual_id__in=missing_sample_ids
        ).exclude(family__project__name__in=EXCLUDE_PROJECTS).exclude(
            family__project__projectcategory__name=EXCLUDE_PROJECT_CATEGORY
        ).exclude(sample__sample_type=Sample.SAMPLE_TYPE_WGS if dataset_type ==
                  'exome' else Sample.SAMPLE_TYPE_WES)
        individual_db_ids_by_id = defaultdict(list)
        for individual in individuals:
            individual_db_ids_by_id[individual.individual_id].append(
                individual.id)
        for record in json_records:
            if not record['individual_ids'] and len(
                    individual_db_ids_by_id[record['seqr_id']]) == 1:
                record['individual_ids'] = individual_db_ids_by_id[
                    record['seqr_id']]
                missing_sample_ids.remove(record['seqr_id'])

    multi_individual_samples = {
        record['seqr_id']: len(record['individual_ids'])
        for record in json_records if len(record['individual_ids']) > 1
    }
    if multi_individual_samples:
        logger.info('Found {} multi-individual samples from qc output'.format(
            len(multi_individual_samples)))
        warnings.append(
            'The following {} samples were added to multiple individuals: {}'.
            format(
                len(multi_individual_samples), ', '.join(
                    sorted([
                        '{} ({})'.format(sample_id, count) for sample_id, count
                        in multi_individual_samples.items()
                    ]))))

    if missing_sample_ids:
        logger.info('Missing {} samples from qc output'.format(
            len(missing_sample_ids)))
        warnings.append('The following {} samples were skipped: {}'.format(
            len(missing_sample_ids),
            ', '.join(sorted(list(missing_sample_ids)))))

    unknown_filter_flags = set()
    unknown_pop_filter_flags = set()

    individuals_by_population = defaultdict(list)
    for record in json_records:
        filter_flags = {}
        for flag in json.loads(record['filter_flags']):
            flag = '{}_{}'.format(flag,
                                  dataset_type) if flag == 'coverage' else flag
            flag_col = FILTER_FLAG_COL_MAP.get(flag, flag)
            if flag_col in record:
                filter_flags[flag] = record[flag_col]
            else:
                unknown_filter_flags.add(flag)

        pop_platform_filters = {}
        for flag in json.loads(record['qc_metrics_filters']):
            flag_col = 'sample_qc.{}'.format(flag)
            if flag_col in record:
                pop_platform_filters[flag] = record[flag_col]
            else:
                unknown_pop_filter_flags.add(flag)

        if filter_flags or pop_platform_filters:
            Individual.objects.filter(id__in=record['individual_ids']).update(
                filter_flags=filter_flags or None,
                pop_platform_filters=pop_platform_filters or None)

        individuals_by_population[
            record['qc_pop'].upper()] += record['individual_ids']

    for population, indiv_ids in individuals_by_population.items():
        Individual.objects.filter(id__in=indiv_ids).update(
            population=population)

    if unknown_filter_flags:
        message = 'The following filter flags have no known corresponding value and were not saved: {}'.format(
            ', '.join(unknown_filter_flags))
        logger.info(message)
        warnings.append(message)

    if unknown_pop_filter_flags:
        message = 'The following population platform filters have no known corresponding value and were not saved: {}'.format(
            ', '.join(unknown_pop_filter_flags))
        logger.info(message)
        warnings.append(message)

    message = 'Found and updated matching seqr individuals for {} samples'.format(
        len(json_records) - len(missing_sample_ids))
    info.append(message)
    logger.info(message)

    return create_json_response({
        'errors': [],
        'warnings': warnings,
        'info': info,
    })
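
The `sample_individual_max_loaded_date` / `individual_latest_sample_id` pair above attributes a QC record to an individual only when it comes from that individual's most recently loaded sample. A minimal pure-Python sketch of the same selection, with made-up data:

from collections import namedtuple
from datetime import date

SampleRow = namedtuple('SampleRow', ['individual_id', 'sample_id', 'loaded_date'])
samples = [
    SampleRow(1, 'S1_old', date(2019, 1, 1)),
    SampleRow(1, 'S1_new', date(2020, 1, 1)),
    SampleRow(2, 'S2', date(2019, 6, 1)),
]

# max loaded_date per individual (what the Max('loaded_date') annotation computes)
max_loaded_date = {}
for s in samples:
    if s.loaded_date > max_loaded_date.get(s.individual_id, date.min):
        max_loaded_date[s.individual_id] = s.loaded_date

# keep only the sample whose loaded_date equals that maximum
latest_sample = {s.individual_id: s.sample_id for s in samples
                 if s.loaded_date == max_loaded_date[s.individual_id]}
assert latest_sample == {1: 'S1_new', 2: 'S2'}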
Example #9
File: add_vcf.py, Project: tianyunwang/seqr
    def handle(self, *args, **options):

        analysis_type = Dataset.ANALYSIS_TYPE_VARIANT_CALLS

        # parse and validate args
        sample_type = options["sample_type"]
        genome_version = options["genome_version"]
        validate_only = options["validate_only"]
        remap_sample_ids = options["remap_sample_ids"]
        max_edit_distance = options["max_edit_distance_for_id_match"]
        pedigree_file_path = options["pedigree_file"]
        export_pedigree_file_template = options["export_pedigree_file_template"]
        project_guid = options["project_id"]
        vcf_path = options["vcf_path"]
        elasticsearch_index = options["elasticsearch_index"]
        is_loaded = options["is_loaded"]

        # look up project id and validate other args
        try:
            project = Project.objects.get(guid=project_guid)
        except ObjectDoesNotExist:
            raise CommandError("Invalid project id: %(project_guid)s" % locals())

        #if project.genome_version != genome_version:
        #    raise CommandError("Genome version %s doesn't match the project's genome version which is %s" % (genome_version, project.genome_version))

        if pedigree_file_path and not os.path.isfile(pedigree_file_path):
            raise CommandError("Can't open pedigree file: %(pedigree_file_path)s" % locals())

        # parse the pedigree file if specified
        if pedigree_file_path:

            input_stream = file_iter(pedigree_file_path)
            json_records, errors, warnings = parse_pedigree_table(pedigree_file_path, input_stream)

            if errors:
                for message in errors:
                    logger.error(message)
                raise CommandError("Unable to parse %(pedigree_file_path)s" % locals())

            if warnings:
                for message in warnings:
                    logger.warning(message)

            if not validate_only:
                add_or_update_individuals_and_families(project, json_records)

        # validate VCF and get sample ids
        vcf_sample_ids = _validate_vcf(vcf_path, sample_type=sample_type, genome_version=genome_version)

        if remap_sample_ids:
            if not does_file_exist(remap_sample_ids):
                raise ValueError("File not found: " + remap_sample_ids)

            id_mapping = {}
            for line in file_iter(remap_sample_ids):
                fields = line.strip().split("\t")
                if len(fields) != 2:
                    raise ValueError("Must contain 2 columns: " + str(fields))
                id_mapping[fields[0]] = fields[1]

            remapped_vcf_sample_ids = []
            for sample_id in vcf_sample_ids:
                if sample_id in id_mapping:
                    remapped_vcf_sample_ids.append(id_mapping[sample_id])
                    print("Remapped %s to %s" % (sample_id, id_mapping[sample_id]))
                else:
                    remapped_vcf_sample_ids.append(sample_id)
                    print("No sample id mapping for %s" % sample_id)
                    
            vcf_sample_ids = remapped_vcf_sample_ids

        vcf_sample_ids_to_sample_records = match_sample_ids_to_sample_records(
            project,
            sample_ids=vcf_sample_ids,
            sample_type=sample_type,
            max_edit_distance=max_edit_distance,
            create_sample_records=not validate_only,
        )

        if export_pedigree_file_template:
            with open(export_pedigree_file_template, "w") as out_f:
                out_f.write("#%s\n" % ("\t".join(['family_id', 'individual_id', 'paternal_id', 'maternal_id', 'sex', 'affected_status'],)))
                for vcf_sample_id in vcf_sample_ids:
                    if vcf_sample_id in vcf_sample_ids_to_sample_records:
                        continue

                    family_id = individual_id = vcf_sample_id
                    out_f.write("%s\n" % ("\t".join([family_id, individual_id, '', '', '', ''],)))
            logger.info("Wrote out %(export_pedigree_file_template)s. Exiting..." % locals())
            return

        if len(vcf_sample_ids_to_sample_records) == 0:
            all_vcf_sample_id_count = len(vcf_sample_ids)
            all_project_sample_id_count = Sample.objects.filter(individual__family__project=project, sample_type=sample_type).count()
            logger.info(("None of the %(all_project_sample_id_count)s %(sample_type)s sample id(s) in the project "
                         "matched the %(all_vcf_sample_id_count)s sample id(s) in the VCF") % locals())
            return

        # retrieve or create Dataset record and link it to sample(s)
        dataset = get_or_create_elasticsearch_dataset(
            project=project,
            analysis_type=analysis_type,
            genome_version=genome_version,
            source_file_path=vcf_path,
            elasticsearch_index=elasticsearch_index,
            is_loaded=is_loaded,
        )

        if is_loaded and not dataset.loaded_date:
            dataset.loaded_date = timezone.now()
            dataset.save()

        link_dataset_to_sample_records(dataset, vcf_sample_ids_to_sample_records.values())

        # check if all VCF samples loaded already
        vcf_sample_ids = set(vcf_sample_ids_to_sample_records.keys())
        existing_sample_ids = set([s.sample_id for s in dataset.samples.all()])
        if dataset.is_loaded and len(vcf_sample_ids - existing_sample_ids) == 0:
            logger.info("All %s samples in this VCF have already been loaded" % len(vcf_sample_ids))
            return
        elif not dataset.is_loaded:
            logger.info("Dataset not loaded (is_loaded=%s). Loading..." % (is_loaded,))
        elif len(vcf_sample_ids - existing_sample_ids) != 0:
            logger.info("Dataset is loaded but these samples aren't included in the dataset: %s" % (vcf_sample_ids - existing_sample_ids, ))

        logger.info("done")
Example #10
File: add_vcf.py, Project: obonyojimmy/seqr
    def handle(self, *args, **options):

        analysis_type = Dataset.ANALYSIS_TYPE_VARIANT_CALLS

        # parse and validate args
        sample_type = options["sample_type"]
        genome_version = options["genome_version"]
        validate_only = options["validate_only"]
        max_edit_distance = options["max_edit_distance_for_id_match"]
        pedigree_file_path = options["pedigree_file"]
        export_pedigree_file_template = options["export_pedigree_file_template"]
        project_guid = options["project_id"]
        vcf_path = options["vcf_path"]
        dataset_id = options["dataset_id"]

        # look up project id and validate other args
        try:
            project = Project.objects.get(guid=project_guid)
        except ObjectDoesNotExist:
            raise CommandError("Invalid project id: %(project_guid)s" % locals())

        if project.genome_version != genome_version:
            raise CommandError("Genome version %s doesn't match the project's genome version which is %s" % (genome_version, project.genome_version))

        if pedigree_file_path and not os.path.isfile(pedigree_file_path):
            raise CommandError("Can't open pedigree file: %(pedigree_file_path)s" % locals())

        # parse the pedigree file if specified
        if pedigree_file_path:

            input_stream = file_iter(pedigree_file_path)
            json_records, errors, warnings = parse_pedigree_table(pedigree_file_path, input_stream)

            if errors:
                for message in errors:
                    logger.error(message)
                raise CommandError("Unable to parse %(pedigree_file_path)s" % locals())

            if warnings:
                for message in warnings:
                    logger.warning(message)

            if not validate_only:
                add_or_update_individuals_and_families(project, json_records)

        # validate VCF and get sample ids
        vcf_sample_ids = _validate_vcf(vcf_path, sample_type=sample_type, genome_version=genome_version)

        vcf_sample_ids_to_sample_records = match_sample_ids_to_sample_records(
            project,
            sample_ids=vcf_sample_ids,
            sample_type=sample_type,
            max_edit_distance=max_edit_distance,
            create_records_for_new_sample_ids=not validate_only,
        )

        if export_pedigree_file_template:
            with open(export_pedigree_file_template, "w") as out_f:
                out_f.write("#%s\n" % ("\t".join(['family_id', 'individual_id', 'paternal_id', 'maternal_id', 'sex', 'affected_status'],)))
                for vcf_sample_id in vcf_sample_ids:
                    if vcf_sample_id in vcf_sample_ids_to_sample_records:
                        continue

                    family_id = individual_id = vcf_sample_id
                    out_f.write("%s\n" % ("\t".join([family_id, individual_id, '', '', '', ''],)))
            logger.info("Wrote out %(export_pedigree_file_template)s. Exiting..." % locals())
            return

        if len(vcf_sample_ids_to_sample_records) == 0:
            all_vcf_sample_id_count = len(vcf_sample_ids)
            all_project_sample_id_count = len(Sample.objects.filter(individual__family__project=project, sample_type=sample_type))
            logger.info(("No matches found between the %(all_vcf_sample_id_count)s sample id(s) in the VCF and "
                "the %(all_project_sample_id_count)s %(sample_type)s sample id(s) in %(project_guid)s") % locals())
            return

        if validate_only:
            return

        # retrieve or create Dataset record and link it to sample(s)
        dataset = get_or_create_dataset(
            analysis_type=analysis_type,
            source_file_path=vcf_path,
            project=project,
            dataset_id=dataset_id,
        )

        link_dataset_to_sample_records(dataset, vcf_sample_ids_to_sample_records.values())

        # check if all VCF samples loaded already
        vcf_sample_ids = set(vcf_sample_ids_to_sample_records.keys())
        existing_sample_ids = set([s.sample_id for s in dataset.samples.all()])
        if dataset.is_loaded and len(vcf_sample_ids - existing_sample_ids) == 0:
            logger.info("All %s samples in this VCF have already been loaded" % len(vcf_sample_ids))
            return

        # load the VCF
        _load_variants(dataset)

        logger.info("done")
Example #11
def load_mapping_file(mapping_file_path, user):
    file_content = parse_file(mapping_file_path,
                              file_iter(mapping_file_path, user=user))
    return _load_mapping_file(file_content)
Example #12
def upload_qc_pipeline_output(request):
    file_path = json.loads(request.body)['file']
    raw_records = parse_file(file_path, file_iter(file_path))

    json_records = [dict(zip(raw_records[0], row)) for row in raw_records[1:]]

    try:
        dataset_type, data_type, records_by_sample_id = _parse_raw_qc_records(json_records)
    except ValueError as e:
        return create_json_response({'errors': [str(e)]}, status=400, reason=str(e))

    info_message = 'Parsed {} {} samples'.format(
        len(json_records), 'SV' if dataset_type == Sample.DATASET_TYPE_SV_CALLS else data_type)
    logger.info(info_message)
    info = [info_message]
    warnings = []

    samples = Sample.objects.filter(
        sample_id__in=records_by_sample_id.keys(),
        sample_type=Sample.SAMPLE_TYPE_WES if data_type == 'exome' else Sample.SAMPLE_TYPE_WGS,
        dataset_type=dataset_type,
    ).exclude(
        individual__family__project__name__in=EXCLUDE_PROJECTS
    ).exclude(individual__family__project__projectcategory__name=EXCLUDE_PROJECT_CATEGORY)

    sample_individuals = {
        agg['sample_id']: agg['individuals'] for agg in
        samples.values('sample_id').annotate(individuals=ArrayAgg('individual_id', distinct=True))
    }

    sample_individual_max_loaded_date = {
        agg['individual_id']: agg['max_loaded_date'] for agg in
        samples.values('individual_id').annotate(max_loaded_date=Max('loaded_date'))
    }
    individual_latest_sample_id = {
        s.individual_id: s.sample_id for s in samples
        if s.loaded_date == sample_individual_max_loaded_date.get(s.individual_id)
    }

    for sample_id, record in records_by_sample_id.items():
        record['individual_ids'] = list({
            individual_id for individual_id in sample_individuals.get(sample_id, [])
            if individual_latest_sample_id[individual_id] == sample_id
        })

    missing_sample_ids = {sample_id for sample_id, record in records_by_sample_id.items() if not record['individual_ids']}
    if missing_sample_ids:
        individuals = Individual.objects.filter(individual_id__in=missing_sample_ids).exclude(
            family__project__name__in=EXCLUDE_PROJECTS).exclude(
            family__project__projectcategory__name=EXCLUDE_PROJECT_CATEGORY).filter(
            sample__sample_type=Sample.SAMPLE_TYPE_WES if data_type == 'exome' else Sample.SAMPLE_TYPE_WGS).distinct()
        individual_db_ids_by_id = defaultdict(list)
        for individual in individuals:
            individual_db_ids_by_id[individual.individual_id].append(individual.id)
        for sample_id, record in records_by_sample_id.items():
            if not record['individual_ids'] and len(individual_db_ids_by_id[sample_id]) >= 1:
                record['individual_ids'] = individual_db_ids_by_id[sample_id]
                missing_sample_ids.remove(sample_id)

    multi_individual_samples = {
        sample_id: len(record['individual_ids']) for sample_id, record in records_by_sample_id.items()
        if len(record['individual_ids']) > 1}
    if multi_individual_samples:
        logger.info('Found {} multi-individual samples from qc output'.format(len(multi_individual_samples)))
        warnings.append('The following {} samples were added to multiple individuals: {}'.format(
            len(multi_individual_samples), ', '.join(
                sorted(['{} ({})'.format(sample_id, count) for sample_id, count in multi_individual_samples.items()]))))

    if missing_sample_ids:
        logger.info('Missing {} samples from qc output'.format(len(missing_sample_ids)))
        warnings.append('The following {} samples were skipped: {}'.format(
            len(missing_sample_ids), ', '.join(sorted(list(missing_sample_ids)))))

    records_with_individuals = [
        record for sample_id, record in records_by_sample_id.items() if sample_id not in missing_sample_ids
    ]

    if dataset_type == Sample.DATASET_TYPE_SV_CALLS:
        _update_individuals_sv_qc(records_with_individuals, request.user)
    else:
        _update_individuals_variant_qc(records_with_individuals, data_type, warnings, request.user)

    message = 'Found and updated matching seqr individuals for {} samples'.format(len(json_records) - len(missing_sample_ids))
    info.append(message)
    logger.info(message)

    return create_json_response({
        'errors': [],
        'warnings': warnings,
        'info': info,
    })
Example #13
def load_mapping_file(mapping_file_path):
    file_content = parse_file(mapping_file_path,
                              file_utils.file_iter(mapping_file_path))
    return _load_mapping_file(file_content)
Example #14
def _validate_vcf_metadata(vcf_path):
    metadata = '\n'.join(file_utils.file_iter(vcf_path))
    if 'sample_annotations' not in json.loads(metadata):
        raise Exception('No samples found in "{}"'.format(vcf_path))
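
Despite the `vcf_path` argument name, this validator treats the whole file as one JSON document, so the expected input is presumably shaped like this made-up minimal example:

import json

metadata = '{"sample_annotations": {"NA12878": {"qc_pop": "nfe"}}}'  # made-up content
assert 'sample_annotations' in json.loads(metadata)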
Example #15
def load_mapping_file(mapping_file_path):
    file_content = parse_file(mapping_file_path, file_utils.file_iter(mapping_file_path))
    return _load_mapping_file(file_content)
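
Note the signature difference from Example #4: in Examples #11, #13, and #15, `_load_mapping_file` receives already-parsed rows rather than a path. Its body is not shown in these excerpts; inferred from Example #4 and the call sites, a sketch might look like:

def _load_mapping_file(file_content):
    # sketch inferred from Example #4: build an old-id -> new-id dict
    id_mapping = {}
    for line in file_content:
        if len(line) != 2:
            raise ValueError("Must contain 2 columns: " + ', '.join(line))
        id_mapping[line[0]] = line[1]
    return id_mapping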