Example #1
def create_dataset(project,
                   analysis_type,
                   source_file_path,
                   is_loaded=False,
                   loaded_date=None,
                   dataset_id=None):

    # compute a dataset_id based on source_file_path
    if dataset_id is None:
        file_stats = get_file_stats(source_file_path)
        dataset_id = "_".join(
            map(str, [
                datetime.datetime.fromtimestamp(float(
                    file_stats.ctime)).strftime('%Y%m%d'),
                os.path.basename(source_file_path).split(".")[0][:20],
                file_stats.size
            ]))

    # create the Dataset
    dataset = Dataset.objects.create(
        project=project,
        analysis_type=analysis_type,
        dataset_id=dataset_id,
        source_file_path=source_file_path,
        is_loaded=is_loaded,
        loaded_date=loaded_date,
    )

    return dataset
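
A minimal usage sketch, assuming a Django setup where the Dataset model and an existing Project instance are available; the path and field values below are illustrative, not taken from the original code.

# Hypothetical usage: register an (unloaded) VCF callset for a project.
dataset = create_dataset(
    project=project,  # an existing Project model instance (assumed)
    analysis_type=Dataset.ANALYSIS_TYPE_VARIANT_CALLS,
    source_file_path="/data/callsets/my_project.vcf.gz",
)
# dataset.dataset_id is derived from the file's ctime, basename and size,
# e.g. something like "20180115_my_project_123456"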
Example #2
def _validate_dataset_path(dataset_path):
    try:
        dataset_file = does_file_exist(dataset_path)
        if dataset_file is None:
            raise Exception('"{}" not found'.format(dataset_path))
        # check that dataset_path is accessible
        dataset_file_stats = get_file_stats(dataset_path)
        if dataset_file_stats is None:
            raise Exception('Unable to access "{}"'.format(dataset_path))
    except Exception as e:
        raise Exception("Dataset path error: " + str(e))
Example #3
def _validate_dataset_path(dataset_path):
    try:
        dataset_file = file_utils.does_file_exist(dataset_path)
        if dataset_file is None:
            raise Exception('"{}" not found'.format(dataset_path))
        # check that dataset_path is accessible
        dataset_file_stats = file_utils.get_file_stats(dataset_path)
        if dataset_file_stats is None:
            raise Exception('Unable to access "{}"'.format(dataset_path))
    except Exception as e:
        raise Exception("Dataset path error: " + str(e))
Example #4
def validate_dataset(project,
                     sample_type,
                     analysis_type,
                     genome_version,
                     dataset_path,
                     max_edit_distance=0,
                     dataset_id=None):
    """Validates the given dataset.
    Args:
        project (object):
        sample_type (string):
        analysis_type (string):
        genome_version (string):
        dataset_path (string):
        max_edit_distance (int):
        dataset_id (string):
    Return:
        (errors, warnings, info) tuple

    Dataset.ANALYSIS_TYPE_VARIANT_CALLS
    """
    #elasticsearch_host = options["elasticsearch_host"]
    #elasticsearch_index = options["elasticsearch_index"]
    #is_loaded = options["is_loaded"]

    # check args
    errors = []
    warnings = []
    info = []

    # basic file path checks
    if analysis_type == Dataset.ANALYSIS_TYPE_VARIANT_CALLS:
        if not dataset_path.endswith(".vcf.gz") and not dataset_path.endswith(
                ".vds"):
            errors.append("Dataset path must end with .vcf.gz or .vds")
    elif analysis_type == Dataset.ANALYSIS_TYPE_ALIGNMENT:
        if not any([
                dataset_path.endswith(suffix)
                for suffix in ('.txt', '.tsv', '.xls', '.xlsx')
        ]):
            errors.append(
                "BAM / CRAM table must have a .txt, .tsv, .xls or .xlsx extension")
    else:
        errors.append("dataset type not supported: %s" % (analysis_type, ))

    if errors:
        return errors, warnings, info

    # check that dataset file exists
    try:
        dataset_file = does_file_exist(dataset_path)
        if dataset_file is None:
            errors.append("Unable to access %s" % (dataset_path, ))
        else:
            # check that dataset_path is accessible
            dataset_file_stats = get_file_stats(dataset_path)
            if dataset_file_stats is None:
                errors.append("Unable to access %s" % (dataset_path, ))
    except Exception as e:
        errors.append("dataset path error: " + str(e))

    if errors:
        return errors, warnings, info

    # validate dataset contents
    if analysis_type == Dataset.ANALYSIS_TYPE_VARIANT_CALLS:
        # validate VCF and get sample ids
        try:
            sample_ids = _validate_vcf(dataset_path,
                                       sample_type=sample_type,
                                       genome_version=genome_version)
        except ValueError as e:
            errors.append(str(e))
            return errors, warnings, info

        matched_sample_id_to_sample_record = match_sample_ids_to_sample_records(
            project,
            sample_ids=sample_ids,
            sample_type=sample_type,
            max_edit_distance=max_edit_distance,
        )

        if len(matched_sample_id_to_sample_record) == 0:
            all_vcf_sample_id_count = len(sample_ids)
            all_project_sample_id_count = len(
                Sample.objects.filter(individual__family__project=project,
                                      sample_type=sample_type))
            errors.append(
                "None of the %(all_project_sample_id_count)s individuals or samples in the project matched the %(all_vcf_sample_id_count)s sample id(s) in the VCF"
                % locals())
            return errors, warnings, info

        # if Dataset record exists, retrieve it and check if it's already been loaded previously
        try:
            dataset = get_dataset(
                project=project,
                analysis_type=analysis_type,
                genome_version=genome_version,
                source_file_path=dataset_path,
                #elasticsearch_host=elasticsearch_host,
                #elasticsearch_index=elasticsearch_index,
                #is_loaded=is_loaded,
            )
        except ObjectDoesNotExist:
            logger.warning("No existing dataset found")
            return errors, warnings, info

        # check if all VCF samples loaded already - TODO update this?
        vcf_sample_ids = set(matched_sample_id_to_sample_record.keys())
        existing_sample_ids = set([s.sample_id for s in dataset.samples.all()])
        if dataset.is_loaded and len(vcf_sample_ids -
                                     existing_sample_ids) == 0:
            info.append("All %s samples in this VCF have already been loaded" %
                        len(vcf_sample_ids))
            return errors, warnings, info
        elif not dataset.is_loaded:
            info.append("Dataset not loaded. Loading...")
        elif len(vcf_sample_ids - existing_sample_ids) != 0:
            info.append("Data will be loaded for these samples: %s" %
                        (vcf_sample_ids - existing_sample_ids, ))

    return errors, warnings, info
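
A hedged usage sketch; the project instance, sample type and genome version below are assumed values for illustration only.

# Hypothetical call: validate a WES VCF callset before attempting to load it.
errors, warnings, info = validate_dataset(
    project=project,  # an existing Project model instance (assumed)
    sample_type="WES",
    analysis_type=Dataset.ANALYSIS_TYPE_VARIANT_CALLS,
    genome_version="37",
    dataset_path="/data/callsets/my_project.vcf.gz",
)
if errors:
    print("\n".join(errors))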
Example #5
def add_dataset(project,
                sample_type,
                analysis_type,
                genome_version,
                dataset_path,
                max_edit_distance=0,
                dataset_id=None,
                name=None,
                description=None,
                ignore_extra_samples_in_callset=False):
    """Validates the given dataset.
    Args:
        project (object):
        sample_type (string):
        analysis_type (string):
        genome_version (string):
        dataset_path (string):
        max_edit_distance (int):
        dataset_id (string):
        name (string):
        description (string):
        ignore_extra_samples_in_callset (bool):
    Return:
        (errors, warnings, info) tuple

    Dataset.ANALYSIS_TYPE_VARIANT_CALLS
    """
    #elasticsearch_host = options["elasticsearch_host"]
    #elasticsearch_index = options["elasticsearch_index"]
    #is_loaded = options["is_loaded"]

    # check args
    errors = []
    warnings = []
    info = []

    # basic file path checks
    if analysis_type == Dataset.ANALYSIS_TYPE_VARIANT_CALLS:
        if not dataset_path.endswith(".vcf.gz") and not dataset_path.endswith(
                ".vds"):
            errors.append("Dataset path must end with .vcf.gz or .vds")
    elif analysis_type == Dataset.ANALYSIS_TYPE_ALIGNMENT:
        if not any([
                dataset_path.endswith(suffix)
                for suffix in ('.txt', '.tsv', '.xls', '.xlsx')
        ]):
            errors.append(
                "BAM / CRAM table must have a .txt, .tsv, .xls or .xlsx extension")
    else:
        errors.append("dataset type not supported: %s" % (analysis_type, ))

    if errors:
        return errors, warnings, info

    # check that dataset file exists
    try:
        dataset_file = does_file_exist(dataset_path)
        if dataset_file is None:
            errors.append("Unable to access %s" % (dataset_path, ))
        else:
            # check that dataset_path is accessible
            dataset_file_stats = get_file_stats(dataset_path)
            if dataset_file_stats is None:
                errors.append("Unable to access %s" % (dataset_path, ))
    except Exception as e:
        errors.append("dataset path error: " + str(e))

    if errors:
        return errors, warnings, info

    # validate dataset contents
    if analysis_type == Dataset.ANALYSIS_TYPE_VARIANT_CALLS:
        # validate VCF and get sample ids
        try:
            all_vcf_sample_ids = _validate_vcf(dataset_path,
                                               sample_type=sample_type,
                                               genome_version=genome_version)
        except ValueError as e:
            errors.append(str(e))
            return errors, warnings, info

        matched_sample_id_to_sample_record = match_sample_ids_to_sample_records(
            project,
            sample_ids=all_vcf_sample_ids,
            sample_type=sample_type,
            max_edit_distance=max_edit_distance,
            create_sample_records=True,
        )

        if not ignore_extra_samples_in_callset and len(
                matched_sample_id_to_sample_record) < len(all_vcf_sample_ids):
            errors.append(
                "Matches not found for VCF sample ids: " + ", ".join(
                    set(all_vcf_sample_ids) -
                    set(matched_sample_id_to_sample_record.keys())) +
                ". Select the 'Ignore extra samples in callset' checkbox to ignore this."
            )
            return errors, warnings, info

        if len(matched_sample_id_to_sample_record) == 0:
            errors.append(
                "None of the individuals or samples in the project matched the %s sample id(s) in the VCF"
                % len(all_vcf_sample_ids))
            return errors, warnings, info

        # retrieve or create Dataset record and link it to sample(s)
        dataset = get_or_create_elasticsearch_dataset(
            project=project,
            analysis_type=analysis_type,
            genome_version=genome_version,
            source_file_path=dataset_path,
            elasticsearch_index=dataset_id,
        )

        if dataset_id is not None:
            dataset.is_loaded = True
            dataset.loaded_date = timezone.now()

        dataset.name = name
        dataset.description = description
        dataset.save()

        link_dataset_to_sample_records(
            dataset, matched_sample_id_to_sample_record.values())

        # check if all VCF samples loaded already - TODO update this?
        vcf_sample_ids = set(matched_sample_id_to_sample_record.keys())
        existing_sample_ids = set([s.sample_id for s in dataset.samples.all()])
        if dataset.is_loaded and len(vcf_sample_ids -
                                     existing_sample_ids) == 0:
            info.append("All %s samples in this VCF have already been loaded" %
                        len(vcf_sample_ids))
            return errors, warnings, info
        elif not dataset.is_loaded:
            info.append("Dataset not loaded. Loading...")
        elif len(vcf_sample_ids - existing_sample_ids) != 0:
            info.append("Data will be loaded for these samples: %s" %
                        (vcf_sample_ids - existing_sample_ids, ))

    return errors, warnings, info
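
A matching sketch for add_dataset, again with assumed values; the dataset name and path are placeholders.

# Hypothetical call: register the callset, link the matched Sample records, and report progress.
errors, warnings, info = add_dataset(
    project=project,  # an existing Project model instance (assumed)
    sample_type="WES",
    analysis_type=Dataset.ANALYSIS_TYPE_VARIANT_CALLS,
    genome_version="37",
    dataset_path="/data/callsets/my_project.vcf.gz",
    name="My project callset",
    ignore_extra_samples_in_callset=True,
)
for message in warnings + info:
    print(message)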