Пример #1
0
def load_case(adapter, case_obj, update=False):
    """Load a case to the database

    If the case is already stored and ``update`` is true, the incoming case
    is first merged with the stored one by ``update_case`` and the merged
    case is what gets written.

    Args:
        adapter: Connection to database
        case_obj: dict
        update(bool): If existing case should be updated

    Returns:
        case_obj(models.Case): The case that was passed to
            ``adapter.add_case``

    Raises:
        CaseError: If the case already exists and ``update`` is false.
            Any error raised by ``adapter.add_case`` propagates unchanged.
    """
    # Check if the case already exists in database.
    existing_case = adapter.case(case_obj)
    if existing_case:
        if not update:
            raise CaseError("Case {0} already exists in database".format(
                case_obj['case_id']))
        case_obj = update_case(case_obj, existing_case)

    # Add the case to database. Catching an error only to re-raise it adds
    # nothing, so failures from the adapter are simply left to propagate.
    adapter.add_case(case_obj, update=update)

    return case_obj
Пример #2
0
def get_case(family_lines, family_type='ped', vcf_path=None):
    """Return ped_parser case from a family file

    Parse the family lines and return the single family they describe.

    Args:
        family_lines (iterator): The family lines
        family_type (str): The format of the family lines
        vcf_path(str): Path to VCF. Not used by this function; accepted so
            that existing callers keep working.

    Returns:
        family (Family): A ped_parser family object

    Raises:
        CaseError: If the family lines describe no family at all, or more
            than one family
    """
    LOG.info("Parsing family information")

    family_parser = FamilyParser(family_lines, family_type)

    families = list(family_parser.families.keys())

    LOG.info("Found families {0}".format(', '.join(families)))

    # Without this guard an empty family file would surface as a bare
    # IndexError from the lookup below.
    if not families:
        raise CaseError("No family could be found in family file")

    if len(families) > 1:
        raise CaseError("Only one family per load can be used")

    return family_parser.families[families[0]]
Пример #3
0
def update_case(case_obj, existing_case):
    """Merge new VCF information into a copy of an existing case

    For each variant type (snv, sv) that ``case_obj`` carries a VCF path
    for, the path, the variant count and the two individual entries are
    copied from ``case_obj`` into a deep copy of ``existing_case``.
    ``existing_case`` itself is left untouched.

    Args:
        case_obj(models.Case)
        existing_case(models.Case)

    Returns:
        updated_case(models.Case): Updated existing case

    Raises:
        CaseError: If the existing case already has a VCF of a type that
            ``case_obj`` tries to add
    """
    # One row per variant type:
    # (vcf path key, label used in the log, variant count key, individual keys)
    vcf_fields = (
        ('vcf_path', 'snv', 'nr_variants', ('individuals', '_inds')),
        ('vcf_sv_path', 'sv', 'nr_sv_variants',
         ('sv_individuals', '_sv_inds')),
    )

    merged = deepcopy(existing_case)

    for path_key, variant_type, count_key, ind_keys in vcf_fields:
        if not case_obj.get(path_key):
            # Nothing new of this variant type
            continue

        if merged.get(path_key):
            LOG.warning("VCF of type %s already exists in case",
                        variant_type)
            raise CaseError("Can not replace VCF in existing case")

        merged[path_key] = case_obj[path_key]
        merged[count_key] = case_obj[count_key]
        for ind_key in ind_keys:
            merged[ind_key] = case_obj[ind_key]

    return merged
Пример #4
0
def update_case(case_obj, existing_case):
    """Return a copy of an existing case extended with new VCF information

    The existing case is deep copied. For every variant type that
    ``case_obj`` has a VCF path for, that path together with the variant
    count and the individual entries is copied over from ``case_obj``.

    Args:
        case_obj(models.Case)
        existing_case(models.Case)

    Returns:
        updated_case(models.Case): Updated existing case

    Raises:
        CaseError: If a VCF of the same type is already present in the
            existing case
    """
    file_keys = ("vcf_path", "vcf_sv_path")
    type_labels = ("snv", "sv")
    count_keys = ("nr_variants", "nr_sv_variants")
    individual_keys = (("individuals", "_inds"),
                       ("sv_individuals", "_sv_inds"))

    result = deepcopy(existing_case)

    rows = zip(file_keys, type_labels, count_keys, individual_keys)
    for file_key, label, count_key, (inds_key, private_inds_key) in rows:
        if case_obj.get(file_key):
            # A VCF that is already registered is never overwritten
            if result.get(file_key):
                LOG.warning("VCF of type %s already exists in case", label)
                raise CaseError("Can not replace VCF in existing case")

            result[file_key] = case_obj[file_key]
            result[count_key] = case_obj[count_key]
            result[inds_key] = case_obj[inds_key]
            result[private_inds_key] = case_obj[private_inds_key]

    return result
Пример #5
0
def get_family(family_lines, family_type='ped'):
    """Return the single family found in a family file

    Args:
        family_lines (iterator): The family lines
        family_type (str): The format of the family lines

    Returns:
        family (Family): A ped_parser family object

    Raises:
        CaseError: If the family lines describe no family at all, or more
            than one family
    """
    logger.info("Parsing family information")
    family_parser = FamilyParser(family_lines, family_type)

    families = list(family_parser.families.keys())

    logger.info("Found families {0}".format(', '.join(families)))

    # Without this guard an empty family file would surface as a bare
    # IndexError from the lookup below.
    if not families:
        raise CaseError("No family could be found in family file")

    if len(families) > 1:
        raise CaseError("Only one family per load can be used")

    return family_parser.families[families[0]]
Пример #6
0
def get_formated_variant(variant, individuals, family_id, gq_treshold=None):
    """Return a formated variant line

    Take a vcf formated variant line and return a dictionary with the
    relevant information.

    If no individual has a variant call with genotype quality at or above
    the gq treshold, an empty dictionary is returned.

    Args:
        variant (dict): A variant dictionary. The keys 'CHROM', 'POS',
            'REF', 'ALT' and 'FORMAT' are read, plus one key per
            individual id holding the raw genotype call
        individuals (dict): Maps individual id to an individual object;
            the ``sex`` attribute of each object is read
        family_id (str): The family id, added to the result when truthy
        gq_treshold (int): Lowest genotype quality to consider. Defaults
            to 20 when None; an explicit 0 is respected

    Return:
        formated_variant (dict): A variant dictionary

    Raises:
        Exception: If the variant has more than one alternative allele
        CaseError: If an individual has no genotype call in the variant
    """
    # Only fall back to the default when no treshold was given; `or 20`
    # would silently turn an explicit 0 into 20.
    if gq_treshold is None:
        gq_treshold = 20

    # Remove a leading 'chr' prefix only. str.lstrip('chr') strips any
    # leading 'c', 'h' and 'r' characters and would mangle contig names
    # such as 'hs37d5'.
    chrom = variant['CHROM']
    if chrom.startswith('chr'):
        chrom = chrom[len('chr'):]
    pos = int(variant['POS'])
    ref = variant['REF']
    alt = variant['ALT']

    formated_variant = {}

    if ',' in alt:
        raise Exception("Multi allele calls are not allowed.")

    format_field = variant['FORMAT'].split(':')

    found_variant = False
    found_homozygote = False
    found_hemizygote = False

    for ind_id in individuals:
        ind_obj = individuals[ind_id]

        if ind_id not in variant:
            raise CaseError("Individual {0} from ped does not exist in"
                            " vcf".format(ind_id))
        raw_gt_call = variant[ind_id]

        # Pair every FORMAT key with the matching value of the call
        gt_call = dict(zip(format_field, raw_gt_call.split(':')))

        genotype = Genotype(**gt_call)
        if genotype.genotype_quality < gq_treshold:
            continue
        if not genotype.has_variant:
            continue

        logger.debug("Found variant in affected")
        found_variant = True

        # If variant in X or Y and individual is male,
        # we need to check hemizygosity
        if chrom in ['X', 'Y'] and ind_obj.sex == 1:
            if not check_par(chrom, pos):
                logger.debug("Found hemizygous variant")
                found_hemizygote = True

        if genotype.homo_alt:
            logger.debug("Found homozygote alternative variant")
            found_homozygote = True

    if found_variant:
        formated_variant['_id'] = '_'.join([chrom, str(pos), ref, alt])
        formated_variant['chrom'] = chrom
        formated_variant['pos'] = pos
        formated_variant['ref'] = ref
        formated_variant['alt'] = alt
        formated_variant['homozygote'] = 0
        formated_variant['hemizygote'] = 0

        # A hemizygous call takes precedence over a homozygous one
        if found_hemizygote:
            formated_variant['hemizygote'] = 1
        elif found_homozygote:
            formated_variant['homozygote'] = 1

        if family_id:
            formated_variant['family_id'] = family_id

    return formated_variant
Пример #7
0
            try:
                # If a profile dict exists, get the profile for ind_id
                profile = profiles[ind_id] if profiles else None
                # If matching samples are found, get these samples for ind_id
                similar_samples = matches[ind_id] if matches else None
                ind_obj = Individual(
                    ind_id=ind_id,
                    case_id=case_id,
                    ind_index=_ind_pos[ind_id],
                    sex=individual.sex,
                    profile=profile,
                    similar_samples=similar_samples,
                )
                ind_objs.append(dict(ind_obj))
            except KeyError:
                raise CaseError("Ind %s in ped file does not exist in VCF",
                                ind_id)
    else:
        # If there where no family file we can create individuals from what we know
        for ind_id in individual_positions:
            profile = profiles[ind_id] if profiles else None
            similar_samples = matches[ind_id] if matches else None
            ind_obj = Individual(
                ind_id=ind_id,
                case_id=case_id,
                ind_index=individual_positions[ind_id],
                profile=profile,
                similar_samples=similar_samples,
            )
            ind_objs.append(dict(ind_obj))

    # Add individuals to the correct variant type
Пример #8
0
def update_database(adapter,
                    variant_file=None,
                    sv_file=None,
                    family_file=None,
                    family_type='ped',
                    skip_case_id=False,
                    gq_treshold=None,
                    case_id=None,
                    max_window=3000):
    """Update a case in the database

    The case must already exist. The given VCF files are checked, merged
    into the stored case through ``load_case(update=True)`` and their
    variants are then loaded. If loading the variants fails, ``delete`` is
    called with the previously stored case before the error is re-raised.

    Args:
          adapter: Connection to database
          variant_file(str): Path to variant file
          sv_file(str): Path to sv variant file
          family_file(str): Path to family file
          family_type(str): Format of family file
          skip_case_id(bool): If no case information should be added to variants
          gq_treshold(int): If only quality variants should be considered
          case_id(str): If different case id than the one in family file should be used
          max_window(int): Specify the max size for sv windows

    Returns:
          nr_inserted(int): Sum of the values returned by ``load_variants``
              for the files that were loaded

    Raises:
          SyntaxError: If a gq treshold is given and a VCF does not
              contain 'GQ'
          CaseError: If the case does not exist in the database
          Exception: Whatever ``load_variants`` raised, after the rollback
    """
    vcf_files = []
    nr_variants = None
    vcf_individuals = None
    if variant_file:
        vcf_info = check_vcf(variant_file)
        nr_variants = vcf_info['nr_variants']
        vcf_files.append(variant_file)
        # Get the individuals that are present in vcf file
        vcf_individuals = vcf_info['individuals']

    nr_sv_variants = None
    sv_individuals = None
    if sv_file:
        vcf_info = check_vcf(sv_file, 'sv')
        nr_sv_variants = vcf_info['nr_variants']
        vcf_files.append(sv_file)
        sv_individuals = vcf_info['individuals']

    # If a gq treshold is used the variants needs to have GQ
    for _vcf_file in vcf_files:
        # Get a cyvcf2.VCF object
        vcf = get_vcf(_vcf_file)

        if gq_treshold:
            if not vcf.contains('GQ'):
                LOG.warning(
                    'Set gq-treshold to 0 or add info to vcf {0}'.format(
                        _vcf_file))
                raise SyntaxError('GQ is not defined in vcf header')

    # Get a ped_parser.Family object from family file
    family = None
    family_id = None
    if family_file:
        with open(family_file, 'r') as family_lines:
            family = get_case(family_lines=family_lines,
                              family_type=family_type)
            family_id = family.family_id

    # There has to be a case_id or a family at this stage.
    case_id = case_id or family_id

    # Convert information to a loqusdb Case object
    case_obj = build_case(
        case=family,
        case_id=case_id,
        vcf_path=variant_file,
        vcf_individuals=vcf_individuals,
        nr_variants=nr_variants,
        vcf_sv_path=sv_file,
        sv_individuals=sv_individuals,
        nr_sv_variants=nr_sv_variants,
    )

    # Keep the stored case around; it is handed to delete() on rollback
    existing_case = adapter.case(case_obj)
    if not existing_case:
        raise CaseError("Case {} does not exist in database".format(
            case_obj['case_id']))

    # Update the existing case in database
    case_obj = load_case(
        adapter=adapter,
        case_obj=case_obj,
        update=True,
    )

    nr_inserted = 0
    # If case was succesfully added we can store the variants
    for file_type in ['vcf_path', 'vcf_sv_path']:
        variant_type = 'snv'
        if file_type == 'vcf_sv_path':
            variant_type = 'sv'
        if case_obj.get(file_type) is None:
            continue

        vcf_obj = get_vcf(case_obj[file_type])
        try:
            nr_inserted += load_variants(
                adapter=adapter,
                vcf_obj=vcf_obj,
                case_obj=case_obj,
                skip_case_id=skip_case_id,
                gq_treshold=gq_treshold,
                max_window=max_window,
                variant_type=variant_type,
            )
        except Exception as err:
            # If something went wrong do a rollback
            LOG.warning(err)
            delete(
                adapter=adapter,
                case_obj=case_obj,
                update=True,
                existing_case=existing_case,
            )
            # Bare raise keeps the original traceback intact
            raise
    return nr_inserted
Пример #9
0
def load_database(adapter,
                  variant_file,
                  family_file,
                  nr_variants=None,
                  family_type='ped',
                  skip_case_id=False,
                  gq_treshold=None,
                  case_id=None):
    """Load the database with a case and its variants

    Opens the VCF, reads the family from the family file, verifies that
    every individual of the family is a sample in the VCF, and then loads
    first the family and after that the variants.

    Args:
          adapter: Handed on to ``load_family`` and ``load_variants``
          variant_file(str): Path to the VCF
          family_file(str): Path to the family file
          nr_variants: Handed on to ``load_variants``
          family_type(str): Format of the family file
          skip_case_id(bool): Handed on to ``load_variants``
          gq_treshold: When truthy the VCF has to contain 'GQ'; also
              handed on to ``load_variants``
          case_id(str): Used instead of the family id from the family
              file when given

    Raises:
          SyntaxError: If a gq treshold is given and the VCF does not
              contain 'GQ'
          CaseError: If an individual of the family is not a sample in
              the VCF
    """
    vcf = get_vcf(variant_file)

    if gq_treshold and not vcf.contains('GQ'):
        logger.warning('Set gq-treshold to 0 or add info to vcf')
        raise SyntaxError('GQ is not defined in vcf header')

    with open(family_file, 'r') as ped_handle:
        family = get_family(family_lines=ped_handle, family_type=family_type)

    # An explicit case id overrides the id found in the family file
    family_id = family.family_id
    if case_id:
        family_id = case_id

    if not family.affected_individuals:
        logger.warning("No affected individuals could be found in ped file")

    affected_ids = ', '.join(family.affected_individuals)
    logger.debug(
        "Found affected individuals in ped file: {0}".format(affected_ids))

    # Map every sample in the VCF to its column index
    ind_positions = {
        sample_id: index for index, sample_id in enumerate(vcf.samples)
    }

    for ped_id in family.individuals:
        if ped_id in ind_positions:
            continue
        raise CaseError(
            "Ind {0} in ped file does not exist in VCF".format(ped_id))

    load_family(adapter=adapter, case_id=family_id, vcf_path=variant_file)

    variant_kwargs = dict(
        adapter=adapter,
        family_id=family_id,
        individuals=family.individuals,
        vcf=vcf,
        ind_positions=ind_positions,
        nr_variants=nr_variants,
        skip_case_id=skip_case_id,
        gq_treshold=gq_treshold,
    )
    try:
        load_variants(**variant_kwargs)
    except Exception as err:
        logger.warning(err)
        # TODO: delete inserted information here
        raise err