Пример #1
0
def delete(ctx, variant_file, family_file, family_type, family_id):
    """Delete the variants of a case.

    Args:
        ctx: Context object. ``ctx.obj['adapter']`` is used as the adapter
            and ``ctx.abort()`` is called when something goes wrong.
        variant_file (str): Path to a vcf file, '-' means read from stdin
        family_file: Family lines that are handed to ``get_family``
        family_type (str): Format of the family lines
        family_id (str): A case id. It is replaced by the id that is found
            in the family file.
    """
    # NOTE(review): operator precedence makes this
    # ``(not family_file) or family_id``, so passing a case id always aborts.
    # The message suggests ``not (family_file or family_id)`` was intended,
    # but the code below needs the family file unconditionally. The guard is
    # left as it is until the intended behaviour is confirmed.
    if not family_file or family_id:
        logger.error("Please provide a family file or a case id")
        logger.info("Exiting")
        ctx.abort()

    family = get_family(
        family_lines=family_file,
        family_type=family_type
    )

    family_id = family.family_id
    affected_individuals = family.affected_individuals
    adapter = ctx.obj['adapter']

    if variant_file == '-':
        logger.info("Start parsing variants from stdin")
        variant_stream = get_vcf_handle(fsock=sys.stdin)
    else:
        # Name the actual source, this branch does not read from stdin
        logger.info("Start parsing variants from {0}".format(variant_file))
        variant_stream = get_vcf_handle(infile=variant_file)

    start_deleting = datetime.now()
    try:
        count = delete_variants(adapter, variant_stream, family_id,
                                affected_individuals)
    except CaseError as error:
        logger.warning(error.message)
        ctx.abort()

    logger.info("Nr of variants deleted: {0}".format(count))
    logger.info("Time to delete variants: {0}"
                .format(datetime.now() - start_deleting))
Пример #2
0
def get_header(vcf_file_path):
    """Parse the header and return a header object

        Only the leading lines that start with '#' are read.

        Args:
            vcf_file_path(str): Path to vcf

        Returns:
            head: A HeaderParser object
    """
    logger.info("Parsing header of file {0}".format(vcf_file_path))
    head = HeaderParser()
    handle = get_vcf_handle(infile=vcf_file_path)
    try:
        for line in handle:
            line = line.rstrip()
            # The header ends at the first line without a leading '#'
            if not line.startswith('#'):
                break
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
    finally:
        # Release the file even if a header line could not be parsed
        handle.close()

    return head
Пример #3
0
    def variant(self, case_id, variant_id):
        """Return a specific variant.

            The vcf of the case is scanned line by line until a variant
            with a matching id is found.

            Args:
                case_id (str): Id of the case that is looked up with self.case
                variant_id (str): A variant id

            Returns:
                variant (Variant): The variant object for the given id,
                                   None if no variant line matches
        """
        case_obj = self.case(case_id=case_id)
        vcf_file_path = case_obj.variant_source
        head = self._get_header(vcf_file_path)

        handle = get_vcf_handle(infile=vcf_file_path)
        try:
            relevant_lines = (line for line in handle if not line.startswith('#'))
            # The index counts variant lines only and starts at 1
            for index, variant_line in enumerate(relevant_lines, 1):
                # NOTE(review): lstrip removes any leading run of the
                # characters c, h, r, C, H and R, not just a 'chr' prefix
                line_id = get_variant_id(variant_line=variant_line).lstrip('chrCHR')
                if line_id == variant_id:
                    return self._format_variant(
                        variant_line=variant_line,
                        index=index,
                        case_obj=case_obj,
                        head=head
                    )
        finally:
            # Close the file both when a variant is returned and when the
            # whole file was scanned without a match
            handle.close()

        return None
Пример #4
0
def get_header(vcf_file_path):
    """Parse the header and return a header object

        Only the leading lines that start with '#' are read.

        Args:
            vcf_file_path(str): Path to vcf

        Returns:
            head: A HeaderParser object
    """
    logger.info("Parsing header of file {0}".format(vcf_file_path))
    head = HeaderParser()
    handle = get_vcf_handle(infile=vcf_file_path)
    try:
        for line in handle:
            line = line.rstrip()
            # The header ends at the first line without a leading '#'
            if not line.startswith('#'):
                break
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
    finally:
        # Release the file even if a header line could not be parsed
        handle.close()

    return head
 def test_get_vcf_handle_file(self):
     """The lines read through the handle should equal vcf_lines"""
     file_handle = get_vcf_handle(infile=self.temp_file.name)
     # Trailing whitespace is removed before the comparison
     result = [line.rstrip() for line in file_handle]

     assert result == vcf_lines
Пример #6
0
    def test_get_vcf_handle_file(self):
        """The lines read through the handle should equal vcf_lines"""
        file_handle = get_vcf_handle(infile=self.temp_file.name)
        # Trailing whitespace is removed before the comparison
        result = [line.rstrip() for line in file_handle]

        assert result == vcf_lines
Пример #7
0
    def _get_filtered_variants(self, vcf_file_path, filters=None):
        """Check if variants follows the filters

            This function will try to make filters faster for the vcf adapter.
            The filters are plain substring checks on the raw variant line. A
            line is kept if it matches at least one value of every filter
            that is used.

            Args:
                vcf_file_path(str): Path to vcf
                filters (dict): A dictionary with filters. The keys
                                'gene_ids', 'consequence' and 'sv_types'
                                are used, None means no filters.

            Yields:
                variant_line (str): A vcf variant line
        """
        # Avoid a shared mutable default argument
        filters = filters or {}

        genes = set()
        if filters.get('gene_ids'):
            genes = set(gene_id.strip() for gene_id in filters['gene_ids'])

        consequences = set(filters.get('consequence') or ())
        sv_types = set(filters.get('sv_types') or ())

        logger.info("Get variants from {0}".format(vcf_file_path))

        handle = get_vcf_handle(infile=vcf_file_path)
        try:
            for variant_line in handle:
                if variant_line.startswith('#'):
                    continue

                if genes and not any(gene in variant_line for gene in genes):
                    continue

                if consequences and not any(
                        consequence in variant_line
                        for consequence in consequences):
                    continue

                if sv_types and not any(
                        sv_type in variant_line for sv_type in sv_types):
                    continue

                yield variant_line
        finally:
            # Runs when the file is exhausted or the generator is closed
            handle.close()
Пример #8
0
    def _get_filtered_variants(self, case_obj, filters=None):
        """Check if variants follows the filters

            This function will try to make filters faster for the vcf adapter.
            The filters are plain substring checks on the raw variant line. A
            line is kept if it matches at least one value of every filter
            that is used. Gene ids have to be surrounded by '|' in the line.

            Args:
                case_obj (puzzle.models.Case): A case object
                filters (dict): A dictionary with filters. The keys
                                'gene_ids', 'consequence' and 'sv_types'
                                are used, None means no filters.

            Yields:
                variant_line (str): A vcf variant line
        """
        # Avoid a shared mutable default argument
        filters = filters or {}

        vcf_file_path = case_obj.variant_source
        logger.info("Parsing file {0}".format(vcf_file_path))

        gene_patterns = set()
        if filters.get("gene_ids"):
            # Build the search patterns once instead of once per line
            gene_patterns = set(
                "|{0}|".format(gene_id.strip())
                for gene_id in filters["gene_ids"]
            )

        consequences = set(filters.get("consequence") or ())
        sv_types = set(filters.get("sv_types") or ())

        handle = get_vcf_handle(infile=vcf_file_path)
        try:
            for variant_line in handle:
                if variant_line.startswith("#"):
                    continue

                if gene_patterns and not any(
                        pattern in variant_line for pattern in gene_patterns):
                    continue

                if consequences and not any(
                        consequence in variant_line
                        for consequence in consequences):
                    continue

                if sv_types and not any(
                        sv_type in variant_line for sv_type in sv_types):
                    continue

                yield variant_line
        finally:
            # Runs when the file is exhausted or the generator is closed
            handle.close()
Пример #9
0
def load(ctx, variant_file, family_file, family_type, bulk_insert):
    """Load the variants of a case

    The loading is based on if the variant is seen in any affected individual
    in the family.

    Args:
        ctx: Context object. ``ctx.obj['adapter']`` is used as the adapter
            and ``ctx.abort()`` is called when something goes wrong.
        variant_file (str): Path to a vcf file, '-' means read from stdin
        family_file: Family lines that are handed to ``get_family``
        family_type (str): Format of the family lines
        bulk_insert: Passed on to ``load_variants`` as ``bulk_insert``
    """
    if not family_file:
        logger.error("Please provide a family file")
        ctx.abort()

    if variant_file == '-':
        logger.info("Parsing variants from stdin")
        # There is no path on disk when the variants are streamed. Without
        # this assignment the load_variants call raised a NameError.
        variant_path = None
        variant_file = get_vcf_handle(fsock=sys.stdin)
    else:
        variant_path = os.path.abspath(variant_file)
        logger.info("Start parsing variants from {0}".format(variant_path))
        variant_file = get_vcf_handle(infile=variant_file)

    try:
        family = get_family(family_lines=family_file, family_type=family_type)
    except SyntaxError as error:
        logger.warning(error.message)
        ctx.abort()

    if not family.affected_individuals:
        logger.error("No affected individuals could be found in ped file")
        ctx.abort()
    logger.info("Found affected individuals in ped file: {0}"
                .format(', '.join(family.affected_individuals)))

    adapter = ctx.obj['adapter']
    try:
        load_variants(adapter, family.family_id, family.affected_individuals,
                      variant_file, bulk_insert=bulk_insert,
                      vcf_path=variant_path)
    except CaseError as error:
        logger.error(error.message)
        ctx.abort()
Пример #10
0
    def _formated_variants(self, raw_variants, case_obj):
        """Return variant objects

            The header of the vcf file of the case is parsed first. After
            that one Variant is built for every line in raw_variants that
            is not a header line.

            Args:
                raw_variants (Iterable): An iterable with variant lines
                case_obj (puzzle.models.Case): A case object

            Yields:
                variant (Variant): A variant object
        """
        vcf_file_path = case_obj.variant_source

        logger.info("Parsing file {0}".format(vcf_file_path))
        head = HeaderParser()
        handle = get_vcf_handle(infile=vcf_file_path)
        try:
            # Parse the header, it ends at the first line without a '#'
            for line in handle:
                line = line.rstrip()
                if line.startswith("#"):
                    if line.startswith("##"):
                        head.parse_meta_data(line)
                    else:
                        head.parse_header_line(line)
                else:
                    break
        finally:
            # Release the file even if a header line could not be parsed
            handle.close()

        header_line = head.header

        # Get the individual ids for individuals in vcf file
        vcf_individuals = set(head.individuals)

        variant_columns = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER"]

        vep_header = head.vep_columns
        snpeff_header = head.snpeff_columns

        index = 0
        for variant_line in raw_variants:
            if not variant_line.startswith("#"):
                index += 1
                # Create a variant dict:
                variant_dict = get_variant_dict(variant_line=variant_line, header_line=header_line)
                variant_dict["CHROM"] = variant_dict["CHROM"].lstrip("chrCHR")
                # Create a info dict:
                info_dict = get_info_dict(info_line=variant_dict["INFO"])
                # Check if vep annotation:
                vep_string = info_dict.get("CSQ")

                # Check if snpeff annotation:
                snpeff_string = info_dict.get("ANN")

                if vep_string:
                    # Get the vep annotations
                    vep_info = get_vep_info(vep_string=vep_string, vep_header=vep_header)

                elif snpeff_string:
                    # Get the snpeff annotations
                    snpeff_info = get_snpeff_info(snpeff_string=snpeff_string, snpeff_header=snpeff_header)

                variant = Variant(**{column: variant_dict.get(column, ".") for column in variant_columns})

                logger.debug("Creating a variant object of variant {0}".format(variant.get("variant_id")))

                variant["index"] = index
                logger.debug("Updating index to: {0}".format(index))

                variant["start"] = int(variant_dict["POS"])

                if self.variant_type == "sv":
                    other_chrom = variant["CHROM"]
                    # If we have a translocation. Symbolic alleles such as
                    # <DUP:TANDEM> also hold a ':' and are excluded here.
                    if ":" in variant_dict["ALT"] and "<" not in variant_dict["ALT"]:
                        other_coordinates = variant_dict["ALT"].strip("ACGTN[]").split(":")
                        other_chrom = other_coordinates[0].lstrip("chrCHR")
                        other_position = other_coordinates[1]
                        variant["stop"] = other_position

                        # Set 'infinity' to length if translocation
                        variant["sv_len"] = float("inf")
                    else:
                        variant["stop"] = int(info_dict.get("END", variant_dict["POS"]))
                        variant["sv_len"] = variant["stop"] - variant["start"]

                    variant["stop_chrom"] = other_chrom

                else:
                    variant["stop"] = int(variant_dict["POS"]) + (len(variant_dict["REF"]) - len(variant_dict["ALT"]))

                variant["sv_type"] = info_dict.get("SVTYPE")
                variant["cytoband_start"] = get_cytoband_coord(chrom=variant["CHROM"], pos=variant["start"])
                if variant.get("stop_chrom"):
                    variant["cytoband_stop"] = get_cytoband_coord(chrom=variant["stop_chrom"], pos=variant["stop"])

                # It would be easy to update these keys...
                thousand_g = info_dict.get("1000GAF")
                if thousand_g:
                    logger.debug("Updating thousand_g to: {0}".format(thousand_g))
                    variant["thousand_g"] = float(thousand_g)
                    variant.add_frequency("1000GAF", variant.get("thousand_g"))

                # SV specific tag for number of occurances
                occurances = info_dict.get("OCC")
                if occurances:
                    logger.debug("Updating occurances to: {0}".format(occurances))
                    variant["occurances"] = float(occurances)
                    variant.add_frequency("OCC", occurances)

                cadd_score = info_dict.get("CADD")
                if cadd_score:
                    logger.debug("Updating cadd_score to: {0}".format(cadd_score))
                    variant["cadd_score"] = float(cadd_score)

                rank_score_entry = info_dict.get("RankScore")
                if rank_score_entry:
                    # The score of the last family annotation is the one used
                    for family_annotation in rank_score_entry.split(","):
                        rank_score = family_annotation.split(":")[-1]
                    logger.debug("Updating rank_score to: {0}".format(rank_score))
                    variant["rank_score"] = float(rank_score)

                genetic_models_entry = info_dict.get("GeneticModels")
                if genetic_models_entry:
                    genetic_models = []
                    for family_annotation in genetic_models_entry.split(","):
                        for genetic_model in family_annotation.split(":")[-1].split("|"):
                            genetic_models.append(genetic_model)
                    # Log the models. The old message printed rank_score,
                    # which is unbound when the variant has no RankScore.
                    logger.debug("Updating genetic_models to: {0}".format(genetic_models))
                    variant["genetic_models"] = genetic_models

                # Add genotype calls:
                for individual in case_obj.individuals:
                    sample_id = individual.ind_id

                    if sample_id in vcf_individuals:

                        raw_call = dict(zip(variant_dict["FORMAT"].split(":"), variant_dict[sample_id].split(":")))
                        variant.add_individual(
                            Genotype(
                                sample_id=sample_id,
                                genotype=raw_call.get("GT", "./."),
                                case_id=individual.case_name,
                                phenotype=individual.phenotype,
                                ref_depth=raw_call.get("AD", ",").split(",")[0],
                                alt_depth=raw_call.get("AD", ",").split(",")[1],
                                genotype_quality=raw_call.get("GQ", "."),
                                depth=raw_call.get("DP", "."),
                                supporting_evidence=raw_call.get("SU", "0"),
                                pe_support=raw_call.get("PE", "0"),
                                sr_support=raw_call.get("SR", "0"),
                            )
                        )

                # Add transcript information. The snpeff branch belongs to
                # the vep check, the same way the annotations were parsed
                # above, so that snpeff_info is always bound when it is used.
                if vep_string:
                    gmaf = None
                    for transcript_info in vep_info:
                        transcript = self._get_vep_transcripts(transcript_info)
                        gmaf_raw = transcript_info.get("GMAF")
                        if gmaf_raw:
                            gmaf = float(gmaf_raw.split(":")[-1])
                        variant.add_transcript(transcript)

                    if gmaf:
                        variant.add_frequency("GMAF", gmaf)
                        # Only use GMAF when no 1000G frequency was set
                        if not variant.thousand_g:
                            variant.thousand_g = gmaf

                elif snpeff_string:
                    for transcript_info in snpeff_info:
                        transcript = self._get_snpeff_transcripts(transcript_info)
                        variant.add_transcript(transcript)

                variant["most_severe_consequence"] = get_most_severe_consequence(variant["transcripts"])

                for gene in self._get_genes(variant):
                    variant.add_gene(gene)

                self._add_compounds(variant=variant, info_dict=info_dict)

                yield variant
Пример #11
0
def get_individuals(vcf=None, case_lines=None, case_type='ped'):
    """Get the individuals from a vcf file, and/or a ped file.

        If case_lines is given the individuals are read from those lines
        and the vcf is only stored as their variant source. Otherwise the
        individuals are read from the header of the vcf.

        Args:
            vcf (str): Path to a vcf
            case_lines(Iterable): Ped like lines
            case_type(str): Format of ped lines

        Returns:
            individuals (list): List with Individuals, empty if neither
                                vcf nor case_lines is given

        Raises:
            IOError: If case_lines does not describe exactly one family
    """
    individuals = []

    if case_lines:
        # read individuals from ped file
        family_parser = FamilyParser(case_lines, family_type=case_type)
        families = family_parser.families
        logger.info("Found families {0}".format(
            ','.join(list(families.keys()))))
        if len(families) != 1:
            message = "Only one family can be used with vcf adapter"
            logger.error(message)
            raise IOError(message)

        case_id = list(families.keys())[0]
        logger.info("Family used in analysis: {0}".format(case_id))

        for ind_id in family_parser.individuals:
            ind = family_parser.individuals[ind_id]
            logger.info("Found individual {0}".format(ind.individual_id))

            individual = Individual(
                ind_id=ind.individual_id,
                case_id=case_id,
                mother=ind.mother,
                father=ind.father,
                sex=str(ind.sex),
                phenotype=str(ind.phenotype),
                variant_source=vcf,
            )
            individuals.append(individual)

    elif vcf:
        # read individuals from vcf file, the file name is used as case id
        case_id = os.path.basename(vcf)
        head = HeaderParser()
        handle = get_vcf_handle(infile=vcf)
        try:
            for line in handle:
                line = line.rstrip()
                # The header ends at the first line without a leading '#'
                if not line.startswith('#'):
                    break
                if line.startswith('##'):
                    head.parse_meta_data(line)
                else:
                    head.parse_header_line(line)
        finally:
            # The handle was never closed before
            handle.close()

        for ind in head.individuals:
            # If we only have a vcf file we can not get metadata about the
            # individuals
            individual = Individual(
                ind_id=ind,
                case_id=case_id,
                variant_source=vcf,
            )
            individuals.append(individual)

            logger.debug("Found individual {0} in {1}".format(
                ind, vcf))

    return individuals
 def test_get_vcf_handle_no_input(self):
     """get_vcf_handle without any input should raise an IOError"""
     with pytest.raises(IOError):
         get_vcf_handle()
Пример #13
0
    def _formated_variants(self, raw_variants, case_obj):
        """Return variant objects

            The header of the vcf file of the case is parsed first. After
            that one Variant is built for every line in raw_variants that
            is not a header line.

            Args:
                raw_variants (Iterable): An iterable with variant lines
                case_obj (puzzle.models.Case): A case object

            Yields:
                variant (Variant): A variant object
        """
        vcf_file_path = case_obj.variant_source

        logger.info("Parsing file {0}".format(vcf_file_path))
        head = HeaderParser()
        handle = get_vcf_handle(infile=vcf_file_path)
        try:
            # Parse the header, it ends at the first line without a '#'
            for line in handle:
                line = line.rstrip()
                if line.startswith('#'):
                    if line.startswith('##'):
                        head.parse_meta_data(line)
                    else:
                        head.parse_header_line(line)
                else:
                    break
        finally:
            # Release the file even if a header line could not be parsed
            handle.close()

        header_line = head.header

        # Get the individual ids for individuals in vcf file
        vcf_individuals = set(head.individuals)

        variant_columns = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER']

        vep_header = head.vep_columns
        snpeff_header = head.snpeff_columns

        index = 0
        for variant_line in raw_variants:
            if not variant_line.startswith('#'):
                index += 1
                # Create a variant dict:
                variant_dict = get_variant_dict(
                    variant_line=variant_line,
                    header_line=header_line
                )
                variant_dict['CHROM'] = variant_dict['CHROM'].lstrip('chrCHR')
                # Create a info dict:
                info_dict = get_info_dict(
                    info_line=variant_dict['INFO']
                )
                # Check if vep annotation:
                vep_string = info_dict.get('CSQ')

                # Check if snpeff annotation:
                snpeff_string = info_dict.get('ANN')

                if vep_string:
                    # Get the vep annotations
                    vep_info = get_vep_info(
                        vep_string=vep_string,
                        vep_header=vep_header
                    )

                elif snpeff_string:
                    # Get the snpeff annotations
                    snpeff_info = get_snpeff_info(
                        snpeff_string=snpeff_string,
                        snpeff_header=snpeff_header
                    )

                variant = Variant(
                    **{column: variant_dict.get(column, '.')
                        for column in variant_columns}
                    )

                logger.debug("Creating a variant object of variant {0}".format(
                    variant.get('variant_id')))

                variant['index'] = index
                logger.debug("Updating index to: {0}".format(
                    index))

                variant['start'] = int(variant_dict['POS'])

                if self.variant_type == 'sv':
                    other_chrom = variant['CHROM']
                    # If we have a translocation. Symbolic alleles such as
                    # <DUP:TANDEM> also hold a ':' but describe no mate
                    # position, so they are handled by the else branch.
                    if ':' in variant_dict['ALT'] and '<' not in variant_dict['ALT']:
                        other_coordinates = variant_dict['ALT'].strip('ACGTN[]').split(':')
                        other_chrom = other_coordinates[0].lstrip('chrCHR')
                        other_position = other_coordinates[1]
                        variant['stop'] = other_position

                        # Set 'infinity' to length if translocation
                        variant['sv_len'] = float('inf')
                    else:
                        variant['stop'] = int(info_dict.get('END', variant_dict['POS']))
                        variant['sv_len'] = variant['stop'] - variant['start']

                    variant['stop_chrom'] = other_chrom

                else:
                    variant['stop'] = int(variant_dict['POS']) + \
                        (len(variant_dict['REF']) - len(variant_dict['ALT']))

                variant['sv_type'] = info_dict.get('SVTYPE')
                variant['cytoband_start'] = get_cytoband_coord(
                                                chrom=variant['CHROM'],
                                                pos=variant['start'])
                if variant.get('stop_chrom'):
                    variant['cytoband_stop'] = get_cytoband_coord(
                                                chrom=variant['stop_chrom'],
                                                pos=variant['stop'])

                # It would be easy to update these keys...
                thousand_g = info_dict.get('1000GAF')
                if thousand_g:
                    logger.debug("Updating thousand_g to: {0}".format(
                        thousand_g))
                    variant['thousand_g'] = float(thousand_g)
                    variant.add_frequency('1000GAF', variant.get('thousand_g'))

                # SV specific tag for number of occurances
                occurances = info_dict.get('OCC')
                if occurances:
                    logger.debug("Updating occurances to: {0}".format(
                        occurances))
                    variant['occurances'] = float(occurances)
                    variant.add_frequency('OCC', occurances)

                cadd_score = info_dict.get('CADD')
                if cadd_score:
                    logger.debug("Updating cadd_score to: {0}".format(
                        cadd_score))
                    variant['cadd_score'] = float(cadd_score)

                rank_score_entry = info_dict.get('RankScore')
                if rank_score_entry:
                    # The score of the last family annotation is the one used
                    for family_annotation in rank_score_entry.split(','):
                        rank_score = family_annotation.split(':')[-1]
                    logger.debug("Updating rank_score to: {0}".format(
                        rank_score))
                    variant['rank_score'] = float(rank_score)

                genetic_models_entry = info_dict.get('GeneticModels')
                if genetic_models_entry:
                    genetic_models = []
                    for family_annotation in genetic_models_entry.split(','):
                        for genetic_model in family_annotation.split(':')[-1].split('|'):
                            genetic_models.append(genetic_model)
                    # Log the models. The old message printed rank_score,
                    # which is unbound when the variant has no RankScore.
                    logger.debug("Updating genetic_models to: {0}".format(
                        genetic_models))
                    variant['genetic_models'] = genetic_models

                # Add genotype calls:
                for individual in case_obj.individuals:
                    sample_id = individual.ind_id

                    if sample_id in vcf_individuals:

                        raw_call = dict(zip(
                            variant_dict['FORMAT'].split(':'),
                            variant_dict[sample_id].split(':'))
                        )
                        variant.add_individual(Genotype(
                            sample_id=sample_id,
                            genotype=raw_call.get('GT', './.'),
                            case_id=individual.case_name,
                            phenotype=individual.phenotype,
                            ref_depth=raw_call.get('AD', ',').split(',')[0],
                            alt_depth=raw_call.get('AD', ',').split(',')[1],
                            genotype_quality=raw_call.get('GQ', '.'),
                            depth=raw_call.get('DP', '.'),
                            supporting_evidence=raw_call.get('SU', '0'),
                            pe_support=raw_call.get('PE', '0'),
                            sr_support=raw_call.get('SR', '0'),
                        ))

                # Add transcript information:
                if vep_string:
                    for transcript in self._get_vep_transcripts(variant, vep_info):
                        variant.add_transcript(transcript)

                elif snpeff_string:
                    for transcript in self._get_snpeff_transcripts(variant, snpeff_info):
                        variant.add_transcript(transcript)

                variant['most_severe_consequence'] = get_most_severe_consequence(
                    variant['transcripts']
                )

                for gene in self._get_genes(variant):
                    variant.add_gene(gene)

                self._add_compounds(variant=variant, info_dict=info_dict)

                yield variant
Пример #14
0
 def test_get_vcf_handle_no_input(self):
     """get_vcf_handle without any input should raise an IOError"""
     with pytest.raises(IOError):
         get_vcf_handle()