示例#1
0
def check_coordinates(chromosome, pos, coordinates):
    """Tell whether a variant lies inside a region of interest

        The chromosome name is normalised with CHR_PATTERN (second capture
        group) before it is compared with the region chromosome.

        Args:
            chromosome(str): Variant chromosome
            pos(int): Variant position
            coordinates(dict): Region of interest with the keys
                               'chrom', 'start' and 'end'

        Returns:
            bool: True if the chromosome matches and
                  start <= pos <= end, otherwise False
    """
    normalised_chrom = CHR_PATTERN.match(chromosome).group(2)

    # A variant on another chromosome can never be inside the region
    if normalised_chrom != coordinates['chrom']:
        return False

    # Both interval borders are inclusive
    return bool(coordinates['start'] <= pos <= coordinates['end'])
示例#2
0
def get_end_chrom(alt, chrom):
    """Return the end chromosome for a translocation

    When the ALT string holds no ':' or does not match BND_ALT_PATTERN the
    start chromosome is returned unchanged. Otherwise the first capture
    group of BND_ALT_PATTERN is normalised with CHR_PATTERN (second capture
    group) and returned.

    Args:
        alt(str): The ALT field of the variant
        chrom(str): The chromosome where the variant starts

    Returns:
        end_chrom(str)
    """
    if ":" in alt:
        bnd_match = BND_ALT_PATTERN.match(alt)
        # BND will often be translocations between different chromosomes
        if bnd_match:
            mate_chrom = bnd_match.group(1)
            return CHR_PATTERN.match(mate_chrom).group(2)

    return chrom
示例#3
0
def is_par(chromosome, position, build='37'):
    """Tell whether a variant falls in a Pseudo Autosomal Region

    The chromosome name is normalised with CHR_PATTERN (second capture
    group). Only 'X' and 'Y' are looked up in PAR_COORDINATES.

    Args:
        chromosome(str)
        position(int)
        build(str): The genome build, used as key into PAR_COORDINATES

    Returns:
        bool: True if PAR_COORDINATES[build][chrom].search(position)
              yields a truthy result, otherwise False
    """
    chrom = CHR_PATTERN.match(chromosome).group(2)

    # PAR regions are only on X and Y
    if chrom not in ('X', 'Y'):
        return False

    # Any hit among the stored regions means the position is in a PAR
    return bool(PAR_COORDINATES[build][chrom].search(position))
示例#4
0
文件: variant.py 项目: dcrosta/scout
    # These are to display how the rank score is built
    rank_results_header = rank_results_header or []
    # Vep information
    vep_header = vep_header or []

    parsed_variant = {}

    # Create the ID for the variant
    case_id = case['_id']
    if '-' in case_id:
        logger.debug('internal case id detected')
        genmod_key = case['display_name']
    else:
        genmod_key = case['_id']

    chrom_match = CHR_PATTERN.match(variant.CHROM)
    chrom = chrom_match.group(2)
    # Builds a dictionary with the different ids that are used
    parsed_variant['ids'] = parse_ids(chrom=chrom,
                                      pos=variant.POS,
                                      ref=variant.REF,
                                      alt=variant.ALT[0],
                                      case_id=case_id,
                                      variant_type=variant_type)
    parsed_variant['case_id'] = case_id
    # type can be 'clinical' or 'research'
    parsed_variant['variant_type'] = variant_type
    # category is sv or snv
    # cyvcf2 knows if it is a sv, indel or snv variant
    if not category:
        category = variant.var_type
示例#5
0
def parse_coordinates(variant, category):
    """Collect the positional information for a variant

    Sub category, end and length are delegated to get_sub_category, get_end
    and get_length. For break ends ('bnd') whose ALT matches BND_ALT_PATTERN
    the end chromosome is taken from the ALT string.

    Args:
        variant(cyvcf2.Variant)
        category(str): Forwarded to get_sub_category, get_end and get_length

    Returns:
        coordinates(dict): A dictionary on the form:
        {
            'position':<int>,
            'end':<int>,
            'end_chrom':<str>,
            'length':<int>,
            'sub_category':<str>,
            'mate_id':<str>,
            'cytoband_start':<str>,
            'cytoband_end':<str>,
        }
    """
    info = variant.INFO
    alt = variant.ALT[0]
    chrom = CHR_PATTERN.match(variant.CHROM).group(2)

    svtype = info.get('SVTYPE')
    svtype = svtype.lower() if svtype else svtype

    position = int(variant.POS)
    ref_len = len(variant.REF)
    alt_len = len(alt)

    sub_category = get_sub_category(alt_len, ref_len, category, svtype)
    end = get_end(position, alt, category, int(variant.end), info.get('END'))
    length = get_length(alt_len, ref_len, category, position, end, svtype,
                        info.get('SVLEN'))

    # The variant ends on its own chromosome unless the ALT says otherwise
    end_chrom = chrom
    if sub_category == 'bnd' and ':' in alt:
        bnd_match = BND_ALT_PATTERN.match(alt)
        # BND will often be translocations between different chromosomes
        if bnd_match:
            end_chrom = CHR_PATTERN.match(bnd_match.group(1)).group(2)

    return {
        'position': position,
        'end': end,
        'length': length,
        'sub_category': sub_category,
        'mate_id': info.get('MATEID'),
        'cytoband_start': get_cytoband_coordinates(chrom, position),
        'cytoband_end': get_cytoband_coordinates(end_chrom, end),
        'end_chrom': end_chrom,
    }
示例#6
0
def parse_coordinates(variant, category):
    """Find out the coordinates for a variant

    For the categories "sv" and "cancer_sv" the sub category is the
    lower-cased SVTYPE and end/length are delegated to sv_end and sv_length.
    For every other category the variant is an "snv" when REF and ALT have
    the same length; otherwise it is an "indel" whose length is the absolute
    difference between the REF and ALT lengths.

    Args:
        variant(cyvcf2.Variant)
        category(str): Variant category, "sv" and "cancer_sv" select the
                       structural variant handling, "str" allows a missing ALT

    Returns:
        coordinates(dict): A dictionary on the form:
        {
            'position':<int>,
            'end':<int>,
            'end_chrom':<str>,
            'length':<int>,
            'sub_category':<str>,
            'mate_id':<str>,
            'cytoband_start':<str>,
            'cytoband_end':<str>,
        }
    """
    # NOTE(review): alt stays unassigned when the record has no ALT and the
    # category is not "str"; len(alt) below then raises UnboundLocalError.
    if variant.ALT:
        alt = variant.ALT[0]
    if category == "str" and not variant.ALT:
        alt = "."

    chrom_match = CHR_PATTERN.match(variant.CHROM)
    chrom = chrom_match.group(2)
    # The variant ends on its own chromosome unless a break end says otherwise
    end_chrom = chrom

    position = int(variant.POS)

    ref_len = len(variant.REF)
    alt_len = len(alt)

    if category in {"sv", "cancer_sv"}:
        svtype = variant.INFO.get("SVTYPE")
        if svtype:
            svtype = svtype.lower()
        sub_category = svtype
        if sub_category == "bnd":
            end_chrom = get_end_chrom(alt, chrom)
        end = sv_end(
            pos=position,
            alt=alt,
            svend=variant.INFO.get("END"),
            svlen=variant.INFO.get("SVLEN"),
        )
        length = sv_length(
            pos=position,
            end=end,
            chrom=chrom,
            end_chrom=end_chrom,
            svlen=variant.INFO.get("SVLEN"),
        )

    else:
        sub_category = "snv"
        end = int(variant.end)
        length = alt_len
        if ref_len != alt_len:
            sub_category = "indel"
            # The size of an indel is the number of inserted/deleted bases.
            # The result of abs() has to be stored, otherwise the indel
            # would keep the ALT length assigned above.
            length = abs(ref_len - alt_len)

    coordinates = {
        "position": position,
        "end": end,
        "length": length,
        "sub_category": sub_category,
        "mate_id": variant.INFO.get("MATEID"),
        "cytoband_start": get_cytoband_coordinates(chrom, position),
        "cytoband_end": get_cytoband_coordinates(end_chrom, end),
        "end_chrom": end_chrom,
    }

    return coordinates
示例#7
0
def parse_coordinates(variant, category):
    """Collect the positional information for a variant

    Sub category, end and length are delegated to get_sub_category, get_end
    and get_length. For break ends ("bnd") whose ALT matches BND_ALT_PATTERN
    the end chromosome is taken from the ALT string.

    Args:
        variant(cyvcf2.Variant)
        category(str): Forwarded to the helpers; "str" additionally allows a
                       record without ALT, "." is used in that case

    Returns:
        coordinates(dict): A dictionary on the form:
        {
            'position':<int>,
            'end':<int>,
            'end_chrom':<str>,
            'length':<int>,
            'sub_category':<str>,
            'mate_id':<str>,
            'cytoband_start':<str>,
            'cytoband_end':<str>,
        }
    """
    info = variant.INFO

    # Use the first ALT allele, STR records without ALT get a placeholder
    alts = variant.ALT
    if alts:
        alt = alts[0]
    elif category == "str":
        alt = "."

    chrom = CHR_PATTERN.match(variant.CHROM).group(2)

    svtype = info.get("SVTYPE")
    svtype = svtype.lower() if svtype else svtype

    svlen = info.get("SVLEN")
    svend = info.get("END")
    snvend = int(variant.end)
    position = int(variant.POS)

    ref_len = len(variant.REF)
    alt_len = len(alt)

    sub_category = get_sub_category(alt_len, ref_len, category, svtype)
    end = get_end(position, alt, category, snvend, svend)
    length = get_length(alt_len, ref_len, category, position, end, svtype,
                        svlen)

    # The variant ends on its own chromosome unless the ALT says otherwise
    end_chrom = chrom
    if sub_category == "bnd" and ":" in alt:
        bnd_match = BND_ALT_PATTERN.match(alt)
        # BND will often be translocations between different chromosomes
        if bnd_match:
            end_chrom = CHR_PATTERN.match(bnd_match.group(1)).group(2)

    return {
        "position": position,
        "end": end,
        "length": length,
        "sub_category": sub_category,
        "mate_id": info.get("MATEID"),
        "cytoband_start": get_cytoband_coordinates(chrom, position),
        "cytoband_end": get_cytoband_coordinates(end_chrom, end),
        "end_chrom": end_chrom,
    }
示例#8
0
def parse_variant(variant, case, variant_type='clinical',
                 rank_results_header=None, vep_header=None,
                 individual_positions=None, category=None):
    """Return a parsed variant

        Get all the necessary information to build a variant object.
        The information is read from the cyvcf2 record and its INFO field
        and collected in a plain dictionary.

    Args:
        variant(cyvcf2.Variant)
        case(dict): Must hold '_id', 'display_name' and 'individuals'
        variant_type(str): 'clinical' or 'research'
        rank_results_header(list): Names of the RankResult fields,
                                   defaults to an empty list
        vep_header(list): Names of the VEP CSQ fields,
                          defaults to an empty list
        individual_positions(dict): Explain what position each individual has
                                    in vcf
        category(str): 'snv', 'sv', 'str' or 'cancer'. When not given it is
                       taken from variant.var_type ('indel' and 'snp' are
                       both mapped to 'snv')

    Returns:
        parsed_variant(dict): Parsed variant

    Raises:
        VcfError: If the variant has more than one alternative allele
    """
    # These are to display how the rank score is built
    rank_results_header = rank_results_header or []
    # Vep information
    vep_header = vep_header or []

    parsed_variant = {}

    # Create the ID for the variant
    case_id = case['_id']
    # A '-' in the case id means the display name is used as key when
    # looking up rank score, compounds and genetic models
    if '-' in case_id:
        logger.debug('internal case id detected')
        genmod_key = case['display_name']
    else:
        genmod_key = case['_id']

    # The chromosome name is the second capture group of CHR_PATTERN
    chrom_match = CHR_PATTERN.match(variant.CHROM)
    chrom = chrom_match.group(2)

    # Pick the first ALT allele, STR records without ALT get '.'
    # NOTE(review): alt is never assigned when ALT is empty and the category
    # is not "str"; the parse_ids call below would then fail.

    if variant.ALT:
        alt=variant.ALT[0]
    elif not variant.ALT and category == "str":
        alt='.'

    # Builds a dictionary with the different ids that are used
    parsed_variant['ids'] = parse_ids(
        chrom=chrom,
        pos=variant.POS,
        ref=variant.REF,
        alt=alt,
        case_id=case_id,
        variant_type=variant_type,
    )
    parsed_variant['case_id'] = case_id
    # type can be 'clinical' or 'research'
    parsed_variant['variant_type'] = variant_type
    # category is sv or snv
    # cyvcf2 knows if it is a sv, indel or snv variant
    if not category:
        category = variant.var_type
        if category == 'indel':
            category = 'snv'
        if category == 'snp':
            category = 'snv'

    parsed_variant['category'] = category

    ################# General information #################

    parsed_variant['reference'] = variant.REF

    # We always assume split and normalized VCFs
    if len(variant.ALT) > 1:
        raise VcfError("Variants are only allowed to have one alternative")
    parsed_variant['alternative'] = alt

    # cyvcf2 will set QUAL to None if '.' in vcf
    parsed_variant['quality'] = variant.QUAL

    # A missing FILTER value is stored as a single 'PASS' entry
    if variant.FILTER:
        parsed_variant['filters'] = variant.FILTER.split(';')
    else:
        parsed_variant['filters'] = ['PASS']

    # Add the dbsnp ids
    parsed_variant['dbsnp_id'] = variant.ID

    # This is the id of other position in translocations
    # (only for specific svs)
    parsed_variant['mate_id'] = None

    ################# Position specific #################
    parsed_variant['chromosome'] = chrom

    coordinates = parse_coordinates(variant, category)

    parsed_variant['position'] = coordinates['position']
    parsed_variant['sub_category'] = coordinates['sub_category']
    parsed_variant['mate_id'] = coordinates['mate_id']
    parsed_variant['end'] = coordinates['end']
    parsed_variant['length'] = coordinates['length']
    parsed_variant['end_chrom'] = coordinates['end_chrom']
    parsed_variant['cytoband_start'] = coordinates['cytoband_start']
    parsed_variant['cytoband_end'] = coordinates['cytoband_end']

    ################# Add rank score #################
    # The rank score is central for displaying variants in scout.

    rank_score = parse_rank_score(variant.INFO.get('RankScore', ''), genmod_key)
    # A falsy rank score is stored as 0
    parsed_variant['rank_score'] = rank_score or 0


    ################# Add gt calls #################
    if individual_positions and case['individuals']:
        parsed_variant['samples'] = parse_genotypes(variant, case['individuals'],
                                                    individual_positions)
    else:
        parsed_variant['samples'] = []

    ################# Add compound information #################
    compounds = parse_compounds(compound_info=variant.INFO.get('Compounds'),
                                case_id=genmod_key,
                                variant_type=variant_type)
    if compounds:
        parsed_variant['compounds'] = compounds

    ################# Add inheritance patterns #################

    genetic_models = parse_genetic_models(variant.INFO.get('GeneticModels'), genmod_key)
    if genetic_models:
        parsed_variant['genetic_models'] = genetic_models

    ################# Add autozygosity calls if present #################

    azlength = variant.INFO.get('AZLENGTH')
    if azlength:
        parsed_variant['azlength'] = int(azlength)

    azqual = variant.INFO.get('AZQUAL')
    if azqual:
        parsed_variant['azqual'] = float(azqual)

    ################ Add STR info if present ################

    # repeat id generally corresponds to gene symbol
    repeat_id = variant.INFO.get('REPID')
    if repeat_id:
        parsed_variant['str_repid'] = str(repeat_id)

    # repeat unit - used e g in PanelApp naming of STRs
    repeat_unit = variant.INFO.get('RU')
    if repeat_unit:
        parsed_variant['str_ru'] = str(repeat_unit)

    # repeat ref - reference copy number
    repeat_ref = variant.INFO.get('REF')
    if repeat_ref:
        parsed_variant['str_ref'] = int(repeat_ref)

    # repeat len - number of repeats found in case
    repeat_len = variant.INFO.get('RL')
    if repeat_len:
        parsed_variant['str_len'] = int(repeat_len)

    # str status - this indicates the severity of the expansion level
    str_status = variant.INFO.get('STR_STATUS')
    if str_status:
        parsed_variant['str_status'] = str(str_status)

    ################# Add gene and transcript information #################
    # Each comma separated CSQ entry is split on '|' and paired with the
    # VEP header names. The result is a generator that is consumed once by
    # parse_transcripts below.
    raw_transcripts = []
    if vep_header:
        vep_info = variant.INFO.get('CSQ')
        if vep_info:
            raw_transcripts = (dict(zip(vep_header, transcript_info.split('|')))
                               for transcript_info in vep_info.split(','))

    parsed_transcripts = []
    dbsnp_ids = set()
    cosmic_ids = set()
    # Collect the transcripts together with every dbsnp and cosmic id they
    # carry
    for parsed_transcript in parse_transcripts(raw_transcripts, parsed_variant['alternative']):
        parsed_transcripts.append(parsed_transcript)
        for dbsnp in parsed_transcript.get('dbsnp', []):
            dbsnp_ids.add(dbsnp)
        for cosmic in parsed_transcript.get('cosmic', []):
            cosmic_ids.add(cosmic)

    # The COSMIC tag in INFO is added via VEP and/or bcftools annotate

    cosmic_tag = variant.INFO.get('COSMIC')
    if cosmic_tag:
        # The first four characters of the tag are dropped
        # (presumably a 'COSM' prefix - TODO confirm)
        cosmic_ids.add(cosmic_tag[4:])

    # Ids from the transcripts are only used when the record has no ID
    if (dbsnp_ids and not parsed_variant['dbsnp_id']):
        parsed_variant['dbsnp_id'] = ';'.join(dbsnp_ids)

    if cosmic_ids:
        parsed_variant['cosmic_ids'] = list(cosmic_ids)

    gene_info = parse_genes(parsed_transcripts)

    parsed_variant['genes'] = gene_info

    # Unique hgnc ids of all genes that were found
    hgnc_ids = set([])

    for gene in parsed_variant['genes']:
        hgnc_ids.add(gene['hgnc_id'])

    parsed_variant['hgnc_ids'] = list(hgnc_ids)

    ################# Add clinsig prediction #################
    # CLNACC is preferred as accession, CLNVID is the fallback
    if variant.INFO.get('CLNACC'):
        acc = variant.INFO.get('CLNACC')
    else:
        acc = variant.INFO.get('CLNVID')
    clnsig_predictions = parse_clnsig(
        acc=acc,
        sig=variant.INFO.get('CLNSIG'),
        revstat=variant.INFO.get('CLNREVSTAT'),
        transcripts=parsed_transcripts
        )

    if clnsig_predictions:
        parsed_variant['clnsig'] = clnsig_predictions

    ################# Add the frequencies #################
    frequencies = parse_frequencies(variant, parsed_transcripts)

    parsed_variant['frequencies'] = frequencies

    # parse out old local observation count
    local_obs_old = variant.INFO.get('Obs')
    if local_obs_old:
        parsed_variant['local_obs_old'] = int(local_obs_old)

    local_obs_hom_old = variant.INFO.get('Hom')
    if local_obs_hom_old:
        parsed_variant['local_obs_hom_old'] = int(local_obs_hom_old)

    ###################### Add severity predictions ######################
    cadd = parse_cadd(variant, parsed_transcripts)
    if cadd:
        parsed_variant['cadd_score'] = cadd

    spidex = variant.INFO.get('SPIDEX')
    if spidex:
        parsed_variant['spidex'] = float(spidex)

    ###################### Add conservation ######################

    parsed_variant['conservation'] = parse_conservations(variant)

    parsed_variant['callers'] = parse_callers(variant, category=category)

    # RankResult holds '|' separated integers that are paired with the
    # names in rank_results_header
    rank_result = variant.INFO.get('RankResult')
    if rank_result:
        results = [int(i) for i in rank_result.split('|')]
        parsed_variant['rank_result'] = dict(zip(rank_results_header, results))

    ###################### Add SV specific annotations ######################
    # SV frequencies are merged into the frequencies stored above
    sv_frequencies = parse_sv_frequencies(variant)
    for key in sv_frequencies:
        parsed_variant['frequencies'][key] = sv_frequencies[key]

    ###################### Add Cancer specific annotations ######################
    # MSK_MVL indicates if variants are in the MSK managed variant list
    # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5437632/
    mvl_tag = variant.INFO.get('MSK_MVL')
    if mvl_tag:
        parsed_variant['mvl_tag'] = True

    return parsed_variant
示例#9
0
def load_variants(adapter,
                  variant_file,
                  case_obj,
                  variant_type='clinical',
                  category='snv',
                  rank_threshold=6,
                  chrom=None,
                  start=None,
                  end=None):
    """Parse a variant file and insert the selected variants

        A variant is loaded when one of the following holds:
          - a region was given (chrom is set) and check_coordinates
            accepts the variant
          - no region was given and the variant is on chromosome MT
          - no region was given and the rank score reaches rank_threshold

        If anything fails while no region was given, delete_variants is
        called for the case and variant type before the error is re-raised.

        Args:
            adapter(MongoAdapter)
            variant_file(str): Path to variant file
            case_obj(Case)
            variant_type(str)
            category(str): 'snv' or 'sv' (not used inside this function)
            rank_threshold(int)
            chrom(str)
            start(int)
            end(int)

        Raises:
            IntegrityError: If the institute owning the case does not exist
    """
    owner_id = case_obj['owner']
    institute_obj = adapter.institute(institute_id=owner_id)
    if not institute_obj:
        raise IntegrityError("Institute {0} does not exist in"
                             " database.".format(owner_id))

    gene_to_panels = adapter.gene_to_panels()
    hgncid_to_gene = adapter.hgncid_to_gene()

    vcf_obj = VCF(variant_file)
    rank_results_header = parse_rank_results_header(vcf_obj)
    vep_header = parse_vep_header(vcf_obj)

    # This is a dictionary to tell where ind are in vcf
    individual_positions = {
        ind: index for index, ind in enumerate(vcf_obj.samples)
    }

    LOG.info("Start inserting variants into database")
    start_insertion = datetime.now()
    start_five_thousand = datetime.now()
    # To get it right if the file is empty
    nr_variants = -1
    nr_inserted = 0

    # The region of interest, False when the whole file is considered
    if chrom:
        coordinates = {'chrom': chrom, 'start': start, 'end': end}
    else:
        coordinates = False

    try:
        for nr_variants, variant in enumerate(vcf_obj):
            # Parse away any chr CHR prefix
            variant_chrom = CHR_PATTERN.match(variant.CHROM).group(2)

            if coordinates:
                # With a region given, everything inside it is loaded
                add_variant = bool(
                    check_coordinates(variant_chrom, variant.POS,
                                      coordinates))
            elif variant_chrom == 'MT':
                # Without a region, MT variants are always loaded
                add_variant = True
            else:
                # Otherwise the rank score decides
                rank_score = parse_rank_score(variant.INFO.get('RankScore'),
                                              case_obj['display_name'])
                add_variant = rank_score >= rank_threshold

            # Report the progress for every 5000 parsed variants
            if nr_variants and nr_variants % 5000 == 0:
                LOG.info("%s variants parsed" % str(nr_variants))
                elapsed = datetime.now() - start_five_thousand
                LOG.info("Time to parse variants: {} ".format(elapsed))
                start_five_thousand = datetime.now()

            if not add_variant:
                continue

            # We follow the scout paradigm of parse -> build -> load
            parsed_variant = parse_variant(
                variant=variant,
                case=case_obj,
                variant_type=variant_type,
                rank_results_header=rank_results_header,
                vep_header=vep_header,
                individual_positions=individual_positions)

            variant_obj = build_variant(
                variant=parsed_variant,
                institute_id=institute_obj['_id'],
                gene_to_panels=gene_to_panels,
                hgncid_to_gene=hgncid_to_gene,
            )

            # When all variants of a region are loaded some of them are
            # likely to exist already, those are silently skipped
            try:
                load_variant(adapter, variant_obj)
            except IntegrityError:
                pass
            else:
                nr_inserted += 1

            # Report the progress when the insert count is a multiple of 1000
            if nr_inserted and nr_inserted % 1000 == 0:
                LOG.info("%s variants inserted" % nr_inserted)

    except Exception as error:
        # A failed load of a whole file is rolled back, a failed region
        # load leaves the existing variants untouched
        if not coordinates:
            LOG.warning("Deleting inserted variants")
            delete_variants(adapter, case_obj, variant_type)
        raise error

    LOG.info("All variants inserted.")
    LOG.info("Number of variants in file: {0}".format(nr_variants + 1))
    LOG.info("Number of variants inserted: {0}".format(nr_inserted))
    time_used = datetime.now() - start_insertion
    LOG.info("Time to insert variants:{0}".format(time_used))
示例#10
0
def parse_coordinates(variant, category):
    """Collect the positional information for a variant

    Sub category, end and length are delegated to get_sub_category, get_end
    and get_length. For break ends ('bnd') whose ALT matches BND_ALT_PATTERN
    the end chromosome is taken from the ALT string.

    Args:
        variant(cyvcf2.Variant)
        category(str): Forwarded to the helpers; "str" additionally allows a
                       record without ALT, '.' is used in that case

    Returns:
        coordinates(dict): A dictionary on the form:
        {
            'position':<int>,
            'end':<int>,
            'end_chrom':<str>,
            'length':<int>,
            'sub_category':<str>,
            'mate_id':<str>,
            'cytoband_start':<str>,
            'cytoband_end':<str>,
        }
    """
    info = variant.INFO

    # Use the first ALT allele, STR records without ALT get a placeholder
    alts = variant.ALT
    if alts:
        alt = alts[0]
    elif category == "str":
        alt = '.'

    chrom = CHR_PATTERN.match(variant.CHROM).group(2)

    svtype = info.get('SVTYPE')
    svtype = svtype.lower() if svtype else svtype

    svlen = info.get('SVLEN')
    svend = info.get('END')
    snvend = int(variant.end)
    position = int(variant.POS)

    ref_len = len(variant.REF)
    alt_len = len(alt)

    sub_category = get_sub_category(alt_len, ref_len, category, svtype)
    end = get_end(position, alt, category, snvend, svend)
    length = get_length(alt_len, ref_len, category, position, end, svtype, svlen)

    # The variant ends on its own chromosome unless the ALT says otherwise
    end_chrom = chrom
    if sub_category == 'bnd' and ':' in alt:
        bnd_match = BND_ALT_PATTERN.match(alt)
        # BND will often be translocations between different chromosomes
        if bnd_match:
            end_chrom = CHR_PATTERN.match(bnd_match.group(1)).group(2)

    return {
        'position': position,
        'end': end,
        'length': length,
        'sub_category': sub_category,
        'mate_id': info.get('MATEID'),
        'cytoband_start': get_cytoband_coordinates(chrom, position),
        'cytoband_end': get_cytoband_coordinates(end_chrom, end),
        'end_chrom': end_chrom,
    }