def check_coordinates(chromosome, pos, coordinates):
    """Tell whether a variant lies inside a region of interest

    Args:
        chromosome(str): Variant chromosome, has to match CHR_PATTERN
        pos(int): Variant position
        coordinates(dict): Region of interest with the keys 'chrom',
                           'start' and 'end'

    Returns:
        bool: True if the chromosome is the same as coordinates['chrom']
              and pos is within start and end (both included)
    """
    # The second group of CHR_PATTERN is what gets compared to the region
    region_chrom = CHR_PATTERN.match(chromosome).group(2)
    if region_chrom != coordinates['chrom']:
        return False

    return bool(pos >= coordinates['start'] and pos <= coordinates['end'])
def get_end_chrom(alt, chrom):
    """Return the end chromosome for a translocation

    Args:
        alt(str): The alternative allele of the variant
        chrom(str): Chromosome of the variant, returned unchanged when no
                    other chromosome can be read from alt

    Returns:
        end_chrom(str)
    """
    # Only an alt that holds a ':' is matched against the BND pattern
    if ":" in alt:
        bnd_match = BND_ALT_PATTERN.match(alt)
        # BND will often be translocations between different chromosomes
        if bnd_match:
            mate_chrom = bnd_match.group(1)
            return CHR_PATTERN.match(mate_chrom).group(2)

    return chrom
def is_par(chromosome, position, build='37'):
    """Check if a variant is in a Pseudo Autosomal Region or not

    Args:
        chromosome(str)
        position(int)
        build(str): The genome build, used as key in PAR_COORDINATES

    Returns:
        bool
    """
    chrom = CHR_PATTERN.match(chromosome).group(2)

    # PAR regions are only on X and Y
    if chrom not in ('X', 'Y'):
        return False

    # Any hit for the position among the regions of this chromosome counts
    par_hits = PAR_COORDINATES[build][chrom].search(position)
    return bool(par_hits)
# These are to display how the rank score is built rank_results_header = rank_results_header or [] # Vep information vep_header = vep_header or [] parsed_variant = {} # Create the ID for the variant case_id = case['_id'] if '-' in case_id: logger.debug('internal case id detected') genmod_key = case['display_name'] else: genmod_key = case['_id'] chrom_match = CHR_PATTERN.match(variant.CHROM) chrom = chrom_match.group(2) # Builds a dictionary with the different ids that are used parsed_variant['ids'] = parse_ids(chrom=chrom, pos=variant.POS, ref=variant.REF, alt=variant.ALT[0], case_id=case_id, variant_type=variant_type) parsed_variant['case_id'] = case_id # type can be 'clinical' or 'research' parsed_variant['variant_type'] = variant_type # category is sv or snv # cyvcf2 knows if it is a sv, indel or snv variant if not category: category = variant.var_type
def parse_coordinates(variant, category):
    """Collect position, end, length and cytoband information for a variant

    Args:
        variant(cyvcf2.Variant)
        category(str): Variant category, handed over to get_sub_category,
                       get_end and get_length

    Returns:
        coordinates(dict): A dictionary on the form:
        {
            'position':<int>,
            'end':<int>,
            'end_chrom':<str>,
            'length':<int>,
            'sub_category':<str>,
            'mate_id':<str>,
            'cytoband_start':<str>,
            'cytoband_end':<str>,
        }
    """
    ref = variant.REF
    alt = variant.ALT[0]
    chrom = CHR_PATTERN.match(variant.CHROM).group(2)

    info = variant.INFO
    svtype = info.get('SVTYPE')
    if svtype:
        svtype = svtype.lower()
    mate_id = info.get('MATEID')
    svlen = info.get('SVLEN')
    svend = info.get('END')

    snvend = int(variant.end)
    position = int(variant.POS)
    ref_len, alt_len = len(ref), len(alt)

    sub_category = get_sub_category(alt_len, ref_len, category, svtype)
    end = get_end(position, alt, category, snvend, svend)
    length = get_length(alt_len, ref_len, category, position, end, svtype, svlen)

    # The end is on the same chromosome unless a breakend says otherwise
    end_chrom = chrom
    if sub_category == 'bnd' and ':' in alt:
        bnd_match = BND_ALT_PATTERN.match(alt)
        # BND will often be translocations between different chromosomes
        if bnd_match:
            end_chrom = CHR_PATTERN.match(bnd_match.group(1)).group(2)

    return {
        'position': position,
        'end': end,
        'length': length,
        'sub_category': sub_category,
        'mate_id': mate_id,
        'cytoband_start': get_cytoband_coordinates(chrom, position),
        'cytoband_end': get_cytoband_coordinates(end_chrom, end),
        'end_chrom': end_chrom,
    }
def parse_coordinates(variant, category):
    """Find out the coordinates for a variant

    Structural variants (category "sv" or "cancer_sv") get their end and
    length from sv_end and sv_length. All other categories are handled as
    snv/indel, where the length is derived from the REF and ALT alleles.

    Args:
        variant(cyvcf2.Variant)
        category(str): Variant category, e.g. "sv", "cancer_sv" or "str"

    Returns:
        coordinates(dict): A dictionary on the form:
        {
            'position':<int>,
            'end':<int>,
            'end_chrom':<str>,
            'length':<int>,
            'sub_category':<str>,
            'mate_id':<str>,
            'cytoband_start':<str>,
            'cytoband_end':<str>,
        }
    """
    if variant.ALT:
        alt = variant.ALT[0]
    if category == "str" and not variant.ALT:
        # STR records without an ALT allele fall back to "."
        alt = "."
    # NOTE(review): alt stays unbound when ALT is empty and category is not
    # "str" - confirm that callers never pass such records

    chrom_match = CHR_PATTERN.match(variant.CHROM)
    chrom = chrom_match.group(2)
    # The end is on the same chromosome unless a breakend says otherwise
    end_chrom = chrom

    position = int(variant.POS)

    ref_len = len(variant.REF)
    alt_len = len(alt)

    if category in {"sv", "cancer_sv"}:
        svtype = variant.INFO.get("SVTYPE")
        if svtype:
            svtype = svtype.lower()
        sub_category = svtype

        if sub_category == "bnd":
            end_chrom = get_end_chrom(alt, chrom)

        end = sv_end(
            pos=position,
            alt=alt,
            svend=variant.INFO.get("END"),
            svlen=variant.INFO.get("SVLEN"),
        )
        length = sv_length(
            pos=position,
            end=end,
            chrom=chrom,
            end_chrom=end_chrom,
            svlen=variant.INFO.get("SVLEN"),
        )
    else:
        sub_category = "snv"
        end = int(variant.end)
        length = alt_len
        if ref_len != alt_len:
            sub_category = "indel"
            # The length of an indel is the size difference between the
            # alleles. This value used to be computed and thrown away,
            # leaving length == alt_len for indels.
            length = abs(ref_len - alt_len)

    coordinates = {
        "position": position,
        "end": end,
        "length": length,
        "sub_category": sub_category,
        "mate_id": variant.INFO.get("MATEID"),
        "cytoband_start": get_cytoband_coordinates(chrom, position),
        "cytoband_end": get_cytoband_coordinates(end_chrom, end),
        "end_chrom": end_chrom,
    }

    return coordinates
def parse_coordinates(variant, category):
    """Collect position, end, length and cytoband information for a variant

    Args:
        variant(cyvcf2.Variant)
        category(str): Variant category, handed over to get_sub_category,
                       get_end and get_length. With "str" an empty ALT is
                       replaced by "."

    Returns:
        coordinates(dict): A dictionary on the form:
        {
            'position':<int>,
            'end':<int>,
            'end_chrom':<str>,
            'length':<int>,
            'sub_category':<str>,
            'mate_id':<str>,
            'cytoband_start':<str>,
            'cytoband_end':<str>,
        }
    """
    ref = variant.REF
    # alt is only assigned when there is an ALT allele or category is "str"
    if variant.ALT:
        alt = variant.ALT[0]
    elif category == "str":
        alt = "."

    chrom = CHR_PATTERN.match(variant.CHROM).group(2)

    info = variant.INFO
    svtype = info.get("SVTYPE")
    if svtype:
        svtype = svtype.lower()
    mate_id = info.get("MATEID")
    svlen = info.get("SVLEN")
    svend = info.get("END")

    snvend = int(variant.end)
    position = int(variant.POS)
    ref_len = len(ref)
    alt_len = len(alt)

    sub_category = get_sub_category(alt_len, ref_len, category, svtype)
    end = get_end(position, alt, category, snvend, svend)
    length = get_length(alt_len, ref_len, category, position, end, svtype, svlen)

    # The end is on the same chromosome unless a breakend says otherwise
    end_chrom = chrom
    if sub_category == "bnd" and ":" in alt:
        bnd_match = BND_ALT_PATTERN.match(alt)
        # BND will often be translocations between different chromosomes
        if bnd_match:
            end_chrom = CHR_PATTERN.match(bnd_match.group(1)).group(2)

    return {
        "position": position,
        "end": end,
        "length": length,
        "sub_category": sub_category,
        "mate_id": mate_id,
        "cytoband_start": get_cytoband_coordinates(chrom, position),
        "cytoband_end": get_cytoband_coordinates(end_chrom, end),
        "end_chrom": end_chrom,
    }
def parse_variant(variant, case, variant_type='clinical', rank_results_header=None,
                  vep_header=None, individual_positions=None, category=None):
    """Return a parsed variant

    Collects everything that is needed to build a variant object into one
    dictionary.

    Args:
        variant(cyvcf2.Variant)
        case(dict)
        variant_type(str): 'clinical' or 'research'
        rank_results_header(list)
        vep_header(list)
        individual_positions(dict): Explain what position each individual has
                                    in vcf
        category(str): 'snv', 'sv', 'str' or 'cancer'. When left out the
                       category is taken from variant.var_type

    Returns:
        parsed_variant(dict): Parsed variant

    Raises:
        VcfError: If the variant has more than one alternative allele
    """
    # The rank results header is used to show how the rank score is built
    rank_results_header = rank_results_header or []
    # Header of the VEP annotation
    vep_header = vep_header or []

    parsed_variant = {}

    # genmod_key is used when parsing rank score, compounds and genetic
    # models. Case ids that hold a '-' use the display name as key.
    case_id = case['_id']
    if '-' in case_id:
        logger.debug('internal case id detected')
        genmod_key = case['display_name']
    else:
        genmod_key = case['_id']

    chrom = CHR_PATTERN.match(variant.CHROM).group(2)

    # alt is only assigned when there is an ALT allele or category is "str"
    if variant.ALT:
        alt = variant.ALT[0]
    elif category == "str":
        alt = '.'

    # Dictionary with the different ids that are used
    parsed_variant['ids'] = parse_ids(
        chrom=chrom,
        pos=variant.POS,
        ref=variant.REF,
        alt=alt,
        case_id=case_id,
        variant_type=variant_type,
    )
    parsed_variant['case_id'] = case_id
    # type can be 'clinical' or 'research'
    parsed_variant['variant_type'] = variant_type

    # category is sv or snv
    # cyvcf2 knows if it is a sv, indel or snv variant
    if not category:
        category = variant.var_type
        if category in ('indel', 'snp'):
            category = 'snv'
    parsed_variant['category'] = category

    ################# General information #################

    parsed_variant['reference'] = variant.REF

    ### We always assume split and normalized vcfs!!!
    if len(variant.ALT) > 1:
        raise VcfError("Variants are only allowed to have one alternative")
    parsed_variant['alternative'] = alt

    # cyvcf2 will set QUAL to None if '.' in vcf
    parsed_variant['quality'] = variant.QUAL

    vcf_filter = variant.FILTER
    parsed_variant['filters'] = vcf_filter.split(';') if vcf_filter else ['PASS']

    # Add the dbsnp ids
    parsed_variant['dbsnp_id'] = variant.ID

    # This is the id of other position in translocations
    # (only for specific svs)
    parsed_variant['mate_id'] = None

    ################# Position specific #################
    parsed_variant['chromosome'] = chrom

    coordinates = parse_coordinates(variant, category)
    coordinate_keys = (
        'position',
        'sub_category',
        'mate_id',
        'end',
        'length',
        'end_chrom',
        'cytoband_start',
        'cytoband_end',
    )
    for coordinate_key in coordinate_keys:
        parsed_variant[coordinate_key] = coordinates[coordinate_key]

    info = variant.INFO

    ################# Add rank score #################
    # The rank score is central for displaying variants in scout.
    rank_score = parse_rank_score(info.get('RankScore', ''), genmod_key)
    parsed_variant['rank_score'] = rank_score or 0

    ################# Add gt calls #################
    if individual_positions and case['individuals']:
        samples = parse_genotypes(variant, case['individuals'], individual_positions)
    else:
        samples = []
    parsed_variant['samples'] = samples

    ################# Add compound information #################
    compounds = parse_compounds(
        compound_info=info.get('Compounds'),
        case_id=genmod_key,
        variant_type=variant_type,
    )
    if compounds:
        parsed_variant['compounds'] = compounds

    ################# Add inheritance patterns #################
    genetic_models = parse_genetic_models(info.get('GeneticModels'), genmod_key)
    if genetic_models:
        parsed_variant['genetic_models'] = genetic_models

    # INFO entries that are converted and stored only if they are present:
    # (INFO key, key in parsed_variant, converter)
    optional_annotations = (
        # Autozygosity calls
        ('AZLENGTH', 'azlength', int),
        ('AZQUAL', 'azqual', float),
        # STR info
        # repeat id generally corresponds to gene symbol
        ('REPID', 'str_repid', str),
        # repeat unit - used e g in PanelApp naming of STRs
        ('RU', 'str_ru', str),
        # repeat ref - reference copy number
        ('REF', 'str_ref', int),
        # repeat len - number of repeats found in case
        ('RL', 'str_len', int),
        # str status - this indicates the severity of the expansion level
        ('STR_STATUS', 'str_status', str),
    )
    for info_key, field, convert in optional_annotations:
        raw_value = info.get(info_key)
        if raw_value:
            parsed_variant[field] = convert(raw_value)

    ################# Add gene and transcript information #################
    raw_transcripts = []
    if vep_header:
        vep_info = info.get('CSQ')
        if vep_info:
            # One dictionary per transcript, keyed by the VEP header
            raw_transcripts = (
                dict(zip(vep_header, csq_entry.split('|')))
                for csq_entry in vep_info.split(',')
            )

    parsed_transcripts = []
    dbsnp_ids = set()
    cosmic_ids = set()
    for transcript in parse_transcripts(raw_transcripts, parsed_variant['alternative']):
        parsed_transcripts.append(transcript)
        dbsnp_ids.update(transcript.get('dbsnp', []))
        cosmic_ids.update(transcript.get('cosmic', []))

    # The COSMIC tag in INFO is added via VEP and/or bcftools annotate
    cosmic_tag = info.get('COSMIC')
    if cosmic_tag:
        # The first four characters of the tag are dropped
        cosmic_ids.add(cosmic_tag[4:])

    # Ids from the transcripts are only used if the vcf had no id
    if dbsnp_ids and not parsed_variant['dbsnp_id']:
        parsed_variant['dbsnp_id'] = ';'.join(dbsnp_ids)

    if cosmic_ids:
        parsed_variant['cosmic_ids'] = list(cosmic_ids)

    parsed_variant['genes'] = parse_genes(parsed_transcripts)
    parsed_variant['hgnc_ids'] = list({gene['hgnc_id'] for gene in parsed_variant['genes']})

    ################# Add clinsig prediction #################
    # CLNACC is used when it is there, otherwise CLNVID
    clnsig_predictions = parse_clnsig(
        acc=info.get('CLNACC') or info.get('CLNVID'),
        sig=info.get('CLNSIG'),
        revstat=info.get('CLNREVSTAT'),
        transcripts=parsed_transcripts,
    )
    if clnsig_predictions:
        parsed_variant['clnsig'] = clnsig_predictions

    ################# Add the frequencies #################
    parsed_variant['frequencies'] = parse_frequencies(variant, parsed_transcripts)

    # parse out old local observation count
    for info_key, field in (('Obs', 'local_obs_old'), ('Hom', 'local_obs_hom_old')):
        observations = info.get(info_key)
        if observations:
            parsed_variant[field] = int(observations)

    ###################### Add severity predictions ######################
    cadd = parse_cadd(variant, parsed_transcripts)
    if cadd:
        parsed_variant['cadd_score'] = cadd

    spidex = info.get('SPIDEX')
    if spidex:
        parsed_variant['spidex'] = float(spidex)

    ###################### Add conservation ######################
    parsed_variant['conservation'] = parse_conservations(variant)

    parsed_variant['callers'] = parse_callers(variant, category=category)

    rank_result = info.get('RankResult')
    if rank_result:
        scores = [int(score) for score in rank_result.split('|')]
        parsed_variant['rank_result'] = dict(zip(rank_results_header, scores))

    ###################### Add SV specific annotations ######################
    sv_frequencies = parse_sv_frequencies(variant)
    for freq_key in sv_frequencies:
        parsed_variant['frequencies'][freq_key] = sv_frequencies[freq_key]

    ###################### Add Cancer specific annotations ######################
    # MSK_MVL indicates if variants are in the MSK managed variant list
    # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5437632/
    if info.get('MSK_MVL'):
        parsed_variant['mvl_tag'] = True

    return parsed_variant
def load_variants(adapter, variant_file, case_obj, variant_type='clinical',
                  category='snv', rank_threshold=6, chrom=None, start=None,
                  end=None):
    """Load the variants of a vcf file into the database

    A variant is loaded if it is inside the given region, or, when no region
    is given, if it is on chromosome MT or has a rank score of at least
    rank_threshold.

    Args:
        adapter(MongoAdapter)
        variant_file(str): Path to variant file
        case_obj(Case)
        variant_type(str)
        category(str): 'snv' or 'sv'. Accepted but not used in this
                       function, it is not handed to parse_variant
        rank_threshold(int)
        chrom(str)
        start(int)
        end(int)

    Raises:
        IntegrityError: If the institute that owns the case does not exist
    """
    owner = case_obj['owner']
    institute_obj = adapter.institute(institute_id=owner)
    if not institute_obj:
        raise IntegrityError("Institute {0} does not exist in database.".format(owner))

    gene_to_panels = adapter.gene_to_panels()
    hgncid_to_gene = adapter.hgncid_to_gene()

    vcf_obj = VCF(variant_file)

    rank_results_header = parse_rank_results_header(vcf_obj)
    vep_header = parse_vep_header(vcf_obj)

    # This is a dictionary to tell where ind are in vcf
    individual_positions = {ind: index for index, ind in enumerate(vcf_obj.samples)}

    LOG.info("Start inserting variants into database")
    start_insertion = datetime.now()
    start_five_thousand = datetime.now()
    # To get it right if the file is empty
    nr_variants = -1
    nr_inserted = 0
    inserted = 1

    # Only a given chromosome switches on loading by region
    coordinates = {'chrom': chrom, 'start': start, 'end': end} if chrom else False

    try:
        for nr_variants, variant in enumerate(vcf_obj):
            # Get the necessary coordinates
            # Parse away any chr CHR prefix
            variant_chrom = CHR_PATTERN.match(variant.CHROM).group(2)
            position = variant.POS

            if coordinates:
                # If coordinates are specified we want to upload all
                # variants that reside within the specified region
                add_variant = check_coordinates(variant_chrom, position, coordinates)
            elif variant_chrom == 'MT':
                # If there are no coordinates we always want to load MT
                # variants
                add_variant = True
            else:
                # Otherwise the rank score requirement has to be fulfilled
                rank_score = parse_rank_score(
                    variant.INFO.get('RankScore'), case_obj['display_name']
                )
                add_variant = rank_score >= rank_threshold

            # Log the number of variants parsed
            if nr_variants != 0 and nr_variants % 5000 == 0:
                LOG.info("%s variants parsed" % str(nr_variants))
                parse_time = datetime.now() - start_five_thousand
                LOG.info("Time to parse variants: {} ".format(parse_time))
                start_five_thousand = datetime.now()

            if not add_variant:
                continue

            ####### Here we know that the variant should be loaded #########
            # We follow the scout paradigm of parse -> build -> load

            # Parse the variant
            parsed_variant = parse_variant(
                variant=variant,
                case=case_obj,
                variant_type=variant_type,
                rank_results_header=rank_results_header,
                vep_header=vep_header,
                individual_positions=individual_positions,
            )

            # Build the variant object
            variant_obj = build_variant(
                variant=parsed_variant,
                institute_id=institute_obj['_id'],
                gene_to_panels=gene_to_panels,
                hgncid_to_gene=hgncid_to_gene,
            )

            # Load the variant object
            # We could get integrity error here since if we want to load all
            # variants of a region there will likely already be variants
            # from that region loaded
            try:
                load_variant(adapter, variant_obj)
                nr_inserted += 1
            except IntegrityError:
                pass

            # Log number of inserted variants
            if nr_inserted != 0 and (nr_inserted * inserted) % (1000 * inserted) == 0:
                LOG.info("%s variants inserted" % nr_inserted)
                inserted += 1

    except Exception as error:
        # Variants are only removed again when no region was given
        if not coordinates:
            LOG.warning("Deleting inserted variants")
            delete_variants(adapter, case_obj, variant_type)
        raise error

    LOG.info("All variants inserted.")
    LOG.info("Number of variants in file: {0}".format(nr_variants + 1))
    LOG.info("Number of variants inserted: {0}".format(nr_inserted))
    LOG.info("Time to insert variants:{0}".format(datetime.now() - start_insertion))
def parse_coordinates(variant, category):
    """Collect position, end, length and cytoband information for a variant

    Args:
        variant(cyvcf2.Variant)
        category(str): Variant category, handed over to get_sub_category,
                       get_end and get_length. With "str" an empty ALT is
                       replaced by '.'

    Returns:
        coordinates(dict): A dictionary on the form:
        {
            'position':<int>,
            'end':<int>,
            'end_chrom':<str>,
            'length':<int>,
            'sub_category':<str>,
            'mate_id':<str>,
            'cytoband_start':<str>,
            'cytoband_end':<str>,
        }
    """
    ref = variant.REF
    # alt is only assigned when there is an ALT allele or category is "str"
    if variant.ALT:
        alt = variant.ALT[0]
    elif category == "str":
        alt = '.'

    chrom = CHR_PATTERN.match(variant.CHROM).group(2)

    info = variant.INFO
    svtype = info.get('SVTYPE')
    if svtype:
        svtype = svtype.lower()
    mate_id = info.get('MATEID')
    svlen = info.get('SVLEN')
    svend = info.get('END')

    snvend = int(variant.end)
    position = int(variant.POS)
    ref_len = len(ref)
    alt_len = len(alt)

    sub_category = get_sub_category(alt_len, ref_len, category, svtype)
    end = get_end(position, alt, category, snvend, svend)
    length = get_length(alt_len, ref_len, category, position, end, svtype, svlen)

    # The end is on the same chromosome unless a breakend says otherwise
    end_chrom = chrom
    if sub_category == 'bnd' and ':' in alt:
        bnd_match = BND_ALT_PATTERN.match(alt)
        # BND will often be translocations between different chromosomes
        if bnd_match:
            end_chrom = CHR_PATTERN.match(bnd_match.group(1)).group(2)

    return {
        'position': position,
        'end': end,
        'length': length,
        'sub_category': sub_category,
        'mate_id': mate_id,
        'cytoband_start': get_cytoband_coordinates(chrom, position),
        'cytoband_end': get_cytoband_coordinates(end_chrom, end),
        'end_chrom': end_chrom,
    }