def get_data_from_gencode_gtf(gtf_file):
    """Parse a GENCODE GTF file.

    Args:
        gtf_file: iterable of text lines (e.g. an open file handle).

    Yields:
        (datatype, info) tuples. datatype is one of 'gene', 'transcript',
        'exon' or 'cds'. info is a dict holding the record's attribute
        column (quotes removed, version suffixes dropped from gene_id /
        transcript_id / exon_id) plus 'chrom', 'start', 'stop', 'xstart'
        and 'xstop'.
    """
    feature_types = ('gene', 'transcript', 'exon', 'CDS')

    for line in gtf_file:
        # Skip comment/header lines and blank lines.
        if line.startswith('#') or not line.strip():
            continue

        fields = line.rstrip('\r\n').split('\t')
        if fields[2] not in feature_types:
            continue

        # Drop the first three characters of the contig name
        # (assumes 'chr'-prefixed names as used by GENCODE).
        chrom = fields[0][3:]
        if len(chrom) > 3:
            continue  # skip the pseudo contigs

        # GTF files are 1-indexed: http://www.ensembl.org/info/website/upload/gff.html
        start = int(fields[3])
        stop = int(fields[4])

        # Attribute column looks like: key "value"; key "value"; ...
        # Split each attribute only once so quoted values may contain spaces,
        # and ignore empty / whitespace-only segments (e.g. after the final ';').
        info = {}
        for attribute in fields[8].split(';'):
            parts = attribute.strip().split(None, 1)
            if not parts:
                continue
            value = parts[1] if len(parts) > 1 else ''
            info[parts[0]] = value.strip('"')

        if 'gene_id' in info:
            info['gene_id'] = info['gene_id'].split('.')[0]
            # TODO: ignore all entities that are part of an ENSGR gene
            if info['gene_id'].startswith('ENSGR'):
                continue
        if 'transcript_id' in info:
            info['transcript_id'] = info['transcript_id'].split('.')[0]
        if 'exon_id' in info:
            info['exon_id'] = info['exon_id'].split('.')[0]

        info['chrom'] = chrom
        info['start'] = start
        info['stop'] = stop
        info['xstart'] = get_xpos(chrom, start)
        info['xstop'] = get_xpos(chrom, stop)

        # pretend 'CDS' isn't capitalized in gencode gtf file
        yield fields[2].lower(), info
def parse_clinvar_vcf(clinvar_vcf_path=None):
    """Load clinvar vcf file

    Rows have the following format:

    #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO
    1\t949422\t475283\tG\tA\t.\t.\tALLELEID=446939;CLNDISDB=MedGen:C4015293,OMIM:616126,Orphanet:ORPHA319563;CLNDN=Immunodeficiency_38_with_basal_ganglia_calcification;CLNHGVS=NC_000001.10:g.949422G>A;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Benign;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=ISG15:9636;MC=SO:0001583|missense_variant;ORIGIN=1;RS=143888043

    Args:
        clinvar_vcf_path (string): optional alternate path. Defaults to
            settings.REFERENCE_SETTINGS.clinvar_vcf_file. Paths ending in
            ".gz" are read through gzip.

    Yields:
        dict with keys 'xpos', 'ref', 'alt', 'variant_id' and 'clinsig'
        (lower-cased CLNSIG) for every record that passes valid_chrom() and
        has an informative clinical significance.
    """
    if clinvar_vcf_path is None:
        clinvar_vcf_path = settings.REFERENCE_SETTINGS.clinvar_vcf_file

    # Significance values that carry no usable classification. The VCF encodes
    # spaces as underscores, so "not_provided" is what actually appears there;
    # the space-separated spelling is kept for backward compatibility.
    uninformative = ("", "not provided", "not_provided", "other", "association")

    # Open in text mode explicitly: gzip.open defaults to binary, which would
    # yield bytes and break the str comparisons below.
    open_file = gzip.open if clinvar_vcf_path.endswith(".gz") else open

    header = None
    # The context manager closes the file when the generator is exhausted
    # or closed by the caller.
    with open_file(clinvar_vcf_path, "rt") as clinvar_file:
        for line in tqdm.tqdm(clinvar_file, unit=" clinvar records"):
            line = line.strip()
            if not line or line.startswith("##"):
                continue

            # First non-meta line is the "#CHROM ..." column header.
            if header is None:
                header = get_vcf_headers(line)
                continue

            fields = dict(zip(header, line.split("\t")))

            # INFO is a ';'-separated list of KEY=VALUE pairs or bare flags.
            # partition() tolerates flags (no '=') and values containing '='.
            info_fields = {}
            for entry in fields['INFO'].split(';'):
                if not entry:
                    continue
                key, _, value = entry.partition('=')
                info_fields[key] = value
            fields.update(info_fields)

            chrom = fields["CHROM"]
            pos = int(fields["POS"])
            ref = fields["REF"]
            alt = fields["ALT"]
            variant_id = fields["ID"]

            if not valid_chrom(chrom):
                continue

            clinical_significance = fields.get("CLNSIG", "").lower()
            if clinical_significance in uninformative:
                continue

            yield {
                'xpos': get_xpos(chrom, pos),
                'ref': ref,
                'alt': alt,
                'variant_id': variant_id,
                'clinsig': clinical_significance,
            }
header = fields else: line_dict = dict(zip(header, fields)) chrom = line_dict["chrom"] pos = int(line_dict["pos"]) ref = line_dict["ref"] alt = line_dict["alt"] if "M" in chrom: continue # because get_xpos doesn't support chrMT. clinical_significance = line_dict["clinical_significance"].lower() if clinical_significance in ["not provided", "other", "association"]: continue else: for c in clinical_significance.split(";"): pathogenicity_values_counter[c] += 1 xpos = get_xpos(chrom, pos) CLINVAR_VARIANTS[(xpos, ref, alt)] = (line_dict["measureset_id"], clinical_significance) #for k in sorted(pathogenicity_values_counter.keys(), key=lambda k: -pathogenicity_values_counter[k]): # print(" %5d %s" % (pathogenicity_values_counter[k], k)) # set the secret key if os.access("/etc/xbrowse_django_secret_key", os.R_OK): with open("/etc/xbrowse_django_secret_key") as f: SECRET_KEY = f.read().strip() SESSION_COOKIE_SECURE = True CSRF_COOKIE_SECURE = True else: print("Warning: could not access /etc/xbrowse_django_secret_key. Falling back on insecure hard-coded SECRET_KEY") SECRET_KEY = "~~~ this key string is FOR DEVELOPMENT USE ONLY ~~~"
header = fields else: line_dict = dict(zip(header, fields)) chrom = line_dict["chrom"] pos = int(line_dict["pos"]) ref = line_dict["ref"] alt = line_dict["alt"] if "M" in chrom: continue # because get_xpos doesn't support chrMT. clinical_significance = line_dict["clinical_significance"].lower() if clinical_significance in ["not provided", "other", "association"]: continue else: for c in clinical_significance.split(";"): pathogenicity_values_counter[c] += 1 xpos = get_xpos(chrom, pos) CLINVAR_VARIANTS[(xpos, ref, alt)] = (line_dict["measureset_id"], clinical_significance) #for k in sorted(pathogenicity_values_counter.keys(), key=lambda k: -pathogenicity_values_counter[k]): # print(" %5d %s" % (pathogenicity_values_counter[k], k)) # print("%d variants loaded" % len(CLINVAR_VARIANTS)) if len(sys.argv) >= 2 and sys.argv[1] == 'test': # use in-memory sqlite database for running tests DATABASES['default'] = { 'ENGINE': 'django.db.backends.sqlite3', 'NAME': 'seqr_test_db.sqlite', 'USER': '', 'PASSWORD': '', 'HOST': '', 'PORT': '',
def get_clinvar_variants():
    """Return the ClinVar lookup table, loading it from settings.CLINVAR_TSV on first use.

    The table is cached in the module-level _CLINVAR_VARIANTS and maps
    (xpos, ref, alt) -> (variation_id or measureset_id, clinical_significance),
    where clinical_significance is lower-cased. Rows on a chromosome whose
    name contains "M", and rows whose significance is "not provided",
    "other" or "association", are left out. Rows that fail to parse are
    reported on stderr and skipped.

    Raises:
        ValueError: if settings.CLINVAR_TSV is unset, is not a file, or its
            header line lacks a 'clinical_significance' column.
    """
    global _CLINVAR_VARIANTS
    if _CLINVAR_VARIANTS is not None:
        return _CLINVAR_VARIANTS

    if not settings.CLINVAR_TSV:
        raise ValueError("settings.CLINVAR_TSV not set")

    if not os.path.isfile(settings.CLINVAR_TSV):
        raise ValueError("settings.CLINVAR_TSV file not found: %s" % (settings.CLINVAR_TSV, ))

    uninformative = ("not provided", "other", "association")

    # Build into a local dict and publish it only after a successful load, so a
    # failure part-way through (e.g. a bad header) does not leave an empty
    # cache that later calls would silently return.
    variants = {}
    header = None

    print("Reading Clinvar data into memory: " + settings.CLINVAR_TSV)
    with open(settings.CLINVAR_TSV) as clinvar_file:
        for line in clinvar_file:
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            fields = line.split("\t")
            if header is None:
                # First data line must be the column header.
                header = fields
                if "clinical_significance" not in line.lower():
                    raise ValueError(
                        "'clinical_significance' not found in header line: %s" % str(header))
                continue

            # Best-effort per row: a malformed row is logged and skipped
            # rather than aborting the whole load.
            try:
                if "clinical_significance" in line.lower():
                    raise ValueError(
                        "'clinical_significance' found in non-header line: %s" % str(header))

                line_dict = dict(zip(header, fields))
                chrom = line_dict["chrom"]
                pos = int(line_dict["pos"])
                ref = line_dict["ref"]
                alt = line_dict["alt"]
                if "M" in chrom:
                    continue  # because get_xpos doesn't support chrMT.

                clinical_significance = line_dict["clinical_significance"].lower()
                if clinical_significance in uninformative:
                    continue

                xpos = get_xpos(chrom, pos)
                variants[(xpos, ref, alt)] = (
                    line_dict.get("variation_id") or line_dict.get("measureset_id"),
                    clinical_significance,
                )
            except Exception as e:
                sys.stderr.write(
                    "Error while parsing clinvar row: \n%s\n %s\n" % (
                        line,
                        e,
                    ))

    _CLINVAR_VARIANTS = variants
    return _CLINVAR_VARIANTS