def get_gene(self, gene_name):
    """
    Resolve ``gene_name`` to a ``Gene``.

    Lookup order:

    1. ``self.gene_cache`` -- a hit returns a stub ``Gene`` carrying only
       the cached primary key.
    2. Genes having a synonym whose label matches ``gene_name``
       case-insensitively. The match is used only when it is unique.
    3. No match at all -- a new gene is saved on the chromosome returned
       by ``self.get_chromosome(self.chrom)`` with ``gene_name`` as symbol.

    Returns ``None`` for an empty name and when several genes share the
    synonym; nothing is created or cached in those cases.
    """
    if not gene_name:
        return

    cached_pk = self.gene_cache.get(gene_name)
    if cached_pk:
        return Gene(pk=cached_pk)

    candidates = list(
        Gene.objects.filter(synonyms__label__iexact=gene_name).distinct())

    # More than one candidate: associating the name with any of them would
    # only keep the synonym war going, so give up without creating anything.
    if len(candidates) > 1:
        return None

    if candidates:
        gene = candidates[0]
    else:
        gene = Gene(chr=self.get_chromosome(self.chrom), symbol=gene_name)
        gene.save()

    self.gene_cache[gene_name] = gene.pk
    return gene
def load_families(self, cursor):
    """
    Create missing gene families and link genes to their families.

    Executes a query over ``raw.hgnc_families`` left-joined to ``gene`` and
    consumes the result in batches of 100 rows. A ``GeneFamily`` is saved
    for every tag not seen yet (families already stored are loaded up
    front), and every row that carries a gene id adds the family to that
    gene's ``families`` relation.

    ``cursor`` must provide ``execute`` and ``fetchmany``.
    """
    cursor.execute(
        ' SELECT tag, description, gene.id'
        ' FROM raw.hgnc_families'
        ' LEFT OUTER JOIN gene ON (hgnc_families.hgnc_id = gene.hgnc_id)'
        ' ORDER BY tag '
    )

    # Families keyed by tag; extended as new families are saved below.
    known = dict((fam.tag, fam) for fam in GeneFamily.objects.all())

    batch = cursor.fetchmany(100)
    while batch:
        for tag, description, gene_id in batch:
            if tag not in known:
                created = GeneFamily(tag=tag, description=description)
                created.save()
                known[created.tag] = created

            if gene_id:
                family = known[tag]
                # Stub gene built from the primary key only; point it at
                # the same database as the family before touching the
                # relation.
                gene = Gene(pk=gene_id)
                gene._state.db = family._state.db
                gene.families.add(family)

        batch = cursor.fetchmany(100)
def setUp(self):
    """
    Build the fixture: one chromosome, 26 genes and one gene set.

    A gene is saved for every ASCII lowercase letter (the letter is the
    gene symbol) on chromosome ``1``. A gene set named ``test`` is saved
    and the genes for the letters of ``'someday'`` are passed to its
    ``bulk`` method (presumably adding them to the set -- confirm against
    ``GeneSet.bulk``). The genes dict and the gene set are stored on the
    test instance as ``self.genes`` and ``self.geneset``.
    """
    import string

    chr1 = Chromosome(value='1', label='1')
    chr1.save()

    genes = {}

    # ``string.lowercase`` is locale-dependent on Python 2 and does not
    # exist on Python 3; ``ascii_lowercase`` is always exactly 'a'..'z'.
    for char in string.ascii_lowercase:
        g = Gene(chr=chr1, symbol=char)
        g.save()
        genes[char] = g

    geneset = GeneSet(name='test')
    geneset.save()
    geneset.bulk([genes[c] for c in 'someday'])

    self.genes = genes
    self.geneset = geneset
def _get_or_create_gene(self, record):
    """
    Find, update or create the ``Gene`` described by an HGNC ``record``.

    ``record`` is read for the keys ``hgnc_id``, ``chromosome``,
    ``approved_symbol`` and ``approved_name``.

    Returns a ``(gene, saved)`` tuple:

    * ``(None, False)`` -- the chromosome value could not be parsed; a
      warning is logged.
    * ``(gene, False)`` -- a gene with this HGNC id already exists.
    * ``(gene, True)`` -- the gene was saved: either an existing gene
      found by symbol that now gets the HGNC id, or a brand new gene.
    """
    hgnc_id = int(record['hgnc_id'])
    target = {'hgnc_id': hgnc_id}

    # Parse and map the chromosome
    raw_chr = record['chromosome']
    if raw_chr == 'mitochondria':
        target['chr_id'] = self.chromosomes['MT']
    elif ' and ' in raw_chr:
        target['chr_id'] = self.chromosomes['XY']
    else:
        match = self.chrom_re.match(raw_chr)
        if not match:
            log.warning('unable to match gene chromosome from HGNC', extra={
                'hgnc_id': record['hgnc_id'],
                'raw_chr': raw_chr,
            })
            return None, False
        target['chr_id'] = self.chromosomes[match.group(1)]

    symbol = record['approved_symbol'].encode('utf8')
    target['symbol'] = symbol
    target['name'] = record['approved_name'].encode('utf8')

    # First choice: the gene is already known under this HGNC id.
    try:
        known = Gene.objects.get(hgnc_id=hgnc_id)
    except Gene.DoesNotExist:
        pass
    else:
        return known, False

    # Second choice: same symbol (e.g. a newly approved gene) -- attach the
    # HGNC id to it. Otherwise fall back to creating a new gene.
    try:
        gene = Gene.objects.get(symbol=symbol)
    except Gene.DoesNotExist:
        gene = Gene(**target)
    else:
        gene.hgnc_id = hgnc_id

    gene.save()
    return gene, True
def get_gene(self, gene_name):
    """Return the gene for ``gene_name`` via cache, unique synonym or creation.

    A cache hit yields a stub ``Gene`` holding only the cached primary key.
    Otherwise genes are searched by case-insensitive synonym label: a
    single match is returned, no match creates and saves a new gene, and
    several matches yield ``None``. An empty name also yields ``None``.
    """
    if not gene_name:
        return

    gene_pk = self.gene_cache.get(gene_name)
    if gene_pk:
        return Gene(pk=gene_pk)

    by_synonym = Gene.objects.filter(synonyms__label__iexact=gene_name)
    matches = list(by_synonym.distinct())

    gene = None
    if not matches:
        # Only when nothing matches is a new record created, otherwise the
        # synonym war will continue
        gene = Gene(chr=self.get_chromosome(self.chrom), symbol=gene_name)
        gene.save()
    elif len(matches) == 1:
        # The name may be associated only if it is the sole synonym match
        gene = matches[0]

    if gene is not None:
        self.gene_cache[gene_name] = gene.pk
    return gene
def _get_or_create_gene(self, record):
    """Return ``(gene, saved)`` for the HGNC ``record``.

    The chromosome value is mapped through ``self.chromosomes``:
    ``"mitochondria"`` uses the ``"MT"`` entry, any value containing
    ``" and "`` uses ``"XY"``, and everything else uses the first group of
    ``self.chrom_re``. An unparseable chromosome logs a warning and gives
    ``(None, False)``.

    An existing gene with the same HGNC id is returned with ``False``. If
    none exists, a gene with the same symbol gets the HGNC id assigned, or
    a new gene is built; either one is saved and returned with ``True``.
    """
    hgnc_id = int(record["hgnc_id"])
    raw_chr = record["chromosome"]

    # Work out which chromosome entry this record maps to
    if raw_chr == "mitochondria":
        chr_key = "MT"
    elif " and " in raw_chr:
        chr_key = "XY"
    else:
        found = self.chrom_re.match(raw_chr)
        if not found:
            log.warning(
                "unable to match gene chromosome from HGNC",
                extra={"hgnc_id": record["hgnc_id"], "raw_chr": raw_chr},
            )
            return None, False
        chr_key = found.groups()[0]

    target = {
        "hgnc_id": hgnc_id,
        "chr_id": self.chromosomes[chr_key],
        "symbol": record["approved_symbol"].encode("utf8"),
        "name": record["approved_name"].encode("utf8"),
    }

    # Already known by HGNC id: nothing to save
    try:
        return Gene.objects.get(hgnc_id=hgnc_id), False
    except Gene.DoesNotExist:
        pass

    # Known by symbol only (newly approved gene): set the HGNC id.
    # Unknown altogether: create it.
    try:
        gene = Gene.objects.get(symbol=target["symbol"])
    except Gene.DoesNotExist:
        gene = Gene(**target)
    else:
        gene.hgnc_id = hgnc_id

    gene.save()
    return gene, True
def get_transcript(self, gene_pk, refseq_id):
    """Return the primary key of the transcript with ``refseq_id``.

    The key comes from ``self.transcripts`` when cached. Otherwise a new
    ``Transcript`` linked to the gene ``gene_pk`` is saved; if saving
    raises ``IntegrityError`` the existing row for the same RefSeq id and
    gene is fetched instead. The resulting key is cached before returning.

    Returns ``None`` when ``refseq_id`` is empty.
    """
    if not refseq_id:
        return

    cached_pk = self.transcripts.get(refseq_id)
    if cached_pk:
        return cached_pk

    # The same field values serve both the insert and the fallback lookup.
    fields = {'refseq_id': refseq_id, 'gene': Gene(pk=gene_pk)}

    transcript = Transcript(**fields)
    try:
        transcript.save()
    except IntegrityError:
        transcript = Transcript.objects.get(**fields)

    transcript_pk = transcript.pk
    self.transcripts[refseq_id] = transcript_pk
    return transcript_pk
def load_hgmd_snp(self, cursor, using=None):
    """
    Link HGMD SNP records to phenotypes, genes, variants and articles.

    Runs a query joining the raw HGMD mutation and coordinate tables with
    the chromosome, variant, phenotype, pubmed and gene tables, then walks
    the result in batches of 100 rows. For every row:

    * a ``PubMed`` article is resolved: the existing one when the query
      matched it, otherwise one saved earlier in this run or a newly saved
      one. Rows whose PMID is missing or not an integer get no article.
    * a ``Phenotype`` is resolved the same way, keyed by term.
    * the ``GenePhenotype`` and ``VariantPhenotype`` links are fetched or
      built, stamped with the HGMD accession number and saved.
    * the article, if any, is added to the phenotype, variant and gene.

    A running row count is written to stdout after each batch.

    Args:
        cursor: object providing ``execute`` and ``fetchmany``.
        using: database alias assigned to ``_state.db`` of the stub model
            instances that are built from primary keys only.

    Returns:
        The number of rows processed.
    """
    import numbers

    cursor.execute(
        ' select distinct cl.acc_num as hgmd_id,'
        ' c.value as chr, c.id as chr_id,'
        ' v.id as variant_id,'
        ' trim(both from cl.disease) as phenotype,'
        ' ph.id as phenotype_id,'
        ' cl.gene as gene, g.id as gene_id,'
        ' cl.pmid as pubmed, pm.pmid as pubmed_id'
        ' from ('
        ' select m.acc_num, m.disease, m.gene, m.pmid, c.chromosome,'
        ' c."coordSTART" as pos'
        ' from raw.hgmd_mutation m'
        ' inner join raw.hgmd_hg19_coords c on (m.acc_num = c.acc_num)'
        ' ) cl'
        ' left outer join chromosome c on (cl.chromosome = c.value)'
        ' left outer join variant v'
        ' on (c.id = v.chr_id and cl.pos = v.pos)'
        ' left outer join variant_type vt on (v.type_id = vt.id)'
        ' left outer join phenotype ph'
        # Raw literal: the backslashes belong to the SQL regex.
        r" on (lower(trim(both from regexp_replace(cl.disease, '\s*\?$', '')))"
        ' = lower(ph.term))'
        ' left outer join pubmed pm'
        ' on (cl.pmid::varchar = pm.pmid::varchar)'
        ' left outer join gene g on (cl.gene::text = g.symbol::text)'
        ' left outer join variant_phenotype vp on (vp.variant_id = v.id)'
        " where vt.value = 'SNP'"
        " and cl.disease not like '%%?'"
        ' and v.id is not null and g.id is not null'
        ' order by c.id '
    )

    # Column names, in the order the query selects them.
    keys = ['hgmd_id', 'chr', 'chr_id', 'variant_id', 'phenotype',
            'phenotype_id', 'gene', 'gene_id', 'pubmed', 'pubmed_id']

    count = 0
    # Objects saved during this run, so that later rows reuse them instead
    # of inserting duplicates.
    new_pubmed_map = {}
    new_phenotype_map = {}

    while True:
        rows = cursor.fetchmany(100)
        if not rows:
            break

        for row in rows:
            record = dict(zip(keys, row))

            # Get or create a pubmed record
            if record['pubmed_id']:
                pubmed = PubMed(pmid=record['pubmed_id'])
                pubmed._state.db = using
            else:
                # Some records have a bogus PMID. Only process the valid
                # ones: integers, or strings made of digits. A missing
                # (NULL) PMID is treated as bogus rather than crashing.
                raw_pmid = record['pubmed']
                is_valid = isinstance(raw_pmid, numbers.Integral) or (
                    hasattr(raw_pmid, 'isdigit') and raw_pmid.isdigit())

                if is_valid:
                    pmid = int(raw_pmid)
                    if pmid in new_pubmed_map:
                        pubmed = new_pubmed_map[pmid]
                    else:
                        pubmed = PubMed(pmid=pmid)
                        pubmed.save()
                        new_pubmed_map[pmid] = pubmed
                else:
                    pubmed = None

            # Get or create the phenotype the HGMD id is associated with
            if record['phenotype_id']:
                phenotype = Phenotype(pk=record['phenotype_id'])
                phenotype._state.db = using
            else:
                term = record['phenotype']
                # Check newly added objects
                if term in new_phenotype_map:
                    phenotype = new_phenotype_map[term]
                else:
                    phenotype = Phenotype(term=term)
                    phenotype.save()
                    new_phenotype_map[term] = phenotype

            if record['gene_id']:
                gene = Gene(pk=record['gene_id'])
                gene._state.db = using

                try:
                    gp = GenePhenotype.objects.get(gene=gene,
                                                   phenotype=phenotype)
                except GenePhenotype.DoesNotExist:
                    gp = GenePhenotype(gene=gene, phenotype=phenotype)
                gp.hgmd_id = record['hgmd_id']
                gp.save()
            else:
                gene = None

            if record['variant_id']:
                variant = Variant(pk=record['variant_id'])
                variant._state.db = using

                try:
                    vp = VariantPhenotype.objects.get(variant=variant,
                                                      phenotype=phenotype)
                except VariantPhenotype.DoesNotExist:
                    vp = VariantPhenotype(variant=variant,
                                          phenotype=phenotype)
                vp.hgmd_id = record['hgmd_id']
                vp.save()
            else:
                variant = None

            if pubmed:
                phenotype.articles.add(pubmed)
                if variant:
                    variant.articles.add(pubmed)
                if gene:
                    gene.articles.add(pubmed)

            count += 1

        # Progress indicator, rewritten in place on the same line
        sys.stdout.write('{0}\r'.format(count))
        sys.stdout.flush()

    return count