示例#1
0
文件: load.py 项目: leipzig/varify
    def get_gene(self, gene_name):
        """
        Resolve ``gene_name`` to a ``Gene``.

        Lookup order:

        1. ``self.gene_cache`` (name -> primary key); a hit returns a
           ``Gene`` constructed directly from the cached primary key.
        2. Genes having a synonym whose label matches ``gene_name``
           case-insensitively. Exactly one match is cached and returned.
        3. No match at all: a new ``Gene`` is saved with the chromosome
           given by ``self.get_chromosome(self.chrom)``, then cached and
           returned.

        Returns ``None`` when ``gene_name`` is empty or when more than one
        gene matches the name.
        """
        if not gene_name:
            return

        cached_pk = self.gene_cache.get(gene_name, None)
        if cached_pk:
            return Gene(pk=cached_pk)

        matches = list(
            Gene.objects.filter(synonyms__label__iexact=gene_name).distinct()
        )

        # Several genes share this synonym: refuse to pick one, and do not
        # create yet another record either.
        if len(matches) > 1:
            return

        if matches:
            gene = matches[0]
        else:
            gene = Gene(chr=self.get_chromosome(self.chrom), symbol=gene_name)
            gene.save()

        self.gene_cache[gene_name] = gene.pk
        return gene
示例#2
0
    def load_families(self, cursor):
        """
        Create missing ``GeneFamily`` rows and attach genes to families.

        The query below is executed on ``cursor`` and its result is read in
        batches of 100 rows of ``(tag, description, gene id)``. A family is
        saved the first time its tag is seen and is not already stored;
        whenever the row carries a gene id, that gene is added to the
        family.
        """
        cursor.execute('''
            SELECT tag, description, gene.id
            FROM raw.hgnc_families
                LEFT OUTER JOIN gene ON (hgnc_families.hgnc_id = gene.hgnc_id)
            ORDER BY tag
        ''')

        # Families already stored, indexed by tag; grows as new ones are saved.
        known = dict((fam.tag, fam) for fam in GeneFamily.objects.all())

        batch = cursor.fetchmany(100)
        while batch:
            for tag, description, gene_id in batch:
                if tag not in known:
                    created = GeneFamily(tag=tag, description=description)
                    created.save()
                    known[created.tag] = created

                if not gene_id:
                    continue

                gene = Gene(pk=gene_id)
                family = known[tag]
                # The gene is built from its primary key only, so it is
                # given the same database alias as the family before the
                # relation is written.
                gene._state.db = family._state.db
                gene.families.add(family)

            batch = cursor.fetchmany(100)
示例#3
0
    def setUp(self):
        """
        Build the fixture shared by the tests.

        Saves chromosome ``1``, one ``Gene`` per lowercase ASCII letter
        (the letter is the gene symbol) and a ``GeneSet`` named ``test``.
        The genes for the letters of ``'someday'`` are passed to
        ``geneset.bulk()``. The gene mapping (letter -> ``Gene``) and the
        gene set are kept on ``self.genes`` and ``self.geneset``.
        """
        import string

        chr1 = Chromosome(value='1', label='1')
        chr1.save()

        genes = {}
        # ``string.lowercase`` is locale-dependent on Python 2 and does not
        # exist on Python 3; ``ascii_lowercase`` is always exactly a-z.
        for char in string.ascii_lowercase:
            g = Gene(chr=chr1, symbol=char)
            g.save()
            genes[char] = g

        geneset = GeneSet(name='test')
        geneset.save()
        geneset.bulk([genes[c] for c in 'someday'])

        self.genes = genes
        self.geneset = geneset
示例#4
0
    def setUp(self):
        """
        Prepare one chromosome, 26 genes and a gene set for each test.

        A ``Gene`` is saved for every letter ``a``-``z`` on chromosome
        ``1``. A ``GeneSet`` named ``test`` is saved and the genes for the
        letters in ``'someday'`` are handed to its ``bulk()`` method. Both
        the letter -> gene mapping and the gene set are stored on ``self``.
        """
        import string

        chr1 = Chromosome(value='1', label='1')
        chr1.save()

        # Use ``ascii_lowercase`` rather than ``string.lowercase``: the
        # latter varies with the locale on Python 2 and was removed in
        # Python 3, while the former is a fixed a-z on both.
        genes = {}
        for symbol in string.ascii_lowercase:
            gene = Gene(chr=chr1, symbol=symbol)
            gene.save()
            genes[symbol] = gene

        geneset = GeneSet(name='test')
        geneset.save()
        geneset.bulk([genes[c] for c in 'someday'])

        self.genes = genes
        self.geneset = geneset
示例#5
0
    def _get_or_create_gene(self, record):
        """
        Find or build the ``Gene`` described by an HGNC ``record``.

        Returns a ``(gene, saved)`` pair:

        * ``(None, False)`` when the chromosome text is not recognised
          (a warning is logged);
        * ``(gene, False)`` when a gene with the record's HGNC id exists;
        * ``(gene, True)`` when the gene was matched by symbol and given
          the HGNC id, or was newly built; in both cases it is saved here.
        """
        hgnc_id = int(record['hgnc_id'])

        # Map the free-text chromosome onto an entry of self.chromosomes.
        raw_chr = record['chromosome']
        if raw_chr == 'mitochondria':
            chr_id = self.chromosomes['MT']
        elif ' and ' in raw_chr:
            chr_id = self.chromosomes['XY']
        else:
            found = self.chrom_re.match(raw_chr)
            if not found:
                details = {
                    'hgnc_id': record['hgnc_id'],
                    'raw_chr': raw_chr,
                }
                log.warning('unable to match gene chromosome from HGNC',
                            extra=details)
                return None, False
            chr_id = self.chromosomes[found.groups()[0]]

        symbol = record['approved_symbol'].encode('utf8')
        name = record['approved_name'].encode('utf8')

        # First choice: the gene is already known under this HGNC id.
        try:
            return Gene.objects.get(hgnc_id=hgnc_id), False
        except Gene.DoesNotExist:
            pass

        # Second choice: a gene with the approved symbol exists but has not
        # been linked to the HGNC id yet. Otherwise build a new one.
        try:
            gene = Gene.objects.get(symbol=symbol)
        except Gene.DoesNotExist:
            gene = Gene(hgnc_id=hgnc_id, chr_id=chr_id, symbol=symbol,
                        name=name)
        else:
            gene.hgnc_id = hgnc_id

        gene.save()
        return gene, True
示例#6
0
    def get_gene(self, gene_name):
        """Return the ``Gene`` for ``gene_name`` via cache, synonym or creation.

        An empty name yields ``None``. A cached primary key yields a
        ``Gene`` constructed from that key. Otherwise genes are searched by
        case-insensitive synonym label: a unique match is used, no match
        leads to a new saved ``Gene``, and several matches yield ``None``.
        Any gene found or created here is recorded in ``self.gene_cache``.
        """
        if not gene_name:
            return

        known_pk = self.gene_cache.get(gene_name, None)
        if known_pk:
            return Gene(pk=known_pk)

        synonym_query = Gene.objects.filter(synonyms__label__iexact=gene_name)
        candidates = list(synonym_query.distinct())
        total = len(candidates)

        if total == 1:
            # The synonym points at exactly one gene, so it is safe to use.
            gene = candidates[0]
        elif total == 0:
            # Nothing claims this name yet: register it as a new gene.
            gene = Gene(chr=self.get_chromosome(self.chrom), symbol=gene_name)
            gene.save()
        else:
            # Ambiguous synonym: neither guess nor create a duplicate.
            return

        self.gene_cache[gene_name] = gene.pk
        return gene
示例#7
0
文件: load.py 项目: hassanNS/varify
    def _get_or_create_gene(self, record):
        """
        Look up the gene for an HGNC ``record``, creating it if needed.

        The record's chromosome text is mapped to a key of
        ``self.chromosomes`` (``MT`` for ``mitochondria``, ``XY`` when the
        text contains ``" and "``, otherwise the first group matched by
        ``self.chrom_re``). If no key can be derived a warning is logged
        and ``(None, False)`` is returned.

        Returns ``(gene, False)`` for a gene already carrying the HGNC id,
        and ``(gene, True)`` after saving a gene that was either found by
        symbol (its HGNC id is set) or newly built.
        """
        target = {"hgnc_id": int(record["hgnc_id"])}

        chrom_text = record["chromosome"]
        if chrom_text == "mitochondria":
            chrom_key = "MT"
        elif " and " in chrom_text:
            chrom_key = "XY"
        else:
            match = self.chrom_re.match(chrom_text)
            if not match:
                log.warning(
                    "unable to match gene chromosome from HGNC",
                    extra={"hgnc_id": record["hgnc_id"], "raw_chr": chrom_text},
                )
                return None, False
            chrom_key = match.groups()[0]
        target["chr_id"] = self.chromosomes[chrom_key]

        target["symbol"] = record["approved_symbol"].encode("utf8")
        target["name"] = record["approved_name"].encode("utf8")

        # Prefer an existing gene with this HGNC id.
        try:
            existing = Gene.objects.get(hgnc_id=target["hgnc_id"])
        except Gene.DoesNotExist:
            existing = None
        if existing is not None:
            return existing, False

        # Next, a gene known only by its symbol gets the HGNC id attached;
        # failing that, a new gene is built from the collected fields.
        try:
            gene = Gene.objects.get(symbol=target["symbol"])
        except Gene.DoesNotExist:
            gene = Gene(**target)
        else:
            gene.hgnc_id = target["hgnc_id"]
        gene.save()
        return gene, True
示例#8
0
    def _get_or_create_gene(self, record):
        """
        Return ``(gene, saved)`` for the gene described by ``record``.

        ``saved`` is ``False`` when a gene with the record's HGNC id was
        simply fetched, and also in the ``(None, False)`` result produced
        when the chromosome text cannot be interpreted. It is ``True``
        when this method saved the gene: either an existing gene found by
        symbol that received the HGNC id, or a brand new gene.
        """
        fields = {}
        fields['hgnc_id'] = int(record['hgnc_id'])

        chromosome = record['chromosome']
        if chromosome == 'mitochondria':
            fields['chr_id'] = self.chromosomes['MT']
        elif ' and ' in chromosome:
            fields['chr_id'] = self.chromosomes['XY']
        else:
            parsed = self.chrom_re.match(chromosome)
            if not parsed:
                context = {
                    'hgnc_id': record['hgnc_id'],
                    'raw_chr': chromosome,
                }
                log.warning('unable to match gene chromosome from HGNC', extra=context)
                return None, False
            fields['chr_id'] = self.chromosomes[parsed.groups()[0]]

        fields['symbol'] = record['approved_symbol'].encode('utf8')
        fields['name'] = record['approved_name'].encode('utf8')

        # Resolution order: HGNC id, then approved symbol, then creation.
        try:
            by_id = Gene.objects.get(hgnc_id=fields['hgnc_id'])
        except Gene.DoesNotExist:
            try:
                gene = Gene.objects.get(symbol=fields['symbol'])
            except Gene.DoesNotExist:
                gene = Gene(**fields)
            else:
                # Known symbol without an HGNC link yet: attach the id.
                gene.hgnc_id = fields['hgnc_id']
            gene.save()
            return gene, True
        return by_id, False
示例#9
0
 def get_transcript(self, gene_pk, refseq_id):
     """Return the primary key of the transcript for ``refseq_id``.

     ``self.transcripts`` maps RefSeq ids to transcript primary keys and is
     consulted first. On a miss, a ``Transcript`` linked to the gene with
     primary key ``gene_pk`` is saved; if saving raises ``IntegrityError``
     the existing transcript with the same RefSeq id and gene is fetched
     instead. The resulting key is cached before it is returned.

     Returns ``None`` when ``refseq_id`` is empty.
     """
     if not refseq_id:
         return

     cached_pk = self.transcripts.get(refseq_id)
     if not cached_pk:
         owner = Gene(pk=gene_pk)
         record = Transcript(refseq_id=refseq_id, gene=owner)
         try:
             record.save()
         except IntegrityError:
             # The row already exists; use the stored one.
             record = Transcript.objects.get(refseq_id=refseq_id, gene=owner)
         self.transcripts[refseq_id] = record.pk
         return record.pk

     return cached_pk
示例#10
0
    def load_hgmd_snp(self, cursor, using=None):
        """
        Link HGMD SNP records to variants, genes, phenotypes and articles.

        Runs the query below on ``cursor`` and walks the result in batches
        of 100 rows. For every row:

        * a ``PubMed`` article is resolved: built from the joined
          ``pubmed_id`` when present, otherwise saved (once per PMID) when
          the raw PMID is an integer or an all-digit string. Any other raw
          value, including NULL, yields no article;
        * a ``Phenotype`` is resolved: built from ``phenotype_id`` when
          present, otherwise saved (once per term) from the phenotype text;
        * ``GenePhenotype`` and ``VariantPhenotype`` links are fetched or
          built, stamped with the HGMD id and saved;
        * the article, if any, is added to the phenotype, the variant and
          the gene.

        Parameters:
            cursor: object providing ``execute(sql)`` and ``fetchmany(n)``.
            using: value assigned to ``_state.db`` of the instances that
                are built from an existing primary key.

        Returns:
            The number of rows processed. The running count is also written
            to ``sys.stdout`` after each batch.
        """
        # Raw string: the regular expression below contains ``\s`` and
        # ``\?``, which are not valid Python escape sequences. The text
        # sent to the database is unchanged.
        cursor.execute(r'''
            select distinct
                cl.acc_num as hgmd_id,
                c.value as chr,
                c.id as chr_id,
                v.id as variant_id,
                trim(both from cl.disease) as phenotype,
                ph.id as phenotype_id,
                cl.gene as gene,
                g.id as gene_id,
                cl.pmid as pubmed,
                pm.pmid as pubmed_id
            from (
                select m.acc_num, m.disease, m.gene, m.pmid, c.chromosome, c."coordSTART" as pos
                    from raw.hgmd_mutation m inner join raw.hgmd_hg19_coords c on (m.acc_num = c.acc_num)
            ) cl
                left outer join chromosome c on (cl.chromosome = c.value)
                left outer join variant v on (c.id = v.chr_id and cl.pos = v.pos)
                left outer join variant_type vt on (v.type_id = vt.id)
                left outer join phenotype ph on (lower(trim(both from regexp_replace(cl.disease, '\s*\?$', ''))) = lower(ph.term))
                left outer join pubmed pm on (cl.pmid::varchar = pm.pmid::varchar)
                left outer join gene g on (cl.gene::text = g.symbol::text)
                left outer join variant_phenotype vp on (vp.variant_id = v.id)
            where vt.value = 'SNP'
                and cl.disease not like '%%?'
                and v.id is not null and g.id is not null
            order by c.id
        ''')

        keys = [
            'hgmd_id', 'chr', 'chr_id', 'variant_id',
            'phenotype', 'phenotype_id', 'gene', 'gene_id',
            'pubmed', 'pubmed_id',
        ]

        count = 0
        # Objects saved during this run, so each PMID / term is only
        # created once even when it appears on many rows.
        new_pubmed_map = {}
        new_phenotype_map = {}

        while True:
            rows = cursor.fetchmany(100)
            if not rows:
                break

            for row in rows:
                record = dict(zip(keys, row))

                # Get or create a pubmed record
                raw_pmid = record['pubmed']
                if record['pubmed_id']:
                    pubmed = PubMed(pmid=record['pubmed_id'])
                    pubmed._state.db = using
                # Some records have a bogus PMID. Only process the valid
                # ones. Values without ``isdigit`` (e.g. NULL) are bogus
                # too, instead of raising AttributeError.
                elif isinstance(raw_pmid, int) or (
                        hasattr(raw_pmid, 'isdigit') and raw_pmid.isdigit()):
                    pmid = int(raw_pmid)
                    if pmid in new_pubmed_map:
                        pubmed = new_pubmed_map[pmid]
                    else:
                        pubmed = PubMed(pmid=pmid)
                        pubmed.save()
                        new_pubmed_map[pmid] = pubmed
                else:
                    pubmed = None

                # Get or create the phenotype the HGMD id is associated with
                if record['phenotype_id']:
                    phenotype = Phenotype(pk=record['phenotype_id'])
                    phenotype._state.db = using
                else:
                    term = record['phenotype']
                    # Check newly added objects
                    if term in new_phenotype_map:
                        phenotype = new_phenotype_map[term]
                    else:
                        phenotype = Phenotype(term=term)
                        phenotype.save()
                        new_phenotype_map[term] = phenotype

                if record['gene_id']:
                    gene = Gene(pk=record['gene_id'])
                    gene._state.db = using

                    try:
                        gp = GenePhenotype.objects.get(gene=gene, phenotype=phenotype)
                    except GenePhenotype.DoesNotExist:
                        gp = GenePhenotype(gene=gene, phenotype=phenotype)
                    gp.hgmd_id = record['hgmd_id']
                    gp.save()
                else:
                    gene = None

                if record['variant_id']:
                    variant = Variant(pk=record['variant_id'])
                    variant._state.db = using

                    try:
                        vp = VariantPhenotype.objects.get(variant=variant, phenotype=phenotype)
                    except VariantPhenotype.DoesNotExist:
                        vp = VariantPhenotype(variant=variant, phenotype=phenotype)
                    vp.hgmd_id = record['hgmd_id']
                    vp.save()
                else:
                    variant = None

                if pubmed:
                    phenotype.articles.add(pubmed)
                    if variant:
                        variant.articles.add(pubmed)
                    if gene:
                        gene.articles.add(pubmed)

                count += 1

            # Carriage return keeps the progress counter on a single line.
            sys.stdout.write('{0}\r'.format(count))
            sys.stdout.flush()

        return count