Exemplo n.º 1
0
        def _fn(x, eid):
            """Assemble the transcript/protein record for one gene.

            ``x`` is either one ``(transcript_id, protein_id)`` pair or a list
            of such pairs; ``eid`` is stored unchanged under ``'gene'``.  An id
            is ignored when it is falsy or equals the two-character marker
            backslash + ``N`` (presumably a NULL marker from a database dump --
            confirm against the data source).

            The returned dict always has ``'gene'`` and ``'translation'`` (a
            list of ``{"rna": ..., "protein": ...}`` dicts, one per pair in
            which both ids are usable).  ``'transcript'`` / ``'protein'`` are
            only present when at least one usable id was seen: the bare id for
            a single pair, ``normalized_value(ids)`` for a list of pairs.
            """
            null_marker = '\\N'

            def usable(value):
                # Hand back the id itself when it counts, otherwise None.
                if value and value != null_marker:
                    return value
                return None

            many = isinstance(x, list)
            pairs = x if many else [x]

            transcripts = []
            proteins = []
            translations = []
            for pair in pairs:
                rna_id = usable(pair[0])
                prot_id = usable(pair[1])
                if rna_id:
                    transcripts.append(rna_id)
                if prot_id:
                    proteins.append(prot_id)
                if rna_id and prot_id:
                    translations.append({"rna": rna_id, "protein": prot_id})

            record = {'gene': eid, 'translation': translations}
            if many:
                # A list input is collapsed through the shared normalizer.
                if transcripts:
                    record['transcript'] = normalized_value(transcripts)
                if proteins:
                    record['protein'] = normalized_value(proteins)
            else:
                # A single pair keeps its ids as plain scalars.
                if transcripts:
                    record['transcript'] = transcripts[0]
                if proteins:
                    record['protein'] = proteins[0]

            return record
Exemplo n.º 2
0
        def _ff(d):
            """Turn one 10-field gene record into a document dict.

            Args:
                d: sequence of exactly ten fields, in this order: taxid,
                    symbol, locus_tag, synonyms, dbxrefs, map_location,
                    description, type_of_gene, other_designations,
                    modification_date.  The string ``'-'`` marks an empty
                    field; synonyms, dbxrefs and other_designations are
                    ``'|'``-separated.

            Returns:
                dict that always holds ``taxid`` (int), ``symbol``, ``name``
                (the description) and ``_timestamp`` (``datetime.datetime``),
                plus ``map_location``, ``type_of_gene``, ``alias``,
                ``locus_tag`` and ``other_names`` when the matching field is
                not ``'-'``, plus one ``{db: id}`` entry per retained dbxref.

            Raises:
                ValueError: when a dbxref entry is not of the form ``db:id``
                    (after the duplicated-prefix fix), when ``taxid`` is not
                    an integer, or when ``modification_date`` matches neither
                    supported format.
            """
            (taxid, symbol, locus_tag, synonyms, dbxrefs, map_location,
             description, type_of_gene, other_designations,
             modification_date) = d
            out = dict(taxid=int(taxid), symbol=symbol, name=description)
            if map_location != '-':
                out['map_location'] = map_location
            if type_of_gene != '-':
                out['type_of_gene'] = type_of_gene
            if synonyms != '-':
                out['alias'] = normalized_value(synonyms.split('|'))
            if locus_tag != '-':
                out['locus_tag'] = locus_tag
            if other_designations != "-":
                out['other_names'] = normalized_value(
                    other_designations.split('|'))

            # when merged, this will become the default timestamp
            # as of 2017/12/10, some timestamps can have different formats:
            # anything longer than the 8-character YYYYMMDD form is parsed as
            # "MM/DD/YYYY HH:MM:SS".
            if len(modification_date) > 8:
                timestamp_format = "%m/%d/%Y %H:%M:%S"
            else:
                timestamp_format = "%Y%m%d"
            out["_timestamp"] = datetime.datetime.strptime(
                modification_date, timestamp_format)

            dup_prefix_dbs = ('VGNC', 'HGNC', 'MGI')
            skipped_dbs = ('ensembl', 'imgt/gene-db')
            for x in dbxrefs.split('|'):
                if x == '-':
                    continue
                xd = x.split(':')
                if len(xd) == 3 and xd[0] == xd[1] and xd[0] in dup_prefix_dbs:
                    # a fix for NCBI bug for dup xref prefix, 'HGNC:HGNC:36328'
                    xd = xd[1:]
                try:
                    _db, _id = xd
                except ValueError:
                    # Only a wrong number of parts can fail here; show the
                    # offending raw entry, then let the error propagate so an
                    # unexpected xref format is never silently skipped.
                    print(repr(x))
                    raise
                # we don't need ensembl xref from here, we will get it from
                # Ensembl directly; we don't need 'IMGT/GENE-DB' xref either,
                # because they are mostly the same as gene symbol
                if _db.lower() in skipped_dbs:
                    continue
                # add "MGI:" prefix for MGI ids.
                if _db.lower() == 'mgi':
                    _id = "MGI:" + _id
                out[_db] = _id
            return out
Exemplo n.º 3
0
 def _ff(d):
     """Collect accessions from ``(rna, protein, genomic)`` triples.

     ``d`` is an iterable of 3-item rows; a field equal to ``'-'`` is
     treated as absent.  Every present value is gathered per column, and
     each row with a truthy rna and protein also contributes an
     ``{'rna': ..., 'protein': ...}`` entry to ``'translation'``.

     Each of the four lists is passed through ``normalized_value``
     (presumably de-duplicating, per the original "remove dup" comment),
     and keys whose result is falsy are dropped.  The remaining dict is
     returned wrapped as ``{self.fieldname: {...}}``; when nothing
     remains, an empty dict is returned instead.
     """
     rna_ids = []
     protein_ids = []
     genomic_ids = []
     translations = []
     for rna, prot, dna in d:
         # '-' is the placeholder for "no value" in any column.
         rna = None if rna == '-' else rna
         prot = None if prot == '-' else prot
         dna = None if dna == '-' else dna
         if rna is not None:
             rna_ids.append(rna)
         if prot is not None:
             protein_ids.append(prot)
         if dna is not None:
             genomic_ids.append(dna)
         if rna and prot:
             translations.append({'rna': rna, 'protein': prot})

     columns = (
         ('rna', rna_ids),
         ('protein', protein_ids),
         ('genomic', genomic_ids),
         ('translation', translations),
     )
     doc = {}
     for key, values in columns:
         normalized = normalized_value(values)
         # keep only fields that still hold something after normalization
         if normalized:
             doc[key] = normalized

     if not doc:
         return doc
     return {self.fieldname: doc}
Exemplo n.º 4
0
    def load(self, aslist=False):
        """Build the ``{gene_id: {'retired': ...}}`` mapping from ``self.datafile``.

        Rows are read through ``tab2dict`` with columns ``(1, 2)`` and key
        index ``0`` (exact column semantics live in ``tab2dict`` -- confirm
        there).  A row is skipped when its second field is ``'-'``; when
        ``self.species_li`` is truthy, the row's first field (as int) must
        also be in ``self.taxid_set``.  The collected values for each gene
        are cast to int and passed through ``normalized_value``.

        Args:
            aslist: when truthy, return ``dict_to_list(mapping)`` instead of
                the mapping itself.
        """
        restrict_species = bool(self.species_li)

        def _keep_row(ld):
            # Optional species filter first, then drop rows lacking field 1.
            if restrict_species and int(ld[0]) not in self.taxid_set:
                return False
            return ld[1] != '-'

        def _to_int_ids(values):
            return normalized_value([int(one) for one in values])

        raw_mapping = tab2dict(self.datafile, (1, 2),
                               0,
                               alwayslist=1,
                               includefn=_keep_row)
        retired_by_gene = dict_convert(raw_mapping, valuefn=_to_int_ids)

        gene_d = {
            gid: {'retired': retired}
            for gid, retired in retired_by_gene.items()
        }

        return dict_to_list(gene_d) if aslist else gene_d