def _parse(self, line: str):
    """
    Parse one tab-separated line and stack its data on the current record.

    Lines whose ID equals the accession of the current DB key do not start
    a new record; they only contribute their alias to the stacked record.
    When a line with a different ID arrives, the record stacked so far is
    passed to ``_loadRecord`` and a fresh record is started.

    :param line: one raw line of the input file
    :return: 1 if a previously stacked record was loaded, 0 otherwise
    """
    fields = [field.strip() for field in line.split('\t')]
    assert len(fields) == len(CONTENT), '{} items'.format(len(fields))
    # a column holding just a pair of double quotes is treated as "no value"
    row = Line._make([None if field == '""' else field for field in fields])
    loaded = 0
    starts_new_gene = (
        self._db_key is None or row.id != self._db_key.accession
    )

    if starts_new_gene:
        # hand over the record stacked by the preceding lines, if any
        if self._record is not None:
            self._loadRecord(self._db_key, self._record)
            loaded = 1

        #noinspection PyTypeChecker
        gene = GeneRecord(
            Species.budding_yeast,
            symbol=row.symbol or row.location,
            name=row.name,
            chromosome=row.location[1],
            location=row.location,
        )

        # register the DB reference
        self._db_key = DBRef(Namespace.sgd, row.id)
        gene.addDBRef(self._db_key)

        # the systematic name (= location) becomes an additional symbol
        # whenever it was not already used as the main symbol
        if row.symbol:
            gene.addSymbol(row.location)

        # the gene length is stored as a keyword
        if row.length:
            gene.addKeyword(row.length)

        # protein names are stored as alternative symbols
        if row.protein_name:
            gene.addSymbol(row.protein_name)

        # keep the record around: more alias lines may follow
        self._record = gene

    alias = row.alias

    if alias and alias not in (row.symbol, row.name, row.location):
        # longer aliases containing a space are stored as names
        looks_like_name = " " in alias and len(alias) > 8

        if looks_like_name:
            self._record.addName(alias)
        else:
            self._record.addSymbol(alias)

    return loaded
def _parse(self, line: str):
    """
    Parse one tab-separated line into a rat gene record and load it.

    If loading raises ``DuplicateEntityError`` and the row lists Entrez
    accessions, the Entrez (and any retired RGD) references that point to
    other genes are re-pointed to the gene of this row's RGD reference, the
    genes they pointed to are deleted, and the record is loaded again.
    Without Entrez accessions the error is re-raised.

    :param line: one raw line of the input file
    :return: always 1
    """
    items = [column.strip() for column in line.split('\t')]
    assert len(items) > 32, '{} items'.format(len(items))

    # empty columns and lone dashes carry no value
    for col in COLUMNS:
        if items[col] == '-' or not items[col]:
            items[col] = None

    row = Line._make(items[col] for col in COLUMNS)

    if row.chromosome and row.location:
        position = '{}{}'.format(row.chromosome, row.location)
    else:
        position = None

    #noinspection PyTypeChecker
    record = GeneRecord(Species.rat, symbol=row.symbol, name=row.name,
                        chromosome=row.chromosome, location=position)
    db_key = DBRef(Namespace.rgd, row.id)

    # DB references: every UniProt accession, but only the first Entrez one
    record.addDBRef(db_key)

    for ns in (Namespace.entrez, Namespace.uniprot):
        listed = getattr(row, ns)

        if not listed:
            continue

        # noinspection PyUnresolvedReferences
        accessions = listed.split(';')

        if ns != Namespace.uniprot:
            accessions = accessions[:1]

        for acc in accessions:
            record.addDBRef(DBRef(ns, acc))

    # symbols
    if row.symbol:
        record.addSymbol(row.symbol)

    for symbol_field in filter(None, (row.old_symbols, row.qtl_symbols)):
        for symbol in symbol_field.split(';'):
            record.addSymbol(symbol)

    # names
    if row.name:
        record.addName(row.name)

    if row.old_names:
        for old_name in row.old_names.split(';'):
            record.addName(old_name.strip())

    # keywords
    if row.descriptions:
        for description in row.descriptions.split('; '):
            record.addKeyword(description.strip())

    try:
        self._loadRecord(db_key, record)
    except DuplicateEntityError:
        entrez_accs = getattr(row, Namespace.entrez)

        if not entrez_accs:
            raise

        # Entrez Gene is not unique, having created multiple GIs for
        # the same gene. Sometimes, single Entrez Genes are badly
        # linked by RGD, as in the case of RGD:69363 linking to
        # GI:113900, that should be linked to GI:10092108. This code
        # can update such artifacts in RGD, too, and eliminates the
        # duplicate Genes.
        logging.warning('removing duplicate rat genes for '
                        'rgd:%s with Entrez GIs %s', row.id, entrez_accs)
        rgd_query = self.session.query(GeneRef).filter(
            GeneRef.accession == row.id
        )
        rgd_ref = rgd_query.filter(
            GeneRef.namespace == Namespace.rgd
        ).one()
        logging.debug('correct %s links to gene:%s',
                      repr(rgd_ref), rgd_ref.id)
        orphans = {}

        # Update retired RGD and Entrez entries by pointing the
        # outdated Refs to the right Gene (rgd_ref.id), while deleting
        # the "duplicate" Genes.
        # noinspection PyUnresolvedReferences
        for gi in entrez_accs.split(';'):
            entrez_query = self.session.query(GeneRef).filter(
                GeneRef.accession == gi
            )
            entrez_ref = entrez_query.filter(
                GeneRef.namespace == Namespace.entrez
            ).one()

            if entrez_ref.id == rgd_ref.id:
                # this Entrez reference already points to the right gene
                continue

            try:
                retired_query = self.session.query(GeneRef).filter(
                    GeneRef.id == entrez_ref.id
                )
                retired_ref = retired_query.filter(
                    GeneRef.namespace == Namespace.rgd
                ).one()
                logging.debug('updating %s and retired %s '
                              'reference to orphan gene:%s',
                              repr(entrez_ref), repr(retired_ref),
                              entrez_ref.id)
                retired_ref.id = rgd_ref.id
            except NoResultFound:
                logging.debug('updating %s reference '
                              'to orphan gene:%s',
                              repr(entrez_ref), entrez_ref.id)

            # remember the gene that is about to lose its references;
            # this must happen before entrez_ref.id is overwritten
            if entrez_ref.id not in orphans:
                orphans[entrez_ref.id] = self.session.query(
                    Gene
                ).filter(
                    Gene.id == entrez_ref.id
                ).one()

            entrez_ref.id = rgd_ref.id

        for orphan in orphans.values():
            self.session.delete(orphan)

        self._flush()
        self._loadRecord(db_key, record)

    return 1
def _parse(self, line: str):
    """
    Parse one tab-separated line into a human gene record and load it.

    Lone dashes are treated as empty columns and short lines are padded
    with empty strings up to 16 columns before the row is built.

    If loading raises ``DuplicateEntityError`` and the record holds exactly
    two references, every reference outside the HGNC namespace is dropped
    and the load is retried; in all other cases the error is re-raised.

    :param line: one raw line of the input file
    :return: always 1
    """
    items = [i.strip() for i in line.split('\t')]
    assert len(items) > 1, line

    # a lone dash stands for an empty column
    items = ['' if item == '-' else item for item in items]
    # pad short lines (a non-positive multiplier adds nothing)
    items.extend([''] * (16 - len(items)))

    row = Line._make(items)
    record = GeneRecord(Species.human, symbol=row.symbol, name=row.name,
                        location=row.location if row.location else None)
    db_key = DBRef(Namespace.hgnc, row.id)
    record.addDBRef(db_key)

    # link DB references
    for ns in DB_REFS:
        acc = getattr(row, ns)

        if acc:
            if ns in FIX_ACCESSION:
                # keep only what follows the first colon (the whole value
                # if there is no colon at all)
                # noinspection PyUnresolvedReferences
                acc = acc[acc.find(":") + 1:]

            ref = DBRef(ns, acc)

            if ref in WRONG_DB_REFS:
                new_ref = WRONG_DB_REFS[ref]
                logging.info('correcting wrong ref %s->%s',
                             '{}:{}'.format(*ref),
                             '{}:{}'.format(*new_ref))
                ref = new_ref

            record.addDBRef(ref)

    # parse symbol strings
    for field in (row.previous_symbols, row.synonyms):
        if field:
            for symbol in Parser._parseCD(field):
                record.addSymbol(symbol)

    # parse name strings
    for field in (row.previous_names, row.name_synonyms):
        if field:
            for name in Parser._parseQCD(field):
                record.addName(name)

    # parse keywords strings
    if row.gene_family_symbols:
        for kwd in Parser._parseCD(row.gene_family_symbols):
            record.addKeyword(kwd)

    # location with ALT_REF_LOCI values - eliminate them
    if row.location and " ALT_REF_LOCI" in row.location:
        record.location = row.location[:row.location.find(" ALT_REF_LOCI")]

    # gene family names may be nested with ' / ' and ' : ' separators;
    # each part becomes a keyword, except for the placeholder "other"
    for name in Parser._parseQCD(row.gene_family_names):
        for subname in name.split(' / '):
            for subsubname in subname.split(' : '):
                subsubname = subsubname.strip()

                if subsubname.lower() not in ('other', '"other"'):
                    record.addKeyword(subsubname)

    try:
        self._loadRecord(db_key, record)
    except DuplicateEntityError:
        if len(record.refs) == 2:
            # assume all HGNC links that do not coincide with the
            # Entrez back-link are bad, as it seems it is mostly
            # HGNC that is not up-to-date.
            # (logging.warn is a deprecated alias of logging.warning)
            logging.warning('removing likely bad Entrez ref in %s:%s',
                            *db_key)
            assert any(r.namespace == Namespace.entrez
                       for r in record.refs), record.refs
            record.refs = {
                r for r in record.refs if r.namespace == Namespace.hgnc
            }
            assert len(record.refs) == 1, record.refs
            self._loadRecord(db_key, record)
        else:
            raise

    return 1
def _parseMain(self, line: str):
    """
    Parse one tab-separated line of the Entrez data file and load it.

    Backslashes are cleaned out of the line first, lines whose third column
    is "NEWENTRY" are skipped, and junk values (``JUNK_NAMES``) are blanked
    in the official symbol/name columns before the row is built.

    :param line: one raw line of the input file
    :return: 0 if the line was skipped as a "NEWENTRY" line, 1 otherwise
    """
    # remove the backslash junk in the Entrez data file: a backslash in
    # front of an alphanumeric character becomes a slash, any other
    # backslash is dropped
    idx = line.find('\\')

    while idx != -1:
        if len(line) > idx + 1 and line[idx + 1].isalnum():
            line = '{}/{}'.format(line[:idx], line[idx + 1:])
        else:
            line = '{}{}'.format(line[:idx], line[idx + 1:])

        idx = line.find('\\', idx)

    items = [i.strip() for i in line.split('\t')]

    # ignore the undocumented "NEWENTRY" junk in the file
    if items[2] == 'NEWENTRY':
        return 0

    cleanChromosome = items[6].find('|')

    # drop (too long!) chr. strings with multiple chromosomes listed
    if cleanChromosome != -1:
        items[6] = items[6][0:cleanChromosome]

    # a lone dash stands for an empty column
    items = ["" if item == '-' else item for item in items]

    # remove any junk names from the official names/symbols
    for idx in [2, 8, 10, 11]:
        if items[idx] and items[idx].lower() in JUNK_NAMES:
            logging.debug(
                'removing %s "%s" from %s:%s',
                COLNAME[idx], items[idx], Namespace.entrez, items[1]
            )
            items[idx] = ""

    row = Line._make(items)
    # example of a bad symbol: gi:835054 (but accepted)
    assert not row.symbol or len(row.symbol) < 65, \
        '{}:{} has an illegal symbol="{}"'.format(
            Namespace.entrez, row.id, row.symbol
        )
    db_key = DBRef(Namespace.entrez, row.id)
    record = GeneRecord(row.species_id,
                        symbol=row.symbol, name=row.name,
                        chromosome=row.chromosome,
                        location=row.map_location)
    record.addDBRef(db_key)

    # separate existing DB links and new DB references
    if row.dbxrefs:
        for xref in row.dbxrefs.split('|'):
            # NOTE(review): this unpacking raises ValueError unless the
            # xref contains exactly one colon - confirm that this holds
            # for the input file being parsed
            db, acc = xref.split(':')

            try:
                if TRANSLATE[db]:
                    db_ref = DBRef(TRANSLATE[db], acc)

                    # only add references not seen before
                    if db_ref not in self._generefs:
                        record.addDBRef(db_ref)
                        self._generefs.add(db_ref)
            except KeyError:
                # (logging.warn is a deprecated alias of logging.warning)
                logging.warning('unknown dbXref to "%s"', db)

    # parsed symbol strings
    if row.nomenclature_symbol:
        record.addSymbol(row.nomenclature_symbol)

    if row.locus_tag:
        record.addSymbol(row.locus_tag)

    if row.synonyms:
        # clean up the synonym mess, moving names to where they
        # belong, e.g., gi:814702 cites "cleavage and polyadenylation
        # specificity factor 73 kDa subunit-II" as a gene symbol
        for sym in row.synonyms.split('|'):
            sym = sym.strip()

            if sym.lower() not in JUNK_NAMES:
                if isGeneSymbol(sym):
                    record.addSymbol(sym)
                else:
                    record.addName(sym)

    # parsed name strings
    if row.nomenclature_name:
        record.addName(row.nomenclature_name)

    if row.other_designations:
        # as with synonyms, at least skip the most frequent junk
        for name in row.other_designations.split('|'):
            name = name.strip()

            if name.lower() not in JUNK_NAMES:
                if isGeneSymbol(name):
                    record.addSymbol(name)
                else:
                    record.addName(name)

    # parsed keyword strings
    if row.type_of_gene and row.type_of_gene not in ('other', 'unknown'):
        record.addKeyword(row.type_of_gene)

    # add the PubMed links parsed earlier (if any):
    if db_key.accession in self._pmidMapping:
        record.pmids = self._pmidMapping[db_key.accession]

    self._loadRecord(db_key, record)
    return 1