def _parse(self, line: str):
    """Parse one tab-separated line into a human ``GeneRecord`` and load it.

    The line is split on tabs and every field is stripped. Fields that
    consist of a single ``-`` are treated as empty, and the field list is
    padded with empty strings up to 16 entries before being turned into a
    ``Line`` row.

    The record is keyed by its HGNC reference and then enriched with the
    DB references listed in ``DB_REFS``, the previous/synonym symbols and
    names, and keywords taken from the gene family symbols and names.

    :param line: a single raw line of tab-separated fields
    :return: always ``1`` (presumably the number of records processed by
             this call -- confirm against the caller)
    :raises AssertionError: if the line has fewer than two fields, or if
        the duplicate-recovery step finds an unexpected set of references
    :raises DuplicateEntityError: if loading fails as a duplicate and the
        record does not have exactly two references
    """
    items = [field.strip() for field in line.split('\t')]
    assert len(items) > 1, line

    # a lone dash is the placeholder for "no value"
    items = ['' if item == '-' else item for item in items]
    # pad short lines to the 16 fields handed to Line._make
    # (a non-positive difference yields an empty list, i.e. no padding)
    items.extend([''] * (16 - len(items)))

    row = Line._make(items)
    record = GeneRecord(Species.human, symbol=row.symbol, name=row.name,
                        location=row.location if row.location else None)
    db_key = DBRef(Namespace.hgnc, row.id)
    record.addDBRef(db_key)

    # link DB references
    for ns in DB_REFS:
        acc = getattr(row, ns)

        if not acc:
            continue

        if ns in FIX_ACCESSION:
            # keep only what follows the first colon; an accession
            # without a colon is left unchanged (find() gives -1)
            # noinspection PyUnresolvedReferences
            acc = acc[acc.find(":") + 1:]

        ref = DBRef(ns, acc)

        if ref in WRONG_DB_REFS:
            new_ref = WRONG_DB_REFS[ref]
            logging.info('correcting wrong ref %s->%s',
                         '{}:{}'.format(*ref), '{}:{}'.format(*new_ref))
            ref = new_ref

        record.addDBRef(ref)

    # parse symbol strings
    for field in (row.previous_symbols, row.synonyms):
        if field:
            for symbol in Parser._parseCD(field):
                record.addSymbol(symbol)

    # parse name strings
    for field in (row.previous_names, row.name_synonyms):
        if field:
            for name in Parser._parseQCD(field):
                record.addName(name)

    # parse keywords strings
    if row.gene_family_symbols:
        for kwd in Parser._parseCD(row.gene_family_symbols):
            record.addKeyword(kwd)

    # location with ALT_REF_LOCI values - eliminate them
    if row.location and " ALT_REF_LOCI" in row.location:
        record.location = row.location[:row.location.find(" ALT_REF_LOCI")]

    # family names may be nested with ' / ' and ' : ' separators; every
    # part becomes its own keyword, except the placeholder "other"
    for name in Parser._parseQCD(row.gene_family_names):
        for subname in name.split(' / '):
            for subsubname in subname.split(' : '):
                subsubname = subsubname.strip()

                if subsubname.lower() not in ('other', '"other"'):
                    record.addKeyword(subsubname)

    try:
        self._loadRecord(db_key, record)
    except DuplicateEntityError:
        if len(record.refs) != 2:
            raise

        # assume all HGNC links that do not coincide with the
        # Entrez back-link are bad, as it seems it is mostly
        # HGNC that is not up-to-date.
        logging.warning('removing likely bad Entrez ref in %s:%s', *db_key)
        assert any(r.namespace == Namespace.entrez
                   for r in record.refs), record.refs
        record.refs = {r for r in record.refs
                       if r.namespace == Namespace.hgnc}
        assert len(record.refs) == 1, record.refs
        # retry with only the HGNC reference left on the record
        self._loadRecord(db_key, record)

    return 1