Пример #1
0
Файл: hgnc.py Проект: fnl/gnamed
    def _parse(self, line: str) -> int:
        """
        Parse one tab-separated line into a gene record and load it.

        The line is split on tabs, every column is stripped, and columns
        consisting of a single dash are treated as empty. The resulting row
        is used to build a human ``GeneRecord`` that carries the HGNC DB
        reference, any further DB references listed in ``DB_REFS``, the
        previous/synonymous symbols and names, and keywords taken from the
        gene family symbols and gene family names.

        :param line: one raw, tab-separated input line
        :return: always ``1``
        :raises AssertionError: if the line has fewer than two columns, or
            if the duplicate-entity recovery finds an unexpected set of
            references on the record
        :raises DuplicateEntityError: re-raised from ``_loadRecord`` unless
            the record holds exactly two DB references
        """
        # split into stripped columns; a lone dash stands for "no value"
        items = [
            '' if column == '-' else column
            for column in (raw.strip() for raw in line.split('\t'))
        ]
        assert len(items) > 1, line

        # pad short lines with empty columns up to 16 entries
        # (a non-positive multiplier yields an empty list, i.e., a no-op)
        items.extend([''] * (16 - len(items)))

        row = Line._make(items)
        record = GeneRecord(Species.human, symbol=row.symbol, name=row.name,
                            location=row.location or None)
        db_key = DBRef(Namespace.hgnc, row.id)
        record.addDBRef(db_key)

        # link DB references
        for ns in DB_REFS:
            acc = getattr(row, ns)

            if not acc:
                continue

            if ns in FIX_ACCESSION:
                # keep only what follows the first colon; an accession
                # without any colon is left untouched (find() == -1)
                # noinspection PyUnresolvedReferences
                acc = acc[acc.find(":") + 1:]

            ref = DBRef(ns, acc)

            if ref in WRONG_DB_REFS:
                new_ref = WRONG_DB_REFS[ref]
                logging.info('correcting wrong ref %s->%s',
                             '{}:{}'.format(*ref),
                             '{}:{}'.format(*new_ref))
                ref = new_ref

            record.addDBRef(ref)

        # parse symbol strings
        for field in (row.previous_symbols, row.synonyms):
            if field:
                for symbol in Parser._parseCD(field):
                    record.addSymbol(symbol)

        # parse name strings
        for field in (row.previous_names, row.name_synonyms):
            if field:
                for name in Parser._parseQCD(field):
                    record.addName(name)

        # parse keywords strings
        if row.gene_family_symbols:
            for kwd in Parser._parseCD(row.gene_family_symbols):
                record.addKeyword(kwd)

        # location with ALT_REF_LOCI values - eliminate them by keeping
        # only the part of the location in front of the marker
        alt_marker = " ALT_REF_LOCI"

        if alt_marker in row.location:
            record.location = row.location[:row.location.find(alt_marker)]

        # gene family names may bundle several keywords, separated by
        # ' / ' and ' : '; the column is guarded like the other optional
        # columns so that an empty value is never handed to the parser
        if row.gene_family_names:
            for name in Parser._parseQCD(row.gene_family_names):
                for subname in name.split(' / '):
                    for subsubname in subname.split(' : '):
                        subsubname = subsubname.strip()

                        # skip empty fragments and the "other" placeholder
                        if not subsubname:
                            continue

                        if subsubname.lower() not in ('other', '"other"'):
                            record.addKeyword(subsubname)

        try:
            self._loadRecord(db_key, record)
        except DuplicateEntityError:
            if len(record.refs) != 2:
                raise

            # assume all HGNC links that do not coincide with the
            # Entrez back-link are bad, as it seems it is mostly
            # HGNC that is not up-to-date.
            logging.warning('removing likely bad Entrez ref in %s:%s',
                            *db_key)
            assert any(r.namespace == Namespace.entrez
                       for r in record.refs), record.refs
            # retry with only the HGNC reference left on the record
            record.refs = {r for r in record.refs if
                           r.namespace == Namespace.hgnc}
            assert len(record.refs) == 1, record.refs
            self._loadRecord(db_key, record)

        return 1