def _fn(x): out = {} if x[0].strip() not in ['', '\\N']: out['symbol'] = x[0].strip() if x[1].strip() not in ['', '\\N']: _name = SubStr(x[1].strip(), '', ' [Source:').strip() if _name: out['name'] = _name return out
def _fn(x): import logging out = {'taxid': int(x[0])} if x[1].strip() not in ['', '\\N']: out['symbol'] = x[1].strip() if x[2].strip() not in ['', '\\N']: _name = SubStr(x[2].strip(), '', ' [Source:').strip() if _name: out['name'] = _name return out
def get_geneid(self, rec):
    '''Return geneid as integer, None if not found.

    ``rec`` must expose ``features`` (objects with ``type`` and
    ``qualifiers``) and ``id``. The record is required to carry exactly
    one feature of type ``'gene'``; its ``db_xref`` qualifier is searched
    for a single ``GeneID:`` entry.

    Returns None when the gene feature has no ``db_xref`` qualifier, or
    when the number of ``GeneID:`` entries is not exactly one.

    Raises AssertionError when the record does not contain exactly one
    gene feature.
    '''
    gene_features = [feat for feat in rec.features if feat.type == 'gene']

    # Historical note: NCBI once shipped NR_001526.1 without a gene feature
    # and a temporary workaround lived here; the record has since been fixed
    # upstream (https://twitter.com/kdpru/status/474673626730741761).
    if len(gene_features) != 1:
        # Raised explicitly instead of via an `assert` statement so the check
        # still runs under `python -O`; the exception type and message are
        # unchanged for existing callers.
        raise AssertionError(
            '#: {}, id: {}'.format(len(gene_features), rec.id))

    db_xref = gene_features[0].qualifiers.get('db_xref', None)
    if not db_xref:
        return None

    gene_refs = [ref for ref in db_xref if ref.startswith('GeneID:')]
    if len(gene_refs) != 1:
        return None

    # SubStr is defined elsewhere; presumably it returns the text following
    # the 'GeneID:' prefix -- confirm against its definition.
    return int(SubStr(gene_refs[0], 'GeneID:'))
def get_summary(self, rec):
    '''Return the summary text from the record's comment, '' if there is none.

    The ``'comment'`` entry of ``rec.annotations`` is inspected. When it
    contains ``'Summary:'``, the text after ``'Summary: '`` is extracted
    via ``SubStr``, newlines are replaced by spaces, and the text is then
    cut at each terminator marker found in it, in the order listed below.
    The result is stripped of surrounding whitespace.
    '''
    comment = rec.annotations.get('comment', None)
    if not comment or comment.find('Summary:') == -1:
        return ''

    # Markers that end the summary text. Provenance tags such as
    # '[provided by RefSeq].', '[provided by ', '[supplied by ' and '[RGD'
    # are deliberately not cut points.
    terminators = (
        '##',
        'COMPLETENESS:',
        'Sequence Note:',
        'Transcript Variant:',
        'CCDS Note:',
        'Publication Note:',
        ' ' * 10,
    )

    # SubStr is defined elsewhere; presumably it returns the text after the
    # start string / before the end string -- confirm against its definition.
    text = SubStr(comment, 'Summary: ').replace('\n', ' ')
    for marker in terminators:
        if text.find(marker) != -1:
            text = SubStr(text, end_string=marker)

    return text.strip()