def __init__(self, uri=None, echo=True, drop=False, logger=None):
    self.uri = uri or 'sqlite:///:memory:'
    self.logger = logger or SimpleLogger('Manager')
    self.engine = sqlalchemy.create_engine(self.uri, echo=echo)
    self.engine.logger.level = self.logger.level
    self.session = self.connect()
    self.create_table(drop=drop)
def __init__(self, dbfile=':memory:', echo=False, drop=False, logger=None):
    self.drop = drop
    self.dbfile = dbfile
    self.uri = f'sqlite:///{dbfile}'
    self.logger = logger or SimpleLogger('Manager')
    self.engine = sqlalchemy.create_engine(self.uri, echo=echo)
    self.engine.logger.level = self.logger.level
    self.session = self.connect()
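# Usage sketch for Manager (added example, not part of the original source).
# It assumes Manager works as a context manager and that a Project model is
# mapped on the same metadata, as in the stats CLI further below; the dbfile
# path here is illustrative only.
def _example_manager_usage():
    with Manager(dbfile='data/nsfc.db', echo=False) as m:
        total = m.session.query(Project).count()
        print(f'{total} projects in database')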
def cli(ctx, **kwargs):
    dbfile = kwargs['dbfile']
    dirname = os.path.dirname(dbfile)
    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)
    ctx.ensure_object(dict)
    ctx.obj['manager'] = Manager(dbfile=dbfile, echo=kwargs['echo'])
    ctx.obj['logger'] = SimpleLogger('OMIM-CLI')
    ctx.obj['entry'] = Entry(omim_url=kwargs['url'])
def main(**kwargs):
    start_time = time.time()
    logger = SimpleLogger('MAIN')
    logger.info(f'input arguments: {kwargs}')

    year = kwargs['year']
    end = kwargs['end'] or year
    code = kwargs['code']
    subcategory = kwargs['subcategory']
    level = int(kwargs['level']) if kwargs['level'] else None
    count = kwargs['count']

    letpub = LetPub(logger=logger)

    outdir = kwargs['outdir']
    outfile = os.path.join(kwargs['outdir'], kwargs['outfile'])
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    if kwargs['list']:
        code_list = letpub.code_list
        print(code_list.get(code))
        exit(0)

    try:
        with open(outfile, 'w') as out:
            for context in letpub.search(code,
                                         startTime=year,
                                         endTime=end,
                                         subcategory=subcategory,
                                         level=level,
                                         count=count):
                if not count:
                    line = json.dumps(context, ensure_ascii=False) + '\n'
                    out.write(line)
        if not count:
            logger.info(f'save file: {outfile}')
    except KeyboardInterrupt:
        os.remove(outfile)

    elapsed = time.time() - start_time
    logger.info(f'elapsed time: {elapsed:.2f}s')
class OMIM(object):

    def __init__(self, omim_url='https://mirror.omim.org'):
        self.omim_url = omim_url
        self.logger = SimpleLogger('OMIM')

    def get_soup(self, url):
        soup = WR.get_soup(url)
        return soup

    def get_mim2gene(self, outfile=None):
        url = self.omim_url + '/static/omim/data/mim2gene.txt'
        resp = WR.get_response(url, stream=True)
        if outfile:
            with open(outfile, 'wb') as out:
                for chunk in resp.iter_content(chunk_size=512):
                    out.write(chunk)
            self.logger.debug(f'save file: {outfile}')
        else:
            return resp.text

    def parse_mim2gene(self, mim2gene=None, mim_types=('gene', 'gene/phenotype')):
        if mim2gene and os.path.isfile(mim2gene):
            self.logger.debug(f'parsing mim2gene from file: {mim2gene} ...')
            text = open(mim2gene).read().strip()
        else:
            self.logger.debug('parsing mim2gene from website ...')
            text = self.get_mim2gene()

        fields = 'mim_number mim_type entrez_gene_id hgnc_gene_symbol ensembl_gene_id'.split()
        for line in text.split('\n'):
            if line.startswith('# Generated:'):
                generated = line.split(': ')[-1]
                continue
            elif line.startswith('#') or not line.strip():
                continue
            linelist = line.split('\t')
            context = dict(zip(fields, linelist))
            if mim_types and context['mim_type'] not in mim_types:
                continue
            context['generated'] = date_parse(generated)
            yield context['mim_number'], context
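# Usage sketch for OMIM.parse_mim2gene (added example, not part of the original
# source; the local file path is illustrative and the generator yields
# (mim_number, context) pairs as defined above).
def _example_parse_mim2gene():
    omim = OMIM()
    for mim_number, context in omim.parse_mim2gene(mim2gene='data/mim2gene.txt'):
        print(mim_number, context['hgnc_gene_symbol'])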
class Eutils(object):
    """
    params:
        db          database name
        api_key     api_key or NCBI_API_KEY in environment

    optional params:
        term        term for esearch
        id          id(s) for efetch
        field       field for esearch
    """
    base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
    logger = SimpleLogger('Eutils')
    IF = ImpactFactor()

    def __init__(self, db='pubmed', service_url='translate.google.cn', api_key=None, **kwargs):
        self.db = db
        self.api_key = api_key
        self.validate_api_key()
        self.TR = GoogleTrans(service_url=service_url)

    def parse_params(self, **kwargs):
        """
        - add default db
        - add api_key if available
        """
        params = {'db': self.db}
        if self.api_key:
            params['api_key'] = self.api_key
        params.update(kwargs)
        if 'api_key' in params and params['api_key'] is None:
            del params['api_key']
        return params

    def esearch(self, term, retstart=0, retmax=250, head=False, limit=None, **kwargs):
        """
        https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch

        > - search a database for a given term
        >> - esearch.fcgi?db=pubmed&term=ngs
        >> - esearch.fcgi?db=pubmed&term=ngs&retmode=xml&field=TIAB
        >> - esearch.fcgi?db=pubmed&term=ngs[Title/Abstract]&retmode=xml
        """
        url = self.base_url + 'esearch.fcgi'
        params = self.parse_params(term=term, retmode='json', retstart=retstart, retmax=retmax, **kwargs)
        result = WebRequest.get_response(url, params=params).json()['esearchresult']
        if head:
            return result

        self.logger.info('{count} articles found with term: {querytranslation}'.format(**result))

        if limit is None and int(result['count']) > 250:
            self.logger.warning(
                'too many results, you can limit output with option "-l/--limit N", '
                'or simplify your input with sub-command "advance-search"')
            exit(1)

        idlist = result['idlist']
        while int(result['retstart']) + int(result['retmax']) < int(result['count']):
            if limit and len(idlist) >= limit:
                break
            retstart = int(result['retstart']) + int(result['retmax'])
            params = self.parse_params(term=term, retmode='json', retstart=retstart, retmax=retmax, **kwargs)
            result = WebRequest.get_response(url, params=params).json()['esearchresult']
            idlist += result['idlist']

        if limit:
            self.logger.info('limit {} from {}'.format(limit, result['count']))
            idlist = idlist[:limit]

        if idlist:
            self.logger.debug('idlist: {} ...'.format(', '.join(idlist[:10])))
        else:
            self.logger.warning('no result for term: {}'.format(term))

        return idlist

    def efetch(self, ids, batch_size=5, **kwargs):
        """
        https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch

        > - fetch from a database for given ids
        >> efetch.fcgi?db=pubmed&id=1,2,3
        """
        url = self.base_url + 'efetch.fcgi'
        self.logger.info('fetching start: total {}, batch_size: {}'.format(len(ids), batch_size))
        for n in range(0, len(ids), batch_size):
            _id = ','.join(ids[n:n + batch_size])
            self.logger.debug(f'fetching xml: {n+1} - {n+batch_size}')
            params = self.parse_params(id=_id, retmode='xml')
            xml = WebRequest.get_response(url, params=params).text
            self.logger.debug(f'parsing xml: {n+1} - {n+batch_size}')
            for context in xml_parser.parse(xml):
                article = Article(**context)
                yield article

    def einfo(self, **kwargs):
        """
        https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EInfo

        > - show all database list
        >> einfo.fcgi?db=
        > - show dbinfo for given database
        >> einfo.fcgi?db=pubmed
        """
        url = self.base_url + 'einfo.fcgi'
        params = self.parse_params(retmode='json', **kwargs)
        info = WebRequest.get_response(url, params=params, allowed_codes=[200, 400]).json()
        return info

    def elink(self, ids, dbfrom='pubmed', cmd='neighbor', **kwargs):
        """
        https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ELink

        > - get cited (`{"linkname": "pubmed_pubmed_citedin"}`)
        >> elink.fcgi?dbfrom=pubmed&db=pubmed&id=20210808&cmd=neighbor&retmode=json
        > - get pdf url
        >> elink.fcgi?dbfrom=pubmed&db=pubmed&cmd=prlinks&id=10210801

        cmds:
            - neighbor (default)
            - neighbor_score
            - neighbor_history
            - acheck
            - ncheck
            - lcheck
            - llinks
            - llinkslib
            - prlinks
        """
        url = self.base_url + 'elink.fcgi'
        params = self.parse_params(retmode='json', id=ids, dbfrom=dbfrom, cmd=cmd, **kwargs)
        result = WebRequest.get_response(url, params=params).json()
        return result

    def get_cited(self, _id, dbfrom='pubmed', cmd='neighbor'):
        """get the cited pmids for a given pmid"""
        linksetdbs = self.elink(_id, dbfrom=dbfrom, cmd=cmd)['linksets'][0]['linksetdbs']
        links = [
            linkset['links'] for linkset in linksetdbs
            if linkset['linkname'] == 'pubmed_pubmed_citedin'
        ]
        citedin = links[0] if links else []
        return {'count': len(citedin), 'links': citedin}

    def get_pdf_url(self, _id, dbfrom='pubmed', cmd='prlinks'):
        """get the pdf url for a given pmid"""
        idurllist = self.elink(_id, dbfrom=dbfrom, cmd=cmd)['linksets'][0]['idurllist']
        result = {
            each['id']: each['objurls'][0]['url']['value']
            for each in idurllist
        }
        return result

    def validate_fields(self, **kwargs):
        """Show all fields for a given database"""
        fieldlist = self.einfo(**kwargs)['einforesult']['dbinfo'][0]['fieldlist']
        fieldlist = sorted(fieldlist, key=lambda x: x.get('fullname'))
        fields = {}
        table = prettytable.PrettyTable(
            field_names=['Number', 'Name', 'FullName', 'Description'])
        for n, each in enumerate(fieldlist, 1):
            fields[n] = [each['name'], each['fullname'], each['description']]
            table.add_row([n, each['name'], each['fullname'], each['description']])
        table.align['FullName'] = 'l'
        table.align['Description'] = 'l'
        click.secho(str(table), fg='bright_blue')
        return fields

    def validate_api_key(self):
        """
        Description: https://www.ncbi.nlm.nih.gov/account/settings/#accountSettingsApiKeyManagement

        - E-utils users are allowed 3 requests/second without an API key.
        - Create an API key to increase your e-utils limit to 10 requests/second.
        """
        configfile = os.path.join(os.path.expanduser('~'), '.pypubmed.cfg')
        if not self.api_key:
            if os.path.isfile(configfile):
                self.api_key = open(configfile).read().strip()
        if not self.api_key:
            msg = textwrap.dedent('''
                API_KEY not found!

                Using eutils with an api_key is a good idea!
                You can create one from:
                https://www.ncbi.nlm.nih.gov/account/settings/#accountSettingsApiKeyManagement

                If you already have one, you can add it to your environment:
                export NCBI_API_KEY=xxxx''')
            self.logger.warning(msg)
            return

        res = self.einfo()
        if 'error' in res:
            self.logger.warning('invalid api_key, please check: {}'.format(self.api_key))
            self.api_key = None
        else:
            self.logger.info('Valid api_key: {}'.format(self.api_key))
            with open(configfile, 'w') as out:
                out.write(self.api_key)

    def search(self, term, cited=True, translate=True, impact_factor=True, translate_cache=None, **kwargs):
        """
        term:
            - string, eg. 'ngs AND disease'
            - pmid, eg. 1,2,3
            - file with pmid
        """
        if os.path.isfile(term):
            idlist = open(term).read().strip().split()
        elif all(each.isdigit() for each in term.split(',')):
            idlist = term.split(',')
        else:
            idlist = self.esearch(term, **kwargs)

        articles = self.efetch(idlist, **kwargs)
        for article in articles:
            if impact_factor:
                res = self.IF.search(article.issn) or self.IF.search(article.e_issn)
                article.impact_factor = res['factor'] if res else '.'
            if cited:
                article.cited = self.get_cited(article.pmid)
            if translate:
                if translate_cache and translate_cache.get(article.pmid):
                    article.abstract_cn = translate_cache.get(article.pmid)
                else:
                    article.abstract_cn = 'translate failed'
                    n = 0
                    while n < 5:
                        n += 1
                        try:
                            article.abstract_cn = self.TR.translate(article.abstract)
                            break
                        except Exception as e:
                            print(e)
                            time.sleep(3)
            yield article
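# Usage sketch for Eutils.search (added example, not part of the original
# source; the search term and option values are illustrative, and network
# access plus the surrounding module imports are assumed).
def _example_eutils_search():
    eutils = Eutils(db='pubmed')
    for article in eutils.search('ngs AND disease', limit=5, translate=False):
        print(article.pmid, article.title, article.impact_factor)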
class PassConfig(object):
    """
    >>> from passconfig import PassConfig
    >>> pc = PassConfig()
    >>> username, password = pc.get()
    >>> if True:
    ...     pc.save()
    """
    logger = SimpleLogger('PassConfig')

    def __init__(self, username=None, password=None, configfile=CONFIG_DEFAULT, section='common'):
        self.username = username
        self.password = password
        self.configfile = configfile
        self.section = section
        self.conf = ConfigParser()
        if configfile:
            self.conf.read(configfile)

    def get(self):
        """Return username, password"""
        if self.username and self.password:
            username, password = self.username, self.password
        elif os.path.exists(self.configfile):
            username, password = self.read()
        else:
            username, password = None, None

        if not all([username, password]):
            click.secho('please input your username and password', fg='yellow')
            username = username or click.prompt('>>> username')
            password = click.prompt('>>> password', hide_input=True, confirmation_prompt=True)

        self.username = username
        self.password = password
        return username, password

    def read(self):
        """Read username and password from configfile"""
        password = None
        username = self.username
        if not self.conf.has_section(self.section):
            pass
        elif username and self.conf.has_option(self.section, self.username):
            password = self.conf.get(self.section, self.username)
        elif not self.username and len(self.conf.options(self.section)) == 1:
            username = self.conf.options(self.section)[0]
            password = self.conf.get(self.section, username)
        return username, password

    @staticmethod
    def safe_open(filename, mode='r'):
        """Make the directory first if it does not exist"""
        if 'w' in mode:
            dirname = os.path.dirname(filename)
            if dirname and not os.path.exists(dirname):
                os.makedirs(dirname)
        return open(filename, mode)

    def save(self):
        """Save username and password to configfile"""
        if not self.conf.has_section(self.section):
            self.conf.add_section(self.section)
        self.conf.set(self.section, self.username, self.password)
        with self.safe_open(self.configfile, 'w') as out:
            self.conf.write(out)
        self.logger.info('save username and password to file: {}'.format(self.configfile))
class Official(object):
    base_url = 'http://output.nsfc.gov.cn'
    logger = SimpleLogger('Official')
    field_codes = WR.get_response(base_url + '/common/data/fieldCode').json()['data']

    @classmethod
    def get_field_codes(cls):
        """Get all field (discipline) codes"""
        url = cls.base_url + '/common/data/fieldCode'
        print(url)
        return WR.get_response(url).json()['data']

    @classmethod
    def list_root_codes(cls):
        """Get all top-level discipline category codes"""
        root_codes = {}
        for context in cls.field_codes:
            if len(context['code']) == 1:
                root_codes[context['code']] = context['name']
        return root_codes

    @classmethod
    def list_child_codes(cls, keys):
        """
        Get the lowest-level discipline codes, eg.
            C01 --> C010101, C010102, ...
            H10 --> H1001, H1002, ...
        """
        child_codes = {}
        for key in keys.split(','):
            for context in cls.field_codes:
                code = context['code']
                if len(code) == 1:
                    continue
                if code.startswith(key):
                    child_codes[code] = context['name']
                    if code[:-2] in child_codes:
                        del child_codes[code[:-2]]
        return child_codes

    @classmethod
    def get_conclusion_data(cls, ratify_number, detail=True):
        """Get the conclusion data for a given ratify (approval) number"""
        url = cls.base_url + '/baseQuery/data/conclusionQueryResultsData'
        payload = {
            'ratifyNo': ratify_number,
            'queryType': 'input',
            'complete': 'true',
        }
        result = WR.get_response(url, method='POST', json=payload).json()['data']['resultsData']
        data = {}
        if result:
            data['projectid'] = result[0][0]
            data['project_type'] = result[0][3]
            data['result_stat'] = result[0][10]
        if detail and data.get('projectid'):
            detail_data = cls.get_detail_data(data['projectid'])
            data.update(detail_data)
        return data

    @classmethod
    def get_detail_data(cls, projectid):
        url = cls.base_url + '/baseQuery/data/conclusionProjectInfo/' + projectid
        data = WR.get_response(url).json()['data']
        return data

    @classmethod
    def get_conclusion_report(cls, ratify_number, tmpdir='tmp', pdf=True, outfile=None):
        data = cls.get_conclusion_data(ratify_number, detail=False)
        if not data:
            cls.logger.warning(f'no conclusion result for: {ratify_number}')
            return

        images = list(cls.get_conclusion_report_images(data['projectid']))

        if not os.path.exists(tmpdir):
            os.makedirs(tmpdir)

        pngs = []
        for n, url in enumerate(images, 1):
            name = os.path.basename(url)
            png = f'{tmpdir}/{name}.png'
            pngs.append(png)
            cls.logger.debug(f'[{n}/{len(images)}] download png: {url} => {png}')
            resp = WR.get_response(url, stream=True)
            with open(png, 'wb') as out:
                for chunk in resp.iter_content(chunk_size=512):
                    out.write(chunk)
            cls.logger.debug(f'save png: {png}')

        if pdf:
            cls.logger.debug('converting *png to pdf')
            outfile = outfile or f'{ratify_number}.pdf'
            with open(outfile, 'wb') as out:
                out.write(img2pdf.convert(pngs))
            size = human_readable.file_size(os.stat(outfile).st_size)
            cls.logger.info(f'save pdf: {outfile} [{size}]')

        return True

    @classmethod
    def get_conclusion_report_images(cls, projectid):
        url = cls.base_url + '/baseQuery/data/completeProjectReport'
        index = 1
        while True:
            payload = {'id': projectid, 'index': index}
            res = WR.get_response(url, method='POST', data=payload).json()['data']
            if not res['hasnext']:
                break
            yield cls.base_url + res['url']
            index += 1
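# Usage sketch for Official (added example, not part of the original source;
# the ratify number below is purely illustrative, and network access to
# output.nsfc.gov.cn is assumed).
def _example_official_usage():
    print(Official.list_root_codes())
    data = Official.get_conclusion_data('81870125', detail=True)
    print(data.get('projectid'), data.get('project_type'))
    Official.get_conclusion_report('81870125', outfile='81870125.pdf')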
class Export(object):
    logger = SimpleLogger('Export')

    def __init__(self, data, outfile='out.xlsx', outtype=None, fields=None, fillna='.', **kwargs):
        self.outfile = outfile
        self.outtype = outtype or outfile.split('.')[-1]
        self.data = list(self.reformat_data(data, fields=fields, fillna=fillna))

    def reformat_data(self, data, fields, fillna):
        if fields:
            field_list = fields.strip(',').split(',')
        else:
            field_list = data[0].keys()
        for context in data:
            out_ctx = {k: v or fillna for k, v in context.items() if k in field_list}
            yield out_ctx

    def filter_data(self, data, fields):
        fields = fields.strip(',').split(',')
        for context in data:
            out_ctx = {k: v for k, v in context.items() if k in fields}
            yield out_ctx

    def export(self):
        if self.outtype == 'xlsx':
            self.export_xlsx()
        elif self.outtype == 'json':
            self.export_json()
        elif self.outtype in ('jl', 'jsonlines'):
            self.export_json_lines()
        else:
            self.logger.error('outtype is invalid, please check!')
            exit(1)
        self.logger.info('save file: {}'.format(self.outfile))

    def write_title(self, sheet, titles, fg_color=colors.BLACK, bg_color=colors.WHITE,
                    border=True, bold=True, width=18, size=12):
        for col, value in enumerate(titles, 1):
            w = width * 4 if value in ('abstract', 'abstract_cn') else width
            sheet.column_dimensions[get_column_letter(col)].width = w
            _ = sheet.cell(1, col, value=value)
            # set cell styles
            _.alignment = Alignment(horizontal='left', vertical='center', wrap_text=True)
            _.fill = PatternFill(start_color=bg_color, end_color=bg_color, fill_type='solid')
            _.font = Font(bold=bold, color=fg_color, size=size)

    def add_hyperlink(self, key, value):
        """
        method1:
            value = '=HYPERLINK("http://www.baidu.com", "baidu")'
            sheet.cell(row, column, value=value)
        method2:
            _ = sheet.cell(row, column, value='baidu')
            _.hyperlink = 'http://www.baidu.com'
        """
        if key == 'pmid':
            url = 'https://pubmed.ncbi.nlm.nih.gov/{value}/'
        elif key == 'pmc':
            url = 'https://www.ncbi.nlm.nih.gov/pmc/articles/{value}/'
        elif key == 'doi':
            url = 'https://doi.org/{value}'
        else:
            return None
        url = url.format(**locals())
        return url

    @property
    def all_fields(self):
        return '''
            pmid title abstract abstract_cn impact_factor journal med_abbr iso_abbr
            pubdate pubmed_pubdate pmc issn e_issn doi year pagination volume issue
            pub_status authors keywords pub_types cited
        '''.split()

    def sort(self, item):
        if type(item) == tuple:
            k = item[0]
        else:
            k = item
        if k in self.all_fields:
            return self.all_fields.index(k)
        return 9999

    def export_xlsx(self, sheet_title='Result', freeze_panes='B1'):
        """
        PatternFill:
            - https://openpyxl.readthedocs.io/en/latest/api/openpyxl.styles.fills.html?highlight=PatternFill
        """
        book = openpyxl.Workbook()
        sheet = book.active

        # freeze the first column and the first row
        # ('B1' freezes only the first column, 'A2' only the first row)
        sheet.freeze_panes = 'B2'

        sheet.title = sheet_title

        titles = sorted(list(self.data[0].keys()), key=self.sort)
        self.write_title(sheet, titles)

        for row, context in enumerate(self.data, 2):
            color = '00b3ffb3' if row % 2 else '00b3ffff'
            for col, (key, value) in enumerate(sorted(list(context.items()), key=self.sort), 1):
                if type(value) == list:
                    try:
                        value = ', '.join(value)
                    except Exception:
                        try:
                            value = json.dumps(value, ensure_ascii=False)
                        except Exception:
                            value = str(value)
                elif type(value) == dict:
                    value = json.dumps(value, ensure_ascii=False)

                try:
                    _ = sheet.cell(row, col, value=value)
                except Exception:
                    _ = sheet.cell(row, col, value=str(value))

                _.fill = PatternFill(start_color=color, end_color=color, fill_type='solid')

                if key in ('pmid', 'pmc', 'doi') and value not in ('.', None):
                    _.hyperlink = self.add_hyperlink(key, value)
                    _.font = Font(color=colors.BLUE, italic=True)

                wrap_text = None
                if key in ('abstract', 'abstract_cn'):
                    wrap_text = True
                _.alignment = Alignment(horizontal='left', vertical='center', wrap_text=wrap_text)

        book.save(self.outfile)

    def export_json(self, **kwargs):
        with safe_open(self.outfile, 'w') as out:
            json.dump(self.data, out, ensure_ascii=False, **kwargs)

    def export_json_lines(self):
        with safe_open(self.outfile, 'w') as out:
            for context in self.data:
                out.write(json.dumps(context, ensure_ascii=False) + '\n')
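# Usage sketch for Export (added example, not part of the original source;
# the input records are illustrative).
def _example_export_usage():
    data = [
        {'pmid': '12345678', 'title': 'Example article', 'impact_factor': 3.2},
        {'pmid': '23456789', 'title': 'Another article', 'impact_factor': None},
    ]
    Export(data, outfile='result.xlsx', fields='pmid,title,impact_factor').export()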
def __init__(self, omim_url='https://mirror.omim.org'):
    self.omim_url = omim_url
    self.logger = SimpleLogger('OMIM')
def __init__(self, logger=None):
    self.logger = logger or SimpleLogger('LetPub')
    self.subcategory_list = self.list_support_types()
    self.province_list = self.list_provinces()
    self.code_list = self.list_codes()
class ExcelParser(object):
    logger = SimpleLogger('ExcelParser')

    def __init__(self):
        pass

    def parse(self, filename, data_only=False, read_only=False, sheet_idx=None,
              choose_one=False, skip=None, limit=None, **kwargs):
        """
        data_only=True: get the value instead of the formula when data_type is 'f'
        read_only=True: to deal with large files, some attributes might be lost
        """
        wb = openpyxl.load_workbook(filename, data_only=data_only, read_only=read_only)
        sheets = wb.worksheets
        if len(wb.sheetnames) > 1:
            if sheet_idx is not None:
                sheets = [wb.worksheets[sheet_idx]]
            elif choose_one:
                sheets = self.choose_sheet(wb)
        return self.get_data(sheets, skip=skip, limit=limit)

    def choose_sheet(self, workbook):
        context = dict(enumerate(workbook.sheetnames))
        click.secho('{}'.format(json.dumps(context, indent=2, ensure_ascii=False)),
                    err=True, fg='bright_green')
        while True:
            idxes = click.prompt('please choose one or more sheets, separate by comma', err=True)
            for idx in idxes.split(','):
                if int(idx) not in context:
                    self.logger.warning('bad choice, choose from: {}'.format(list(context.keys())))
                    break
            else:
                sheets = [workbook.worksheets[int(idx)] for idx in idxes.split(',')]
                return sheets

    def get_data(self, worksheets, skip=None, limit=None, fillna=''):
        data = OrderedDict()
        for ws in worksheets:
            data[ws.title] = []
            for n, row in enumerate(ws.rows):
                if skip and n < skip:
                    continue
                if limit and len(data[ws.title]) >= limit:
                    break
                line = [
                    cell.value.strftime('%Y-%m-%d') if cell.data_type == 'd'
                    else fillna if cell.value is None
                    else cell.value
                    for cell in row
                ]
                data[ws.title].append(line)
        return data

    def export(self, data, outfile=None, fmt='table', indent=None, sep='\t',
               header=True, index=True, color=None, pager=False):
        """Export data

        parameters
            data: data returned by the get_data method
            outfile: output file, default stdout
            fmt: 'table', 'html', 'tsv' or 'json'
            indent: for json fmt export
            sep: for tsv fmt export
        """
        out = open(outfile, 'w') if outfile else sys.stdout
        with out:
            for sheet, rows in data.items():
                click.secho('>>> {}'.format(sheet), err=True, fg='yellow')
                fd = Formatter(rows, header=header)
                if fmt == 'table':
                    res = fd.to_table(index=index).get_string()
                elif fmt == 'html':
                    res = fd.to_table(index=index).get_html_string()
                elif fmt == 'json':
                    res = fd.to_json(indent=indent)
                elif fmt == 'tsv':
                    res = fd.to_tsv(sep=sep)
                else:
                    exit('bad format, choose from table, html, json, tsv')
                if color:
                    res = click.style(res, fg=color)
                if pager:
                    click.echo_via_pager(res, color=False)
                else:
                    out.write(res + '\n')
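# Usage sketch for ExcelParser (added example, not part of the original source;
# the filenames are illustrative).
def _example_excel_parser():
    parser = ExcelParser()
    data = parser.parse('input.xlsx', sheet_idx=0, skip=1, limit=100)
    parser.export(data, fmt='tsv', outfile='input.tsv')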
class GoogleTrans(object):
    """
    >>> from simple_googletrans import GoogleTrans
    >>> t = GoogleTrans()
    >>> t.translate('hello world')
    >>>
    >>> t.show_languages()
    """
    logger = SimpleLogger('GoogleTrans')

    def __init__(self, service_url=None, proxies=None, timeout=None):
        self.translator = googletrans.Translator(service_urls=[service_url],
                                                 proxies=proxies,
                                                 timeout=timeout)
        self.nltk_checked = False

    def translate(self, text, dest='zh-cn', **kwargs):
        if os.path.isfile(text):
            text = open(text).read()
        texts = self.split_text(text)
        result = []
        for text in texts:
            text = self.translator.translate(text, dest=dest, **kwargs).text
            result.append(text)
        result = ''.join(result)
        return result

    def check_nltk(self):
        """
        - download from interactive Python
            >>> import nltk
            >>> nltk.download('punkt')
        - download from the command line
            $ python -m nltk.downloader punkt
        - more: http://www.nltk.org/data.html
        """
        try:
            import nltk
        except SyntaxError:
            self.logger.warning('nltk is not available for Python2, use Python3 please.')
            exit(1)

        try:
            nltk.sent_tokenize('hello world')
            self.logger.info('nltk is ok!')
            self.nltk_checked = True
        except Exception:
            self.logger.warning('nltk_data not found! downloading start ...')
            try:
                nltk.download('punkt')
                self.nltk_checked = True
            except Exception:
                self.logger.error(
                    'nltk_data download failed! you can also try: python -m nltk.downloader punkt')
                exit(1)

    def split_text(self, text, max_len=5000):
        """
        googletrans limits 5000 characters, so split the text with nltk.sent_tokenize

        >>> nltk.sent_tokenize('hello world!')
        """
        if len(text) <= max_len:
            return [text]

        if not self.nltk_checked:
            self.check_nltk()

        import nltk

        self.logger.info('split text with nltk')
        texts = []
        for sent in nltk.sent_tokenize(text):
            if (not texts) or (len(texts[-1]) + len(sent) > max_len):
                texts.append(sent)
            else:
                texts[-1] += ' ' + sent
        return texts

    def show_languages(self):
        data = googletrans.LANGCODES
        table = prettytable.PrettyTable(['Index', 'Abbr', 'Language'])
        for n, (lang, abbr) in enumerate(sorted(data.items(), key=lambda x: x[1]), 1):
            table.add_row([n, abbr, lang])
        table.align['Abbr'] = 'l'
        table.align['Language'] = 'l'
        click.secho(str(table), fg='cyan')
class ExcelConcat(object):
    logger = SimpleLogger('ExcelConcat')

    def __init__(self):
        self.wb = openpyxl.Workbook()
        self.wb.remove(self.wb.active)  # remove the default 'Sheet'

    def concat(self, infiles, keep_fmt=False, keep_size=True, merge_cells=True, keep_other=True):
        for infile in infiles:
            self.logger.debug('loading file: {}'.format(infile))
            in_book = openpyxl.load_workbook(infile)
            for sheetname in in_book.sheetnames:
                sheet = in_book[sheetname]
                self.logger.debug('copy sheet: {} [{} rows, {} columns]'.format(
                    sheetname, sheet.max_row, sheet.max_column))
                ws = self.wb.create_sheet(sheetname)
                for row in sheet.rows:
                    for cell in row:
                        ws[cell.coordinate] = cell.value
                        if keep_fmt and cell.has_style:
                            # might be slow for big files
                            self.copy_format(cell, ws[cell.coordinate])
                if keep_size:
                    self.copy_size(sheet, ws)
                if merge_cells:
                    self.merge_cells(sheet, ws)
                if keep_other:
                    self.copy_other(sheet, ws)

    def copy_format(self, origin_cell, target_cell):
        """copy the style of each cell"""
        fmt_list = ('alignment', 'font', 'fill', 'border', 'comment',
                    'hyperlink', 'data_type', 'number_format')
        for fmt in fmt_list:
            value = getattr(origin_cell, fmt)
            if not value:
                continue
            if isinstance(value, openpyxl.styles.proxy.StyleProxy):
                value = value.copy()
            setattr(target_cell, fmt, value)

    def copy_size(self, origin_sheet, target_sheet):
        """copy the width of columns and the height of rows"""
        self.logger.debug('copy height and width for sheet: {}'.format(origin_sheet.title))
        for column in range(1, origin_sheet.max_column + 1):
            letter = openpyxl.utils.get_column_letter(column)
            target_sheet.column_dimensions[letter].width = origin_sheet.column_dimensions[letter].width
        for row in range(1, origin_sheet.max_row + 1):
            target_sheet.row_dimensions[row].height = origin_sheet.row_dimensions[row].height

    def merge_cells(self, origin_sheet, target_sheet):
        """copy merged cells"""
        self.logger.debug('merge cells for sheet: {}'.format(origin_sheet.title))
        for ranges in origin_sheet.merged_cell_ranges:
            target_sheet.merge_cells(ranges.coord)

    def copy_image(self, origin_sheet, target_sheet):
        self.logger.debug('copy images for sheet: {}'.format(origin_sheet.title))
        for im in origin_sheet._images:
            target_sheet.add_image(im)

    def copy_other(self, origin_sheet, target_sheet):
        for other in ('image', 'table', 'chart', 'pivot'):
            items = getattr(origin_sheet, '_{}s'.format(other))
            if other == 'table':
                # tables are stored in a dict
                items = items.values()
            if items:
                self.logger.debug('copy {} for sheet: {}'.format(other, origin_sheet.title))
                for item in items:
                    getattr(target_sheet, 'add_{}'.format(other))(item)

        if origin_sheet.data_validations.dataValidation:
            self.logger.debug('copy data_validations for sheet: {}'.format(origin_sheet.title))
            for data_validation in origin_sheet.data_validations.dataValidation:
                target_sheet.add_data_validation(data_validation)

    def save(self, outfile):
        self.wb.save(outfile)
        self.logger.info('save file: {}'.format(outfile))
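# Usage sketch for ExcelConcat (added example, not part of the original source;
# the filenames are illustrative).
def _example_excel_concat():
    ec = ExcelConcat()
    ec.concat(['part1.xlsx', 'part2.xlsx'], keep_fmt=True)
    ec.save('merged.xlsx')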
              default='jl', show_choices=True, show_default=True)
@click.option('-K', '--keys', help='list the available keys for query', is_flag=True)
@click.option('-C', '--count', help='just output the count of the search result', is_flag=True)
@click.option('-L', '--limit', help='limit the number of output records', type=int)
@click.option('-l', '--log-level', help='the level of logging',
              type=click.Choice(SimpleLogger().level_maps),
              default='info', show_choices=True, show_default=True)
def main(**kwargs):
    logger = SimpleLogger('STATS')
    logger.level = logger.level_maps[kwargs['log_level']]
    logger.info(f'input arguments: {kwargs}')

    dbfile = kwargs['dbfile']
    limit = kwargs['limit']
    outfile = kwargs['outfile']

    if kwargs['keys']:
        table = PrettyTable(['Key', 'Comment', 'Type'])
        for k, v in Project.metadata.tables['project'].columns.items():
            table.add_row([k, v.comment, v.type])
        for field in table._field_names:
            table.align[field] = 'l'
        print(click.style(str(table), fg='cyan'))
        exit(0)

    if not os.path.isfile(dbfile):
        logger.error(f'dbfile not exists! [{dbfile}]')
        baidu = version_info['baidu_data']
        logger.info(f'The required data can be downloaded from Baidu Netdisk: {baidu}\n'
                    f'After downloading, point to the database file with the -d option, '
                    f'or copy the file to: {DEFAULT_DB}')
        exit(1)

    uri = f'sqlite:///{dbfile}'
    with Manager(uri=uri, echo=False, logger=logger) as m:
        query = m.session.query(Project)
        if kwargs['search']:
            for key, value in kwargs['search']:
                if '%' in value:
                    query = query.filter(Project.__dict__[key].like(value))
                elif key in ('approval_year', ) and not value.isdigit():
                    if '-' in value:
                        min_value, max_value = value.split('-')
                        query = query.filter(Project.__dict__[key] >= min_value)
                        query = query.filter(Project.__dict__[key] <= max_value)
                    else:
                        logger.error(f'bad approval_year: {value}')
                        exit(1)
                else:
                    query = query.filter(Project.__dict__[key] == value)

        if limit:
            query = query.limit(limit)

        logger.debug(str(query))

        if kwargs['count']:
            logger.info(f'count: {query.count()}')
        elif not query.count():
            logger.warning('no result for your input')
        else:
            if outfile and kwargs['format'] == 'xlsx':
                wb = openpyxl.Workbook()
                ws = wb.active
                ws.title = 'NSFC-RESULT'
                title = [k for k, v in query.first().__dict__.items() if k != '_sa_instance_state']
                ws.append(title)
                for col, v in enumerate(title, 1):
                    _ = ws.cell(row=1, column=col, value=v)
                    _.font = Font(color='FFFFFF', bold=True)
                    _.fill = PatternFill(start_color='000000', end_color='000000', fill_type='solid')
                for n, row in enumerate(query):
                    context = [v for k, v in row.__dict__.items() if k != '_sa_instance_state']
                    ws.append(context)
                ws.freeze_panes = 'A2'
                wb.save(outfile)
            else:
                out = open(outfile, 'w') if outfile else sys.stdout
                with out:
                    if kwargs['format'] == 'json':
                        data = [{k: v for k, v in row.__dict__.items() if k != '_sa_instance_state'}
                                for row in query]
                        out.write(json.dumps(data, ensure_ascii=False, indent=2) + '\n')
                    else:
                        for n, row in enumerate(query):
                            context = {k: v for k, v in row.__dict__.items() if k != '_sa_instance_state'}
                            if n == 0 and kwargs['format'] == 'tsv':
                                title = '\t'.join(context.keys())
                                out.write(title + '\n')
                            if kwargs['format'] == 'tsv':
                                line = '\t'.join(map(str, context.values()))
                            else:
                                line = json.dumps(context, ensure_ascii=False)
                            out.write(line + '\n')
            if outfile:
                logger.info(f'save file: {outfile}')
def __init__(self, url=None, timeout=10):
    self.timeout = timeout
    self.logger = SimpleLogger('SciHub')
    self.url = url or self.check_url(url)
class SciHub(object):

    def __init__(self, url=None, timeout=10):
        self.timeout = timeout
        self.logger = SimpleLogger('SciHub')
        self.url = url or self.check_url(url)

    def check_url(self, url):

        def _check(url):
            self.logger.info(f'checking url: {url} ...')
            try:
                resp = requests.get(url, timeout=self.timeout)
                elapsed = resp.elapsed.total_seconds()
                soup = bs4.BeautifulSoup(resp.text, 'html.parser')
                form = soup.select_one('form[method="POST"]')
                if form:
                    self.logger.info(f'good url: {url} [{elapsed}s]')
                    return elapsed
            except Exception as e:
                self.logger.warning(e)
            self.logger.warning(f'bad url: {url}')
            return None

        self.logger.info('checking url automatically ...')
        hosts, update_time = check_host()
        self.logger.debug(f'update time: {update_time}')
        for host in hosts:
            elapsed = _check(host)
            if elapsed:
                url = host
                break

        if not url:
            self.logger.error('no available url, please use -c to check')
            exit(1)

        self.logger.info(f'use url: {url}')
        return url

    def search(self, term, max_try=3):
        """
        term: URL, PMID, DOI or a search string
        return: the url of the pdf
        """
        soup = WR.get_soup(self.url)
        form = soup.select_one('form[method="POST"]')
        post_url = self.url if form.attrs['action'] == '/' else form.attrs['action']
        payload = {'sci-hub-plugin-check': '', 'request': term}

        self.logger.debug(f'search pdf url for: {term}')
        while max_try:
            max_try -= 1
            soup = WR.get_soup(post_url, method='POST', data=payload, timeout=self.timeout)
            pdf = soup.select_one('#pdf')
            if 'article not found' in soup.text:
                self.logger.warning(f'article not found [{term}]')
                return
            elif not pdf:
                continue
            pdf_url = pdf.attrs['src']
            if pdf_url.startswith('//'):
                pdf_url = post_url.split(':')[0] + f':{pdf_url}'
            self.logger.info(f'pdf url: {pdf_url}')
            return pdf_url

        self.logger.error(f'your searching has no result, please check! [{term}]')

    def download(self, url, outdir='.', filename=None, chunk_size=512, overwrite=None, show_progress=True):
        """download the pdf from a url"""
        filename = filename or os.path.basename(url).split('#')[0]
        if outdir != '.' and not os.path.exists(outdir):
            os.makedirs(outdir)
        outfile = os.path.join(outdir, filename)

        if os.path.isfile(outfile) and os.stat(outfile).st_size > 0:
            if not isinstance(overwrite, bool):
                overwrite = click.confirm('The file already exists, do you want to overwrite it?')
            if overwrite:
                self.logger.debug(f'overwriting the file: {outfile}')
            else:
                self.logger.debug(f'skip downloading file: {outfile}')
                return True

        resp = WR.get_response(url, stream=True)
        if resp.headers['Content-Type'] != 'application/pdf':
            resp = self.deal_captcha(url, outdir, filename, chunk_size)

        length = int(resp.headers.get('Content-Length'))
        self.logger.info(f'downloading pdf: {outfile} [{length/1024/1024:.2f} M]')

        bar = click.progressbar(length=length, label='downloading',
                                show_percent=True, show_pos=True, show_eta=True)
        with open(outfile, 'wb') as out, bar:
            for chunk in resp.iter_content(chunk_size=chunk_size):
                out.write(chunk)
                if show_progress:
                    bar.update(chunk_size)

        self.logger.info(f'save file: {outfile}')
        return True

    def deal_captcha(self, url, outdir, filename, chunk_size):
        """deal with the captcha"""
        soup = WR.get_soup(url)
        img = soup.select_one('#captcha').attrs['src']
        img_url = url.rsplit('/', 3)[0] + img
        self.logger.warning(f'need to type in the captcha: {img_url}')

        if os.getenv('DISPLAY'):
            self.logger.info(f'failed to open the picture, you can open it in your browser: {img_url}')
        else:
            content = WR.get_response(img_url, max_try=1).content
            im = Image.open(io.BytesIO(content))
            # im.show()  # this would block the program
            im.save('tmp.png')

            # show the captcha in non-blocking mode
            pylab.ion()
            img = pylab.imread('tmp.png')
            pylab.imshow(img)
            pylab.show()

        while True:
            answer = click.prompt('please input the captcha')
            if answer == 'new':
                return self.download(url, outdir=outdir, filename=filename, chunk_size=chunk_size)
            payload = {
                'id': img_url.split('/')[-1].split('.')[0],
                'answer': answer,
            }
            self.logger.debug(payload)

            resp = WR.get_response(url, method='POST', stream=True, data=payload)
            if resp.headers['Content-Type'] == 'application/pdf':
                pylab.close()
                return resp

            self.logger.warning('bad captcha, try again!')
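# Usage sketch for SciHub (added example, not part of the original source;
# the DOI is illustrative, and a reachable mirror plus network access are
# assumed).
def _example_scihub_download():
    sci = SciHub()
    pdf_url = sci.search('10.1038/nature12373')
    if pdf_url:
        sci.download(pdf_url, outdir='pdf')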
class ExcelBuilder(object):
    logger = SimpleLogger('ExcelBuilder')

    def __init__(self):
        self.wb = openpyxl.Workbook(write_only=False)
        self.wb.remove(self.wb.active)  # remove the default 'Sheet'

    def create_sheet(self, title=None):
        """require: len(title) <= 31"""
        if title and len(title) > 31:
            self.logger.warning('title is too long, limit 31 characters')
            title = title[:31]
        self.logger.info('create sheet: {}'.format(title))
        self.ws = self.wb.create_sheet(title)

    def add_title(self, titles, **style):
        self.logger.debug('>>> add title')
        self.ws.append(titles)
        self.set_row_style(**style)

    def add_rows(self, rows, color_list=None, **style):
        self.logger.info('>>> add rows')
        self.logger.debug('color_list: {}, style: {}'.format(color_list, list(style.keys())))
        for n, row in enumerate(rows, 1):
            self.ws.append(row)
            if not (color_list or style):
                continue
            if color_list:
                color = color_list[n % len(color_list)]
                style['PatternFill'] = PatternFill(start_color=color, end_color=color, fill_type='solid')
            self.set_row_style(**style)

    def set_row_style(self, **style):
        if not style:
            return
        for cell in self.ws[self.ws.max_row]:
            if style.get('font'):
                cell.font = style['font']
            if style.get('alignment'):
                cell.alignment = style['alignment']
            if style.get('border'):
                cell.border = style['border']
            if style.get('PatternFill'):
                cell.fill = style['PatternFill']
            elif style.get('GradientFill'):
                cell.fill = style['GradientFill']

    def set_dimensions_style(self, height=None, width=None, **style):
        self.logger.info('>>> set dimensions style')
        height = height or (style and style.get('height'))
        width = width or (style and style.get('width'))
        if height:
            for row in range(1, self.ws.max_row + 1):
                r = self.ws.row_dimensions[row]
                r.height = height
        if width:
            for column in range(1, self.ws.max_column + 1):
                c = self.ws.column_dimensions[get_column_letter(column)]
                c.width = width

    def freeze_panes(self, first_column=None, first_row=True, first_column_and_row=None, coordinate=None):
        if not coordinate:
            if first_column:
                coordinate = 'B1'
            elif first_row:
                coordinate = 'A2'
            elif first_column_and_row:
                coordinate = 'B2'
        if coordinate:
            self.logger.info('>>> freeze: {}'.format(coordinate))
            self.ws.freeze_panes = coordinate

    def add_comment(self, coordinate, *args, **kwargs):
        self.logger.info('>>> add comment')
        self.ws[coordinate].comment = Comment(*args, **kwargs)

    def add_hyperlink(self, coordinate, hyperlink):
        """
        # method1:
        >>> value = '=HYPERLINK("http://www.baidu.com", "baidu")'
        >>> sheet.cell(row, column, value=value)

        # method2:
        >>> _ = sheet.cell(row, column, value='baidu')
        >>> _.hyperlink = 'http://www.baidu.com'

        # method3:
        >>> sheet['B3'].hyperlink = Hyperlink(ref='', target='http://www.baidu.com', tooltip='baidu')
        """
        self.logger.info('>>> add hyperlink')
        self.ws[coordinate].hyperlink = hyperlink

    def merge_cells(self, *args, **kwargs):
        """
        range_string=None
        start_row=None
        start_column=None
        end_row=None
        end_column=None
        """
        self.logger.info('>>> merge cells')
        self.ws.merge_cells(*args, **kwargs)

    def auto_filter(self):
        """automatic filter for the first row"""
        self.ws.auto_filter.ref = self.ws.dimensions

    def save(self, outfile):
        self.wb.save(outfile)
        self.logger.warning('save file: {}'.format(outfile))
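# Usage sketch for ExcelBuilder (added example, not part of the original
# source; the sheet name, rows and style values are illustrative).
def _example_excel_builder():
    builder = ExcelBuilder()
    builder.create_sheet('Demo')
    builder.add_title(['name', 'value'], font=Font(bold=True))
    builder.add_rows([['a', 1], ['b', 2]], color_list=['00b3ffb3', '00b3ffff'])
    builder.freeze_panes(first_row=True)
    builder.save('demo.xlsx')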
class Formatter(object):
    logger = SimpleLogger('DataFormatter')

    def __init__(self, rows, header=True):
        self.rows = rows
        self.header = header and self.check_header()

    def check_header(self):
        counter = Counter(self.rows[0])
        dup_names = [k for k, v in counter.items() if v > 1]
        if dup_names:
            click.secho('could not set header=True as duplicate field names: {}'.format(dup_names),
                        err=True, fg='yellow')
            return False
        return True

    def to_table(self, align='l', index=False):
        """
        return a prettytable object

        >>> t = to_table()
        >>> str(t)
        >>> t.get_string()
        >>> t.get_html_string()
        """
        table = prettytable.PrettyTable()
        if self.header:
            field_names = self.rows[0]
            rows = self.rows[1:]
        else:
            field_names = list(map(openpyxl.utils.get_column_letter, range(1, len(self.rows[0]) + 1)))
            rows = self.rows

        if index:
            table.field_names = ['Index'] + field_names
        else:
            table.field_names = field_names

        for n, row in enumerate(rows, 1):
            if index:
                row = [n] + row
            table.add_row(row)

        for field in table.field_names:
            table.align[field] = align

        return table

    def to_json(self, indent=None, ensure_ascii=False):
        data = []
        if not self.header:
            data = self.rows
        else:
            fields = self.rows[0]
            rows = self.rows[1:]
            for row in rows:
                context = dict(zip(fields, row))
                data.append(context)
        return json.dumps(data, indent=indent, ensure_ascii=ensure_ascii)

    def to_tsv(self, sep='\t', quote=''):
        data = []
        for row in self.rows:
            line = sep.join('{0}{1}{0}'.format(quote, each) for each in row)
            data.append(line)
        return '\n'.join(data)
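# Usage sketch for Formatter (added example, not part of the original source;
# the rows are illustrative).
def _example_formatter():
    rows = [['name', 'value'], ['a', 1], ['b', 2]]
    fd = Formatter(rows, header=True)
    print(fd.to_table(index=True).get_string())
    print(fd.to_json(indent=2))
    print(fd.to_tsv())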