Example #1
    def __init__(self, uri=None, echo=True, drop=False, logger=None):
        self.uri = uri or 'sqlite:///:memory:'
        self.logger = logger or SimpleLogger('Manager')
        self.engine = sqlalchemy.create_engine(self.uri, echo=echo)
        self.engine.logger.level = self.logger.level

        self.session = self.connect()
        self.create_table(drop=drop)
Example #2
    def __init__(self, dbfile=':memory:', echo=False, drop=False, logger=None):
        self.drop = drop
        self.dbfile = dbfile
        self.uri = f'sqlite:///{dbfile}'
        self.logger = logger or SimpleLogger('Manager')
        self.engine = sqlalchemy.create_engine(self.uri, echo=echo)
        self.engine.logger.level = self.logger.level
        self.session = self.connect()
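
A minimal usage sketch for the constructor above, assuming the full Manager class (including its connect method) is in scope:

m = Manager()                       # defaults to an in-memory sqlite database
m_file = Manager(dbfile='data.db')  # or back it with a file: sqlite:///data.db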
Example #3
def cli(ctx, **kwargs):
    dbfile = kwargs['dbfile']
    dirname = os.path.dirname(dbfile)
    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)

    ctx.ensure_object(dict)

    ctx.obj['manager'] = Manager(dbfile=dbfile, echo=kwargs['echo'])
    ctx.obj['logger'] = SimpleLogger('OMIM-CLI')
    ctx.obj['entry'] = Entry(omim_url=kwargs['url'])
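
The callback stores shared objects on ctx.obj so that sub-commands can reuse them. A minimal sketch of the consuming side, assuming cli is decorated as a click group (its decorators are not shown above); the sub-command name here is hypothetical:

@cli.command()
@click.pass_context
def info(ctx):
    # objects placed on ctx.obj by the group callback above
    manager = ctx.obj['manager']
    ctx.obj['logger'].info(f'using database: {manager.dbfile}')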
Example #4
def main(**kwargs):
    start_time = time.time()

    logger = SimpleLogger('MAIN')
    logger.info(f'input arguments: {kwargs}')

    year = kwargs['year']
    end = kwargs['end'] or year
    code = kwargs['code']
    subcategory = kwargs['subcategory']
    level = int(kwargs['level']) if kwargs['level'] else None
    count = kwargs['count']
    letpub = LetPub(logger=logger)

    outdir = kwargs['outdir']
    outfile = os.path.join(outdir, kwargs['outfile'])
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    if kwargs['list']:
        code_list = letpub.code_list
        print(code_list.get(code))
        exit(0)

    try:
        with open(outfile, 'w') as out:
            for context in letpub.search(code,
                                         startTime=year,
                                         endTime=end,
                                         subcategory=subcategory,
                                         level=level,
                                         count=count):
                if not count:
                    line = json.dumps(context, ensure_ascii=False) + '\n'
                    out.write(line)
        if not count:
            logger.info(f'save file: {outfile}')
    except KeyboardInterrupt:
        os.remove(outfile)

    elapsed = time.time() - start_time
    logger.info(f'elapsed time: {elapsed:.2f}s')
Example #5
class OMIM(object):
    def __init__(self, omim_url='https://mirror.omim.org'):
        self.omim_url = omim_url
        self.logger = SimpleLogger('OMIM')

    def get_soup(self, url):
        soup = WR.get_soup(url)
        return soup

    def get_mim2gene(self, outfile=None):
        url = self.omim_url + '/static/omim/data/mim2gene.txt'
        resp = WR.get_response(url, stream=True)
        if outfile:
            with open(outfile, 'wb') as out:
                for chunk in resp.iter_content(chunk_size=512):
                    out.write(chunk)
            self.logger.debug(f'save file: {outfile}')
        else:
            return resp.text

    def parse_mim2gene(self, mim2gene=None, mim_types=('gene', 'gene/phenotype')):
        if mim2gene and os.path.isfile(mim2gene):
            self.logger.debug(f'parsing mim2gene from file: {mim2gene} ...')
            text = open(mim2gene).read().strip()
        else:
            self.logger.debug('parsing mim2gene from website ...')
            text = self.get_mim2gene()

        fields = 'mim_number mim_type entrez_gene_id hgnc_gene_symbol ensembl_gene_id'.split()
        for line in text.split('\n'):
            if line.startswith('# Generated:'):
                generated = line.split(': ')[-1]
                continue
            elif line.startswith('#') or not line.strip():
                continue
            linelist = line.split('\t')
            context = dict(zip(fields, linelist))

            if mim_types and context['mim_type'] not in mim_types:
                continue

            context['generated'] = date_parse(generated)
            yield context['mim_number'], context
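
parse_mim2gene yields (mim_number, context) pairs, so the whole mapping can be materialized with dict(). A minimal sketch, assuming the OMIM class above is in scope:

omim = OMIM()
# falls back to downloading from the website when the file is missing
mim2gene = dict(omim.parse_mim2gene('mim2gene.txt'))
print(len(mim2gene))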
Example #6
class Eutils(object):
    """
        params:
            db          database name
            api_key     api_key or NCBI_API_KEY in environment

        optional params:
            term        term for esearch
            id          id(s) for efetch
            field       field for esearch
    """
    base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
    logger = SimpleLogger('Eutils')
    IF = ImpactFactor()

    def __init__(self,
                 db='pubmed',
                 service_url='translate.google.cn',
                 api_key=None,
                 **kwargs):
        self.db = db
        self.api_key = api_key
        self.validate_api_key()
        self.TR = GoogleTrans(service_url=service_url)

    def parse_params(self, **kwargs):
        """
            - add default db
            - add api_key if available
        """
        params = {'db': self.db}
        if self.api_key:
            params['api_key'] = self.api_key
        params.update(kwargs)

        if 'api_key' in params and params['api_key'] is None:
            del params['api_key']

        return params

    def esearch(self,
                term,
                retstart=0,
                retmax=250,
                head=False,
                limit=None,
                **kwargs):
        """
            https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch

            > -search from a database from given term
            >> - esearch.cgi?db=pubmed&term=ngs
            >> - esearch.cgi?db=pubmed&term=ngs&retmode=xml&field=TIAB
            >> - esearch.cgi?db=pubmed&term=ngs[Title/Abstract]&retmode=xml
        """
        url = self.base_url + 'esearch.fcgi'
        params = self.parse_params(term=term,
                                   retmode='json',
                                   retstart=retstart,
                                   retmax=retmax,
                                   **kwargs)

        # print(params)

        result = WebRequest.get_response(url,
                                         params=params).json()['esearchresult']

        if head:
            return result

        self.logger.info(
            '{count} articles found with term: {querytranslation}'.format(
                **result))

        if limit is None and int(result['count']) > 250:
            self.logger.warning(
                'too many results, you can limit output with option "-l/--limit N", '
                'or simplify your input with sub-command "advance-search" ')
            exit(1)

        idlist = result['idlist']

        while int(result['retstart']) + int(result['retmax']) < int(
                result['count']):
            if limit and len(idlist) >= limit:
                break
            retstart = int(result['retstart']) + int(result['retmax'])
            params = self.parse_params(term=term,
                                       retmode='json',
                                       retstart=retstart,
                                       retmax=retmax,
                                       **kwargs)
            result = WebRequest.get_response(
                url, params=params).json()['esearchresult']
            idlist += result['idlist']

        if limit:
            self.logger.info('limit {} from {}'.format(limit, result['count']))
            idlist = idlist[:limit]

        if idlist:
            self.logger.debug('idlist: {} ...'.format(', '.join(idlist[:10])))
        else:
            self.logger.warning('no result for term: {}'.format(term))

        return idlist

    def efetch(self, ids, batch_size=5, **kwargs):
        """
            https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch

            > - fetch from a database for given ids
            >> efetch.cgi?db=pubmed&id=1,2,3
        """
        url = self.base_url + 'efetch.fcgi'

        self.logger.info('fetching start: total {}, batch_size: {}'.format(
            len(ids), batch_size))

        for n in range(0, len(ids), batch_size):
            _id = ','.join(ids[n:n + batch_size])

            self.logger.debug(f'fetching xml: {n+1} - {n+batch_size}')
            params = self.parse_params(id=_id, retmode='xml')
            xml = WebRequest.get_response(url, params=params).text

            self.logger.debug(f'parsing xml: {n+1} - {n+batch_size}')
            for context in xml_parser.parse(xml):
                article = Article(**context)
                yield article

    def einfo(self, **kwargs):
        """
            https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EInfo

            > - show all database list
            >> einfo.fcgi?db=
            > - show dbinfo for given database
            >> einfo.fcgi?db=pubmed
        """
        url = self.base_url + 'einfo.fcgi'
        params = self.parse_params(retmode='json', **kwargs)
        info = WebRequest.get_response(url,
                                       params=params,
                                       allowed_codes=[200, 400]).json()
        return info

    def elink(self, ids, dbfrom='pubmed', cmd='neighbor', **kwargs):
        """
            https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ELink

            > - get cited (`{"linkname": "pubmed_pubmed_citedin"}`)
            >> elink.fcgi?dbfrom=pubmed&db=pubmed&id=20210808&cmd=neighbor&retmode=json
            > - get pdf url
            >> elink.fcgi?dbfrom=pubmed&db=pubmed&cmd=prlinks&id=10210801

            cmds:
                - neighbor (default)
                - neighbor_score
                - neighbor_history
                - acheck
                - ncheck
                - lcheck
                - llinks
                - llinkslib
                - prlinks
        """
        url = self.base_url + 'elink.fcgi'
        params = self.parse_params(retmode='json',
                                   id=ids,
                                   dbfrom=dbfrom,
                                   cmd=cmd,
                                   **kwargs)
        result = WebRequest.get_response(url, params=params).json()
        return result

    def get_cited(self, _id, dbfrom='pubmed', cmd='neighbor'):
        """
            get the cited pmids for given pmid
        """
        linksetdbs = self.elink(_id, dbfrom=dbfrom,
                                cmd=cmd)['linksets'][0]['linksetdbs']
        links = [
            linkset['links'] for linkset in linksetdbs
            if linkset['linkname'] == 'pubmed_pubmed_citedin'
        ]

        citedin = links[0] if links else []
        return {'count': len(citedin), 'links': citedin}

    def get_pdf_url(self, _id, dbfrom='pubmed', cmd='prlinks'):
        """
            get the pdf url for given pmid
        """
        idurllist = self.elink(_id, dbfrom=dbfrom,
                               cmd=cmd)['linksets'][0]['idurllist']

        result = {
            each['id']: each['objurls'][0]['url']['value']
            for each in idurllist
        }

        return result

    def validate_fields(self, **kwargs):
        """
            Show all fields for given database
        """
        fieldlist = self.einfo(
            **kwargs)['einforesult']['dbinfo'][0]['fieldlist']

        fieldlist = sorted(fieldlist, key=lambda x: x.get('fullname'))

        fields = {}
        table = prettytable.PrettyTable(
            field_names=['Number', 'Name', 'FullName', 'Description'])
        for n, each in enumerate(fieldlist, 1):
            # print('{n}\t{name}\t{fullname}\t{description}'.format(n=n, **each))
            fields[n] = [each['name'], each['fullname'], each['description']]
            table.add_row(
                [n, each['name'], each['fullname'], each['description']])

        table.align['FullName'] = 'l'
        table.align['Description'] = 'l'
        click.secho(str(table), fg='bright_blue')

        return fields

    def validate_api_key(self):
        """
            Description: https://www.ncbi.nlm.nih.gov/account/settings/#accountSettingsApiKeyManagement
                - E-utils users are allowed 3 requests/second without an API key.
                - Create an API key to increase your e-utils limit to 10 requests/second.
        """
        configfile = os.path.join(os.path.expanduser('~'), '.pypubmed.cfg')

        if not self.api_key:
            # documented fallbacks: NCBI_API_KEY from the environment, then the configfile
            self.api_key = os.environ.get('NCBI_API_KEY')
            if not self.api_key and os.path.isfile(configfile):
                self.api_key = open(configfile).read().strip()
            if not self.api_key:
                msg = textwrap.dedent('''
                    API_KEY not found! Using eutils with an api_key is a good idea!
                    You can create one from: https://www.ncbi.nlm.nih.gov/account/settings/#accountSettingsApiKeyManagement
                    If you already have one, you can add it to your environment: export NCBI_API_KEY=xxxx'''
                                      )
                self.logger.warning(msg)
                return

        res = self.einfo()
        if 'error' in res:
            self.logger.warning('invalid api_key, please check: {}'.format(
                self.api_key))
            self.api_key = None
        else:
            self.logger.info('Valid api_key: {}'.format(self.api_key))
            with open(configfile, 'w') as out:
                out.write(self.api_key)

    def search(self,
               term,
               cited=True,
               translate=True,
               impact_factor=True,
               translate_cache=None,
               **kwargs):
        """
            term:
                - string, eg. 'ngs AND disease'
                - pmid, eg. 1,2,3
                - file with pmid
        """
        if os.path.isfile(term):
            idlist = open(term).read().strip().split()
        elif all(each.isdigit() for each in term.split(',')):
            idlist = term.split(',')
        else:
            idlist = self.esearch(term, **kwargs)

        # print(kwargs);exit()

        articles = self.efetch(idlist, **kwargs)
        for article in articles:
            if impact_factor:
                res = self.IF.search(article.issn) or self.IF.search(
                    article.e_issn)
                article.impact_factor = res['factor'] if res else '.'

            if cited:
                article.cited = self.get_cited(article.pmid)

            if translate:
                if translate_cache and translate_cache.get(article.pmid):
                    article.abstract_cn = translate_cache.get(article.pmid)
                else:
                    article.abstract_cn = 'translate failed'
                    n = 0
                    while n < 5:
                        n += 1
                        try:
                            article.abstract_cn = self.TR.translate(
                                article.abstract)
                            break
                        except Exception as e:
                            print(e)
                            time.sleep(3)
            yield article
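
A typical round trip with the class above: esearch collects PMIDs, then efetch (a generator) streams Article objects in batches. A minimal sketch, assuming Eutils and its dependencies are in scope:

eutils = Eutils(db='pubmed')
idlist = eutils.esearch('ngs AND cancer', limit=5)
for article in eutils.efetch(idlist, batch_size=5):
    print(article.pmid)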
Example #7
class PassConfig(object):
    """
    >>> from passconfig import PassConfig
    >>> pc = PassConfig()
    >>> username, password = pc.get()
    >>> if True:
    ...     pc.save()
    """
    logger = SimpleLogger('PassConfig')

    def __init__(self,
                 username=None,
                 password=None,
                 configfile=CONFIG_DEFAULT,
                 section='common'):
        self.username = username
        self.password = password
        self.configfile = configfile
        self.section = section

        self.conf = ConfigParser()
        if configfile:
            self.conf.read(configfile)

    def get(self):
        """
            Return username, password
        """
        if self.username and self.password:
            username, password = self.username, self.password
        elif os.path.exists(self.configfile):
            username, password = self.read()
        else:
            username, password = None, None

        if not all([username, password]):
            click.secho('please input your username and password', fg='yellow')
            username = username or click.prompt('>>> username')
            password = click.prompt('>>> password',
                                    hide_input=True,
                                    confirmation_prompt=True)

        self.username = username
        self.password = password
        return username, password

    def read(self):
        """
            Read username and password from configfile
        """
        password = None
        username = self.username

        if not self.conf.has_section(self.section):
            pass
        elif username and self.conf.has_option(self.section, self.username):
            password = self.conf.get(self.section, self.username)
        elif not self.username and len(self.conf.options(self.section)) == 1:
            username = self.conf.options(self.section)[0]
            password = self.conf.get(self.section, username)

        return username, password

    @staticmethod
    def safe_open(filename, mode='r'):
        """
            Make directory firstly if not exists
        """
        if 'w' in mode:
            dirname = os.path.dirname(filename)
            if dirname and not os.path.exists(dirname):
                os.makedirs(dirname)
        return open(filename, mode)

    def save(self):
        """
            Save username and password to configfile
        """
        if not self.conf.has_section(self.section):
            self.conf.add_section(self.section)
        self.conf.set(self.section, self.username, self.password)

        with self.safe_open(self.configfile, 'w') as out:
            self.conf.write(out)
            self.logger.info('save username and password to file: {}'.format(
                self.configfile))
Example #8
class Official(object):
    base_url = 'http://output.nsfc.gov.cn'
    logger = SimpleLogger('Official')

    field_codes = WR.get_response(base_url +
                                  '/common/data/fieldCode').json()['data']

    @classmethod
    def get_field_codes(cls):
        """
            All discipline (field) codes
        """
        url = cls.base_url + '/common/data/fieldCode'
        print(url)
        return WR.get_response(url).json()['data']

    @classmethod
    def list_root_codes(cls):
        """
            Get all top-level discipline category codes
        """
        root_codes = {}
        for context in cls.field_codes:
            if len(context['code']) == 1:
                root_codes[context['code']] = context['name']
        return root_codes

    @classmethod
    def list_child_codes(cls, keys):
        """
            Get the lowest-level (leaf) discipline codes
                C01  -->  C010101, C010102, ...
                H10  -->  H1001, H1002, ...
        """
        child_codes = {}
        for key in keys.split(','):
            for context in cls.field_codes:
                code = context['code']
                if len(code) == 1:
                    continue
                if code.startswith(key):
                    child_codes[code] = context['name']
                    if code[:-2] in child_codes:
                        del child_codes[code[:-2]]
        return child_codes

    @classmethod
    def get_conclusion_data(cls, ratify_number, detail=True):
        """
            Get the conclusion data for a given project ratify number
        """
        url = cls.base_url + '/baseQuery/data/conclusionQueryResultsData'
        payload = {
            'ratifyNo': ratify_number,
            'queryType': 'input',
            'complete': 'true',
        }
        result = WR.get_response(url, method='POST',
                                 json=payload).json()['data']['resultsData']
        data = {}
        if result:
            data['projectid'] = result[0][0]
            data['project_type'] = result[0][3]
            data['result_stat'] = result[0][10]

        if detail and data.get('projectid'):
            detail_data = cls.get_detail_data(data['projectid'])
            data.update(detail_data)
        return data

    @classmethod
    def get_detail_data(cls, projectid):
        url = cls.base_url + '/baseQuery/data/conclusionProjectInfo/' + projectid
        data = WR.get_response(url).json()['data']
        return data

    @classmethod
    def get_conclusion_report(cls,
                              ratify_number,
                              tmpdir='tmp',
                              pdf=True,
                              outfile=None):
        data = cls.get_conclusion_data(ratify_number, detail=False)
        if not data:
            cls.logger.warning(f'no conclusion result for: {ratify_number}')
            return

        images = list(cls.get_conclusion_report_images(data['projectid']))

        if not os.path.exists(tmpdir):
            os.makedirs(tmpdir)

        pngs = []
        for n, url in enumerate(images, 1):
            name = os.path.basename(url)
            png = f'{tmpdir}/{name}.png'
            pngs.append(png)
            cls.logger.debug(
                f'[{n}/{len(images)}] download png: {url} => {png}')

            resp = WR.get_response(url, stream=True)
            with open(png, 'wb') as out:
                for chunk in resp.iter_content(chunk_size=512):
                    out.write(chunk)
            cls.logger.debug(f'save png: {png}')

        if pdf:
            cls.logger.debug('converting *png to pdf')
            outfile = outfile or f'{ratify_number}.pdf'
            with open(outfile, 'wb') as out:
                out.write(img2pdf.convert(pngs))

            size = human_readable.file_size(os.stat(outfile).st_size)
            cls.logger.info(f'save pdf: {outfile} [{size}]')
        return True

    @classmethod
    def get_conclusion_report_images(cls, projectid):
        url = cls.base_url + '/baseQuery/data/completeProjectReport'
        index = 1
        while True:
            payload = {'id': projectid, 'index': index}
            res = WR.get_response(url, method='POST',
                                  data=payload).json()['data']
            if not res['hasnext']:
                break
            yield cls.base_url + res['url']
            index += 1
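
The classmethods above work without instantiation. A minimal sketch (the ratify number is a hypothetical placeholder):

print(Official.list_root_codes())        # one-letter root codes mapped to names
print(Official.list_child_codes('C01'))  # leaf codes under C01
data = Official.get_conclusion_data('12345678')  # hypothetical ratify number
print(data.get('project_type'))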
Example #9
class Export(object):

    logger = SimpleLogger('Export')

    def __init__(self, data, outfile='out.xlsx', outtype=None, fields=None, fillna='.', **kwargs):
        self.outfile = outfile
        self.outtype = outtype or outfile.split('.')[-1]
        # self.data = list(self.filter_data(data, fields)) if fields else data
        self.data = list(self.reformat_data(data, fields=fields, fillna=fillna))

    def reformat_data(self, data, fields, fillna):

        if fields:
            field_list = fields.strip(',').split(',')
        else:
            field_list = data[0].keys()

        for context in data:
            out_ctx = {k: v or fillna for k, v in context.items() if k in field_list}
            yield out_ctx

    def filter_data(self, data, fields):
        fields = fields.strip(',').split(',')
        for context in data:
            out_ctx = {k: v for k, v in context.items() if k in fields}
            yield out_ctx

    def export(self):
        if self.outtype == 'xlsx':
            self.export_xlsx()
        elif self.outtype == 'json':
            self.export_json()
        elif self.outtype in ('jl', 'jsonlines'):
            self.export_json_lines()
        else:
            self.logger.error('outtype is invalid, please check!')
            exit(1)
        
        self.logger.info('save file: {}'.format(self.outfile))

    def write_title(self, sheet, titles, fg_color=colors.BLACK, bg_color=colors.WHITE, border=True, bold=True, width=18, size=12):
        for col, value in enumerate(titles, 1):
            w = width * 4 if value in ('abstract', 'abstract_cn') else width

            sheet.column_dimensions[get_column_letter(col)].width = w
            _ = sheet.cell(1, col, value=value)

            # set cell styles
            _.alignment = Alignment(horizontal='left', vertical='center', wrap_text=True)
            _.fill = PatternFill(start_color=bg_color, end_color=bg_color, fill_type="solid")
            _.font = Font(bold=bold, color=fg_color, size=size)

    def add_hyperlink(self, key, value):
        """
            method1:
                value = '=HYPERLINK("http://www.baidu.com", "baidu")'
                sheet.cell(row, column, value=value)

            method2:
                _ = sheet.cell(row, column, value='baidu')
                _.hyperlink = 'http://www.baidu.com'
        """
        if key == 'pmid':
            url = 'https://pubmed.ncbi.nlm.nih.gov/{value}/'
        elif key == 'pmc':
            url = 'https://www.ncbi.nlm.nih.gov/pmc/articles/{value}/'
        elif key == 'doi':
            url = 'https://doi.org/{value}'
        else:
            return None

        url = url.format(**locals())

        return url
    
    @property
    def all_fields(self):
        return '''
            pmid title abstract abstract_cn impact_factor journal med_abbr iso_abbr pubdate pubmed_pubdate
            pmc issn e_issn doi year pagination volume issue   
            pub_status authors keywords pub_types cited
        '''.split()

    def sort(self, item):
        if type(item) == tuple:
            k = item[0]
        else:
            k = item

        if k in self.all_fields:
            return self.all_fields.index(k)
        else:
            return 9999

    def export_xlsx(self, sheet_title='Result', freeze_panes='B1'):
        """
            PatternFill:
            - https://openpyxl.readthedocs.io/en/latest/api/openpyxl.styles.fills.html?highlight=PatternFill
        """
        book = openpyxl.Workbook()
        sheet = book.active

        # # freeze the first column
        # sheet.freeze_panes = 'B1'

        # # freeze the first row
        # sheet.freeze_panes = 'A2'

        # freeze the first column and the first row
        sheet.freeze_panes = 'B2'

        sheet.title = sheet_title

        titles = sorted(list(self.data[0].keys()), key=self.sort)
        self.write_title(sheet, titles)
        
        for row, context in enumerate(self.data, 2):

            color = '00b3ffb3' if row % 2 else '00b3ffff'
            for col, (key, value) in enumerate(sorted(list(context.items()), key=self.sort), 1):
                if type(value) == list:
                    try:
                        value = ', '.join(value)
                    except TypeError:
                        try:
                            value = json.dumps(value, ensure_ascii=False)
                        except Exception:
                            value = str(value)
                elif type(value) == dict:
                    value = json.dumps(value, ensure_ascii=False)

                try:
                    _ = sheet.cell(row, col, value=value)
                except Exception:
                    _ = sheet.cell(row, col, value=str(value))

                _.fill = PatternFill(start_color=color, end_color=color, fill_type="solid")

                if key in ('pmid', 'pmc', 'doi') and value not in ('.', None):
                    _.hyperlink = self.add_hyperlink(key, value)
                    _.font = Font(color=colors.BLUE, italic=True)

                wrap_text = None
                if key in ('abstract', 'abstract_cn'):
                    wrap_text = True
                
                _.alignment = Alignment(horizontal='left',vertical='center',wrap_text=wrap_text)

        book.save(self.outfile)

    def export_json(self, **kwargs):
        with safe_open(self.outfile, 'w') as out:
            json.dump(self.data, out, ensure_ascii=False, **kwargs)

    def export_json_lines(self):
        with safe_open(self.outfile, 'w') as out:
            for context in self.data:
                out.write(json.dumps(context, ensure_ascii=False) + '\n')
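
A minimal sketch of the Export flow above: reformat_data keeps only the requested fields and fills empty values, then export dispatches on the output type:

data = [
    {'pmid': '1', 'title': 'a title', 'abstract': None},
    {'pmid': '2', 'title': 'another title', 'abstract': 'some text'},
]
Export(data, outfile='out.xlsx', fields='pmid,title,abstract').export()
Export(data, outfile='out.jl').export()  # outtype inferred from the suffix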
Example #10
    def __init__(self, omim_url='https://mirror.omim.org'):
        self.omim_url = omim_url
        self.logger = SimpleLogger('OMIM')
Example #11
    def __init__(self, logger=None):
        self.logger = logger or SimpleLogger('LetPub')
        self.subcategory_list = self.list_support_types()
        self.province_list = self.list_provinces()
        self.code_list = self.list_codes()
Example #12
class ExcelParser(object):
    logger = SimpleLogger('ExcelParser')
    def __init__(self):
        pass

    def parse(self, filename, data_only=False, read_only=False, sheet_idx=None, choose_one=False, skip=None, limit=None, **kwargs):
        """
            data_only=True: get the value instead of formula when data_type is 'f'
            read_only=True: to deal with large file, some attributes might lost
        """
        wb = openpyxl.load_workbook(filename, data_only=data_only, read_only=read_only)

        sheets = wb.worksheets
        if len(wb.sheetnames) > 1:
            if sheet_idx is not None:
                sheets = [wb.worksheets[sheet_idx]]
            elif choose_one:
                sheets = self.choose_sheet(wb)

        return self.get_data(sheets, skip=skip, limit=limit)

    def choose_sheet(self, workbook):
        context = dict(enumerate(workbook.sheetnames))
        click.secho('{}'.format(json.dumps(context, indent=2, ensure_ascii=False)), err=True, fg='bright_green')

        while True:
            idxes = click.prompt('please choose one or more sheets, separate by comma', err=True)
            choices = [int(idx) for idx in idxes.split(',') if idx.strip().isdigit()]
            if choices and all(idx in context for idx in choices):
                return [workbook.worksheets[idx] for idx in choices]
            # re-prompt on any invalid index
            self.logger.warning('bad choice, choose from: {}'.format(list(context.keys())))

    def get_data(self, worksheets, skip=None, limit=None, fillna=''):
        data = OrderedDict()
        for ws in worksheets:
            data[ws.title] = []
            for n, row in enumerate(ws.rows):
                if skip and n < skip:
                    continue
                if limit and len(data[ws.title]) >= limit:
                    break

                line = [
                    cell.value.strftime('%Y-%m-%d')
                    if cell.data_type == 'd'
                    else fillna if cell.value is None else cell.value
                    for cell in row
                ]
                data[ws.title].append(line)

        return data

    def export(self, data, outfile=None, fmt='table', indent=None, sep='\t', header=True, index=True, color=None, pager=False):
        """"export data

        parameters
            data: data return by get_data method
            outfile: output file, default stdout
            fmt: 'table', 'html', 'tsv' or 'json'
            indent: for json fmt export
            sep: for tsv fmt export
        """
        out = open(outfile, 'w') if outfile else sys.stdout
        with out:
            for sheet, rows in data.items():
                click.secho('>>> {}'.format(sheet), err=True, fg='yellow')
                fd = Formatter(rows, header=header)
                if fmt == 'table':
                    res = fd.to_table(index=index).get_string()
                elif fmt == 'html':
                    res = fd.to_table(index=index).get_html_string()
                elif fmt == 'json':
                    res = fd.to_json(indent=indent)
                elif fmt == 'tsv':
                    res = fd.to_tsv(sep=sep)
                else:
                    exit('bad format, choose from table, html, json, tsv')

                if color:
                    res = click.style(res, fg=color)

                if pager:
                    click.echo_via_pager(res, color=False)
                else:
                    out.write(res + '\n')
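
Parsing and re-exporting with the class above, a minimal sketch (the input file name is a placeholder):

parser = ExcelParser()
data = parser.parse('input.xlsx', sheet_idx=0)
parser.export(data, fmt='json', indent=2)  # defaults to stdout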
Example #13
class GoogleTrans(object):
    """
    >>> from simple_googletrans import GoogleTrans
    >>> t = GoogleTrans()
    >>> t.translate('hello world')
    >>>
    >>> t.show_languages()
    """
    logger = SimpleLogger('GoogleTrans')

    def __init__(self, service_url=None, proxies=None, timeout=None):
        service_urls = [service_url] if service_url else None
        self.translator = googletrans.Translator(service_urls=service_urls,
                                                 proxies=proxies,
                                                 timeout=timeout)
        self.nltk_checked = False

    def translate(self, text, dest='zh-cn', **kwargs):
        if os.path.isfile(text):
            text = open(text).read()

        texts = self.split_text(text)

        result = []
        for text in texts:
            text = self.translator.translate(text, dest=dest, **kwargs).text
            result.append(text)
        result = ''.join(result)

        return result

    def check_nltk(self):
        """
            - download from Interactive Python
            >>> import nltk
            >>> nltk.download('punkt')

            - download from command line
            $ python -m nltk.downloader punkt

            - more: http://www.nltk.org/data.html
        """
        try:
            import nltk
        except SyntaxError:
            self.logger.warning(
                'nltk is not available for Python2, use Python3 please.')
            exit(1)

        try:
            nltk.sent_tokenize('hello world')
            self.logger.info('nltk is ok!')
        except Exception:
            self.logger.warning('nltk_data not found! downloading start ...')
            try:
                nltk.download('punkt')
                self.nltk_checked = True
            except Exception:
                self.logger.error(
                    'nltk_data download failed! you can also try: python -m nltk.downloader punkt'
                )
                exit(1)

    def split_text(self, text, max_len=5000):
        """
            googletrans limits 5000 characters

            split text with nltk.sent_tokenize

            >>> nltk.sent_tokenize('hello world!')
        """
        if len(text) <= max_len:
            return [text]

        if not self.nltk_checked:
            self.check_nltk()

        import nltk  # local import, mirroring check_nltk above

        self.logger.info('split text with nltk')

        texts = []
        for sent in nltk.sent_tokenize(text):
            if (not texts) or (len(texts[-1]) + len(sent) > max_len):
                texts.append(sent)
            else:
                texts[-1] += ' ' + sent
        return texts

    def show_languages(self):
        data = googletrans.LANGCODES
        table = prettytable.PrettyTable(['Index', 'Abbr', 'Language'])
        for n, (lang, abbr) in enumerate(sorted(data.items(), key=lambda x: x[1]), 1):
            table.add_row([n, abbr, lang])
        table.align['Abbr'] = 'l'
        table.align['Language'] = 'l'
        click.secho(str(table), fg='cyan')
Example #14
class ExcelConcat(object):
    logger = SimpleLogger('ExcelConcat')

    def __init__(self):
        self.wb = openpyxl.Workbook()
        self.wb.remove(self.wb.active)  # remove default 'Sheet'

    def concat(self,
               infiles,
               keep_fmt=False,
               keep_size=True,
               merge_cells=True,
               keep_other=True):
        for infile in infiles:
            self.logger.debug('loading file: {}'.format(infile))
            in_book = openpyxl.load_workbook(infile)
            for sheetname in in_book.sheetnames:
                sheet = in_book[sheetname]
                self.logger.debug(
                    'copy sheet: {} [{} rows, {} columns]'.format(
                        sheetname, sheet.max_row, sheet.max_column))
                ws = self.wb.create_sheet(sheetname)
                for row in sheet.rows:
                    for cell in row:
                        ws[cell.coordinate] = cell.value
                        if keep_fmt and cell.has_style:  # might be slow for big file
                            self.copy_format(cell, ws[cell.coordinate])
                if keep_size:
                    self.copy_size(sheet, ws)
                if merge_cells:
                    self.merge_cells(sheet, ws)

                if keep_other:
                    self.copy_other(sheet, ws)

    def copy_format(self, origin_cell, target_cell):
        """
            copy style for each cell
        """
        fmt_list = ('alignment', 'font', 'fill', 'border', 'comment',
                    'hyperlink', 'data_type', 'number_format')
        for fmt in fmt_list:
            value = getattr(origin_cell, fmt)
            if not value:
                continue
            if isinstance(value, openpyxl.styles.proxy.StyleProxy):
                value = value.copy()
            setattr(target_cell, fmt, value)

    def copy_size(self, origin_sheet, target_sheet):
        """
            copy width for columns and height for rows
        """
        self.logger.debug('copy height and width for sheet: {}'.format(
            origin_sheet.title))
        for column in range(1, origin_sheet.max_column + 1):
            letter = openpyxl.utils.get_column_letter(column)
            target_sheet.column_dimensions[letter].width = origin_sheet.column_dimensions[letter].width

        for row in range(1, origin_sheet.max_row + 1):
            target_sheet.row_dimensions[
                row].height = origin_sheet.row_dimensions[row].height

    def merge_cells(self, origin_sheet, target_sheet):
        """
            copy merged cells
        """
        self.logger.debug('merge cells for sheet: {}'.format(
            origin_sheet.title))
        for ranges in origin_sheet.merged_cell_ranges:
            target_sheet.merge_cells(ranges.coord)

    def copy_image(self, origin_sheet, target_sheet):
        self.logger.debug('copy images for sheet: {}'.format(
            origin_sheet.title))
        for im in origin_sheet._images:
            target_sheet.add_image(im)

    def copy_other(self, origin_sheet, target_sheet):
        for other in ('image', 'table', 'chart', 'pivot'):
            items = getattr(origin_sheet, '_{}s'.format(other))

            if other == 'table':  # dict for table
                items = items.values()

            if items:
                self.logger.debug('copy {} for sheet: {}'.format(
                    other, origin_sheet.title))
                for item in items:
                    getattr(target_sheet, 'add_{}'.format(other))(item)

        if origin_sheet.data_validations.dataValidation:
            self.logger.debug('copy data_validations for sheet: {}'.format(
                origin_sheet.title))
            for data_validation in origin_sheet.data_validations.dataValidation:
                target_sheet.add_data_validation(data_validation)

    def save(self, outfile):
        self.wb.save(outfile)
        self.logger.info('save file: {}'.format(outfile))
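
A minimal sketch of the concat flow above (the file names are placeholders):

ec = ExcelConcat()
ec.concat(['part1.xlsx', 'part2.xlsx'])  # keep_fmt=True also copies styles, but is slower
ec.save('merged.xlsx')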
Example #15
def main(**kwargs):

    logger = SimpleLogger('STATS')
    logger.level = logger.level_maps[kwargs['log_level']]

    logger.info(f'input arguments: {kwargs}')

    dbfile = kwargs['dbfile']
    limit = kwargs['limit']
    outfile = kwargs['outfile']

    if kwargs['keys']:
        table = PrettyTable(['Key', 'Comment', 'Type'])
        for k, v in Project.metadata.tables['project'].columns.items():
            table.add_row([k, v.comment, v.type])
        for field in table._field_names:
            table.align[field] = 'l'
        print(click.style(str(table), fg='cyan'))
        exit(0)

    if not os.path.isfile(dbfile):
        logger.error(f'dbfile does not exist! [{dbfile}]')
        baidu = version_info['baidu_data']
        logger.info(f'you can download the required data from Baidu Netdisk: {baidu}\n'
                    f'after downloading, point to the database file with -d, or copy it to: {DEFAULT_DB}')
        exit(1)

    uri = f'sqlite:///{dbfile}'
    with Manager(uri=uri, echo=False, logger=logger) as m:

        query = m.session.query(Project)

        if kwargs['search']:
            for key, value in kwargs['search']:
                if '%' in value:
                    query = query.filter(Project.__dict__[key].like(value))
                elif key in ('approval_year', ) and not value.isdigit():
                    if '-' in value:
                        min_value, max_value = value.split('-')
                        query = query.filter(
                            Project.__dict__[key] >= min_value)
                        query = query.filter(
                            Project.__dict__[key] <= max_value)
                    else:
                        logger.error(f'bad approval_year: {value}')
                        exit(1)
                else:
                    query = query.filter(Project.__dict__[key] == value)

        if limit:
            query = query.limit(limit)

        logger.debug(str(query))

        if kwargs['count']:
            logger.info(f'count: {query.count()}')
        elif not query.count():
            logger.warning('no result for your input')
        else:
            if outfile and kwargs['format'] == 'xlsx':
                wb = openpyxl.Workbook()
                ws = wb.active
                ws.title = 'NSFC-RESULT'
                title = [
                    k for k, v in query.first().__dict__.items()
                    if k != '_sa_instance_state'
                ]
                ws.append(title)
                for col, v in enumerate(title, 1):
                    _ = ws.cell(row=1, column=col, value=v)
                    _.font = Font(color='FFFFFF', bold=True)
                    _.fill = PatternFill(start_color='000000',
                                         end_color='000000',
                                         fill_type='solid')

                for n, row in enumerate(query):
                    context = [
                        v for k, v in row.__dict__.items()
                        if k != '_sa_instance_state'
                    ]
                    ws.append(context)

                ws.freeze_panes = 'A2'
                wb.save(outfile)
            else:
                out = open(outfile, 'w') if outfile else sys.stdout
                with out:
                    if kwargs['format'] == 'json':
                        data = [{
                            k: v
                            for k, v in row.__dict__.items()
                            if k != '_sa_instance_state'
                        } for row in query]
                        out.write(
                            json.dumps(data, ensure_ascii=False, indent=2) +
                            '\n')
                    else:
                        for n, row in enumerate(query):
                            context = {
                                k: v
                                for k, v in row.__dict__.items()
                                if k != '_sa_instance_state'
                            }
                            if n == 0 and kwargs['format'] == 'tsv':
                                title = '\t'.join(context.keys())
                                out.write(title + '\n')
                            if kwargs['format'] == 'tsv':
                                line = '\t'.join(map(str, context.values()))
                            else:
                                line = json.dumps(context, ensure_ascii=False)
                            out.write(line + '\n')
            if outfile:
                logger.info(f'save file: {outfile}')
Example #16
              default='jl',
              show_choices=True,
              show_default=True)
@click.option('-K',
              '--keys',
              help='list the available keys for query',
              is_flag=True)
@click.option('-C',
              '--count',
              help='just output the count of the search results',
              is_flag=True)
@click.option('-L', '--limit', help='limit the number of output records', type=int)
@click.option('-l',
              '--log-level',
              help='the level of logging',
              type=click.Choice(SimpleLogger().level_maps),
              default='info',
              show_choices=True,
              show_default=True)
def main(**kwargs):

    logger = SimpleLogger('STATS')
    logger.level = logger.level_maps[kwargs['log_level']]

    logger.info(f'input arguments: {kwargs}')

    dbfile = kwargs['dbfile']
    limit = kwargs['limit']
    outfile = kwargs['outfile']

    if kwargs['keys']:
Example #17
    def __init__(self, url=None, timeout=10):
        self.timeout = timeout
        self.logger = SimpleLogger('SciHub')
        self.url = url or self.check_url(url)
Example #18
class SciHub(object):
    def __init__(self, url=None, timeout=10):
        self.timeout = timeout
        self.logger = SimpleLogger('SciHub')
        self.url = url or self.check_url(url)

    def check_url(self, url):
        def _check(url):
            self.logger.info(f'checking url: {url} ...')
            try:
                resp = requests.get(url, timeout=self.timeout)
                elapsed = resp.elapsed.total_seconds()

                soup = bs4.BeautifulSoup(resp.text, 'html.parser')

                form = soup.select_one('form[method="POST"]')
                if form:
                    self.logger.info(f'good url: {url} [{elapsed}s]')
                    return elapsed

            except Exception as e:
                self.logger.warning(e)

            self.logger.warning(f'bad url: {url}')
            return None

        self.logger.info('checking url automatically ...')
        hosts, update_time = check_host()
        self.logger.debug(f'update time: {update_time}')

        for host in hosts:
            elapsed = _check(host)
            if elapsed:
                url = host
                break

        if not url:
            self.logger.error('no available url, please use -c to check')
            exit(1)

        self.logger.info(f'use url: {url}')
        return url

    def search(self, term, max_try=3):
        """
            term: URL, PMID, DOI or search string

            return: the url of pdf
        """
        soup = WR.get_soup(self.url)
        form = soup.select_one('form[method="POST"]')
        post_url = self.url if form.attrs['action'] == '/' else form.attrs[
            'action']

        payload = {'sci-hub-plugin-check': '', 'request': term}

        self.logger.debug(f'search pdf url for: {term}')

        while max_try:
            max_try -= 1

            soup = WR.get_soup(post_url,
                               method='POST',
                               data=payload,
                               timeout=self.timeout)

            pdf = soup.select_one('#pdf')

            if 'article not found' in soup.text:
                self.logger.warning(f'article not found [{term}]')
                return
            elif not pdf:
                # print(soup.select('title'))
                continue

            pdf_url = pdf.attrs['src']

            if pdf_url.startswith('//'):
                pdf_url = post_url.split(':')[0] + f':{pdf_url}'

            self.logger.info(f'pdf url: {pdf_url}')
            return pdf_url

        self.logger.error(
            f'your search returned no result, please check! [{term}]')

    def download(self,
                 url,
                 outdir='.',
                 filename=None,
                 chunk_size=512,
                 overwrite=None,
                 show_progress=True):
        """download pdf from url
        """
        filename = filename or os.path.basename(url).split('#')[0]
        if outdir != '.' and not os.path.exists(outdir):
            os.makedirs(outdir)

        outfile = os.path.join(outdir, filename)
        if os.path.isfile(outfile) and os.stat(outfile).st_size > 0:
            if not isinstance(overwrite, bool):
                overwrite = click.confirm(
                    'The file already exists, do you want to overwrite it?')

            if overwrite:
                self.logger.debug(f'overwriting the file: {outfile}')
            else:
                self.logger.debug(f'skip downloading file: {outfile}')
                return True

        resp = WR.get_response(url, stream=True)

        if resp.headers['Content-Type'] != 'application/pdf':
            resp = self.deal_captcha(url, outdir, filename, chunk_size)

        length = int(resp.headers.get('Content-Length', 0))

        # if os.path.isfile(outfile) and os.stat(outfile).st_size == length:

        self.logger.info(
            f'downloading pdf: {outfile} [{length/1024/1024:.2f} M]')

        bar = click.progressbar(length=length,
                                label='downloading',
                                show_percent=True,
                                show_pos=True,
                                show_eta=True)
        with open(outfile, 'wb') as out, bar:
            for chunk in resp.iter_content(chunk_size=chunk_size):
                out.write(chunk)
                if show_progress:
                    bar.update(chunk_size)

        self.logger.info(f'save file: {outfile}')
        return True

    def deal_captcha(self, url, outdir, filename, chunk_size):
        """deal with the captcha
        """
        soup = WR.get_soup(url)
        img = soup.select_one('#captcha').attrs['src']
        img_url = url.rsplit('/', 3)[0] + img
        # print(img_url)

        self.logger.warning(f'need to type in the captcha: {img_url}')
        if not os.getenv('DISPLAY'):
            self.logger.info(
                f'failed to open the picture, you can open it in your browser: {img_url}'
            )
        else:
            content = WR.get_response(img_url, max_try=1).content
            im = Image.open(io.BytesIO(content))

            # im.show()  # this will block the program
            im.save('tmp.png')

            # ****************************
            # ***** non-blocked mode *****
            # ****************************
            pylab.ion()
            img = pylab.imread('tmp.png')

            pylab.imshow(img)
            pylab.show()

        while True:
            answer = click.prompt('please input the captcha')

            if answer == 'new':
                return self.download(url,
                                     outdir=outdir,
                                     filename=filename,
                                     chunk_size=chunk_size)

            payload = {
                'id': img_url.split('/')[-1].split('.')[0],
                'answer': answer
            }

            # payload = {'id': '6058249282282', 'answer': 'manila'}
            self.logger.debug(payload)

            resp = WR.get_response(url,
                                   method='POST',
                                   stream=True,
                                   data=payload)

            if resp.headers['Content-Type'] == 'application/pdf':
                pylab.close()
                return resp

            self.logger.warning('bad captcha, try again!')
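
A minimal sketch of the search-then-download flow above (the DOI is a hypothetical placeholder):

sh = SciHub()                          # picks a reachable mirror via check_url
pdf_url = sh.search('10.1000/xyz123')  # hypothetical DOI
if pdf_url:
    sh.download(pdf_url, outdir='pdf', overwrite=True)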
Example #19
class ExcelBuilder(object):
    logger = SimpleLogger('ExcelBuilder')

    def __init__(self):
        self.wb = openpyxl.Workbook(write_only=False)
        self.wb.remove(self.wb.active)  # remove default 'Sheet'

    def create_sheet(self, title=None):
        """
            require: len(title) <= 31
        """
        if title and len(title) > 31:
            self.logger.warning('title is too long, limited to 31 characters')
            title = title[:31]
        self.logger.info('create sheet: {}'.format(title))
        self.ws = self.wb.create_sheet(title)

    def add_title(self, titles, **style):
        self.logger.debug('>>> add title')
        self.ws.append(titles)
        self.set_row_style(**style)

    def add_rows(self, rows, color_list=None, **style):
        self.logger.info('>>> add rows')

        self.logger.debug('color_list: {}, style: {}'.format(
            color_list, list(style.keys())))

        for n, row in enumerate(rows, 1):
            self.ws.append(row)
            if not (color_list or style):
                continue

            if color_list:
                color = color_list[n % len(color_list)]
                style['PatternFill'] = PatternFill(start_color=color,
                                                   end_color=color,
                                                   fill_type='solid')
            self.set_row_style(**style)

    def set_row_style(self, **style):
        if not style:
            return

        for cell in self.ws[self.ws.max_row]:
            if style.get('font'):
                cell.font = style['font']

            if style.get('alignment'):
                cell.alignment = style['alignment']

            if style.get('border'):
                cell.border = style['border']

            if style.get('PatternFill'):
                # print(cell.coordinate, cell.value, style['PatternFill'].start_color.value)
                cell.fill = style['PatternFill']
            elif style.get('GradientFill'):
                cell.fill = style['GradientFill']

    def set_dimensions_style(self, height=None, width=None, **style):
        self.logger.info('>>> set dimensions style')
        height = height or (style and style.get('height'))
        width = width or (style and style.get('width'))
        if height:
            for row in range(1, self.ws.max_row + 1):
                r = self.ws.row_dimensions[row]
                r.height = height

        if width:
            for column in range(1, self.ws.max_column + 1):
                c = self.ws.column_dimensions[get_column_letter(column)]
                c.width = width

    def freeze_panes(self,
                     first_column=None,
                     first_row=True,
                     first_column_and_row=None,
                     coordinate=None):
        if not coordinate:
            if first_column:
                coordinate = 'B1'
            elif first_row:
                coordinate = 'A2'
            elif first_column_and_row:
                coordinate = 'B2'
        if coordinate:
            self.logger.info('>>> freeze: {}'.format(coordinate))
            self.ws.freeze_panes = coordinate

    def add_comment(self, coordinate, *args, **kwargs):
        self.logger.info('>>> add comment')
        self.ws[coordinate].comment = Comment(*args, **kwargs)

    def add_hyperlink(self, coordinate, hyperlink):
        """
            # method1:
            >>> value = '=HYPERLINK("http://www.baidu.com", "baidu")'
            >>> sheet.cell(row, column, value=value)

            # method2:
            >>> _ = sheet.cell(row, column, value='baidu')
            >>> _.hyperlink = 'http://www.baidu.com'

            # method3:
            >>> sheet['B3'].hyperlink = Hyperlink(ref='', target='http://www.baidu.com', tooltip='baidu')
        """
        self.logger.info('>>> add hyperlink')
        self.ws[coordinate].hyperlink = hyperlink

    def merge_cells(self, *args, **kwargs):
        """
            range_string=None
            start_row=None
            start_column=None
            end_row=None
            end_column=None
        """
        self.logger.info('>>> merge cells')
        self.ws.merge_cells(*args, **kwargs)

    def auto_filter(self):
        """
            automatic filter for the first row
        """
        self.ws.auto_filter.ref = self.ws.dimensions

    def save(self, outfile):
        self.wb.save(outfile)
        self.logger.warning('save file: {}'.format(outfile))
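
Putting the builder above together, a minimal sketch (Font is assumed to be imported from openpyxl.styles, as in the class itself):

builder = ExcelBuilder()
builder.create_sheet('Result')
builder.add_title(['name', 'value'], font=Font(bold=True))
builder.add_rows([['a', 1], ['b', 2]], color_list=['00b3ffb3', '00b3ffff'])
builder.freeze_panes(first_row=True)
builder.save('result.xlsx')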
Example #20
class Formatter(object):
    logger = SimpleLogger('DataFormatter')

    def __init__(self, rows, header=True):
        self.rows = rows
        self.header = header and self.check_header()

    def check_header(self):
        counter = Counter(self.rows[0])
        dup_names = [k for k, v in counter.items() if v > 1]
        if dup_names:
            click.secho(
                'could not set header=True as duplicate field names: {}'.
                format(dup_names),
                err=True,
                fg='yellow')
            return False
        return True

    def to_table(self, align='l', index=False):
        """
            return a prettytable object

            >>> t = to_table()
            >>> str(t)
            >>> t.get_string()
            >>> t.get_html_string()
        """
        table = prettytable.PrettyTable()
        if self.header:
            field_names = self.rows[0]
            rows = self.rows[1:]
        else:
            field_names = list(
                map(openpyxl.utils.get_column_letter,
                    range(1,
                          len(self.rows[0]) + 1)))
            rows = self.rows

        if index:
            table.field_names = ['Index'] + field_names
        else:
            table.field_names = field_names

        for n, row in enumerate(rows, 1):
            if index:
                row = [n] + row
            table.add_row(row)

        for field in table.field_names:
            table.align[field] = align

        return table

    def to_json(self, indent=None, ensure_ascii=False):
        data = []
        if not self.header:
            data = self.rows
        else:
            fields = self.rows[0]
            rows = self.rows[1:]
            for row in rows:
                context = dict(zip(fields, row))
                data.append(context)
        return json.dumps(data, indent=indent, ensure_ascii=ensure_ascii)

    def to_tsv(self, sep='\t', quote=''):
        data = []
        for row in self.rows:
            line = sep.join('{0}{1}{0}'.format(quote, each) for each in row)
            data.append(line)
        return '\n'.join(data)
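
A minimal sketch of the Formatter above, starting from a header row plus data rows:

rows = [['name', 'value'], ['a', 1], ['b', 2]]
fmt = Formatter(rows, header=True)
print(fmt.to_table(index=True))  # prettytable rendering
print(fmt.to_json(indent=2))
print(fmt.to_tsv())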