def make_document(cls, meta, cont):
    """Generate an HTML or Markdown document from the given content and metadata.

    :param meta: metadata object for the item (author, title, ...)
    :param cont: raw content to render; empty/None content is skipped
    """
    # Guard against empty content so we do not emit blank documents
    # (the newer variant of this method has the same guard).
    if not cont:
        return
    if config.get_setting('running/file_type') == cls.DEFAULT_TYPE:
        doc = cls.item2html(cont, meta)
    else:
        doc = cls.item2md(cont, meta)
    if config.get_setting('running/download_image'):
        cls.download_image(doc)
    cls.show_info(meta)
def parse_data(cls, data):
    """Extract metadata and content from one parsed item node.

    :param data: parsed HTML node for a single answer/article item
    :return: (Meta, str) -- filled metadata and the content HTML with
             ``&quot;``/``&lt;``/``&gt;`` entities unescaped
    :raises AttributeError: when an expected sub-node is missing
    """
    meta = Meta(pattern=Meta.simple)
    title = data.find('h2', _class='zm-item-title')
    try:
        meta.title = title.string
    except AttributeError:
        print(data)
        raise  # bug fix: bare re-raise keeps the original traceback
    original_url = title.find('a').get_attrs('href')
    if data.get_attrs('data-type') == 'Answer':
        head = data.find('div', _class='answer-head')
        # Bug fix: answer hrefs are site-relative, so the host must be
        # PREPENDED (same pattern as author_homepage below); the old code
        # appended it after the path.
        original_url = config.get_setting('API/host') + original_url
    else:
        head = data.find('div', _class='post-head')
    meta.original_url = original_url
    try:
        author = head.find('a', _class='author-link')
        meta.author = author.string
        meta.author_homepage = config.get_setting(
            'API/host') + author.get_attrs('href')
    except AttributeError:
        try:
            # Fallback layout: author is a plain <span> (e.g. anonymous).
            author = head.find('span', _class='name')
            meta.author = author.string
            meta.author_homepage = config.get_setting('API/host')
        except AttributeError:
            print(head)
            raise
    meta.voteup = int(
        head.find('div',
                  _class='zm-item-vote-info').get_attrs('data-votecount'))
    # <meta itemprop="post-id" content="107121832">
    # <meta itemprop="answer-id" content="107121832">
    # https://www.zhihu.com/node/AnswerVoteInfoV2?params={"answer_id":"203923119"}
    # https://www.zhihu.com/node/ColumnPostVoteInfoV2?params={"post_id":"103306156"}

    def stg(r):
        # Map a matched HTML entity back to its literal character.
        return {
            '&quot;': '"',
            '&lt;': '<',
            '&gt;': '>'
        }.get(r.group(0), '')

    # Bug fix: the previous pattern/mapping was an identity no-op
    # ('"'->'"', '<'->'<'); the textarea payload arrives HTML-escaped,
    # so the intended behavior is entity unescaping -- TODO confirm
    # against a live response.
    return meta, re.sub('(&quot;)|(&lt;)|(&gt;)', stg,
                        data.find('textarea', _class='content').string)
def item2html(cls, cont, meta):
    """Render *cont* into an HTML file; also dump stylesheets when enabled."""
    mushroom = html.Mushroom(
        cont, meta, css_output=config.get_setting('running/css_output'))
    target = format_file_name('html', meta.author, meta.title)
    with open(target, 'w', encoding='utf8') as out:
        mushroom.write_down(out)
    if config.get_setting('running/css_output'):
        for css in mushroom.output_css_code():
            css_path = format_file_name('css', css['file_name'])
            with open(css_path, 'w', encoding='utf8') as out:
                out.write(css['code'])
    return mushroom
def make_document(cls, meta, cont):
    """Generate an HTML or Markdown file from *cont* and *meta*."""
    # Nothing to render for empty content.
    if not cont:
        return
    wants_html = config.get_setting('running/file_type') == cls.DEFAULT_TYPE
    doc = cls.item2html(cont, meta) if wants_html else cls.item2md(cont, meta)
    cls.show_info(meta)
    if config.get_setting('running/download_image'):
        cls.download_image(doc)
    print('-' * 53 + '\n')
    cls.index += 1
class Crawler(requests.Session, API):
    """HTTP session that fetches zhihu API data and optionally caches it."""

    # Default header value(s) applied to every request.
    UA = config.get_setting('Crawler/user-agent')

    def __init__(self):
        super().__init__()
        # NOTE(review): UA is passed straight to headers.update(), so the
        # configured value is presumably a mapping -- confirm in config.
        self.headers.update(Crawler.UA)

    def get_network_data_package(self, item_name, item_id, **kwargs):
        """GET the API resource for *item_name*/*item_id*.

        :raises VerityError: on an HTTP error status
        :raises ValueError: when the built URL is malformed
        """
        try:
            # Bug fix: the request must be inside the try block --
            # MissingSchema is raised by .get() itself, not by
            # raise_for_status(), so the handler below was unreachable.
            resp = self.get(self.get_url(item_name, item_id, **kwargs),
                            timeout=30)
            resp.raise_for_status()
        except HTTPError as err:
            raise VerityError(status_code=resp.status_code,
                              url=resp.url) from err
        except MissingSchema as err:
            raise ValueError('url error: ', item_name, item_id,
                             kwargs) from err
        if config.get_setting('running/cached'):
            self.cached_network_data(resp, item_name, item_id, **kwargs)
        return resp

    def download(self, url, **kwargs):
        """Plain GET used for binary downloads (images etc.)."""
        return self.get(url, timeout=30, **kwargs)

    @classmethod
    def cached_network_data(cls, data, item_name, item_id, **kwargs):
        """Write the raw response text to the cache directory; return the path."""
        # Prefer an explicit offset/page as the cache-key suffix, falling
        # back to a timestamp. NOTE(review): offset=0 is falsy and also
        # falls through to the timestamp -- confirm that is intended.
        ofs = kwargs.get('offset', None) or kwargs.get(
            'page', None) or timer.timestamp_str()
        file = os.path.join(config.cached_warehouse(),
                            '%s-%s-%s.json' % (item_name, item_id, ofs))
        with open(file, 'w', encoding='utf8') as foo:
            foo.write(data.text)
        return file
def __init__(self, column_id):
    super(ColumnManage, self).__init__(column_id)
    resp = self.get_network_data_package('column_meta', self.item_id)
    raw_title = re.search(config.get_setting('ColumnManage/title_reg'),
                          resp.text).group(1)
    # Column titles arrive as \uXXXX escape sequences; decode them to text.
    self.item_words = codecs.decode(raw_title, 'unicode_escape')
    config.warehouse('~column/%s' % format_path(self.item_words))
def __init__(self, question_id):
    super(QuestionManage, self).__init__(question_id)
    resp = self.get_network_data_package('question_meta', self.item_id)
    pattern = config.get_setting('QuestionManage/title_reg')
    # Pull the question title out of the raw page text.
    self.title = re.search(pattern, resp.text).group(1)
    config.warehouse('~question/%s' % format_path(self.title))
def _make_link_card(self, tag):
    """Build a card-style link tag from an anchor node."""
    url = tag.get_attrs('href')
    img = tag.get_attrs('image')
    # Zhihu-internal links without a preview image get the default one.
    if img is None and re.search('zhihu', url):
        img = config.get_setting('Formatter/link_card_default_image')
    return self.link_card(url=url, title=tag.string, img=img)
def get_network_data_package(self, item_name, item_id, **kwargs):
    """GET the API resource for *item_name*/*item_id*; cache it when enabled.

    :raises VerityError: on an HTTP error status
    :raises ValueError: when the built URL is malformed
    """
    try:
        # Bug fix: the request must be inside the try block --
        # MissingSchema is raised by .get() itself, not by
        # raise_for_status(), so the handler below was unreachable.
        resp = self.get(self.get_url(item_name, item_id, **kwargs),
                        timeout=30)
        resp.raise_for_status()
    except HTTPError as err:
        raise VerityError(status_code=resp.status_code,
                          url=resp.url) from err
    except MissingSchema as err:
        raise ValueError('url error: ', item_name, item_id, kwargs) from err
    if config.get_setting('running/cached'):
        self.cached_network_data(resp, item_name, item_id, **kwargs)
    return resp
def format_file_name(suffix, *part_name):
    """Return a safe output path under the warehouse directory.

    :param suffix: file extension without the dot; None or '' means none
    :param part_name: name fragments, joined with '-'
    :return: str -- full path; when 'running/cover' is enabled, a numeric
             suffix is appended until the name does not already exist
    """
    names = format_path('-'.join(part_name))
    has_suffix = (suffix is not None) and (suffix != '')
    if has_suffix:
        file = os.path.join(config.wh(), '%s.%s' % (names, suffix))
    else:
        file = os.path.join(config.wh(), names)
    if not config.get_setting('running/cover'):
        return file
    repetition = 1
    while os.path.exists(file):
        # Bug fix: the old loop always formatted '%s-%d.%s', yielding
        # names like 'foo-1.None' when no suffix was given.
        if has_suffix:
            file = os.path.join(config.wh(),
                                '%s-%d.%s' % (names, repetition, suffix))
        else:
            file = os.path.join(config.wh(), '%s-%d' % (names, repetition))
        repetition += 1
    return file
def formatter(self, meta, otp: Mushroom):
    """Process collected tags (fix attributes, build video tags, append references)."""
    body = self.format(self.tag_list)
    if self.reference_list:
        body.append(Tag('span', attrs={'style': 'font-size:24px'},
                        string='参考资料'))
        body.append(self.reference_table(self.reference_list))
    # NOTE(review): 'article_tile' looks like a typo for 'article_title';
    # the callee is defined elsewhere, so it is left as-is.
    otp.insert_article_title(self.article_tile(meta))
    otp.insert_article_text(self.article_text(*body))
    for stylesheet in self.style_meta:
        otp.stylesheets.append(
            config.get_setting('head/style/%s' % stylesheet))
    otp.image_list = self.image_list
    return otp
class Crawler(API):
    """API client backed by a requests.Session, with optional cookie login."""

    # User-Agent string applied to every request.
    UA = config.get_setting('Crawler/user-agent')

    def __init__(self):
        super().__init__()
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': Crawler.UA})
        # Best effort: load saved login cookies when the optional login
        # module is present; otherwise continue anonymously.
        try:
            mod = __import__('zhihu.spider.login', None, None, ['__all__'])
            ckf = getattr(mod, 'cookies_file')
            self.session.cookies = cookiejar.LWPCookieJar(filename=ckf)
            self.session.cookies.load()
        except (FileNotFoundError, ImportError, AttributeError):
            pass

    def __del__(self):
        self.session.close()

    def get_network_data_package(self, item_name, item_id, **kwargs):
        """GET the API resource for *item_name*/*item_id*.

        :raises VerityError: on an HTTP error status
        :raises ValueError: when the built URL is malformed
        """
        try:
            # Bug fix: the request must be inside the try block --
            # MissingSchema is raised by .get() itself, not by
            # raise_for_status(), so the handler below was unreachable.
            resp = self.session.get(
                self.get_url(item_name, item_id, **kwargs), timeout=30)
            resp.raise_for_status()
        except HTTPError as err:
            raise VerityError(status_code=resp.status_code,
                              url=resp.url) from err
        except MissingSchema as err:
            raise ValueError('url error: ', item_name, item_id,
                             kwargs) from err
        if config.get_setting('running/cached'):
            self.cached_network_data(resp, item_name, item_id, **kwargs)
        return resp

    def download(self, url, **kwargs):
        """Plain GET used for binary downloads (images etc.)."""
        return self.session.get(url, timeout=30, **kwargs)

    @classmethod
    def cached_network_data(cls, data, item_name, item_id, **kwargs):
        """Write the raw response text to the cache directory; return the path."""
        # Prefer an explicit offset/page as the cache-key suffix, falling
        # back to a timestamp.
        ofs = kwargs.get('offset', None) or kwargs.get(
            'page', None) or timer.timestamp_str()
        file = os.path.join(config.cached_warehouse(),
                            '%s-%s-%s.json' % (item_name, item_id, ofs))
        with open(file, 'w', encoding='utf8') as foo:
            foo.write(data.text)
        return file
class API:
    """Builds the URLs for the zhihu data endpoints."""

    SORT_BY_DEF = config.get_setting('API/SORT_BY_DEF')
    SORT_BY_VOT = config.get_setting('API/SORT_BY_VOT')
    SORT_BY_DAT = config.get_setting('API/SORT_BY_DAT')
    PLATFORM = config.get_setting('API/PLATFORM')

    # URL templates keyed by item name, all read from configuration.
    api = {
        name: config.get_setting('API/%s' % name)
        for name in (
            'question', 'question_meta', 'answer', 'article', 'column',
            'column_meta', 'answer_link', 'article_link', 'author_homepage',
            'user_answers', 'user_articles', 'user_meta', 'collection',
            'collection_meta',
        )
    }

    @classmethod
    def get_url(cls, item_name, item_id, **kwargs):
        """
        :param item_name: question, answer, column, ...
        :param item_id: question_id, answer_id, ...
        :param kwargs: offset, limit, sort_by
        :return: str, url
        """
        params = {
            'item_id': item_id,
            'offset': 0,
            'limit': 20,
            'sort_by': cls.SORT_BY_VOT
        }
        params.update(kwargs)
        # Unknown item names fall back to an empty template -> ''.
        return cls.api.get(item_name, '').format(**params)

    @classmethod
    def format_url(cls, item_name, **kwargs):
        """Shorthand for get_url() without an item id."""
        return cls.get_url(item_name, None, **kwargs)
def get_network_data_package(self, item_name, item_id, **kwargs):
    """Fetch an API resource; cache the raw response when caching is on."""
    url = self.get_url(item_name, item_id, **kwargs)
    resp = self.get(url, timeout=30)
    if config.get_setting('running/cached'):
        self.cached_network_data(resp, item_name, item_id, **kwargs)
    return resp
class ZhihuAccount:
    """QR-code login/logout helper for www.zhihu.com."""

    UA = config.get_setting('Crawler/user-agent')
    BASE_HEAD = {'Host': 'www.zhihu.com', 'User-Agent': UA}
    LOGIN_UP = 1  # logged in
    LOGIN_IN = 0  # login required

    def __init__(self):
        self.session = requests.Session()
        self.session.cookies = cookiejar.LWPCookieJar(filename=cookies_file)
        try:
            self.session.cookies.load(ignore_discard=True)
        except FileNotFoundError:
            pass

    def __del__(self):
        # Clean up the temporary QR image, if it was ever written.
        try:
            os.remove(os.path.abspath('QR.jpg'))
        except FileNotFoundError:
            pass

    def login_up(self):
        """Log in via the QR-code flow unless a valid session already exists."""
        if self.login_status() == ZhihuAccount.LOGIN_UP:
            print('已登录!')
        else:
            print('开始登录...')
            if self.__login():
                if self.login_status() == ZhihuAccount.LOGIN_UP:
                    self.session.cookies.save()
                    print('登录成功!')
                    return
            print('登录失败!')

    def login_out(self):
        """Log out and persist the cleared cookies."""
        self.session.get('https://www.zhihu.com/logout',
                         headers=ZhihuAccount.BASE_HEAD,
                         allow_redirects=False)
        self.session.cookies.save()
        print('已退出!')

    def login_status(self):
        """Return LOGIN_UP when the saved session is still valid."""
        resp = self.session.get('https://www.zhihu.com/signup',
                                headers=ZhihuAccount.BASE_HEAD,
                                allow_redirects=False)
        # A logged-in account is redirected (302) away from the signup page.
        if resp.status_code == 302:
            return ZhihuAccount.LOGIN_UP
        else:
            return ZhihuAccount.LOGIN_IN

    def __login(self):
        """Run the QR-code login flow; return False on any request error.

        NOTE(review): returns True whenever the flow completes without a
        request error, even on timeout -- the caller re-checks
        login_status(), so this is tolerated.
        """
        try:
            self.session.get("https://www.zhihu.com/signup?next=%2F",
                             headers=ZhihuAccount.BASE_HEAD)
            captcha_head = {"Referer": "https://www.zhihu.com/"}
            captcha_head.update(ZhihuAccount.BASE_HEAD)
            self.session.get(
                "https://www.zhihu.com/api/v3/oauth/captcha?lang=en",
                headers=captcha_head)
            resp = self.session.post("https://www.zhihu.com/udid",
                                     headers=ZhihuAccount.BASE_HEAD)
            token_head = {
                'Origin': 'https://www.zhihu.com',
                'Referer': 'https://www.zhihu.com/signup?next=%2F',
                'x-udid': resp.content.decode('utf8')
            }
            token_head.update(ZhihuAccount.BASE_HEAD)
            resp = self.session.post(
                "https://www.zhihu.com/api/v3/account/api/login/qrcode",
                headers=token_head)
            token = resp.json().get('token')
            qr = self.session.get(
                f'https://www.zhihu.com/api/v3/account/api/login/qrcode/{token}/image',
                headers=token_head)
            self.__show_qr_code(qr.content)
            print('操作系统已使用关联程序显示二维码,请使用知乎APP扫描。\n'
                  '小提示:知乎APP扫码特别慢,建议使用微信扫描,按屏幕提示继续操作也可登录。\n')
            time.sleep(10)
            start = time.time()
            # Poll the scan status until success, an error, or a 90 s timeout.
            while True:
                rjs = self.session.get(
                    f'https://www.zhihu.com/api/v3/account/api/login/qrcode/{token}/scan_info',
                    headers=captcha_head).json()
                if rjs.get('user_id', None) or rjs.get(
                        'status', None) == 6 or rjs.get('error'):
                    break
                if time.time() - start >= 90:
                    print('登录超时!(<90s)')
                    break
                time.sleep(2)
            return True
        except RequestException:
            return False

    @staticmethod
    def __show_qr_code(image):
        """Write the QR image to disk and open it with the OS image viewer."""
        # Bug fix: the old code called os.subprocess.call, which raised
        # AttributeError on macOS/Linux ('os' has no 'subprocess').
        import subprocess
        image_file = os.path.abspath('QR.jpg')
        with open(image_file, 'wb') as foo:
            foo.write(image)
        if platform.system() == 'Darwin':
            subprocess.call(['open', image_file])
        elif platform.system() == 'Linux':
            subprocess.call(['xdg-open', image_file])
        else:
            # Windows: hand the file to the shell's associated program.
            os.startfile(image_file)

    def __enter__(self):
        self.login_up()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.login_out()
def template(cls, name):
    """Return the tag template called *name* from configuration.

    :param name: template identifier under the 'tag/' config namespace
    :raises KeyError: when no template with that name is configured
    """
    try:
        return config.get_setting('tag/%s' % name)
    except KeyError as err:
        # Chain the original error so the failing config key stays visible.
        raise KeyError('not find template named %s.' % name) from err