def show_detail(self, data):
    for cnt, idx in enumerate(data['content'], 1):
        logger.info('{} - {}'.format(cnt, idx))
        for dict_data in data['content'][idx]:
            logger.info('    ' + dict_data['title'])
def login(self):
    # log in to wenku
    postData = {
        'username': self.config['wenku']['account'],
        'password': self.config['wenku']['password'],
        'usecookie': '315360000',
        'action': 'login'
    }
    self.wenku_session.cookies = cookielib.LWPCookieJar(
        filename=os.path.join(loc, 'data/wenku/cookie.txt'))
    try:
        # try the saved cookie first
        self.wenku_session.cookies.load()
        resp = self.wenku_session.get(self.base_url)
        resp.encoding = 'gbk'
        soup = bs(resp.text, 'html.parser')
        # the page shows a <caption> element when we are not logged in
        not_login = soup.find('caption')
        if not_login:
            # fall back to account login
            if not self.config['wenku']['account'] or not self.config['wenku']['password']:
                logger.error('You need to log in to wenku before you can search')
                return -1
            resp = self.wenku_session.post(self.login_url, data=postData)
            resp.encoding = 'gbk'
            self.wenku_session.cookies.save()
            logger.info('Logged in to wenku with account credentials')
        else:
            logger.info('Logged in to wenku with saved cookie')
        return 1
    except Exception:
        logger.error('Failed to log in to wenku, please try again later')
        return -1
def downloader(self, data, process_count, save_path, nth=0):
    # download a single chapter and store its text back into the chapter dict
    def download(content):
        try:
            if can_normal_download:
                url = data['content_table_url'].replace('index', content['vid'])
                resp = requests.get(url)
                resp.encoding = 'gbk'
                soup = bs(resp.text, 'html.parser')
                content['text'] = str(soup.find('div', id='content'))
            else:
                url = self.download_url.format(data['aid'], content['vid'])
                resp = requests.get(url, allow_redirects=True)
                content['text'] = resp.text
            logger.info('Successfully fetched {}'.format(content['title']))
        except Exception:
            logger.error('Failed to fetch {}'.format(content['title']))

    # check whether the chapters can be fetched from the normal reading pages
    resp = requests.get(url=self.main_page.format(data['aid']))
    resp.encoding = 'gbk'
    soup = bs(resp.text, 'html.parser')
    can_normal_download = '因版权问题' not in soup.find('span', class_='hottext').text

    # fetch the content of every chapter
    logger.info('Starting download')
    tmp_data = dict(data)
    if nth > 0:
        # keep only the requested volume
        name = nth_dict(data['content'], nth - 1)
        tmp_data['content'] = {name: list(data['content'][name])}

    # use a thread pool so workers can update the shared chapter dicts in place
    # (process workers would not propagate the mutation back to the parent)
    from multiprocessing.pool import ThreadPool
    pool = ThreadPool(processes=process_count)
    for name in tmp_data['content']:
        for content in tmp_data['content'][name]:
            pool.apply_async(download, (content,))
    pool.close()
    pool.join()

    # convert the collected data to epub
    txt2epub(dict(tmp_data), save_path)
    logger.info('Successfully fetched all chapters')
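# Note: nth_dict() is defined elsewhere in the repo and is not shown here.
# Judging from how downloader() uses it, it presumably returns the n-th key of
# a dict; a minimal sketch of that assumption (hypothetical helper name, not
# the project's actual implementation):
def _nth_key(d, n):
    """Return the n-th key of a dict, in insertion order."""
    return list(d.keys())[n]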
def searcher(self, key):
    logger.info('===== epub site =====')
    resp = requests.get(url=self.search_url, params={'q': key})
    resp.encoding = 'utf-8'
    soup = bs(resp.text, 'html.parser')
    if '找不到與查詢字詞' in soup.find('h2', class_='pagetitle').text:
        logger.warning('epub site could not find a matching result')
    else:
        results = soup.find_all('a', rel='bookmark', text=re.compile(key))
        for result in results:
            logger.info('%13s : %s' % (result['href'].replace(
                self.base_url, '').replace('.html', ''), result.text))
def local_searcher(self, key):
    # fuzzy-match the query against the locally cached novel data
    key = OpenCC('tw2s').convert(key)
    result = {}
    for idx in self.data.keys():
        result[idx] = fuzz.partial_token_set_ratio(self.data[idx], key)
    # sort by score, highest first, and show everything scoring at least 50
    result = collections.OrderedDict(
        sorted(result.items(), key=operator.itemgetter(1), reverse=True))
    for idx in result:
        if result[idx] < 50:
            break
        logger.info("%4s : %s" % (idx, self.data[idx]['title']))
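# Illustrative sketch (not part of the project) of how partial_token_set_ratio
# ranks candidates, assuming the fuzzywuzzy package is installed; the sample
# titles below are made up for demonstration only.
from fuzzywuzzy import fuzz

sample = {'1': 'Sword Art Online 1', '2': 'Spice and Wolf 1'}
scores = {idx: fuzz.partial_token_set_ratio(title, 'sword art')
          for idx, title in sample.items()}
# entry '1' scores far higher than '2'; local_searcher() sorts such scores in
# descending order and only prints entries scoring at least 50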
def renew(self):
    # resume from the highest novel id already stored in the local data
    page = int(max(self.data, key=int)) + 1
    while True:
        result = self.get_main_page(page)
        page += 1
        if result:
            self.data.update(result)
            with open(os.path.join(loc, 'data/wenku/data.json'), 'w',
                      encoding='utf8') as fp:
                json.dump(self.data, fp, ensure_ascii=False)
        else:
            logger.info('wenku data is already up to date')
            break
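# Shape of data/wenku/data.json as inferred from how it is read and written in
# renew(), local_searcher(), and online_searcher(); the entries below are
# placeholders, not real data:
#
# {
#     "1": {"title": "<novel title>", ...},
#     "2": {"title": "<novel title>", ...}
# }
#
# Keys are wenku novel ids stored as strings, which is why renew() resumes
# from int(max(self.data, key=int)) + 1.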
def downloader(self, _id, save_path):
    # fetch the novel's main page
    resp = requests.get(url=self.base_url + _id + '.html')
    resp.encoding = 'utf-8'
    soup = bs(resp.text, 'html.parser')
    title = soup.find('a', rel='bookmark').text
    google_url = soup.find('a', text='google')
    mega_url = soup.find('a', text='mega')
    if google_url:
        google_url = google_url['href']
        try:
            logger.info('Starting download via Google Drive')
            _id = google_url.replace(
                'https://drive.google.com/file/d/', '').replace(
                    '/view?usp=sharing', '').replace(
                        'https://drive.google.com/open?id=', '')
            download_file_from_google_drive(
                _id, os.path.join(save_path, title + '.epub'))
            logger.info('Download successful')
        except Exception as e:
            logger.warning(
                'An error occurred during download, please try again later\n{}'
                .format(e))
    elif mega_url:
        mega_url = mega_url['href']
        try:
            logger.info('Starting download via MEGA')
            download_file_from_mega_drive(mega_url, save_path + '/', title + '.epub')
            logger.info('Download successful')
        except Exception as e:
            logger.warning(
                'An error occurred during download, please try again later\n{}'
                .format(e))
    else:
        logger.warning('No downloadable source found')
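# download_file_from_google_drive() is a helper defined elsewhere in the repo.
# A minimal sketch of the usual requests-based pattern it presumably follows
# (handling Google Drive's virus-scan confirmation token for large files);
# this is an assumption, not the project's actual implementation:
import requests

def _download_from_google_drive(file_id, destination):
    url = 'https://docs.google.com/uc?export=download'
    session = requests.Session()
    resp = session.get(url, params={'id': file_id}, stream=True)
    # large files first return a warning page with a confirmation cookie
    token = next((v for k, v in resp.cookies.items()
                  if k.startswith('download_warning')), None)
    if token:
        resp = session.get(url, params={'id': file_id, 'confirm': token},
                           stream=True)
    with open(destination, 'wb') as fp:
        for chunk in resp.iter_content(32768):
            if chunk:
                fp.write(chunk)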
def online_searcher(self, key):
    try:
        logger.info('======= wenku =======')
        self.login()
        key = OpenCC('tw2s').convert(key)
        resp = self.wenku_session.get(url=self.search_url.format(
            requests.utils.quote(key, encoding='gbk')))
        resp.encoding = 'gbk'
        soup = bs(resp.text, 'html.parser')
        # get search result
        if soup.find('caption', text=re.compile('搜索结果')):
            # multiple results spread over several pages
            max_page = int(soup.find('a', class_='last').text)
            for i in range(2, max_page + 2):
                novels = soup.find_all('a', text=re.compile(key))
                for novel in novels:
                    logger.info('%4s : %s' % (re.findall(
                        r'[/][0-9]+', novel['href'])[0].replace('/', ''),
                        novel.text))
                if i == max_page + 1:
                    break
                time.sleep(5)
                url = self.search_url.format(
                    requests.utils.quote(key, encoding='gbk')) + '&page=' + str(i)
                resp = self.wenku_session.get(url=url)
                resp.encoding = 'gbk'
                soup = bs(resp.text, 'html.parser')
        else:
            # single result: we were redirected straight to the novel page
            aid = re.findall(
                r'=[0-9]+',
                soup.find('a', text='加入书架')['href'])[0].replace('=', '')
            title = self.get_main_page(aid)[str(aid)]['title']
            logger.info('%4s : %s' % (aid, title))
    except Exception as e:
        logger.error('Failed to search wenku: {}'.format(e))
def download_file_from_mega_drive(url, path, name):
    mega = Mega()
    m = mega.login()  # anonymous login
    logger.info('Anonymous login to mega succeeded')
    logger.info('Starting file download')
    m.download_url(url, path, name)
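# Hypothetical usage of the helper above; the URL is a placeholder, not a real
# MEGA link:
# download_file_from_mega_drive('https://mega.nz/file/<placeholder>',
#                               './downloads/', 'novel.epub')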
############## pre-deal actions ##############
# wenku login option
if not opt.is_anonymous:
    # open and load config
    with open(os.path.join(loc, 'data/config.json'), 'r') as fp:
        config = json.load(fp)
    config.setdefault('wenku', {})  # guard against a freshly cleaned config
    # change config
    if opt.wenku_account:
        config['wenku']['account'] = opt.wenku_account
    if opt.wenku_password:
        config['wenku']['password'] = opt.wenku_password
    if opt.wenku_account or opt.wenku_password:
        logger.info('Successfully saved wenku login details')
    # save config
    with open(os.path.join(loc, 'data/config.json'), 'w') as fp:
        json.dump(config, fp)

epubsite_parser = EPUBSITEParser()
wenku_parser = WENKUParser(opt.wenku_account, opt.wenku_password)

# clean wenku account
if opt.clean_wenku_account:
    with open(os.path.join(loc, 'data/config.json'), 'w') as fp:
        json.dump({}, fp)
    logger.info('Cleared your saved login information')
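# The config file read above is expected to look roughly like this, a sketch
# inferred from the keys used in the code, with placeholder values:
#
# data/config.json
# {
#     "wenku": {
#         "account": "<your wenku account>",
#         "password": "<your wenku password>"
#     }
# }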