Example #1
 def show_detail(self, data):
     # log every volume name with a running index, then its chapter titles
     cnt = 1
     for idx in data['content']:
         logger.info('{} - {}'.format(cnt, idx))
         cnt += 1
         for dict_data in data['content'][idx]:
             logger.info('    ' + dict_data['title'])
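show_detail apparently expects data['content'] to map volume names to lists of chapter dicts carrying at least a 'title' key (and, judging from the downloader further below, a 'vid'); a minimal sketch of that assumed shape:

# assumed shape of the data argument (illustrative only)
data = {
    'content': {
        'Volume 1': [
            {'title': 'Chapter 1', 'vid': '101'},
            {'title': 'Chapter 2', 'vid': '102'},
        ],
    },
}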
Example #2
 def login(self):
     # log in to wenku
     postData = {
         'username': self.config['wenku']['account'],
         'password': self.config['wenku']['password'],
         'usecookie': '315360000',
         'action': 'login'
     }
     self.wenku_session.cookies = cookielib.LWPCookieJar(
         filename=os.path.join(loc, 'data/wenku/cookie.txt'))
     try:
         # use cookie login
         self.wenku_session.cookies.load()
         resp = self.wenku_session.get(self.base_url)
         resp.encoding = 'gbk'
         soup = bs(resp.text, 'html.parser')
         not_login = soup.find('caption')
         if not_login:
             # cookie is invalid or missing, fall back to account login
             if not self.config['wenku']['account'] or not self.config[
                     'wenku']['password']:
                 logger.error('You need to log in to wenku before you can search')
                 return -1
             resp = self.wenku_session.post(self.login_url, data=postData)
             resp.encoding = 'gbk'
             self.wenku_session.cookies.save()
             logger.info('logged in to wenku with account credentials')
         else:
             logger.info('logged in to wenku with saved cookie')
         return 1
     except Exception:
         logger.error('Failed to log in to wenku, please try again later')
         return -1
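The login method above presumes a requests.Session whose cookie jar is an LWPCookieJar, plus BeautifulSoup for parsing; a minimal sketch of the setup it seems to rely on (the loc base directory and module aliases are assumptions):

# illustrative sketch of the session/cookie setup the login() method assumes
import os
import http.cookiejar as cookielib   # Python 3 home of the old cookielib module
import requests
from bs4 import BeautifulSoup as bs  # bs(...) is used above to parse the wenku pages

loc = os.path.dirname(os.path.abspath(__file__))  # assumed project base directory

wenku_session = requests.Session()
wenku_session.cookies = cookielib.LWPCookieJar(
    filename=os.path.join(loc, 'data/wenku/cookie.txt'))
try:
    wenku_session.cookies.load()   # reuse cookies saved by a previous run
except (FileNotFoundError, cookielib.LoadError):
    pass                           # first run: no usable cookie file yet
# after a successful POST login, the cookies are persisted with cookies.save()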
Example #3
    def downloader(self, data, process_count, save_path, nth=0):
        # template download function
        def download():
            try:
                if can_normal_download:
                    url = data['content_table_url'].replace(
                        'index', content['vid'])
                    resp = requests.get(url)
                    resp.encoding = 'gbk'
                    soup = bs(resp.text, 'html.parser')
                    content['text'] = str(soup.find('div', id='content'))
                else:
                    url = self.download_url.format(data['aid'], content['vid'])
                    resp = requests.get(url, allow_redirects=True)
                    content['text'] = resp.text
                logger.info('Successfully fetched {}'.format(content['title']))
            except Exception:
                logger.error(
                    'Can\'t get {}, an error occurred'.format(
                        content['title']))

        # check whether chapters can be downloaded from the original pages
        resp = requests.get(url=self.main_page.format(data['aid']))
        resp.encoding = 'gbk'
        soup = bs(resp.text, 'html.parser')
        can_normal_download = True

        if '因版权问题' in soup.find('span', class_='hottext').text:
            can_normal_download = False

        # get content of every chapter
        logger.info('starting download')
        tmp_data = dict(data)
        tmp_data['content'] = {}
        p = mp.Pool(processes=process_count)
        if nth <= 0:
            for name in data['content']:
                for content in data['content'][name]:
                    # note: download() is called immediately here, so chapters
                    # are fetched sequentially rather than by the worker pool
                    p.apply_async(download())
            p.close()
            p.join()
        else:
            tmp_data['content'][nth_dict(data['content'], nth - 1)] = list(
                data['content'][nth_dict(data['content'], nth - 1)])
            for content in tmp_data['content'][nth_dict(
                    data['content'], nth - 1)]:
                p.apply_async(download())
            p.close()
            p.join()
            data = dict(tmp_data)

        # convert data to epub
        txt2epub(data, save_path)
        logger.info('Successfully downloaded all chapters')
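The single-volume branch above relies on an nth_dict helper that is not shown; from its use as a key into data['content'], it apparently returns the n-th key of a dict. A minimal sketch of that assumed helper:

# assumed helper: return the n-th key of a dict (illustrative sketch)
def nth_dict(d, n):
    return list(d)[n]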
Example #4
 def searcher(self, key):
     logger.info('===== epub site =====')
     resp = requests.get(url=self.search_url, params={'q': key})
     resp.encoding = 'utf-8'
     soup = bs(resp.text, 'html.parser')
     if '找不到與查詢字詞' in soup.find('h2', class_='pagetitle').text:
         logger.warning('epub site can\'t find a matching result')
     else:
         results = soup.find_all('a', rel='bookmark', text=re.compile(key))
         for result in results:
             logger.info('%13s : %s' % (result['href'].replace(
                 self.base_url, '').replace('.html', ''), result.text))
Example #5
 def local_searcher(self, key):
     result = {}
     key = OpenCC('tw2s').convert(key)
     for idx in self.data.keys():
         result[idx] = fuzz.partial_token_set_ratio(self.data[idx], key)
     result = collections.OrderedDict(
         sorted(result.items(), key=operator.itemgetter(1), reverse=True))
     for idx in result:
         if result[idx] < 50:
             break
         logger.info("%4s : %s" % (idx, self.data[idx]['title']))
Example #6
 def renew(self):
     # get the latest number from the stored data
     page = int(max(self.data, key=int)) + 1
     while True:
         result = self.get_main_page(page)
         page += 1
         if result:
             self.data.update(result)
             with open(os.path.join(loc, 'data/wenku/data.json'),
                       'w',
                       encoding='utf8') as fp:
                 json.dump(self.data, fp, ensure_ascii=False)
         else:
             logger.info('wenku data is already up to date')
             break
Example #7
 def download():
     try:
         if can_normal_download:
             url = data['content_table_url'].replace(
                 'index', content['vid'])
             resp = requests.get(url)
             resp.encoding = 'gbk'
             soup = bs(resp.text, 'html.parser')
             content['text'] = str(soup.find('div', id='content'))
         else:
             url = self.download_url.format(data['aid'], content['vid'])
             resp = requests.get(url, allow_redirects=True)
             content['text'] = resp.text
         logger.info('Successfully fetched {}'.format(content['title']))
     except Exception:
         logger.error(
             'Can\'t get {}, an error occurred'.format(
                 content['title']))
Example #8
 def downloader(self, _id, save_path):
     # get main page
     resp = requests.get(url=self.base_url + _id + '.html')
     resp.encoding = 'utf-8'
     soup = bs(resp.text, 'html.parser')
     title = soup.find('a', rel='bookmark').text
     google_url = soup.find('a', text='google')
     mega_url = soup.find('a', text='mega')
     if google_url:
         google_url = google_url['href']
         try:
             logger.info('Starting download via Google Drive')
             _id = google_url.replace(
                 'https://drive.google.com/file/d/', '').replace(
                     '/view?usp=sharing',
                     '').replace('https://drive.google.com/open?id=', '')
             download_file_from_google_drive(
                 _id, os.path.join(save_path, title + '.epub'))
             logger.info('Download successful')
         except Exception as e:
             logger.warning(
                 'An error occurred during download, please try again later\n{}'
                 .format(e))
     elif mega_url:
         mega_url = mega_url['href']
         try:
             logger.info('Starting download via MEGA')
             download_file_from_mega_drive(mega_url, save_path + '/',
                                           title + '.epub')
             logger.info('Download successful')
         except Exception as e:
             logger.warning(
                 'An error occurred during download, please try again later\n{}'
                 .format(e))
     else:
         logger.warning('No downloadable source was found')
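download_file_from_google_drive is not shown in this listing; a common requests-based implementation matching this call signature (file id, destination path) looks roughly like the sketch below, where the confirm-token handling works around Google Drive's virus-scan interstitial for large files (this is an assumption, not necessarily the project's actual helper):

# illustrative sketch of a download_file_from_google_drive helper (assumed)
import requests

def download_file_from_google_drive(file_id, destination):
    url = 'https://docs.google.com/uc?export=download'
    session = requests.Session()
    resp = session.get(url, params={'id': file_id}, stream=True)

    # large files return an HTML warning page with a confirm token instead of the file
    token = next((v for k, v in resp.cookies.items()
                  if k.startswith('download_warning')), None)
    if token:
        resp = session.get(url, params={'id': file_id, 'confirm': token},
                           stream=True)

    with open(destination, 'wb') as fp:
        for chunk in resp.iter_content(chunk_size=32768):
            if chunk:
                fp.write(chunk)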
Example #9
    def online_searcher(self, key):
        try:
            logger.info('======= wenku =======')
            self.login()
            key = OpenCC('tw2s').convert(key)
            resp = self.wenku_session.get(url=self.search_url.format(
                requests.utils.quote(key, encoding='gbk')))
            resp.encoding = 'gbk'
            soup = bs(resp.text, 'html.parser')

            # get search result
            if soup.find('caption', text=re.compile('搜索结果')):
                # multiple search result pages
                max_page = int(soup.find('a', class_='last').text)
                for i in range(2, max_page + 2):
                    novels = soup.find_all('a', text=re.compile(key))
                    for novel in novels:
                        logger.info('%4s : %s' % (re.findall(
                            r'[/][0-9]+', novel['href'])[0].replace(
                                '/', ''), novel.text))
                    if (i == max_page + 1):
                        break
                    time.sleep(5)
                    url = self.search_url.format(
                        requests.utils.quote(
                            key, encoding='gbk')) + '&page=' + str(i)
                    resp = self.wenku_session.get(url=url)
                    resp.encoding = 'gbk'
                    soup = bs(resp.text, 'html.parser')
            else:
                # single search result
                aid = re.findall(r'=[0-9]+',
                                 soup.find('a',
                                           text='加入书架')['href'])[0].replace(
                                               '=', '')
                title = self.get_main_page(aid)[str(aid)]['title']
                logger.info('%4s : %s' % (aid, title))
        except Exception as e:
            logger.error('Failed to search wenku: {}'.format(e))
Example #10
def download_file_from_mega_drive(url, path, name):
    mega = Mega()
    m = mega.login()
    logger.info('Anonymous login to mega succeeded')
    logger.info('Starting download file')
    m.download_url(url, path, name)
Example #11
############## pre-processing ##############
# wenku login option
if not opt.is_anonymous:
    config = {}

    # open and load config
    with open(os.path.join(loc, 'data/config.json'), 'r') as fp:
        config = json.load(fp)

    # change config
    if opt.wenku_account:
        config['wenku']['account'] = opt.wenku_account
    if opt.wenku_password:
        config['wenku']['password'] = opt.wenku_password
    if opt.wenku_account or opt.wenku_password:
        logger.info('Successfully saved wenku login details')

    # save config
    with open(os.path.join(loc, 'data/config.json'), 'w') as fp:
        json.dump(config, fp)

epubsite_parser = EPUBSITEParser()
wenku_parser = WENKUParser(opt.wenku_account, opt.wenku_password)

# clean wenku account
if opt.clean_wenku_account:
    with open(os.path.join(loc, 'data/config.json'), 'w') as fp:
        json.dump({}, fp)
    logger.info('Your login information has been cleared')
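The opt object is created elsewhere; judging from the attributes referenced above, an argparse setup along these lines would be compatible (flag names and defaults are assumptions):

# illustrative argparse sketch matching the opt attributes used above (assumed)
import argparse

parser = argparse.ArgumentParser(description='light novel downloader')
parser.add_argument('--anonymous', dest='is_anonymous', action='store_true',
                    help='do not store wenku credentials')
parser.add_argument('--wenku-account', dest='wenku_account', default='',
                    help='wenku account name')
parser.add_argument('--wenku-password', dest='wenku_password', default='',
                    help='wenku password')
parser.add_argument('--clean-wenku-account', dest='clean_wenku_account',
                    action='store_true', help='clear saved wenku credentials')
opt = parser.parse_args()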