Example #1
def parser_list(task):
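    # Parse a JSON article list: queue one content task per result and, while
    # more results remain, queue the next list page (the cursor advances by 20).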
    response = task['response']
    new_tasks = []

    if not response:
        LOG.log_it("Not Response", 'WARN')
        raise RetryDownload

    try:
        data = response.json()
    except Exception as e:
        LOG.log_it(
            '解析JSON出错(如一直出现,而且浏览器能正常访问,可能是网站代码升级,请通知开发者。)\nERRINFO:{}'.format(
                str(e)), 'WARN')
        raise RetryDownload

    try:
        for each_result in data['result']:
            title = each_result['title']
            url = each_result['url']
            date_group = re.search(r'(.*?)T(.*?)\+',
                                   each_result['date_created'])
            date = date_group.group(1) + ' ' + date_group.group(2)

            meta = deepcopy(task['meta'])
            save = deepcopy(task['save'])
            save.update({'title': title, 'date': date})
            new_task = Task.make_task({
                'url': url,
                'method': 'GET',
                'parser': parser_content,
                'resulter': resulter_content,
                'priority': 1,
                'meta': meta,
                'save': save
            })
            new_tasks.append(new_task)
    except KeyError:
        LOG.log_it('JSON KEY出错(如一直出现,而且浏览器能正常访问,可能是网站代码升级,请通知开发者。)', 'WARN')
        raise RetryDownload

    # Fetch the next page
    meta = deepcopy(task['meta'])
    save = deepcopy(task['save'])
    save['cursor'] += 20
    if save['cursor'] < save['end'] and len(data['result']) >= 20:
        new_task = Task.make_task({
            'url': API_URL.format(save['cursor']),
            'method': 'GET',
            'meta': meta,
            'parser': parser_list,
            'priority': 0,
            'save': save,
            'retry': 3,
        })
        new_tasks.append(new_task)

    return None, new_tasks
Example #2
def parser_content(task):
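    # Parse a Zhihu column post: extract body, author, vote count and date,
    # then queue download tasks for any embedded images.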
    title = task['title']
    new_tasks = []

    response = task['response']
    if not response:
        raise RetryDownload

    bs = BeautifulSoup(response.text, 'lxml')

    content_tab = bs.select('.PostIndex-content')
    if content_tab:
        content = str(content_tab[0])
    else:
        LOG.log_it("不能找到文章的内容。(如一直出现,而且浏览器能正常访问知乎,可能是知乎代码升级,请通知开发者。)", 'WARN')
        raise RetryDownload

    author_name = bs.select('.PostIndex-authorName')[0].string if bs.select(
        '.PostIndex-authorName') else ''
    voteup_count = re.search(
        r'likesCount&quot;:(\d+),', response.text).group(1) if re.search(
            r'likesCount&quot;:(\d+),', response.text) else ''
    created_time = str(
        bs.select('.PostIndex-header .HoverTitle')[1]['data-hover-title']
    ) if len(bs.select('.PostIndex-header .HoverTitle')) == 2 else ''
    article_url = task['url']

    download_img_list, content = format_zhihu_content(content, task)

    item = [
        md5string(article_url), title, content, created_time, voteup_count,
        author_name,
        int(time.time() * 100000)
    ]

    if task['save']['kw'].get('img', True):
        img_header = deepcopy(DEFAULT_HEADERS)
        img_header.update({'Referer': response.url})
        for img_url in download_img_list:
            new_tasks.append(
                Task.make_task({
                    'url': img_url,
                    'method': 'GET',
                    'meta': {
                        'headers': img_header,
                        'verify': False
                    },
                    'parser': parser_downloader_img,
                    'resulter': resulter_downloader_img,
                    'save': task['save'],
                    'priority': 10,
                }))

    task.update({"parsed_data": item})
    return task, new_tasks
Example #3
def parser_content(task):
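    # Parse an article page: extract body, author, like count and publish
    # time, then queue download tasks for any embedded images.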
    title = task['title']
    new_tasks = []

    response = task['response']
    if not response:
        raise RetryDownload

    bs = BeautifulSoup(response.text, 'lxml')

    content_tab = bs.select('.show-content')
    if content_tab:
        content = str(content_tab[0])
    else:
        LOG.log_it("不能找到文章的内容。(如一直出现,而且浏览器能正常访问网站,可能是网站代码升级,请通知开发者。)", 'WARN')
        raise RetryDownload

    author_name = bs.select('.post .author .name a')[0].string if bs.select(
        '.post .author .name a') else ''
    voteup_count = bs.select(
        '.post .author .meta .likes-count')[0].string if bs.select(
            '.post .author .meta .likes-count') else ''
    created_time = bs.select(
        '.post .author .meta .publish-time')[0].string if bs.select(
            '.post .author .meta .publish-time') else ''
    article_url = task['url']

    download_img_list, content = format_content(content, task)

    item = [
        md5string(article_url), title, content, created_time, voteup_count,
        author_name,
        int(time.time() * 100000)
    ]

    if task['save']['kw'].get('img', True):
        img_header = deepcopy(DEFAULT_HEADERS)
        img_header.update({'Referer': response.url})
        for img_url in download_img_list:
            new_tasks.append(
                Task.make_task({
                    'url': img_url,
                    'method': 'GET',
                    'meta': {
                        'headers': img_header,
                        'verify': False
                    },
                    'parser': parser_downloader_img,
                    'resulter': resulter_downloader_img,
                    'save': task['save'],
                    'priority': 10,
                }))

    task.update({"parsed_data": item})
    return task, new_tasks
Example #4
def parser_content(task):
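    # Parse an article returned as JSON: use its 'body' field as HTML, pull
    # the author, then queue download tasks for any embedded images.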
    title = task['title']
    new_tasks = []

    response = task['response']
    if not response:
        raise RetryDownload

    try:
        content = response.json()['body']
    except Exception as e:
        LOG.log_it(
            '解析JSON出错(如一直出现,而且浏览器能正常访问网站,可能是网站代码升级,请通知开发者。)ERRINFO:{}'.format(
                str(e)), 'WARN')
        raise RetryDownload

    bs = BeautifulSoup(content, 'lxml')
    content = str(bs.select('div.content')[0])

    author_name = bs.select('.author')[0].string if bs.select(
        '.author') else ''
    voteup_count = ''
    created_time = ''
    article_url = task['url']

    download_img_list, content = format_zhihu_content(content, task)

    item = [
        md5string(article_url), title, content, created_time, voteup_count,
        author_name,
        int(time.time() * 100000)
    ]

    if task['save']['kw'].get('img', True):
        img_header = deepcopy(DEFAULT_HEADERS)
        img_header.update({'Referer': response.url})
        for img_url in download_img_list:
            new_tasks.append(
                Task.make_task({
                    'url': img_url,
                    'method': 'GET',
                    'meta': {
                        'headers': img_header,
                        'verify': False
                    },
                    'parser': parser_downloader_img,
                    'resulter': resulter_downloader_img,
                    'save': task['save'],
                    'priority': 10,
                }))

    task.update({"parsed_data": item})
    return task, new_tasks
Example #5
def parser_list(task):
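    # Parse a Zhihu Daily story list: queue a content task for every unseen
    # story and, unless crawling today's feed, queue the next day's list page.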
    response = task['response']
    new_tasks = []
    to_next = True

    if not response:
        raise RetryDownload

    try:
        data = response.json()['stories']
    except Exception as e:
        LOG.log_it(
            '解析JSON出错(如一直出现,而且浏览器能正常访问网站,可能是网站代码升级,请通知开发者。)ERRINFO:{}'.format(
                str(e)), 'WARN')
        raise RetryDownload

    for item in data:
        # Skip items that already exist in the database and stop paging further
        url = 'http://news-at.zhihu.com/api/4/story/' + str(item['id'])
        if md5string(url) in ARTICLE_ID_SET:
            to_next = False
            continue

        new_task = Task.make_task({
            'url': url,
            'method': 'GET',
            'meta': task['meta'],
            'parser': parser_content,
            'resulter': resulter_content,
            'priority': 5,
            'save': task['save'],
            'title': item['title'],
        })
        new_tasks.append(new_task)

    # Next page
    if not IS_TODAY_URL and to_next:
        next_datetime = get_next_datetime_string(task['save']['cursor'],
                                                 '%Y%m%d', 1)

        # The cursor will eventually reach the end date
        if compare_datetime_string(task['save']['end'], next_datetime,
                                   '%Y%m%d') and len(data) != 0:
            next_page_task = deepcopy(task)
            next_page_task.update({
                'url':
                re.sub(r'before/\d+', 'before/{}'.format(next_datetime),
                       next_page_task['url'])
            })
            next_page_task['save'].update({'cursor': next_datetime})
            new_tasks.append(next_page_task)

    return None, new_tasks
Example #6
def parser_list(task):
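    # Parse a Zhihu column post list: queue the next page while the cursor is
    # below save['end'], then queue a content task for every post.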
    response = task['response']
    new_tasks = []

    if not response:
        raise RetryDownload

    try:
        data = response.json()
        data.reverse()
    except Exception as e:
        LOG.log_it(
            '解析JSON出错(如一直出现,而且浏览器能正常访问知乎,可能是知乎代码升级,请通知开发者。)ERRINFO:{}'.format(
                str(e)), 'WARN')
        raise RetryDownload

    if len(data) != 0:
        if task['save']['cursor'] < task['save']['end'] - 20:
            next_page_task = deepcopy(task)
            next_page_task.update({
                'url':
                re.sub(r'offset=\d+',
                       'offset={}'.format(task['save']['cursor'] + 20),
                       next_page_task['url'])
            })
            next_page_task['save'].update(
                {'cursor': next_page_task['save']['cursor'] + 20})
            new_tasks.append(next_page_task)
    else:
        LOG.log_it('不能读取专栏列表。(如一直出现,而且浏览器能正常访问知乎,可能是知乎代码升级,请通知开发者。)', 'WARN')
        raise RetryDownload

    for item in data:
        new_task = Task.make_task({
            'url': 'https://zhuanlan.zhihu.com' + item['url'],
            'method': 'GET',
            'meta': task['meta'],
            'parser': parser_content,
            'resulter': resulter_content,
            'priority': 5,
            'save': task['save'],
            'title': item['title'],
        })
        new_tasks.append(new_task)
    return None, new_tasks
Example #7
def parser_list(task):
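    # Parse a Zhihu column post list: queue a content task per unseen post
    # and, if nothing was skipped, queue the next page of the list.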
    response = task['response']
    new_tasks = []
    to_next = True

    if not response:
        raise RetryDownload

    try:
        data = response.json()
        data.reverse()
    except Exception as e:
        LOG.log_it(
            '解析JSON出错(如一直出现,而且浏览器能正常访问知乎,可能是知乎代码升级,请通知开发者。)ERRINFO:{}'.format(
                str(e)), 'WARN')
        raise RetryDownload

    for item in data:
        # Skip items that already exist in the database and stop paging further
        url = 'https://zhuanlan.zhihu.com' + item['url']
        if md5string(url) in ARTICLE_ID_SET:
            to_next = False
            continue

        new_task = Task.make_task({
            'url': url,
            'method': 'GET',
            'meta': task['meta'],
            'parser': parser_content,
            'resulter': resulter_content,
            'priority': 5,
            'save': task['save'],
            'title': item['title'],
        })
        new_tasks.append(new_task)

    # Next page
    if to_next and len(data) != 0:
        if task['save']['cursor'] < task['save']['end'] - 20:
            next_page_task = deepcopy(task)
            next_page_task.update({
                'url':
                re.sub(r'offset=\d+',
                       'offset={}'.format(task['save']['cursor'] + 20),
                       next_page_task['url'])
            })
            next_page_task['save'].update(
                {'cursor': next_page_task['save']['cursor'] + 20})
            new_tasks.append(next_page_task)

    return None, new_tasks
Example #8
def main(start, end, kw):
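    # Entry point for the Guokr script: seed the first list task, run the
    # crawler, then assemble the crawled articles into a Kindle book.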
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q)
    default_headers = deepcopy(DEFAULT_HEADERS)
    default_headers.update({'Referer': 'http://www.guokr.com/scientific/'})
    save_path = SCRIPT_CONFIG['SAVE_PATH']
    book_name = '果壳网'
    task = Task.make_task({
        'url': API_URL.format(start),
        'method': 'GET',
        'meta': {
            'headers': default_headers,
            'verify': False
        },
        'parser': parser_list,
        'priority': 0,
        'save': {
            'cursor': start,
            'start': start,
            'end': end,
            'kw': kw,
            'save_path': SCRIPT_CONFIG['SAVE_PATH'],
        },
        'retry': 3,
    })
    iq.put(task)
    # Init DB
    with ArticleDB(save_path, VERSION=0) as db:
        pass

    crawler.start()

    items = []

    with ArticleDB(save_path) as db:
        items.extend(db.select_article())
        db.insert_meta_data(['BOOK_NAME', book_name])
        db.increase_version()

    with HTML2Kindle(items, save_path, book_name,
                     MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
        html2kindle.make_metadata(window=kw.get('window', 50))
        html2kindle.make_book_multi(save_path)

    if kw.get('email'):
        with SendEmail2Kindle() as s:
            s.send_all_mobi(SCRIPT_CONFIG['SAVE_PATH'])
    os._exit(0)
Example #9
    def test_normal(self):
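        # Smoke test: the mock parser should receive the body served by the
        # local /hello_world endpoint.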
        def parser_mock(task):
            self.assertEqual(task['response'].text, "Hello World!")
            return None, None

        task = Task.make_task({
            'url': "http://127.0.0.1:5000/hello_world",
            'method': 'GET',
            'parser': parser_mock,
            'priority': 0,
        })
        self.iq.put(task)

        self.crawler.start()
Example #10
def parser_content(task):
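    # Parse an article page: extract '.article-detail-bd > .detail', reuse
    # metadata carried on the task, and queue image download tasks.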
    title = task['title']
    items = []
    new_tasks = []

    response = task['response']
    if not response:
        raise RetryDownload

    response.encoding = 'utf-8'
    bs = BeautifulSoup(response.text, 'lxml')

    content_tab = bs.select('.article-detail-bd > .detail')
    if content_tab:
        content = str(content_tab[0])
    else:
        LOG.log_it("不能找到文章的内容。(如一直出现,而且浏览器能正常访问,可能是代码升级,请通知开发者。)", 'WARN')
        raise RetryDownload

    author_name = '未知'
    voteup_count = task['voteup_count']
    created_time = task['created_time']
    article_url = task['url']
    article_id = md5string(article_url)

    download_img_list, content = format_content(content, task)

    items.append([article_id, title, content, created_time, voteup_count, author_name, int(time.time() * 100000)])

    if task['save']['kw'].get('img', True):
        img_header = deepcopy(DEFAULT_HEADERS)
        img_header.update({'Referer': response.url})
        for img_url in download_img_list:
            new_tasks.append(Task.make_task({
                'url': img_url,
                'method': 'GET',
                'meta': {'headers': img_header, 'verify': False},
                'parser': parser_downloader_img,
                'resulter': resulter_downloader_img,
                'save': task['save'],
                'priority': 10,
            }))

    task.update({'parsed_data': items})
    return task, new_tasks
Example #11
def parser_content(task):
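    # Parse an article page: strip paragraph breaks, extract title, date and
    # author, then queue download tasks for the referenced images.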
    response = task['response']
    if not response:
        LOG.log_it("Not Response", 'WARN')
        raise RetryDownload

    new_tasks = []
    items = []
    content = response.text
    # Strip paragraph breaks and <br/> tags
    content = content.replace('</p><p>', '').replace('<br/>', '')
    soup = BeautifulSoup(content, 'lxml')

    title = task['save']['title']
    article_url = task['url']
    created_time = soup.select('.content-th-info span')[0].string[3:]
    author = soup.select('.content-th-info a')[0].string

    download_img_list, content = format_content(soup, task)
    items.append([
        md5string(article_url), title, content, created_time, '', author,
        int(time.time() * 100000)
    ])

    if task['save']['kw'].get('img', True):
        img_header = deepcopy(DEFAULT_HEADERS)
        img_header.update({'Referer': response.url})
        for img_url in download_img_list:
            new_tasks.append(
                Task.make_task({
                    'url': img_url,
                    'method': 'GET',
                    'meta': {
                        'headers': img_header,
                        'verify': False
                    },
                    'parser': parser_downloader_img,
                    'resulter': resulter_downloader_img,
                    'priority': 2,
                    'save': task['save']
                }))
    task.update({'parsed_data': items})
    return task, new_tasks
Example #12
    def mock_downloader_to_push_in_delayqueue(self):
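        # Test helper: push 20 dummy tasks with random retry delays into the
        # TaskManager delay queue, stamped with the time they become due.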
        def parser_mock():
            return None, None

        tasks = [
            Task.make_task({
                'url': 'http://www.baidu.com?a={}'.format(
                    random.randint(1, 999)),
                'method': 'GET',
                'retry_delay': random.random() * SEC,
                'parser': parser_mock,
            }) for _ in range(20)
        ]

        for task in tasks:
            # time.sleep(random.random())
            task['to_download_timestamp'] = time.time() + task['retry_delay']
            TaskManager.push_delay_queue(task)
Example #13
    def test_retry_delay(self):
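        # Verify that a retried task is not handed back to the parser sooner
        # than 'retry_delay' seconds after the previous attempt.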
        delay = 1
        t = time.time() - delay

        def parser_mock(task):
            nonlocal t
            self.assertTrue(time.time() - t >= delay)
            t = time.time()
            raise RetryDownload

        task = Task.make_task({
            'url': "http://127.0.0.1:5000/retry_delay",
            'method': 'GET',
            'parser': parser_mock,
            'priority': 0,
            'retry_delay': delay,
            'retry': 3
        })

        self.iq.put(task)

        self.crawler.start()
Example #14
def main(start, end, kw):
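    # Entry point for the Guokr script: seed the list task, load known article
    # ids for deduplication, crawl, then build and optionally mail the book.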
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q, MAIN_CONFIG.get('PARSER_WORKER', 1),
                      MAIN_CONFIG.get('DOWNLOADER_WORKER', 1),
                      MAIN_CONFIG.get('RESULTER_WORKER', 1))

    default_headers = deepcopy(DEFAULT_HEADERS)
    default_headers.update({'Referer': 'http://www.guokr.com/scientific/'})
    save_path = SCRIPT_CONFIG['SAVE_PATH']
    book_name = '果壳网'
    task = Task.make_task({
        'url': API_URL.format(start),
        'method': 'GET',
        'meta': {
            'headers': default_headers,
            'verify': False
        },
        'parser': parser_list,
        'priority': 0,
        'save': {
            'cursor': start,
            'start': start,
            'end': end,
            'kw': kw,
            'save_path': SCRIPT_CONFIG['SAVE_PATH'],
        },
        'retry': 10,
        'retry_delay': 10
    })
    iq.put(task)
    # Init DB
    with ArticleDB(save_path, VERSION=0) as db:
        _ = db.select_all_article_id()
    if _:
        for each in _:
            ARTICLE_ID_SET.add(each[0])

    crawler.start()

    items = []
    with ArticleDB(save_path) as db:
        items.extend(db.select_article())
        db.insert_meta_data(['BOOK_NAME', book_name])
        db.increase_version()
        db.reset()

    if items:
        new = True
        with HTML2Kindle(items, save_path, book_name,
                         MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
            html2kindle.make_metadata(window=kw.get('window', 50))
            html2kindle.make_book_multi(save_path)
    else:
        LOG.log_it('无新项目', 'INFO')
        new = False

    if new and kw.get('email'):
        with SendEmail2Kindle() as s:
            s.send_all_mobi(SCRIPT_CONFIG['SAVE_PATH'])
Example #15
def parser_content(task):
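    # Parse an article page ('.document'): drop copyright blocks, rewrite
    # image links to local paths, center images, and queue image downloads.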
    response = task['response']
    if not response:
        LOG.log_it("Not Response", 'WARN')
        raise RetryDownload

    new_tasks = []
    download_img_list = []
    items = []
    soup = BeautifulSoup(response.text, 'lxml')

    content_select = soup.select('.document')
    # Strip useless copyright blocks from the end of each page
    if content_select:
        for to_del in soup.select('.copyright'):
            to_del.decompose()

    content = str(content_select)
    # bs4 automatically wraps the fragment in <html><body> tags; unwrap it
    content = re.sub('<html><body>(.*?)</body></html>',
                     lambda x: x.group(1),
                     content,
                     flags=re.S)
    download_img_list.extend(re.findall('src="(http.*?)"', content))
    # Rewrite image links to local relative paths
    content = re.sub('src="(.*?)"', convert_link, content)

    # Strip the surrounding "[]" left by str() on the select() result list
    content = content[1:-1]

    title = task['save']['title']
    article_url = task['url']
    created_time = soup.select('.content-th-info span')[0].string[3:]
    author = soup.select('.content-th-info a')[0].string

    bs2 = BeautifulSoup(content, 'lxml')
    # Center images
    for tab in bs2.select('img'):
        tab.wrap(bs2.new_tag('div', style='text-align:center;'))
        tab['style'] = "display: inline-block;"

        # Drop GIFs when the 'gif' option is disabled
        if task['save']['kw']['gif'] is False:
            if 'gif' in tab['src']:
                tab.decompose()
                continue

    content = str(bs2)

    items.append([
        md5string(article_url), title, content, created_time, '', author,
        int(time.time() * 100000)
    ])

    if task['save']['kw'].get('img', True):
        img_header = deepcopy(DEFAULT_HEADERS)
        img_header.update({'Referer': response.url})
        for img_url in download_img_list:
            new_tasks.append(
                Task.make_task({
                    'url': img_url,
                    'method': 'GET',
                    'meta': {
                        'headers': img_header,
                        'verify': False
                    },
                    'parser': parser_downloader_img,
                    'resulter': resulter_downloader_img,
                    'priority': 2,
                    'save': task['save']
                }))
    task.update({'parsed_data': items})
    return task, new_tasks
Example #16
def main(zhuanti_list, start, end, kw):
    """start默认1;end为结束页数,每页9个"""
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q,
                      MAIN_CONFIG.get('PARSER_WORKER', 1),
                      MAIN_CONFIG.get('DOWNLOADER_WORKER', 1),
                      MAIN_CONFIG.get('RESULTER_WORKER', 1))

    start = int(start)
    end = int(end)

    for zhuanti in zhuanti_list:
        new_header = deepcopy(DEFAULT_HEADERS)
        new_header.update({'Referer': BASE_URL.format(zhuanti)})

        # Use the topic id as the sub-directory name
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'], str(zhuanti))

        if kw.get('order_by') == 'comment':
            order_by = ORDER_COMMENT
        elif kw.get('order_by') == 'add':
            order_by = ORDER_ADD
        elif kw.get('order_by') == 'top':
            order_by = ORDER_TOP
        else:
            # Default to 'add' ordering
            order_by = ORDER_ADD

        task = Task.make_task({
            'url': API_URL.format(zhuanti, order_by, start),
            'method': 'GET',
            'meta': {'headers': new_header, 'verify': False},
            'parser': parser_list,
            'priority': 0,
            'save': {'cursor': start,
                     'save_path': save_path,
                     'start': start,
                     'end': end,
                     'kw': kw,
                     'name': zhuanti,
                     'order_by': order_by},
            'retry': 10,
            'retry_delay': 10
        })

        iq.put(task)

        # Init DB
        with ArticleDB(save_path, VERSION=0) as db:
            _ = db.select_all_article_id()

        # Deduplicate against article ids already in the database
        if _:
            for each in _:
                ARTICLE_ID_SET.add(each[0])

    # Start the crawler
    crawler.start()

    # Build the e-books
    for zhuanti in zhuanti_list:
        items = []
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'], str(zhuanti))
        with ArticleDB(save_path, VERSION=0) as db:
            # Load all articles
            items.extend(db.select_article())
            # Get the topic name from the database
            book_name = db.select_meta('BOOK_NAME')
            # Bump the database version
            db.increase_version()
            # Database clean-up
            db.reset()

        if items:
            with HTML2Kindle(items, save_path, book_name, MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
                html2kindle.make_metadata(window=kw.get('window', 50))
                html2kindle.make_book_multi(save_path)

            if kw.get('email'):
                with SendEmail2Kindle() as s:
                    s.send_all_mobi(save_path)
        else:
            LOG.log_it('无新项目', 'INFO')
Example #17
def parser_collection(task):
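    # Parse one page of a Zhihu collection: extract each unseen answer, queue
    # the next page if nothing was skipped, and queue image download tasks.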
    to_next = True
    response = task['response']
    if not response:
        raise RetryDownload

    text = response.text
    bs = BeautifulSoup(text, 'lxml')
    download_img_list = []
    new_tasks = []
    items = []

    # Get the current page number from the URL (default to 1)
    now_page_num = int(re.search(r'page=(\d*)$',
                                 response.url).group(1)) if re.search(
                                     r'page=(\d*)$', response.url) else 1

    if not bs.select('.zm-item'):
        LOG.log_it("无法获取收藏列表(如一直出现,而且浏览器能正常访问知乎,可能是知乎代码升级,请通知开发者。)", 'WARN')
        raise RetryDownload

    collection_name = bs.select(
        '.zm-item-title')[0].string.strip() + ' 第{}页'.format(now_page_num)
    LOG.log_it("获取收藏夹[{}]".format(collection_name), 'INFO')

    book_name = bs.select('#zh-fav-head-title')[0].string.strip() if bs.select(
        '#zh-fav-head-title') else task['save']['name']
    for i in bs.select('.zm-item'):
        article_url = i.select('.zm-item-fav a.toggle-expand')[0].attrs['href']
        article_id = md5string(article_url)

        # Skip articles already in the database and stop paging further
        if article_id not in ARTICLE_ID_SET:
            author_name = i.select(
                '.answer-head a.author-link')[0].string if i.select(
                    '.answer-head a.author-link') else '匿名'
            title = i.select('.zm-item-title a')[0].string if i.select(
                '.zm-item-title a') else ''

            voteup_count = i.select(
                'a.zm-item-vote-count')[0].string if i.select(
                    'a.zm-item-vote-count') else ''
            created_time = i.select('p.visible-expanded a')[0].string.replace(
                '发布于 ', '') if i.select('p.visible-expanded a') else ''
            content = i.select('.content')[0].string if i.select(
                '.content') else ''

            _, content = format_zhihu_content(content, task)
            download_img_list.extend(_)

            items.append([
                article_id, title, content, created_time, voteup_count,
                author_name,
                int(time.time() * 100000)
            ])
        else:
            to_next = False

    # Fetch the next page
    if to_next and now_page_num < task['save']['end']:
        next_page = bs.select('.zm-invite-pager span a')
        # If there is a '下一页' (next page) link
        if next_page and next_page[-1].string == '下一页':
            next_page = re.sub(r'\?page=\d+', '',
                               task['url']) + next_page[-1]['href']
            new_tasks.append(
                Task.make_task({
                    'url': next_page,
                    'method': 'GET',
                    'priority': 0,
                    'save': task['save'],
                    'meta': task['meta'],
                    'parser': parser_collection,
                    'resulter': resulter_collection,
                }))

    # Download images
    if task['save']['kw'].get('img', True):
        img_header = deepcopy(DEFAULT_HEADERS)
        img_header.update({'Referer': response.url})
        for img_url in download_img_list:
            new_tasks.append(
                Task.make_task({
                    'url': img_url,
                    'method': 'GET',
                    'meta': {
                        'headers': img_header,
                        'verify': False
                    },
                    'parser': parser_downloader_img,
                    'resulter': resulter_downloader_img,
                    'priority': 5,
                    'save': task['save']
                }))

    if items:
        task.update({'parsed_data': items})
        task['save'].update({'book_name': book_name})
        return task, new_tasks
    else:
        return None, new_tasks
Example #18
def main(zhuanlan_name_list, start, end, kw):
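    # Entry point for the Zhihu column script: one list task per column, then
    # one Kindle book per column built from the crawled posts.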
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q)

    for zhuanlan_name in zhuanlan_name_list:
        new_header = deepcopy(DEFAULT_HEADERS)
        new_header.update(
            {'Referer': 'https://zhuanlan.zhihu.com/{}'.format(zhuanlan_name)})
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 str(zhuanlan_name))

        task = Task.make_task({
            'url': 'https://zhuanlan.zhihu.com/api/columns/{}/posts?limit=20&offset={}'.format(
                zhuanlan_name, start),
            'method': 'GET',
            'meta': {
                'headers': new_header,
                'verify': False
            },
            'parser': parser_list,
            'priority': 0,
            'save': {
                'cursor': start,
                'save_path': save_path,
                'start': start,
                'end': end,
                'kw': kw,
                'name': zhuanlan_name
            },
            'retry': 3,
        })

        iq.put(task)
        # Init DB
        with ArticleDB(save_path, VERSION=0) as db:
            pass

    crawler.start()
    for zhuanlan_name in zhuanlan_name_list:
        items = []
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 str(zhuanlan_name))
        with ArticleDB(save_path, VERSION=0) as db:
            db.insert_meta_data(['BOOK_NAME', zhuanlan_name])
            items.extend(db.select_article())
            db.increase_version()

        with HTML2Kindle(items, save_path, zhuanlan_name,
                         MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
            html2kindle.make_metadata(window=kw.get('window', 50))
            html2kindle.make_book_multi(save_path)

    if kw.get('email'):
        for zhuanlan_name in zhuanlan_name_list:
            with SendEmail2Kindle() as s:
                s.send_all_mobi(
                    os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 str(zhuanlan_name)))

    os._exit(0)
Example #19
def parser_list(task):
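    # Parse a Jianshu topic list page: record the book name, queue a content
    # task per unseen article, and queue the next page while pages remain.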
    response = task['response']
    new_tasks = []
    to_next = True

    if not response:
        raise RetryDownload

    try:
        text = response.text
        bs = BeautifulSoup(text, 'lxml')
    except Exception as e:
        LOG.log_it('解析网页出错(如一直出现,而且浏览器能正常访问,可能是网站网站代码升级,请通知开发者。)ERRINFO:{}'
                   .format(str(e)), 'WARN')
        raise RetryDownload

    book_name = bs.title.string if bs.title else task['save']['name']

    # Store the book (collection) name
    with ArticleDB(task['save']['save_path']) as article_db:
        article_db.insert_meta_data(['BOOK_NAME', format_file_name('简书专题_' + book_name)], update=False)

    # Reverse the order
    items = bs.select('a.title')
    items.reverse()

    for item in items:
        # Skip articles already in the database
        url = 'https://www.jianshu.com' + item.attrs['href']
        if md5string(url) in ARTICLE_ID_SET:
            to_next = False
            continue

        try:
            title = item.string
        except Exception:
            LOG.log_it('解析标题出错(如一直出现,而且浏览器能正常访问,可能是网站网站代码升级,请通知开发者。)', 'WARN')
            raise RetryDownload

        new_task = Task.make_task({
            'url': url,
            'method': 'GET',
            'meta': task['meta'],
            'parser': parser_content,
            'resulter': resulter_content,
            'priority': 5,
            'save': task['save'],
            'title': title,
        })
        new_tasks.append(new_task)

    # Next page
    if to_next and len(items) != 0:
        if task['save']['cursor'] < task['save']['end']:
            next_page_task = deepcopy(task)
            next_page_task.update(
                {'url': API_URL.format(task['save']['name'], task['save']['order_by'], task['save']['cursor'] + 1)})
            next_page_task['save'].update({'cursor': next_page_task['save']['cursor'] + 1})
            new_tasks.append(next_page_task)

    return None, new_tasks
Example #20
def main(collection_num_list, start, end, kw):
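    # Entry point for the Zhihu collection script: one task per collection id,
    # then one Kindle book per collection built from the crawled answers.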
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q, MAIN_CONFIG.get('PARSER_WORKER', 1),
                      MAIN_CONFIG.get('DOWNLOADER_WORKER', 1),
                      MAIN_CONFIG.get('RESULTER_WORKER', 1))
    new = True

    for collection_num in collection_num_list:
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 str(collection_num))

        task = Task.make_task({
            'url': 'https://www.zhihu.com/collection/{}?page={}'.format(
                collection_num, start),
            'method': 'GET',
            'meta': {
                'headers': DEFAULT_HEADERS,
                'verify': False
            },
            'parser': parser_collection,
            'resulter': resulter_collection,
            'priority': 0,
            'retry': 10,
            'save': {
                'start': start,
                'end': end,
                'kw': kw,
                'save_path': save_path,
                'name': collection_num,
            },
            'retry_delay': 10
        })
        iq.put(task)
        # Init DB
        with ArticleDB(save_path, VERSION=0) as db:
            _ = db.select_all_article_id()
        if _:
            for each in _:
                ARTICLE_ID_SET.add(each[0])

    crawler.start()
    for collection_num in collection_num_list:
        items = []
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 str(collection_num))
        with ArticleDB(save_path) as db:
            items.extend(db.select_article())
            book_name = db.select_meta('BOOK_NAME')
            db.increase_version()
            db.reset()

        if items:
            new = True
            with HTML2Kindle(items, save_path, book_name,
                             MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
                html2kindle.make_metadata(window=kw.get('window', 50))
                html2kindle.make_book_multi(save_path)
        else:
            LOG.log_it('无新项目', 'INFO')
            new = False

    if new and kw.get('email'):
        for collection_num in collection_num_list:
            save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                     str(collection_num))
            with SendEmail2Kindle() as s:
                s.send_all_mobi(save_path)
Example #21
def parser_content(task):
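    # Parse a Zhihu column post: extract body, author, votes and date, clean
    # and localize the HTML, and queue download tasks for embedded images.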
    title = task['title']
    download_img_list = []
    new_tasks = []

    response = task['response']
    if not response:
        raise RetryDownload

    bs = BeautifulSoup(response.text, 'lxml')

    content_tab = bs.select('.PostIndex-content')
    if content_tab:
        content = str(content_tab[0])
    else:
        LOG.log_it("不能找到文章的内容。(如一直出现,而且浏览器能正常访问知乎,可能是知乎代码升级,请通知开发者。)", 'WARN')
        raise RetryDownload

    author_name = bs.select('.PostIndex-authorName')[0].string if bs.select(
        '.PostIndex-authorName') else ''

    voteup_count = re.search(
        r'likesCount&quot;:(\d+),', response.text).group(1) if re.search(
            r'likesCount&quot;:(\d+),', response.text) else ''

    created_time = str(
        bs.select('.PostIndex-header .HoverTitle')[1]['data-hover-title']
    ) if len(bs.select('.PostIndex-header .HoverTitle')) == 2 else ''
    article_url = task['url']

    bs = BeautifulSoup(content, 'lxml')
    for tab in bs.select('img[src^="data"]'):
        # Drop useless data-URI <img> placeholders
        tab.decompose()

    # Center images
    for tab in bs.select('img'):
        if 'equation' not in tab['src']:
            tab.wrap(bs.new_tag('div', style='text-align:center;'))
            tab['style'] = "display: inline-block;"

        # Drop GIFs when the 'gif' option is disabled
        if task['save']['kw']['gif'] is False:
            if 'gif' in tab['src']:
                tab.decompose()
                continue

    content = str(bs)
    # bs4 automatically wraps the fragment in <html><body> tags; unwrap it
    content = re.sub('<html><body>(.*?)</body></html>',
                     lambda x: x.group(1),
                     content,
                     flags=re.S)

    # Formula URL conversion (Zhihu changed the address again)
    # content = content.replace('//www.zhihu.com', 'http://www.zhihu.com')

    download_img_list.extend(re.findall('src="(http.*?)"', content))

    # Rewrite image links to local relative paths
    content = re.sub('src="(.*?)"', convert_link, content)

    # Unwrap Zhihu's outbound link redirects
    content = re.sub(r'//link.zhihu.com/\?target=(.*?)"',
                     lambda x: unquote(x.group(1)), content)
    content = re.sub('<noscript>(.*?)</noscript>',
                     lambda x: x.group(1),
                     content,
                     flags=re.S)

    item = [
        md5string(article_url), title, content, created_time, voteup_count,
        author_name,
        int(time.time() * 100000)
    ]

    if task['save']['kw'].get('img', True):
        img_header = deepcopy(DEFAULT_HEADERS)
        img_header.update({'Referer': response.url})
        for img_url in download_img_list:
            new_tasks.append(
                Task.make_task({
                    'url': img_url,
                    'method': 'GET',
                    'meta': {
                        'headers': img_header,
                        'verify': False
                    },
                    'parser': parser_downloader_img,
                    'resulter': resulter_downloader_img,
                    'save': task['save'],
                    'priority': 10,
                }))

    task.update({"parsed_data": item})
    return task, new_tasks
Example #22
def main(start, end, kw):
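    # Entry point for the Zhihu Daily script: crawl either today's feed or a
    # date range, then build and optionally mail the Kindle book.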
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q, MAIN_CONFIG.get('PARSER_WORKER', 1),
                      MAIN_CONFIG.get('DOWNLOADER_WORKER', 1),
                      MAIN_CONFIG.get('RESULTER_WORKER', 1))

    new_header = deepcopy(DEFAULT_HEADERS)

    global IS_TODAY_URL
    if start is None:
        IS_TODAY_URL = True
        save_path = os.path.join(
            SCRIPT_CONFIG['SAVE_PATH'],
            'zhihu_daily_' + get_datetime_string('%Y%m%d'))
        book_name = '知乎日报_' + get_datetime_string('%Y%m%d')
    else:
        if end is None:
            end = datetime.datetime.fromtimestamp(
                time.time()).strftime('%Y%m%d')

        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 'zhihu_daily_{}_{}'.format(start, end))
        book_name = '知乎日报_{}_{}'.format(start, end)
        IS_TODAY_URL = False

    url = TODAY_URL if IS_TODAY_URL else YESTERDAY_URL.format(start)

    task = Task.make_task({
        'url': url,
        'method': 'GET',
        'meta': {
            'headers': new_header,
            'verify': False
        },
        'parser': parser_list,
        'priority': 0,
        'save': {
            'cursor': start,
            'save_path': save_path,
            'start': start,
            'end': end,
            'kw': kw
        },
        'retry': 99,
        'retry_delay': 10
    })

    iq.put(task)

    # Init DB
    with ArticleDB(save_path, VERSION=0) as db:
        _ = db.select_all_article_id()
    if _:
        for each in _:
            ARTICLE_ID_SET.add(each[0])

    crawler.start()

    items = []
    with ArticleDB(save_path, VERSION=0) as db:
        db.insert_meta_data(['BOOK_NAME', book_name])
        items.extend(db.select_article())
        db.increase_version()
        db.reset()

    if items:
        new = True
        with HTML2Kindle(items, save_path, book_name,
                         MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
            html2kindle.make_metadata(window=kw.get('window', 50))
            html2kindle.make_book_multi(save_path)
    else:
        LOG.log_it('无新项目', 'INFO')
        new = False

    if new and kw.get('email'):
        with SendEmail2Kindle() as s:
            s.send_all_mobi(os.path.join(save_path))
Example #23
def main(start, end, kw):
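    # Entry point for the Qdaily script: pick the channel API, convert the
    # date range to timestamps, crawl, then build the Kindle book.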
    # start and end are date strings such as '2017-12-11' (split on '-')
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q)
    try:
        start_l = [int(_) for _ in start.split('-')]
        end_l = [int(_) for _ in end.split('-')]
        start_t = int(
            datetime.datetime(start_l[0], start_l[1],
                              start_l[2]).timestamp()) + 60 * 60 * 24
        end_t = int(
            datetime.datetime(end_l[0], end_l[1], end_l[2]).timestamp())
    except Exception:
        LOG.log_it('日期格式错误', 'WARN')
        traceback.print_exc()
        return

    global API_URL
    if 'type' in kw:
        if kw['type'] == 'business':
            API_URL = API_BUSINESS
        elif kw['type'] == 'intelligent':
            API_URL = API_INTELLIGENT
        elif kw['type'] == 'design':
            API_URL = API_DESIGN
        elif kw['type'] == 'fashion':
            API_URL = API_FASHION
        elif kw['type'] == 'entertainment':
            API_URL = API_ENTERTAINMENT
        elif kw['type'] == 'city':
            API_URL = API_CITY
        elif kw['type'] == 'game':
            API_URL = API_GAME
        elif kw['type'] == 'long':
            API_URL = API_LONG
        elif kw['type'] == 'home':
            pass
    else:
        kw.update({'type': 'home'})

    new_header = deepcopy(SCRIPT_CONFIG.get('DEFAULT_HEADERS'))
    new_header.update({'Referer': 'https://www.qdaily.com/'})
    save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                             'qdaily_{}'.format(kw['type']))
    book_name = '好奇心日报_{}_{}_{}'.format(kw['type'], start, end)
    task = Task.make_task({
        'url': API_URL.format(start_t),
        'method': 'GET',
        'meta': {
            'headers': new_header,
            'verify': False
        },
        'parser': parser_list,
        'priority': 0,
        'save': {
            'cursor': start_t,
            'save_path': save_path,
            'start': start_t,
            'end': end_t,
            'kw': kw,
            'page': 1,
            'name': book_name,
        },
        'retry': 3,
    })
    iq.put(task)
    # Init DB
    with ArticleDB(save_path, VERSION=0) as db:
        pass

    crawler.start()

    items = []
    with ArticleDB(save_path) as db:
        items.extend(db.select_article())
        db.insert_meta_data(['BOOK_NAME', book_name])
        db.increase_version()

    with HTML2Kindle(items, save_path, book_name,
                     MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
        html2kindle.make_metadata(window=kw.get('window', 50))
        html2kindle.make_book_multi(save_path)

    if kw.get('email'):
        with SendEmail2Kindle() as s:
            s.send_all_mobi(save_path)
    os._exit(0)
Example #24
def parser_content(task):
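    # Parse an article page: extract '.article-detail-bd > .detail', drop
    # unwanted images, localize image links, and queue image download tasks.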
    title = task['title']
    items = []
    download_img_list = []
    new_tasks = []

    response = task['response']
    if not response:
        raise RetryDownload

    response.encoding = 'utf-8'
    bs = BeautifulSoup(response.text, 'lxml')

    content_tab = bs.select('.article-detail-bd > .detail')
    if content_tab:
        content = str(content_tab[0])
    else:
        LOG.log_it("不能找到文章的内容。(如一直出现,而且浏览器能正常访问,可能是代码升级,请通知开发者。)", 'WARN')
        raise RetryDownload

    author_name = '未知'
    voteup_count = task['voteup_count']
    created_time = task['created_time']
    article_url = task['url']

    bs = BeautifulSoup(content, 'lxml')

    # Center images (dropping unwanted ones first)
    for tab in bs.select('img'):
        if len(tab.attrs['class']) != 1:
            tab.decompose()
            continue

        # Drop GIFs when the 'gif' option is disabled
        if task['save']['kw']['gif'] is False:
            if 'gif' in tab['data-src']:
                tab.decompose()
                continue

        tab.wrap(bs.new_tag('div', style='text-align:center;'))
        tab['style'] = "display: inline-block;"

    content = str(bs)
    # bs4 automatically wraps the fragment in <html><body> tags; unwrap it
    content = re.sub('<html><body>(.*?)</body></html>',
                     lambda x: x.group(1),
                     content,
                     flags=re.S)

    download_img_list.extend(re.findall('src="(http.*?)"', content))

    # Rewrite image links to local relative paths
    content = re.sub('src="(.*?)"', convert_link, content)
    content = content.replace('data-src', 'src')

    items.append([
        md5string(article_url), title, content, created_time, voteup_count,
        author_name,
        int(time.time() * 100000)
    ])

    if task['save']['kw'].get('img', True):
        img_header = deepcopy(SCRIPT_CONFIG.get('DEFAULT_HEADERS'))
        img_header.update({'Referer': response.url})
        for img_url in download_img_list:
            new_tasks.append(
                Task.make_task({
                    'url': img_url,
                    'method': 'GET',
                    'meta': {
                        'headers': img_header,
                        'verify': False
                    },
                    'parser': parser_downloader_img,
                    'resulter': resulter_downloader_img,
                    'save': task['save'],
                    'priority': 10,
                }))

    task.update({'parsed_data': items})
    return task, new_tasks
Example #25
def parser_list(task):
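    # Parse the Qdaily feed API: queue the next page while the feed's
    # 'last_key' has not passed the end bound, and a content task per article item.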
    response = task['response']
    new_tasks = []
    opf = []

    if not response:
        raise RetryDownload

    try:
        data = response.json()
    except Exception as e:
        LOG.log_it(
            '解析JSON出错(如一直出现,而且浏览器能正常访问,可能是代码升级,请通知开发者。)ERRINFO:{}'.format(
                str(e)), 'WARN')
        raise RetryDownload

    try:
        # Next page
        if len(data['data']) != 0:
            if data['data']['last_key'] > task['save']['end'] - 144209:
                next_page_task = deepcopy(task)
                next_page_task.update(
                    {'url': API_URL.format(data['data']['last_key'])})
                next_page_task['save'].update({
                    'cursor':
                    data['data']['last_key'],
                    'page':
                    task['save']['page'] + 1
                })
                new_tasks.append(next_page_task)
        else:
            LOG.log_it('不能读取专栏列表。(如一直出现,而且浏览器能正常访问,可能是代码升级,请通知开发者。)', 'WARN')
            raise RetryDownload

        for item in data['data']['feeds']:
            if item['datatype'] == 'article':
                item = item['post']
                # Over-long titles break mobi generation; truncate to 55 chars
                title = item['title']
                if len(title) > 55:
                    title = title[:52] + '...'
                opf.append({'href': format_file_name(title, '.html')})
                new_task = Task.make_task({
                    'url': 'https://www.qdaily.com/articles/{}.html'.format(
                        str(item['id'])),
                    'method': 'GET',
                    'meta': task['meta'],
                    'parser': parser_content,
                    'resulter': resulter_content,
                    'priority': 5,
                    'save': task['save'],
                    'title': item['title'],
                    'created_time': item['publish_time'],
                    'voteup_count': item['praise_count']
                })
                new_tasks.append(new_task)
    except KeyError:
        LOG.log_it('JSON KEY出错(如一直出现,而且浏览器能正常访问,可能是网站代码升级,请通知开发者。)', 'WARN')
        raise RetryDownload
    return None, new_tasks
Example #26
def main(zhuanlan_name_list, start, end, kw):
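    # Entry point for the Zhihu column script: seed one list task per column,
    # dedupe against stored article ids, crawl, then build and optionally mail the books.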
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q, MAIN_CONFIG.get('PARSER_WORKER', 1),
                      MAIN_CONFIG.get('DOWNLOADER_WORKER', 1),
                      MAIN_CONFIG.get('RESULTER_WORKER', 1))
    new = True

    for zhuanlan_name in zhuanlan_name_list:
        new_header = deepcopy(DEFAULT_HEADERS)
        new_header.update(
            {'Referer': 'https://zhuanlan.zhihu.com/{}'.format(zhuanlan_name)})
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 str(zhuanlan_name))

        task = Task.make_task({
            'url': 'https://zhuanlan.zhihu.com/api/columns/{}/posts?limit=20&offset={}'.format(
                zhuanlan_name, start),
            'method': 'GET',
            'meta': {
                'headers': new_header,
                'verify': False
            },
            'parser': parser_list,
            'priority': 0,
            'save': {
                'cursor': start,
                'save_path': save_path,
                'start': start,
                'end': end,
                'kw': kw,
                'name': zhuanlan_name
            },
            'retry': 10,
            'retry_delay': 10
        })

        iq.put(task)
        # Init DB
        with ArticleDB(save_path, VERSION=0) as db:
            _ = db.select_all_article_id()
        if _:
            for each in _:
                ARTICLE_ID_SET.add(each[0])

    crawler.start()
    for zhuanlan_name in zhuanlan_name_list:
        items = []
        book_name = '知乎专栏_{}'.format(zhuanlan_name)
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 str(zhuanlan_name))
        with ArticleDB(save_path, VERSION=0) as db:
            db.insert_meta_data(['BOOK_NAME', zhuanlan_name])
            items.extend(db.select_article())
            db.increase_version()
            db.reset()

        if items:
            new = True
            with HTML2Kindle(items, save_path, book_name,
                             MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
                html2kindle.make_metadata(window=kw.get('window', 50))
                html2kindle.make_book_multi(save_path)
        else:
            LOG.log_it('无新项目', 'INFO')
            new = False

    if new and kw.get('email'):
        for zhuanlan_name in zhuanlan_name_list:
            with SendEmail2Kindle() as s:
                s.send_all_mobi(
                    os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 str(zhuanlan_name)))