Example #1
def parser_content(task):
    title = task['title']
    new_tasks = []

    response = task['response']
    if not response:
        raise RetryDownload

    bs = BeautifulSoup(response.text, 'lxml')

    content_tab = bs.select('.PostIndex-content')
    if content_tab:
        content = str(content_tab[0])
    else:
        LOG.log_it("Cannot find the article content. (If this keeps appearing and Zhihu still opens normally in a browser, Zhihu may have updated its code; please notify the developer.)", 'WARN')
        raise RetryDownload

    author_name = bs.select('.PostIndex-authorName')[0].string if bs.select(
        '.PostIndex-authorName') else ''
    voteup_count = re.search(
        r'likesCount":(\d+),', response.text).group(1) if re.search(
            r'likesCount":(\d+),', response.text) else ''
    created_time = str(
        bs.select('.PostIndex-header .HoverTitle')[1]['data-hover-title']
    ) if len(bs.select('.PostIndex-header .HoverTitle')) == 2 else ''
    article_url = task['url']

    download_img_list, content = format_zhihu_content(content, task)

    item = [
        md5string(article_url), title, content, created_time, voteup_count,
        author_name,
        int(time.time() * 100000)
    ]

    if task['save']['kw'].get('img', True):
        img_header = deepcopy(DEFAULT_HEADERS)
        img_header.update({'Referer': response.url})
        for img_url in download_img_list:
            new_tasks.append(
                Task.make_task({
                    'url': img_url,
                    'method': 'GET',
                    'meta': {
                        'headers': img_header,
                        'verify': False
                    },
                    'parser': parser_downloader_img,
                    'resulter': resulter_downloader_img,
                    'save': task['save'],
                    'priority': 10,
                }))

    task.update({"parsed_data": item})
    return task, new_tasks
Example #2
def parser_content(task):
    title = task['title']
    new_tasks = []

    response = task['response']
    if not response:
        raise RetryDownload

    bs = BeautifulSoup(response.text, 'lxml')

    content_tab = bs.select('.show-content')
    if content_tab:
        content = str(content_tab[0])
    else:
        LOG.log_it("Cannot find the article content. (If this keeps appearing and the site still opens normally in a browser, the site may have updated its code; please notify the developer.)", 'WARN')
        raise RetryDownload

    author_name = bs.select('.post .author .name a')[0].string if bs.select(
        '.post .author .name a') else ''
    voteup_count = bs.select(
        '.post .author .meta .likes-count')[0].string if bs.select(
            '.post .author .meta .likes-count') else ''
    created_time = bs.select(
        '.post .author .meta .publish-time')[0].string if bs.select(
            '.post .author .meta .publish-time') else ''
    article_url = task['url']

    download_img_list, content = format_content(content, task)

    item = [
        md5string(article_url), title, content, created_time, voteup_count,
        author_name,
        int(time.time() * 100000)
    ]

    if task['save']['kw'].get('img', True):
        img_header = deepcopy(DEFAULT_HEADERS)
        img_header.update({'Referer': response.url})
        for img_url in download_img_list:
            new_tasks.append(
                Task.make_task({
                    'url': img_url,
                    'method': 'GET',
                    'meta': {
                        'headers': img_header,
                        'verify': False
                    },
                    'parser': parser_downloader_img,
                    'resulter': resulter_downloader_img,
                    'save': task['save'],
                    'priority': 10,
                }))

    task.update({"parsed_data": item})
    return task, new_tasks
Example #3
def resulter_downloader_img(task):
    if 'www.zhihu.com/equation' not in task['url']:
        write(os.path.join(task['save']['save_path'], 'static'),
              urlparse(task['response'].url).path[1:],
              task['response'].content,
              mode='wb')
    else:
        write(os.path.join(task['save']['save_path'], 'static'),
              md5string(task['url']) + '.svg',
              task['response'].content,
              mode='wb')
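
The write helper used above does not appear in this listing. Below is a minimal sketch of what it presumably does, inferred from the call sites (create the target folder if needed, then write the bytes under it); this is an assumption for illustration, not the project's actual implementation.

import os

def write(folder, filename, content, mode='wb'):
    # Hypothetical reconstruction inferred from the calls above:
    # make sure <folder> exists, then dump the bytes to <folder>/<filename>.
    os.makedirs(folder, exist_ok=True)
    with open(os.path.join(folder, filename), mode) as f:
        f.write(content)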
Example #4
def convert_link(x):
    if 'www.zhihu.com/equation' not in x.group(1):
        return 'src="./static/{}"'.format(urlparse(x.group(1)).path[1:])
    # Saving of SVG equations
    else:
        url = x.group(1)
        if url.startswith('//'):
            url = 'http:' + url
        else:
            url = 'http://' + url
        a = 'src="./static/{}.svg"'.format(md5string(url))
        return a
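
convert_link is used as a re.sub callback over the article HTML (Examples 13, 14 and 16 do exactly that). The self-contained illustration below repeats the function and swaps in a hypothetical md5string helper so the rewrite can be run on its own; the image URLs are made up for demonstration.

import hashlib
import re
from urllib.parse import urlparse

def md5string(s):
    # Hypothetical stand-in for the project's md5string helper.
    return hashlib.md5(s.encode('utf-8')).hexdigest()

def convert_link(x):
    if 'www.zhihu.com/equation' not in x.group(1):
        return 'src="./static/{}"'.format(urlparse(x.group(1)).path[1:])
    url = x.group(1)
    if url.startswith('//'):
        url = 'http:' + url
    else:
        url = 'http://' + url
    return 'src="./static/{}.svg"'.format(md5string(url))

html = ('<img src="https://pic1.zhimg.com/v2-abc.jpg">'
        '<img src="//www.zhihu.com/equation?tex=x%5E2">')
print(re.sub('src="(.*?)"', convert_link, html))
# Ordinary images now point at ./static/<filename>; equations at ./static/<md5>.svg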
Example #5
def parser_content(task):
    title = task['title']
    new_tasks = []

    response = task['response']
    if not response:
        raise RetryDownload

    try:
        content = response.json()['body']
    except Exception as e:
        LOG.log_it(
            'Error parsing JSON. (If this keeps appearing and the site still opens normally in a browser, '
            'the site may have updated its code; please notify the developer.) ERRINFO:{}'.format(str(e)), 'WARN')
        raise RetryDownload

    bs = BeautifulSoup(content, 'lxml')
    content = str(bs.select('div.content')[0])

    author_name = bs.select('.author')[0].string if bs.select(
        '.author') else ''
    voteup_count = ''
    created_time = ''
    article_url = task['url']

    download_img_list, content = format_zhihu_content(content, task)

    item = [
        md5string(article_url), title, content, created_time, voteup_count,
        author_name,
        int(time.time() * 100000)
    ]

    if task['save']['kw'].get('img', True):
        img_header = deepcopy(DEFAULT_HEADERS)
        img_header.update({'Referer': response.url})
        for img_url in download_img_list:
            new_tasks.append(
                Task.make_task({
                    'url': img_url,
                    'method': 'GET',
                    'meta': {
                        'headers': img_header,
                        'verify': False
                    },
                    'parser': parser_downloader_img,
                    'resulter': resulter_downloader_img,
                    'save': task['save'],
                    'priority': 10,
                }))

    task.update({"parsed_data": item})
    return task, new_tasks
Example #6
def parser_list(task):
    response = task['response']
    new_tasks = []
    to_next = True

    if not response:
        raise RetryDownload

    try:
        data = response.json()['stories']
    except Exception as e:
        LOG.log_it(
            'Error parsing JSON. (If this keeps appearing and the site still opens normally in a browser, '
            'the site may have updated its code; please notify the developer.) ERRINFO:{}'.format(str(e)), 'WARN')
        raise RetryDownload

    for item in data:
        # Skip items that are already in the database and stop paging further
        url = 'http://news-at.zhihu.com/api/4/story/' + str(item['id'])
        if md5string(url) in ARTICLE_ID_SET:
            to_next = False
            continue

        new_task = Task.make_task({
            'url': url,
            'method': 'GET',
            'meta': task['meta'],
            'parser': parser_content,
            'resulter': resulter_content,
            'priority': 5,
            'save': task['save'],
            'title': item['title'],
        })
        new_tasks.append(new_task)

    # Next page
    if not IS_TODAY_URL and to_next:
        next_datetime = get_next_datetime_string(task['save']['cursor'],
                                                 '%Y%m%d', 1)

        # The cursor date will always reach the end date eventually
        if compare_datetime_string(task['save']['end'], next_datetime,
                                   '%Y%m%d') and len(data) != 0:
            next_page_task = deepcopy(task)
            next_page_task.update({
                'url':
                re.sub(r'before/\d+', 'before/{}'.format(next_datetime),
                       next_page_task['url'])
            })
            next_page_task['save'].update({'cursor': next_datetime})
            new_tasks.append(next_page_task)

    return None, new_tasks
Example #7
def parser_list(task):
    response = task['response']
    new_tasks = []
    to_next = True

    if not response:
        raise RetryDownload

    try:
        data = response.json()
        data.reverse()
    except Exception as e:
        LOG.log_it(
            'Error parsing JSON. (If this keeps appearing and Zhihu still opens normally in a browser, '
            'Zhihu may have updated its code; please notify the developer.) ERRINFO:{}'.format(str(e)), 'WARN')
        raise RetryDownload

    for item in data:
        # Skip items that are already in the database and stop paging further
        url = 'https://zhuanlan.zhihu.com' + item['url']
        if md5string(url) in ARTICLE_ID_SET:
            to_next = False
            continue

        new_task = Task.make_task({
            'url': url,
            'method': 'GET',
            'meta': task['meta'],
            'parser': parser_content,
            'resulter': resulter_content,
            'priority': 5,
            'save': task['save'],
            'title': item['title'],
        })
        new_tasks.append(new_task)

    # Next page
    if to_next and len(data) != 0:
        if task['save']['cursor'] < task['save']['end'] - 20:
            next_page_task = deepcopy(task)
            next_page_task.update({
                'url':
                re.sub(r'offset=\d+',
                       'offset={}'.format(task['save']['cursor'] + 20),
                       next_page_task['url'])
            })
            next_page_task['save'].update(
                {'cursor': next_page_task['save']['cursor'] + 20})
            new_tasks.append(next_page_task)

    return None, new_tasks
Example #8
def parser_content(task):
    title = task['title']
    items = []
    new_tasks = []

    response = task['response']
    if not response:
        raise RetryDownload

    response.encoding = 'utf-8'
    bs = BeautifulSoup(response.text, 'lxml')

    content_tab = bs.select('.article-detail-bd > .detail')
    if content_tab:
        content = str(content_tab[0])
    else:
        LOG.log_it("Cannot find the article content. (If this keeps appearing and the site still opens normally in a browser, the site may have updated its code; please notify the developer.)", 'WARN')
        raise RetryDownload

    author_name = '未知'
    voteup_count = task['voteup_count']
    created_time = task['created_time']
    article_url = task['url']
    article_id = md5string(article_url)

    download_img_list, content = format_content(content, task)

    items.append([article_id, title, content, created_time, voteup_count, author_name, int(time.time() * 100000)])

    if task['save']['kw'].get('img', True):
        img_header = deepcopy(DEFAULT_HEADERS)
        img_header.update({'Referer': response.url})
        for img_url in download_img_list:
            new_tasks.append(Task.make_task({
                'url': img_url,
                'method': 'GET',
                'meta': {'headers': img_header, 'verify': False},
                'parser': parser_downloader_img,
                'resulter': resulter_downloader_img,
                'save': task['save'],
                'priority': 10,
            }))

    task.update({'parsed_data': items})
    return task, new_tasks
Example #9
def parser_content(task):
    response = task['response']
    if not response:
        LOG.log_it("No Response", 'WARN')
        raise RetryDownload

    new_tasks = []
    items = []
    content = response.text
    # Strip paragraph breaks and <br/> tags
    content = content.replace('</p><p>', '').replace('<br/>', '')
    soup = BeautifulSoup(content, 'lxml')

    title = task['save']['title']
    article_url = task['url']
    created_time = soup.select('.content-th-info span')[0].string[3:]
    author = soup.select('.content-th-info a')[0].string

    download_img_list, content = format_content(soup, task)
    items.append([
        md5string(article_url), title, content, created_time, '', author,
        int(time.time() * 100000)
    ])

    if task['save']['kw'].get('img', True):
        img_header = deepcopy(DEFAULT_HEADERS)
        img_header.update({'Referer': response.url})
        for img_url in download_img_list:
            new_tasks.append(
                Task.make_task({
                    'url': img_url,
                    'method': 'GET',
                    'meta': {
                        'headers': img_header,
                        'verify': False
                    },
                    'parser': parser_downloader_img,
                    'resulter': resulter_downloader_img,
                    'priority': 2,
                    'save': task['save']
                }))
    task.update({'parsed_data': items})
    return task, new_tasks
Example #10
    def make_task(params):
        if 'parser' not in params:
            # FIXME: can't raise a bare Exception here
            raise Exception("Need a parser")
        if 'method' not in params:
            raise Exception("Need a method")
        if 'url' not in params:
            raise Exception("Need a url")

        tid = md5string(params['url'] + str(params.get('data')) +
                        str(params.get('params')))
        params.setdefault('meta', {})
        params.setdefault('priority', 0)
        params.setdefault('retry', 3)
        params.setdefault('tid', tid)

        if not params['url'].startswith('http'):
            if params['url'].startswith('//'):
                params['url'] = 'http:' + params['url']
            else:
                params['url'] = 'http://' + params['url']
        return Task(**params)
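
make_task derives a deterministic tid from the URL plus any data/params, and the same md5string helper is what the list parsers use to build the article ids checked against ARTICLE_ID_SET. A minimal sketch, assuming md5string simply returns the MD5 hex digest of its UTF-8 encoded argument (the real helper is not shown in this listing):

import hashlib

def md5string(s):
    # Assumption: MD5 hex digest of the UTF-8 encoded string.
    return hashlib.md5(s.encode('utf-8')).hexdigest()

# Identical url/data/params always hash to the same task id, so an
# already-crawled article can be detected with a cheap set lookup.
url = 'https://zhuanlan.zhihu.com/p/123456'
tid = md5string(url + str(None) + str(None))
print(tid)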
Example #11
def parser_list(task):
    response = task['response']
    new_tasks = []
    to_next = True

    if not response:
        LOG.log_it("No Response", 'WARN')
        raise RetryDownload

    try:
        data = response.json()
    except Exception as e:
        LOG.log_it(
            'Error parsing JSON. (If this keeps appearing and the site still opens normally in a browser, '
            'the site may have updated its code; please notify the developer.)\nERRINFO:{}'.format(str(e)), 'WARN')
        raise RetryDownload

    try:
        for each_result in data['result']:
            url = each_result['url']
            article_id = md5string(url)
            if article_id not in ARTICLE_ID_SET:
                title = each_result['title']
                date_group = re.search(r'(.*?)T(.*?)\+',
                                       each_result['date_created'])
                date = date_group.group(1) + ' ' + date_group.group(2)

                meta = deepcopy(task['meta'])
                save = deepcopy(task['save'])
                save.update({'title': title, 'date': date})
                new_task = Task.make_task({
                    'url': url,
                    'method': 'GET',
                    'parser': parser_content,
                    'resulter': resulter_content,
                    'priority': 1,
                    'meta': meta,
                    'save': save
                })
                new_tasks.append(new_task)
            else:
                to_next = False
    except KeyError:
        LOG.log_it('Unexpected JSON key. (If this keeps appearing and the site still opens normally in a browser, the site may have updated its code; please notify the developer.)', 'WARN')
        raise RetryDownload

    # Fetch the next page
    if to_next and task['save']['cursor'] < task['save']['end'] and not len(
            data['result']) < 20:
        meta = deepcopy(task['meta'])
        save = deepcopy(task['save'])
        save['cursor'] += 20
        new_task = Task.make_task({
            'url': API_URL.format(save['cursor']),
            'method': 'GET',
            'meta': meta,
            'parser': parser_list,
            'priority': 0,
            'save': save,
            'retry': 10,
            'retry_delay': 10
        })
        new_tasks.append(new_task)

    return None, new_tasks
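
The date_created handling above splits an ISO-8601-style timestamp at the 'T' separator and the timezone offset. A tiny illustration of what the two capture groups hold, using a made-up input string:

import re

date_created = '2018-05-01T12:34:56+08:00'  # hypothetical example value
date_group = re.search(r'(.*?)T(.*?)\+', date_created)
print(date_group.group(1) + ' ' + date_group.group(2))
# -> 2018-05-01 12:34:56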
Example #12
def parser_list(task):
    response = task['response']
    new_tasks = []
    to_next = True

    if not response:
        raise RetryDownload

    try:
        text = response.text
        bs = BeautifulSoup(text, 'lxml')
    except Exception as e:
        LOG.log_it('Error parsing the page. (If this keeps appearing and the site still opens normally in a browser, '
                   'the site may have updated its code; please notify the developer.) ERRINFO:{}'.format(str(e)), 'WARN')
        raise RetryDownload

    book_name = bs.title.string if bs.title else task['save']['name']

    # Insert the collection name into the article database
    with ArticleDB(task['save']['save_path']) as article_db:
        article_db.insert_meta_data(['BOOK_NAME', format_file_name('简书专题_' + book_name)], update=False)

    # Reverse the order
    items = bs.select('a.title')
    items.reverse()

    for item in items:
        # Skip articles that are already in the database
        url = 'https://www.jianshu.com' + item.attrs['href']
        if md5string(url) in ARTICLE_ID_SET:
            to_next = False
            continue

        try:
            title = item.string
        except Exception:
            LOG.log_it('Error parsing the title. (If this keeps appearing and the site still opens normally in a browser, the site may have updated its code; please notify the developer.)', 'WARN')
            raise RetryDownload

        new_task = Task.make_task({
            'url': url,
            'method': 'GET',
            'meta': task['meta'],
            'parser': parser_content,
            'resulter': resulter_content,
            'priority': 5,
            'save': task['save'],
            'title': title,
        })
        new_tasks.append(new_task)

    # Next page
    if to_next and len(items) != 0:
        if task['save']['cursor'] < task['save']['end']:
            next_page_task = deepcopy(task)
            next_page_task.update(
                {'url': API_URL.format(task['save']['name'], task['save']['order_by'], task['save']['cursor'] + 1)})
            next_page_task['save'].update({'cursor': next_page_task['save']['cursor'] + 1})
            new_tasks.append(next_page_task)

    return None, new_tasks
Example #13
def parser_content(task):
    title = task['title']
    download_img_list = []
    new_tasks = []

    response = task['response']
    if not response:
        raise RetryDownload

    bs = BeautifulSoup(response.text, 'lxml')

    content_tab = bs.select('.PostIndex-content')
    if content_tab:
        content = str(content_tab[0])
    else:
        LOG.log_it("Cannot find the article content. (If this keeps appearing and Zhihu still opens normally in a browser, Zhihu may have updated its code; please notify the developer.)", 'WARN')
        raise RetryDownload

    author_name = bs.select('.PostIndex-authorName')[0].string if bs.select(
        '.PostIndex-authorName') else ''

    voteup_count = re.search(
        r'likesCount&quot;:(\d+),', response.text).group(1) if re.search(
            r'likesCount&quot;:(\d+),', response.text) else ''

    created_time = str(
        bs.select('.PostIndex-header .HoverTitle')[1]['data-hover-title']
    ) if len(bs.select('.PostIndex-header .HoverTitle')) == 2 else ''
    article_url = task['url']

    bs = BeautifulSoup(content, 'lxml')
    for tab in bs.select('img[src^="data"]'):
        # Remove useless img tags (inline data: URIs)
        tab.decompose()

    # Center images
    for tab in bs.select('img'):
        if 'equation' not in tab['src']:
            tab.wrap(bs.new_tag('div', style='text-align:center;'))
            tab['style'] = "display: inline-block;"

        # Drop GIFs if the gif option is off
        if task['save']['kw']['gif'] is False:
            if 'gif' in tab['src']:
                tab.decompose()
                continue

    content = str(bs)
    # bs4 automatically adds html and body tags; strip them
    content = re.sub('<html><body>(.*?)</body></html>',
                     lambda x: x.group(1),
                     content,
                     flags=re.S)

    # Equation URL conversion (Zhihu changed the address scheme yet again)
    # content = content.replace('//www.zhihu.com', 'http://www.zhihu.com')

    download_img_list.extend(re.findall('src="(http.*?)"', content))

    # Rewrite image sources to local relative paths
    content = re.sub('src="(.*?)"', convert_link, content)

    # Unwrap Zhihu's external-link redirector
    content = re.sub(r'//link.zhihu.com/\?target=(.*?)"',
                     lambda x: unquote(x.group(1)), content)
    content = re.sub('<noscript>(.*?)</noscript>',
                     lambda x: x.group(1),
                     content,
                     flags=re.S)

    item = [
        md5string(article_url), title, content, created_time, voteup_count,
        author_name,
        int(time.time() * 100000)
    ]

    if task['save']['kw'].get('img', True):
        img_header = deepcopy(DEFAULT_HEADERS)
        img_header.update({'Referer': response.url})
        for img_url in download_img_list:
            new_tasks.append(
                Task.make_task({
                    'url': img_url,
                    'method': 'GET',
                    'meta': {
                        'headers': img_header,
                        'verify': False
                    },
                    'parser': parser_downloader_img,
                    'resulter': resulter_downloader_img,
                    'save': task['save'],
                    'priority': 10,
                }))

    task.update({"parsed_data": item})
    return task, new_tasks
Example #14
def parser_content(task):
    title = task['title']
    items = []
    download_img_list = []
    new_tasks = []

    response = task['response']
    if not response:
        raise RetryDownload

    response.encoding = 'utf-8'
    bs = BeautifulSoup(response.text, 'lxml')

    content_tab = bs.select('.article-detail-bd > .detail')
    if content_tab:
        content = str(content_tab[0])
    else:
        LOG.log_it("Cannot find the article content. (If this keeps appearing and the site still opens normally in a browser, the site may have updated its code; please notify the developer.)", 'WARN')
        raise RetryDownload

    author_name = '未知'
    voteup_count = task['voteup_count']
    created_time = task['created_time']
    article_url = task['url']

    bs = BeautifulSoup(content, 'lxml')

    # Center images
    for tab in bs.select('img'):
        if len(tab.attrs['class']) != 1:
            tab.decompose()
            continue

        # Drop GIFs if the gif option is off
        if task['save']['kw']['gif'] is False:
            if 'gif' in tab['data-src']:
                tab.decompose()
                continue

        tab.wrap(bs.new_tag('div', style='text-align:center;'))
        tab['style'] = "display: inline-block;"

    content = str(bs)
    # bs4 automatically adds html and body tags; strip them
    content = re.sub('<html><body>(.*?)</body></html>',
                     lambda x: x.group(1),
                     content,
                     flags=re.S)

    download_img_list.extend(re.findall('src="(http.*?)"', content))

    # Rewrite image sources to local relative paths
    content = re.sub('src="(.*?)"', convert_link, content)
    content = content.replace('data-src', 'src')

    items.append([
        md5string(article_url), title, content, created_time, voteup_count,
        author_name,
        int(time.time() * 100000)
    ])

    if task['save']['kw'].get('img', True):
        img_header = deepcopy(SCRIPT_CONFIG.get('DEFAULT_HEADERS'))
        img_header.update({'Referer': response.url})
        for img_url in download_img_list:
            new_tasks.append(
                Task.make_task({
                    'url': img_url,
                    'method': 'GET',
                    'meta': {
                        'headers': img_header,
                        'verify': False
                    },
                    'parser': parser_downloader_img,
                    'resulter': resulter_downloader_img,
                    'save': task['save'],
                    'priority': 10,
                }))

    task.update({'parsed_data': items})
    return task, new_tasks
Example #15
def parser_list(task):
    response = task['response']
    new_tasks = []
    opf = []
    to_next = True

    if not response:
        raise RetryDownload
    try:
        data = response.json()
    except Exception as e:
        LOG.log_it('Error parsing JSON. (If this keeps appearing and the site still opens normally in a browser, '
                   'the site may have updated its code; please notify the developer.) ERRINFO:{}'.format(str(e)), 'WARN')
        raise RetryDownload

    try:
        for item in data['data']['feeds']:
            if item['datatype'] == 'article':
                article_url = 'https://www.qdaily.com/articles/{}.html'.format(str(item['post']['id']))
                article_id = md5string(article_url)
                # Skip items that are already in the database and stop paging further
                if article_id not in ARTICLE_ID_SET:
                    item = item['post']
                    # Filenames that are too long break mobi generation; truncate the title
                    title = item['title']
                    if len(title) > 55:
                        _ = 55 - len(title) - 3
                        title = title[:_] + '...'
                    opf.append({'href': format_file_name(title, '.html')})
                    new_task = Task.make_task({
                        'url': article_url,
                        'method': 'GET',
                        'meta': task['meta'],
                        'parser': parser_content,
                        'resulter': resulter_content,
                        'priority': 5,
                        'save': task['save'],
                        'title': item['title'],
                        'created_time': item['publish_time'],
                        'voteup_count': item['praise_count']
                    })
                    new_tasks.append(new_task)
                else:
                    to_next = False

        # Next page
        if to_next:
            if len(data['data']) != 0:
                if data['data']['last_key'] > task['save']['end']:
                    next_page_task = deepcopy(task)
                    next_page_task.update(
                        {'url': API_URL.format(data['data']['last_key'])})
                    next_page_task['save'].update(
                        {'cursor': data['data']['last_key'], 'page': task['save']['page'] + 1})
                    new_tasks.append(next_page_task)
            else:
                LOG.log_it('Cannot read the list. (If this keeps appearing and the site still opens normally in a browser, the site may have updated its code; please notify the developer.)', 'WARN')
                raise RetryDownload

    except KeyError:
        LOG.log_it('Unexpected JSON key. (If this keeps appearing and the site still opens normally in a browser, the site may have updated its code; please notify the developer.)', 'WARN')
        raise RetryDownload
    return None, new_tasks
Example #16
def parser_content(task):
    response = task['response']
    if not response:
        LOG.log_it("No Response", 'WARN')
        raise RetryDownload

    new_tasks = []
    download_img_list = []
    items = []
    soup = BeautifulSoup(response.text, 'lxml')

    content_select = soup.select('.document')
    # Remove the useless copyright info at the end of each page
    if content_select:
        for to_del in soup.select('.copyright'):
            to_del.decompose()

    content = str(content_select)
    # bs4 automatically adds html and body tags; strip them
    content = re.sub('<html><body>(.*?)</body></html>',
                     lambda x: x.group(1),
                     content,
                     flags=re.S)
    download_img_list.extend(re.findall('src="(http.*?)"', content))
    # Rewrite image sources to local relative paths
    content = re.sub('src="(.*?)"', convert_link, content)

    # Strip the "[]" brackets left by str() on the ResultSet
    content = content[1:-1]

    title = task['save']['title']
    article_url = task['url']
    created_time = soup.select('.content-th-info span')[0].string[3:]
    author = soup.select('.content-th-info a')[0].string

    bs2 = BeautifulSoup(content, 'lxml')
    # Center images
    for tab in bs2.select('img'):
        tab.wrap(bs2.new_tag('div', style='text-align:center;'))
        tab['style'] = "display: inline-block;"

        # Drop GIFs if the gif option is off
        if task['save']['kw']['gif'] is False:
            if 'gif' in tab['src']:
                tab.decompose()
                continue

    content = str(bs2)

    items.append([
        md5string(article_url), title, content, created_time, '', author,
        int(time.time() * 100000)
    ])

    if task['save']['kw'].get('img', True):
        img_header = deepcopy(DEFAULT_HEADERS)
        img_header.update({'Referer': response.url})
        for img_url in download_img_list:
            new_tasks.append(
                Task.make_task({
                    'url': img_url,
                    'method': 'GET',
                    'meta': {
                        'headers': img_header,
                        'verify': False
                    },
                    'parser': parser_downloader_img,
                    'resulter': resulter_downloader_img,
                    'priority': 2,
                    'save': task['save']
                }))
    task.update({'parsed_data': items})
    return task, new_tasks