Пример #1
0
def crawl_article(dicts):
    """
    Fetch each article page, store its extracted fields in MongoDB and
    download the pictures and videos it references.

    For every entry the page at ``url`` is requested and parsed, the author
    and body text are extracted via XPath, like/read counters are looked up
    with ``get_article_info`` and the assembled record is added to the
    ``wechat_article`` collection. Pictures are downloaded synchronously,
    videos in a background thread. The function sleeps 60 seconds after
    every article.

    :param dicts: iterable of dict-like article descriptors. ``url`` is
        read from each one; ``title``, ``summary``, ``cover``,
        ``receive_time`` and ``account`` are copied into the stored record
        and default to ``""`` when absent.
    :return: None
    :raises IndexError: if the fetched page does not contain the expected
        content or author nodes.
    """
    for article_dict in dicts:
        headers = get_header()
        url = article_dict.get('url')
        print("开始爬取:%s" % url)
        # Close the session as soon as the page is fetched so that the
        # connection pool is not leaked once per article.
        with requests.Session() as sess:
            res = sess.get(url, headers=headers)
        selector = etree.HTML(res.text)
        # NOTE: both [0] lookups raise IndexError when the page layout is
        # not the expected one (e.g. an error page was returned).
        rich_media = selector.xpath(
            "//div[@class='rich_media_inner']/div[@id='page-content']/div[1]/div[2]")[0]
        author_node = selector.xpath(
            "//div[@id='meta_content']/span[@class='rich_media_meta rich_media_meta_text']")[0]
        author = author_node.xpath("string(.)")
        __biz = url2dict(url).get('__biz', '')
        # Body text of the article.
        content = rich_media.xpath("string(.)")
        # Picture URLs.
        picture_urls = selector.xpath("//img/@data-src")
        # Video URLs.
        video_urls = selector.xpath("//iframe[@class='video_iframe']/@data-src")

        # Reset the counters for every article: without these defaults a
        # missing stats response raised NameError on the first article and
        # silently reused the previous article's numbers afterwards.
        like_num = 0
        read_num = 0
        json_info = get_article_info(url)
        if json_info is not None:
            # `or {}` also covers a payload whose 'data' value is None.
            stats = json_info.get('data') or {}
            like_num = stats.get('zannums', 0)
            read_num = stats.get('readnums', 0)

        mongodb = MongoDB()
        article_item = {
            'title': article_dict.get('title', ""),
            'author': author,
            'summary': article_dict.get('summary', ""),
            'cover': article_dict.get('cover', ""),
            'content': content,
            'like_num': like_num,
            'read_num': read_num,
            'comment': "",
            'url': url,
            'receive_time': article_dict.get('receive_time', ""),
            'account': article_dict.get('account', ""),
            '__biz': __biz,
        }
        mongodb.add("wechat_article", article_item)
        try:
            download_pictures(dict_info=article_item, picture_urls=picture_urls)
            _thread.start_new_thread(download_videos, (article_item, '', video_urls))
        except Exception:
            # Media download is best-effort, but Ctrl-C / SystemExit must
            # still be able to stop the crawler, hence no bare `except:`.
            print("下载多媒体内容失败")
        # Pause between articles (60 seconds).
        sleep(60)
Пример #2
0
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
   File Name:     dbshot
   Description :
   Author :       Lychlov
   date:          2018/5/24
-------------------------------------------------
   Change Activity:
                   2018/5/24:
-------------------------------------------------
"""
from db.mongodb import MongoDB

# Sample article record that is inserted into the collection.
temp_dict = dict(
    title="政变四周年,曼谷反军方大示威今日正式爆发!",
    summary='jianjie',
    cover="http://sdfsdf",
    receive_time="2018-05-23 23:23:23",
    account='泰国网',
)
# Query filter: the subset of the record's fields used for the lookup.
temp_dict2 = {field: temp_dict[field] for field in ("title", "summary", "account")}

mongodb = MongoDB()
# Insert the sample record, then look it up again by the filter fields.
add_res = mongodb.add('wechat_article', temp_dict)
res = mongodb.find('wechat_article', temp_dict2)
print(res)