def crawl_article(dicts):
    """Crawl each article described in *dicts* and store it in MongoDB.

    For every entry the page at ``entry['url']`` is fetched and parsed with
    lxml. The author, body text, image URLs and video URLs are extracted via
    XPath, like/read counts are looked up through ``get_article_info``, and
    the assembled record is added to the ``wechat_article`` collection.
    Pictures are then downloaded in the calling thread and the video download
    is handed to a new thread.

    :param dicts: iterable of dicts; the keys read here are ``url``,
        ``title``, ``summary``, ``cover``, ``receive_time`` and ``account``.
    :raises IndexError: if the page has no matching content or author node
        (the XPath results are indexed with ``[0]`` unguarded).
    """
    for article_dict in dicts:
        headers = get_header()
        url = article_dict.get('url')
        print("开始爬取:%s" % url)

        # A fresh session per article, closed as soon as the page is fetched
        # so connections are not leaked across iterations.
        with requests.Session() as sess:
            res = sess.get(url, headers=headers)
        selector = etree.HTML(res.text)

        rich_media = selector.xpath(
            "//div[@class='rich_media_inner']/div[@id='page-content']/div[1]/div[2]")[0]
        author_node = selector.xpath(
            "//div[@id='meta_content']/span[@class='rich_media_meta rich_media_meta_text']")[0]
        author = author_node.xpath("string(.)")
        biz = url2dict(url).get('__biz', '')

        # Body text
        content = rich_media.xpath("string(.)")
        # Image URLs
        picture_urls = selector.xpath("//img/@data-src")
        # Video URLs
        video_urls = selector.xpath("//iframe[@class='video_iframe']/@data-src")

        # Default the counters so the record can still be built when no
        # article info is available (previously this raised NameError or
        # silently reused the previous article's numbers).
        like_num = 0
        read_num = 0
        json_info = get_article_info(url)
        if json_info is not None:
            stats = json_info.get('data', {})
            like_num = stats.get('zannums', 0)
            read_num = stats.get('readnums', 0)

        mongodb = MongoDB()
        article_item = {
            'title': article_dict.get('title', ""),
            'author': author,
            'summary': article_dict.get('summary', ""),
            'cover': article_dict.get('cover', ""),
            'content': content,
            'like_num': like_num,
            'read_num': read_num,
            'comment': "",
            'url': url,
            'receive_time': article_dict.get('receive_time', ""),
            'account': article_dict.get('account', ""),
            '__biz': biz,
        }
        mongodb.add("wechat_article", article_item)

        # Media download is best-effort: a failure must not abort the crawl.
        # Catch Exception (not a bare except) so Ctrl-C / SystemExit still
        # propagate.
        try:
            download_pictures(dict_info=article_item, picture_urls=picture_urls)
            _thread.start_new_thread(download_videos, (article_item, '', video_urls))
        except Exception:
            print("下载多媒体内容失败")

        # NOTE(review): the original layout was collapsed onto one line; the
        # 60 s pause is assumed to sit at loop level (a delay between
        # articles) rather than inside the except branch -- confirm.
        sleep(60)
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
   File Name:     dbshot
   Description :  Scratch script: adds one sample article record to the
                  'wechat_article' collection through MongoDB.add, then
                  looks it up through MongoDB.find and prints the result.
   Author :       Lychlov
   date:          2018/5/24
-------------------------------------------------
   Change Activity:
                   2018/5/24:
-------------------------------------------------
"""
from db.mongodb import MongoDB

# Full sample record that gets added to the collection.
temp_dict = dict(
    title="政变四周年,曼谷反军方大示威今日正式爆发!",
    summary='jianjie',
    cover="http://sdfsdf",
    receive_time="2018-05-23 23:23:23",
    account='泰国网',
)

# Query document: the subset of the sample's fields used for the lookup.
temp_dict2 = {field: temp_dict[field] for field in ("title", "summary", "account")}

mongodb = MongoDB()
add_res = mongodb.add('wechat_article', temp_dict)
res = mongodb.find('wechat_article', temp_dict2)
print(res)