print("抓取:", resp.url) if resp is not None: data = resp.json()['list'] if data is not None: for d in data: news_list.append(News( _id=d['id'], title=d['name'], image=d['picture'], overview=d['brief'], origin=d['writer'], url=detail_url + d['id'] + '.html', publish_time=d['rPtime'] ).to_dict()) return news_list if __name__ == '__main__': cur_page = 1 client = MongodbClient('jiemodui') while True: result_list = fetch_news(cur_page) client.insert_many(result_list) if int(round(time.time())) - int( time.mktime(time.strptime(result_list[-1]['publish_time'], "%Y-%m-%d %H:%M:%S"))) < 43200: cur_page += 1 continue else: break print("芥末堆爬取完毕!")
                'pagesize': 20
            },
            headers=iheima_headers)
        print("Crawling:", resp.url)
        if resp is not None:
            resp_json = resp.json()
            contents = resp_json['contents']
            for content in contents:
                # Only keep recent news; stop once an item is older than the 86400-second (24 h) cutoff.
                if int(round(time.time())) - int(
                        time.mktime(
                            time.strptime(content['published'], "%Y-%m-%d %H:%M"))) > 86400:
                    return news_list
                else:
                    news_list.append(
                        News(_id=content['contentid'],
                             title=content['title'],
                             url=iheima_url[:-1] + content['url'],
                             image=content['thumb'],
                             publish_time=content['published'],
                             origin=content['author'],
                             overview=content['description']).to_dict())
        page += 1


if __name__ == '__main__':
    client = MongodbClient('iheima')
    client.insert_many(fetch_iheima_news())
    print("iHeima crawl finished!")
import requests as r
from pyquery import PyQuery

from news import News, MongodbClient
from tools import user_agents

base_url = 'https://www.iyiou.com/breaking/'
headers = {
    'User-Agent': user_agents.random_user_agent()
}


def fetch_news(url):
    news_list = []
    resp = r.get(url, headers=headers)
    print("Fetching:", resp.url)
    if resp is not None:
        pq = PyQuery(resp.text)
        a_s = pq('.newsFlashListWrap > div > ul > li > a')
        for item in a_s.items():
            news_list.append(News(
                _id=item.attr('href').split('/')[-1].replace('.html', ''),
                url=item.attr('href'),
                title=item('span.fl').text(),
                publish_time=item('span.fr').text()
            ).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('yiou')
    for i in range(1, 3):
        client.insert_many(fetch_news("{}p{}.html".format(base_url, i)))
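# The `News` model and `MongodbClient` wrapper imported above are defined elsewhere
# in this project and are not shown in this section. The sketch below is only an
# assumption reconstructed from how they are used here (keyword fields, .to_dict(),
# and insert_many() on a per-site collection); it is not the project's actual code,
# and the database name and connection URI are illustrative.
from pymongo import MongoClient


class News:
    def __init__(self, _id=None, title=None, url=None, image=None, overview=None,
                 origin=None, publish_time=None, update_time=None):
        self._id = _id
        self.title = title
        self.url = url
        self.image = image
        self.overview = overview
        self.origin = origin
        self.publish_time = publish_time
        self.update_time = update_time

    def to_dict(self):
        # Keep only the fields a given crawler actually filled in.
        return {k: v for k, v in self.__dict__.items() if v is not None}


class MongodbClient:
    def __init__(self, collection_name, db_name='news', uri='mongodb://localhost:27017'):
        self.collection = MongoClient(uri)[db_name][collection_name]

    def insert_many(self, docs):
        if docs:
            self.collection.insert_many(docs)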
    news_list = []
    resp = r.get(load_more_url,
                 params={'page': page, 'num': 20},
                 headers=headers)
    print("Fetching:", resp.url)
    if resp is not None:
        data = resp.json()['data']['list']
        for d in data:
            news_list.append(News(
                _id=d['id'],
                title=d['title'],
                overview=d['desc'],
                publish_time=d['post_date'],
                image=d['image'],
                origin=d['author_info']['name'],
                url=article_url + str(d['id'])
            ).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('babite')
    cur_page = 1
    while True:
        result_list = fetch_news(cur_page)
        client.insert_many(result_list)
        last_publish_time = result_list[-1]['publish_time']
        # post_date is a Unix timestamp in seconds; keep paging while it is within 12 hours.
        if int(round(time.time())) - int(last_publish_time) < 43200:
            cur_page += 1
            continue
        else:
            break
    data_dict = json.loads(data_json)
    for data in data_dict['data']['list']:
        news_list.append(News(
            _id=data['id'],
            title=data['title'],
            overview=data['brief'],
            image=data['thumb'],
            publish_time=data['time'],
            url=data['url'],
            origin=data['columnName']
        ).to_dict())
        sort_field = data['sort_field']
    return news_list, sort_field


if __name__ == '__main__':
    client = MongodbClient('jueshengwang')
    result = fetch_index_news()
    client.insert_many(result[0])
    count_time -= 1
    min_id = result[1]
    while True:
        result = fetch_more_news(min_id)
        client.insert_many(result[0])
        if int(round(time.time())) - int(result[0][-1]['publish_time']) < 432000:
            count_time -= 1
            min_id = result[1]
            continue
        else:
            break
def fetch_diyicaijing_news():
    news_list = []
    resp = r.get(diyicaijing_url, params={'page': 2}, headers=diyicaijing_headers)
    bs = BeautifulSoup(resp.text, 'lxml')
    articles = bs.findAll('article', attrs={'class': 'article-item clearfix'})
    for article in articles:
        detail_url = diyicaijing_url[:-1] + article.a['href']
        if not detail_url.endswith('subscribe'):
            news_content = article.div.text.replace('　', '').replace('\n', '')
            text_result = msg_extract_pattern.findall(news_content)
            # findall() returns a (possibly empty) list, so test truthiness rather than None.
            if text_result:
                for content in text_result:
                    news_list.append(
                        News(
                            _id=detail_url.split('/')[-1],
                            url=detail_url,
                            image=url_extract_pattern.search(
                                article.a['style']).group(1),
                            origin=content[0],
                            title=content[1],
                            publish_time=content[2],
                        ).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('diyicaijing')
    client.insert_many(fetch_diyicaijing_news())
            params={
                "cre": "tianyi",
                "mod": category,
                "_": int(round(time.time() * 1000)),
                "offset": 20 * i
            },
            headers=headers)
        print('Crawling:', resp.url)
        if resp is not None:
            resp_json = resp.json()
            data = resp_json['data']
            for d in data:
                news_list.append(
                    News(_id=d['uuid'],
                         title=d['title'],
                         overview=d['intro'],
                         image=d['thumb'],
                         publish_time=d['ctime'],
                         origin=d['author'],
                         url=d['url_https']).to_dict())
            time.sleep(random.randint(0, 2))
    return news_list


if __name__ == '__main__':
    client = MongodbClient('xlkj')
    for i in category_key_dict.keys():
        client.insert_many(fetch_news(i))
        time.sleep(random.randint(0, 2))
    print("Sina Tech crawl finished!")
        resp = r.get(penpai_ajax_url, params=ajax_params, headers=penpai_headers)
        resp_content = resp.text
        print("Crawling:", resp.url)
        results = news_pattern.findall(resp_content)
        for result in results:
            if '小时前' in result[5]:
                hours_before = hours_pattern.search(result[5])
                if hours_before is not None:
                    # Stop once an item is more than 12 hours old.
                    if int(hours_before.group(1)) > 12:
                        return news_list
            else:
                news_list.append(
                    News(_id=result[0].split('_')[-1],
                         title=result[2],
                         overview=result[3].replace('\n', '').replace(' ', ''),
                         url=penpai_url + result[0],
                         image='http:' + result[1],
                         publish_time=result[5],
                         origin=result[4]).to_dict())
        pageidx += 1
        time.sleep(random.randint(0, 2))


if __name__ == '__main__':
    client = MongodbClient('penpai')
    data_list = fetch_penpai_news()
    client.insert_many(data_list)
    'Host': str_handle.remove_url_scheme(index_url)[:-1],
    'Referer': roll_url,
}


def fetch_news():
    news_list = []
    resp = r.get(roll_url, headers=headers)
    print("Crawling: %s" % resp.url)
    if resp is not None:
        resp.encoding = 'utf8'
        pq = PyQuery(resp.text)
        lis = pq('div.newslist > ul > li')
        for li in lis.items():
            # Skip <li> elements whose class is 'line'.
            if li.attr('class') != 'line':
                a = li('span.tit > a')
                news_list.append(News(
                    _id=a.attr('href').split('/')[-1].replace('.shtml', ''),
                    url=a.attr('href'),
                    title=a.text(),
                    origin=li('span.column').text() + '|' + li('span.source').text(),
                    update_time=li('span.time').text()
                ).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('techweb')
    client.insert_many(fetch_news())
    print("TechWeb crawl finished!")
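# `str_handle.remove_url_scheme` and `user_agents.random_user_agent` come from the
# project's `tools` package, which is not included in this section. A minimal sketch
# of what the usage in these headers implies (strip the scheme so a Host header can
# be built, and pick a random User-Agent); the UA strings are illustrative only,
# not the project's actual list.
import random


def remove_url_scheme(url):
    """'https://example.com/' -> 'example.com/' (callers strip the trailing slash)."""
    return url.split('://', 1)[-1]


def random_user_agent():
    """Return a randomly chosen User-Agent string."""
    agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 '
        '(KHTML, like Gecko) Version/15.1 Safari/605.1.15',
    ]
    return random.choice(agents)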
    'Host': str_handle.remove_url_scheme(index_url)[:-1],
    'Referer': index_url,
}


def fetch_news():
    news_list = []
    resp = r.get(category_url, headers=headers)
    resp.encoding = 'utf8'
    print('Crawling:', resp.url)
    if resp is not None:
        pq = PyQuery(resp.text)
        divs = pq('div.lfn-bar')
        for div in divs.items():
            a = div('div.lfn-title > a')
            form = div('div > div > div > span.form').text()
            url = 'https:' + a.attr('href')
            news_list.append(
                News(_id=url.split('/')[-1].replace('.html', ''),
                     url=url,
                     title=a.text(),
                     overview=div('div.lfn-des').text(),
                     publish_time=div('div > div > div > span.time').text(),
                     origin=form if form != '' else None).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('chuangyebang')
    client.insert_many(fetch_news())
def fetch_web_news():
    news_list = []
    resp = r.get(data_url, headers=headers)
    resp.encoding = 'utf8'  # set the response encoding
    json_result = json_extract_pattern.search(resp.text)
    if json_result is not None:
        json_news = json_result.group(1)
        sites = json.loads(json_news)['sites']
        for site in sites:
            # Only keep entries from today; stop at the first entry with a different date.
            if site['c5'] == today_date:
                id_result = id_extract_pattern.search(site['c2'].split('/')[-1])
                news_list.append(
                    News(
                        _id=id_result.group(1),
                        title=site['c1'],
                        url=site['c2'],
                        image=site['c3'],
                        origin=site['c4'],
                        publish_time=site['c5'] + ' ' + site['c6']
                    ).to_dict()
                )
            else:
                break
    return news_list


if __name__ == '__main__':
    client = MongodbClient('zgtq')
    client.insert_many(fetch_web_news())
    print("Weather China data crawl finished!")
                     title=a.attr('title'),
                     image=a('img').attr('src'),
                     overview=c.parent(
                         'div .posts-default-content > div.posts-text').text(),
                     origin=c.parent(
                         'div .posts-default-content > div.posts-default-info > ul > li.ico-cat'
                     ).text(),
                     publish_time=c.parent(
                         'div .posts-default-content > div.posts-default-info > ul > li.ico-time'
                     ).text()).to_dict())
    return news_list


if __name__ == '__main__':
    cur_page = 1
    client = MongodbClient('jingmeiti')
    while True:
        result_list = fetch_news(cur_page)
        client.insert_many(result_list)
        publish_time = result_list[-1]['publish_time']
        if '小时前' in publish_time:
            hours_before = hours_pattern.search(
                result_list[-1]['publish_time'])
            print(hours_before)
            if hours_before is not None:
                if int(hours_before.group(1)) > 12:
                    break
                else:
                    cur_page += 1
                    continue
        else:
headers = {
    'Host': str_handle.remove_url_scheme(index_url)[:-1],
    'User-Agent': user_agents.random_user_agent(),
}


def fetch_news():
    news_list = []
    resp = r.get(index_url, headers=headers)
    print("Fetching:", resp.url)
    if resp is not None:
        bs = BeautifulSoup(resp.text, 'lxml')
        data_list = bs.find_all("div", attrs={"bosszone": "TS_Mainnews"})[0]
        a_s = data_list.find_all("a")
        # Use a running counter (page order) as the _id.
        for cur, a in enumerate(a_s, start=1):
            news_list.append(News(
                _id=cur,
                title=a.text,
                origin="腾讯体育",
                url=a['href']
            ).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('txty')
    client.insert_many(fetch_news())
    print("Tencent Sports crawl finished!")
print("抓取:", resp.url) if resp is not None: resp.encoding = 'utf8' pq = PyQuery(resp.text) lis = pq('li') for li in lis.items(): news_list.append( News(_id=li('div.share_btn > a').attr('id'), title=li('div > div.title').text(), overview=li('div > div.content').text().replace( '\n', '').strip(), publish_time=li('div > div.pub_time').attr('data-time'), url="{}?id={}".format( flash_url, li('div.share_btn > a').attr('id'))).to_dict()) return news_list if __name__ == '__main__': client = MongodbClient('bianews') cur_page = 1 while True: result_list = fetch_news(cur_page) client.insert_many(result_list) last_publish_time = result_list[-1]['publish_time'] if int(round(time.time() * 1000)) - int(last_publish_time) < 43200000: cur_page += 1 continue else: break
print("抓取:", resp.url) if resp is not None: data = resp.json()['Data'] if data is not None: for d in data: news_list.append(News( _id=d['ID'], title=d['Title'], image=d['PicUrl'], overview=d['abstract'], origin=d['Key'], url=index_url + d['APage'], publish_time=d['updatetime'] ).to_dict()) return news_list if __name__ == '__main__': cur_page = 1 client = MongodbClient('lianshangwang') while True: result_list = fetch_news(cur_page) client.insert_many(result_list) if int(round(time.time())) - int( time.mktime(time.strptime(result_list[-1]['publish_time'], "%Y/%m/%d %H:%M:%S"))) < 43200: cur_page += 1 continue else: break print("联商网爬取完毕!")
            'a': 'ajaxNews',
            'cid': 4,
            'page': page
        },
        headers=headers)
    print('Crawling:', resp.url)
    if resp is not None:
        resp.encoding = 'utf8'
        # Strip the first and last characters before parsing the JSON body.
        rst = json.loads(resp.text[1:-1])['rst']
        pq = PyQuery(rst)
        news_item = pq('div.item-news')
        for item in news_item.items():
            a_url = item('div > p > a').attr('href')
            item_main = title_extract_pattern.search(
                item('div.item-main').text())
            if item_main is not None:
                news_list.append(
                    News(_id=a_url.split('/')[-1].replace('.html', ''),
                         url=a_url,
                         title=item_main.group(1),
                         overview=item_main.group(2),
                         publish_time=item('div.item-date').text()).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('jiemian')
    client.insert_many(fetch_news(1))
    client.insert_many(fetch_news(2))
    print("Jiemian News crawl finished!")
    bs = BeautifulSoup(resp.text, 'lxml')
    data_list = bs.find("ul", attrs={'class': 'gallery l-list-selected l-m'})
    lis = data_list.findAll('li')
    for li in lis:
        l_cbox = li.find('div', attrs={'class': 'l-cbox'})
        spans = l_cbox.find('div', attrs={
            'class': 'l-foot-par'
        }).findAll('span')
        news_id_result = xhs_news_id_pattern.search(li.a['href'])
        if news_id_result is not None:
            # Compare the publish time with now and only keep news from the last 12 hours (43200 s).
            publish_time = spans[1].text.replace('\n', '').strip()
            if int(round(time.time())) - int(
                    time.mktime(
                        time.strptime(publish_time, "%Y-%m-%d %H:%M:%S"))) < 43200:
                news_list.append(
                    News(_id=news_id_result.group(1),
                         url=li.a['href'],
                         title=li.a.img['alt'],
                         image=xhs_gd_url + li.a.img['src'],
                         origin=spans[0].text,
                         publish_time=publish_time,
                         overview=l_cbox.p.text).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('xinhuashe')
    client.insert_many(fetch_xh_focus())
    client.insert_many(fetch_gd_news())
    for li in pq('ul.main-wrap > li').items():
        url = index_url + li('div.childR > p > a').attr('href')
        img = li('div > a > img')
        news_list.append(News(
            _id=url.split('/')[-1],
            url=url,
            title=img.attr('alt'),
            overview=li('div.childR > p > a').text(),
            image=index_url[:-1] + img.attr('src'),
            publish_time=li('div.childR > div.time').text()
        ).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('huanqiulvxun')
    cur_page = 1
    while True:
        result_list = fetch_news(cur_page)
        client.insert_many(result_list)
        last_publish_time = result_list[-1]['publish_time']
        if '分钟前' in last_publish_time:
            cur_page += 1
            continue
        elif '小时前' in last_publish_time:
            hours_before = hours_pattern.search(last_publish_time)
            if hours_before is not None:
                if int(hours_before.group(1)) < 12:
                    cur_page += 1
                    continue
                else:
print("爬取:", resp.url) if resp is not None: pq = PyQuery(resp.text) news = pq('div.ws-newsflash-list01') for n in news.items(): a = n('a') news_list.append( News(_id=a.attr('href').split('/')[-1], url=index_url + a.attr('href'), title=a('h5').text(), overview=n('div.ws-newsflash-content').text().replace( "【查看原文】", ''), publish_time=n('div > div > time').text()).to_dict()) return news_list if __name__ == '__main__': client = MongodbClient('diyidiandong') cur_page = 1 while True: result_list = fetch_news(cur_page) client.insert_many(result_list) last_publish_time = result_list[-1]['publish_time'] if int(round(time.time())) - int( time.mktime(time.strptime(last_publish_time, "%Y-%m-%d %H:%M"))) < 43200: cur_page += 1 continue else: break
    resp = r.post(ajax_url, data=json.dumps(ajax_params), headers=headers)
    if resp is not None:
        res = resp.json()
        for i in res['res']:
            news_list.append(
                News(_id=i['id'],
                     title=i['title'],
                     overview=i['content'],
                     publish_time=i['create_time'],
                     origin=i['src_name'],
                     url=news_detail_url + i['uid']).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('dongmaiwang')
    cur_page = 1
    while True:
        print("Crawling page %d" % cur_page)
        result_list = fetch_news(cur_page)
        client.insert_many(result_list)
        if int(round(time.time())) - int(
                time.mktime(
                    time.strptime(result_list[-1]['publish_time'],
                                  "%Y-%m-%d %H:%M:%S"))) < 43200:
            cur_page += 1
            continue
        else:
            break
    print("Dongmaiwang crawl finished!")
    resp_json = resp.json()
    items = resp_json['data']['items']
    for item in items:
        post = item['post']
        motifs = post['motifs']
        motifs_name = motifs[0]['name'] if motifs is not None else ''
        data_list.append(
            News(_id=str(item['id']),
                 title=post['title'],
                 url=news_detail_base_url + str(post['id']),
                 image=post['cover'],
                 publish_time=post['published_at'],
                 overview=post['summary'],
                 origin=post['user']['name'] + '|' + motifs_name).to_dict())
    if int(round(time.time())) - int(
            time.mktime(
                time.strptime(items[-1]['post']['published_at'],
                              "%Y-%m-%d %H:%M:%S"))) > 86400:
        return None
    else:
        return fetch_web_news_more(items[-1]['id'])


if __name__ == '__main__':
    result_list, end_id = fetch_web_news()
    fetch_web_news_more(end_id)
    client = MongodbClient('36Kr')
    client.insert_many(result_list)
    client.insert_many(data_list)
def fetch_news():
    news_list = []
    resp = r.get(index_url, headers=headers)
    print("Fetching:", resp.url)
    if resp is not None:
        resp.encoding = 'utf8'
        pq = PyQuery(resp.text)
        data_list = pq('ul#date-list-ul')
        for li in data_list('li').items():
            img = li('a > img')
            print(li('p').text())
            news_list.append(
                News(
                    url=li('a').attr('href'),
                    _id=li('a').attr('href').split('/')[-1].replace('.html', ''),
                    title=img.attr('alt'),
                    image=img.attr('src'),
                    overview=li('div#list-t p#list-abs').text(),
                    publish_time=li('div#list-t > p#list-sm span:first').text(),
                    origin=li('div#list-t > p#list-sm > span:last').text(),
                ).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('xinlvjie')
    client.insert_many(fetch_news())
    print("Xinlvjie crawl finished!")
    if resp is not None:
        d = resp.json()['d']
        for i in d:
            news_list.append(
                News(
                    _id=i['ArticleId'],
                    title=i['Title'],
                    overview=i['BriefContent'],
                    url=index_url[:-1] + i['LinkUrl'],
                    publish_time=i['IssueTime'],
                ).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('gasgoo')
    cur_page = 1
    while True:
        print("Crawling page %d" % cur_page)
        result_list = fetch_news(cur_page)
        client.insert_many(result_list)
        last_publish_time = result_list[-1]['publish_time']
        if int(round(time.time())) - int(
                time.mktime(
                    time.strptime(last_publish_time, "%Y-%m-%d %H:%M:%S"))) < 43200:
            cur_page += 1
            continue
        else:
            break
    print("Gasgoo news crawl finished!")