Example #1
    print("抓取:", resp.url)
    if resp is not None:
        data = resp.json()['list']
        if data is not None:
            for d in data:
                news_list.append(News(
                    _id=d['id'],
                    title=d['name'],
                    image=d['picture'],
                    overview=d['brief'],
                    origin=d['writer'],
                    url=detail_url + d['id'] + '.html',
                    publish_time=d['rPtime']
                ).to_dict())
    return news_list


if __name__ == '__main__':
    cur_page = 1
    client = MongodbClient('jiemodui')
    while True:
        result_list = fetch_news(cur_page)
        client.insert_many(result_list)
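        # 43200 s = 12 hours: keep paging while the last item on this page is still that recent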
        if int(round(time.time())) - int(
                time.mktime(time.strptime(result_list[-1]['publish_time'], "%Y-%m-%d %H:%M:%S"))) < 43200:
            cur_page += 1
            continue
        else:
            break
    print("芥末堆爬取完毕!")
Example #2
                         'pagesize': 20
                     },
                     headers=iheima_headers)
        print("爬取:", resp.url)
        if resp is not None:
            resp_json = resp.json()
            contents = resp_json['contents']
            for content in contents:
                # Only keep news published within the last 24 hours (86400 s)
                if int(round(time.time())) - int(
                        time.mktime(
                            time.strptime(content['published'],
                                          "%Y-%m-%d %H:%M"))) > 86400:
                    return news_list
                else:
                    news_list.append(
                        News(_id=content['contentid'],
                             title=content['title'],
                             url=iheima_url[:-1] + content['url'],
                             image=content['thumb'],
                             publish_time=content['published'],
                             origin=content['author'],
                             overview=content['description']).to_dict())
            page += 1


if __name__ == '__main__':
    client = MongodbClient('iheima')
    client.insert_many(fetch_iheima_news())
    print("爱黑马爬取完毕!")
Example #3
import requests as r
from pyquery import PyQuery

from news import News, MongodbClient
from tools import user_agents

base_url = 'https://www.iyiou.com/breaking/'
headers = {
    'User-Agent': user_agents.random_user_agent()
}


def fetch_news(url):
    news_list = []
    resp = r.get(url, headers=headers)
    print("抓取:", resp.url)
    if resp is not None:
        pq = PyQuery(resp.text)
        a_s = pq('.newsFlashListWrap > div > ul > li > a')
        for item in a_s.items():
            news_list.append(News(
                _id=item.attr('href').split('/')[-1].replace('.html', ''),
                url=item.attr('href'),
                title=item('span.fl').text(),
                publish_time=item('span.fr').text()
            ).to_dict())
        return news_list


if __name__ == '__main__':
    client = MongodbClient('yiou')
    for i in range(1, 3):
        client.insert_many(fetch_news("{}p{}.html".format(base_url, i)))
Example #4
    news_list = []
    resp = r.get(load_more_url, params={'page': page, 'num': 20}, headers=headers)
    print("抓取:", resp.url)
    if resp is not None:
        data = resp.json()['data']['list']
        for d in data:
            news_list.append(News(
                _id=d['id'],
                title=d['title'],
                overview=d['desc'],
                publish_time=d['post_date'],
                image=d['image'],
                origin=d['author_info']['name'],
                url=article_url + str(d['id'])
            ).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('babite')
    cur_page = 1
    while True:
        result_list = fetch_news(cur_page)
        client.insert_many(result_list)
        last_publish_time = result_list[-1]['publish_time']
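        # publish_time is a Unix timestamp in seconds; 43200 s = 12 hours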
        if int(round(time.time())) - int(last_publish_time) < 43200:
            cur_page += 1
            continue
        else:
            break
Example #5
            data_dict = json.loads(data_json)
            for data in data_dict['data']['list']:
                news_list.append(News(
                    _id=data['id'],
                    title=data['title'],
                    overview=data['brief'],
                    image=data['thumb'],
                    publish_time=data['time'],
                    url=data['url'],
                    origin=data['columnName']
                ).to_dict())
                sort_field = data['sort_field']
    return news_list, sort_field


if __name__ == '__main__':
    client = MongodbClient('jueshengwang')
    result = fetch_index_news()
    client.insert_many(result[0])
    count_time -= 1
    min_id = result[1]
    while True:
        result = fetch_more_news(min_id)
        client.insert_many(result[0])
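        # 432000 s = 5 days: keep fetching while the last item is newer than that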
        if int(round(time.time())) - int(result[0][-1]['publish_time']) < 432000:
            count_time -= 1
            min_id = result[1]
            continue
        else:
            break
Example #6
def fetch_diyicaijing_news():
    news_list = []
    resp = r.get(diyicaijing_url,
                 params={'page': 2},
                 headers=diyicaijing_headers)
    bs = BeautifulSoup(resp.text, 'lxml')
    articles = bs.findAll('article', attrs={'class': 'article-item clearfix'})
    for article in articles:
        detail_url = diyicaijing_url[:-1] + article.a['href']
        if not detail_url.endswith('subscribe'):
            news_content = article.div.text.replace(' ', '').replace('\n', '')
            text_result = msg_extract_pattern.findall(news_content)
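            # each match is an (origin, title, publish_time) tuple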
            if text_result:  # findall returns a list; skip when nothing matched
                for content in text_result:
                    news_list.append(
                        News(
                            _id=detail_url.split('/')[-1],
                            url=detail_url,
                            image=url_extract_pattern.search(
                                article.a['style']).group(1),
                            origin=content[0],
                            title=content[1],
                            publish_time=content[2],
                        ).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('diyicaijing')
    client.insert_many(fetch_diyicaijing_news())
Example #7
                     params={
                         "cre": "tianyi",
                         "mod": category,
                         "_": int(round(time.time() * 1000)),
                         "offset": 20 * i
                     },
                     headers=headers)
        print('爬取:', resp.url)
        if resp is not None:
            resp_json = resp.json()
            data = resp_json['data']
            for d in data:
                news_list.append(
                    News(_id=d['uuid'],
                         title=d['title'],
                         overview=d['intro'],
                         image=d['thumb'],
                         publish_time=d['ctime'],
                         origin=d['author'],
                         url=d['url_https']).to_dict())
        time.sleep(random.randint(0, 2))
    return news_list


if __name__ == '__main__':
    client = MongodbClient('xlkj')
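    # crawl each category in turn, pausing briefly between requests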
    for i in category_key_dict.keys():
        client.insert_many(fetch_news(i))
        time.sleep(random.randint(0, 2))
    print("新浪科技爬取完毕!")
Example #8
        resp = r.get(penpai_ajax_url,
                     params=ajax_params,
                     headers=penpai_headers)
        resp_content = resp.text
        print("爬取:", resp.url)
        results = news_pattern.findall(resp_content)
        for result in results:
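            # result[5] is a relative publish time such as "N小时前"; stop once items are older than 12 hours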
            if '小时前' in result[5]:
                hours_before = hours_pattern.search(result[5])
                if hours_before is not None:
                    if int(hours_before.group(1)) > 12:
                        return news_list
                    else:
                        news_list.append(
                            News(_id=result[0].split('_')[-1],
                                 title=result[2],
                                 overview=result[3].replace('\n', '').replace(
                                     ' ', ''),
                                 url=penpai_url + result[0],
                                 image='http:' + result[1],
                                 publish_time=result[5],
                                 origin=result[4]).to_dict())
        pageidx += 1
        time.sleep(random.randint(0, 2))


if __name__ == '__main__':
    client = MongodbClient('penpai')
    data_list = fetch_penpai_news()
    client.insert_many(data_list)
Example #9
    'Host': str_handle.remove_url_scheme(index_url)[:-1],
    'Referer': roll_url,
}


def fetch_news():
    news_list = []
    resp = r.get(roll_url, headers=headers)
    print("爬取:%s" % resp.url)
    if resp is not None:
        resp.encoding = 'utf8'
        pq = PyQuery(resp.text)
        lis = pq('div.newslist > ul >li')
        for li in lis.items():
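            # skip <li> elements with the 'line' class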
            if li.attr('class') != 'line':
                a = li('span.tit > a')
                news_list.append(News(
                    _id=a.attr('href').split('/')[-1].replace('.shtml', ''),
                    url=a.attr('href'),
                    title=a.text(),
                    origin=li('span.column').text() + '|' + li('span.source').text(),
                    update_time=li('span.time').text()
                ).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('techweb')
    client.insert_many(fetch_news())
    print("techweb爬取完毕!")
Example #10
    'Host': str_handle.remove_url_scheme(index_url)[:-1],
    'Referer': index_url,
}


def fetch_news():
    news_list = []
    resp = r.get(category_url, headers=headers)
    resp.encoding = 'utf8'
    print('爬取:', resp.url)
    if resp is not None:
        pq = PyQuery(resp.text)
        divs = pq('div.lfn-bar')
        for div in divs.items():
            a = div('div.lfn-title > a')
            form = div('div > div > div > span.form').text()
            url = 'https:' + a.attr('href')
            news_list.append(
                News(_id=url.split('/')[-1].replace('.html', ''),
                     url=url,
                     title=a.text(),
                     overview=div('div.lfn-des').text(),
                     publish_time=div('div > div > div > span.time').text(),
                     origin=form if form != '' else None).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('chuangyebang')
    client.insert_many(fetch_news())
Example #11
def fetch_web_news():
    news_list = []
    resp = r.get(data_url, headers=headers)
    resp.encoding = 'utf8'  # set the response encoding
    json_result = json_extract_pattern.search(resp.text)
    if json_result is not None:
        json_news = json_result.group(1)
        sites = json.loads(json_news)['sites']
        for site in sites:
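            # fields: c1=title, c2=url, c3=image, c4=source, c5=date, c6=time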
            if site['c5'] == today_date:
                id_result = id_extract_pattern.search(site['c2'].split('/')[-1])
                news_list.append(
                    News(
                        _id=id_result.group(1),
                        title=site['c1'],
                        url=site['c2'],
                        image=site['c3'],
                        origin=site['c4'],
                        publish_time=site['c5'] + ' ' + site['c6']
                    ).to_dict()
                )
            else:
                break
    return news_list


if __name__ == '__main__':
    client = MongodbClient('zgtq')
    client.insert_many(fetch_web_news())
    print("中国天气网数据爬取完毕!")
                    title=a.attr('title'),
                    image=a('img').attr('src'),
                    overview=c.parent(
                        'div .posts-default-content > div.posts-text').text(),
                    origin=c.parent(
                        'div .posts-default-content > div.posts-default-info > ul > li.ico-cat'
                    ).text(),
                    publish_time=c.parent(
                        'div .posts-default-content > div.posts-default-info > ul > li.ico-time'
                    ).text()).to_dict())
    return news_list


if __name__ == '__main__':
    cur_page = 1
    client = MongodbClient('jingmeiti')
    while True:
        result_list = fetch_news(cur_page)
        client.insert_many(result_list)
        publish_time = result_list[-1]['publish_time']
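        # publish_time is relative ("N小时前"); stop once the last item is more than 12 hours old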
        if '小时前' in publish_time:
            hours_before = hours_pattern.search(
                result_list[-1]['publish_time'])
            print(hours_before)
            if hours_before is not None:
                if int(hours_before.group(1)) > 12:
                    break
                else:
                    cur_page += 1
                    continue
        else:
Example #13
headers = {
    'Host': str_handle.remove_url_scheme(index_url)[:-1],
    'User-Agent': user_agents.random_user_agent(),
}


def fetch_news():
    news_list = []
    resp = r.get(index_url, headers=headers)
    print("抓取:", resp.url)
    if resp is not None:
        bs = BeautifulSoup(resp.text, 'lxml')
        data_list = bs.find_all("div", attrs={"bosszone": "TS_Mainnews"})[0]
        a_s = data_list.find_all("a")
        cur = 1
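        # use the position in the headline list as the document _id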
        for a in a_s:
            news_list.append(News(
                _id=cur,
                title=a.text,
                origin="腾讯体育",
                url=a['href']
            ).to_dict())
            cur += 1
    return news_list


if __name__ == '__main__':
    client = MongodbClient('txty')
    client.insert_many(fetch_news())
    print("腾讯体育爬取完毕!")
Example #14
    print("抓取:", resp.url)
    if resp is not None:
        resp.encoding = 'utf8'
        pq = PyQuery(resp.text)
        lis = pq('li')
        for li in lis.items():
            news_list.append(
                News(_id=li('div.share_btn > a').attr('id'),
                     title=li('div > div.title').text(),
                     overview=li('div > div.content').text().replace(
                         '\n', '').strip(),
                     publish_time=li('div > div.pub_time').attr('data-time'),
                     url="{}?id={}".format(
                         flash_url,
                         li('div.share_btn > a').attr('id'))).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('bianews')
    cur_page = 1
    while True:
        result_list = fetch_news(cur_page)
        client.insert_many(result_list)
        last_publish_time = result_list[-1]['publish_time']
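        # publish_time is a Unix timestamp in milliseconds; 43200000 ms = 12 hours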
        if int(round(time.time() * 1000)) - int(last_publish_time) < 43200000:
            cur_page += 1
            continue
        else:
            break
Example #15
    print("抓取:", resp.url)
    if resp is not None:
        data = resp.json()['Data']
        if data is not None:
            for d in data:
                news_list.append(News(
                    _id=d['ID'],
                    title=d['Title'],
                    image=d['PicUrl'],
                    overview=d['abstract'],
                    origin=d['Key'],
                    url=index_url + d['APage'],
                    publish_time=d['updatetime']
                ).to_dict())
    return news_list


if __name__ == '__main__':
    cur_page = 1
    client = MongodbClient('lianshangwang')
    while True:
        result_list = fetch_news(cur_page)
        client.insert_many(result_list)
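        # keep paging while the last item is less than 12 hours (43200 s) old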
        if int(round(time.time())) - int(
                time.mktime(time.strptime(result_list[-1]['publish_time'], "%Y/%m/%d %H:%M:%S"))) < 43200:
            cur_page += 1
            continue
        else:
            break
    print("联商网爬取完毕!")
Example #16
                     'a': 'ajaxNews',
                     'cid': 4,
                     'page': page
                 },
                 headers=headers)
    print('爬取:', resp.url)
    if resp is not None:
        resp.encoding = 'utf8'
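        # the body is wrapped in one extra character on each side; strip them before parsing the JSON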
        rst = json.loads(resp.text[1:-1])['rst']
        pq = PyQuery(rst)
        news_item = pq('div.item-news')
        for item in news_item.items():
            a_url = item('div > p > a').attr('href')
            item_main = title_extract_pattern.search(
                item('div.item-main').text())
            if item_main is not None:
                news_list.append(
                    News(_id=a_url.split('/')[-1].replace('.html', ''),
                         url=a_url,
                         title=item_main.group(1),
                         overview=item_main.group(2),
                         publish_time=item('div.item-date').text()).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('jiemian')
    client.insert_many(fetch_news(1))
    client.insert_many(fetch_news(2))
    print("界面新闻爬取完毕!")
Example #17
    bs = BeautifulSoup(resp.text, 'lxml')
    data_list = bs.find("ul", attrs={'class': 'gallery l-list-selected l-m'})
    lis = data_list.findAll('li')
    for li in lis:
        l_cbox = li.find('div', attrs={'class': 'l-cbox'})
        spans = l_cbox.find('div', attrs={
            'class': 'l-foot-par'
        }).findAll('span')
        news_id_result = xhs_news_id_pattern.search(li.a['href'])
        if news_id_result is not None:
            # Compare the publish time with the current time and only keep news from the last 12 hours (43200 s)
            publish_time = spans[1].text.replace('\n', '').strip()
            if int(round(time.time())) - int(
                    time.mktime(
                        time.strptime(publish_time,
                                      "%Y-%m-%d %H:%M:%S"))) < 43200:
                news_list.append(
                    News(_id=news_id_result.group(1),
                         url=li.a['href'],
                         title=li.a.img['alt'],
                         image=xhs_gd_url + li.a.img['src'],
                         origin=spans[0].text,
                         publish_time=publish_time,
                         overview=l_cbox.p.text).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('xinhuashe')
    client.insert_many(fetch_xh_focus())
    client.insert_many(fetch_gd_news())
Example #18
        for li in pq('ul.main-wrap > li').items():
            url = index_url + li('div.childR > p > a').attr('href')
            img = li('div > a > img')
            news_list.append(News(
                _id=url.split('/')[-1],
                url=url,
                title=img.attr('alt'),
                overview=li('div.childR > p > a').text(),
                image=index_url[:-1] + img.attr('src'),
                publish_time=li('div.childR > div.time').text()
            ).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('huanqiulvxun')
    cur_page = 1
    while True:
        result_list = fetch_news(cur_page)
        client.insert_many(result_list)
        last_publish_time = result_list[-1]['publish_time']
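        # publish_time is relative ("N分钟前" / "N小时前"); keep paging while items are under 12 hours old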
        if '分钟前' in last_publish_time:
            cur_page += 1
            continue
        elif '小时前' in last_publish_time:
            hours_before = hours_pattern.search(last_publish_time)
            if hours_before is not None:
                if int(hours_before.group(1)) < 12:
                    cur_page += 1
                    continue
                else:
Example #19
    print("爬取:", resp.url)
    if resp is not None:
        pq = PyQuery(resp.text)
        news = pq('div.ws-newsflash-list01')
        for n in news.items():
            a = n('a')
            news_list.append(
                News(_id=a.attr('href').split('/')[-1],
                     url=index_url + a.attr('href'),
                     title=a('h5').text(),
                     overview=n('div.ws-newsflash-content').text().replace(
                         "【查看原文】", ''),
                     publish_time=n('div > div > time').text()).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('diyidiandong')
    cur_page = 1
    while True:
        result_list = fetch_news(cur_page)
        client.insert_many(result_list)
        last_publish_time = result_list[-1]['publish_time']
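        # keep paging while the last item is less than 12 hours (43200 s) old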
        if int(round(time.time())) - int(
                time.mktime(time.strptime(last_publish_time,
                                          "%Y-%m-%d %H:%M"))) < 43200:
            cur_page += 1
            continue
        else:
            break
Example #20
    resp = r.post(ajax_url, data=json.dumps(ajax_params), headers=headers)
    if resp is not None:
        res = resp.json()
        for i in res['res']:
            news_list.append(
                News(_id=i['id'],
                     title=i['title'],
                     overview=i['content'],
                     publish_time=i['create_time'],
                     origin=i['src_name'],
                     url=news_detail_url + i['uid']).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('dongmaiwang')
    cur_page = 1
    while True:
        print("爬取第%d页" % cur_page)
        result_list = fetch_news(cur_page)
        client.insert_many(result_list)
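        # keep paging while the last item is less than 12 hours (43200 s) old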
        if int(round(time.time())) - int(
                time.mktime(
                    time.strptime(result_list[-1]['publish_time'],
                                  "%Y-%m-%d %H:%M:%S"))) < 43200:
            cur_page += 1
            continue
        else:
            break
    print("动脉网爬取完毕!")
Example #21
        resp_json = resp.json()
        items = resp_json['data']['items']
        for item in items:
            post = item['post']
            motifs = post['motifs']
            motifs_name = motifs[0]['name'] if motifs else ''  # handle both None and an empty list
            data_list.append(
                News(_id=str(item['id']),
                     title=post['title'],
                     url=news_detail_base_url + str(post['id']),
                     image=post['cover'],
                     publish_time=post['published_at'],
                     overview=post['summary'],
                     origin=post['user']['name'] + '|' +
                     motifs_name).to_dict())
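        # stop recursing once the last item is more than 24 hours (86400 s) old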
        if int(round(time.time())) - int(
                time.mktime(
                    time.strptime(items[-1]['post']['published_at'],
                                  "%Y-%m-%d %H:%M:%S"))) > 86400:
            return None
        else:
            return fetch_web_news_more(items[-1]['id'])


if __name__ == '__main__':
    result_list, end_id = fetch_web_news()
    fetch_web_news_more(end_id)
    client = MongodbClient('36Kr')
    client.insert_many(result_list)
    client.insert_many(data_list)
Example #22
def fetch_news():
    news_list = []
    resp = r.get(index_url, headers=headers)
    print("抓取:", resp.url)
    if resp is not None:
        resp.encoding = 'utf8'
        pq = PyQuery(resp.text)
        data_list = pq('ul#date-list-ul')
        for li in data_list('li').items():
            img = li('a > img')
            print(li('p').text())
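            # the first span holds the publish time and the last span the source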
            news_list.append(
                News(
                    url=li('a').attr('href'),
                    _id=li('a').attr('href').split('/')[-1].replace(
                        '.html', ''),
                    title=img.attr('alt'),
                    image=img.attr('src'),
                    overview=li('div#list-t p#list-abs').text(),
                    publish_time=li(
                        'div#list-t > p#list-sm span:first').text(),
                    origin=li('div#list-t > p#list-sm > span:last').text(),
                ).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('xinlvjie')
    client.insert_many(fetch_news())
    print("新旅社爬取完毕!")
Example #23
    if resp is not None:
        d = resp.json()['d']
        for i in d:
            news_list.append(
                News(
                    _id=i['ArticleId'],
                    title=i['Title'],
                    overview=i['BriefContent'],
                    url=index_url[:-1] + i['LinkUrl'],
                    publish_time=i['IssueTime'],
                ).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('gasgoo')
    cur_page = 1
    while True:
        print("爬取第%d页" % cur_page)
        result_list = fetch_news(cur_page)
        client.insert_many(result_list)
        last_publish_time = result_list[-1]['publish_time']
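        # keep paging while the last item is less than 12 hours (43200 s) old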
        if int(round(time.time())) - int(
                time.mktime(
                    time.strptime(last_publish_time,
                                  "%Y-%m-%d %H:%M:%S"))) < 43200:
            cur_page += 1
            continue
        else:
            break
    print("盖世汽车资讯爬取完毕!")