Example #1
def writesth(path, hot_topics):
    file = open(path, "a", encoding="utf-8")
    file.write("抓取豆瓣内容")
    for topic in hot_topics:
        try:
            title = topic.find('a').get_text()
            href = topic.find('a').get('href')
            print(title, href)
            headers2 = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
                'Referer': href
            }
            com_id = re.findall(r"\d+", href)[0]
            nums = ["20"]
            for num in nums:
                url = "https://m.douban.com/rexxar/api/v2/gallery/topic/" + str(
                    com_id
                ) + "/items?sort=hot&start=0&count=" + num + "&status_full_text=1&guest_only=0&ck=null"
                try:
                    html = download_page.download_html_waitting(
                        url, headers2, 1)
                    res = json.loads(html)
                    for item in res["items"]:
                        file.write(format_str(item["abstract"]) + '\n')
                        print(item["abstract"])
                except Exception as e:
                    print("Except——豆瓣:爬取热评失败", e)
        except Exception as e:
            print("Except——豆瓣:爬取24小时热门话题失败", e)
    file.close()
    return 'success'
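All of the examples lean on a shared download_page module and a format_str helper that are not part of this listing. Purely as an assumption inferred from how they are called (the return types, and the "none" sentinel that Example #6 tests for), they might look roughly like this:

import re
import time
import requests
from bs4 import BeautifulSoup

def format_str(text):
    # assumed behavior: collapse whitespace so each record occupies one line
    return re.sub(r'\s+', ' ', text).strip()

def download_html_waitting(url, headers, wait_seconds):
    # assumed behavior: wait briefly, GET the page, and return the raw body,
    # or the sentinel string "none" on failure (Example #6 checks for it;
    # Example #3 decodes the body from gbk itself)
    try:
        time.sleep(wait_seconds)
        resp = requests.get(url, headers=headers, timeout=10)
        resp.raise_for_status()
        return resp.content
    except requests.RequestException:
        return "none"

def download_soup_waitting(url, headers, wait_seconds):
    # assumed behavior: the same download, parsed into a BeautifulSoup tree
    return BeautifulSoup(download_html_waitting(url, headers, wait_seconds),
                         "html.parser")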
Example #2
def newsCrawler(path):
    tags = ['china','society','world']
    for tag in tags:
        items = []  # reset per tag so a failed request does not reuse the previous tag's items
        url = r'http://news.cctv.com/' + tag + r'/data/index.json'
        try:
            result = download_page.download_html_waitting(url,headers,1)
            result = json.loads(result,strict=False)
            items = result['rollData']
        except Exception as e:
            print("Except-新闻列表",e)
        # write to file
        file = open(path, "a",encoding="utf-8")
        file.write("抓取新闻内容")
        if items != []:
            for item in items:
                title = item["title"]
                url = item['url']
                try:
                    soup = download_page.download_soup_waitting(url,headers,1)
                    content = soup.find('div', {'class': 'cnt_bd'})
                    # strip irrelevant tags
                    [s.extract() for s in content(['div', 'script'])]
                    # print title, content.get_text().strip().replace('\n', '')
                    result = title + ":" + content.get_text().strip().replace('\n', '')
                    file.write(format_str(result.encode('utf-8','ignore').decode('utf-8','ignore'))+'\n')
                    print(result)
                except Exception as e:
                    print ("Except - 新闻:"+url,e)
        file.close()
    return 'success'
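The one-liner [s.extract() for s in content(['div', 'script'])] in Example #2 strips nested <div> and <script> tags before the article text is taken. A small self-contained illustration of that BeautifulSoup idiom, using made-up markup:

from bs4 import BeautifulSoup

html = ('<div class="cnt_bd"><p>Body text.</p>'
        '<script>track();</script><div>related links</div></div>')
soup = BeautifulSoup(html, "html.parser")
content = soup.find('div', {'class': 'cnt_bd'})

# calling a tag with a list of names is shorthand for find_all();
# extract() then detaches every matching descendant from the tree
[s.extract() for s in content(['div', 'script'])]

print(content.get_text().strip())  # prints: Body text.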
Example #3
def sportCrawler(path):
    from bs4 import BeautifulSoup  # import once instead of inside the item loop
    # write to file
    file = open(path, 'a', encoding="utf-8")
    file.write("抓取体育内容")
    for url in urlcol:
        items = []  # reset per URL so a failed parse does not reuse stale items
        result = download_page.download_html_waitting(url, headers, 1)
        try:
            result = str(result, encoding="gbk").replace(
                "data_callback(", '{"data_callback":', 1)[:-1] + "}"
            result = json.loads(result, strict=False)
            items = result['data_callback']
        except Exception as e:
            print("Except-体育列表", e)
        if items != []:
            for item in items:
                title = item['title']
                docurl = item['docurl']
                file.write(format_str(title))
                print(title, docurl)
                res = requests.get(docurl, headers=headers)
                res.encoding = 'gb2312'
                soup = BeautifulSoup(res.text, "html.parser")
                # print(soup)
                try:
                    post = soup.find('div', id="endText")
                    if post is None:
                        print("格式不相符")
                    else:
                        text = post.get_text().strip()
                        result = text.replace('\n', '')
                        file.write(format_str(result) + '\n')
                        print(result)
                except Exception as e:
                    print("Exception -- skipping to next link", e)
    file.close()
    return 'success'
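The endpoints in urlcol (defined elsewhere) apparently return JSONP rather than JSON, so Example #3 rewrites the data_callback(...) wrapper into an object literal before parsing. The same rewrite on a made-up payload:

import json

# made-up JSONP payload shaped like the sport feed response
raw = b'data_callback([{"title": "Sample headline", "docurl": "http://example.com/a.html"}])'

# same rewrite as in the crawler: turn the callback wrapper into a JSON object key
text = str(raw, encoding="gbk").replace("data_callback(", '{"data_callback":', 1)[:-1] + "}"
items = json.loads(text, strict=False)["data_callback"]

for item in items:
    print(item["title"], item["docurl"])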
Example #4
    def getdata(self):  # fetch the data
        req = self.s.get(url=self.url, verify=False)  # initial request, presumably to pick up session cookies
        headers = {'referer': self.url}
        max_behot_time = '0'  # paging cursor; advanced after each feed request
        # placeholder values, overwritten by get_js() on every iteration
        signature = '.1.hXgAApDNVcKHe5jmqy.9f4U'
        eas = 'A1E56B6786B47FE'
        ecp = '5B7674A7FF2E9E1'
        self.s.headers.update(headers)

        titles = []
        abstracts = []
        for i in range(0, 10):
            Honey = json.loads(self.get_js())
            eas = Honey['as']
            ecp = Honey['cp']
            signature = Honey['_signature']
            url = 'https://www.toutiao.com/api/pc/feed/?category={}&utm_source=toutiao&widen=1&max_behot_time={}&max_behot_time_tmp={}&tadrequire=true&as={}&cp={}&_signature={}'.format(
                self.channel, max_behot_time, max_behot_time, eas, ecp, signature)
            req = self.s.get(url=url, verify=False)
            time.sleep(random.random() * 2 + 2)
            # print(req.text)
            print(url)
            j = json.loads(req.text)

            items = j['data']
            # write to file
            file = open(self.path,'a',encoding="utf-8")
            file.write("抓取电影内容")
            for item in items:
                try:
                    title = item['title']
                    abstract = item['abstract']
                    file.write(format_str(title + ":" +abstract)+'\n')
                    print(title + " : " + abstract)
                    titles.append(title)  # title
                    try:
                        abstracts.append(abstract)  # article abstract
                    except Exception as e:
                        abstracts.append('')
                except Exception as e:
                    print("Except - 头条",e)
            file.close()
            time.sleep(2)
            max_behot_time = str(j['next']['max_behot_time'])  # advance the paging cursor for the next request
            print('------------' + max_behot_time)
Example #5
def esportsCrawler(path):
    for i in range(1, 10):
        url = 'http://www.dadianjing.cn/index.php?m=Index&a=xhrList&cid=1&page=' + str(
            i)
        try:
            result = download_page.download_html_waitting(url, headers, 1)
            result = json.loads(result, strict=False)
            items = result["data"]["list"]
            # write to file
            file = open(path, "a", encoding="utf-8")
            file.write("抓取电竞内容")
            for item in items:
                title = item['title']
                summary = item['summary']
                file.write(format_str((title + ":" + summary)) + '\n')
                print(title + "---" + summary)
            file.close()
        except Exception as e:
            print("Except - 电竞:" + url, e)
    return 'success'
Example #6
def hotmovieCrawler(path):
    start = 0
    # keep requesting pages until the start offset passes 40
    file = open(path, "a", encoding="utf-8")
    file.write("抓取电影内容")
    while start <= 40:
        # build the request URL from the tag and the start offset
        url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=热门&sort=recommend&page_limit=20&page_start=' + str(
            start)
        result = download_page.download_html_waitting(url, headers, 1)
        if result != "none":
            print("-------------没有使用代理---------------")
            result = json.loads(result)
            movie_items = result['subjects']
            for movie_item in movie_items:
                movie_url = movie_item['url']
                # extract the movie synopsis
                # catch exceptions: some movie detail pages have no synopsis
                try:
                    html = requests.get(movie_url).content
                    soup = BeautifulSoup(html, "html.parser")
                    description = soup.find_all(
                        "span",
                        attrs={"property": "v:summary"
                               })[0].get_text().strip().replace('\n', '')
                    file.write(
                        format_str(
                            description.encode('utf-8', 'ignore').decode(
                                'utf-8', 'ignore')) + '\n')
                    print(description)
                except Exception as e:
                    print("该电影没有简介", e)
                time.sleep(0.5)
        else:
            print("-------------使用代理---------------")
            # get a list of proxy IPs
            ip_list = ipAgency.get_ip_list(headers)
            # print(ip_list)
            for ip in ip_list:
                hd, port = ip.split(':')
                try:
                    telnetlib.Telnet(hd, port=port, timeout=20)
                except Exception:
                    print(str(ip) + ' failed')
                else:
                    try:
                        proxies = get_proxy(ip)
                        requests.adapters.DEFAULT_RETRIES = 5  # increase the retry count
                        s = requests.session()
                        s.keep_alive = False  # close idle connections
                        s.proxies = proxies
                        s.headers = headers
                        html = s.get(url).content  # go through the proxy session
                        if not html:
                            continue
                        result = json.loads(html)
                        movie_items = result['subjects']
                        for movie_item in movie_items:
                            movie_url = movie_item['url']
                            # extract the movie synopsis
                            # catch exceptions: some movie detail pages have no synopsis
                            try:
                                html = s.get(movie_url).content  # fetch the detail page through the proxy session
                                soup = BeautifulSoup(html, "html.parser")
                                description = soup.find_all(
                                    "span", attrs={
                                        "property": "v:summary"
                                    })[0].get_text().strip().replace('\n', '')
                                file.write(
                                    format_str(
                                        description.encode('utf-8', 'ignore').
                                        decode('utf-8', 'ignore')) + '\n')
                                print(description)
                            except Exception as e:
                                print("该电影没有简介", e)
                            time.sleep(0.5)
                    except Exception as e:
                        print("Except——电影:", e)

        start += 20
    file.close()
    return 'success'
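Example #6 falls back to free proxies when the direct request returns the "none" sentinel. ipAgency.get_ip_list and get_proxy are not shown in this listing; assuming get_ip_list yields "host:port" strings, get_proxy presumably just builds the proxies mapping that requests expects:

def get_proxy(ip):
    # hypothetical helper: turn a "host:port" string into a requests proxies dict
    return {
        "http": "http://" + ip,
        "https": "http://" + ip,
    }

# usage as in Example #6
proxies = get_proxy("127.0.0.1:8080")
# s = requests.session(); s.proxies = proxies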
Example #7
def sinaCrawler(path):
    # a) Weibo hot searches
    hot_tag = []
    hot_tag.append("realtimehot")
    hot_tag.append("socialevent")
    # write to file
    file = open(path, "a", encoding="utf-8")
    file.write("抓取新浪微博内容")
    for tag in hot_tag:
        soup = download_page.download_soup_waitting(
            "https://s.weibo.com/top/summary?cate=" + tag, headers, 1)
        try:
            hot_list = soup.find_all('td', attrs={"class": "td-02"})
            for hot in hot_list:
                title = hot.find("a").get_text()
                href = "https://s.weibo.com" + hot.find("a").get('href')
                file.write(format_str(title))
                print(title + ":" + href)
                # filter out invalid links
                if href != "https://s.weibo.comjavascript:void(0);":
                    detail_soup = download_page.download_soup_waitting(
                        href, headers, 1)
                    # print(detail_soup)
                    cards = detail_soup.find_all("div",
                                                 attrs={"class": "card-feed"})
                    for card in cards:
                        content = card.find("div", attrs={"class": "content"})
                        blogger = content.find("p", attrs={
                            "class": "txt"
                        }).get('nick-name')
                        blog = content.find("p", attrs={
                            "class": "txt"
                        }).get_text()
                        file.write(format_str(blogger + ":" + blog))
                        print(blogger + ":" + blog)
        except Exception as e:
            print("Except——新浪:爬取异常,已跳过", e)

    # b) Weibo feed
    url_hotfeed = "https://api.weibo.cn/2/guest/cardlist?gsid=_2AkMu5Br-f8NhqwJRmPAcz2PmZYl_yQ3EieKYuOslJRM3HRl-3T9kqnwvtRWwLB-1C2SEmptvAP1Bfy0s7kgEgw..&uid=1008938494835&wm=3333_2001&i=8bb4ee5&b=1&from=1073193010&checktoken=807ca79ae3fa897b262e3b63c3882698&c=iphone&networktype=wifi&v_p=45&skin=default&s=ee9f63c1&v_f=1&did=eb4621d547f0e7cb9eef4a41403ee866&lang=zh_CN&sflag=1&ua=iPhone9,2__weibo__7.3.1__iphone__os10.3.1&aid=01AhjayctpFPjOzJEmy46JLMop9TgsXKgsxZQYIpcPoBa-nn8.&lon=116.2697240292689&count=20&fid=230584&containerid=230584&uicode=10000011&lat=40.04127809492162&offset=1&max_id=4151604225452173&page=1&moduleID=pagecard"
    url_starfeed = "https://api.weibo.cn/2/guest/cardlist?gsid=_2AkMu5WfMf8NhqwJRmPAcz2PmZYl_yQ3EieKYuZYXJRM3HRl-3T9kqnZftRVqWDRdwTGKDWtA7iBOAX-N3elOcA..&uid=1008938494835&wm=3333_2001&i=8bb4ee5&b=1&from=1073193010&checktoken=807ca79ae3fa897b262e3b63c3882698&c=iphone&networktype=wifi&v_p=45&skin=default&s=ee9f63c1&v_f=1&did=eb4621d547f0e7cb9eef4a41403ee866&lang=zh_CN&sflag=1&ua=iPhone9,2__weibo__7.3.1__iphone__os10.3.1&aid=01AhjayctpFPjOzJEmy46JLMop9TgsXKgsxZQYIpcPoBa-nn8.&lon=116.2697240292689&count=20&fid=230781&containerid=230781&uicode=10000011&lat=40.04127809492162&offset=1&max_id=4140648884038081&page=1&moduleID=pagecard"
    urlcol = []
    urlcol.append(url_hotfeed)
    urlcol.append(url_starfeed)
    for url in urlcol:
        print("正在获取微博信息流...")
        res = download_page.download_html_waitting(url, headers, 1)
        try:
            res = json.loads(res)
            for cards in res["cards"]:
                # print cards
                if cards["card_type"] == 9:
                    if "text" in cards["mblog"]:
                        # print cards["mblog"]["text"]
                        file.write(cards["mblog"]["text"])
                        print(cards["mblog"]["text"])
        except KeyError as e:
            print("Except——新浪: " + str(e))

    file.close()
    return 'success'