Пример #1
0
    def DL_getPageLink(self, url):
        """Crawl one www.dce.com.cn listing page and follow its next-page link.

        The page at ``url`` is fetched and parsed with BeautifulSoup. Every
        ``<li>`` inside the second ``div.portlet`` is inspected: the text of
        its ``<span>`` is passed to ``self.timecompare(text, 10)``. When that
        call is truthy, the item's detail page is fetched through
        ``self.DL_getItemLink`` and an ``Info('DL')`` record is filled in and
        handed to ``self.pipeline.saveInfo``. When it is falsy, the item is
        skipped and the page is marked as the last one to crawl.

        If any item on the page was rejected, ``self.catchcount`` is
        incremented and crawling stops. Otherwise the third ``<a>`` of
        ``div.pagination`` supplies the next page (its ``tagname``
        attribute) and this method calls itself on it.

        Args:
            url: Absolute URL of the listing page to crawl.

        Returns:
            None. Failures are reported with ``print`` rather than raised:
            a network-level failure or unexpected page markup ends the
            crawl for this page.
        """
        # Function-scope import so the method does not rely on URLError
        # being present in the file's import block.
        from urllib.error import URLError

        try:
            response = urlopen(url)
        except URLError:
            # HTTPError subclasses URLError, so this still covers HTTP status
            # errors while also handling DNS/connection failures.
            print("网络有故障!")
            return

        if response is None:
            print("请求链接服务端无法处理哦!")
            return

        try:
            # Release the connection as soon as the body has been read.
            try:
                markup = response.read()
            finally:
                response.close()

            bsObj = BeautifulSoup(markup, "html.parser")
            entries = bsObj.findAll(
                "div", {"class": {"portlet"}})[1].findAll("li")

            stop_flag = False
            for item in entries:
                cmp_str_time = item.find("span").get_text()
                if not self.timecompare(cmp_str_time, 10):
                    # Keep scanning the remaining items, but do not follow
                    # the pagination once this page is done.
                    stop_flag = True
                    continue

                anchor = item.find("a")
                item_url = "http://www.dce.com.cn" + anchor.attrs["href"]
                content = self.DL_getItemLink(item_url)

                # Store the fetched announcement in an Info record.
                info_item = Info('DL')
                info_item.setTitle(anchor.get_text())
                info_item.setLink(item_url)
                info_item.setPubtime(cmp_str_time)
                info_item.setCatchTime(self.catchcount)
                if content is None:
                    info_item.setContent("公告内容为空")
                else:
                    info_item.setContent(content)

                self.pipeline.saveInfo(info_item)

            if stop_flag:
                self.catchcount += 1
                return

            page_item = bsObj.find(
                "div", {"class": {"pagination"}}).findAll("a")[2]
            # The next-page target is read from the "tagname" attribute here,
            # not from "href".
            next_page_url = "http://www.dce.com.cn" + page_item.attrs["tagname"]
            print("next_page_url %s" % next_page_url)
            self.DL_getPageLink(next_page_url)
        except (AttributeError, IndexError, KeyError):
            # Missing tags surface as AttributeError (find() returned None),
            # too-short result lists as IndexError, absent attributes as
            # KeyError. All of them mean the markup was not as expected.
            print("BS解析出错啦!")
Пример #2
0
    def SH_getPageLink(self, url):
        """Crawl one www.shfe.com.cn notice page and follow its next-page link.

        The page at ``url`` is fetched and parsed with BeautifulSoup. Every
        ``<li>`` inside ``div.lawbox`` is inspected: the text of its
        ``<span>``, with the first and last character stripped, is passed to
        ``self.timecompare(text, 10)``. When that call is truthy, the item's
        detail page is fetched through ``self.SH_getItemLink`` and an
        ``Info('SH')`` record is filled in and handed to
        ``self.pipeline.saveInfo``. When it is falsy, the item is skipped and
        the page is marked as the last one to crawl.

        If any item on the page was rejected, ``self.catchcount`` is
        incremented and crawling stops. Otherwise the third ``<a>`` of
        ``div.page-no`` supplies the next page (its ``href`` attribute,
        relative to ``/news/notice/``) and this method calls itself on it.

        Args:
            url: Absolute URL of the listing page to crawl.

        Returns:
            None. Failures are reported with ``print`` rather than raised:
            a network-level failure or unexpected page markup ends the
            crawl for this page.
        """
        # Function-scope import so the method does not rely on URLError
        # being present in the file's import block.
        from urllib.error import URLError

        try:
            response = urlopen(url)
        except URLError:
            # HTTPError subclasses URLError, so this still covers HTTP status
            # errors while also handling DNS/connection failures.
            print("网络有故障!")
            return

        if response is None:
            print("请求链接服务端无法处理哦!")
            return

        try:
            # Release the connection as soon as the body has been read.
            try:
                markup = response.read()
            finally:
                response.close()

            bsObj = BeautifulSoup(markup, "html.parser")
            entries = bsObj.find("div", {"class": {"lawbox"}}).findAll("li")

            stop_flag = False
            for item in entries:
                # Drop the first and last character of the span text
                # (presumably surrounding brackets -- confirm against the
                # live page).
                cmp_str_time = item.find("span").get_text()[1:-1]
                if not self.timecompare(cmp_str_time, 10):
                    # Keep scanning the remaining items, but do not follow
                    # the pagination once this page is done.
                    stop_flag = True
                    continue

                anchor = item.find("a")
                item_url = "http://www.shfe.com.cn" + anchor.attrs["href"]
                content = self.SH_getItemLink(item_url)

                # Store the fetched announcement in an Info record.
                info_item = Info('SH')
                info_item.setTitle(anchor.get_text())
                info_item.setLink(item_url)
                info_item.setPubtime(cmp_str_time)
                info_item.setCatchTime(self.catchcount)
                if content is None:
                    info_item.setContent("公告内容为空")
                else:
                    info_item.setContent(content)

                self.pipeline.saveInfo(info_item)

            if stop_flag:
                self.catchcount += 1
                return

            page_item = bsObj.find(
                "div", {"class": {"page-no"}}).findAll("a")[2]
            next_page_url = (
                "http://www.shfe.com.cn/news/notice/" + page_item.attrs["href"]
            )
            self.SH_getPageLink(next_page_url)
        except (AttributeError, IndexError, KeyError):
            # Missing tags surface as AttributeError (find() returned None),
            # too-short result lists as IndexError, absent attributes as
            # KeyError. All of them mean the markup was not as expected.
            print("BS解析出错啦!")