def save_article(self, categoryName, url, imgurl):
        """Download one article page, extract its fields and post them.

        Args:
            categoryName: category value forwarded to ``paramater``.
            url: address of the article page to download.
            imgurl: image URL forwarded to ``paramater``.

        Returns:
            ``False`` when ``DateUtil.verify_time`` rejects the publication
            time; ``True`` in every other case, including when the page
            could not be downloaded or no content was extracted.
        """
        logger.info("视频网页地址:" + url)
        page = HttpUtil.get_html(url)
        if page is None:
            return True

        extractor = self.obtainInfo
        title = extractor.find_title(page)
        public_time = extractor.find_time(page)
        if not DateUtil.verify_time(public_time):
            return False

        subject = extractor.find_subject(page)
        context = extractor.find_context(page)
        tags = extractor.find_tags(page)
        if context is None:
            return True

        # Base64-encode the UTF-8 bytes of the article content.
        # NOTE(review): on Python 3, str() of bytes yields the b'...' repr;
        # confirm the receiving service expects that form.
        encoded = base64.b64encode(context.encode('utf-8'))
        record = paramater(categoryName, title, '', '', str(subject),
                           str(encoded), imgurl, tags, '投资界', url,
                           public_time)
        # Round-trip through JSON so the payload holds only JSON-native values.
        payload = json.loads(json.dumps(record.__dict__, ensure_ascii=False))
        HttpUtil.post(payload)
        return True
示例#2
0
    def save_article(self, categoryName, tag, url, imgurl):
        """Download an article page, extract its fields and post them.

        Args:
            categoryName: category value forwarded to ``paramater``.
            tag: tag value forwarded to ``paramater`` unchanged.
            url: address of the article page to download.
            imgurl: image URL forwarded to ``paramater``.

        Returns:
            ``False`` when ``DateUtil.verify_time`` rejects the publication
            time; ``True`` in every other case, including when the page
            could not be downloaded or no content was extracted.
        """
        page = HttpUtil.get_html(url)
        if page is None:
            return True

        title = self.obtainInfo.find_title(page)
        author_info = self.obtainInfo.find_author_info(page)
        context = self.obtainInfo.find_context(page)
        subject = self.obtainInfo.find_subject(page)

        # An entry with ':' somewhere after its first character is taken as
        # the publication time; any other entry is taken as the author.
        # When several entries match, the last one wins.
        author = ''
        public_time = ''
        for entry in author_info:
            if entry.find(':') > 0:
                public_time = entry
            else:
                author = entry

        # Times containing '年' (after the first character) are converted by
        # DateUtil.time_transfer before validation.
        if public_time.find('年') > 0:
            public_time = DateUtil.time_transfer(public_time)
        if not DateUtil.verify_time(public_time):
            return False
        if context is None:
            return True

        # Base64-encode the UTF-8 bytes of the article content.
        # NOTE(review): on Python 3, str() of bytes yields the b'...' repr;
        # confirm the receiving service expects that form.
        encoded = base64.b64encode(context.encode('utf-8'))
        record = paramater(categoryName, title, author, author, str(subject),
                           str(encoded), imgurl, tag, '36Kr网', url,
                           public_time)
        # Round-trip through JSON so the payload holds only JSON-native values.
        payload = json.loads(json.dumps(record.__dict__, ensure_ascii=False))
        HttpUtil.post(payload)
        return True
示例#3
0
    def save_article(self, categoryName, url, imgurl):
        """Download an article page, extract its fields and post them.

        Args:
            categoryName: category value forwarded to ``paramater``.
            url: address of the article page to download.
            imgurl: image URL forwarded to ``paramater``.

        Returns:
            ``False`` when ``DateUtil.verify_time`` rejects the publication
            time; ``True`` in every other case, including when the page
            could not be downloaded or no content was extracted.
        """
        logger.info("视频网页地址:" + url)
        data = HttpUtil.get_html(url)
        if data is None:
            return True

        soup_obj = self.obtainInfo.get_soup_obj(data)
        title = self.obtainInfo.get_title(soup_obj)
        public_time = self.obtainInfo.get_time(soup_obj)

        # Fall back to the current local time when no time was found or the
        # value contains '前' (presumably a relative time such as "3小时前").
        # A membership test is required here: str.find() returns -1 (truthy)
        # when the substring is absent and 0 (falsy) when it is at index 0,
        # so using its result as a boolean inverts the intended check.
        if public_time is None or '前' in public_time:
            public_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

        if not DateUtil.verify_time(public_time):
            return False

        subject = self.obtainInfo.get_desc(soup_obj)
        context = self.obtainInfo.get_content(soup_obj)
        tags = self.obtainInfo.get_tag(soup_obj)
        author = self.obtainInfo.get_author(soup_obj)
        if context is not None:
            # Base64-encode the UTF-8 bytes of the article content.
            # NOTE(review): on Python 3, str() of bytes yields the b'...'
            # repr; confirm the receiving service expects that form.
            bs64 = base64.b64encode(context.encode('utf-8'))
            p = paramater(categoryName, title, author, author,
                          str(subject), str(bs64),
                          imgurl, tags, self.default_value_source, url,
                          public_time)
            # Round-trip through JSON so the payload holds only JSON-native
            # values, then send it.
            js = json.loads(json.dumps(p.__dict__, ensure_ascii=False))
            HttpUtil.post(js)

        return True
示例#4
0
    def save_video(self, categoryName, tags, url, img, time):
        """Download a video page, wrap its player address and post it.

        Args:
            categoryName: category value forwarded to ``paramater``.
            tags: tag value forwarded to ``paramater``.
            url: address of the video page to download.
            img: image URL forwarded to ``paramater``.
            time: publication time forwarded to ``paramater`` (this
                parameter shadows the ``time`` module inside the method).

        Returns:
            None. Nothing is posted when the page cannot be downloaded or
            no video address is found in it.
        """
        logger.info("视频网页地址:" + url)
        page = HttpUtil.get_html(url)
        if page is None:
            return

        video_src = self.obtainInfo.find_video_context(page)
        if video_src is None:
            return

        # Embed the extracted video address in an iframe snippet.
        iframe = ('<iframe  width="680" height="480"  src="' + video_src
                  + '" frameborder=0 allowfullscreen></iframe>')
        author = self.obtainInfo.find_video_author(page)
        title = self.obtainInfo.find_video_title(page)

        # Base64-encode the UTF-8 bytes of the iframe snippet.
        # NOTE(review): on Python 3, str() of bytes yields the b'...' repr;
        # confirm the receiving service expects that form.
        encoded = base64.b64encode(iframe.encode('utf-8'))
        # The title doubles as the subject.
        record = paramater(categoryName, title, author, author,
                           str(title), str(encoded),
                           img, tags, '金斧子', url, time)
        # Round-trip through JSON so the payload holds only JSON-native values.
        payload = json.loads(json.dumps(record.__dict__, ensure_ascii=False))
        HttpUtil.post(payload)
示例#5
0
    def run(self):
        """Save the articles of the first page, then follow the paging API.

        The first page is fetched from ``self.url`` and parsed with
        ``find_pages1``. Follow-up pages are requested from ``self.sub_url``
        with the ``b_id`` returned by the previous parse and parsed with
        ``find_pages2``. The method returns when a fetch yields ``None`` or
        a follow-up page lists no articles.
        """
        # Build the message explicitly: passing thread_id as a second
        # positional argument without a placeholder is a formatting error
        # for the standard logging API.
        logger.info("开始线程:" + str(self.thread_id))
        act_url = self.url
        logger.info(act_url)
        html = HttpUtil.get_html(act_url)
        if html is None:
            return

        pages, b_id = self.obtain.find_pages1(html.decode("UTF-8"))
        for key, value in pages.items():
            self.save.save_article(self.categoryName, self.tag, key, value)

        while True:
            act_url = self.sub_url + '&b_id=' + str(b_id) + '&per_page=30'
            logger.info('分页URL:' + act_url)
            html = HttpUtil.get_html(act_url)
            if html is None:
                return

            pages, b_id = self.obtain.find_pages2(html.decode("UTF-8"))
            if not pages:
                return
            # NOTE(review): save_article's return value is ignored here, so
            # paging only stops on a failed fetch or an empty page — confirm
            # that is intended.
            for key, value in pages.items():
                self.save.save_article(self.categoryName, self.tag, key, value)
示例#6
0
    def run(self):
        """Walk the numbered listing pages and save every article on them.

        Pages are requested as ``self.url + 'p<i>.html'`` for i = 1, 2, ...
        The method returns when a page lists no articles or when
        ``save_article`` returns a falsy value.
        """
        # Build the message explicitly: passing thread_id as a second
        # positional argument without a placeholder is a formatting error
        # for the standard logging API.
        logger.info("开始线程:" + str(self.thread_id))
        i = 0
        while True:
            i = i + 1
            act_url = self.url + 'p' + str(i) + '.html'
            logger.info(act_url)
            html = HttpUtil.get_html(act_url)

            if html is None:
                # NOTE(review): a failed fetch skips to the next page number;
                # if fetches keep failing this loop never ends — consider a
                # retry limit.
                continue

            pages = self.obtain.find_article_pages(html.decode("UTF-8"))
            if len(pages) == 0:
                return
            for key, desc in pages.items():
                if not self.save.save_article(self.categoryName, self.tag,
                                              key, desc):
                    # A falsy result ends the whole crawl, not just this page.
                    return

    def __del__(self):
        """Log that this thread object is being finalized."""
        # Previously this def was nested inside run() and therefore never
        # invoked; it also passed thread_id as the logging format string.
        logger.info(str(self.thread_id) + "线程结束!)")
    def run(self):
        """Walk the numbered listing pages and save every article on them.

        Pages are requested as ``self.url + '<i>-10.shtml'`` for
        i = 1, 2, ... and parsed with ``find_pages1`` or ``find_pages2``
        depending on ``self.type``. The method returns when a page lists no
        articles or when ``save_article`` returns a falsy value.

        Raises:
            ValueError: if ``self.type`` is neither 1 nor 2.
        """
        # Build the message explicitly: passing thread_id as a second
        # positional argument without a placeholder is a formatting error
        # for the standard logging API.
        logger.info("开始线程:" + str(self.thread_id))

        # Choose the parser once, up front. Selecting it inside the loop left
        # `pages` unbound for any other type value.
        if self.type == 1:
            find_pages = self.obtain.find_pages1
        elif self.type == 2:
            find_pages = self.obtain.find_pages2
        else:
            raise ValueError("unsupported type: " + repr(self.type))

        i = 0
        while True:
            i = i + 1
            act_url = self.url + str(i) + '-10.shtml'
            logger.info(act_url)
            html = HttpUtil.get_html(act_url)
            if html is None:
                # NOTE(review): a failed fetch skips to the next page number;
                # if fetches keep failing this loop never ends — consider a
                # retry limit.
                continue

            pages = find_pages(html.decode("UTF-8"))
            if len(pages) == 0:
                return

            for key, value in pages.items():
                if not self.save.save_article(self.categoryName, key, value):
                    # A falsy result ends the whole crawl, not just this page.
                    return
示例#8
0
    def run(self):
        """Walk the paged listing and save every article found on it.

        Pages are requested as ``self.url + '?page=<i>'`` for i = 1, 2, ...
        The method returns when a fetch yields ``None``, a page lists no
        articles, or ``save_article`` returns a falsy value. It sleeps one
        second after each saved article.
        """
        # Build the message explicitly: passing thread_id as a second
        # positional argument without a placeholder is a formatting error
        # for the standard logging API.
        logger.info("开始线程:" + str(self.thread_id))

        i = 0
        while True:
            i = i + 1
            act_url = self.url + "?page=" + str(i)
            # First collect all the links found on this listing page.
            logger.info(act_url)
            html = HttpUtil.get_html(act_url)
            if html is None:
                return

            result = self.obtain.find_page_info_by_html_str(
                html.decode("UTF-8"))
            if len(result) == 0:
                return

            # Then request each linked article in turn.
            for key, value in result.items():
                if not self.save.save_article(self.categoryName, key, value):
                    # A falsy result ends the whole crawl, not just this page.
                    return
                time.sleep(1)
    def run(self):
        """Walk the numbered listing pages and save every article on them.

        Pages are requested as ``self.url + str(i)`` for i = 1, 2, ...
        Each page is parsed with ``find_pages`` and the result is passed to
        ``find_page_info``. The method returns when a fetch yields ``None``,
        a page yields no entries, or ``save_article`` returns a falsy value.
        """
        # Build the message explicitly: passing thread_id as a second
        # positional argument without a placeholder is a formatting error
        # for the standard logging API.
        logger.info("开始线程:" + str(self.thread_id))

        i = 0
        while True:
            i = i + 1
            act_url = self.url + str(i)
            logger.info(act_url)
            html = HttpUtil.get_html(act_url)
            if html is None:
                return

            pages = self.obtain.find_pages(html.decode("UTF-8"))
            result = self.obtain.find_page_info(pages)
            if len(result) == 0:
                return

            for key, value in result.items():
                if not self.save.save_article(self.categoryName, key, value):
                    # A falsy result ends the whole crawl, not just this page.
                    return
示例#10
0
    def run(self):
        """Walk the numbered video listing pages and save each video.

        Pages are requested as ``self.url + str(i)`` for i = 1, 2, ...
        The method returns when a page lists no videos or when
        ``DateUtil.verify_time`` rejects a video's publication time.
        """
        # Build the message explicitly: passing thread_id as a second
        # positional argument without a placeholder is a formatting error
        # for the standard logging API.
        logger.info("开始线程:" + str(self.thread_id))
        i = 0
        while True:
            i = i + 1
            act_url = self.url + str(i)
            logger.info(act_url)
            html = HttpUtil.get_html(act_url)

            if html is None:
                # NOTE(review): a failed fetch skips to the next page number;
                # if fetches keep failing this loop never ends — consider a
                # retry limit.
                continue

            pages, times = self.obtain.find_video_pages(html.decode("UTF-8"))
            if len(pages) == 0:
                return

            # Pair each video with its own timestamp. The previous counter
            # was never incremented, so every video used times[0].
            # Assumes `times` is indexed 0..n-1 in the same order as `pages`
            # — TODO confirm against find_video_pages.
            for idx, (url, img) in enumerate(pages.items()):
                public_time = times[idx] if idx < len(times) else None
                # Fall back to the current local time when the time is
                # missing or contains '前' (presumably a relative time such
                # as "3小时前"). str.find() cannot be used as a boolean here:
                # it returns -1 (truthy) when the substring is absent.
                if public_time is None or '前' in public_time:
                    public_time = time.strftime("%Y-%m-%d %H:%M:%S",
                                                time.localtime())
                if not DateUtil.verify_time(public_time):
                    # A rejected time ends the whole crawl.
                    return
                self.save.save_video(self.categoryName, self.tag, url, img,
                                     public_time)

    def __del__(self):
        """Log that this thread object is being finalized."""
        # Previously this def was nested inside run() and therefore never
        # invoked; it also passed thread_id as the logging format string.
        logger.info(str(self.thread_id) + "线程结束!)")
示例#11
0
from com.unif.util.HttpUtil import HttpUtil

# "http://192.168.30.153:8087/section/findSecById"
# Send a request whose payload carries only an id of 0.
HttpUtil.post({'id': 0})