示例#1
0
文件: base.py 项目: judasnow/ex
    def start_flow(self):
        """Run one complete crawl: list page -> topic pages -> images.

        Fetches the list page, extracts the topic ids from it and, for every
        topic that is not stored yet, stores the topic page and downloads
        each image found on it that is not stored yet.

        An error raised while handling one topic is printed and the crawl
        continues with the next topic.  ``self.http_client`` is closed when
        the flow ends, including when it ends because of an exception raised
        while fetching or parsing the list page.
        """
        try:
            list_page_res = self.fetch_list_page()
            topic_ids = self.parse_list_page(list_page_res.body)

            # Visit every topic page and download the images it contains.
            for topic_id in topic_ids:
                # Pause before each topic request (presumably to throttle
                # the crawl -- confirm before changing the delay).
                time.sleep(2)
                try:
                    # Skip topics that are already stored.  A failing lookup
                    # is treated as "not stored yet"; only Exception is
                    # caught so Ctrl-C / SystemExit still stop the crawl.
                    try:
                        topic_known = bool(Topic.get(id=topic_id))
                    except Exception:
                        topic_known = False
                    if topic_known:
                        continue

                    topic_page_res = self.fetch_topic_page(topic_id)
                    topic = Topic.create(id=topic_id,
                                         origin_content=topic_page_res.body)

                    img_urls = self.parse_topic_page(topic_page_res.body)

                    for img_url in img_urls:
                        time.sleep(2)

                        # Same "already stored" rule as for topics.
                        try:
                            img_known = bool(Image.get(origin_url=img_url))
                        except Exception:
                            img_known = False
                        if img_known:
                            continue

                        img = self.fetch_img(img_url)

                        imgs_dir = "%s/imgs/%s" % (config["imgs_path"],
                                                   self.imgs_dir_name)
                        # makedirs also creates a missing "<imgs_path>/imgs"
                        # parent, which a plain mkdir would fail on.
                        if not os.path.isdir(imgs_dir):
                            os.makedirs(imgs_dir)

                        # The current timestamp is used as the file name.
                        file_name = "%s/%s.jpg" % (imgs_dir, time.time())

                        # Image data is binary: write it in "wb" mode, and
                        # let the with-block close the handle on errors.
                        with open(file_name, "wb") as fp:
                            fp.write(img.body)

                        # Record the image only after the file is on disk;
                        # otherwise a failed write would leave a row that
                        # makes later runs skip this image for good.
                        Image.create(origin_url=img_url, file_name=file_name,
                                     topic=topic)

                except Exception as e:
                    # Best effort: report the failure and move on to the
                    # next topic.  The parenthesised form prints the same
                    # text on Python 2 and is also valid on Python 3.
                    print(e)
        finally:
            self.http_client.close()