示例#1
0
文件: base.py 项目: judasnow/ex
    def start_flow(self):
        """Run one complete crawl pass: list page -> topics -> images.

        Fetches the list page, extracts the topic ids from it and, for every
        topic that is not already stored, saves the topic page and downloads
        each image it references that is not already stored.

        An error raised while handling a single topic is printed and the
        loop moves on to the next topic.  Errors raised while fetching or
        parsing the list page propagate to the caller.  In every case
        ``self.http_client`` is closed before this method returns or raises.
        """
        try:
            list_page_res = self.fetch_list_page()
            topic_ids = self.parse_list_page(list_page_res.body)

            # Walk every topic page and fetch the images it references.
            for topic_id in topic_ids:
                # Wait before each topic (presumably to throttle requests).
                time.sleep(2)
                try:
                    try:
                        # Skip topics that have already been stored.
                        if Topic.get(id=topic_id):
                            continue
                    except Exception:
                        # Best-effort lookup: a failing lookup (presumably
                        # "row not found" -- confirm against the model
                        # layer) means the topic still has to be fetched.
                        # ``Exception`` rather than a bare ``except`` so
                        # Ctrl-C / SystemExit can still stop the worker.
                        pass

                    topic_page_res = self.fetch_topic_page(topic_id)
                    topic = Topic.create(id=topic_id,
                                         origin_content=topic_page_res.body)

                    for img_url in self.parse_topic_page(topic_page_res.body):
                        time.sleep(2)

                        try:
                            # Skip images that have already been stored.
                            if Image.get(origin_url=img_url):
                                continue
                        except Exception:
                            # Same best-effort lookup as for topics above.
                            pass

                        img = self.fetch_img(img_url)

                        img_dir = "%s/imgs/%s" % (config["imgs_path"],
                                                  self.imgs_dir_name)
                        if not os.path.exists(img_dir):
                            # makedirs also creates a missing "imgs" parent.
                            os.makedirs(img_dir)

                        file_name = "%s/%s.jpg" % (img_dir, time.time())

                        # Binary mode: the body is raw image data.  ``with``
                        # closes the handle even if the write fails.
                        with open(file_name, "wb") as fp:
                            fp.write(img.body)

                        # Record the image only after the file is on disk, so
                        # a failed write does not leave a row that would make
                        # later runs skip this URL.
                        Image.create(origin_url=img_url, file_name=file_name,
                                     topic=topic)

                except Exception as e:
                    # One broken topic must not abort the whole pass.
                    print(e)
        finally:
            self.http_client.close()
示例#2
0
文件: main.py 项目: judasnow/ex
def main():
    """Parse the command line and run the action selected by ``options.cmd``.

    Supported commands:

    * ``start_web``    -- build the tornado application from ``d.web.urls``,
      listen on port 8888 and start the IO loop.
    * ``start_worker`` -- loop forever over a fixed list of group names,
      running ``DBGroupWorker(group_name).start_flow()`` for each group and
      sleeping 2 seconds before every round.
    * ``init_db``      -- create the ``Topic`` and ``Image`` tables.
    * ``migre_db``     -- placeholder, does nothing.

    Any other value is ignored.
    """
    tornado.options.parse_command_line()

    if options.cmd == "start_web":
        from d.web.urls import urls

        logger.info("web start")

        application = tornado.web.Application(urls,
                                              debug=options.debug,
                                              template_path="d/web/tpl")
        application.listen(8888)
        tornado.ioloop.IOLoop.instance().start()

    elif options.cmd == "start_worker":
        from d.worker.worker import DBGroupWorker

        group_list = ["haixiuzu", "meituikong", "miniskirtlegs", "515085", "516876", "103485", "510760"]

        while True:
            time.sleep(2)
            for group_name in group_list:
                # A fresh worker per group; start_flow runs one full pass.
                w = DBGroupWorker(group_name)
                w.start_flow()

    elif options.cmd == "init_db":
        from d.models.image import Image
        from d.models.topic import Topic

        Topic.create_table()
        Image.create_table()

        # Single-argument call form prints the same text on Python 2 and 3.
        print("create_table ok")

    # Was ``option.cmd``: an undefined name that raised NameError for every
    # command not matched by the branches above.
    elif options.cmd == "migre_db":
        pass