def start_flow(self):
    """Run one complete crawl pass: list page -> topic pages -> images.

    Fetches the list page, parses topic ids out of it, and for every topic
    for which ``Topic.get`` does not return a row: fetches the topic page,
    stores it via ``Topic.create``, then downloads every image URL parsed
    from that page for which ``Image.get`` does not return a row. Each
    image is written to ``<imgs_path>/imgs/<imgs_dir_name>/<time>.jpg``
    and then recorded via ``Image.create``.

    An exception raised while handling one topic (including any of its
    images) is printed and the loop continues with the next topic.
    Exceptions from fetching or parsing the list page propagate to the
    caller. ``self.http_client`` is closed in every case.
    """
    try:
        list_page_res = self.fetch_list_page()
        topic_ids = self.parse_list_page(list_page_res.body)

        # Walk every topic and fetch the img elements found on its page.
        for topic_id in topic_ids:
            time.sleep(2)
            try:
                # Skip topics that are already stored. A failing lookup is
                # treated as "not stored yet" (presumably the model raises
                # when no row matches -- confirm against the ORM in use).
                try:
                    if Topic.get(id=topic_id):
                        continue
                except Exception:
                    pass

                topic_page_res = self.fetch_topic_page(topic_id)
                topic = Topic.create(id=topic_id,
                                     origin_content=topic_page_res.body)
                img_urls = self.parse_topic_page(topic_page_res.body)

                for img_url in img_urls:
                    time.sleep(2)
                    # Skip images that are already stored; same lookup
                    # convention as for topics above.
                    try:
                        if Image.get(origin_url=img_url):
                            continue
                    except Exception:
                        pass

                    img = self.fetch_img(img_url)

                    img_dir = "%s/imgs/%s" % (config["imgs_path"],
                                              self.imgs_dir_name)
                    if not os.path.exists(img_dir):
                        # makedirs also creates the intermediate "imgs"
                        # directory when it is missing.
                        os.makedirs(img_dir)
                    file_name = "%s/%s.jpg" % (img_dir, time.time())

                    # Image payload is binary, so write in "wb"; the
                    # context manager closes the file even if write fails.
                    with open(file_name, "wb") as fp:
                        fp.write(img.body)

                    # Record the image only after the file is on disk, so a
                    # failed write does not leave a row that would make the
                    # existence check above skip this URL forever.
                    Image.create(origin_url=img_url, file_name=file_name,
                                 topic=topic)
            except Exception as e:
                # Best effort per topic: report and move on to the next.
                print(e)
    finally:
        self.http_client.close()
def main():
    """Parse the command line and run the sub-command in ``options.cmd``.

    Commands:
      start_web    -- build the tornado application from ``d.web.urls`` and
                      serve it on port 8888 until the IOLoop stops.
      start_worker -- loop forever; on each cycle sleep 2 seconds, then run
                      ``DBGroupWorker(group).start_flow()`` for every group
                      in the hard-coded group list.
      init_db      -- create the ``Topic`` and ``Image`` tables.
      migre_db     -- placeholder, does nothing.

    Any other value of ``options.cmd`` falls through without action.
    """
    tornado.options.parse_command_line()

    if options.cmd == "start_web":
        from d.web.urls import urls

        logger.info("web start")
        application = tornado.web.Application(
            urls, debug=options.debug, template_path="d/web/tpl")
        application.listen(8888)
        tornado.ioloop.IOLoop.instance().start()

    elif options.cmd == "start_worker":
        from d.worker.worker import DBGroupWorker

        group_list = ["haixiuzu", "meituikong", "miniskirtlegs", "515085",
                      "516876", "103485", "510760"]
        while True:
            time.sleep(2)
            for group_name in group_list:
                w = DBGroupWorker(group_name)
                w.start_flow()

    elif options.cmd == "init_db":
        from d.models.image import Image
        from d.models.topic import Topic

        Topic.create_table()
        Image.create_table()
        print("create_table ok")

    # Was ``option.cmd`` (undefined name), which raised NameError whenever
    # none of the branches above matched.
    elif options.cmd == "migre_db":
        pass