def __init__(self, conf, debug=False):
    """Wire up URL routes, Tornado settings, models and services.

    :param conf: configuration object exposing ``get(key)`` lookups
                 (db.uri, wechat.token, wechat.appId, wechat.appSecret)
    :param debug: enables Tornado debug/autoreload mode
    """
    # Raw strings for the regex route patterns: '\S' is an invalid escape
    # sequence in a plain literal (the raw form is identical at runtime).
    handlers = [
        ('/', IndexHandler),
        ('/admin/drama/add', AdminDramaAddHandler),
        ('/admin/drama/list', AdminDramaListHandler),
        ('/admin/drama/search', AdminDramaSearchHandler),
        ('/admin/drama/parser', AdminDramaParserHandler),
        ('/api/drama/list', ApiDramaListHandler),
        ('/api/drama/search', ApiDramaSearchHandler),
        (r'/drama/episode/play/(\S+)', DramaEpisodePlayHandler),
        (r'/drama/episode/(\S+)', DramaEpisodeHandler),
        ('/weixin', WeixinHandler),
    ]
    # NOTE(review): autoescape=None disables template auto-escaping globally;
    # confirm templates never render untrusted input.
    settings = dict(
        template_path=os.path.join(os.path.dirname(__file__), "./web/template"),
        static_path=os.path.join(os.path.dirname(__file__), "./web/static"),
        debug=debug,
        autoescape=None)
    self.conf = conf
    # Bind every model class to the shared MySQL engine before instancing.
    engine = MysqlEngine(conf.get('db.uri'))
    BaseModel.setup_all_model(engine)
    self.dramaModel = DramaModel.instance()
    self.episodeModel = DramaEpisodeModel.instance()
    self.dramaService = DramaService()
    self.wechat = WechatBasic(token=conf.get("wechat.token"),
                              appid=conf.get("wechat.appId"),
                              appsecret=conf.get("wechat.appSecret"))
    self.hashid = Hashids(salt="woshifyz")
    # Per-site video parsers, keyed by source name.
    self.parser = {'tudou': TudouParser()}
    super(Application, self).__init__(handlers, **settings)
class DramaEpisodeSource(BaseParser): def __init__(self): self.tudouParser = TudouParser() self.model = DramaEpisodeModel() self.urlModel = UrlContentModel() def fetch(self, d_id, url, last_ep=0): text_content = self.get_decoded_html(url) content = pq(text_content) trs = content(".abc dt") eps = self.model.get_by_drama_id(d_id) if eps: last_ep = eps[-1]['episode'] i = 1 for tr in trs: tr = pq(tr) ep_url = HOST + tr("a").attr("href") count = tr("a").text() ep_group = re.search(r"(\d+)", count) if ep_group is not None: ep = int(ep_group.group(1)) else: ep = i print ep_url, ep if ep <= last_ep: print "fetch before, skip %s" % ep continue res = self.fetch_ep_page(d_id, ep_url, ep) if res == False: break i += 1 def fetch_ep_page(self, d_id, url, ep): text_content = self.get_decoded_html(url) if text_content.find("http://www.tudou.com/programs/view") < 0: print "not tudou", d_id, url, ep return False content = pq(text_content) tudou_source = content("#ads iframe").attr("src") vid = re.search(r"code=(\S+?)&", tudou_source).group(1) source = "http://www.tudou.com/programs/view/" + vid url, hd_url = self.tudouParser.parse(vid) if url and hd_url: self.model.insert(d_id, ep, 0, source, url, hd_url) def fetch_by_ep_id(self, d_id, vid, cur, last_ep=0): for i in range(1, cur + 1): if i <= last_ep: continue url = "http://www.hanjucc.com/hanju/%s/%s.html" % (vid, i) self.fetch_ep_page(d_id, url, i)
# NOTE(review): this class duplicates the earlier DramaEpisodeSource
# definition in this file (modulo formatting); at import time the later
# definition wins.  Consider removing one copy.
class DramaEpisodeSource(BaseParser):
    """Scrapes episode listing pages and stores Tudou-backed play URLs."""

    def __init__(self):
        self.tudouParser = TudouParser()  # resolves a Tudou vid to stream URLs
        self.model = DramaEpisodeModel()
        self.urlModel = UrlContentModel()

    def fetch(self, d_id, url, last_ep=0):
        """Fetch every episode newer than `last_ep` from a listing page."""
        # Download and parse the listing page.
        text_content = self.get_decoded_html(url); content = pq(text_content)
        trs = content(".abc dt")
        eps = self.model.get_by_drama_id(d_id)
        if eps:
            # Resume after the newest episode already stored for this drama,
            # overriding the caller-supplied last_ep.
            last_ep = eps[-1]['episode']
        i = 1
        for tr in trs:
            tr = pq(tr)
            ep_url = HOST + tr("a").attr("href")
            count = tr("a").text()
            # Episode number comes from digits in the link text; fall back
            # to the positional index when none are found.
            ep_group = re.search(r"(\d+)", count)
            if ep_group is not None:
                ep = int(ep_group.group(1))
            else:
                ep = i
            print ep_url, ep
            if ep <= last_ep:
                print "fetch before, skip %s" % ep
                continue
            res = self.fetch_ep_page(d_id, ep_url, ep)
            # fetch_ep_page returns False for non-Tudou pages; stop there.
            if res == False:
                break
            i += 1

    def fetch_ep_page(self, d_id, url, ep):
        """Store one episode's Tudou source; returns False if not Tudou-backed."""
        text_content = self.get_decoded_html(url)
        if text_content.find("http://www.tudou.com/programs/view") < 0:
            print "not tudou", d_id, url, ep
            return False
        content = pq(text_content)
        tudou_source = content("#ads iframe").attr("src")
        # NOTE(review): attr() may return None and re.search may fail to
        # match, which would raise TypeError/AttributeError here -- verify
        # pages always embed a "code=...&" Tudou iframe.
        vid = re.search(r"code=(\S+?)&", tudou_source).group(1)
        source = "http://www.tudou.com/programs/view/" + vid
        # `url` is rebound from the parameter to the parsed stream URL.
        url, hd_url = self.tudouParser.parse(vid)
        if url and hd_url:
            self.model.insert(d_id, ep, 0, source, url, hd_url)

    def fetch_by_ep_id(self, d_id, vid, cur, last_ep=0):
        """Fetch episodes 1..cur by site vid, skipping those <= last_ep."""
        for i in range(1, cur + 1):
            if i <= last_ep:
                continue
            url = "http://www.hanjucc.com/hanju/%s/%s.html" % (vid, i)
            self.fetch_ep_page(d_id, url, i)
def __init__(self):
    """Set up the persistence models and the Tudou parser this source uses."""
    self.urlModel = UrlContentModel()
    self.model = DramaEpisodeModel()
    self.tudouParser = TudouParser()