def crawl(self):
    """Crawl the detail page for one album id and export a VideoSourceModel.

    self.key is the source-site album id; self.data carries list-page fields
    (title, image, channel, category, source).
    """
    videos = []
    mid = self.key
    url = DETAIL % mid
    detail = loadurl(url)
    description = detail.get('plots')
    # Collapse all internal whitespace in the synopsis.
    description = ''.join(description.split())
    # NOTE(review): this literal looks like mojibake (GBK bytes re-decoded as
    # UTF-8) of u'\u7535\u5f71' ("movie") -- kept byte-identical on purpose,
    # since it must match whatever encoding the upstream data uses.
    if self.data.get('channel') == u'鐢靛奖':
        # Movie branch: a single playable file under pinfos/mpurls.
        dict_ = detail['pinfos']['mpurls']
        video = VideoItemModel({
            "title": self.data.get('title'),
            "url": MOVIE_PLAY % mid,  # web page address
            "image": self.data.get('image'),
            "description": description,
            "stream": [{
                'url': dict_['tv'].get('url'),
                'size': dict_['tv'].get('bits'),
                'format': 'mp4'
            }]
        })
        videos.append(video)
    else:
        # Series branch: episodes live under content[sort]/fsps; some albums
        # have a flat 'fsps' list instead.
        try:
            sort = detail['pinfos'].get('sort')[0]
            episodes = detail['pinfos']['content'][sort]['fsps']
        except (KeyError, TypeError, IndexError):
            # BUG FIX: was a bare except; only the expected lookup failures
            # (missing key, 'sort' is None, empty 'sort' list) should trigger
            # the flat-layout fallback.
            episodes = detail['pinfos']['fsps']
        for episode in episodes:
            plots = episode.get('plots')
            plots = ''.join(plots.split())
            video = VideoItemModel({
                "title": episode.get('taskname'),
                "url": PLAY_URL % (mid, episode.get('number')),  # web page address
                "image": episode.get('picurl'),
                "description": plots,
                "stream": getstream(episode.get('mpurls'))
            })
            videos.append(video)
    model = VideoSourceModel({
        "source": self.data.get('source'),
        "source_id": mid,               # source-site album id
        "title": self.data["title"],
        "url": detail.get('shareurl'),  # detail page address
        "image": self.data.get('image'),
        "categories": self.data.get('category'),
        "channel": self.data.get('channel'),
        "region": detail.get('country'),
        "videos": videos,
        "pubtime": parse_date(detail.get('rinfo').split(' ')[0]),  # release date
        "actors": detail.get('lactor'),
        "directors": detail.get('director'),
        "description": description,
    })
    # Export the assembled model and record the album id it mapped to.
    export(model)
    self.data['to_album_id'] = model['to_album_id']
def crawl(self):
    """Build a single-video album from the list-page payload and export it."""
    data = self.data
    duration = gettime(data.get('videoLength', '00:00'))

    def _stream(url_key, size_key):
        # One stream entry; every quality tier shares format and duration.
        return [{
            "url": data.get(url_key),
            "size": data.get(size_key),
            "format": "mp4",
            "duration": duration,
        }]

    item = VideoItemModel({
        "title": data.get('title'),
        "url": data.get('videoURLMid'),  # page address
        "image": data.get('imgURL'),
        "description": data.get('desc'),
        "stream": _stream('videoURLMid', 'videoSizeMid'),
        "stream_low": _stream('videoURLLow', 'videoSizeLow'),
        "stream_high": _stream('videoURLHigh', 'videoSizeHigh'),
    })

    model = VideoSourceModel({
        "source": data.get('source'),
        "source_id": data.get('id'),   # source-site id
        "title": data.get("title"),
        "url": data.get('shareurl'),   # detail page address
        "image": data.get('imgURL'),
        "channel": CHANNEL,
        "videos": [item],
        "pubtime": parse_date(data.get('videoPublishTime')),
        "description": data.get('desc'),
    })
    # Export and remember which album the record landed in.
    export(model)
    self.data['to_album_id'] = model['to_album_id']
def crawl(self):
    """Crawl a zyqvod album page, parse qvod links and metadata, export."""
    album_url = "http://zyqvod.com/vod/index.asp?id=%s" % self.key
    hxs = load_html(album_url)
    urls = hxs.select("//div[@class='movievod']/li/input/@value").extract()
    videos = []
    for url in urls:
        m = re.match("qvod://(.+)", url)
        if not m:
            continue
        # qvod URI payload: size|md5|filename|...
        words = m.group(1).split("|")
        size = int(words[0])
        title = words[2].split(".")[0]  # filename without extension
        videos.append(VideoItemModel({
            "title": title,
            "url": url,
            "stream": [{"url": url, "format": "qvod", "size": size}],
        }))
    # Metadata rows -> {label: value}
    kv = {}
    for s in hxs.select("//div[@class='videoDetail']/p"):
        texts = s.select(".//text()").extract()
        if len(texts) >= 2:
            kv[texts[0].strip()] = texts[1].strip()
    description = "\n".join(hxs.select("//div[@class='movievod']/p[2]/text()").extract())
    try:
        image = hxs.select("//div[@class='videoPic']/img/@src").extract()[0]
    except IndexError:
        # BUG FIX: was a bare except; only a missing <img> (empty extract
        # result) should fall back to None.
        image = None
    model = VideoSourceModel({
        "source": self.data['source'],
        "source_id": self.key,
        "title": self.data["title"],
        "time": self.data.get('time'),
        "url": album_url,
        "image": image,
        "categories": [self.data.get('category')],
        "channel": self.data.get('category'),
        "region": self.data.get('region'),
        "videos": videos,
        "actors": split(kv.get(u'影片主演:')),
        "directors": split(kv.get(u'影片导演:')),
        "pubtime": parse_date(kv.get(u'上映年份:')),
        "description": description,
        # BUG FIX: the original dict literal listed "completed" twice
        # (self.data.get('completed') and this expression); the second
        # always won, so the dead first entry was removed. Completed means
        # no "serializing" status row was present.
        "completed": not kv.get(u'连载状态:'),
    })
    export(model)
def crawl(self):
    """Crawl a hakuzy detail page, parse qvod episode links and metadata, export."""
    album_url = "http://hakuzy.com/detail/?%s.html" % self.key
    hxs = load_html(album_url)
    urls = hxs.select("//td[@class='bt']/.//input[@id='copy_yah']/@value").extract()
    videos = []
    for url in urls:
        m = re.match("qvod://(.+)", url)
        if not m:
            continue
        # qvod URI payload: size|md5|filename|...
        words = m.group(1).split("|")
        size = int(words[0])
        title = words[2].split(".")[0]  # filename without extension
        videos.append(VideoItemModel({
            "title": title,
            "url": url,
            "stream": [{"url": url, "format": "qvod", "size": size}],
        }))
    # Metadata table rows -> {label: value}
    kv = {}
    for s in hxs.select("/html/body/table[4]/tbody/tr[1]/td[2]/table/tbody/tr"):
        texts = s.select(".//text()").extract()
        if len(texts) >= 2:
            kv[texts[0].strip()] = texts[1].strip()
    description = "\n".join(hxs.select("//div[@class='intro']/.//text()").extract())
    try:
        image = hxs.select("//div[@class='img']/img/@src").extract()[0]
    except IndexError:
        # BUG FIX: was a bare except; only a missing <img> (empty extract
        # result) should fall back to None.
        image = None
    model = VideoSourceModel({
        "source": SOURCE,
        "source_id": self.key,
        "title": self.data["title"],
        "time": self.data.get('time'),
        "url": album_url,
        "image": image,
        "categories": [self.data.get('category')],
        "channel": self.data.get('category'),
        "region": self.data.get('region'),
        "videos": videos,
        "pubtime": parse_date(kv.get(u"上映日期:")),
        "actors": split(kv.get(u"影片演员:")),
        "directors": split(kv.get(u"影片导演:")),
        # Completed unless the status field mentions "serializing".
        "completed": kv.get(u"影片状态:", "").find(u"连载") == -1,
        "description": description,
    })
    export(model)
def import_douban(): from contentservice.utils.datetimeutil import parse_date from contentservice.settings import MONGO_CONN_STR db = MongoClient(MONGO_CONN_STR).douban pdb.set_trace() def clean_title(title): zhPattern = re.compile(u'[\u4e00-\u9fa5]+') if zhPattern.search(title): return title.split(" ")[0] else: return title for item in db.album.find(): pubtime = None if item['pub_time']: pubtime = parse_date(re.sub("\(.*\)", "", item['pub_time'][0])) model = VideoSourceModel({ "title": clean_title(item['title']), "categories": item['sub_category'], "image": item["img"], "related": item["related"], "score": item["score"], "actors": item["actors"], "region": item["area"][0] if item["area"] else None, "url": item["url"], "description": item["description"], "pubtime": pubtime, "channel": u"电影", "source": "douban", "source_id": re.findall("/(\d+)/", item['url'])[0], }) model.on_import() print model['title']
def import_mtime(): from datetime import datetime from contentservice.models.video import VideoSourceModel from contentservice.settings import MONGO_CONN_STR from contentservice.utils.datetimeutil import parse_date db = MongoClient(MONGO_CONN_STR).mtime pdb.set_trace() for item in db.album.find(): area = item["area"] if item.get("area") else None if isinstance(area, list): area = area[0] categories = item.get("type") if isinstance(categories, basestring): categories = [categories] description = item.get("description") if isinstance(description, list): description = "\n".join(description) tags = item.get("tags") if isinstance(tags, basestring): tags = [tags] channel = "" if item.get("category_id") == "1": channel = u"电影" elif item.get("category_id") == "0": channel = u"电视剧" try: model = VideoSourceModel({ "source" : "mtime", "source_id" : item["id"], "title" : item["title"], "description" : description, "tags" : tags, "time" : datetime.strptime(item["create_time"], "%Y-%m-%d %H:%M:%S"), "duration" : item.get("duration"), "region" : area, "directors" : item.get("directors"), "score" : item.get("score"), "actors" : item.get("actors"), "categories" : categories, "channel" : channel, "url" : "http://movie.mtime.com/%s/" % item["id"], "pubtime" : parse_date(item["release_time"]), }) model.on_import() except Exception, e: print e print model["title"]
def crawl(self):
    """Export a single-video album built from the list-page payload."""
    duration = gettime(self.data.get('videoLength', '00:00'))
    # (field-name suffix, model key) for the three quality tiers.
    tiers = (('Mid', 'stream'), ('Low', 'stream_low'), ('High', 'stream_high'))
    fields = {
        "title": self.data.get('title'),
        "url": self.data.get('videoURLMid'),  # page address
        "image": self.data.get('imgURL'),
        "description": self.data.get('desc'),
    }
    for suffix, key in tiers:
        fields[key] = [{
            "url": self.data.get('videoURL' + suffix),
            "size": self.data.get('videoSize' + suffix),
            "format": "mp4",
            "duration": duration,
        }]
    videos = [VideoItemModel(fields)]
    model = VideoSourceModel({
        "source": self.data.get('source'),
        "source_id": self.data.get('id'),  # source-site id
        "title": self.data.get("title"),
        "url": self.data.get('shareurl'),  # detail page address
        "image": self.data.get('imgURL'),
        "channel": CHANNEL,
        "videos": videos,
        "pubtime": parse_date(self.data.get('videoPublishTime')),
        "description": self.data.get('desc'),
    })
    # Export and remember which album the record landed in.
    export(model)
    self.data['to_album_id'] = model['to_album_id']
def import_douban(): from contentservice.utils.datetimeutil import parse_date from contentservice.settings import MONGO_CONN_STR db = MongoClient(MONGO_CONN_STR).douban pdb.set_trace() def clean_title(title): zhPattern = re.compile(u'[\u4e00-\u9fa5]+') if zhPattern.search(title): return title.split(" ")[0] else: return title for item in db.album.find(): pubtime = None if item['pub_time']: pubtime = parse_date(re.sub("\(.*\)", "", item['pub_time'][0])) model = VideoSourceModel({ "title" : clean_title(item['title']), "categories" : item['sub_category'], "image" : item["img"], "related" : item["related"], "score" : item["score"], "actors" : item["actors"], "region" : item["area"][0] if item["area"] else None, "url" : item["url"], "description" : item["description"], "pubtime" : pubtime, "channel" : u"电影", "source" : "douban", "source_id" : re.findall("/(\d+)/", item['url'])[0], }) model.on_import() print model['title']
def crawl(self):
    """Crawl a 265zy detail page, parse qvod episode links and metadata, export."""
    album_url = "http://www.265zy.com/detail/?%s.html" % self.key
    hxs = load_html(album_url)
    urls = hxs.select("//td[@class='bt']/.//input[@id='copy_yah']/@value").extract()
    videos = []
    for url in urls:
        m = re.match("qvod://(.+)", url)
        if not m:
            continue
        # qvod URI payload: size|md5|filename|...
        words = m.group(1).split("|")
        size = int(words[0])
        title = words[2].split(".")[0]  # filename without extension
        videos.append(VideoItemModel({
            "title": title,
            "url": url,
            "stream": [{"url": url, "format": "qvod", "size": size}],
        }))
    # Metadata table rows -> {label: value}
    kv = {}
    for s in hxs.select("/html/body/table[2]/tr[1]/td[2]/table/tr"):
        texts = s.select(".//text()").extract()
        if len(texts) >= 2:
            kv[texts[0].strip()] = texts[1].strip()
    description = "\n".join(hxs.select("//div[@class='intro']/.//text()").extract())
    try:
        # Image src may be relative; resolve against the site root.
        image = urlparse.urljoin(
            "http://www.265zy.com/",
            hxs.select("//div[@class='img']/img/@src").extract()[0])
    except IndexError:
        # BUG FIX: was a bare except; only a missing <img> (empty extract
        # result) should fall back to None.
        image = None
    model = VideoSourceModel({
        "source": self.data['source'],
        "source_id": self.key,
        "title": self.data["title"],
        "image": image,
        "url": album_url,
        "time": self.data.get('time'),
        "categories": [self.data.get('category')],
        "channel": self.data.get('category'),
        "region": self.data.get('region'),
        "videos": videos,
        "actors": split(kv.get(u"影片演员:")),
        "pubtime": parse_date(kv.get(u"上映日期:")),
        # Completed unless the status field mentions "serializing".
        "completed": kv.get(u"影片状态:", "").find(u"连载") == -1,
        "description": description,
    })
    export(model)
def import_mtime(): from datetime import datetime from contentservice.models.video import VideoSourceModel from contentservice.settings import MONGO_CONN_STR from contentservice.utils.datetimeutil import parse_date db = MongoClient(MONGO_CONN_STR).mtime pdb.set_trace() for item in db.album.find(): area = item["area"] if item.get("area") else None if isinstance(area, list): area = area[0] categories = item.get("type") if isinstance(categories, basestring): categories = [categories] description = item.get("description") if isinstance(description, list): description = "\n".join(description) tags = item.get("tags") if isinstance(tags, basestring): tags = [tags] channel = "" if item.get("category_id") == "1": channel = u"电影" elif item.get("category_id") == "0": channel = u"电视剧" try: model = VideoSourceModel({ "source": "mtime", "source_id": item["id"], "title": item["title"], "description": description, "tags": tags, "time": datetime.strptime(item["create_time"], "%Y-%m-%d %H:%M:%S"), "duration": item.get("duration"), "region": area, "directors": item.get("directors"), "score": item.get("score"), "actors": item.get("actors"), "categories": categories, "channel": channel, "url": "http://movie.mtime.com/%s/" % item["id"], "pubtime": parse_date(item["release_time"]), }) model.on_import() except Exception, e: print e print model["title"]
def crawl(self):
    """Crawl a bdzy detail page (self.key is the album's source-site id)."""
    album_id = self.key
    album_url = "http://bdzy.cc/detail/?%s.html" % album_id
    hxs = load_html(album_url)
    urls = hxs.select("//td[@class='bt']/.//li/input/@value").extract()
    videos = []
    for url in urls:
        m = re.match("bdhd://(.+)", url)
        if not m:
            continue
        # bdhd URI payload: size|md5|filename|...
        words = m.group(1).split("|")
        size = int(words[0])
        title = words[2].split(".")[0]  # filename without extension
        # One episode; there is no separate page url, so the play url
        # doubles as the page address.
        video = VideoItemModel({
            "title": title,
            "url": url,
            "stream": [{
                "url": url,        # playable file address
                "size": size,
                "format": "bdhd"   # stream format (protocol)
            }],
        })
        videos.append(video)
    # Metadata table rows -> {label: value}
    kv = {}
    for s in hxs.select("/html/body/table[2]/tr[1]/td[2]/table/tr"):
        texts = s.select(".//text()").extract()
        if len(texts) >= 2:
            kv[texts[0].strip()] = texts[1].strip()
    description = "\n".join(hxs.select("//div[@class='intro']/.//text()").extract())
    try:
        image = hxs.select("/html/body/table[2]/tr[1]/td[1]/img/@src").extract()[0]
    except IndexError:
        # BUG FIX: was a bare except; only a missing image (empty extract
        # result) should fall back to None.
        image = None
    # Album-level export model.
    model = VideoSourceModel({
        "source": self.data['source'],             # source site
        "source_id": album_id,                     # source-site id
        "title": self.data["title"],
        "url": album_url,                          # page address
        "image": image,                            # image url
        "time": self.data.get('time'),             # source-site update time
        "categories": [self.data.get('category')],
        "channel": self.data.get('category'),
        "region": self.data.get('region'),
        "videos": videos,                          # episode list
        "pubtime": parse_date(kv.get(u"上映日期:")),  # release date
        "actors": split(kv.get(u"影片演员:")),
        # Completed unless the status field mentions "serializing".
        "completed": kv.get(u"影片状态:", "").find(u"连载") == -1,
        "description": description,
    })
    export(model)