def import_all_videos():
    from pymongo import MongoClient
    from contentservice.models.video import VideoSourceModel
    from contentservice.settings import MONGO_CONN_STR

    db = MongoClient(MONGO_CONN_STR).content_video
    for item in db.video.source.find(sort=[("updated", 1)]):
        model = VideoSourceModel(item)
        model.on_import()
        print model["title"]
def remerge_video(request):
    source_id = request.GET.get('sid', None)
    album_id = request.GET.get('aid', None)
    vs = VideoSourceModel().find_one(query={'_id': source_id})
    va = VideoAlbumModel().find_one(query={'_id': album_id})
    if vs and va:
        vs.on_import(to_album_id=album_id)
        return {'status': 1}
    else:
        return {'status': -1}
def import_douban():
    import re
    from pymongo import MongoClient
    from contentservice.models.video import VideoSourceModel
    from contentservice.utils.datetimeutil import parse_date
    from contentservice.settings import MONGO_CONN_STR

    db = MongoClient(MONGO_CONN_STR).douban

    def clean_title(title):
        # douban titles often look like u"中文名 English Name"; keep only the
        # first token when the title contains CJK characters.
        zh_pattern = re.compile(u'[\u4e00-\u9fa5]+')
        if zh_pattern.search(title):
            return title.split(" ")[0]
        return title

    for item in db.album.find():
        pubtime = None
        if item['pub_time']:
            pubtime = parse_date(re.sub(r"\(.*\)", "", item['pub_time'][0]))
        model = VideoSourceModel({
            "title": clean_title(item['title']),
            "categories": item['sub_category'],
            "image": item["img"],
            "related": item["related"],
            "score": item["score"],
            "actors": item["actors"],
            "region": item["area"][0] if item["area"] else None,
            "url": item["url"],
            "description": item["description"],
            "pubtime": pubtime,
            "channel": u"电影",
            "source": "douban",
            "source_id": re.findall(r"/(\d+)/", item['url'])[0],
        })
        model.on_import()
        print model['title']
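# A minimal standalone sketch of the clean_title logic above, with a
# doctest-style illustration (the sample titles are made up for this example):
def _clean_title_example():
    import re
    zh_pattern = re.compile(u'[\u4e00-\u9fa5]+')

    def clean_title(title):
        # Titles containing CJK characters keep only the first token.
        return title.split(" ")[0] if zh_pattern.search(title) else title

    assert clean_title(u"盗梦空间 Inception") == u"盗梦空间"
    assert clean_title(u"Inception") == u"Inception"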
def import_mtime():
    from datetime import datetime
    from pymongo import MongoClient
    from contentservice.models.video import VideoSourceModel
    from contentservice.settings import MONGO_CONN_STR
    from contentservice.utils.datetimeutil import parse_date

    db = MongoClient(MONGO_CONN_STR).mtime
    for item in db.album.find():
        area = item["area"] if item.get("area") else None
        if isinstance(area, list):
            area = area[0]
        categories = item.get("type")
        if isinstance(categories, basestring):
            categories = [categories]
        description = item.get("description")
        if isinstance(description, list):
            description = "\n".join(description)
        tags = item.get("tags")
        if isinstance(tags, basestring):
            tags = [tags]
        channel = ""
        if item.get("category_id") == "1":
            channel = u"电影"
        elif item.get("category_id") == "0":
            channel = u"电视剧"
        try:
            model = VideoSourceModel({
                "source": "mtime",
                "source_id": item["id"],
                "title": item["title"],
                "description": description,
                "tags": tags,
                "time": datetime.strptime(item["create_time"], "%Y-%m-%d %H:%M:%S"),
                "duration": item.get("duration"),
                "region": area,
                "directors": item.get("directors"),
                "score": item.get("score"),
                "actors": item.get("actors"),
                "categories": categories,
                "channel": channel,
                "url": "http://movie.mtime.com/%s/" % item["id"],
                "pubtime": parse_date(item["release_time"]),
            })
            model.on_import()
            # The original printed the title outside the try block, which
            # raises NameError when the very first item fails; print it here
            # instead, after a successful import.
            print model["title"]
        except Exception as e:
            print e
def crawl(self):
    album_id = self.key
    channel = self.data["channel"]
    detail = api_album(album_id) if album_id else None
    title = detail["tv_name"]
    directors = detail["director"].split(";")
    actors = detail["actor"].split(";")
    region = detail["area"]
    categories = detail["tv_cont_cats"].split(";")
    ver_image = detail["ver_high_pic"]
    hor_image = detail["hor_high_pic"]
    url = detail["s_url"]
    description = detail["tv_desc"]
    # Data model exported for the video
    model = VideoSourceModel({
        "source": self.data['source'],
        "source_id": album_id,
        "title": title,
        "url": url,
        "directors": directors,
        "actors": actors,
        "region": region,
        "categories": categories,
        "channel": channel,
        "description": description,
        "image": ver_image,
        "image2": hor_image,
    })
    # Export the data
    export(model)
    self.data['to_album_id'] = model['to_album_id']
    return
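# Every crawl() method in this file follows the same contract: build a
# VideoSourceModel, hand it to export(), then read back model['to_album_id'].
# export() itself is not defined here; a minimal sketch of what it is assumed
# to do (persist the record and run the import/merge step, which sets
# 'to_album_id' as a side effect) -- the body is an assumption, not the real
# implementation:
def export(model):
    # Hypothetical: model.save() stands in for whatever persistence the real
    # helper performs; on_import() is the merge step used by the import_*
    # functions above.
    model.save()
    model.on_import()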
def crawl(self):
    videos = []
    mid = self.key
    url = DETAIL % mid
    detail = loadurl(url)
    description = detail.get('plots')
    description = ''.join(description.split())
    # Note: the channel string and the comments in this function were
    # mis-encoded (UTF-8 read as GBK) in the original; both are repaired here.
    if self.data.get('channel') == u'电影':
        dict_ = detail['pinfos']['mpurls']
        video = VideoItemModel({
            "title": self.data.get('title'),
            "url": MOVIE_PLAY % mid,  # web page URL
            "image": self.data.get('image'),
            "description": description,
            "stream": [{
                'url': dict_['tv'].get('url'),
                'size': dict_['tv'].get('bits'),
                'format': 'mp4'
            }]
        })
        videos.append(video)
    else:
        try:
            sort = detail['pinfos'].get('sort')[0]
            episodes = detail['pinfos']['content'][sort]['fsps']
        except:
            episodes = detail['pinfos']['fsps']
        for episode in episodes:
            plots = episode.get('plots')
            plots = ''.join(plots.split())
            video = VideoItemModel({
                "title": episode.get('taskname'),
                "url": PLAY_URL % (mid, episode.get('number')),  # web page URL
                "image": episode.get('picurl'),
                "description": plots,
                "stream": getstream(episode.get('mpurls'))
            })
            videos.append(video)
    model = VideoSourceModel({
        "source": self.data.get('source'),
        "source_id": mid,  # source-site ID
        "title": self.data["title"],
        "url": detail.get('shareurl'),  # detail page URL
        "image": self.data.get('image'),  # image URL
        "categories": self.data.get('category'),  # categories
        "channel": self.data.get('channel'),  # channel
        "region": detail.get('country'),  # region
        "videos": videos,  # episode list
        "pubtime": parse_date(detail.get('rinfo').split(' ')[0]),  # release date
        "actors": detail.get('lactor'),
        "directors": detail.get('director'),
        "description": description,
    })
    # Export the data
    export(model)
    self.data['to_album_id'] = model['to_album_id']
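# getstream() is referenced above but not defined in this file. Based on how
# the movie branch builds its "stream" list from detail['pinfos']['mpurls'],
# a plausible sketch follows; the 'tv' key and the field names are assumptions
# carried over from that branch, not the real implementation:
def getstream(mpurls):
    # Hypothetical: map the per-episode mpurls dict onto the stream shape
    # used elsewhere in this file ([{url, size, format}, ...]).
    if not mpurls:
        return []
    entry = mpurls.get('tv', {})
    return [{
        'url': entry.get('url'),
        'size': entry.get('bits'),
        'format': 'mp4',
    }]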
def crawl(self):
    timestr = self.data.get('videoLength', '00:00')
    duration = gettime(timestr)
    videos = []
    video = VideoItemModel({
        "title": self.data.get('title'),
        "url": self.data.get('videoURLMid'),  # web page URL
        "image": self.data.get('imgURL'),
        "description": self.data.get('desc'),
        "stream": [{
            "url": self.data.get('videoURLMid'),  # playable video file URL
            "size": self.data.get('videoSizeMid'),
            "format": "mp4",  # video format (protocol)
            "duration": duration
        }],
        "stream_low": [{
            "url": self.data.get('videoURLLow'),
            "size": self.data.get('videoSizeLow'),
            "format": "mp4",
            "duration": duration
        }],
        "stream_high": [{
            "url": self.data.get('videoURLHigh'),
            "size": self.data.get('videoSizeHigh'),
            "format": "mp4",
            "duration": duration
        }]
    })
    videos.append(video)
    model = VideoSourceModel({
        "source": self.data.get('source'),
        "source_id": self.data.get('id'),  # source-site ID
        "title": self.data.get("title"),
        "url": self.data.get('shareurl'),  # detail page URL
        "image": self.data.get('imgURL'),  # image URL
        "channel": CHANNEL,  # channel
        "videos": videos,  # episode list
        "pubtime": parse_date(self.data.get('videoPublishTime')),  # release date
        "description": self.data.get('desc'),
    })
    # Export the data
    export(model)
    self.data['to_album_id'] = model['to_album_id']
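# gettime() is not defined in this file. Given the '00:00' default above, it
# presumably converts a "MM:SS" (or "HH:MM:SS") string into seconds; a minimal
# sketch under that assumption:
def gettime(timestr):
    # Hypothetical: "03:25" -> 205, "01:02:03" -> 3723.
    seconds = 0
    for part in timestr.split(":"):
        seconds = seconds * 60 + int(part)
    return seconds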
def crawl(self):
    url = self.data['url']
    title = self.data['title']
    items = api_recommend(url, 10)
    related = [item['title'] for item in items]
    album = VideoSourceModel({
        'source': self.data['source'],
        'title': title,
        'related': related,
    })
    export(album)
def crawl(self):
    album_id = self.key
    detail_data = api_detail(album_id)
    detail_data = detail_data.get('detail')
    channel = detail_data.get('cats')
    title = detail_data.get('title')
    title = "".join(title.split())
    image = detail_data.get('img')
    url = detail_data.get('play_url')
    url_key = re.findall(r"http://www.tudou.com/albumplay/(.+)/.+\.html", url)[0]
    album_url = "http://www.tudou.com/albumcover/%s.html" % url_key
    if channel == u"动漫":
        actors = detail_data.get('seiyuu')
    else:
        actors = detail_data.get('performer')
    if channel == u"综艺":
        directors = detail_data.get('host')
    else:
        directors = detail_data.get('director')
    categories = detail_data.get('genre')
    region = detail_data.get('area')[0]
    description = detail_data.get('desc')
    description = "".join(description.split())
    pubtime = detail_data.get('showdate')
    # Release date known (pubtime != 0): showdate is a year
    if pubtime:
        pubtime = datetime.strptime(str(pubtime), "%Y")
    # Release date unknown (pubtime == 0): fall back to the epoch
    if not pubtime:
        pubtime = datetime.utcfromtimestamp(0)
    videos = get_videos(album_id, url_key)
    model = VideoSourceModel({
        "source": self.data.get('source'),
        "source_id": album_id,
        "title": title,
        "image": image,
        "url": album_url,
        "actors": actors,
        "directors": directors,
        "categories": categories,
        "channel": channel,
        "region": region,
        "description": description,
        "pubtime": pubtime,
        "videos": videos,
    })
    export(model)
    self.data['to_album_id'] = model['to_album_id']
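# get_videos() is not shown here; from the call above it fetches the episode
# list for a tudou album. A sketch of its assumed shape, reusing the
# VideoItemModel layout the other crawlers build -- api_videos() and its
# field names are assumptions, and the URL format mirrors the albumplay
# pattern matched above:
def get_videos(album_id, url_key):
    videos = []
    for ep in api_videos(album_id):  # hypothetical episode-listing call
        videos.append(VideoItemModel({
            "title": ep.get('title'),
            "url": "http://www.tudou.com/albumplay/%s/%s.html" % (url_key, ep.get('iid')),
            "image": ep.get('img'),
        }))
    return videos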
def crawl(self): album_url = "http://zyqvod.com/vod/index.asp?id=%s" % self.key hxs = load_html(album_url) urls = hxs.select("//div[@class='movievod']/li/input/@value").extract() videos = [] for url in urls: m = re.match("qvod://(.+)", url) if not m: continue words = m.group(1).split("|") size = int(words[0]) #md5 = words[1] title = words[2].split(".")[0] videos.append(VideoItemModel({ "title" : title, "url" : url, "stream" : [{"url" : url, "format" : "qvod", "size" : size}], })) kv = {} for s in hxs.select("//div[@class='videoDetail']/p"): texts = s.select(".//text()").extract() if len(texts) >= 2: kv[texts[0].strip()] = texts[1].strip() description = "\n".join(hxs.select("//div[@class='movievod']/p[2]/text()").extract()) try: image = hxs.select("//div[@class='videoPic']/img/@src").extract()[0] except: image = None model = VideoSourceModel({ "source" : self.data['source'], "source_id" : self.key, "title" : self.data["title"], "time" : self.data.get('time'), "url" : album_url, "image" : image, "completed" : self.data.get('completed'), "categories" : [self.data.get('category')], "channel" : self.data.get('category'), "region" : self.data.get('region'), "videos" : videos, "actors" : split(kv.get(u'影片主演:')), "directors" : split(kv.get(u'影片导演:')), "pubtime" : parse_date(kv.get(u'上映年份:')), "description" : description, "completed" : not kv.get(u'连载状态:'), }) export(model)
def extract_model(self, item, detail=None):
    try:
        pubtime = datetime.strptime(str(detail.get("tv_year")), "%Y") if detail else None
    except:
        pubtime = None
    model = VideoSourceModel({
        "source": self.data['source'],
        "source_id": item.get("sid"),
        "title": detail.get("tv_name") if detail else item.get("albumTitle"),
        "image": detail.get("ver_big_pic") if detail else item.get("ver_big_pic"),
        "image2": detail.get("hor_big_pic") if detail else item.get("hor_big_pic"),
        "description": detail.get("tv_desc") if detail else item.get("tv_desc"),
        "directors": item["director"].split(";") if item.get("director") else [],
        "actors": item["main_actor"].split(";") if item.get("main_actor") else [],
        "region": item.get("area"),
        "url": detail["s_url"] if detail else item["s_url"],
        "categories": item["tv_cont_cats"].split(";") if item.get("tv_cont_cats") else [],
        "time": datetime.strptime(item.get('tv_application_time', '1970-01-01')[:10], "%Y-%m-%d"),
        "price": detail.get("fee") if detail else item.get("fee"),
        "channel": item["cname"],
        "completed": item["vcount"] >= item["totalSet"],
        "visits": self.extract_visits(item.get("albumPC")),
        "score": item.get("tv_score"),
        "pubtime": pubtime,
    })
    return model
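# extract_visits() is defined elsewhere on this crawler; from its use above it
# turns the albumPC play-count field into an int. A sketch under the
# assumption that the field may be missing or carry a u"万" (x10,000) suffix,
# the same convention the iqiyi extract_album() below applies to its 'vv'
# counter -- the body is an assumption:
def extract_visits(self, text):
    if not text:
        return 0
    text = unicode(text)
    if text.endswith(u"万"):
        return int(float(text[:-1]) * 10000)
    return int(float(text))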
def update_region():
    conn = Connection()
    db = conn.content_video2
    count = 1
    source_videos = db.video.source.find()
    for source_video in source_videos:
        model = VideoSourceModel({
            "videos": source_video['videos'],
            "image": source_video['image'],
            "related": source_video['related'],
            "duration": source_video['duration'],
            "title": source_video['title'],
            "comments": source_video['comments'],
            "source": source_video['source'],
            "score": source_video['score'],
            "actors": source_video['actors'],
            "price": source_video['price'],
            "channel": source_video['channel'],
            "description": source_video['description'],
            "tags": source_video['tags'],
            "deleted": source_video['deleted'],
            "completed": source_video['completed'],
            "visits": source_video['visits'],
            "favorites": source_video['favorites'],
            "authorities": source_video['authorities'],
            "categories": source_video['categories'],
            "created": source_video['created'],
            "url": source_video['url'],
            "region": source_video['region'],
            "directors": source_video['directors'],
            "pubtime": source_video['pubtime'],
            "time": source_video['time'],
            "source_id": source_video['source_id']
        })
        export(model)
        count += 1
        print "count = %s" % count
    print "count = %s" % count
    print "map complete."
def process_album(self, item):
    sites = {}
    fangying_id = re.findall(r"f_(.+)\.html", item['link'])[0]
    for play in item['plays']:
        site = play['site']
        if site not in SITES:
            continue
        if play["url"].find("fangying.com") != -1:
            stream = []
        else:
            fmt = "thunder" if site == "thunder" else ""
            stream = [{"url": play["url"], "format": fmt}]
        video = VideoItemModel({
            "title": play["title"],
            "url": play["url"],
            "stream": stream,
        })
        if site not in sites:
            sites[site] = []
        sites[site].append(dict(video))
    model = None
    for site, videos in sites.iteritems():
        model = VideoSourceModel({
            "source": self.data['source'],
            "source_id": fangying_id,
            "videos": videos,
            "title": item['title'],
            "directors": item['directors'].split("/"),
            "actors": item['performers'].split("/"),
            "description": item['description'],
            'categories': item['genres'].split("/"),
            'region': item['countries'].split("/")[0],
            'duration': parse_duration(item['duration']),
            'image': item['avatar_middle'],
            'score': float(item['douban_rating']) if item.get('douban_rating') else None,
            'url': item['link'],
            'price': 0.0,
            'pubtime': parse_pubtime(item['release_time']),
            'channel': CHANNELS.get(self.key)
        })
        export(model)
    if model:
        Scheduler.schedule(RelationCrawler.type, key=fangying_id, data={
            'title': model['title'],
            'url': model['url']
        })
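# parse_duration() and parse_pubtime() are shared helpers not shown in this
# file. Minimal sketches under plain assumptions: the duration field carries a
# minute count such as u"120分钟", and release_time begins with a YYYY-MM-DD
# date. Neither body is the real implementation.
import re
from datetime import datetime

def parse_duration(text):
    # Hypothetical: pull the first integer and treat it as minutes.
    m = re.search(r"\d+", text or "")
    return int(m.group(0)) * 60 if m else None

def parse_pubtime(text):
    # Hypothetical: take the leading YYYY-MM-DD portion, if any.
    m = re.match(r"\d{4}-\d{2}-\d{2}", text or "")
    return datetime.strptime(m.group(0), "%Y-%m-%d") if m else None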
def crawl(self):
    album_id = self.key
    if self.data['channel'] in SHORT_VIDEO:
        url = "http://v.qq.com/page/%s/%s/%s/%s.html" % (
            album_id[0], album_id[1], album_id[-1], album_id)
        pubtime = datetime.strptime(self.data["pubtime"], "%Y-%m-%d %H:%M:%S")
        video = VideoItemModel({
            "title": self.data["title"],
            "url": url,
            "stream": [{
                "url": "javascript:getUrl('tencent', '%s')" % url
            }],
            "image": self.data["image"],
            "channel": self.data["channel"],
        })
        model = VideoSourceModel({
            "source": self.data["source"],
            "source_id": album_id,
            "title": self.data["title"],
            "url": url,
            "image": self.data["image"],
            "channel": self.data["channel"],
            "pubtime": pubtime,
            "videos": [video]
        })
        export(model)
        self.data['to_album_id'] = model['to_album_id']
    else:
        album_url = "http://v.qq.com/detail/%s/%s.html" % (album_id[0], album_id)
        album_data = api_album(album_id[0], album_id)
        if album_data['trailer'] == 1:
            play_url = "http://v.qq.com/prev/%s/%s" % (album_id[0], album_id)
        else:
            play_url = "http://v.qq.com/cover/%s/%s" % (album_id[0], album_id)
        description = album_data.get("columndesc")
        if not description:
            description = album_data.get("desc")
        description = "".join(description.split())
        try:
            pubtime = datetime.strptime(self.data.get("pubtime"), "%Y")
        except:
            pubtime = datetime.utcfromtimestamp(0)
        videos = []
        columnid = album_data.get('columnid')
        rely = album_data.get('rely')
        if columnid:  # columnid != 0
            for video_dict in rely:
                for year, months in video_dict.iteritems():
                    for month in months:
                        videolist_id = "%s_%s" % (year, month)
                        videos_data = api_video(columnid, videolist_id)
                        for video in videos_data['items']:
                            time = video.get('date')
                            time = datetime.strptime(time, "%Y-%m-%d")
                            url = "http://v.qq.com/cover/%s/%s.html" % (
                                video.get('coverid')[0], video.get('coverid'))
                            video = VideoItemModel({
                                "title": video.get('sectitle'),
                                "description": video.get('breif'),
                                "url": url,
                                "stream": [{
                                    "url": "javascript:getUrl('tencent', '%s')" % url
                                }],
                                "image": video.get('snapurl'),
                                "time": time
                            })
                            videos.append(video)
        if not columnid:  # columnid == 0: only one video
            for video in album_data['videos']:
                videos.append(clean_video(video, play_url))
        if self.data:
            # self.data is not None: export straight from the API data
            model = VideoSourceModel({
                "source": self.data.get('source'),
                "source_id": album_id,
                "title": album_data['columnname'] if album_data['columnname'] else self.data["title"],
                "image": self.data.get("image"),
                "url": album_url,
                "actors": self.data.get("actors"),
                "directors": self.data.get("directors"),
                "categories": self.data.get("categories"),
                "channel": self.data.get("channel"),
                "region": self.data.get("region"),
                "description": description,
                "pubtime": pubtime,
                "videos": videos,
            })
        else:
            # self.data is None: scrape the cover page first
            # (http://v.qq.com/cover/x/xxxxx.html), then export
            hxs = load_html(play_url)
            channel = hxs.select("//div[@class='mod_crumbs']/a[1]/text()").extract()[0]
            album_hxs = hxs.select("//div[@class='mod_video_intro mod_video_intro_rich']")
            image = album_hxs.select("a/img/@src").extract()[0]
            title = album_hxs.select("div[@class='video_title']/strong/a/text()").extract()[0]
            directors = []
            for director_hxs in album_hxs.select("//div[@itemprop='director']/a"):
                directors.append(director_hxs.select("span/text()").extract()[0])
            actors = []
            for actor_hxs in album_hxs.select("//div[@itemprop='actors']/a"):
                actors.append(actor_hxs.select("span/text()").extract()[0])
            region = album_hxs.select(
                "//div[@class='info_area']/span[@class='content']/a/text()").extract()[0]
            categories = []
            for category_hxs in album_hxs.select(
                    "//div[@class='info_category']/span[@class='content']/a"):
                categories.append(category_hxs.select("text()").extract()[0])
            pubtime = album_hxs.select(
                "//div[@class='info_years']/span[@class='content']/a/text()").extract()[0]
            if re.match(r"^\d+$", pubtime):
                pubtime = datetime.strptime(pubtime, "%Y")
            else:
                pubtime = datetime.utcfromtimestamp(0)
            model = VideoSourceModel({
                "source": self.data.get('source'),
                "source_id": album_id,
                "title": title,
                "image": image,
                "url": album_url,
                "actors": actors,
                "directors": directors,
                "categories": categories,
                "channel": channel,
                "region": region,
                "description": description,
                "pubtime": pubtime,
                "videos": videos,
            })
        export(model)
        self.data['to_album_id'] = model['to_album_id']
def crawl_show(self, show_id):
    def parse_number(text):
        if not text:
            return 0
        text = re.sub(r"[^0-9.]", "", text)
        return float(text)

    data = api_detail(show_id)
    detail = data['detail']
    try:
        pubtime = datetime.strptime(
            detail.get('showdate', '1970-01-01').replace("-00", "-01"), "%Y-%m-%d")
    except:
        pubtime = None
    # The two original branches built identical models except for the id/url
    # pair: shows in a main channel are keyed by their show id, everything
    # else by the video id.
    if detail['cats'] in MAIN_CHANNELS.values():
        source_id = detail['showid']
        url = "http://www.youku.com/show_page/id_z%s.html" % detail['showid']
    else:
        source_id = show_id
        url = "http://v.youku.com/v_show/id_%s.html" % show_id
    model = VideoSourceModel({
        'source': self.data['source'],
        'source_id': source_id,
        'url': url,
        'title': detail['title'],
        'duration': None,
        'visits': parse_number(detail.get('total_vv')),
        'comments': parse_number(detail.get('total_comment')),
        'score': parse_number(detail.get('reputation')),
        'favorites': parse_number(detail.get('total_fav')),
        'image': detail['img'],
        'region': detail['area'][0] if detail.get('area') else None,
        'categories': detail.get('genre', []),
        'description': detail.get('desc'),
        'completed': detail.get('completed') == 1,
        'actors': detail.get('performer', []),
        'directors': detail.get('director', []),
        #'price': 0.0,
        'pubtime': pubtime,
        'channel': detail.get('cats', '')
    })
    return model
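# A quick self-contained illustration of parse_number's behaviour (the sample
# values are made up): it strips everything except digits and the decimal
# point, so grouped counters come back as plain floats.
def _parse_number_example():
    import re

    def parse_number(text):
        if not text:
            return 0
        return float(re.sub(r"[^0-9.]", "", text))

    assert parse_number("1,234,567") == 1234567.0
    assert parse_number("9.2") == 9.2
    assert parse_number(None) == 0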
def crawl(self):
    type = 4
    album_id = self.key
    title = self.data['title'].encode('utf-8')
    channel = self.data.get('channel')
    # The original tested `channel in LONG_VIDEO_CHANNELS.items()`, which can
    # never match a plain channel string; .values() is the apparent intent.
    if channel in LONG_VIDEO_CHANNELS.values():
        album_data = api_album(type, album_id, title)
        album_data = album_data['data']
        pubtime = album_data.get("public_time")
        pubtime = datetime.strptime(pubtime, "%Y%m%d")
        videos = []
        for video in album_data['data']:
            videos.append(clean_video(video))
        model = VideoSourceModel({
            "source": self.data.get('source'),
            "source_id": album_id,
            "title": title,
            "image": album_data.get("bpic"),
            "image2": album_data.get("mpic"),
            "url": album_data.get("web_url"),
            "actors": album_data.get("actors"),
            "directors": album_data.get("director"),
            "categories": album_data.get("tname"),
            "tags": self.data.get("tags"),
            "channel": channel,
            "region": album_data.get("zname")[0],
            "description": album_data.get("introduce"),
            "pubtime": pubtime,
            "videos": videos,
        })
    else:
        video = VideoItemModel({
            "title": title,
            "description": self.data.get("description"),
            "url": "http://www.56.com/u13/v_%s.html" % album_id,
            "stream": [{
                "url": "http://vxml.56.com/html5/%s/?src=3g&res=qqvga" % album_id
            }],
            "stream_high": [{
                "url": "http://vxml.56.com/html5/%s/?src=3g&res=qvga" % album_id
            }]
        })
        model = VideoSourceModel({
            "source": self.data.get('source'),
            "source_id": album_id,
            "title": title,
            "image": self.data.get("bpic"),
            "image2": self.data.get("mpic"),
            "tags": self.data.get("tags"),
            "url": self.data.get("web_url"),
            "channel": channel,
            "description": self.data.get("introduce"),
            "videos": [video],
        })
    export(model)
    self.data['to_album_id'] = model['to_album_id']
def crawl(self): album_url = "http://www.265zy.com/detail/?%s.html" % self.key hxs = load_html(album_url) urls = hxs.select( "//td[@class='bt']/.//input[@id='copy_yah']/@value").extract() videos = [] for url in urls: m = re.match("qvod://(.+)", url) if not m: continue words = m.group(1).split("|") size = int(words[0]) #md5 = words[1] title = words[2].split(".")[0] videos.append( VideoItemModel({ "title": title, "url": url, "stream": [{ "url": url, "format": "qvod", "size": size }], })) kv = {} for s in hxs.select("/html/body/table[2]/tr[1]/td[2]/table/tr"): texts = s.select(".//text()").extract() if len(texts) >= 2: kv[texts[0].strip()] = texts[1].strip() description = "\n".join( hxs.select("//div[@class='intro']/.//text()").extract()) try: image = urlparse.urljoin( "http://www.265zy.com/", hxs.select("//div[@class='img']/img/@src").extract()[0]) except: image = None model = VideoSourceModel({ "source": self.data['source'], "source_id": self.key, "title": self.data["title"], "image": image, "url": album_url, "time": self.data.get('time'), "categories": [self.data.get('category')], "channel": self.data.get('category'), "region": self.data.get('region'), "videos": videos, "actors": split(kv.get(u"影片演员:")), "pubtime": parse_date(kv.get(u"上映日期:")), "completed": kv.get(u"影片状态:", "").find(u"连载") == -1, "description": description, }) export(model)
def extract_album(album, source):
    if hasattr(album, "_a"):
        album = album._a
    channel = _CHANNEL_DCT.get(int(album._cid), '')
    channel_py = _CHANNEL_PINYIN.get(channel)
    if channel_py:
        url = "http://m.iqiyi.com/%s/a/%s.html" % (channel_py, album._id)
    else:
        url = ""
    pubtime = album.year
    if len(pubtime) == 4:
        pubtime = datetime.strptime(pubtime, "%Y")
    elif len(pubtime) == 8:
        pubtime = datetime.strptime(pubtime, "%Y%m%d")
    else:
        pubtime = None
    # Both branches scale by 10,000, so 'vv' is apparently reported in units
    # of 万: either all digits, or a number with a trailing unit character.
    visits = getattr(album, 'vv')
    if re.match(r"^\d+$", visits):
        visits = int(visits) * 10000
    else:
        visits = int(visits[:-1]) * 10000
    item = VideoSourceModel({
        'source': source,
        'source_id': album._id,
        'url': url,
        'title': album.clm if hasattr(album, 'clm') and album.clm else album._t,
        'duration': int(album._dn),
        'visits': visits,
        'score': float(album._sc),
        'image': album._img,
        'tags': album.tag.split() if hasattr(album, "tag") else [],
        'channel': channel,
        'description': album.desc if hasattr(album, "desc") else "",
        'directors': album._da.split(",") if hasattr(album, "_da") else [],
        'actors': album._ma.split(",") if hasattr(album, "_ma") else [],
        # last update time for iqiyi
        'time': datetime.strptime(getattr(album, "fst_time", "1970-01-01"), "%Y-%m-%d"),
        # The original dict set 'price' twice (float(album.t_pc), then 0.0);
        # the later 0.0 was the value that took effect, so it is kept.
        'price': 0.0,
        'pubtime': pubtime,
    })
    return item
def crawl(self):
    source_id = self.key
    album_data = api_album(source_id, pcode, version)
    album_data = album_data['body']
    title = album_data.get("nameCn")
    pubtime = album_data.get("releaseDate")
    if re.match(r"^\d+$", pubtime):
        pubtime = datetime.strptime(pubtime, "%Y")
    elif re.match(r"^\d+-\d+-\d+$", pubtime):
        pubtime = datetime.strptime(pubtime, "%Y-%m-%d")
    else:
        pubtime = datetime.utcfromtimestamp(0)
    directors = album_data.get("directory").split(" ")
    actors = album_data.get("starring").split(" ")
    desc = album_data.get("description")
    desc = "".join(desc.split())
    region = album_data.get("area")
    categories = album_data.get("subCategory").split(" ")
    tags = album_data.get("tag").split(" ")
    album_url = "http://so.letv.com/tv/%s.html" % source_id
    videos = []
    b = 1
    s = 60
    o = -1
    m = 0
    series_data = api_series(source_id, b, s, o, m, pcode, version)
    for series in series_data['body']['videoInfo']:
        vid = series['id']
        mid = series['mid']
        # The original reused the name `url` here, clobbering the album URL
        # before it reached the model; the episode URL gets its own name.
        ep_url = "http://www.letv.com/ptv/vplay/%s.html" % vid
        vurl = ("http://dynamic.app.m.letv.com/android/dynamic.php"
                "?mod=minfo&ctl=videofile&act=index&mmsid=%s&pcode=%s&version=%s"
                % (mid, pcode, version))
        jsurl = "javascript:getUrl('letv', '%s')" % vurl
        video = VideoItemModel({
            "title": series.get("nameCn"),
            "url": ep_url,
            "stream": [{"url": jsurl}],
            "image": series.get("picAll"),
            "duration": series.get("duration")
        })
        videos.append(video)
    model = VideoSourceModel({
        "source_id": source_id,
        "source": self.data.get('source'),
        "url": album_url,
        "channel": self.data['channel'],
        "title": title,
        "image": self.data['image'],
        "pubtime": pubtime,
        "directors": directors,
        "actors": actors,
        # Every other crawler in this file fills "description"; the "desc"
        # key in the original looks like a typo.
        "description": desc,
        "region": region,
        "categories": categories,
        "tags": tags,
        "videos": videos
    })
    export(model)
    self.data['to_album_id'] = model['to_album_id']
def crawl(self):
    # self.key is the album's source-site ID
    album_id = self.key
    album_url = "http://bdzy.cc/detail/?%s.html" % album_id
    hxs = load_html(album_url)
    urls = hxs.select("//td[@class='bt']/.//li/input/@value").extract()
    videos = []
    for url in urls:
        m = re.match("bdhd://(.+)", url)
        if not m:
            continue
        words = m.group(1).split("|")
        size = int(words[0])
        #md5 = words[1]
        title = words[2].split(".")[0]
        # One episode of the album
        video = VideoItemModel({
            "title": title,
            "url": url,  # page URL (none available, so the play URL is used)
            "stream": [{
                "url": url,  # playable video file URL
                "size": size,
                "format": "bdhd"  # video format (protocol)
            }],
        })
        videos.append(video)
    kv = {}
    for s in hxs.select("/html/body/table[2]/tr[1]/td[2]/table/tr"):
        texts = s.select(".//text()").extract()
        if len(texts) >= 2:
            kv[texts[0].strip()] = texts[1].strip()
    description = "\n".join(hxs.select("//div[@class='intro']/.//text()").extract())
    try:
        image = hxs.select("/html/body/table[2]/tr[1]/td[1]/img/@src").extract()[0]
    except:
        image = None
    # Data model exported for the video
    model = VideoSourceModel({
        "source": self.data['source'],  # video source
        "source_id": album_id,  # source-site ID
        "title": self.data["title"],
        "url": album_url,  # page URL
        "image": image,  # image URL
        "time": self.data.get('time'),  # last update time on the source site
        "categories": [self.data.get('category')],  # categories
        "channel": self.data.get('category'),  # channel
        "region": self.data.get('region'),  # region
        "videos": videos,  # episode list
        "pubtime": parse_date(kv.get(u"上映日期:")),  # release date
        "actors": split(kv.get(u"影片演员:")),
        "completed": kv.get(u"影片状态:", "").find(u"连载") == -1,  # finished airing?
        "description": description,
    })
    # Export the data
    export(model)
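# The qvod:// and bdhd:// payloads parsed above share a pipe-separated
# "size|md5|filename|..." layout. A tiny self-contained illustration with a
# synthetic link (the size, hash, and file name are made up):
def _parse_bdhd_example():
    import re
    link = "bdhd://1048576|0123456789abcdef|ep01.rmvb|"
    m = re.match("bdhd://(.+)", link)
    words = m.group(1).split("|")
    assert int(words[0]) == 1048576          # file size in bytes
    assert words[2].split(".")[0] == "ep01"  # episode title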