Пример #1
0
def import_all_videos():
    """Run the import hook on every stored video source document.

    Iterates the ``video.source`` collection of the ``content_video``
    database, sorted on ``updated`` with direction ``1``, wraps each document
    in a ``VideoSourceModel``, calls its ``on_import()`` and prints its title.
    """
    from contentservice.models.video import VideoSourceModel
    from contentservice.settings import MONGO_CONN_STR
    client = MongoClient(MONGO_CONN_STR)
    video_db = client.content_video

    # Drops into the interactive debugger before the import loop starts.
    pdb.set_trace()
    ordering = [("updated", 1)]
    cursor = video_db.video.source.find(sort=ordering)
    for document in cursor:
        source_model = VideoSourceModel(document)
        source_model.on_import()
        print(source_model["title"])
Пример #2
0
def import_all_videos():
    """Re-import all video sources, in order of their ``updated`` field.

    Every document found in ``content_video.video.source`` (sorted on
    ``updated`` with direction ``1``) is turned into a ``VideoSourceModel``
    whose ``on_import()`` is invoked; the model title is printed afterwards.
    """
    from contentservice.models.video import VideoSourceModel
    from contentservice.settings import MONGO_CONN_STR
    content_db = MongoClient(MONGO_CONN_STR).content_video

    # Interactive debugger breakpoint; execution pauses here.
    pdb.set_trace()
    sort_spec = [("updated", 1)]
    for raw_source in content_db.video.source.find(sort=sort_spec):
        imported = VideoSourceModel(raw_source)
        imported.on_import()
        print(imported["title"])
Пример #3
0
def remerge_video(request):
    """Re-import one video source into a specific album.

    Reads the ``sid`` and ``aid`` query parameters from ``request.GET``, looks
    up the matching ``VideoSourceModel`` and ``VideoAlbumModel`` by ``_id``
    and, when both are found, calls ``on_import(to_album_id=<aid>)`` on the
    source.

    Returns:
        ``{'status': 1}`` when both records were found, ``{'status': -1}``
        otherwise.
    """
    source_id = request.GET.get('sid', None)
    album_id = request.GET.get('aid', None)
    source = VideoSourceModel().find_one(query={'_id': source_id})
    album = VideoAlbumModel().find_one(query={'_id': album_id})

    # Guard clause: either lookup failing means there is nothing to merge.
    if not (source and album):
        return {'status': -1}

    source.on_import(to_album_id=album_id)
    return {'status': 1}
Пример #4
0
def import_douban():
    """Import every document of the ``douban.album`` collection as a movie.

    Each document is mapped onto a ``VideoSourceModel`` (source ``"douban"``),
    ``on_import()`` is called on it and the cleaned title is printed.

    Raises:
        KeyError: if a document lacks one of the fields read with ``item[...]``.
        IndexError: if the document ``url`` contains no ``/<digits>/`` segment
            to use as ``source_id``.
    """
    from contentservice.utils.datetimeutil import parse_date
    from contentservice.settings import MONGO_CONN_STR
    db = MongoClient(MONGO_CONN_STR).douban

    # NOTE(review): interactive debugger breakpoint -- execution pauses here
    # until resumed; confirm it is still wanted.
    pdb.set_trace()

    # Compiled once here instead of on every clean_title() call.
    zh_pattern = re.compile(u'[\u4e00-\u9fa5]+')

    def clean_title(title):
        """Keep only the first space-separated word of a title that contains
        characters in the U+4E00..U+9FA5 range; return other titles as-is."""
        if zh_pattern.search(title):
            return title.split(" ")[0]
        return title

    for item in db.album.find():
        pubtime = None
        if item['pub_time']:
            # Drop any parenthesised part of the first entry before parsing.
            pubtime = parse_date(re.sub(r"\(.*\)", "", item['pub_time'][0]))

        model = VideoSourceModel({
            "title": clean_title(item['title']),
            "categories": item['sub_category'],
            "image": item["img"],
            "related": item["related"],
            "score": item["score"],
            "actors": item["actors"],
            # Only the first listed area is kept.
            "region": item["area"][0] if item["area"] else None,
            "url": item["url"],
            "description": item["description"],
            "pubtime": pubtime,
            "channel": u"电影",
            "source": "douban",
            # First run of digits enclosed in slashes in the URL.
            "source_id": re.findall(r"/(\d+)/", item['url'])[0],
        })
        model.on_import()
        print(model['title'])
Пример #5
0
def import_mtime():
    """Import every document of the ``mtime.album`` collection.

    Each document is normalised (single area, list-valued categories/tags,
    newline-joined description), mapped onto a ``VideoSourceModel`` with source
    ``"mtime"`` and imported through ``on_import()``.  Any exception raised
    while building or importing one item is printed and the loop continues.
    """
    from datetime import datetime
    from contentservice.models.video import VideoSourceModel
    from contentservice.settings import MONGO_CONN_STR
    from contentservice.utils.datetimeutil import parse_date
    db = MongoClient(MONGO_CONN_STR).mtime
    # NOTE(review): interactive debugger breakpoint -- confirm it is still wanted.
    pdb.set_trace()

    for item in db.album.find():
        # A missing or empty area becomes None; a list is reduced to its first entry.
        area = item.get("area") or None
        if isinstance(area, list):
            area = area[0]
        categories = item.get("type")
        if isinstance(categories, basestring):
            categories = [categories]
        description = item.get("description")
        if isinstance(description, list):
            description = "\n".join(description)
        tags = item.get("tags")
        if isinstance(tags, basestring):
            tags = [tags]
        channel = ""
        if item.get("category_id") == "1":
            channel = u"电影"
        elif item.get("category_id") == "0":
            channel = u"电视剧"

        # Reset per item so a failed construction can never print the title
        # of the previous item (or hit an unbound name on the first one).
        model = None
        try:
            model = VideoSourceModel({
                "source": "mtime",
                "source_id": item["id"],
                "title": item["title"],
                "description": description,
                "tags": tags,
                "time": datetime.strptime(item["create_time"], "%Y-%m-%d %H:%M:%S"),
                "duration": item.get("duration"),
                "region": area,
                "directors": item.get("directors"),
                "score": item.get("score"),
                "actors": item.get("actors"),
                "categories": categories,
                "channel": channel,
                "url": "http://movie.mtime.com/%s/" % item["id"],
                "pubtime": parse_date(item["release_time"]),
            })
            model.on_import()
        except Exception as e:
            # Best-effort import: report the failure and move on.
            print(e)
        if model is not None:
            print(model["title"])
Пример #6
0
    def crawl(self):
        """Fetch the album detail for ``self.key`` and export it.

        The detail record comes from ``api_album``; its semicolon-separated
        ``director``, ``actor`` and ``tv_cont_cats`` fields are split into
        lists.  The exported model's ``to_album_id`` is stored back into
        ``self.data``.

        When ``self.key`` is falsy no detail is fetched and the first field
        lookup fails with ``TypeError`` (subscripting ``None``).
        """
        album_id = self.key
        channel = self.data["channel"]

        if album_id:
            info = api_album(album_id)
        else:
            info = None

        album_title = info["tv_name"]
        director_list = info["director"].split(";")
        actor_list = info["actor"].split(";")
        area = info["area"]
        category_list = info["tv_cont_cats"].split(";")
        vertical_image = info["ver_high_pic"]
        horizontal_image = info["hor_high_pic"]
        page_url = info["s_url"]
        summary = info["tv_desc"]

        # Data model used for the video export.
        fields = {
            "source": self.data['source'],
            "source_id": album_id,
            "title": album_title,
            "url": page_url,
            "directors": director_list,
            "actors": actor_list,
            "region": area,
            "categories": category_list,
            "channel": channel,
            "description": summary,
            "image": vertical_image,
            "image2": horizontal_image,
        }
        model = VideoSourceModel(fields)
        # Export the data, then remember which album it ended up in.
        export(model)
        self.data['to_album_id'] = model['to_album_id']
Пример #7
0
    def crawl(self):
        """Fetch the detail record for ``self.key`` and export it as an album.

        Loads ``DETAIL % self.key`` through ``loadurl``.  When the channel in
        ``self.data`` equals the movie literal below, a single
        ``VideoItemModel`` is built from the ``tv`` entry of
        ``pinfos.mpurls``; otherwise one ``VideoItemModel`` is built per
        episode.  The items are wrapped in a ``VideoSourceModel``, passed to
        ``export`` and the resulting ``to_album_id`` is stored in
        ``self.data``.

        ``plots`` (album and episode) and ``rinfo`` are assumed to be present:
        a missing value makes the ``.split`` call fail on ``None``.
        """
        videos = []
        mid = self.key
        url = DETAIL % mid
        detail = loadurl(url)
        # Strip every whitespace character out of the plot summary.
        description = detail.get('plots')
        description = ''.join(description.split())
        # NOTE(review): this literal looks like a mis-decoded (mojibake) form
        # of the Chinese word for "movie" -- confirm the file encoding before
        # relying on this comparison.
        if self.data.get('channel') == u'鐢靛奖':
            mpurls = detail['pinfos']['mpurls']
            video = VideoItemModel({
                "title": self.data.get('title'),
                "url": MOVIE_PLAY % mid,  # web page URL
                "image": self.data.get('image'),
                "description": description,
                "stream": [{
                    'url': mpurls['tv'].get('url'),
                    'size': mpurls['tv'].get('bits'),
                    'format': 'mp4'
                }]
            })
            videos.append(video)
        else:
            try:
                sort = detail['pinfos'].get('sort')[0]
                episodes = detail['pinfos']['content'][sort]['fsps']
            except (AttributeError, IndexError, KeyError, TypeError):
                # No usable "sort"/"content" layout: fall back to the flat
                # episode list.  Narrowed from a bare ``except`` so that
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                episodes = detail['pinfos']['fsps']

            for episode in episodes:
                plots = episode.get('plots')
                plots = ''.join(plots.split())
                video = VideoItemModel({
                    "title": episode.get('taskname'),
                    "url": PLAY_URL % (mid, episode.get('number')),  # web page URL
                    "image": episode.get('picurl'),
                    "description": plots,
                    "stream": getstream(episode.get('mpurls'))
                })
                videos.append(video)
        model = VideoSourceModel({
            "source": self.data.get('source'),
            "source_id": mid,  # id on the source site
            "title": self.data["title"],
            "url": detail.get('shareurl'),  # URL of the detail page
            "image": self.data.get('image'),  # image URL
            "categories": self.data.get('category'),  # categories
            "channel": self.data.get('channel'),  # channel
            "region": detail.get('country'),  # region
            "videos": videos,  # videos of the album
            # Release date: the part of "rinfo" before the first space.
            "pubtime": parse_date(detail.get('rinfo').split(' ')[0]),
            "actors": detail.get('lactor'),
            "directors": detail.get('director'),
            "description": description,
        })
        # Export the data.
        export(model)
        self.data['to_album_id'] = model['to_album_id']
Пример #8
0
 def crawl(self):
     """Export the single video described by ``self.data`` as an album.

     Builds one ``VideoItemModel`` with mid, low and high quality mp4
     streams, wraps it in a ``VideoSourceModel`` on channel ``CHANNEL``,
     exports it and stores the model's ``to_album_id`` in ``self.data``.
     ``videoURLMid`` is used both as the page URL of the item and as the URL
     of the default stream.
     """
     duration = gettime(self.data.get('videoLength', '00:00'))

     def mp4_stream(url_key, size_key):
         # One-element stream list for a single quality level.
         return [{
             "url": self.data.get(url_key),  # playback URL of the video file
             "size": self.data.get(size_key),
             "format": "mp4",  # video format (protocol)
             "duration": duration
         }]

     item_fields = {
         "title": self.data.get('title'),
         "url": self.data.get('videoURLMid'),  # web page URL
         "image": self.data.get('imgURL'),
         "description": self.data.get('desc'),
         "stream": mp4_stream('videoURLMid', 'videoSizeMid'),
         "stream_low": mp4_stream('videoURLLow', 'videoSizeLow'),
         "stream_high": mp4_stream('videoURLHigh', 'videoSizeHigh'),
     }
     videos = [VideoItemModel(item_fields)]

     album_fields = {
         "source": self.data.get('source'),
         "source_id": self.data.get('id'),  # id on the source site
         "title": self.data.get("title"),
         "url": self.data.get('shareurl'),  # URL of the detail page
         "image": self.data.get('imgURL'),  # image URL
         "channel": CHANNEL,  # channel
         "videos": videos,  # videos of the album
         "pubtime": parse_date(self.data.get('videoPublishTime')),  # release time
         "description": self.data.get('desc'),
     }
     model = VideoSourceModel(album_fields)
     # Export the data.
     export(model)
     self.data['to_album_id'] = model['to_album_id']
Пример #9
0
 def crawl(self):
     """Export the titles recommended for this album as its ``related`` list.

     Calls ``api_recommend`` with the album URL and ``10``, collects the
     ``title`` of each returned item and exports a ``VideoSourceModel``
     holding the source, the album title and those related titles.
     """
     page_url = self.data['url']
     album_title = self.data['title']
     recommended = api_recommend(page_url, 10)

     related_titles = []
     for entry in recommended:
         related_titles.append(entry['title'])

     export(VideoSourceModel({
         'source': self.data['source'],
         'title': album_title,
         'related': related_titles,
     }))
Пример #10
0
    def crawl(self):
        """Fetch the album detail for ``self.key`` and export it with its videos.

        The detail comes from ``api_detail(...)['detail']``.  The album cover
        URL is derived from the key embedded in the ``play_url``.  Voice actors
        (``seiyuu``) replace performers on the animation channel and hosts
        replace directors on the variety channel.  The exported model's
        ``to_album_id`` is stored in ``self.data``.

        Raises:
            IndexError: if ``play_url`` does not match the album-play pattern.
        """
        album_id = self.key
        detail_data = api_detail(album_id)
        detail_data = detail_data.get('detail')

        channel = detail_data.get('cats')
        title = detail_data.get('title')
        # Remove every whitespace character from the title.
        title = "".join(title.split())
        image = detail_data.get('img')
        url = detail_data.get('play_url')
        url_key = re.findall(r"http://www.tudou.com/albumplay/(.+)/.+\.html",
                             url)[0]
        album_url = "http://www.tudou.com/albumcover/%s.html" % url_key
        if channel == u"动漫":
            actors = detail_data.get('seiyuu')
        else:
            actors = detail_data.get('performer')
        if channel == u"综艺":
            directors = detail_data.get('host')
        else:
            directors = detail_data.get('director')
        categories = detail_data.get('genre')
        # A missing or empty area list yields no region instead of crashing
        # the whole crawl on ``None[0]`` / ``[][0]``.
        area = detail_data.get('area')
        region = area[0] if area else None
        description = detail_data.get('desc')
        description = "".join(description.split())

        showdate = detail_data.get('showdate')
        if showdate:
            # Known release year.
            pubtime = datetime.strptime(str(showdate), "%Y")
        else:
            # Unknown release year (0 or missing): fall back to the Unix epoch.
            pubtime = datetime.utcfromtimestamp(0)

        videos = get_videos(album_id, url_key)

        model = VideoSourceModel({
            "source": self.data.get('source'),
            "source_id": album_id,
            "title": title,
            "image": image,
            "url": album_url,
            "actors": actors,
            "directors": directors,
            "categories": categories,
            "channel": channel,
            "region": region,
            "description": description,
            "pubtime": pubtime,
            "videos": videos,
        })

        export(model)
        self.data['to_album_id'] = model['to_album_id']
Пример #11
0
    def crawl(self):
        """Scrape the album page for ``self.key`` and export it with its videos.

        Every ``qvod://`` link found on the page becomes a ``VideoItemModel``;
        the label/value paragraphs of the ``videoDetail`` block are collected
        into a dict that supplies actors, directors, release year and the
        serialisation status.

        Raises:
            ValueError / IndexError: if a ``qvod://`` link does not have the
                ``<size>|<...>|<file name>`` layout this code reads.
        """
        album_url = "http://zyqvod.com/vod/index.asp?id=%s" % self.key
        hxs = load_html(album_url)

        urls = hxs.select("//div[@class='movievod']/li/input/@value").extract()
        videos = []
        for url in urls:
            m = re.match("qvod://(.+)", url)
            if not m:
                continue
            words = m.group(1).split("|")
            size = int(words[0])
            # words[1] (an md5 according to the original author's note) is unused.
            title = words[2].split(".")[0]

            videos.append(VideoItemModel({
                "title": title,
                "url": url,
                "stream": [{"url": url, "format": "qvod", "size": size}],
            }))

        # Map the first text node of each detail paragraph to the second one.
        kv = {}
        for s in hxs.select("//div[@class='videoDetail']/p"):
            texts = s.select(".//text()").extract()
            if len(texts) >= 2:
                kv[texts[0].strip()] = texts[1].strip()

        description = "\n".join(hxs.select("//div[@class='movievod']/p[2]/text()").extract())
        try:
            image = hxs.select("//div[@class='videoPic']/img/@src").extract()[0]
        except IndexError:
            # The page has no poster image.
            image = None

        # NOTE(review): ``split`` is not str.split -- presumably a module-level
        # helper that tolerates None; confirm against its definition.
        model = VideoSourceModel({
            "source": self.data['source'],
            "source_id": self.key,
            "title": self.data["title"],
            "time": self.data.get('time'),
            "url": album_url,
            "image": image,
            "categories": [self.data.get('category')],
            "channel": self.data.get('category'),
            "region": self.data.get('region'),
            "videos": videos,
            "actors": split(kv.get(u'影片主演:')),
            "directors": split(kv.get(u'影片导演:')),
            "pubtime": parse_date(kv.get(u'上映年份:')),
            "description": description,
            # The dict literal used to list "completed" twice; only this last
            # value ever took effect, so the dead
            # ``self.data.get('completed')`` entry was removed.
            "completed": not kv.get(u'连载状态:'),
        })
        export(model)
Пример #12
0
def import_douban():
    """Import every document of the ``douban.album`` collection as a movie.

    Each document is mapped onto a ``VideoSourceModel`` (source ``"douban"``),
    ``on_import()`` is called on it and the cleaned title is printed.

    Raises:
        KeyError: if a document lacks one of the fields read with ``item[...]``.
        IndexError: if the document ``url`` contains no ``/<digits>/`` segment
            to use as ``source_id``.
    """
    from contentservice.utils.datetimeutil import parse_date
    from contentservice.settings import MONGO_CONN_STR
    db = MongoClient(MONGO_CONN_STR).douban

    # NOTE(review): interactive debugger breakpoint -- execution pauses here
    # until resumed; confirm it is still wanted.
    pdb.set_trace()

    # Compiled once here instead of on every clean_title() call.
    zh_pattern = re.compile(u'[\u4e00-\u9fa5]+')

    def clean_title(title):
        """Keep only the first space-separated word of a title that contains
        characters in the U+4E00..U+9FA5 range; return other titles as-is."""
        if zh_pattern.search(title):
            return title.split(" ")[0]
        return title

    for item in db.album.find():
        pubtime = None
        if item['pub_time']:
            # Drop any parenthesised part of the first entry before parsing.
            pubtime = parse_date(re.sub(r"\(.*\)", "", item['pub_time'][0]))

        model = VideoSourceModel({
            "title": clean_title(item['title']),
            "categories": item['sub_category'],
            "image": item["img"],
            "related": item["related"],
            "score": item["score"],
            "actors": item["actors"],
            # Only the first listed area is kept.
            "region": item["area"][0] if item["area"] else None,
            "url": item["url"],
            "description": item["description"],
            "pubtime": pubtime,
            "channel": u"电影",
            "source": "douban",
            # First run of digits enclosed in slashes in the URL.
            "source_id": re.findall(r"/(\d+)/", item['url'])[0],
        })
        model.on_import()
        print(model['title'])
Пример #13
0
 def extract_model(self, item, detail=None):
     """Build a ``VideoSourceModel`` from a listing *item* and optional *detail*.

     Args:
         item: listing record; must contain ``s_url`` (when *detail* is not
             given), ``cname``, ``vcount`` and ``totalSet``.
         detail: optional detail record.  When truthy it supplies the title,
             images, description, URL, price and release year instead of
             *item*.

     Returns:
         The populated ``VideoSourceModel`` (it is not exported here).
     """
     try:
         pubtime = datetime.strptime(str(detail.get("tv_year")),
                                     "%Y") if detail else None
     except ValueError:
         # "tv_year" is missing or not a plain year.  Narrowed from a bare
         # ``except`` so unrelated errors are no longer hidden.
         pubtime = None

     # Record that supplies the fields available in both item and detail.
     primary = detail if detail else item

     model = VideoSourceModel({
         "source": self.data['source'],
         "source_id": item.get("sid"),
         "title": detail.get("tv_name") if detail else item.get("albumTitle"),
         "image": primary.get("ver_big_pic"),
         "image2": primary.get("hor_big_pic"),
         "description": primary.get("tv_desc"),
         # Semicolon-separated name lists; missing or empty values give [].
         "directors": item["director"].split(";") if item.get("director") else [],
         "actors": item["main_actor"].split(";") if item.get("main_actor") else [],
         "region": item.get("area"),
         "url": primary["s_url"],
         "categories": item["tv_cont_cats"].split(";")
         if item.get("tv_cont_cats") else [],
         # Only the leading YYYY-MM-DD part of the timestamp is parsed.
         "time": datetime.strptime(
             item.get('tv_application_time', '1970-01-01')[:10],
             "%Y-%m-%d"),
         "price": primary.get("fee"),
         "channel": item["cname"],
         # Completed once the available count reaches the total count.
         "completed": item["vcount"] >= item["totalSet"],
         "visits": self.extract_visits(item.get("albumPC")),
         "score": item.get("tv_score"),
         "pubtime": pubtime,
     })
     return model
Пример #14
0
def update_region():
    """Re-export every document of ``content_video2.video.source``.

    Each stored document is copied field by field into a fresh
    ``VideoSourceModel`` and passed to ``export``.  A running count of the
    documents exported so far is printed after each one and once more at the
    end.

    Raises:
        KeyError: if a stored document lacks any of the copied fields.
    """
    # Fields copied verbatim from the stored document into the new model.
    fields = (
        "videos", "image", "related", "duration", "title", "comments",
        "source", "score", "actors", "price", "channel", "description",
        "tags", "deleted", "completed", "visits", "favorites", "authorities",
        "categories", "created", "url", "region", "directors", "pubtime",
        "time", "source_id",
    )

    conn = Connection()
    db = conn.content_video2
    # Starts at 0 so the printed value is the number of documents actually
    # exported (it used to start at 1 and therefore reported one too many).
    count = 0
    for source_video in db.video.source.find():
        model = VideoSourceModel(
            dict((name, source_video[name]) for name in fields))
        export(model)
        count += 1
        print("count = %s" % count)
    print("count = %s" % count)
    print("map complete.")
Пример #15
0
    def process_album(self, item):
        """Export one album per play site and schedule a relation crawl.

        The plays of *item* whose site is listed in ``SITES`` are grouped by
        site; one ``VideoSourceModel`` is exported per site, all sharing the id
        extracted from the album link.  If at least one model was exported, a
        ``RelationCrawler`` job is scheduled with the title and URL of the
        last one.

        Args:
            item: album record providing ``link``, ``plays`` and the metadata
                fields read below.

        Raises:
            IndexError: if ``item['link']`` does not contain ``f_<id>.html``.
        """
        fangying_id = re.findall(r"f_(.+)\.html", item['link'])[0]

        sites = {}
        for play in item['plays']:
            site = play['site']
            if site not in SITES:
                continue

            if "fangying.com" in play["url"]:
                # Links that point back to fangying.com carry no stream.
                stream = []
            else:
                stream_format = "thunder" if site == "thunder" else ""
                stream = [{"url": play["url"], "format": stream_format}]

            video = VideoItemModel({
                "title": play["title"],
                "url": play["url"],
                "stream": stream,
            })

            sites.setdefault(site, []).append(dict(video))

        model = None
        for site, videos in sites.items():
            model = VideoSourceModel({
                "source": self.data['source'],
                "source_id": fangying_id,
                "videos": videos,
                "title": item['title'],
                # Slash-separated lists in the source record.
                "directors": item['directors'].split("/"),
                "actors": item['performers'].split("/"),
                "description": item['description'],
                'categories': item['genres'].split("/"),
                # Only the first listed country is kept.
                'region': item['countries'].split("/")[0],
                'duration': parse_duration(item['duration']),
                'image': item['avatar_middle'],
                'score': float(item['douban_rating'])
                if item.get('douban_rating') else None,
                'url': item['link'],
                'price': 0.0,
                'pubtime': parse_pubtime(item['release_time']),
                'channel': CHANNELS.get(self.key)
            })
            export(model)

        # ``model`` is the last exported one, or None if no site qualified.
        if model:
            Scheduler.schedule(RelationCrawler.type,
                               key=fangying_id,
                               data={
                                   'title': model['title'],
                                   'url': model['url']
                               })
Пример #16
0
def import_mtime():
    """Import every document of the ``mtime.album`` collection.

    Each document is normalised (single area, list-valued categories/tags,
    newline-joined description), mapped onto a ``VideoSourceModel`` with source
    ``"mtime"`` and imported through ``on_import()``.  Any exception raised
    while building or importing one item is printed and the loop continues.
    """
    from datetime import datetime
    from contentservice.models.video import VideoSourceModel
    from contentservice.settings import MONGO_CONN_STR
    from contentservice.utils.datetimeutil import parse_date
    db = MongoClient(MONGO_CONN_STR).mtime
    # NOTE(review): interactive debugger breakpoint -- confirm it is still wanted.
    pdb.set_trace()

    for item in db.album.find():
        # A missing or empty area becomes None; a list is reduced to its first entry.
        area = item.get("area") or None
        if isinstance(area, list):
            area = area[0]
        categories = item.get("type")
        if isinstance(categories, basestring):
            categories = [categories]
        description = item.get("description")
        if isinstance(description, list):
            description = "\n".join(description)
        tags = item.get("tags")
        if isinstance(tags, basestring):
            tags = [tags]
        channel = ""
        if item.get("category_id") == "1":
            channel = u"电影"
        elif item.get("category_id") == "0":
            channel = u"电视剧"

        # Reset per item so a failed construction can never print the title
        # of the previous item (or hit an unbound name on the first one).
        model = None
        try:
            model = VideoSourceModel({
                "source": "mtime",
                "source_id": item["id"],
                "title": item["title"],
                "description": description,
                "tags": tags,
                "time": datetime.strptime(item["create_time"], "%Y-%m-%d %H:%M:%S"),
                "duration": item.get("duration"),
                "region": area,
                "directors": item.get("directors"),
                "score": item.get("score"),
                "actors": item.get("actors"),
                "categories": categories,
                "channel": channel,
                "url": "http://movie.mtime.com/%s/" % item["id"],
                "pubtime": parse_date(item["release_time"]),
            })
            model.on_import()
        except Exception as e:
            # Best-effort import: report the failure and move on.
            print(e)
        if model is not None:
            print(model["title"])
Пример #17
0
    def crawl(self):
        """Export the v.qq.com album (or short video) identified by ``self.key``.

        Short-video channels (those listed in ``SHORT_VIDEO``) are exported as
        a one-video album built entirely from ``self.data``.  For every other
        channel the album record is fetched with ``api_album``; its videos come
        either from the monthly column lists (``columnid`` set) or from the
        album's own ``videos`` list, and the album metadata comes from
        ``self.data`` or, failing that, from the scraped play page.  In both
        cases the exported model's ``to_album_id`` is stored in ``self.data``.
        """
        album_id = self.key
        if self.data['channel'] in SHORT_VIDEO:
            url = "http://v.qq.com/page/%s/%s/%s/%s.html" % (
                album_id[0], album_id[1], album_id[-1], album_id)
            pubtime = datetime.strptime(
                self.data["pubtime"], "%Y-%m-%d %H:%M:%S")
            video = VideoItemModel({
                "title": self.data["title"],
                "url": url,
                "stream": [{
                           "url": "javascript:getUrl('tencent', '%s')" % url
                           }],
                "image": self.data["image"],
                "channel": self.data["channel"],
            })
            model = VideoSourceModel({
                                     "source": self.data["source"],
                                     "source_id": album_id,
                                     "title": self.data["title"],
                                     "url": url,
                                     "image": self.data["image"],
                                     "channel": self.data["channel"],
                                     "pubtime": pubtime,
                                     "videos": [video]
                                     })
            export(model)
            self.data['to_album_id'] = model['to_album_id']
        else:
            album_url = "http://v.qq.com/detail/%s/%s.html" % (
                album_id[0], album_id)
            album_data = api_album(album_id[0], album_id)
            # Trailers are served under /prev/, everything else under /cover/.
            if album_data['trailer'] == 1:
                play_url = "http://v.qq.com/prev/%s/%s" % (
                    album_id[0], album_id)
            else:
                play_url = "http://v.qq.com/cover/%s/%s" % (
                    album_id[0], album_id)
            description = album_data.get("columndesc")
            if not description:
                description = album_data.get("desc")
            # Remove every whitespace character from the description.
            description = "".join(description.split())
            try:
                pubtime = datetime.strptime(self.data.get("pubtime"), "%Y")
            except (TypeError, ValueError):
                # Missing (None) or not a plain year: fall back to the epoch.
                # Narrowed from a bare ``except`` so unrelated errors surface.
                pubtime = datetime.utcfromtimestamp(0)

            videos = []
            columnid = album_data.get('columnid')
            rely = album_data.get('rely')
            if columnid:  # columnid != 0
                # "rely" is iterated as a list of {year: [month, ...]} dicts;
                # each (year, month) pair names one video list of the column.
                for video_dict in rely:
                    for year, months in video_dict.iteritems():
                        for month in months:
                            videolist_id = "%s_%s" % (year, month)
                            videos_data = api_video(columnid, videolist_id)
                            for entry in videos_data['items']:
                                time = entry.get('date')
                                time = datetime.strptime(time, "%Y-%m-%d")
                                url = "http://v.qq.com/cover/%s/%s.html" % (
                                    entry.get('coverid')[0], entry.get('coverid'))
                                video = VideoItemModel({
                                    "title": entry.get('sectitle'),
                                    # 'breif' (sic) is the key read from the API record.
                                    "description": entry.get('breif'),
                                    "url": url,
                                    "stream": [{
                                               "url": "javascript:getUrl('tencent', '%s')" % url
                                               }],
                                    "image": entry.get('snapurl'),
                                    "time": time
                                })
                                videos.append(video)
            else:  # columnid == 0: the videos are listed on the album itself
                for video in album_data['videos']:
                    videos.append(clean_video(video, play_url))

            # self.data is not empty: build the album from it.
            if self.data:
                model = VideoSourceModel({
                    "source": self.data.get('source'),
                    "source_id": album_id,
                    "title": album_data['columnname'] if album_data['columnname'] else self.data["title"],
                    "image": self.data.get("image"),
                    "url": album_url,
                    "actors": self.data.get("actors"),
                    "directors": self.data.get("directors"),
                    "categories": self.data.get("categories"),
                    "channel": self.data.get("channel"),
                    "region": self.data.get("region"),
                    "description": description,
                    "pubtime": pubtime,
                    "videos": videos,
                })
            # self.data is empty: scrape the play page
            # (http://v.qq.com/cover/x/xxxxx.html) first, then export.
            # NOTE(review): self.data['channel'] is read at the top of this
            # method, so an empty/None self.data would presumably have raised
            # before reaching this branch -- confirm whether it is reachable.
            else:
                hxs = load_html(play_url)
                channel = hxs.select(
                    "//div[@class='mod_crumbs']/a[1]/text()").extract()[0]
                album_hxs = hxs.select(
                    "//div[@class='mod_video_intro mod_video_intro_rich']")
                image = album_hxs.select("a/img/@src").extract()[0]
                title = album_hxs.select(
                    "div[@class='video_title']/strong/a/text()").extract()[0]
                directors = []
                for director_hxs in album_hxs.select("//div[@itemprop='director']/a"):
                    director = director_hxs.select("span/text()").extract()[0]
                    directors.append(director)
                actors = []
                for actor_hxs in album_hxs.select("//div[@itemprop='actors']/a"):
                    actor = actor_hxs.select("span/text()").extract()[0]
                    actors.append(actor)
                region = album_hxs.select(
                    "//div[@class='info_area']/span[@class='content']/a/text()").extract()[0]
                categories = []
                for categorie_hxs in album_hxs.select("//div[@class='info_category']/span[@class='content']/a"):
                    categorie = categorie_hxs.select("text()").extract()[0]
                    categories.append(categorie)
                pubtime = album_hxs.select(
                    "//div[@class='info_years']/span[@class='content']/a/text()").extract()[0]
                # Only an all-digit value is parsed as a year.
                if re.match(r"^\d+$", pubtime):
                    pubtime = datetime.strptime(pubtime, "%Y")
                else:
                    pubtime = datetime.utcfromtimestamp(0)

                model = VideoSourceModel({
                    "source": self.data.get('source'),
                    "source_id": album_id,
                    "title": title,
                    "image": image,
                    "url": album_url,
                    "actors": actors,
                    "directors": directors,
                    "categories": categories,
                    "channel": channel,
                    "region": region,
                    "description": description,
                    "pubtime": pubtime,
                    "videos": videos,
                })
            export(model)
            self.data['to_album_id'] = model['to_album_id']
Пример #18
0
    def crawl_show(self, show_id):
        """Build a VideoSourceModel for one Youku show.

        The detail record is fetched with ``api_detail(show_id)``.  When the
        record's ``cats`` value is one of the ``MAIN_CHANNELS`` values, the
        model is keyed by the record's own ``showid`` and links to the show
        page; otherwise it is keyed by ``show_id`` and links to the single
        video page.  All other fields are filled identically in both cases.

        :param show_id: identifier passed to ``api_detail``.
        :returns: the populated ``VideoSourceModel`` (nothing is exported here).
        :raises KeyError: if ``self.data`` lacks ``source`` or the detail
            record lacks ``cats``, ``title`` or ``img`` (or ``showid`` for a
            main-channel show).
        """
        def parse_number(text):
            # Keep only digits and dots before converting the counter text.
            if not text:
                return 0
            text = re.sub(r"[^0-9\.]", "", text)
            try:
                return float(text)
            except ValueError:
                # Nothing numeric survived the cleanup (or it is malformed,
                # e.g. "1.2.3"): treat it like a missing counter.
                return 0

        detail = api_detail(show_id)['detail']

        try:
            # A "-00" month/day placeholder is not a valid date component;
            # map it to "-01" so strptime accepts it.
            pubtime = datetime.strptime(
                detail.get('showdate', '1970-01-01').replace("-00", "-01"),
                "%Y-%m-%d")
        except (AttributeError, TypeError, ValueError):
            # Missing (None) or unparsable date.
            pubtime = None

        # Only the identifier and the page URL depend on the kind of show.
        if detail['cats'] in MAIN_CHANNELS.values():
            source_id = detail['showid']
            url = "http://www.youku.com/show_page/id_z%s.html" % source_id
        else:
            source_id = show_id
            url = "http://v.youku.com/v_show/id_%s.html" % show_id

        return VideoSourceModel({
            'source': self.data['source'],
            'source_id': source_id,
            'url': url,
            'title': detail['title'],
            'duration': None,
            'visits': parse_number(detail.get('total_vv')),
            'comments': parse_number(detail.get('total_comment')),
            'score': parse_number(detail.get('reputation')),
            'favorites': parse_number(detail.get('total_fav')),
            'image': detail['img'],
            'region': detail['area'][0] if detail.get('area') else None,
            'categories': detail.get('genre', []),
            'description': detail.get('desc'),
            'completed': detail.get('completed') == 1,
            'actors': detail.get('performer', []),
            'directors': detail.get('director', []),
            'pubtime': pubtime,
            'channel': detail.get('cats', ''),
        })
Пример #19
0
    def crawl(self):
        """Crawl one 56.com entry and export it as a VideoSourceModel.

        ``self.key`` is used as the album/video id and ``self.data`` supplies
        the title, channel and other metadata.  For a long-video channel the
        album is fetched through ``api_album`` and every entry of its video
        list is passed through ``clean_video``; for any other channel a single
        ``VideoItemModel`` is built from ``self.data`` alone.  The exported
        model's ``to_album_id`` is written back into ``self.data``.

        :raises KeyError: if ``self.data`` has no ``title`` or the API
            response lacks its ``data`` entries.
        """
        album_type = 4  # first argument of api_album; its meaning is not visible here
        album_id = self.key
        title = self.data['title'].encode('utf-8')
        channel = self.data.get('channel')

        # dict.items() yields (key, value) pairs, so a plain channel value
        # could never be a member and this branch was unreachable.
        # NOTE(review): assumes LONG_VIDEO_CHANNELS maps ids to channel names,
        # like the MAIN_CHANNELS.values() check in the Youku crawler - confirm.
        if channel in LONG_VIDEO_CHANNELS.values():
            album_data = api_album(album_type, album_id, title)['data']

            try:
                pubtime = datetime.strptime(
                    album_data.get("public_time"), "%Y%m%d")
            except (TypeError, ValueError):
                # Missing (None) or unparsable release date.
                pubtime = None

            videos = [clean_video(video) for video in album_data['data']]
            regions = album_data.get("zname")

            model = VideoSourceModel({
                "source": self.data.get('source'),
                "source_id": album_id,
                "title": title,
                "image": album_data.get("bpic"),
                "image2": album_data.get("mpic"),
                "url": album_data.get("web_url"),
                "actors": album_data.get("actors"),
                "directors": album_data.get("director"),
                "categories": album_data.get("tname"),
                "tags": self.data.get("tags"),
                "channel": channel,
                # Only the first region is kept; None when the API gave none.
                "region": regions[0] if regions else None,
                "description": album_data.get("introduce"),
                "pubtime": pubtime,
                "videos": videos,
            })
        else:
            video = VideoItemModel({
                "title": title,
                "description": self.data.get("description"),
                "url": "http://www.56.com/u13/v_%s.html" % album_id,
                "stream": [{
                    "url":
                    "http://vxml.56.com/html5/%s/?src=3g&res=qqvga" % album_id
                }],
                "stream_high": [{
                    "url":
                    "http://vxml.56.com/html5/%s/?src=3g&res=qvga" % album_id
                }],
            })
            model = VideoSourceModel({
                "source": self.data.get('source'),
                "source_id": album_id,
                "title": title,
                "image": self.data.get("bpic"),
                "image2": self.data.get("mpic"),
                "tags": self.data.get("tags"),
                "url": self.data.get("web_url"),
                "channel": channel,
                "description": self.data.get("introduce"),
                "videos": [video],
            })
        export(model)
        self.data['to_album_id'] = model['to_album_id']
Пример #20
0
    def crawl(self):
        """Crawl one 265zy.com album page and export it as a VideoSourceModel.

        The page for ``self.key`` is loaded with ``load_html``.  Every
        ``qvod://`` link found on it becomes a ``VideoItemModel``; the
        two-column detail table is read into a label -> value dict from which
        actors, release date and completion state are taken.  Title, category,
        region and update time come from ``self.data``.

        :raises KeyError: if ``self.data`` lacks ``source`` or ``title``.
        """
        album_url = "http://www.265zy.com/detail/?%s.html" % self.key
        hxs = load_html(album_url)

        videos = []
        urls = hxs.select(
            "//td[@class='bt']/.//input[@id='copy_yah']/@value").extract()
        for url in urls:
            m = re.match("qvod://(.+)", url)
            if not m:
                continue
            # Link payload is "<size>|<md5>|<file name>"; the md5 is unused.
            words = m.group(1).split("|")
            try:
                size = int(words[0])
                title = words[2].split(".")[0]
            except (ValueError, IndexError):
                # A malformed link should not abort the whole album.
                continue

            videos.append(
                VideoItemModel({
                    "title": title,
                    "url": url,
                    "stream": [{
                        "url": url,
                        "format": "qvod",
                        "size": size
                    }],
                }))

        # Detail table rows: first text node is the label, second the value.
        kv = {}
        for row in hxs.select("/html/body/table[2]/tr[1]/td[2]/table/tr"):
            texts = row.select(".//text()").extract()
            if len(texts) >= 2:
                kv[texts[0].strip()] = texts[1].strip()

        description = "\n".join(
            hxs.select("//div[@class='intro']/.//text()").extract())

        # The cover image is optional; its src is resolved against the site root.
        image_srcs = hxs.select("//div[@class='img']/img/@src").extract()
        if image_srcs:
            image = urlparse.urljoin("http://www.265zy.com/", image_srcs[0])
        else:
            image = None

        model = VideoSourceModel({
            "source": self.data['source'],
            "source_id": self.key,
            "title": self.data["title"],
            "image": image,
            "url": album_url,
            "time": self.data.get('time'),
            "categories": [self.data.get('category')],
            "channel": self.data.get('category'),
            "region": self.data.get('region'),
            "videos": videos,
            "actors": split(kv.get(u"影片演员:")),
            "pubtime": parse_date(kv.get(u"上映日期:")),
            # Completed unless the status text contains the substring below.
            "completed": kv.get(u"影片状态:", "").find(u"连载") == -1,
            "description": description,
        })
        export(model)
Пример #21
0
def extract_album(album, source):
    """Convert an iQiyi album record into a VideoSourceModel.

    :param album: attribute-style album record; when it has an ``_a``
        attribute, that inner object is used instead.
    :param source: value stored in the model's ``source`` field.
    :returns: the populated ``VideoSourceModel``.
    :raises ValueError: if ``year``, ``vv``, ``_dn``, ``_sc`` or ``fst_time``
        cannot be parsed.
    """
    if hasattr(album, "_a"):
        album = album._a

    channel = _CHANNEL_DCT.get(int(album._cid), '')

    # The mobile page URL needs the channel's pinyin slug; without one the
    # URL is left empty.
    channel_py = _CHANNEL_PINYIN.get(channel)
    if channel_py:
        url = "http://m.iqiyi.com/%s/a/%s.html" % (channel_py, album._id)
    else:
        url = ""

    # `year` is either "YYYY" or "YYYYMMDD"; any other length is unknown.
    year = album.year
    if len(year) == 4:
        pubtime = datetime.strptime(year, "%Y")
    elif len(year) == 8:
        pubtime = datetime.strptime(year, "%Y%m%d")
    else:
        pubtime = None

    # A non-numeric view count loses its last character (presumably a unit
    # suffix - confirm against the API); the number is then scaled by 10000.
    raw_visits = album.vv
    if not re.match(r"^\d+$", raw_visits):
        raw_visits = raw_visits[:-1]
    visits = int(raw_visits) * 10000

    item = VideoSourceModel({
        'source': source,
        'source_id': album._id,
        'url': url,
        'title': album.clm if hasattr(album, 'clm') and album.clm else album._t,
        'duration': int(album._dn),
        'visits': visits,
        'score': float(album._sc),
        'image': album._img,
        'tags': album.tag.split() if hasattr(album, "tag") else [],
        'channel': channel,
        'description': album.desc if hasattr(album, "desc") else "",
        'directors': album._da.split(",") if hasattr(album, "_da") else [],
        'actors': album._ma.split(",") if hasattr(album, "_ma") else [],
        # last update time for iqiyi
        'time': datetime.strptime(getattr(album, "fst_time", "1970-01-01"),
                                  "%Y-%m-%d"),
        # The literal used to list 'price' twice; the later 0.0 always won,
        # so the discarded float(album.t_pc) entry (which could still raise)
        # has been removed.
        'price': 0.0,
        'pubtime': pubtime,
    })

    return item
Пример #22
0
    def crawl(self):
        """Crawl one Letv album and export it as a VideoSourceModel.

        Album metadata comes from ``api_album`` and the episode list from
        ``api_series``; ``self.key`` is the album's source id, and channel
        and image are taken from ``self.data``.  The exported model's
        ``to_album_id`` is written back into ``self.data``.

        :raises KeyError: if ``self.data`` lacks ``channel`` or ``image``, or
            the API responses lack their ``body`` / ``videoInfo`` entries.
        """
        def split_words(value):
            # Space-separated API field -> list; a missing field becomes [].
            return value.split(" ") if value is not None else []

        source_id = self.key
        album_data = api_album(source_id, pcode, version)['body']
        title = album_data.get("nameCn")

        # The release date is either a bare year or YYYY-MM-DD; anything
        # else (including a missing value) falls back to the Unix epoch.
        release = album_data.get("releaseDate") or ""
        if re.match(r"^\d+$", release):
            pubtime = datetime.strptime(release, "%Y")
        elif re.match(r"^\d+-\d+-\d+$", release):
            pubtime = datetime.strptime(release, "%Y-%m-%d")
        else:
            pubtime = datetime.utcfromtimestamp(0)

        directors = split_words(album_data.get("directory"))
        actors = split_words(album_data.get("starring"))
        # All whitespace is stripped from the description.
        desc = "".join((album_data.get("description") or "").split())
        region = album_data.get("area")
        categories = split_words(album_data.get("subCategory"))
        tags = split_words(album_data.get("tag"))
        album_url = "http://so.letv.com/tv/%s.html" % source_id

        # Positional arguments of api_series; their meaning is not visible
        # here (presumably paging/ordering - confirm against the API helper).
        b = 1
        s = 60
        o = -1
        m = 0
        series_data = api_series(source_id, b, s, o, m, pcode, version)

        videos = []
        for series in series_data['body']['videoInfo']:
            # A separate name keeps the episode URL from overwriting the
            # album URL that goes into the model below.
            video_url = "http://www.letv.com/ptv/vplay/%s.html" % series['id']
            vurl = "http://dynamic.app.m.letv.com/android/dynamic.php?mod=minfo&ctl=videofile&act=index&mmsid=%s&pcode=%s&version=%s" % (
                series['mid'], pcode, version)
            jsurl = "javascript:getUrl('letv', '%s')" % vurl
            videos.append(VideoItemModel({
                "title": series.get("nameCn"),
                "url": video_url,
                "stream": [{
                    "url": jsurl
                }],
                "image": series.get("picAll"),
                "duration": series.get("duration")
            }))

        model = VideoSourceModel({
            "source_id": source_id,
            "source": self.data.get('source'),
            "url": album_url,
            "channel": self.data['channel'],
            'title': title,
            "image": self.data['image'],
            "pubtime": pubtime,
            "directors": directors,
            "actors": actors,
            # NOTE(review): the other crawlers store this text under
            # "description"; confirm that "desc" is a field of the model.
            "desc": desc,
            "region": region,
            "categories": categories,
            "tags": tags,
            "videos": videos
        })
        export(model)
        self.data['to_album_id'] = model['to_album_id']
Пример #23
0
    def crawl(self):
        """Crawl one bdzy.cc album page and export it as a VideoSourceModel.

        The page for ``self.key`` is loaded with ``load_html``.  Every
        ``bdhd://`` link found on it becomes a ``VideoItemModel``; the
        two-column detail table is read into a label -> value dict from which
        actors, release date and completion state are taken.  Title, category,
        region and update time come from ``self.data``.

        :raises KeyError: if ``self.data`` lacks ``source`` or ``title``.
        """
        # The key is the album's ID on the source site.
        album_id = self.key

        album_url = "http://bdzy.cc/detail/?%s.html" % album_id
        hxs = load_html(album_url)

        urls = hxs.select("//td[@class='bt']/.//li/input/@value").extract()
        videos = []
        for url in urls:
            m = re.match("bdhd://(.+)", url)
            if not m:
                continue
            # Link payload is "<size>|<md5>|<file name>"; the md5 is unused.
            words = m.group(1).split("|")
            try:
                size = int(words[0])
                title = words[2].split(".")[0]
            except (ValueError, IndexError):
                # A malformed link should not abort the whole album.
                continue

            # One episode of the album.
            video = VideoItemModel({
                "title": title,
                # Web page URL (none is available here, so the playback
                # URL is used instead).
                "url": url,
                "stream": [{
                    "url": url,  # playback URL of the video file
                    "size": size,
                    "format": "bdhd"  # video format (protocol)
                }],
            })

            videos.append(video)

        # Detail table rows: first text node is the label, second the value.
        kv = {}
        for row in hxs.select("/html/body/table[2]/tr[1]/td[2]/table/tr"):
            texts = row.select(".//text()").extract()
            if len(texts) >= 2:
                kv[texts[0].strip()] = texts[1].strip()

        description = "\n".join(
            hxs.select("//div[@class='intro']/.//text()").extract())

        # The cover image is optional.
        image_srcs = hxs.select(
            "/html/body/table[2]/tr[1]/td[1]/img/@src").extract()
        image = image_srcs[0] if image_srcs else None

        # Data model used to export the album.
        model = VideoSourceModel({
            "source": self.data['source'],  # video source
            "source_id": album_id,  # ID on the source site
            "title": self.data["title"],
            "url": album_url,  # web page URL
            "image": image,  # image URL
            "time": self.data.get('time'),  # last update time on the source site
            "categories": [self.data.get('category')],  # categories
            "channel": self.data.get('category'),  # channel
            "region": self.data.get('region'),  # region
            "videos": videos,  # list of the album's videos
            "pubtime": parse_date(kv.get(u"上映日期:")),  # release date
            "actors": split(kv.get(u"影片演员:")),
            # Whether the series is finished.
            "completed": kv.get(u"影片状态:", "").find(u"连载") == -1,
            "description": description,
        })
        # Export the data.
        export(model)