示例#1
0
    def parse_tv_show(self, r):
        """Extract the detail-list page URL of a show from its play page.

        Args:
            r: HTML text of the play page.

        Returns:
            The ``__dict__`` of a ``Contents`` object with ``detail_url``
            filled in, or ``None`` when the page cannot be parsed or no
            detail link is found.
        """
        data = Contents()
        try:
            page = etree.HTML(r)
        except Exception:
            return None

        # Preferred source: the anchor inside the "tvinfo" heading. A missing
        # tree or a link without href falls through to the text search below
        # instead of raising.
        anchors = []
        if page is not None:
            anchors = page.xpath('//div[@class="tvinfo"]/h2/a')
        href = anchors[0].get('href') if anchors else None
        if href:
            data.detail_url = "http:" + href
            return data.__dict__

        # Fallback: search the raw text for the protocol-relative show URL.
        try:
            found = re.search(
                u'tvinfo.* *\n*(//list\.youku\.com/show/id_\w*\d*\.html)', r)
        except TypeError:
            # r is not text that the pattern can be applied to.
            return None
        if found is None:
            return None
        data.detail_url = 'http:' + found.group(1)
        return data.__dict__
示例#2
0
    def merge_fields(self, info):
        """Flatten an iQiyi info dict into a ``Contents`` field dict.

        Args:
            info: dict of album/video metadata. Keys read here are ``name``,
                ``description``, ``tvId``, ``vid``, ``playCount``,
                ``albumId``, ``url``, ``duration``, the image URL keys,
                ``issueTime``, ``crumbList``, ``categories``, ``videoCount``,
                ``subtitle``, ``commentCount``, ``qitanId``, ``cast``,
                ``focuses`` and ``score``. Every key is treated as optional.

        Returns:
            The ``__dict__`` of the populated ``Contents`` object.
        """
        L = Contents()
        L.title = info.get("name")
        L.summary = info.get("description")
        L.iqiyi_tvId = info.get("tvId")
        L.iqiyi_vid = info.get("vid")
        L.iqiyi_plays_num = info.get("playCount")
        L.iqiyi_albumId = info.get("albumId")
        L.iqiyi_play_url = info.get("url")

        duration = info.get("duration")
        if duration:
            # presumably seconds -> minutes; TODO confirm the unit upstream
            L.duration = duration / 60

        # Collect every available image; the album image doubles as img_url.
        L.poster = []
        if info.get("albumImageUrl"):
            L.img_url = info.get("albumImageUrl")
        for image_key in ("albumImageUrl", "imageUrl", "videoImageUrl",
                          "posterUrl", "tvImageUrl", "qualityImageUrl"):
            image_url = info.get(image_key)
            if image_url:
                L.poster.append({"url": image_url})

        # .get() so that a missing key is skipped like every other field.
        issue_time = info.get("issueTime")
        if issue_time:
            L.release_date = mictime_to_ymd(issue_time)

        # Category: last level-2 crumb that is not the VIP label, otherwise
        # the last level-3 crumb.
        crumbs = info.get("crumbList") or []
        level2_titles = [c["title"] for c in crumbs
                         if int(c["level"]) == 2 and c["title"] != u'VIP会员']
        if level2_titles:
            L.category = level2_titles[-1]
        else:
            level3_titles = [c["title"] for c in crumbs
                             if int(c["level"]) == 3]
            if level3_titles:
                L.category = level3_titles[-1]

        # subName decides which field a category entry feeds.
        tag_markers = (u"类型", u'风格', u'分类', u'小学', u'高中', u'短片')
        tag_names = []
        for entry in info.get("categories") or []:
            sub_name = entry.get("subName") or u""
            if u"地区" in sub_name:
                L.area = area_process(entry.get("name"))
            elif any(marker in sub_name for marker in tag_markers):
                tag_names.append(entry.get("name"))
            elif u"语种" in sub_name:
                L.language = language_process(entry.get("name"))
            elif sub_name == u"年龄段":
                L.age = entry.get("name")
        L.tags = ",".join(name for name in tag_names if name)

        L.all_episode = info.get("videoCount")
        L.sub_title = info.get("subtitle")
        L.iqiyi_rating_num = info.get("commentCount")
        L.iqiyi_qitanId = info.get("qitanId")

        # (key in info["cast"], comma-joined names attribute, detail list
        # attribute). "writers" are the screenwriters.
        cast_roles = (
            ("directors", "directors", "directors_list"),
            ("speakers", "speakers", "speakers_list"),
            ("publishers", "publishers", "publishers_list"),
            ("singers", "singers", "singers_list"),
            ("mainActors", "starring", "starring_list"),
            ("writers", "screenwriters", "screenwriter_list"),
            ("actors", "actors", "actors_list"),
            ("guests", "guests", "guests_list"),
            ("hosts", "hosts", "hosts_list"),
        )
        cast = info.get("cast") or {}
        for cast_key, names_attr, list_attr in cast_roles:
            people = cast.get(cast_key)
            if not people:
                continue
            setattr(L, list_attr, [{
                "name": person.get("name"),
                "iqiyi_id": person.get("id"),
                "avatar": person.get("imageUrl"),
                "iqiyi_userId": person.get("userId"),
                "iqiyi_circleId": person.get("circleId"),
            } for person in people])
            setattr(L, names_attr, ",".join(
                person.get("name") for person in people
                if person.get("name")))

        # Derive the year from the release date when it is not known yet.
        release_date = getattr(L, "release_date", None)
        if release_date and getattr(L, "year", None) is None:
            m = re.search(u'(\d{4})', release_date)
            if m:
                L.year = m.group(1)

        # No director information at all: fall back to the hosts.
        if (not getattr(L, "directors", None)
                and not getattr(L, "directors_list", None)
                and getattr(L, "hosts", None)):
            L.directors = L.hosts
            L.directors_list = L.hosts_list

        L.focuses = info.get("focuses")
        L.iqiyi_rating = info.get("score")
        L.created_at = time.time()

        return L.__dict__
示例#3
0
    def vdetail_parser(self, r, url=None):
        """Parse an iQiyi video detail page into a ``Contents`` field dict.

        Args:
            r: HTML text of the detail page.
            url: URL of the page; passed to ``self.get_vid_by_url`` to fill
                ``iqiyi_tvId``.

        Returns:
            The ``__dict__`` of the populated ``Contents`` object, or
            ``False`` when the HTML cannot be parsed.
        """
        try:
            page = etree.HTML(r)
        except Exception:
            return False
        if page is None:
            return False
        L = Contents()

        # xpath() returns a list, so pick a single element to query from and
        # fall back to the whole document. Every query below starts with
        # "//", i.e. it is evaluated document-wide whichever node is used.
        containers = page.xpath(u'//div[@class="mod_search_topic"]')
        if containers:
            containers = page.xpath(u'//div[@class="mod_reuslt"]')
        root = containers[0] if containers else page

        def em_links(label):
            # Anchors inside the <em> whose own text contains the label.
            return root.xpath(u'//em[contains(text(),"%s")]/a' % label)

        def unique(values):
            # De-duplicate while keeping first-seen order; drops empty values.
            seen = set()
            ordered = []
            for value in values:
                if value and value not in seen:
                    seen.add(value)
                    ordered.append(value)
            return ordered

        title = root.xpath(u'//h1[@class="main_title"]/a')
        if title:
            L.title = title[0].text

        sub_title = root.xpath(u'//span[@class="sub_title"]')
        if sub_title:
            L.sub_title = sub_title[0].text
            year = re.search(u'(\d{4})', L.sub_title or u'')
            if year:
                L.year = year.group(1)

        category = page.xpath(u'//a[@class="channelTag"]')
        if category:
            L.category = category[0].text

        area = em_links(u"地区")
        if area:
            L.area = area[0].text

        L.iqiyi_tvId = self.get_vid_by_url(url)

        language = em_links(u"语言")
        if language:
            L.language = language[0].text

        tags = em_links(u"类型")
        if tags:
            L.tags = ",".join(unique(x.text for x in tags))

        banben = em_links(u"版本")
        if banben:
            L.banben = banben[0].text

        # The summary is the first text node following the "简介" label.
        summary = root.xpath(
            u'//em[contains(text(),"简介")]/following::text()[1]')
        if summary:
            L.summary = parse_simple(summary[0])

        all_episode = em_links(u"集数")
        if all_episode:
            L.all_episode = self.parse_all_episode(all_episode[0].text)

        iqiyi_play_url = root.xpath(u'//a[contains(text(),"立即播放")]')
        if iqiyi_play_url:
            L.iqiyi_play_url = iqiyi_play_url[0].get("href")

        iqiyi_plays_num = root.xpath(u'//i[@id="widget-playcount"]')
        if iqiyi_plays_num:
            L.iqiyi_plays_num = iqiyi_plays_num[0].text

        peiyin = em_links(u"配音")
        if peiyin:
            L.peiyin_list = []
            names = []
            for x in peiyin:
                # Use the current anchor, not always the first one.
                names.append(x.text)
                L.peiyin_list.append({
                    "name": x.text,
                    "iqiyi_url": x.get("href"),
                    "iqiyi_id": self.get_starId_by_url(x.get("href")),
                })
            L.peiyin = ",".join(unique(names))

        iqiyi_rating = page.xpath(u'//span[@class="score_font"]')
        if iqiyi_rating:
            L.iqiyi_rating = iqiyi_rating[0].get("snsscore")

        poster = page.xpath(u'//div[contains(@_stat,"result_pic")]/img')
        if poster:
            img = poster[0]
            L.poster = {
                "url": img.get("src"),
                "width": img.get("width"),
                "height": img.get("height"),
                "name": img.get("alt"),
            }
            L.img_url = L.poster['url']

        return L.__dict__
示例#4
0
	def baike_parser(r,url=None):
		"""Parse a Baidu Baike entry page into a ``Contents`` field dict.

		Args:
			r: HTML text of the entry page.
			url: unused here; kept for the callers' signature.

		Returns:
			The ``__dict__`` of the populated ``Contents`` object, or
			``False`` when the HTML cannot be parsed.
		"""
		try:
			# Drop non-breaking spaces only; ordinary spaces must stay or
			# the markup itself would be corrupted.
			r = re.sub(u'\xa0', u'', r)
			page = etree.HTML(r)
		except Exception:
			return False
		if page is None:
			return False
		L = Contents()

		def value_node(label):
			# Element that follows the first <dt> whose text contains label.
			hits = page.xpath(u'//dt[contains(text(),"%s")]' % label)
			return hits[0].getnext() if hits else None

		def unique(values):
			# De-duplicate while keeping first-seen order; drops empty values.
			seen = set()
			ordered = []
			for value in values:
				if value and value not in seen:
					seen.add(value)
					ordered.append(value)
			return ordered

		def fill_people(label, names_attr, list_attr):
			# Linked names become a detail list plus a comma-joined string;
			# a plain-text value is stored as the string only.
			node = value_node(label)
			if node is None:
				return
			links = node.findall('a')
			if not links:
				setattr(L, names_attr, area_process(parse_simple(node.text)))
				return
			entries = []
			for a in links:
				href = a.get("href")
				entries.append({
					"name": parse_simple(a.text),
					"baike_id": a.get("data-lemmaid"),
					"baike_url": (u'https://baike.baidu.com' + href) if href else None,
				})
			setattr(L, list_attr, entries)
			setattr(L, names_attr, ",".join(unique(e["name"] for e in entries)))

		summary = page.xpath(u'//div[@class="lemmaWgt-lemmaSummary lemmaWgt-lemmaSummary-light"]')
		if summary:
			L.summary = summary[0].text

		node = value_node(u"中文名")
		if node is not None:
			L.title = parse_simple(node.text)

		node = value_node(u"外文名")
		if node is not None:
			L.foreign_title = parse_simple(node.text)

		node = value_node(u"出品公司")
		if node is not None:
			L.production_company = parse_simple(node.text)

		node = value_node(u"制片地区")
		if node is not None:
			L.producer_country = area_process(parse_simple(node.text))

		fill_people(u"导演", "directors", "directors_list")
		fill_people(u"编剧", "screenwriters", "screenwriter_list")
		fill_people(u"主演", "starring", "starring_list")

		node = value_node(u"其它译名")
		if node is not None:
			links = node.findall("a")
			if links:
				names = [parse_simple(a.text) for a in links]
				L.alias = ",".join([name for name in names if name])
			else:
				L.alias = parse_simple(node.text)

		node = value_node(u"类型")
		if node is not None:
			L.type = area_process(parse_simple(node.text))

		node = value_node(u"片长")
		if node is not None:
			L.duration = area_process(parse_simple(node.text))

		node = value_node(u"上映时间")
		if node is not None:
			L.release_date = area_process(parse_simple(node.text))

		node = value_node(u"语言")
		if node is not None:
			L.language = language_process(parse_simple(node.text))

		douban_rating = page.xpath(u'//span[contains(@class,"star-text")]')
		if douban_rating:
			L.douban_rating = douban_rating[0].text

		poster = page.xpath(u'//img[@alt="词条图片"]')
		if poster:
			L.poster = [{"url":poster[0].get("src"),"name":poster[0].get("alt")}]
			L.img_url = poster[0].get("src")

		actor_items = page.xpath(u'//ul[@class="actorList"]/li')
		if actor_items:
			starring_names = (getattr(L, "starring", None) or u"").split(',')
			L.actor_list = []
			starring_entries = []
			for item in actor_items:
				# Relative query: look inside this <li> only.
				links = item.xpath(u'.//dl[@class="info"]/a')
				if not links:
					continue
				link = links[0]
				img = item.find('.//img')
				href = link.get("href")
				# NOTE(review): this key is "baidu_url" while the lists built
				# from the info rows use "baike_url" -- confirm which one the
				# consumers expect before unifying.
				entry = {
					"avatar": img.get("src") if img is not None else None,
					"name": link.text,
					"baike_id": link.get("data-lemmaid"),
					"baidu_url": ("https://baike.baidu.com" + href) if href else None,
				}
				if entry['name'] in starring_names:
					starring_entries.append(entry)
				else:
					L.actor_list.append(entry)
			# Entries with avatars replace the starring list built above.
			if starring_entries:
				L.starring_list = starring_entries

		L.created_at = time.time()
		return L.__dict__
示例#5
0
    def vdetail_parser(self, r):
        """Parse a Douban subject page into a ``Contents`` field dict.

        Args:
            r: HTML text of the subject page.

        Returns:
            The ``__dict__`` of the populated ``Contents`` object, or
            ``False`` when the HTML cannot be parsed.
        """
        data = Contents()
        try:
            page = etree.HTML(r)
        except Exception:
            return False
        if page is None:
            return False

        def people_after(label):
            # Names and detail dicts from the anchors inside the element
            # that follows the first <span> containing label; None when
            # that span or its sibling is missing.
            spans = page.xpath(u'//span[contains(text(),"%s")]' % label)
            holder = spans[0].getnext() if spans else None
            if holder is None:
                return None
            names = []
            entries = []
            for a in holder.findall('a'):
                name = parse_simple(a.text)
                href = a.get("href") or ""
                entry = {}
                celebrity = re.search(u'/celebrity/(\d*)/', href)
                if celebrity:
                    entry["doubanid"] = celebrity.group(1)
                if href:
                    entry["douban_url"] = "https://movie.douban.com" + href
                entry["name"] = name
                names.append(name or u"")
                entries.append(entry)
            return ",".join(names).strip(','), entries

        def text_after(label):
            # First text node following the <span> containing label.
            hits = page.xpath(
                u'//span[contains(text(),"%s")]/following::text()[1]' % label)
            return hits[0] if hits else None

        def unique(values):
            # De-duplicate while keeping first-seen order; drops empty values.
            seen = set()
            ordered = []
            for value in values:
                if value and value not in seen:
                    seen.add(value)
                    ordered.append(value)
            return ordered

        year = re.search(u'<span class="year">\((\d{4})\)</span>', r)
        if year:
            data.year = year.group(1)
        title = re.search(u'<span property="v\:itemreviewed">(.*)</span>', r)
        if title:
            data.title = title.group(1)

        found = people_after(u"编剧")
        if found is not None:
            data.screenwriters, data.screenwriter_list = found

        found = people_after(u"导演")
        if found is not None:
            data.directors, data.directors_list = found

        found = people_after(u"主演")
        if found is not None:
            data.starring, data.starring_list = found

        # Tags are the user tags followed by the genre labels.
        genres = [parse_simple(x.text)
                  for x in page.xpath(u'//span[@property="v:genre"]')]
        user_tags = [parse_simple(x.text)
                     for x in page.xpath(u'//div[@class="tags-body"]/a')]
        data.tags = ",".join(unique(user_tags + genres))

        producer_country = text_after(u"制片国家/地区:")
        if producer_country is not None:
            data.producer_country = area_process(
                split_space(producer_country.replace('/', ',')))

        language = text_after(u"语言:")
        if language is not None:
            data.language = language_process(
                split_space(language.replace('/', ',')))

        all_episode = text_after(u"集数:")
        if all_episode is not None:
            m = re.search(u'(\d{1,})', all_episode.replace(" ", ""))
            if m:
                data.all_episode = m.group(1)

        episode_time = text_after(u"单集片长:")
        if episode_time is not None:
            m = re.search(u'(\d{1,})', episode_time.replace(" ", ""))
            if m:
                data.duration = m.group(1)

        season = page.xpath(
            u'//select[@id="season"]/option[@selected="selected"]')
        if season:
            data.season = season[0].text

        release_date_el = page.xpath(
            u'//span[@property="v:initialReleaseDate"]')
        if release_date_el:
            release_date = "|".join(
                parse_simple(x.text) or u"" for x in release_date_el
            ).strip('|')
            # Prefer a bare YYYY-MM-DD date; otherwise keep the raw text.
            m = re.search(u'(\d{4}-\d{2}-\d{2})',
                          release_date.replace(" ", ""))
            data.release_date = m.group(1) if m else release_date

        duration_el = page.xpath(u'//span[@property="v:runtime"]')
        if duration_el and duration_el[0].text:
            m = re.search(u'(\d{1,})', duration_el[0].text.replace(" ", ''))
            if m:
                # The runtime overrides a per-episode length set above.
                data.duration = m.group(1)

        alias = text_after(u"又名:")
        if alias is not None:
            data.alias = split_space(alias.replace('/', ','))

        IMDb_el = page.xpath(u'//span[contains(text(),"IMDb链接")]')
        if IMDb_el:
            imdb_link = IMDb_el[0].getnext()
            if imdb_link is not None:
                data.IMDb = imdb_link.get("href")

        rating = re.search(u'property="v\:average">(\d*\.\d*)</strong>', r)
        if rating:
            data.douban_rating = rating.group(1)

        rating_sum = page.xpath(u'//span[@property="v:votes"]')
        if rating_sum:
            data.douban_rating_sum = rating_sum[0].text

        # The full, initially hidden summary wins over the short one.
        if page.xpath(u'//span[@class="all hidden"]'):
            data.summary = parse_simple(''.join(
                page.xpath(u'//span[@class="all hidden"]/text()')))
        elif page.xpath(u'//span[@property="v:summary"]'):
            data.summary = parse_simple(''.join(
                page.xpath(u'//span[@property="v:summary"]/text()')))

        img_url = page.xpath(u'//img[@title="点击看更多海报"]')
        nbgnbg = page.xpath(u'//a[@title="点击看大图" and @class="nbgnbg"]')
        if img_url:
            data.img_url = img_url[0].get("src")
        elif nbgnbg:
            data.img_url = nbgnbg[0].get("href")

        # all_episode is stored as text (or never set); compare it as a
        # number so the check works and means "more than one episode".
        try:
            episode_count = int(getattr(data, "all_episode", None))
        except (TypeError, ValueError):
            episode_count = 0
        tags = data.tags
        is_animation = u"动漫" in tags or u"动画" in tags
        if episode_count > 1:
            if is_animation:
                data.category = u"动漫"
            elif u"综艺" in tags or u'真人秀' in tags:
                data.category = u'综艺'
            else:
                data.category = u"电视剧"
        elif is_animation:
            data.category = u'动漫'
        elif u"短片" in tags:
            data.category = u'短片'
        else:
            data.category = u'电影'

        m = re.search(u"SUBJECT_ID: *'(\d*)'", r)
        if m:
            data.doubanid = m.group(1)

        return data.__dict__
示例#6
0
    def parse_detail(self, r, url=None):
        """Parse a Youku show detail page into a ``Contents`` field dict.

        Args:
            r: HTML text of the detail page.
            url: unused here; kept for the callers' signature.

        Returns:
            The ``__dict__`` of the populated ``Contents`` object, or
            ``None`` when the HTML cannot be parsed or no title was found.
        """
        try:
            page = etree.HTML(r)
        except Exception:
            return None
        if page is None:
            return None
        data = Contents()
        # Single-line copy of the page for the regex-based fields.
        flat = re.sub(u'\\n', '', r)

        def star_links(links):
            # Cleaned names plus one detail dict per person link.
            names = []
            entries = []
            for link in links:
                name = parse_simple(link.text)
                names.append(name)
                entry = {"name": name}
                # Serialise to text so the text pattern can be applied.
                markup = etree.tostring(link, encoding="unicode")
                star = re.search(
                    u"//list\.youku\.com/star/show/(.*)\.html", markup)
                if star:
                    entry["youkuid"] = star.group(1)
                href = link.get("href")
                if href:
                    entry["youku_url"] = "http:" + href
                entries.append(entry)
            return names, entries

        def joined(names):
            return ",".join(name for name in names if name)

        v_show = page.xpath(
            u'//div[@class="p-post"]/div[@class="yk-pack p-list"]/div[@class="p-thumb"]/a'
        )
        if v_show and v_show[0].get("href"):
            data.youku_play = "http:" + v_show[0].get("href")

        # Poster image; its alt text is used as the title.
        thumb = page.xpath(
            u'//div[@class="p-post"]/div[@class="yk-pack p-list"]/div[@class="p-thumb"]/img'
        )
        if thumb:
            src = thumb[0].get("src")
            data.title = thumb[0].get("alt")
            if src:
                data.poster = [{
                    "url": "http:" + src,
                    "title": thumb[0].get("alt"),
                    "width": 200,
                    "height": 300
                }]
                data.img_url = "http:" + src

        category = page.xpath(
            '//div[@class="p-base"]/ul/li[@class="p-row p-title"]/a')
        if category:
            data.category = parse_simple(category[0].text)

        # Year (optional): first four-digit run in the sub-title.
        year = page.xpath(
            '//div[@class="p-base"]/ul/li[@class="p-row p-title"]/span[@class="sub-title"]'
        )
        if year and year[0].text:
            m = re.search(u'(\d{4})', year[0].text)
            if m:
                data.year = m.group(1)

        # Alias (optional).
        alias = page.xpath('//div[@class="p-base"]/ul/li[@class="p-alias"]')
        if alias and alias[0].get("title"):
            data.alias = split_space(alias[0].get("title").replace('/', ','))

        # Release date (optional).
        published_at = re.search(u'>上映:</label>(\w+-\d+-\d+)*</span>', flat)
        if published_at is not None:
            data.release_date = published_at.group(1)

        # Youku rating (optional).
        youku_score = page.xpath(
            '//div[@class="p-base"]/ul/li[@class="p-score"]/span[@class="star-num"]'
        )
        if youku_score:
            data.youku_rating = parse_simple(youku_score[0].text)

        # Douban rating and number of ratings (optional).
        douban_score = re.search(
            u'<span class="db-bignum">(\d+\.\d*)</span>', flat)
        if douban_score is not None:
            data.douban_rating = douban_score.group(1)
        douban_cm_num = re.search(
            u'<span class="db-cm-num">(\d*)评价</span>', flat)
        if douban_cm_num is not None:
            data.douban_comment_sum = douban_cm_num.group(1)

        # Starring (optional): names come from the title attribute, details
        # from the individual links.
        actors = page.xpath(
            '//div[@class="p-base"]/ul/li[@class="p-performer"]')
        if actors:
            performer_title = actors[0].get('title')
            if performer_title:
                data.starring = split_space(
                    performer_title.replace("/", ","))
            _, data.starring_list = star_links(page.xpath(
                '//div[@class="p-base"]/ul/li[@class="p-performer"]/a'))

        # Episode count: first run of digits in the "renew" row.
        renew = page.xpath(
            '//div[@class="p-base"]/ul/li[@class="p-row p-renew"]')
        if renew and renew[0].text:
            m = re.search(u'(\d+)', renew[0].text)
            if m:
                data.all_episode = m.group(1)

        # Directors: the list is always present, even when empty.
        data.directors_list = []
        directed = page.xpath(
            u'//div[@class="p-base"]/ul/li[contains(text(),"导演:")]/a')
        if directed:
            names, data.directors_list = star_links(directed)
            data.directors = joined(names)

        # Region (optional).
        area = re.search(
            u'>地区:<a href="//list\.youku\.com/category/show/([^"]+?)\.html" target="_blank">([^<]+?)</a>',
            flat)
        if area is not None:
            data.producer_country = parse_simple(area.group(2))

        # Genres.
        types = page.xpath(
            u'//div[@class="p-base"]/ul/li[contains(text(),"类型")]/a')
        if types:
            data.tags = joined([parse_simple(x.text) for x in types])

        # Counters (all optional): total plays, comments, up-votes.
        plays_num = re.search(u'<li>总播放数:([^<]+?)</li>', flat)
        if plays_num is not None:
            data.youku_plays_num = plays_num.group(1).replace(',', "")
        youku_comments_num = re.search(u'<li>评论:([^<]+?)</li>', flat)
        if youku_comments_num:
            data.youku_comments_num = youku_comments_num.group(1)
        ding = re.search(u'<li>顶:([^<]+?)</li>', flat)
        if ding:
            data.ding = ding.group(1)

        # Summary, with the leading label removed.
        summary = page.xpath(u'.//span[contains(@class,"intro-more")]/text()')
        if summary:
            data.summary = parse_simple("".join(summary)).replace(u"简介:", "")

        # Suitable age (optional).
        age = re.search(u'>适用年龄:([^<]+?)</li>', flat)
        if age:
            data.age = age.group(1)

        # Voice actors.
        peiyin = page.xpath(
            u'//div[@class="p-base"]/ul/li[contains(text(),"声优:")]/a')
        if peiyin:
            names, data.peiyin_list = star_links(peiyin)
            data.peiyin = joined(names)

        # Presenters (variety shows).
        presenters = page.xpath(
            u'//div[@class="p-base"]/ul/li[contains(text(),"主持人:")]/a')
        if presenters:
            names, data.presenters_list = star_links(presenters)
            data.presenters = joined(names)

        if getattr(data, "title", None) is None:
            return None
        data.created_at = time.time()

        return data.__dict__
示例#7
0
    def vdetail_parser(self, r):
        """Parse a video detail page into a flat dict of metadata.

        r: page markup as text; it is parsed with ``etree.HTML`` and also
            searched directly with regular expressions.

        Returns the ``__dict__`` of the populated ``Contents`` object, or
        ``False`` when the markup cannot be parsed or no title was found.
        Every field is optional: a field whose node (or the sibling node
        holding its value) is missing is simply left unset.
        """
        try:
            page = etree.HTML(r)
        except Exception:
            return False
        if page is None:
            # Defensive: without a tree none of the xpath lookups can run.
            return False

        def first_sibling(nodes):
            """Return the element following nodes[0], or None if absent."""
            return nodes[0].getnext() if len(nodes) > 0 else None

        def first_year(text):
            """Return the first four-digit run in text, or None."""
            if not text:
                return None
            found = re.search(r'(\d{4})', text)
            return found.group(1) if found else None

        L = Contents()

        # Ids are embedded in the raw markup rather than in the element tree.
        m = re.search(r'\{"id":"(\w*\d*)"', r)
        if m:
            L.qq_id = m.group(1)
        m = re.search(r'&vid=(\w*\d*)&', r)
        if m:
            L.qq_vid = m.group(1)

        title = page.xpath(u'//a[@_stat="info:title"]')
        if len(title) > 0:
            L.title = title[0].text

        category = page.xpath(u'//span[@class="type"]')
        if len(category) > 0:
            L.category = category[0].text

        # "label span + value sibling" fields: the value lives in the element
        # right after the label, so a label without a sibling is skipped.
        area = first_sibling(page.xpath(u'.//span[contains(text(),"地 区:")]'))
        if area is not None:
            L.area = area_process(area.text)

        foreign_title = page.xpath(u'//span[@class="title_en"]')
        if len(foreign_title) > 0:
            L.foreign_title = foreign_title[0].text

        qq_play = page.xpath(u'.//a[@_stat="info:playbtn"]')
        if len(qq_play) > 0:
            L.qq_play = qq_play[0].get("href")

        language = first_sibling(
            page.xpath(u'//span[contains(text(),"语 言:")]'))
        if language is not None:
            L.language = language_process(language.text)

        # Year: prefer the release time, then fall back to the production
        # date and finally to the premiere time (both handled further down).
        released = first_sibling(
            page.xpath(u'.//span[contains(text(),"上映时间")]'))
        if released is not None:
            found_year = first_year(released.text)
            if found_year is not None:
                L.year = found_year

        all_episode = first_sibling(
            page.xpath(u'//span[contains(text(),"总集数:")]'))
        if all_episode is not None:
            L.all_episode = all_episode.text

        release_date = first_sibling(
            page.xpath(u'//span[contains(text(),"出品时间:")]'))
        if release_date is not None:
            L.release_date = release_date.text
            if L.year is None:
                found_year = first_year(release_date.text)
                if found_year is not None:
                    L.year = found_year

        premiere = first_sibling(
            page.xpath(u'.//span[contains(text(),"首播时间")]'))
        if premiere is not None and L.year is None:
            found_year = first_year(premiere.text)
            if found_year is not None:
                L.year = found_year

        alias = first_sibling(page.xpath(u'//span[contains(text(),"别 名")]'))
        if alias is not None:
            L.alias = alias.text

        # De-duplicate tags in document order; nodes without text are dropped
        # because they cannot be joined into the comma-separated string.
        unique_tags = []
        seen_tags = set()
        for tag in page.xpath(u'//a[@class="tag"]'):
            if tag.text is not None and tag.text not in seen_tags:
                seen_tags.add(tag.text)
                unique_tags.append(tag.text)
        if unique_tags:
            L.tags = ",".join(unique_tags)

        summary = page.xpath(
            u'//span[@class="desc_txt"]/span[@class="txt _desc_txt_lineHight"]'
        )
        if len(summary) > 0:
            L.summary = parse_simple(summary[0].text)

        qq_rating = page.xpath(u'//div[@class="score_v"]/span[@class="score"]')
        if len(qq_rating) > 0:
            L.qq_rating = qq_rating[0].text

        douban_rating = page.xpath(
            u'//a[@class="score_db"]/span[@class="score"]')
        if len(douban_rating) > 0:
            L.douban_rating = douban_rating[0].text

        poster = page.xpath(u'//img[@_stat="info:poster"]')
        if len(poster) > 0:
            L.poster = []
            src = poster[0].get("src")
            if src:
                L.poster.append({
                    "url": self.parse_imgurl(src),
                    "name": poster[0].get("alt")
                })
                L.img_url = self.parse_imgurl(src)

        # Directors and cast share one list; an entry is a director when its
        # link contains a span with class "director".
        starring_list = []
        starring = []
        directors_list = []
        directors = []
        for actor in page.xpath(u'//ul[contains(@class,"actor_list")]/li'):
            actor_avatar = actor.find(u'a')
            if actor_avatar is None:
                # Entries without a link carry nothing usable.
                continue
            _dic = {}
            img = actor_avatar.find('img')
            if img is not None:
                _dic["avatar"] = self.parse_imgurl(img.get("src"))
            _dic["qq_id"] = actor.get("data-id")
            name_node = actor.find("span")
            if name_node is not None:
                _dic["name"] = name_node.text
            _dic["qq_home_page"] = actor_avatar.get("href")

            actor_detail = actor.xpath(u'.//div[@class="actor_detail"]')
            if actor_detail:
                # Occupation
                occupation = first_sibling(actor_detail[0].xpath(
                    u'.//span[contains(text(),"职业")]'))
                if occupation is not None:
                    _dic['occupation'] = occupation.text
                # Region
                region = first_sibling(actor_detail[0].xpath(
                    u'.//span[contains(text(),"地区")]'))
                if region is not None:
                    _dic['area'] = region.text

            # Short biography
            intro = actor.xpath(u'.//span[@itemprop="description"]')
            if intro:
                _dic["intro"] = intro[0].text

            if actor_avatar.xpath(u'.//span[@class="director"]'):
                people, names = directors_list, directors
            else:
                people, names = starring_list, starring
            people.append(_dic)
            # Nameless entries stay in the detail list but are kept out of
            # the comma-joined name string, which requires real text.
            if _dic.get("name") is not None:
                names.append(_dic["name"])

        if starring_list:
            L.starring = ','.join(starring)
            L.starring_list = starring_list
        if directors_list:
            L.directors = ','.join(directors)
            L.directors_list = directors_list

        # A page without a title is treated as unparseable.
        if L.title is None:
            return False
        L.created_at = time.time()
        return L.__dict__