def parse_tv_show(self, r):
    """Extract the show detail-page URL from a play page.

    Args:
        r: HTML source of the play page.

    Returns:
        The ``__dict__`` of a ``Contents`` object whose ``detail_url`` is the
        ``http:``-prefixed detail page URL, or ``None`` when the page cannot
        be parsed or no detail link can be found.
    """
    data = Contents()
    try:
        page = etree.HTML(r)
    except Exception:
        return None

    # Preferred source: the title link inside the "tvinfo" block.
    links = page.xpath('//div[@class="tvinfo"]/h2/a') if page is not None else []
    if len(links) > 0 and links[0].get('href'):
        data.detail_url = "http:" + links[0].get('href')
        return data.__dict__

    # Fallback: look for a protocol-relative list.youku.com show URL that
    # appears after the "tvinfo" marker in the raw markup.
    try:
        match = re.search(
            r'tvinfo.* *\n*(//list\.youku\.com/show/id_\w*\d*\.html)', r)
    except TypeError:
        # r is not text the pattern can be applied to.
        return None
    if match is None:
        return None
    data.detail_url = 'http:' + match.group(1)
    return data.__dict__
def merge_fields(self, info):
    """Map an iQiyi video-info dict onto a ``Contents`` record.

    Args:
        info: dict of video metadata. Keys read here: name, description,
            tvId, vid, playCount, albumId, url, duration, the various
            ``*ImageUrl`` keys, issueTime, crumbList, categories, videoCount,
            subtitle, commentCount, qitanId, cast, focuses and score.

    Returns:
        ``L.__dict__`` of the populated ``Contents`` object.
    """
    L = Contents()
    L.title = info.get("name")
    L.summary = info.get("description")
    L.iqiyi_tvId = info.get("tvId")
    L.iqiyi_vid = info.get("vid")
    L.iqiyi_plays_num = info.get("playCount")
    L.iqiyi_albumId = info.get("albumId")
    L.iqiyi_play_url = info.get("url")

    duration = info.get("duration")
    if duration:
        # NOTE(review): presumably seconds -> minutes; confirm the unit.
        L.duration = duration / 60

    # Posters: the album image doubles as the main image URL.
    L.poster = []
    album_image = info.get("albumImageUrl")
    if album_image:
        L.img_url = album_image
        L.poster.append({"url": album_image})
    for image_key in ("imageUrl", "videoImageUrl", "posterUrl",
                      "tvImageUrl", "qualityImageUrl"):
        if info.get(image_key):
            L.poster.append({"url": info.get(image_key)})

    # .get() so a payload without issueTime no longer raises KeyError.
    issue_time = info.get("issueTime")
    if issue_time:
        L.release_date = mictime_to_ymd(issue_time)

    # Category: last level-2 crumb that is not the VIP marker, otherwise the
    # last level-3 crumb.
    crumbs = info.get("crumbList")
    if crumbs:
        level2 = [x["title"] for x in crumbs
                  if int(x["level"]) == 2 and x["title"] != u'VIP会员']
        if level2:
            L.category = level2[-1]
        else:
            level3 = [x["title"] for x in crumbs if int(x["level"]) == 3]
            if level3:
                L.category = level3[-1]

    # "categories" entries are routed by their subName.
    tag_markers = (u"类型", u'风格', u'分类', u'小学', u'高中', u'短片')
    tag_names = []
    for x in info.get("categories") or []:
        sub_name = x.get("subName") or u""
        name = x.get("name")
        if u"地区" in sub_name:
            L.area = area_process(name)
        elif any(marker in sub_name for marker in tag_markers):
            if name:
                tag_names.append(name)
        elif u"语种" in sub_name:
            L.language = language_process(name)
        elif sub_name == u"年龄段":
            L.age = name
    L.tags = ",".join(tag_names)

    L.all_episode = info.get("videoCount")
    L.sub_title = info.get("subtitle")
    L.iqiyi_rating_num = info.get("commentCount")
    L.iqiyi_qitanId = info.get("qitanId")

    # (key in info["cast"], attribute for joined names, attribute for details)
    roles = (
        ("directors", "directors", "directors_list"),
        ("speakers", "speakers", "speakers_list"),
        ("publishers", "publishers", "publishers_list"),
        ("singers", "singers", "singers_list"),
        ("mainActors", "starring", "starring_list"),
        ("writers", "screenwriters", "screenwriter_list"),  # screenwriters
        ("actors", "actors", "actors_list"),
        ("guests", "guests", "guests_list"),  # guests
        ("hosts", "hosts", "hosts_list"),
    )
    cast = info.get("cast") or {}
    for cast_key, names_attr, list_attr in roles:
        people = cast.get(cast_key)
        if not people:
            continue
        names = [p.get("name") for p in people if p.get("name")]
        setattr(L, names_attr, ",".join(names))
        setattr(L, list_attr, [{
            "name": p.get("name"),
            "iqiyi_id": p.get("id"),
            "avatar": p.get("imageUrl"),
            "iqiyi_userId": p.get("userId"),
            "iqiyi_circleId": p.get("circleId"),
        } for p in people])

    # Derive the year from the release date when nothing else supplied it.
    if L.release_date and L.year is None:
        m = re.search(r'(\d{4})', L.release_date)
        if m:
            L.year = m.group(1)

    # Shows without directors fall back to their hosts.
    if not L.directors_list and L.hosts:
        L.directors = L.hosts
        L.directors_list = L.hosts_list

    L.focuses = info.get("focuses")
    L.iqiyi_rating = info.get("score")
    L.created_at = time.time()
    return L.__dict__
def vdetail_parser(self, r, url=None):
    """Parse an iQiyi video detail page.

    Every query below is a document-absolute XPath (``//...``), so it is run
    directly against the parsed page.

    Args:
        r: HTML source of the detail page.
        url: page URL, passed to ``self.get_vid_by_url`` to obtain the tvId.

    Returns:
        ``L.__dict__`` of the populated ``Contents`` object, or ``False``
        when the HTML cannot be parsed.
    """
    try:
        page = etree.HTML(r)
    except Exception:
        return False
    if page is None:
        return False

    def first(xpath):
        """Return the first node matched by xpath, or None."""
        hits = page.xpath(xpath)
        return hits[0] if len(hits) > 0 else None

    def unique(values):
        """Drop empty values and duplicates, keeping first-seen order."""
        kept = []
        for value in values:
            if value and value not in kept:
                kept.append(value)
        return kept

    L = Contents()

    title = first(u'//h1[@class="main_title"]/a')
    if title is not None:
        L.title = title.text

    sub_title = first(u'//span[@class="sub_title"]')
    if sub_title is not None:
        L.sub_title = sub_title.text
        # The sub title may carry a four-digit year.
        year = re.search(r'(\d{4})', sub_title.text) if sub_title.text else None
        if year:
            L.year = year.group(1)

    category = first(u'//a[@class="channelTag"]')
    if category is not None:
        L.category = category.text

    area = first(u'//em[contains(text(),"地区")]/a')
    if area is not None:
        L.area = area.text

    L.iqiyi_tvId = self.get_vid_by_url(url)

    language = first(u'//em[contains(text(),"语言")]/a')
    if language is not None:
        L.language = language.text

    tags = page.xpath(u'//em[contains(text(),"类型")]/a')
    if len(tags) > 0:
        L.tags = ",".join(unique([x.text for x in tags]))

    banben = first(u'//em[contains(text(),"版本")]/a')
    if banben is not None:
        L.banben = banben.text

    # Summary is the first text node following the "简介" label.
    summary = page.xpath(u'//em[contains(text(),"简介")]/following::text()[1]')
    if len(summary) > 0:
        L.summary = parse_simple(summary[0])

    all_episode = first(u'//em[contains(text(),"集数")]/a')
    if all_episode is not None:
        L.all_episode = self.parse_all_episode(all_episode.text)

    play_link = first(u'//a[contains(text(),"立即播放")]')
    if play_link is not None:
        L.iqiyi_play_url = play_link.get("href")

    plays_num = first(u'//i[@id="widget-playcount"]')
    if plays_num is not None:
        L.iqiyi_plays_num = plays_num.text

    # Voice actors: each entry uses its own link text.
    peiyin = page.xpath(u'//em[contains(text(),"配音")]/a')
    if len(peiyin) > 0:
        L.peiyin_list = [{
            "name": x.text,
            "iqiyi_url": x.get("href"),
            "iqiyi_id": self.get_starId_by_url(x.get("href")),
        } for x in peiyin]
        L.peiyin = ",".join(unique([x.text for x in peiyin]))

    rating = first(u'//span[@class="score_font"]')
    if rating is not None:
        L.iqiyi_rating = rating.get("snsscore")

    poster = first(u'//div[contains(@_stat,"result_pic")]/img')
    if poster is not None:
        L.poster = {
            "url": poster.get("src"),
            "width": poster.get("width"),
            "height": poster.get("height"),
            "name": poster.get("alt"),
        }
        L.img_url = L.poster['url']

    return L.__dict__
def baike_parser(r, url=None):
    """Parse a Baidu Baike film/TV entry page.

    Args:
        r: HTML source of the entry page.
        url: unused; kept for interface compatibility.

    Returns:
        ``L.__dict__`` of the populated ``Contents`` object, or ``False``
        when the HTML cannot be cleaned or parsed.
    """
    try:
        # Strip non-breaking spaces only (entity and raw character). Ordinary
        # spaces must survive: tags and the multi-class selector below
        # depend on them.
        r = r.replace(u'&nbsp;', u'').replace(u'\xa0', u'')
        page = etree.HTML(r)
    except Exception:
        return False
    if page is None:
        return False

    BAIKE_HOST = u'https://baike.baidu.com'

    def sibling_of(label):
        """Return the element after the first <dt> containing label, or None."""
        hits = page.xpath(u'//dt[contains(text(),"%s")]' % label)
        return hits[0].getnext() if len(hits) > 0 else None

    def absolute(href):
        """Prefix a site-relative href with the Baike host (None stays None)."""
        return BAIKE_HOST + href if href else None

    def unique(values):
        """Drop empty values and duplicates, keeping first-seen order."""
        kept = []
        for value in values:
            if value and value not in kept:
                kept.append(value)
        return kept

    def credit(label, names_attr, list_attr):
        """Fill the joined-names and detail-list attributes for one role."""
        node = sibling_of(label)
        if node is None:
            return
        links = node.findall('a')
        if len(links) == 0:
            # No links: fall back to the plain text of the value cell.
            setattr(L, names_attr, area_process(parse_simple(node.text)))
            return
        entries = []
        names = []
        for x in links:
            name = parse_simple(x.text)
            entries.append({
                "name": name,
                "baike_id": x.get("data-lemmaid"),
                "baike_url": absolute(x.get("href")),
            })
            names.append(name)
        setattr(L, list_attr, entries)
        setattr(L, names_attr, ",".join(unique(names)))

    L = Contents()

    summary = page.xpath(
        u'//div[@class="lemmaWgt-lemmaSummary lemmaWgt-lemmaSummary-light"]')
    if len(summary) > 0:
        L.summary = summary[0].text

    # Plain "label -> value" rows of the info box.
    for label, attr in ((u"中文名", "title"),
                        (u"外文名", "foreign_title"),
                        (u"出品公司", "production_company")):
        node = sibling_of(label)
        if node is not None:
            setattr(L, attr, parse_simple(node.text))

    node = sibling_of(u"制片地区")
    if node is not None:
        L.producer_country = area_process(parse_simple(node.text))

    credit(u"导演", "directors", "directors_list")
    credit(u"编剧", "screenwriters", "screenwriter_list")
    credit(u"主演", "starring", "starring_list")

    alias_node = sibling_of(u"其它译名")
    if alias_node is not None:
        alias_links = alias_node.findall("a")
        if len(alias_links) > 0:
            cleaned = [parse_simple(x.text) for x in alias_links]
            L.alias = ",".join([name for name in cleaned if name])
        else:
            L.alias = parse_simple(alias_node.text)

    # NOTE(review): area_process is reused as the normaliser for type,
    # duration and release date; confirm that is intended.
    for label, attr, convert in ((u"类型", "type", area_process),
                                 (u"片长", "duration", area_process),
                                 (u"上映时间", "release_date", area_process),
                                 (u"语言", "language", language_process)):
        node = sibling_of(label)
        if node is not None:
            setattr(L, attr, convert(parse_simple(node.text)))

    douban_rating = page.xpath(u'//span[contains(@class,"star-text")]')
    if len(douban_rating) > 0:
        L.douban_rating = douban_rating[0].text

    poster = page.xpath(u'//img[@alt="词条图片"]')
    if len(poster) > 0:
        L.poster = [{"url": poster[0].get("src"), "name": poster[0].get("alt")}]
        L.img_url = poster[0].get("src")

    # Actor cards: entries whose name is already listed as starring go to
    # starring_list, the rest to actor_list.
    actor_items = page.xpath(u'//ul[@class="actorList"]/li')
    if len(actor_items) > 0:
        starring_text = getattr(L, 'starring', None)
        starring_names = starring_text.split(',') if starring_text else []
        L.actor_list = []
        starring_entries = []
        for item in actor_items:
            # Relative path so each card yields its own link.
            links = item.xpath(u'.//dl[@class="info"]/a')
            if len(links) == 0:
                continue
            link = links[0]
            img = item.find('img')
            if img is None:
                img = item.find('.//img')
            entry = {
                "avatar": img.get("src") if img is not None else None,
                "name": link.text,
                "baike_id": link.get("data-lemmaid"),
                # Key spelling ("baidu_url") kept for existing consumers.
                "baidu_url": absolute(link.get("href")),
            }
            if entry['name'] in starring_names:
                starring_entries.append(entry)
            else:
                L.actor_list.append(entry)
        if starring_entries:
            L.starring_list = starring_entries

    L.created_at = time.time()
    return L.__dict__
def vdetail_parser(self, r):
    """Parse a Douban movie/TV subject page.

    Args:
        r: HTML source of the subject page.

    Returns:
        ``data.__dict__`` of the populated ``Contents`` object, or ``False``
        when the HTML cannot be parsed.
    """
    data = Contents()
    try:
        page = etree.HTML(r)
    except Exception:
        return False
    if page is None:
        return False

    def text_after(label):
        """Return the first text node following the <span> containing label."""
        hits = page.xpath(
            u'//span[contains(text(),"%s")]/following::text()[1]' % label)
        return hits[0] if len(hits) > 0 else None

    def unique(values):
        """Drop empty values and duplicates, keeping first-seen order."""
        kept = []
        for value in values:
            if value and value not in kept:
                kept.append(value)
        return kept

    year = re.search(r'<span class="year">\((\d{4})\)</span>', r)
    if year:
        data.year = year.group(1)

    # Non-greedy so the capture stops at the title's own closing tag.
    title = re.search(r'<span property="v\:itemreviewed">(.*?)</span>', r)
    if title:
        data.title = title.group(1)

    # Screenwriters, directors and starring share one layout: a label <span>
    # followed by a sibling element that holds the person links.
    credits = ((u"编剧", "screenwriters", "screenwriter_list"),
               (u"导演", "directors", "directors_list"),
               (u"主演", "starring", "starring_list"))
    for label, names_attr, list_attr in credits:
        label_spans = page.xpath(u'//span[contains(text(),"%s")]' % label)
        if len(label_spans) == 0:
            continue
        holder = label_spans[0].getnext()
        if holder is None:
            continue
        entries = []
        names = []
        for x in holder.findall('a'):
            name = parse_simple(x.text)
            href = x.get("href")
            entry = {}
            if href:
                celebrity = re.search(r'/celebrity/(\d*)/', href)
                if celebrity:
                    entry["doubanid"] = celebrity.group(1)
                entry["douban_url"] = "https://movie.douban.com" + href
            entry["name"] = name
            entries.append(entry)
            if name:
                names.append(name)
        setattr(data, list_attr, entries)
        setattr(data, names_attr, ",".join(names).strip(','))

    # Tags: user tags first, then the declared genres.
    genres = [parse_simple(x.text)
              for x in page.xpath(u'//span[@property="v:genre"]')]
    user_tags = [parse_simple(x.text)
                 for x in page.xpath(u'//div[@class="tags-body"]/a')]
    data.tags = ",".join(unique(user_tags + genres))

    producer_country = text_after(u"制片国家/地区:")
    if producer_country is not None:
        data.producer_country = area_process(
            split_space(producer_country.replace('/', ',')))

    language = text_after(u"语言:")
    if language is not None:
        data.language = language_process(
            split_space(language.replace('/', ',')))

    episode_count = 0
    all_episode = text_after(u"集数:")
    if all_episode is not None:
        m = re.search(r'(\d{1,})', all_episode.replace(" ", ""))
        if m:
            data.all_episode = m.group(1)
            episode_count = int(m.group(1))

    episode_length = text_after(u"单集片长:")
    if episode_length is not None:
        m = re.search(r'(\d{1,})', episode_length.replace(" ", ""))
        if m:
            data.duration = m.group(1)

    # Season number.
    season = page.xpath(
        u'//select[@id="season"]/option[@selected="selected"]')
    if len(season) > 0:
        data.season = season[0].text

    # First release/air date: prefer a YYYY-MM-DD date, otherwise keep the
    # raw "|"-joined text.
    release_spans = page.xpath(u'//span[@property="v:initialReleaseDate"]')
    if len(release_spans) > 0:
        release_text = "|".join(
            [parse_simple(x.text) or "" for x in release_spans]).strip('|')
        m = re.search(r'(\d{4}-\d{2}-\d{2})', release_text.replace(" ", ""))
        data.release_date = m.group(1) if m else release_text

    # Runtime; when present it overrides the per-episode length above.
    runtime = page.xpath(u'//span[@property="v:runtime"]')
    if len(runtime) > 0 and runtime[0].text:
        m = re.search(r'(\d{1,})', runtime[0].text.replace(" ", ''))
        if m:
            data.duration = m.group(1)

    alias = text_after(u"又名:")
    if alias is not None:
        data.alias = split_space(alias.replace('/', ','))

    imdb_label = page.xpath(u'//span[contains(text(),"IMDb链接")]')
    if len(imdb_label) > 0:
        imdb_link = imdb_label[0].getnext()
        if imdb_link is not None:
            data.IMDb = imdb_link.get("href")

    rating = re.search(r'property="v\:average">(\d*\.\d*)</strong>', r)
    if rating:
        data.douban_rating = rating.group(1)

    votes = page.xpath(u'//span[@property="v:votes"]')
    if len(votes) > 0:
        data.douban_rating_sum = votes[0].text

    # Prefer the expanded ("all hidden") synopsis over the short one.
    if len(page.xpath(u'//span[@class="all hidden"]')) > 0:
        data.summary = parse_simple(
            ''.join(page.xpath(u'//span[@class="all hidden"]/text()')))
    elif len(page.xpath(u'//span[@property="v:summary"]')) > 0:
        data.summary = parse_simple(
            ''.join(page.xpath(u'//span[@property="v:summary"]/text()')))

    poster_img = page.xpath(u'//img[@title="点击看更多海报"]')
    if len(poster_img) > 0:
        data.img_url = poster_img[0].get("src")
    else:
        poster_link = page.xpath(
            u'//a[@title="点击看大图" and @class="nbgnbg"]')
        if len(poster_link) > 0:
            data.img_url = poster_link[0].get("href")

    # Category from episode count and tags. The count is compared as an int;
    # the stored all_episode value is a string.
    tags = data.tags
    animated = u"动漫" in tags or u"动画" in tags
    if episode_count > 1:
        if animated:
            data.category = u"动漫"
        elif u"综艺" in tags or u'真人秀' in tags:
            data.category = u'综艺'
        else:
            data.category = u"电视剧"
    elif animated:
        data.category = u'动漫'
    elif u"短片" in tags:
        data.category = u'短片'
    else:
        data.category = u'电影'

    m = re.search(r"SUBJECT_ID: *'(\d*)'", r)
    if m:
        data.doubanid = m.group(1)

    return data.__dict__
def parse_detail(self, r, url=None):
    """Parse a Youku show detail page.

    Args:
        r: HTML source of the detail page.
        url: unused; kept for interface compatibility.

    Returns:
        ``data.__dict__`` of the populated ``Contents`` object, or ``None``
        when the HTML cannot be parsed or no title was found.
    """
    try:
        page = etree.HTML(r)
    except Exception:
        return None
    if page is None:
        return None

    data = Contents()
    # Newline-free copy of the markup for the regex-based lookups.
    flat = re.sub(r'\n', '', r)

    def absolute(href):
        """Prefix protocol-relative URLs ("//host/...") with "http:"."""
        if href and href.startswith("//"):
            return "http:" + href
        return href

    def person(anchor):
        """Build the name / youkuid / youku_url dict for one person link."""
        href = anchor.get("href")
        entry = {"name": parse_simple(anchor.text)}
        star = None
        if href:
            star = re.search(
                r"//list\.youku\.com/star/show/(.*)\.html", href)
        entry["youkuid"] = star.group(1) if star else None
        if href:
            entry["youku_url"] = absolute(href)
        return entry

    def credit(label, names_attr, list_attr):
        """Fill names and detail list from the <li> whose text has label."""
        anchors = page.xpath(
            u'//div[@class="p-base"]/ul/li[contains(text(),"%s")]/a' % label)
        if len(anchors) == 0:
            return
        names = [parse_simple(a.text) for a in anchors]
        setattr(data, names_attr, ",".join([n for n in names if n]))
        setattr(data, list_attr, [person(a) for a in anchors])

    def li_value(pattern):
        """Return group 1 of pattern in the flattened markup, or None."""
        found = re.search(pattern, flat)
        return found.group(1) if found else None

    play_link = page.xpath(
        u'//div[@class="p-post"]/div[@class="yk-pack p-list"]'
        u'/div[@class="p-thumb"]/a')
    if len(play_link) > 0 and play_link[0].get("href"):
        data.youku_play = absolute(play_link[0].get("href"))

    # Poster; its alt text is the show title.
    thumb = page.xpath(
        u'//div[@class="p-post"]/div[@class="yk-pack p-list"]'
        u'/div[@class="p-thumb"]/img')
    if len(thumb) > 0:
        poster_url = absolute(thumb[0].get("src"))
        data.poster = [{
            "url": poster_url,
            "title": thumb[0].get("alt"),
            "width": 200,
            "height": 300,
        }]
        data.title = thumb[0].get("alt")
        data.img_url = poster_url

    category = page.xpath(
        '//div[@class="p-base"]/ul/li[@class="p-row p-title"]/a')
    if len(category) > 0:
        data.category = parse_simple(category[0].text)

    # Year (optional).
    year = page.xpath(
        '//div[@class="p-base"]/ul/li[@class="p-row p-title"]'
        '/span[@class="sub-title"]')
    if len(year) > 0 and year[0].text:
        m = re.search(r'(\d{4})', year[0].text)
        if m:
            data.year = m.group(1)

    # Alias (optional).
    alias = page.xpath('//div[@class="p-base"]/ul/li[@class="p-alias"]')
    if len(alias) > 0 and alias[0].get("title"):
        data.alias = split_space(alias[0].get("title").replace('/', ','))

    # Release date (optional).
    release_date = li_value(u'>上映:</label>(\\w+-\\d+-\\d+)*</span>')
    if release_date:
        data.release_date = release_date

    # Youku score (optional).
    youku_score = page.xpath(
        '//div[@class="p-base"]/ul/li[@class="p-score"]'
        '/span[@class="star-num"]')
    if len(youku_score) > 0:
        data.youku_rating = parse_simple(youku_score[0].text)

    # Douban score and number of ratings (optional).
    douban_score = li_value(r'<span class="db-bignum">(\d+\.\d*)</span>')
    if douban_score is not None:
        data.douban_rating = douban_score
    douban_votes = li_value(u'<span class="db-cm-num">(\\d*)评价</span>')
    if douban_votes is not None:
        data.douban_comment_sum = douban_votes

    # Starring (optional): names come from the <li> title attribute, details
    # from the links inside it.
    performers = page.xpath(
        '//div[@class="p-base"]/ul/li[@class="p-performer"]')
    if len(performers) > 0:
        data.starring = split_space(
            (performers[0].get('title') or '').replace("/", ","))
        data.starring_list = [person(a) for a in page.xpath(
            '//div[@class="p-base"]/ul/li[@class="p-performer"]/a')]

    # Episode count: first run of digits in the "renew" row.
    renew = page.xpath(
        '//div[@class="p-base"]/ul/li[@class="p-row p-renew"]')
    if len(renew) > 0 and renew[0].text:
        m = re.search(r'(\d+)', renew[0].text)
        if m:
            data.all_episode = m.group(1)

    # Directors.
    data.directors_list = []
    credit(u"导演:", "directors", "directors_list")

    # Area (optional).
    area = re.search(
        u'>地区:<a href="//list\\.youku\\.com/category/show/([^"]+?)\\.html"'
        u' target="_blank">([^<]+?)</a>', flat)
    if area is not None:
        data.producer_country = parse_simple(area.group(2))

    # Genres.
    types = page.xpath(
        u'//div[@class="p-base"]/ul/li[contains(text(),"类型")]/a')
    if len(types) > 0:
        genre_names = [parse_simple(x.text) for x in types]
        data.tags = ",".join([g for g in genre_names if g])

    # Total plays (optional); thousands separators removed.
    plays_num = li_value(u'<li>总播放数:([^<]+?)</li>')
    if plays_num is not None:
        data.youku_plays_num = plays_num.replace(',', "")

    # Comment count (optional).
    comments_num = li_value(u'<li>评论:([^<]+?)</li>')
    if comments_num:
        data.youku_comments_num = comments_num

    # Up-votes (optional).
    ding = li_value(u'<li>顶:([^<]+?)</li>')
    if ding:
        data.ding = ding

    # Synopsis.
    summary = page.xpath(u'.//span[contains(@class,"intro-more")]/text()')
    if summary:
        data.summary = parse_simple("".join(summary)).replace(u"简介:", "")

    # Suitable age (optional).
    age = li_value(u'>适用年龄:([^<]+?)</li>')
    if age:
        data.age = age

    # Voice actors.
    credit(u"声优:", "peiyin", "peiyin_list")
    # Presenters (variety shows).
    credit(u"主持人:", "presenters", "presenters_list")

    if data.title is None:
        return None
    data.created_at = time.time()
    return data.__dict__
def vdetail_parser(self, r):
    """Parse a Tencent Video (QQ) detail page.

    Args:
        r: HTML source of the detail page.

    Returns:
        ``L.__dict__`` of the populated ``Contents`` object, or ``False``
        when the HTML cannot be parsed or the page has no title.
    """
    try:
        page = etree.HTML(r)
    except Exception:
        return False
    if page is None:
        return False

    def value_node(xpath):
        """Return the element after the first node matched by xpath."""
        hits = page.xpath(xpath)
        return hits[0].getnext() if len(hits) > 0 else None

    def first_text(xpath):
        """Return (found, text) for the first node matched by xpath."""
        hits = page.xpath(xpath)
        if len(hits) > 0:
            return True, hits[0].text
        return False, None

    def unique(values):
        """Drop empty values and duplicates, keeping first-seen order."""
        kept = []
        for value in values:
            if value and value not in kept:
                kept.append(value)
        return kept

    L = Contents()

    m = re.search(r'\{"id":"(\w*\d*)"', r)
    if m:
        L.qq_id = m.group(1)
    m = re.search(r'&vid=(\w*\d*)&', r)
    if m:
        L.qq_vid = m.group(1)

    found, text = first_text(u'//a[@_stat="info:title"]')
    if found:
        L.title = text
    found, text = first_text(u'//span[@class="type"]')
    if found:
        L.category = text

    node = value_node(u'.//span[contains(text(),"地 区:")]')
    if node is not None:
        L.area = area_process(node.text)

    found, text = first_text(u'//span[@class="title_en"]')
    if found:
        L.foreign_title = text

    play_button = page.xpath(u'.//a[@_stat="info:playbtn"]')
    if len(play_button) > 0:
        L.qq_play = play_button[0].get("href")

    node = value_node(u'//span[contains(text(),"语 言:")]')
    if node is not None:
        L.language = language_process(node.text)

    # Year: theatrical release first, then production date, then first air
    # date; later sources only fill in a still-missing year.
    node = value_node(u'.//span[contains(text(),"上映时间")]')
    if node is not None and node.text:
        m = re.search(r'(\d{4})', node.text)
        if m:
            L.year = m.group(1)

    node = value_node(u'//span[contains(text(),"总集数:")]')
    if node is not None:
        L.all_episode = node.text

    node = value_node(u'//span[contains(text(),"出品时间:")]')
    if node is not None:
        L.release_date = node.text
    if L.release_date and L.year is None:
        m = re.search(r'(\d{4})', L.release_date)
        if m:
            L.year = m.group(1)

    node = value_node(u'.//span[contains(text(),"首播时间")]')
    if node is not None and node.text and L.year is None:
        m = re.search(r'(\d{4})', node.text)
        if m:
            L.year = m.group(1)

    node = value_node(u'//span[contains(text(),"别 名")]')
    if node is not None:
        L.alias = node.text

    tags = page.xpath(u'//a[@class="tag"]')
    if len(tags) > 0:
        L.tags = ",".join(unique([x.text for x in tags]))

    found, text = first_text(
        u'//span[@class="desc_txt"]/span[@class="txt _desc_txt_lineHight"]')
    if found:
        L.summary = parse_simple(text)

    found, text = first_text(u'//div[@class="score_v"]/span[@class="score"]')
    if found:
        L.qq_rating = text
    found, text = first_text(u'//a[@class="score_db"]/span[@class="score"]')
    if found:
        L.douban_rating = text

    poster = page.xpath(u'//img[@_stat="info:poster"]')
    if len(poster) > 0:
        L.poster = []
        src = poster[0].get("src")
        if src:
            image_url = self.parse_imgurl(src)
            L.poster.append({"url": image_url, "name": poster[0].get("alt")})
            L.img_url = image_url

    # Directors and actors share one list; a "director" badge inside the
    # avatar link marks directors.
    starring_list = []
    directors_list = []
    for actor in page.xpath(u'//ul[contains(@class,"actor_list")]/li'):
        avatar_link = actor.find(u'a')
        if avatar_link is None:
            # Without the avatar link there is neither home page nor badge.
            continue
        entry = {}
        avatar_img = avatar_link.find('img')
        if avatar_img is not None:
            entry["avatar"] = self.parse_imgurl(avatar_img.get("src"))
        entry["qq_id"] = actor.get("data-id")
        name_span = actor.find("span")
        if name_span is not None:
            entry["name"] = name_span.text
        entry["qq_home_page"] = avatar_link.get("href")

        detail = actor.xpath(u'.//div[@class="actor_detail"]')
        if detail:
            # Occupation
            occupation = detail[0].xpath(u'.//span[contains(text(),"职业")]')
            if occupation and occupation[0].getnext() is not None:
                entry['occupation'] = occupation[0].getnext().text
            # Area
            actor_area = detail[0].xpath(u'.//span[contains(text(),"地区")]')
            if len(actor_area) > 0 and actor_area[0].getnext() is not None:
                entry['area'] = actor_area[0].getnext().text
        # Short biography
        intro = actor.xpath(u'.//span[@itemprop="description"]')
        if intro:
            entry["intro"] = intro[0].text

        if avatar_link.xpath(u'.//span[@class="director"]'):
            directors_list.append(entry)
        else:
            starring_list.append(entry)

    if starring_list:
        L.starring = ','.join(
            [p["name"] for p in starring_list if p.get("name")])
        L.starring_list = starring_list
    if directors_list:
        L.directors = ','.join(
            [p["name"] for p in directors_list if p.get("name")])
        L.directors_list = directors_list

    if L.title is None:
        return False
    L.created_at = time.time()
    return L.__dict__