def parse_photos(self, r, id):
    """Parse a Douban photo-list page; returns {"data": [...], "next": href}."""
    data = []
    page = etree.HTML(r)
    lis = page.xpath(u'//div[@class="article"]/ul[@class="poster-col3 clearfix"]/li')
    if len(lis) > 0:
        for x in lis:
            temp = {}
            temp['doubanvid'] = id
            temp['doubanid'] = x.get('data-id')
            name_el = x.find('div[@class="name"]')
            prop_el = x.find('div[@class="prop"]')
            cover = x.find('div[@class="cover"]/a')
            if name_el is not None:
                temp['name'] = name_el.text.replace(u'\n', '')
                temp['name'] = parse_simple(temp['name'])
            if prop_el is not None:
                temp['prop'] = prop_el.text.replace(u'\n', '')
                temp['prop'] = temp['prop'].strip(u' ')
                m = re.search(u'(\d{1,})x(\d{1,})', temp['prop'].replace(" ", ""))
                if m:
                    temp['width'] = m.group(1)
                    temp['height'] = m.group(2)
            if cover is not None:
                temp['photos_page'] = cover.get("href")
                temp['url'] = cover.find('img').get("src")
                temp['url'] = re.sub(u'/photo/m/', '/photo/l/', temp['url'])
            data.append(temp)
    nextpage = page.xpath(u'//a[contains(text(),"后页")]')
    if len(nextpage) > 0:
        return {"data": data, "next": nextpage[0].get("href")}
    return {"data": data}
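
# Hypothetical usage sketch (not part of the original code): parse_photos
# returns {"data": [...], "next": <href>}, so a caller can walk every photo
# page by following the "next" link until it disappears.  The spider class
# name `DoubanSpider`, its `fetch()` helper and the example subject id are
# illustrative assumptions; only the parse_photos(r, id) signature comes
# from this module.
#
#   spider = DoubanSpider()                    # assumed spider class
#   url = 'https://movie.douban.com/subject/1292052/photos?type=R'
#   photos = []
#   while url:
#       html = spider.fetch(url)               # assumed helper returning page HTML
#       result = spider.parse_photos(html, '1292052')
#       photos.extend(result['data'])
#       url = result.get('next')               # absent on the last page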
def parse_star_show(self, r, url):
    """Parse a Youku star homepage. Returns: dict"""
    try:
        # r.decode('utf-8')
        page = etree.HTML(r)
    except Exception as e:
        return None
    info_el = page.xpath('//div[@id="starInfo"]/dl/dd[@class="info"]/span')
    sub_list = {
        u"别名": u'alias',
        u"性别": u'gender',
        u"地区": u'area',
        u"出生地": u'birthplace',
        u'生日': u'birthday',
        u'星座': u'constellation',
        u"血型": u'blood',
        u"职业": u'occupation'
    }
    info_list = [re.split(u':', parse_simple(x.text)) for x in info_el]
    for x in xrange(0, len(info_list)):
        # map the Chinese label to its English field name
        info_list[x][0] = sub_list[info_list[x][0]]
    info_data = {x[0]: x[1] for x in info_list}
    avatar = page.xpath(u'.//div[@class="box-avatar"]/img')
    if len(avatar) > 0:
        info_data['avatar'] = avatar[0].get("src")
    info_data['name'] = page.xpath('//div[@id="starInfo"]')[0].get('data-name')
    info_data['youku_starid'] = page.xpath('//div[@id="starInfo"]')[0].get('data-starid')
    m = re.search(u'\/(uid_\w*\d*=*)\.html', url)
    info_data['youku_uid'] = m.group(1) if m is not None else None
    info_data['created_at'] = int(time.time() * 1000)
    intro = page.xpath('//dd[@class="intro"]/span[@class="long noshow"]')
    info_data['youku_url'] = url
    if len(intro) > 0:
        info_data['intro'] = intro[0].text
    return info_data
def vdetail_parser(self, r, url=None):
    """Parse an iQiyi video detail page."""
    try:
        page = etree.HTML(r)
    except Exception as e:
        return False
    L = Contents()
    result_div = page.xpath(u'//div[@class="mod_search_topic"]')
    if result_div:
        result_div = page.xpath(u'//div[@class="mod_reuslt"]')
    # fall back to the whole document when the result container is missing
    result_div = result_div[0] if result_div else page
    title = result_div.xpath(u'//h1[@class="main_title"]/a')
    if len(title) > 0:
        L.title = title[0].text
    sub_title = result_div.xpath(u'//span[@class="sub_title"]')
    if len(sub_title) > 0:
        L.sub_title = sub_title[0].text
        if re.search(u'(\d{4})', L.sub_title):
            L.year = re.search(u'(\d{4})', L.sub_title).group(1)
    category = page.xpath(u'//a[@class="channelTag"]')
    if len(category) > 0:
        L.category = category[0].text
    area = result_div.xpath(u'//em[contains(text(),"地区")]/a')
    if len(area) > 0:
        L.area = area[0].text
    L.iqiyi_tvId = self.get_vid_by_url(url)
    language = result_div.xpath(u'//em[contains(text(),"语言")]/a')
    if len(language) > 0:
        L.language = language[0].text
    tags = result_div.xpath(u'//em[contains(text(),"类型")]/a')
    if len(tags) > 0:
        _temp = [x.text for x in tags]
        L.tags = ",".join(set(_temp))
    banben = result_div.xpath(u'//em[contains(text(),"版本")]/a')
    if len(banben) > 0:
        L.banben = banben[0].text
    summary = result_div.xpath(u'//em[contains(text(),"简介")]')
    if len(summary) > 0:
        L.summary = result_div.xpath(u'//em[contains(text(),"简介")]/following::text()[1]')[0]
        L.summary = parse_simple(L.summary)
    all_episode = result_div.xpath(u'//em[contains(text(),"集数")]/a')
    if len(all_episode) > 0:
        L.all_episode = self.parse_all_episode(all_episode[0].text)
    iqiyi_play_url = result_div.xpath(u'//a[contains(text(),"立即播放")]')
    if len(iqiyi_play_url) > 0:
        L.iqiyi_play_url = iqiyi_play_url[0].get("href")
    iqiyi_plays_num = result_div.xpath(u'//i[@id="widget-playcount"]')
    if len(iqiyi_plays_num) > 0:
        L.iqiyi_plays_num = iqiyi_plays_num[0].text
    peiyin = result_div.xpath(u'//em[contains(text(),"配音")]/a')
    if len(peiyin) > 0:
        L.peiyin_list = []
        _temp = []
        for x in peiyin:
            _temp.append(x.text)
            L.peiyin_list.append({
                "name": x.text,
                "iqiyi_url": x.get("href"),
                "iqiyi_id": self.get_starId_by_url(x.get("href"))
            })
        L.peiyin = ",".join(set(_temp))
    iqiyi_rating = page.xpath(u'//span[@class="score_font"]')
    if len(iqiyi_rating) > 0:
        L.iqiyi_rating = iqiyi_rating[0].get("snsscore")
    poster = page.xpath(u'//div[contains(@_stat,"result_pic")]/img')
    if len(poster) > 0:
        L.poster = {
            "url": poster[0].get("src"),
            "width": poster[0].get("width"),
            "height": poster[0].get("height"),
            "name": poster[0].get("alt")
        }
        L.img_url = L.poster['url']
    # directors / actors -- not parsed yet
    # actor_list = page.xpath(u'//ul[@class="actor_list cf"]/li/')
    # starring_list = []
    # starring = []
    # directors_list = []
    # directors = []
    return L.__dict__
def start_detail(self, r, url=None):
    '''Parse an iQiyi star (celebrity) information page.'''
    try:
        page = etree.HTML(r)
    except Exception as e:
        return False
    S = Star()
    S.iqiyi_url = url
    m = re.search(u'"circleId"\:(\d*)', r)  # paopao id
    if m:
        S.iqiyi_circleId = m.group(1)
    avatar = page.xpath(u'.//div[@class="result_pic"]/img')
    if len(avatar) > 0:
        S.avatar = avatar[0].get("src")
        S.name = avatar[0].get("alt")
    result_detail = page.xpath(u'.//div[@class="result_detail"]')
    if len(result_detail) > 0:
        name = result_detail[0].xpath(u'.//h1[@itemprop="name"]')
        if len(name) > 0:
            S.name = parse_simple(name[0].text)
    occupation = page.xpath(u'//span[contains(text(),"职业")]')
    if len(occupation) > 0:
        S.occupation = page.xpath(u'//span[contains(text(),"职业")]/following::text()[1]')[0]
        S.occupation = self.parse_occupation(S.occupation)
    date_birth = page.xpath(u'//span[contains(text(),"生日")]')
    if len(date_birth) > 0:
        S.date_birth = page.xpath(u'//span[contains(text(),"生日")]/following::text()[1]')[0]
        S.date_birth = parse_simple(self.parse_date_birth(S.date_birth))
    area = page.xpath(u'//span[contains(text(),"地区")]')
    if len(area) > 0:
        S.area = page.xpath(u'//span[contains(text(),"地区")]/following::text()[1]')[0]
        S.area = self.parse_area(S.area)
    body_weight = page.xpath(u'//span[contains(text(),"体重")]')
    if len(body_weight) > 0:
        S.body_weight = page.xpath(u'//span[contains(text(),"体重")]/following::text()[1]')[0]
        S.body_weight = self.parse_weight(S.body_weight)
    foreign_name = page.xpath(u'//dt[contains(text(),"外文名")]')
    if len(foreign_name) > 0:
        S.foreign_name = parse_simple(foreign_name[0].getnext().text)
    gender = page.xpath(u'//dt[contains(text(),"性别")]')
    if len(gender) > 0:
        S.gender = parse_simple(gender[0].getnext().text)
    body_height = page.xpath(u'//dt[contains(text(),"身高")]')
    if len(body_height) > 0:
        S.body_height = parse_simple(body_height[0].getnext().text)
    birthplace = page.xpath(u'//dt[contains(text(),"出生地")]')
    if len(birthplace) > 0:
        S.birthplace = parse_simple(birthplace[0].getnext().text)
    date_birth = page.xpath(u'//dt[contains(text(),"出生日期")]')
    if len(date_birth) > 0:
        S.date_birth = parse_simple(date_birth[0].getnext().text)
    graduated_school = page.xpath(u'//dt[contains(text(),"毕业院校")]')
    if len(graduated_school) > 0:
        S.graduated_school = parse_simple(graduated_school[0].getnext().text)
    famous_times = page.xpath(u'//dt[contains(text(),"成名年代")]')
    if len(famous_times) > 0:
        S.famous_times = parse_simple(famous_times[0].getnext().text)
    alias = page.xpath(u'//dt[contains(text(),"别名")]')
    if len(alias) > 0:
        S.alias = parse_simple(alias[0].getnext().text)
    blood = page.xpath(u'//dt[contains(text(),"血型")]')
    if len(blood) > 0:
        S.blood = parse_simple(blood[0].getnext().text)
    constellation = page.xpath(u'//dt[contains(text(),"星座")]')
    if len(constellation) > 0:
        S.constellation = parse_simple(constellation[0].getnext().text)
    current_residence = page.xpath(u'//dt[contains(text(),"现居地")]')
    if len(current_residence) > 0:
        S.current_residence = parse_simple(current_residence[0].getnext().text)
    agency = page.xpath(u'//dt[contains(text(),"经纪公司")]')
    if len(agency) > 0:
        S.agency = parse_simple(agency[0].getnext().text)
    hobbies = page.xpath(u'//dt[contains(text(),"爱好")]')
    if len(hobbies) > 0:
        S.hobbies = parse_simple(hobbies[0].getnext().text)
    intro = page.xpath(u'//p[@class="introduce-info"]')
    if len(intro):
        S.intro = "".join(page.xpath(u'//p[@class="introduce-info"]/text()'))
        S.intro = parse_simple(S.intro)
    if not S.name:
        return False
    S.created_at = time.time()
    return S.__dict__
def baike_parser(r, url=None):
    """Parse a Baidu Baike entry page."""
    try:
        # strip non-breaking spaces (&nbsp;) before parsing
        r = re.sub(u'\xa0', '', r)
        page = etree.HTML(r)
    except Exception as e:
        return False
    L = Contents()
    summary = page.xpath(u'//div[@class="lemmaWgt-lemmaSummary lemmaWgt-lemmaSummary-light"]')
    if len(summary) > 0:
        L.summary = summary[0].text
    title = page.xpath(u'//dt[contains(text(),"中文名")]')
    if len(title) > 0:
        L.title = parse_simple(title[0].getnext().text)
    foreign_title = page.xpath(u'//dt[contains(text(),"外文名")]')
    if len(foreign_title) > 0:
        L.foreign_title = parse_simple(foreign_title[0].getnext().text)
    production_company = page.xpath(u'//dt[contains(text(),"出品公司")]')
    if len(production_company) > 0:
        L.production_company = parse_simple(production_company[0].getnext().text)
    producer_country = page.xpath(u'//dt[contains(text(),"制片地区")]')
    if len(producer_country) > 0:
        L.producer_country = area_process(parse_simple(producer_country[0].getnext().text))
    directors_list = page.xpath(u'//dt[contains(text(),"导演")]')
    if len(directors_list) > 0:
        a_tag = directors_list[0].getnext().findall('a')
        if len(a_tag) > 0:
            L.directors_list = []
            directors = []
            for x in a_tag:
                L.directors_list.append({
                    "name": parse_simple(x.text),
                    "baike_id": x.get("data-lemmaid"),
                    "baike_url": u'https://baike.baidu.com' + x.get("href")
                })
                directors.append(parse_simple(x.text))
            L.directors = ",".join(set(directors))
        else:
            L.directors = area_process(parse_simple(directors_list[0].getnext().text))
    screenwriter_list = page.xpath(u'//dt[contains(text(),"编剧")]')
    if len(screenwriter_list) > 0:
        a_tag = screenwriter_list[0].getnext().findall('a')
        if len(a_tag) > 0:
            L.screenwriter_list = []
            screenwriters = []
            for x in a_tag:
                L.screenwriter_list.append({
                    "name": parse_simple(x.text),
                    "baike_id": x.get("data-lemmaid"),
                    "baike_url": u'https://baike.baidu.com' + x.get("href")
                })
                screenwriters.append(parse_simple(x.text))
            L.screenwriters = ",".join(set(screenwriters))
        else:
            L.screenwriters = area_process(parse_simple(screenwriter_list[0].getnext().text))
    starring_list = page.xpath(u'//dt[contains(text(),"主演")]')
    if len(starring_list) > 0:
        a_tag = starring_list[0].getnext().findall('a')
        if len(a_tag) > 0:
            L.starring_list = []
            starring = []
            for x in a_tag:
                L.starring_list.append({
                    "name": parse_simple(x.text),
                    "baike_id": x.get("data-lemmaid"),
                    "baike_url": u'https://baike.baidu.com' + x.get("href")
                })
                starring.append(parse_simple(x.text))
            L.starring = ",".join(set(starring))
        else:
            L.starring = area_process(parse_simple(starring_list[0].getnext().text))
    alias = page.xpath(u'//dt[contains(text(),"其它译名")]')
    if len(alias) > 0:
        a_tag = alias[0].getnext().findall("a")
        if len(a_tag) > 0:
            L.alias = ",".join([parse_simple(x.text) for x in a_tag if parse_simple(x.text)])
        else:
            L.alias = parse_simple(alias[0].getnext().text)
    types = page.xpath(u'//dt[contains(text(),"类型")]')
    if len(types) > 0:
        L.type = area_process(parse_simple(types[0].getnext().text))
    duration = page.xpath(u'//dt[contains(text(),"片长")]')
    if len(duration) > 0:
        L.duration = area_process(parse_simple(duration[0].getnext().text))
    release_date = page.xpath(u'//dt[contains(text(),"上映时间")]')
    if len(release_date) > 0:
        L.release_date = area_process(parse_simple(release_date[0].getnext().text))
    language = page.xpath(u'//dt[contains(text(),"语言")]')
    if len(language) > 0:
        L.language = language_process(parse_simple(language[0].getnext().text))
    douban_rating = page.xpath(u'//span[contains(@class,"star-text")]')
    if len(douban_rating) > 0:
        L.douban_rating = douban_rating[0].text
    poster = page.xpath(u'//img[@alt="词条图片"]')
    if len(poster) > 0:
        L.poster = [{"url": poster[0].get("src"), "name": poster[0].get("alt")}]
        L.img_url = poster[0].get("src")
    actor_list = page.xpath(u'//ul[@class="actorList"]/li')
    if len(actor_list) > 0:
        starring = L.starring.split(',')
        L.actor_list = []
        starring_list = []
        for x in actor_list:
            # use a relative path so each <li> yields its own info link
            info_a = x.xpath(u'.//dl[@class="info"]/a')[0]
            _temp = {
                "avatar": x.find('img').get("src"),
                "name": info_a.text,
                "baike_id": info_a.get("data-lemmaid"),
                "baidu_url": "https://baike.baidu.com" + info_a.get("href")
            }
            if _temp['name'] in starring:
                starring_list.append(_temp)
            else:
                L.actor_list.append(_temp)
        if starring_list:
            L.starring_list = starring_list
    L.created_at = time.time()
    return L.__dict__
def vdetail_parser(self, r):
    """Parse a Douban subject (movie/TV) detail page."""
    data = Contents()
    try:
        page = etree.HTML(r)
    except Exception as e:
        return False
    year = re.search(u'<span class="year">\((\d{4})\)</span>', r)
    if year:
        data.year = year.group(1)
    title = re.search(u'<span property="v\:itemreviewed">(.*)</span>', r)
    if title:
        data.title = title.group(1)
    # screenwriters
    bianju = page.xpath(u'//span[contains(text(),"编剧")]')
    if len(bianju) > 0:
        bianju_a = bianju[0].getnext()
        if bianju_a is not None:
            bianju_a = bianju_a.findall('a')
            data.screenwriter_list = []
            screenwriters = ''
            for x in bianju_a:
                screenwriters = screenwriters + parse_simple(x.text) + ","
                _temp = {}
                if re.search(u'/celebrity/(\d*)/', x.get("href")):
                    _temp["doubanid"] = re.search(u'/celebrity/(\d*)/', x.get("href")).group(1)
                if x.get("href"):
                    _temp["douban_url"] = "https://movie.douban.com" + x.get("href")
                _temp["name"] = parse_simple(x.text)
                data.screenwriter_list.append(_temp)
            data.screenwriters = screenwriters.strip(',')
    # directors
    directors_el = page.xpath(u'//span[contains(text(),"导演")]')
    if len(directors_el) > 0:
        directors_a = directors_el[0].getnext()
        if directors_a is not None:
            directors_a = directors_a.findall('a')
            data.directors_list = []
            directors = ""
            for x in directors_a:
                directors = directors + parse_simple(x.text) + ","
                _temp = {}
                if re.search(u'/celebrity/(\d*)/', x.get("href")):
                    _temp["doubanid"] = re.search(u'/celebrity/(\d*)/', x.get("href")).group(1)
                if x.get("href"):
                    _temp["douban_url"] = "https://movie.douban.com" + x.get("href")
                _temp["name"] = parse_simple(x.text)
                data.directors_list.append(_temp)
            data.directors = directors.strip(',')
    # starring
    starring_el = page.xpath(u'//span[contains(text(),"主演")]')
    if len(starring_el) > 0:
        starring_a = starring_el[0].getnext()
        if starring_a is not None:
            starring_a = starring_a.findall('a')
            data.starring_list = []
            starring = ""
            for x in starring_a:
                starring = starring + parse_simple(x.text) + ","
                _temp = {}
                if re.search(u'/celebrity/(\d*)/', x.get("href")):
                    _temp["doubanid"] = re.search(u'/celebrity/(\d*)/', x.get("href")).group(1)
                if x.get("href"):
                    _temp["douban_url"] = "https://movie.douban.com" + x.get("href")
                _temp["name"] = parse_simple(x.text)
                data.starring_list.append(_temp)
            starring = starring.strip(',')
            data.starring = starring
    # genres
    type_el = page.xpath(u'//span[@property="v:genre"]')
    mvtype = []
    if len(type_el) > 0:
        for x in type_el:
            mvtype.append(parse_simple(x.text))
    tags = page.xpath(u'//div[@class="tags-body"]/a')
    _temp = []
    for x in tags:
        _temp.append(parse_simple(x.text))
    _temp = _temp + mvtype
    data.tags = ",".join(set(_temp))
    producer_country_el = page.xpath(u'//span[contains(text(),"制片国家/地区:")]')
    if len(producer_country_el) > 0:
        producer_country = page.xpath(u'//span[contains(text(),"制片国家/地区:")]/following::text()[1]')[0]
        data.producer_country = area_process(split_space(producer_country.replace('/', ',')))
    language_el = page.xpath(u'//span[contains(text(),"语言:")]')
    if len(language_el) > 0:
        language = page.xpath(u'//span[contains(text(),"语言:")]/following::text()[1]')[0]
        data.language = language_process(split_space(language.replace('/', ',')))
    all_episode = page.xpath(u'//span[contains(text(),"集数:")]')
    if len(all_episode) > 0:
        all_episode = page.xpath(u'//span[contains(text(),"集数:")]/following::text()[1]')[0]
        m = re.search(u'(\d{1,})', all_episode.replace(" ", ""))
        if m:
            data.all_episode = m.group(1)
    episode_time = page.xpath(u'//span[contains(text(),"单集片长:")]')
    if len(episode_time) > 0:
        episode = page.xpath(u'//span[contains(text(),"单集片长:")]/following::text()[1]')[0]
        m = re.search(u'(\d{1,})', episode.replace(" ", ""))
        if m:
            data.duration = m.group(1)
    # season number
    season = page.xpath(u'//select[@id="season"]/option[@selected="selected"]')
    if len(season) > 0:
        data.season = season[0].text
    # first release / air date
    release_date_el = page.xpath(u'//span[@property="v:initialReleaseDate"]')
    if len(release_date_el) > 0:
        release_date = ""
        for x in release_date_el:
            release_date = release_date + parse_simple(x.text) + "|"
        release_date = release_date.strip('|')
        m = re.search(u'(\d{4}-\d{2}-\d{2})', release_date.replace(" ", ""))
        if m:
            data.release_date = m.group(1)
        else:
            data.release_date = release_date
    # running time
    duration_el = page.xpath(u'//span[@property="v:runtime"]')
    if len(duration_el) > 0:
        m = re.search(u'(\d{1,})', duration_el[0].text.replace(" ", ''))
        if m:
            data.duration = m.group(1)
    alias_al = page.xpath(u'//span[contains(text(),"又名:")]')
    if len(alias_al) > 0:
        alias = page.xpath(u'//span[contains(text(),"又名:")]/following::text()[1]')[0]
        data.alias = split_space(alias.replace('/', ','))
    IMDb_el = page.xpath(u'//span[contains(text(),"IMDb链接")]')
    if len(IMDb_el) > 0:
        data.IMDb = IMDb_el[0].getnext().get("href")
    rating = re.search(u'property="v\:average">(\d*\.\d*)</strong>', r)
    if rating:
        data.douban_rating = rating.group(1)
    rating_sum = page.xpath(u'//span[@property="v:votes"]')
    if len(rating_sum) > 0:
        data.douban_rating_sum = rating_sum[0].text
    summary_all = page.xpath(u'//span[@class="all hidden"]')
    summary = page.xpath(u'//span[@property="v:summary"]')
    if len(summary_all) > 0:
        data.summary = ''.join(page.xpath(u'//span[@class="all hidden"]/text()'))
        data.summary = parse_simple(data.summary)
    elif len(summary) > 0:
        data.summary = ''.join(page.xpath(u'//span[@property="v:summary"]/text()'))
        data.summary = parse_simple(data.summary)
    img_url = page.xpath(u'//img[@title="点击看更多海报"]')
    nbgnbg = page.xpath(u'//a[@title="点击看大图" and @class="nbgnbg"]')
    if len(img_url) > 0:
        data.img_url = img_url[0].get("src")
    elif len(nbgnbg) > 0:
        data.img_url = nbgnbg[0].get("href")
    # derive category from episode count and tags
    # (动漫=animation, 综艺=variety show, 电视剧=TV series, 短片=short film, 电影=film)
    if data.all_episode > 1 and (u"动漫" in data.tags or u"动画" in data.tags):
        data.category = u"动漫"
    elif data.all_episode > 1 and (u"综艺" in data.tags or u'真人秀' in data.tags):
        data.category = u'综艺'
    elif data.all_episode > 1:
        data.category = u"电视剧"
    elif u"动漫" in data.tags or u"动画" in data.tags:
        data.category = u'动漫'
    elif u"短片" in data.tags:
        data.category = u'短片'
    else:
        data.category = u'电影'
    m = re.search(u"SUBJECT_ID: *'(\d*)'", r)
    if m:
        data.doubanid = m.group(1)
    print(data.__dict__)  # debug output
    return data.__dict__
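
# Hypothetical usage sketch (not part of the original code): fetch a Douban
# subject page and feed the raw HTML to vdetail_parser above.  The spider
# class name `DoubanSpider` and the request headers are assumptions; only the
# vdetail_parser(r) signature and its dict return value come from this module.
#
#   import requests
#   spider = DoubanSpider()                    # assumed spider class
#   resp = requests.get('https://movie.douban.com/subject/1292052/',
#                       headers={'User-Agent': 'Mozilla/5.0'})
#   detail = spider.vdetail_parser(resp.text)
#   if detail:
#       print(detail['title'], detail.get('douban_rating'))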
def parse_star(self, r, url):
    """Parse a Douban celebrity page."""
    try:
        page = etree.HTML(r)
    except Exception as e:
        return False
    data = Star()
    name = page.xpath(u'//div[@id="content"]/h1')
    if len(name) > 0:
        data.name = parse_simple(name[0].text)
    else:
        return False
    m = re.search(u'celebrity/(\d*)/', url)
    if m:
        data.doubanid = m.group(1)
    imgUrl = page.xpath(u'//div[@class="pic"]/a[@class="nbg"]')
    if len(imgUrl) > 0:
        data.avatar = imgUrl[0].get("href")
    gender = page.xpath(u'//span[contains(text(),"性别")]/following::text()[1]')
    if len(gender):
        gender = re.sub('\n', '', gender[0])
        gender = gender.strip(':')
        data.gender = parse_simple(gender.strip(' '))
    constellation = page.xpath(u'//span[contains(text(),"星座")]/following::text()[1]')
    if len(constellation) > 0:
        constellation = re.sub('\n', '', constellation[0])
        constellation = constellation.strip(':')
        data.constellation = parse_simple(constellation.strip(' '))
    date_birth = page.xpath(u'//span[contains(text(),"出生日期")]/following::text()[1]')
    if len(date_birth) > 0:
        date_birth = re.sub('\n', '', date_birth[0])
        date_birth = date_birth.strip(':')
        data.date_birth = parse_simple(date_birth.strip(' '))
    birthplace = page.xpath(u'//span[contains(text(),"出生地")]/following::text()[1]')
    if len(birthplace) > 0:
        birthplace = re.sub(u'\n', "", birthplace[0])
        birthplace = birthplace.strip(':')
        data.birthplace = parse_simple(birthplace.strip(' '))
    occupation = page.xpath(u'//span[contains(text(),"职业")]/following::text()[1]')
    if len(occupation) > 0:
        occupation = re.sub('\n', '', occupation[0])
        occupation = occupation.strip(':')
        data.occupation = split_space(occupation.replace('/', ","))
    foreign_names = page.xpath(u'//span[contains(text(),"更多外文名")]/following::text()[1]')
    if len(foreign_names) > 0:
        foreign_names = re.sub('\n', '', foreign_names[0])
        foreign_names = foreign_names.strip(':')
        data.foreign_names = split_space(foreign_names.replace("/", ","))
    zh_names = page.xpath(u'//span[contains(text(),"更多中文名")]/following::text()[1]')
    if len(zh_names) > 0:
        zh_names = re.sub('\n', '', zh_names[0])
        zh_names = zh_names.strip('\n').strip(':')
        data.zh_names = split_space(zh_names.replace('/', ','))
    family_member = page.xpath(u'//span[contains(text(),"家庭成员")]/following::text()[1]')
    if len(family_member) > 0:
        family_member = re.sub('\n', '', family_member[0])
        family_member = family_member.strip(u':')
        data.family_member = split_space(family_member.replace('/', ','))
    imdb = page.xpath(u'//span[contains(text(),"imdb编号")]')
    if len(imdb) > 0:
        if imdb[0].getnext() is not None:
            data.IMDb = parse_simple(imdb[0].getnext().text)
    intro = page.xpath(u'//span[@class="all hidden"]/text()')
    _intro = page.xpath(u'//div[@id="intro"]/div[@class="bd"]/text()')
    if len(intro):
        data.intro = parse_simple("".join(intro))
    else:
        data.intro = parse_simple("".join(_intro))
    if not data.name:
        return False
    return data.__dict__
def parse_detail(self, r, url=None):
    """Parse a Youku show detail page."""
    try:
        page = etree.HTML(r)
    except Exception as e:
        return None
    data = Contents()
    sss = re.sub(u'\\n', '', r)
    v_show = page.xpath(u'//div[@class="p-post"]/div[@class="yk-pack p-list"]/div[@class="p-thumb"]/a')
    if len(v_show) > 0:
        data.youku_play = "http:" + v_show[0].get("href")
    # poster
    thumb = page.xpath(u'//div[@class="p-post"]/div[@class="yk-pack p-list"]/div[@class="p-thumb"]/img')
    if len(thumb) > 0:
        data.poster = [{
            "url": "http:" + thumb[0].get("src"),
            "title": thumb[0].get("alt"),
            "width": 200,
            "height": 300
        }]
        data.title = thumb[0].get("alt")
        data.img_url = "http:" + thumb[0].get("src")
    # category
    category = page.xpath('//div[@class="p-base"]/ul/li[@class="p-row p-title"]/a')
    if len(category) > 0:
        data.category = parse_simple(category[0].text)
    # year (may be missing)
    year = page.xpath('//div[@class="p-base"]/ul/li[@class="p-row p-title"]/span[@class="sub-title"]')
    if len(year) > 0 and year[0].text:
        m = re.search(u'(\d{4})', year[0].text)
        if m:
            data.year = m.group(1)
    # alias (may be missing)
    alias = page.xpath('//div[@class="p-base"]/ul/li[@class="p-alias"]')
    if len(alias) > 0:
        data.alias = split_space(alias[0].get("title").replace('/', ','))
    # release date (may be missing)
    published_at = re.search(u'>上映:</label>(\w+-\d+-\d+)*</span>', sss)
    if published_at is not None:
        data.release_date = published_at.group(1)
    # Youku rating (may be missing)
    youku_score = page.xpath('//div[@class="p-base"]/ul/li[@class="p-score"]/span[@class="star-num"]')
    if len(youku_score) > 0:
        data.youku_rating = parse_simple(youku_score[0].text)
    # Douban rating (may be missing)
    douban_score = re.search(u'<span class="db-bignum">(\d+\.\d*)</span>', sss)
    if douban_score is not None:
        data.douban_rating = douban_score.group(1)
    # Douban review count (may be missing)
    douban_cm_num = re.search(u'<span class="db-cm-num">(\d*)评价</span>', sss)
    if douban_cm_num is not None:
        data.douban_comment_sum = douban_cm_num.group(1)
    # starring (may be missing)
    actors = page.xpath('//div[@class="p-base"]/ul/li[@class="p-performer"]')
    if len(actors) > 0:
        data.starring = split_space(actors[0].get('title').replace("/", ","))
        data.starring_list = []
        for x in page.xpath('//div[@class="p-base"]/ul/li[@class="p-performer"]/a'):
            _temp = {}
            _temp["name"] = parse_simple(x.text)
            _temp["youkuid"] = re.search(u"//list\.youku\.com/star/show/(.*)\.html", etree.tostring(x)).group(1)
            if x.get("href"):
                _temp["youku_url"] = "http:" + x.get("href")
            data.starring_list.append(_temp)
    # episode count
    renew = page.xpath('//div[@class="p-base"]/ul/li[@class="p-row p-renew"]')
    if len(renew) > 0 and renew[0].text:
        m = re.search(u'(\d*)', renew[0].text)
        if m:
            data.all_episode = m.group(1)
    # directors
    directed = page.xpath(u'//div[@class="p-base"]/ul/li[contains(text(),"导演:")]/a')
    data.directors_list = []
    if len(directed) > 0:
        data.directors = []
        for x in directed:
            data.directors.append(parse_simple(x.text))
            _temp = {}
            _temp["name"] = parse_simple(x.text)
            _temp["youkuid"] = re.search(u"//list\.youku\.com/star/show/(.*)\.html", etree.tostring(x)).group(1)
            if x.get("href"):
                _temp["youku_url"] = x.get("href")
            data.directors_list.append(_temp)
        data.directors = ",".join(data.directors)
    # production area (may be missing)
    area = re.search(u'>地区:<a href="//list\.youku\.com/category/show/([^\.html]+?)\.html" target="_blank">([^</a></li>]+?)</a>', sss)
    if area is not None:
        data.producer_country = parse_simple(area.group(2))
    # genres
    types = page.xpath(u'//div[@class="p-base"]/ul/li[contains(text(),"类型")]/a')
    if len(types) > 0:
        data.tags = []
        for x in types:
            data.tags.append(parse_simple(x.text))
        data.tags = ",".join(data.tags)
    # total play count (may be missing)
    plays_num = re.search(u'<li>总播放数:([^</li>]+?)</li>', sss)
    if plays_num is not None:
        data.youku_plays_num = plays_num.group(1).replace(',', "")
    # comment count (may be missing)
    youku_comments_num = re.search(u'<li>评论:([^</li>]+?)</li>', sss)
    if youku_comments_num:
        data.youku_comments_num = youku_comments_num.group(1)
    # upvotes (may be missing)
    ding = re.search(u'<li>顶:([^</li>]+?)</li>', sss)
    if ding:
        data.ding = ding.group(1)
    # summary
    summary = page.xpath(u'.//span[contains(@class,"intro-more")]/text()')
    if summary:
        data.summary = parse_simple("".join(summary)).replace(u"简介:", "")
    # suitable age (may be missing)
    age = re.search(u'>适用年龄:([^</li>]+?)</li>', sss)
    if age:
        data.age = age.group(1)
    # voice cast
    peiyin = page.xpath(u'//div[@class="p-base"]/ul/li[contains(text(),"声优:")]/a')
    if len(peiyin) > 0:
        data.peiyin = []
        data.peiyin_list = []
        for x in peiyin:
            data.peiyin.append(parse_simple(x.text))
            _temp = {}
            _temp["name"] = parse_simple(x.text)
            _temp["youkuid"] = re.search(u"//list\.youku\.com/star/show/(.*)\.html", etree.tostring(x)).group(1)
            _temp['youku_url'] = "http:" + x.get("href")
            data.peiyin_list.append(_temp)
        data.peiyin = ",".join(data.peiyin)
    # presenters (variety shows)
    presenters = page.xpath(u'//div[@class="p-base"]/ul/li[contains(text(),"主持人:")]/a')
    if len(presenters) > 0:
        data.presenters = []
        data.presenters_list = []
        for x in presenters:
            data.presenters.append(parse_simple(x.text))
            _temp = {}
            _temp["name"] = parse_simple(x.text)
            _temp["youkuid"] = re.search(u"//list\.youku\.com/star/show/(.*)\.html", etree.tostring(x)).group(1)
            _temp['youku_url'] = "http:" + x.get("href")
            data.presenters_list.append(_temp)
        data.presenters = ",".join(data.presenters)
    if data.title is None:
        return None
    data.created_at = time.time()
    return data.__dict__
def vdetail_parser(self, r):
    """Parse a Tencent Video (v.qq.com) detail page."""
    try:
        page = etree.HTML(r)
    except Exception as e:
        return False
    L = Contents()
    title = page.xpath(u'//a[@_stat="info:title"]')
    m = re.search(u'\{"id":"(\w*\d*)"', r)
    if m:
        L.qq_id = m.group(1)
    m = re.search(u'&vid=(\w*\d*)&', r)
    if m:
        L.qq_vid = m.group(1)
    if len(title) > 0:
        L.title = title[0].text
    category = page.xpath(u'//span[@class="type"]')
    if len(category) > 0:
        L.category = category[0].text
    area = page.xpath(u'.//span[contains(text(),"地 区:")]')
    if len(area) > 0:
        L.area = area_process(area[0].getnext().text)
    foreign_title = page.xpath(u'//span[@class="title_en"]')
    if len(foreign_title) > 0:
        L.foreign_title = foreign_title[0].text
    qq_play = page.xpath(u'.//a[@_stat="info:playbtn"]')
    if len(qq_play) > 0:
        L.qq_play = qq_play[0].get("href")
    language = page.xpath(u'//span[contains(text(),"语 言:")]')
    if len(language) > 0:
        L.language = language_process(language[0].getnext().text)
    year = page.xpath(u'.//span[contains(text(),"上映时间")]')
    if len(year) > 0 and year[0].getnext().text:
        m = re.search(u'(\d{4})', year[0].getnext().text)
        if m:
            L.year = m.group(1)
    all_episode = page.xpath(u'//span[contains(text(),"总集数:")]')
    if len(all_episode) > 0:
        L.all_episode = all_episode[0].getnext().text
    release_date = page.xpath(u'//span[contains(text(),"出品时间:")]')
    if len(release_date) > 0:
        L.release_date = release_date[0].getnext().text
        if L.release_date and L.year is None:
            try:
                m = re.search(u'(\d{4})', L.release_date)
                if m:
                    L.year = m.group(1)
            except Exception as e:
                pass
    year = page.xpath(u'.//span[contains(text(),"首播时间")]')
    if len(year) > 0 and L.year is None:
        m = re.search(u'(\d{4})', year[0].getnext().text)
        if m:
            L.year = m.group(1)
    alias = page.xpath(u'//span[contains(text(),"别 名")]')
    if len(alias) > 0:
        L.alias = alias[0].getnext().text
    tags = page.xpath(u'//a[@class="tag"]')
    if len(tags) > 0:
        _temp = [x.text for x in tags]
        L.tags = ",".join(set(_temp))
    summary = page.xpath(u'//span[@class="desc_txt"]/span[@class="txt _desc_txt_lineHight"]')
    if len(summary) > 0:
        L.summary = parse_simple(summary[0].text)
    qq_rating = page.xpath(u'//div[@class="score_v"]/span[@class="score"]')
    if len(qq_rating) > 0:
        L.qq_rating = qq_rating[0].text
    douban_rating = page.xpath(u'//a[@class="score_db"]/span[@class="score"]')
    if len(douban_rating) > 0:
        L.douban_rating = douban_rating[0].text
    poster = page.xpath(u'//img[@_stat="info:poster"]')
    if len(poster) > 0:
        L.poster = []
        if poster[0].get("src"):
            L.poster.append({
                "url": self.parse_imgurl(poster[0].get("src")),
                "name": poster[0].get("alt")
            })
            L.img_url = self.parse_imgurl(poster[0].get("src"))
    # directors and cast
    actor_list = page.xpath(u'//ul[contains(@class,"actor_list")]/li')
    starring_list = []
    starring = []
    directors_list = []
    directors = []
    if len(actor_list) > 0:
        _temp = []
        for actor in actor_list:
            _dic = {}
            actor_avatar = actor.find(u'a')
            if actor_avatar is not None:
                if actor_avatar.find('img') is not None:
                    _dic["avatar"] = self.parse_imgurl(actor_avatar.find('img').get("src"))
                _dic["qq_id"] = actor.get("data-id")
                if actor.find("span") is not None:
                    _dic["name"] = actor.find("span").text
                _dic["qq_home_page"] = actor_avatar.get("href")
                actor_detail = actor.xpath(u'.//div[@class="actor_detail"]')
                if actor_detail:
                    # occupation
                    occupation = actor_detail[0].xpath(u'.//span[contains(text(),"职业")]')
                    if occupation:
                        _dic['occupation'] = occupation[0].getnext().text
                    # area
                    area = actor_detail[0].xpath(u'.//span[contains(text(),"地区")]')
                    if len(area) > 0:
                        _dic['area'] = area[0].getnext().text
                    # introduction
                    intro = actor.xpath(u'.//span[@itemprop="description"]')
                    if intro:
                        _dic["intro"] = intro[0].text
                # director
                if actor_avatar.xpath(u'.//span[@class="director"]'):
                    directors_list.append(_dic)
                    directors.append(_dic['name'])
                else:
                    # actor
                    starring_list.append(_dic)
                    starring.append(_dic['name'])
    if starring_list:
        L.starring = ','.join(starring)
        L.starring_list = starring_list
    if directors_list:
        L.directors = ','.join(directors)
        L.directors_list = directors_list
    if L.title is None:
        return False
    L.created_at = time.time()
    return L.__dict__
def parse_star(self, r, url):
    """Parse a Tencent Video star (celebrity) page."""
    data = Star()
    try:
        page = etree.HTML(r)
    except Exception as e:
        return False
    intro = page.xpath(u'.//div[@class="wiki_content"]/text()')
    if intro:
        data.intro = parse_simple("".join(intro))
    name = page.xpath(u'.//span[contains(text(),"中文名")]')
    if len(name) > 0:
        data.name = parse_simple(name[0].getnext().text)
    date_birth = page.xpath(u'.//span[contains(text(),"出生日期")]')
    if len(date_birth) > 0:
        data.date_birth = parse_simple(date_birth[0].getnext().text)
        m = re.search(u'(\d{4}).{1}(\d{2}).{1}(\d{2})', data.date_birth)
        if m:
            data.date_birth = "-".join(m.groups())
    foreign_names = page.xpath(u'.//span[contains(text(),"外文名")]')
    if len(foreign_names) > 0:
        data.foreign_names = parse_simple(foreign_names[0].getnext().text)
    occupation = page.xpath(u'.//span[contains(text(),"职 业")]')
    if len(occupation) > 0:
        data.occupation = parse_simple(occupation[0].getnext().text)
    alias = page.xpath(u'.//span[contains(text(),"别 名")]')
    if len(alias) > 0:
        data.alias = parse_simple(alias[0].getnext().text)
    hobbies = page.xpath(u'.//span[contains(text(),"爱 好")]')
    if len(hobbies) > 0:
        data.hobbies = parse_simple(hobbies[0].getnext().text)
    area = page.xpath(u'.//span[contains(text(),"地 区")]')
    if len(area) > 0:
        data.area = parse_simple(area[0].getnext().text)
    constellation = page.xpath(u'.//span[contains(text(),"星 座")]')
    if len(constellation) > 0:
        data.constellation = parse_simple(constellation[0].getnext().text)
    blood = page.xpath(u'.//span[contains(text(),"血 型")]')
    if len(blood) > 0:
        data.blood = parse_simple(blood[0].getnext().text)
    body_height = page.xpath(u'.//span[contains(text(),"身 高")]')
    if len(body_height) > 0:
        data.body_height = parse_simple(body_height[0].getnext().text)
    body_weight = page.xpath(u'.//span[contains(text(),"体 重")]')
    if len(body_weight) > 0:
        data.body_weight = parse_simple(body_weight[0].getnext().text)
    birthplace = page.xpath(u'.//span[contains(text(),"出生地")]')
    if len(birthplace) > 0:
        data.birthplace = parse_simple(birthplace[0].getnext().text)
    avatar = page.xpath(u'.//div[@class="star_pic"]/img')
    if len(avatar) > 0:
        data.avatar = "http:" + avatar[0].get("src")
    data.qq_home_page = url
    if not data.name:
        return False
    data.created_at = time.time()
    return data.__dict__