Example #1
 def parse_photos(self, r, id):
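     # Parse a Douban photo-list page: collect each photo's id, name,
     # dimensions and image URLs, plus the "next page" link if present.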
     data = []
     page = etree.HTML(r)
     lis = page.xpath(
         u'//div[@class="article"]/ul[@class="poster-col3 clearfix"]/li')
     if len(lis) > 0:
         for x in lis:
             temp = {}
             temp['doubanvid'] = id
             temp['doubanid'] = x.get('data-id')
             name_el = x.find('div[@class="name"]')
             prop_el = x.find('div[@class="prop"]')
             cover = x.find('div[@class="cover"]/a')
             if name_el != None:
                 temp['name'] = name_el.text.replace(u'\n', '')
                 temp['name'] = parse_simple(temp['name'])
             if prop_el != None:
                 temp['prop'] = prop_el.text.replace(u'\n', '')
                 temp['prop'] = temp['prop'].strip(u' ')
                 m = re.search(u'(\d{1,})x(\d{1,})',
                               temp['prop'].replace(" ", ""))
                 if m:
                     temp['width'] = m.group(1)
                     temp['height'] = m.group(2)
             if cover != None:
                 temp['photos_page'] = cover.get("href")
                 temp['url'] = cover.find('img').get("src")
                 temp['url'] = re.sub(u'/photo/m/', '/photo/l/',
                                      temp['url'])
             data.append(temp)
     nextpage = page.xpath(u'//a[contains(text(),"后页")]')
     if len(nextpage) > 0:
         return {"data": data, "next": nextpage[0].get("href")}
     return {"data": data}
Example #2
    def parse_star_show(self, r, url):
        """
        解析明星主页
        return:dic
        """
        try:
            # r.decode('utf-8')
            page = etree.HTML(r)
        except Exception as e:
            return None
        info_el = page.xpath('//div[@id="starInfo"]/dl/dd[@class="info"]/span')
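        # Map the Chinese field labels shown on the page to English keys.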
        sub_list = {
            u"别名": u'alias',
            u"性别": u'gender',
            u"地区": u'area',
            u"出生地": u'birthplace',
            u'生日': u'birthday',
            u'星座': u'constellation',
            u"血型": u'blood',
            u"职业": u'occupation'
        }
        info_list = [re.split(u':', parse_simple(x.text)) for x in info_el]
        for x in xrange(0, len(info_list)):
            # Replace the Chinese label with its English key.
            info_list[x][0] = sub_list[info_list[x][0]]
        info_data = {x[0]: x[1] for x in info_list}
        avatar = page.xpath(u'.//div[@class="box-avatar"]/img')
        if len(avatar) > 0:
            info_data['avatar'] = avatar[0].get("src")
        # info_data is a dict, so use item assignment (not attribute access).
        info_data['name'] = page.xpath('//div[@id="starInfo"]')[0].get(
            'data-name')
        info_data['youku_starid'] = page.xpath('//div[@id="starInfo"]')[0].get(
            'data-starid')
        m = re.search(u'\/(uid_\w*\d*=*)\.html', url)
        info_data['youku_uid'] = m.group(1) if m is not None else None
        info_data['created_at'] = int(time.time() * 1000)
        intro = page.xpath('//dd[@class="intro"]/span[@class="long noshow"]')
        info_data['youku_url'] = url
        if len(intro) > 0:
            info_data['intro'] = intro[0].text

        return info_data
Example #3
    def vdetail_parser(self, r, url=None):
        """视频详情页面"""
        try:
            page = etree.HTML(r)
        except Exception as e:
            return False
        L = Contents()
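        # Contents() is used as a plain attribute container; the parsed
        # fields are returned as a dict via L.__dict__ at the end.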
        result_div = page.xpath(u'//div[@class="mod_search_topic"]')
        if result_div:
            result_div = page.xpath(u'//div[@class="mod_reuslt"]')
        # xpath() returns a list; use the first matching container, and fall
        # back to the whole page so the lookups below still work.
        result_div = result_div[0] if result_div else page

        title = result_div.xpath(u'//h1[@class="main_title"]/a')
        if len(title) > 0:
            L.title = title[0].text

        sub_title = result_div.xpath(u'//span[@class="sub_title"]')
        if len(sub_title) > 0:
            L.sub_title = sub_title[0].text
            if re.search(u'(\d{4})', L.sub_title):
                L.year = re.search(u'(\d{4})', L.sub_title).group(1)

        category = page.xpath(u'//a[@class="channelTag"]')
        if len(category) > 0:
            L.category = category[0].text

        area = result_div.xpath(u'//em[contains(text(),"地区")]/a')
        if len(area) > 0:
            L.area = area[0].text

        L.iqiyi_tvId = self.get_vid_by_url(url)

        language = result_div.xpath(u'//em[contains(text(),"语言")]/a')
        if len(language) > 0:
            L.language = language[0].text

        tags = result_div.xpath(u'//em[contains(text(),"类型")]/a')
        if len(tags) > 0:
            _temp = [x.text for x in tags]
            L.tags = ",".join(set(_temp))

        banben = result_div.xpath(u'//em[contains(text(),"版本")]/a')
        if len(banben) > 0:
            L.banben = banben[0].text

        summary = result_div.xpath(u'//em[contains(text(),"简介")]')
        if len(summary) > 0:
            L.summary = result_div.xpath(
                u'//em[contains(text(),"简介")]/following::text()[1]')[0]
            L.summary = parse_simple(L.summary)

        all_episode = result_div.xpath(u'//em[contains(text(),"集数")]/a')
        if len(all_episode) > 0:
            L.all_episode = self.parse_all_episode(all_episode[0].text)

        iqiyi_play_url = result_div.xpath(u'//a[contains(text(),"立即播放")]')
        if len(iqiyi_play_url) > 0:
            L.iqiyi_play_url = iqiyi_play_url[0].get("href")

        iqiyi_plays_num = result_div.xpath(u'//i[@id="widget-playcount"]')
        if len(iqiyi_plays_num) > 0:
            L.iqiyi_plays_num = iqiyi_plays_num[0].text

        peiyin = result_div.xpath(u'//em[contains(text(),"配音")]/a')
        if len(peiyin) > 0:
            L.peiyin_list = []
            _temp = []
            for x in peiyin:
                _temp.append(x.text)
                L.peiyin_list.append({"name": x.text, "iqiyi_url": x.get(
                    "href"), "iqiyi_id": self.get_starId_by_url(x.get("href"))})
            L.peiyin = ",".join(set(_temp))

        iqiyi_rating = page.xpath(u'//span[@class="score_font"]')
        if len(iqiyi_rating) > 0:
            L.iqiyi_rating = iqiyi_rating[0].get("snsscore")

        poster = page.xpath(u'//div[contains(@_stat,"result_pic")]/img')
        if len(poster) > 0:
            L.poster = {"url": poster[0].get("src"), "width": poster[0].get(
                "width"), "height": poster[0].get("height"), "name": poster[0].get("alt")}
            L.img_url = L.poster['url']

        # Directors and actors (not parsed here)
        # actor_list = page.xpath(u'//ul[@class="actor_list cf"]/li/')
        # starring_list = []
        # starring = []
        # directors_list = []
        # directors = []
        return L.__dict__
Example #4
    def start_detail(self, r, url=None):
        '''Parse celebrity (star) information from an iQiyi star page.'''
        try:
            page = etree.HTML(r)
        except Exception as e:
            return False
        S = Star()
        S.iqiyi_url = url
        m = re.search(u'"circleId"\:(\d*)', r)  # paopao id
        if m:
            S.iqiyi_circleId = m.group(1)
        avatar = page.xpath(u'.//div[@class="result_pic"]/img')
        if len(avatar) > 0:
            S.avatar = avatar[0].get("src")
            S.name = avatar[0].get("alt")
        result_detail = page.xpath(u'.//div[@class="result_detail"]')
        if len(result_detail) > 0:
            name = result_detail[0].xpath(u'.//h1[@itemprop="name"]')
            if len(name) > 0:
                S.name = parse_simple(name[0].text)
        occupation = page.xpath(u'//span[contains(text(),"职业")]')
        if len(occupation) > 0:
            S.occupation = page.xpath(
                u'//span[contains(text(),"职业")]/following::text()[1]')[0]
            S.occupation = self.parse_occupation(S.occupation)

        date_birth = page.xpath(u'//span[contains(text(),"生日")]')
        if len(date_birth) > 0:
            S.date_birth = page.xpath(
                u'//span[contains(text(),"生日")]/following::text()[1]')[0]
            S.date_birth = parse_simple(self.parse_date_birth(S.date_birth))

        area = page.xpath(u'//span[contains(text(),"地区")]')
        if len(area) > 0:
            S.area = page.xpath(
                u'//span[contains(text(),"地区")]/following::text()[1]')[0]
            S.area = self.parse_area(S.area)

        body_weight = page.xpath(u'//span[contains(text(),"体重")]')
        if len(body_weight) > 0:
            S.body_weight = page.xpath(
                u'//span[contains(text(),"体重")]/following::text()[1]')[0]
            S.body_weight = self.parse_weight(S.body_weight)

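        # The remaining fields come from <dt>/<dd> pairs: each <dt> label's
        # value is read from its next sibling element.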
        foreign_name = page.xpath(u'//dt[contains(text(),"外文名")]')
        if len(foreign_name) > 0:
            S.foreign_name = parse_simple(foreign_name[0].getnext().text)

        gender = page.xpath(u'//dt[contains(text(),"性别")]')
        if len(gender) > 0:
            S.gender = parse_simple(gender[0].getnext().text)

        body_height = page.xpath(u'//dt[contains(text(),"身高")]')
        if len(body_height) > 0:
            S.body_height = parse_simple(body_height[0].getnext().text)

        birthplace = page.xpath(u'//dt[contains(text(),"出生地")]')
        if len(birthplace) > 0:
            S.birthplace = parse_simple(birthplace[0].getnext().text)

        date_birth = page.xpath(u'//dt[contains(text(),"出生日期")]')
        if len(date_birth) > 0:
            S.date_birth = parse_simple(date_birth[0].getnext().text)

        graduated_school = page.xpath(u'//dt[contains(text(),"毕业院校")]')
        if len(graduated_school) > 0:
            S.graduated_school = parse_simple(
                graduated_school[0].getnext().text)

        famous_times = page.xpath(u'//dt[contains(text(),"成名年代")]')
        if len(famous_times) > 0:
            S.famous_times = parse_simple(famous_times[0].getnext().text)

        alias = page.xpath(u'//dt[contains(text(),"别名")]')
        if len(alias) > 0:
            S.alias = parse_simple(alias[0].getnext().text)

        blood = page.xpath(u'//dt[contains(text(),"血型")]')
        if len(blood) > 0:
            S.blood = parse_simple(blood[0].getnext().text)

        constellation = page.xpath(u'//dt[contains(text(),"星座")]')
        if len(constellation) > 0:
            S.constellation = parse_simple(constellation[0].getnext().text)

        current_residence = page.xpath(u'//dt[contains(text(),"现居地")]')
        if len(current_residence) > 0:
            S.current_residence = parse_simple(
                current_residence[0].getnext().text)

        agency = page.xpath(u'//dt[contains(text(),"经纪公司")]')
        if len(agency) > 0:
            S.agency = parse_simple(agency[0].getnext().text)

        hobbies = page.xpath(u'//dt[contains(text(),"爱好")]')
        if len(hobbies) > 0:
            S.hobbies = parse_simple(hobbies[0].getnext().text)

        intro = page.xpath(u'//p[@class="introduce-info"]')
        if len(intro):
            S.intro = "".join(page.xpath(u'//p[@class="introduce-info"]/text()'))
            S.intro = parse_simple(S.intro)

        if not S.name:
            return False
        S.created_at = time.time()
        return S.__dict__
Example #5
	def baike_parser(r,url=None):
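		# Parse a Baidu Baike (baike.baidu.com) entry for a film or TV work
		# into Contents fields; labels are read from <dt>/<dd> pairs.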
		try:
			r = re.sub(u' ','',r)
			page = etree.HTML(r)
		except Exception as e:
			return False
		L = Contents()
		summary = page.xpath(u'//div[@class="lemmaWgt-lemmaSummary lemmaWgt-lemmaSummary-light"]')
		if len(summary) > 0:
			L.summary = summary[0].text

		title = page.xpath(u'//dt[contains(text(),"中文名")]')
		if len(title) > 0:
			L.title = parse_simple(title[0].getnext().text)

		foreign_title = page.xpath(u'//dt[contains(text(),"外文名")]')
		if len(foreign_title) > 0:
			L.foreign_title = parse_simple(foreign_title[0].getnext().text)

		production_company = page.xpath(u'//dt[contains(text(),"出品公司")]')
		if len(production_company) > 0:
			L.production_company = parse_simple(production_company[0].getnext().text)

		producer_country = page.xpath(u'//dt[contains(text(),"制片地区")]')
		if len(producer_country) > 0:
			L.producer_country = area_process(parse_simple(producer_country[0].getnext().text))

		directors_list = page.xpath(u'//dt[contains(text(),"导演")]')
		if len(directors_list) > 0:
			a_tag = directors_list[0].getnext().findall('a')
			if len(a_tag) > 0:
				L.directors_list = []
				directors = []
				for x in a_tag:
					L.directors_list.append({"name":parse_simple(x.text),"baike_id":x.get("data-lemmaid"),"baike_url":u'https://baike.baidu.com'+x.get("href")})
					directors.append(parse_simple(x.text))
				L.directors = ",".join(set(directors))
			else:
				L.directors = area_process(parse_simple(directors_list[0].getnext().text))

		screenwriter_list = page.xpath(u'//dt[contains(text(),"编剧")]')
		if len(screenwriter_list) > 0:
			a_tag = screenwriter_list[0].getnext().findall('a')
			if len(a_tag) > 0:
				L.screenwriter_list = []
				screenwriters = []
				for x in a_tag:
					L.screenwriter_list.append({"name":parse_simple(x.text),"baike_id":x.get("data-lemmaid"),"baike_url":u'https://baike.baidu.com'+x.get("href")})
					screenwriters.append(parse_simple(x.text))
				L.screenwriters = ",".join(set(screenwriters))
			else:
				L.screenwriters = area_process(parse_simple(screenwriter_list[0].getnext().text))

		starring_list = page.xpath(u'//dt[contains(text(),"主演")]')
		if len(starring_list) > 0:
			a_tag = starring_list[0].getnext().findall('a')
			if len(a_tag) > 0:
				L.starring_list = []
				starring = []
				for x in a_tag:
					L.starring_list.append({"name":parse_simple(x.text),"baike_id":x.get("data-lemmaid"),"baike_url":u'https://baike.baidu.com'+x.get("href")})
					starring.append(parse_simple(x.text))
				L.starring = ",".join(set(starring))
			else:
				L.starring = area_process(parse_simple(starring_list[0].getnext().text))

		alias = page.xpath(u'//dt[contains(text(),"其它译名")]')
		if len(alias) > 0:
			a_tag = alias[0].getnext().findall("a")
			if len(a_tag) > 0:
				L.alias = ",".join([parse_simple(x.text) for x in a_tag if parse_simple(x.text)])
			else:
				L.alias = parse_simple(alias[0].getnext().text)

		types = page.xpath(u'//dt[contains(text(),"类型")]')
		if len(types) > 0:
			L.type = area_process(parse_simple(types[0].getnext().text))

		duration = page.xpath(u'//dt[contains(text(),"片长")]')
		if len(duration) > 0:
			L.duration = area_process(parse_simple(duration[0].getnext().text))

		release_date = page.xpath(u'//dt[contains(text(),"上映时间")]')
		if len(release_date) > 0:
			L.release_date = area_process(parse_simple(release_date[0].getnext().text))

		release_date = page.xpath(u'//dt[contains(text(),"语言")]')
		if len(release_date) > 0:
			L.language = language_process(parse_simple(release_date[0].getnext().text))

		douban_rating = page.xpath(u'//span[contains(@class,"star-text")]')
		if len(douban_rating) > 0:
			L.douban_rating = douban_rating[0].text

		poster = page.xpath(u'//img[@alt="词条图片"]')
		if len(poster) > 0:
			L.poster = [{"url":poster[0].get("src"),"name":poster[0].get("alt")}]
			L.img_url = poster[0].get("src")

		actor_list = page.xpath(u'//ul[@class="actorList"]/li')
		if len(actor_list) > 0:
			starring = L.starring.split(',')
			L.actor_list = []
			starring_list = []
			for x in actor_list:
				_temp = {"avatar":x.find('img').get("src"),"name":x.xpath(u'//dl[@class="info"]/a')[0].text,"baike_id":x.xpath(u'//dl[@class="info"]/a')[0].get("data-lemmaid"),"baidu_url":"https://baike.baidu.com"+x.xpath(u'//dl[@class="info"]/a')[0].get("href")}
				if _temp['name'] in starring:
					starring_list.append(_temp)
				else:
					L.actor_list.append(_temp)
			if starring_list:
				L.starring_list = starring_list

		L.created_at = time.time()
		return L.__dict__
Example #6
    def vdetail_parser(self, r):
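        # Parse a Douban movie/TV subject page (movie.douban.com) into
        # Contents fields; the category is inferred from tags and episode count.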
        data = Contents()
        try:
            page = etree.HTML(r)
        except Exception as e:
            return False
        year = re.search(u'<span class="year">\((\d{4})\)</span>', r)
        if year:
            data.year = year.group(1)
        title = re.search(u'<span property="v\:itemreviewed">(.*)</span>', r)
        if title:
            data.title = title.group(1)
        bianju = page.xpath(u'//span[contains(text(),"编剧")]')
        if len(bianju) > 0:
            bianju_a = bianju[0].getnext()
            if bianju_a is not None:
                bianju_a = bianju_a.findall('a')
                data.screenwriter_list = []
                screenwriters = ''
                for x in bianju_a:
                    screenwriters = screenwriters + parse_simple(x.text) + ","
                    _temp = {}
                    if re.search(u'/celebrity/(\d*)/', x.get("href")):
                        _temp["doubanid"] = re.search(u'/celebrity/(\d*)/',
                                                      x.get("href")).group(1)
                    else:
                        # doubanid = x.get("href")
                        pass
                    if x.get("href"):
                        _temp[
                            "douban_url"] = "https://movie.douban.com" + x.get(
                                "href")
                    _temp["name"] = parse_simple(x.text)
                    data.screenwriter_list.append(_temp)
                data.screenwriters = screenwriters.strip(',')

        directors_el = page.xpath(u'//span[contains(text(),"导演")]')
        if len(directors_el) > 0:
            directors_a = directors_el[0].getnext()
            if directors_a is not None:
                directors_a = directors_a.findall('a')
                data.directors_list = []
                directors = ""
                for x in directors_a:
                    directors = directors + parse_simple(x.text) + ","
                    _temp = {}
                    if re.search(u'/celebrity/(\d*)/', x.get("href")):
                        _temp["doubanid"] = re.search(u'/celebrity/(\d*)/',
                                                      x.get("href")).group(1)
                    else:
                        # doubanid = x.get("href")
                        pass
                    if x.get("href"):
                        _temp[
                            "douban_url"] = "https://movie.douban.com" + x.get(
                                "href")
                    _temp["name"] = parse_simple(x.text)
                    data.directors_list.append(_temp)
                data.directors = directors.strip(',')

        starring_el = page.xpath(u'//span[contains(text(),"主演")]')
        if len(starring_el) > 0:
            starring_a = starring_el[0].getnext()
            if starring_a is not None:
                starring_a = starring_a.findall('a')
                data.starring_list = []
                starring = ""
                for x in starring_a:
                    starring = starring + parse_simple(x.text) + ","
                    _temp = {}
                    if re.search(u'/celebrity/(\d*)/', x.get("href")):
                        _temp["doubanid"] = re.search(u'/celebrity/(\d*)/',
                                                      x.get("href")).group(1)
                    else:
                        # doubanid = x.get("href")
                        pass
                    if x.get("href"):
                        _temp[
                            "douban_url"] = "https://movie.douban.com" + x.get(
                                "href")
                    _temp["name"] = parse_simple(x.text)
                    data.starring_list.append(_temp)
                starring = starring.strip(',')
                data.starring = starring
        type_el = page.xpath(u'//span[@property="v:genre"]')  # genre
        mvtype = []
        if len(type_el) > 0:
            for x in type_el:
                mvtype.append(parse_simple(x.text))

        tags = page.xpath(u'//div[@class="tags-body"]/a')
        _temp = []
        for x in tags:
            _temp.append(parse_simple(x.text))
        _temp = _temp + mvtype
        data.tags = ",".join(set(_temp))

        producer_country_el = page.xpath(
            u'//span[contains(text(),"制片国家/地区:")]')
        if len(producer_country_el) > 0:
            producer_country = page.xpath(
                u'//span[contains(text(),"制片国家/地区:")]/following::text()[1]')[0]
            data.producer_country = area_process(
                split_space(producer_country.replace('/', ',')))

        language_el = page.xpath(u'//span[contains(text(),"语言:")]')
        if len(language_el) > 0:
            language = page.xpath(
                u'//span[contains(text(),"语言:")]/following::text()[1]')[0]
            data.language = language_process(
                split_space(language.replace('/', ',')))

        all_episode = page.xpath(u'//span[contains(text(),"集数:")]')
        if len(all_episode) > 0:
            all_episode = page.xpath(
                u'//span[contains(text(),"集数:")]/following::text()[1]')[0]
            m = re.search(u'(\d{1,})', all_episode.replace(" ", ""))
            if m:
                data.all_episode = m.group(1)

        episode_time = page.xpath(u'//span[contains(text(),"单集片长:")]')
        if len(episode_time) > 0:
            episode = page.xpath(
                u'//span[contains(text(),"单集片长:")]/following::text()[1]')[0]
            m = re.search(u'(\d{1,})', episode.replace(" ", ""))
            if m:
                data.duration = m.group(1)

        season = page.xpath(
            u'//select[@id="season"]/option[@selected="selected"]')  # season number
        if len(season) > 0:
            data.season = season[0].text

        release_date_el = page.xpath(
            u'//span[@property="v:initialReleaseDate"]')  #首播
        if len(release_date_el) > 0:
            release_date = ""
            for x in release_date_el:
                release_date = release_date + parse_simple(x.text) + "|"
            release_date = release_date.strip('|')
            m = re.search(u'(\d{4}-\d{2}-\d{2})',
                          release_date.replace(" ", ""))
            if m:
                data.release_date = m.group(1)
            else:
                data.release_date = release_date
        duration_el = page.xpath(u'//span[@property="v:runtime"]')
        if len(duration_el) > 0:
            m = re.search(u'(\d{1,})', duration_el[0].text.replace(" ", ''))
            if m:
                data.duration = m.group(1)  # runtime

        alias_al = page.xpath(u'//span[contains(text(),"又名:")]')
        if len(alias_al) > 0:
            alias = page.xpath(
                u'//span[contains(text(),"又名:")]/following::text()[1]')[0]
            data.alias = split_space(alias.replace('/', ','))

        IMDb_el = page.xpath(u'//span[contains(text(),"IMDb链接")]')
        if len(IMDb_el) > 0:
            data.IMDb = IMDb_el[0].getnext().get("href")

        rating = re.search(u'property="v\:average">(\d*\.\d*)</strong>', r)
        if rating:
            data.douban_rating = rating.group(1)

        rating_sum = page.xpath(u'//span[@property="v:votes"]')
        if len(rating_sum) > 0:
            data.douban_rating_sum = rating_sum[0].text

        summary_all = page.xpath(u'//span[@class="all hidden"]')
        summary = page.xpath(u'//span[@property="v:summary"]')
        if len(summary_all) > 0:
            data.summary = ''.join(
                page.xpath(u'//span[@class="all hidden"]/text()'))
            data.summary = parse_simple(data.summary)
        elif len(summary) > 0:
            data.summary = ''.join(
                page.xpath(u'//span[@property="v:summary"]/text()'))
            data.summary = parse_simple(data.summary)

        img_url = page.xpath(u'//img[@title="点击看更多海报"]')
        nbgnbg = page.xpath(u'//a[@title="点击看大图" and @class="nbgnbg"]')
        if len(img_url) > 0:
            data.img_url = page.xpath(u'//img[@title="点击看更多海报"]')[0].get("src")
        elif len(nbgnbg) > 0:
            data.img_url = nbgnbg[0].get("href")

        if data.all_episode > 1 and (u"动漫" in data.tags or u"动画" in data.tags):
            data.category = u"动漫"
        elif data.all_episode > 1 and (u"综艺" in data.tags
                                       or u'真人秀' in data.tags):
            data.category = u'综艺'
        elif data.all_episode > 1:
            data.category = u"电视剧"
        elif u"动漫" in data.tags or u"动画" in data.tags:
            data.category = u'动漫'
        elif u"短片" in data.tags:
            data.category = u'短片'
        else:
            data.category = u'电影'

        m = re.search(u"SUBJECT_ID: *'(\d*)'", r)
        if m:
            data.doubanid = m.group(1)

        # Debug output
        print(data.__dict__)
        return data.__dict__
Example #7
    def parse_star(self, r, url):
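        # Parse a Douban celebrity page (/celebrity/<id>/) into Star fields.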
        try:
            page = etree.HTML(r)
        except Exception as e:
            return False
        data = Star()
        name = page.xpath(u'//div[@id="content"]/h1')
        if len(name) > 0:
            data.name = parse_simple(name[0].text)
        else:
            return False
        m = re.search(u'celebrity/(\d*)/', url)
        if m:
            data.doubanid = m.group(1)
        imgUrl = page.xpath(u'//div[@class="pic"]/a[@class="nbg"]')
        if len(imgUrl) > 0:
            data.avatar = imgUrl[0].get("href")
        gender = page.xpath(
            u'//span[contains(text(),"性别")]/following::text()[1]')
        if len(gender):
            gender = re.sub('\n', '', gender[0])
            gender = gender.strip(':')
            data.gender = parse_simple(gender.strip(' '))

        constellation = page.xpath(
            u'//span[contains(text(),"星座")]/following::text()[1]')
        if len(constellation) > 0:
            constellation = re.sub('\n', '', constellation[0])
            constellation = constellation.strip(':')
            data.constellation = parse_simple(constellation.strip(' '))

        date_birth = page.xpath(
            u'//span[contains(text(),"出生日期")]/following::text()[1]')
        if len(date_birth) > 0:
            date_birth = re.sub('\n', '', date_birth[0])
            date_birth = date_birth.strip(':')
            data.date_birth = parse_simple(date_birth.strip(' '))

        birthplace = page.xpath(
            u'//span[contains(text(),"出生地")]/following::text()[1]')
        if len(birthplace) > 0:
            birthplace = re.sub(u'\n', "", birthplace[0])
            birthplace = birthplace.strip(':')
            data.birthplace = parse_simple(birthplace.strip(' '))

        occupation = page.xpath(
            u'//span[contains(text(),"职业")]/following::text()[1]')
        if len(occupation) > 0:
            occupation = re.sub('\n', '', occupation[0])
            occupation = occupation.strip(':')
            data.occupation = split_space(occupation.replace('/', ","))

        foreign_names = page.xpath(
            u'//span[contains(text(),"更多外文名")]/following::text()[1]')
        if len(foreign_names) > 0:
            foreign_names = re.sub('\n', '', foreign_names[0])
            foreign_names = foreign_names.strip(':')
            data.foreign_names = split_space(foreign_names.replace("/", ","))

        zh_names = page.xpath(
            u'//span[contains(text(),"更多中文名")]/following::text()[1]')
        if len(zh_names) > 0:
            zh_names = re.sub('\n', '', zh_names[0])
            zh_names = zh_names.strip('\n').strip(':')
            data.zh_names = split_space(zh_names.replace('/', ','))

        family_member = page.xpath(
            u'//span[contains(text(),"家庭成员")]/following::text()[1]')
        if len(family_member) > 0:
            family_member = re.sub('\n', '', family_member[0])
            family_member = family_member.strip(u':')
            data.family_member = split_space(family_member.replace('/', ','))

        imdb = page.xpath(u'//span[contains(text(),"imdb编号")]')
        if len(imdb) > 0:
            if imdb[0].getnext() is not None:
                data.IMDb = parse_simple(imdb[0].getnext().text)

        intro = page.xpath(u'//span[@class="all hidden"]/text()')
        _intro = page.xpath(u'//div[@id="intro"]/div[@class="bd"]/text()')
        if len(intro):
            data.intro = parse_simple("".join(intro))
        else:
            data.intro = parse_simple("".join(_intro))
        if not data.name:
            return False
        return data.__dict__
Example #8
    def parse_detail(self, r, url=None):
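        # Parse a Youku show detail page into Contents fields; several values
        # are scraped with regexes from the raw HTML ("sss").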
        try:
            page = etree.HTML(r)
        except Exception as e:
            return None
        data = Contents()
        sss = re.sub(u'\\n', '', r)
        v_show = page.xpath(
            u'//div[@class="p-post"]/div[@class="yk-pack p-list"]/div[@class="p-thumb"]/a'
        )
        if len(v_show) > 0:
            data.youku_play = "http:" + v_show[0].get("href")
        # Poster
        thumb = page.xpath(
            u'//div[@class="p-post"]/div[@class="yk-pack p-list"]/div[@class="p-thumb"]/img'
        )
        if len(thumb) > 0:
            data.poster = [{
                "url": "http:" + thumb[0].get("src"),
                "title": thumb[0].get("alt"),
                "width": 200,
                "height": 300
            }]
            data.title = thumb[0].get("alt")
            data.img_url = "http:" + thumb[0].get("src")
        # category:
        category = page.xpath(
            '//div[@class="p-base"]/ul/li[@class="p-row p-title"]/a')
        if len(category) > 0:
            data.category = parse_simple(category[0].text)
        # Year: may be absent
        year = page.xpath(
            '//div[@class="p-base"]/ul/li[@class="p-row p-title"]/span[@class="sub-title"]'
        )
        if len(year) > 0 and year[0].text:
            m = re.search(u'(\d{4})', year[0].text)
            if m:
                data.year = m.group(1)
        # Alias: may be absent
        alias = page.xpath('//div[@class="p-base"]/ul/li[@class="p-alias"]')
        if len(alias) > 0:
            data.alias = split_space(alias[0].get("title").replace('/', ','))
        # Release date: may be absent
        published_at = re.search(u'>上映:</label>(\w+-\d+-\d+)*</span>', sss)
        if published_at != None:
            data.release_date = published_at.group(1)
        # Youku rating: may be absent
        youku_score = page.xpath(
            '//div[@class="p-base"]/ul/li[@class="p-score"]/span[@class="star-num"]'
        )
        if len(youku_score) > 0:
            data.youku_rating = parse_simple(youku_score[0].text)
        # Douban rating: may be absent
        douban_score = re.search(u'<span class="db-bignum">(\d+\.\d*)</span>',
                                 sss)
        if douban_score != None:
            data.douban_rating = douban_score.group(1)
        # Douban review count: may be absent
        douban_cm_num = re.search(u'<span class="db-cm-num">(\d*)评价</span>',
                                  sss)
        if douban_cm_num != None:
            data.douban_comment_sum = douban_cm_num.group(1)
        # Main cast: may be absent
        actors = page.xpath(
            '//div[@class="p-base"]/ul/li[@class="p-performer"]')
        if len(actors) > 0:
            data.starring = split_space(actors[0].get('title').replace(
                "/", ","))
            data.starring_list = []
            for x in page.xpath(
                    '//div[@class="p-base"]/ul/li[@class="p-performer"]/a'):
                _temp = {}
                _temp["name"] = parse_simple(x.text)
                _temp["youkuid"] = re.search(
                    u"//list\.youku\.com/star/show/(.*)\.html",
                    etree.tostring(x)).group(1)
                if x.get("href"):
                    _temp["youku_url"] = "http:" + x.get("href")
                data.starring_list.append(_temp)

        # Episode count
        renew = page.xpath(
            '//div[@class="p-base"]/ul/li[@class="p-row p-renew"]')
        if len(renew) > 0 and renew[0].text:
            m = re.search(u'(\d+)', renew[0].text)
            if m:
                data.all_episode = m.group(1)
        # Directors: iterate over the links
        directed = page.xpath(
            u'//div[@class="p-base"]/ul/li[contains(text(),"导演:")]/a')
        data.directors_list = []
        if len(directed) > 0:
            directors = []
            for x in directed:
                directors.append(parse_simple(x.text))
                _temp = {}
                _temp["name"] = parse_simple(x.text)
                _temp["youkuid"] = re.search(
                    u"//list\.youku\.com/star/show/(.*)\.html",
                    etree.tostring(x)).group(1)
                if x.get("href"):
                    _temp["youku_url"] = x.get("href")
                data.directors_list.append(_temp)
            # Join the collected names once the loop is done.
            data.directors = ",".join(directors)
        # Region: may be absent
        area = re.search(
            u'>地区:<a href="//list\.youku\.com/category/show/([^\.html]+?)\.html" target="_blank">([^</a></li>]+?)</a>',
            sss)
        if area != None:
            data.producer_country = parse_simple(area.group(2))
        # Genres: iterate over the links
        types = page.xpath(
            u'//div[@class="p-base"]/ul/li[contains(text(),"类型")]/a')
        if len(types) > 0:
            data.tags = []
            for x in types:
                data.tags.append(parse_simple(x.text))
            data.tags = ",".join(data.tags)
        # Total play count: may be None
        plays_num = re.search(u'<li>总播放数:([^</li>]+?)</li>', sss)
        if plays_num != None:
            data.youku_plays_num = plays_num.group(1).replace(',', "")
        # Comment count: may be None
        youku_comments_num = re.search(u'<li>评论:([^</li>]+?)</li>', sss)
        if youku_comments_num:
            data.youku_comments_num = youku_comments_num.group(1)
        # Upvote ("顶") count: may be empty
        ding = re.search(u'<li>顶:([^</li>]+?)</li>', sss)
        if ding:
            data.ding = ding.group(1)
        # Summary
        summary = page.xpath(u'.//span[contains(@class,"intro-more")]/text()')
        if summary:
            data.summary = parse_simple("".join(summary)).replace(u"简介:", "")
        # Suitable age: may be empty
        age = re.search(u'>适用年龄:([^</li>]+?)</li>', sss)
        if age:
            data.age = age.group(1)
        # Voice actors
        peiyin = page.xpath(
            u'//div[@class="p-base"]/ul/li[contains(text(),"声优:")]/a')
        if len(peiyin) > 0:
            data.peiyin = []
            data.peiyin_list = []
            for x in peiyin:
                data.peiyin.append(parse_simple(x.text))
                _temp = {}
                _temp["name"] = parse_simple(x.text)
                _temp["youkuid"] = re.search(
                    u"//list\.youku\.com/star/show/(.*)\.html",
                    etree.tostring(x)).group(1)
                _temp['youku_url'] = "http:" + x.get("href")
                data.peiyin_list.append(_temp)
            data.peiyin = ",".join(data.peiyin)
        # Hosts (present for variety shows)
        presenters = page.xpath(
            u'//div[@class="p-base"]/ul/li[contains(text(),"主持人:")]/a')
        if len(presenters) > 0:
            data.presenters = []
            data.presenters_list = []
            for x in presenters:
                data.presenters.append(parse_simple(x.text))
                _temp = {}
                _temp["name"] = parse_simple(x.text)
                _temp["youkuid"] = re.search(
                    u"//list\.youku\.com/star/show/(.*)\.html",
                    etree.tostring(x)).group(1)
                _temp['youku_url'] = "http:" + x.get("href")
                data.presenters_list.append(_temp)
            data.presenters = ",".join(data.presenters)
        if data.title is None:
            return None
        data.created_at = time.time()

        return data.__dict__
Example #9
    def vdetail_parser(self, r):
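        # Parse a Tencent Video (QQ) detail page into Contents fields.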
        try:
            page = etree.HTML(r)
        except Exception as e:
            return False
        L = Contents()
        title = page.xpath(u'//a[@_stat="info:title"]')
        m = re.search(u'\{"id":"(\w*\d*)"', r)
        if m:
            L.qq_id = m.group(1)
        m = re.search(u'&vid=(\w*\d*)&', r)
        if m:
            L.qq_vid = m.group(1)
        if len(title) > 0:
            L.title = title[0].text

        category = page.xpath(u'//span[@class="type"]')
        if len(category) > 0:
            L.category = category[0].text

        area = page.xpath(u'.//span[contains(text(),"地 区:")]')
        if len(area) > 0:
            L.area = area_process(area[0].getnext().text)

        foreign_title = page.xpath(u'//span[@class="title_en"]')
        if len(foreign_title) > 0:
            L.foreign_title = foreign_title[0].text

        qq_play = page.xpath(u'.//a[@_stat="info:playbtn"]')
        if len(qq_play) > 0:
            L.qq_play = qq_play[0].get("href")

        language = page.xpath(u'//span[contains(text(),"语 言:")]')
        if len(language) > 0:
            L.language = language_process(language[0].getnext().text)

        year = page.xpath(u'.//span[contains(text(),"上映时间")]')
        if len(year) > 0 and year[0].getnext().text:
            m = re.search(u'(\d{4})', year[0].getnext().text)
            if m:
                L.year = m.group(1)

        all_episode = page.xpath(u'//span[contains(text(),"总集数:")]')
        if len(all_episode) > 0:
            L.all_episode = all_episode[0].getnext().text

        release_date = page.xpath(u'//span[contains(text(),"出品时间:")]')
        if len(release_date) > 0:
            L.release_date = release_date[0].getnext().text
            if L.release_date and L.year == None:
                try:
                    m = re.search(u'(\d{4})', L.release_date)
                    if m:
                        L.year = m.group(1)
                except Exception as e:
                    pass
        year = page.xpath(u'.//span[contains(text(),"首播时间")]')
        if len(year) > 0 and L.year == None:
            m = re.search(u'(\d{4})', year[0].getnext().text)
            if m:
                L.year = m.group(1)

        alias = page.xpath(u'//span[contains(text(),"别 名")]')
        if len(alias) > 0:
            L.alias = alias[0].getnext().text

        tags = page.xpath(u'//a[@class="tag"]')
        if len(tags) > 0:
            _temp = [x.text for x in tags]
            L.tags = ",".join(set(_temp))

        summary = page.xpath(
            u'//span[@class="desc_txt"]/span[@class="txt _desc_txt_lineHight"]'
        )
        if len(summary) > 0:
            L.summary = parse_simple(summary[0].text)

        qq_rating = page.xpath(u'//div[@class="score_v"]/span[@class="score"]')
        if len(qq_rating) > 0:
            L.qq_rating = qq_rating[0].text

        douban_rating = page.xpath(
            u'//a[@class="score_db"]/span[@class="score"]')
        if len(douban_rating) > 0:
            L.douban_rating = douban_rating[0].text

        poster = page.xpath(u'//img[@_stat="info:poster"]')
        if len(poster) > 0:
            L.poster = []
            if poster[0].get("src"):
                L.poster.append({
                    "url": self.parse_imgurl(poster[0].get("src")),
                    "name": poster[0].get("alt")
                })
                L.img_url = self.parse_imgurl(poster[0].get("src"))

        # Directors and actors
        actor_list = page.xpath(u'//ul[contains(@class,"actor_list")]/li')
        starring_list = []
        starring = []
        directors_list = []
        directors = []
        if len(actor_list) > 0:
            _temp = []
            for actor in actor_list:
                _dic = {}
                actor_avatar = actor.find(u'a')
                if actor_avatar is not None:
                    if actor_avatar.find('img') is not None:
                        _dic["avatar"] = self.parse_imgurl(
                            actor_avatar.find('img').get("src"))
                    _dic["qq_id"] = actor.get("data-id")
                    if actor.find("span") is not None:
                        _dic["name"] = actor.find("span").text
                    _dic["qq_home_page"] = actor_avatar.get("href")
                    actor_detail = actor.xpath(
                        u'.//div[@class="actor_detail"]')
                    if actor_detail:
                        # Occupation
                        occupation = actor_detail[0].xpath(
                            u'.//span[contains(text(),"职业")]')
                        if occupation:
                            _dic['occupation'] = occupation[0].getnext().text

                        # Area
                        area = actor_detail[0].xpath(
                            u'.//span[contains(text(),"地区")]')
                        if len(area) > 0:
                            _dic['area'] = area[0].getnext().text

                    # Intro
                    intro = actor.xpath(u'.//span[@itemprop="description"]')
                    if intro:
                        _dic["intro"] = intro[0].text
                    # Director
                    if actor_avatar.xpath(u'.//span[@class="director"]'):
                        directors_list.append(_dic)
                        directors.append(_dic['name'])
                    else:
                        # Actor
                        starring_list.append(_dic)
                        starring.append(_dic['name'])
        if starring_list:
            L.starring = ','.join(starring)
            L.starring_list = starring_list
        if directors_list:
            L.directors = ','.join(directors)
            L.directors_list = directors_list

        if L.title == None:
            return False
        L.created_at = time.time()
        return L.__dict__
Example #10
    def parse_star(self, r, url):
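        # Parse a Tencent Video star wiki page into Star fields; most values
        # come from <span> labels followed by their sibling elements.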
        data = Star()
        try:
            page = etree.HTML(r)
        except Exception as e:
            return False
        intro = page.xpath(u'.//div[@class="wiki_content"]/text()')
        if intro:
            data.intro = parse_simple("".join(intro))
        name = page.xpath(u'.//span[contains(text(),"中文名")]')
        if len(name) > 0:
            data.name = parse_simple(name[0].getnext().text)

        date_birth = page.xpath(u'.//span[contains(text(),"出生日期")]')
        if len(date_birth) > 0:
            data.date_birth = parse_simple(date_birth[0].getnext().text)
            m = re.search(u'(\d{4}).{1}(\d{2}).{1}(\d{2})', data.date_birth)
            if m:
                data.date_birth = "-".join(m.groups())

        foreign_names = page.xpath(u'.//span[contains(text(),"外文名")]')
        if len(foreign_names) > 0:
            data.foreign_names = parse_simple(foreign_names[0].getnext().text)

        occupation = page.xpath(u'.//span[contains(text(),"职  业")]')
        if len(occupation) > 0:
            data.occupation = parse_simple(occupation[0].getnext().text)

        alias = page.xpath(u'.//span[contains(text(),"别 名")]')
        if len(alias) > 0:
            data.alias = parse_simple(alias[0].getnext().text)

        hobbies = page.xpath(u'.//span[contains(text(),"爱  好")]')
        if len(hobbies) > 0:
            data.hobbies = parse_simple(hobbies[0].getnext().text)

        area = page.xpath(u'.//span[contains(text(),"地 区")]')
        if len(area) > 0:
            data.area = parse_simple(area[0].getnext().text)

        constellation = page.xpath(u'.//span[contains(text(),"星 座")]')
        if len(constellation) > 0:
            data.constellation = parse_simple(constellation[0].getnext().text)

        blood = page.xpath(u'.//span[contains(text(),"血 型")]')
        if len(blood) > 0:
            data.blood = parse_simple(blood[0].getnext().text)

        body_height = page.xpath(u'.//span[contains(text(),"身 高")]')
        if len(body_height) > 0:
            data.body_height = parse_simple(body_height[0].getnext().text)

        body_weight = page.xpath(u'.//span[contains(text(),"体 重")]')
        if len(body_weight) > 0:
            data.body_weight = parse_simple(body_weight[0].getnext().text)

        birthplace = page.xpath(u'.//span[contains(text(),"出生地")]')
        if len(birthplace) > 0:
            data.birthplace = parse_simple(birthplace[0].getnext().text)

        avatar = page.xpath(u'.//div[@class="star_pic"]/img')
        if len(avatar) > 0:
            data.avatar = "http:" + avatar[0].get("src")

        data.qq_home_page = url
        if not data.name:
            return False
        data.created_at = time.time()
        return data.__dict__