Example #1
def sim_content(data):
    sim_regx = {}
    if data.get("directors") and data.get("directors") != "":
        sim_regx["directors"] = re.compile(
            u"(" + parse_regx_char(data["directors"]) + ")", re.IGNORECASE)
    # if data.get("starring") and data.get("starring")!="":
    #     sim_regx['starring'] = re.compile(u"("+data["starring"]+")",re.IGNORECASE)
    if len(sim_regx) == 0:
        """如果主演导演都没有的话,就不去搜索了"""
        return None
    if data.get("area"):
        sim_regx['area'] = re.compile(
            u"(" + "|".join(area_process(data.get("area"))) + ")",
            re.IGNORECASE)
    regx_name = search_preprocess(data.get("title"))
    regx_name = parse_regx_char(regx_name)
    regx_name = u'.*' + regx_name.replace(u'-', '.*') + ".*"
    regx_name = re.compile(regx_name, re.IGNORECASE)
    sim_regx['title'] = regx_name  # match titles against the generated pattern
    contents = db.contents.find(sim_regx)
    print('---%s------%s' % (regx_name, data.get("title")))
    if contents.count() > 0:
        poster = db.posters.find({"content_id": str(contents[0]['_id'])})
        if poster.count() > 0:
            p = []
            for x in poster:
                print(x['_id'], data['_id'])
                print(
                    "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
                )
                del x['_id']
                x['content_id'] = data['_id']
                p.append(x)
            return p
    else:
        return None
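A hypothetical invocation sketch: the dict fields mirror what sim_content reads, the values are placeholders, and db, re, and the helpers (parse_regx_char, area_process, search_preprocess) are assumed to be provided by the surrounding module.

doc = {
    "_id": "000000000000000000000000",  # placeholder id
    "title": u"example-title",
    "directors": u"Example Director",
    "area": u"example-area",
}
posters = sim_content(doc)  # list of poster dicts re-keyed to doc["_id"], or None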
Example #2
    def merge_fields(self, info):
        L = Contents()
        L.title = info.get("name")
        L.summary = info.get("description")
        L.iqiyi_tvId = info.get("tvId")
        L.iqiyi_vid = info.get("vid")
        L.iqiyi_plays_num = info.get("playCount")
        L.iqiyi_albumId = info.get("albumId")
        L.iqiyi_play_url = info.get("url")
        if info.get("duration") and info.get("duration") != "":
            L.duration = info.get("duration")/60
        L.poster = []
        if info.get("albumImageUrl"):
            L.img_url = info.get("albumImageUrl")
            L.poster.append({"url": info.get("albumImageUrl")})
        if info.get("imageUrl"):
            L.poster.append({"url": info.get("imageUrl")})
        if info.get("videoImageUrl"):
            L.poster.append({"url": info.get("videoImageUrl")})
        if info.get("posterUrl"):
            L.poster.append({"url": info.get("posterUrl")})
        if info.get("tvImageUrl"):
            L.poster.append({"url": info.get("tvImageUrl")})
        if info.get("qualityImageUrl"):
            L.poster.append({"url": info.get("qualityImageUrl")})
        if info["issueTime"]:
            L.release_date = mictime_to_ymd(info["issueTime"])
        if info.get("crumbList"):
            level2 = True
            for x in info.get("crumbList"):
            	if int(x["level"])==2 and x["title"]!=u'VIP会员':
            		L.category = x["title"]
            		level2 = False
            if level2:
            	for x in info.get("crumbList"):
            		if int(x["level"])==3:
            			L.category = x["title"]
            			level2 = False
        _temp = []
        for x in info.get("categories"):
            if u"地区" in x.get("subName"):
                L.area = area_process(x.get("name"))
            elif u"类型" in x.get("subName") or u'风格' in x.get("subName") or u'分类' in x.get("subName") or u'小学' in x.get("subName") or u'高中' in x.get("subName") or u'短片' in x.get("subName"):
                _temp.append(x.get("name"))
            elif u"语种" in x.get("subName"):
                L.language = language_process(x.get("name"))
            elif x.get("subName") == u"年龄段":
                L.age = x.get("name")
        L.tags = ",".join(_temp)
        L.all_episode = info.get("videoCount")
        L.sub_title = info.get("subtitle")
        L.iqiyi_rating_num = info.get("commentCount")
        L.iqiyi_qitanId = info.get("qitanId")
        if info.get("cast") and info.get("cast").get("directors"):
            L.directors = []
            L.directors_list = []
            for x in info.get("cast").get("directors"):
                L.directors.append(x.get("name"))
                L.directors_list.append({"name": x.get("name"), "iqiyi_id": x.get("id"), "avatar": x.get(
                    "imageUrl"), "iqiyi_userId": x.get("userId"), "iqiyi_circleId": x.get("circleId")})
            L.directors = ",".join(L.directors)

        if info.get("cast") and info.get("cast").get("speakers"):
            L.speakers = []
            L.speakers_list = []
            for x in info.get("cast").get("speakers"):
                L.speakers.append(x.get("name"))
                L.speakers_list.append({"name": x.get("name"), "iqiyi_id": x.get("id"), "avatar": x.get(
                    "imageUrl"), "iqiyi_userId": x.get("userId"), "iqiyi_circleId": x.get("circleId")})
            L.speakers = ",".join(L.speakers)

        if info.get("cast") and info.get("cast").get("publishers"):
            L.publishers = []
            L.publishers_list = []
            for x in info.get("cast").get("publishers"):
                L.publishers.append(x.get("name"))
                L.publishers_list.append({"name": x.get("name"), "iqiyi_id": x.get("id"), "avatar": x.get(
                    "imageUrl"), "iqiyi_userId": x.get("userId"), "iqiyi_circleId": x.get("circleId")})
            L.publishers = ",".join(L.publishers)

        if info.get("cast") and info.get("cast").get("singers"):
            L.singers = []
            L.singers_list = []
            for x in info.get("cast").get("singers"):
                L.singers.append(x.get("name"))
                L.singers_list.append({"name": x.get("name"), "iqiyi_id": x.get("id"), "avatar": x.get(
                    "imageUrl"), "iqiyi_userId": x.get("userId"), "iqiyi_circleId": x.get("circleId")})
            L.singers = ",".join(L.singers)

        if info.get("cast") and info.get("cast").get("mainActors"):
            L.starring = []
            L.starring_list = []
            for x in info.get("cast").get("mainActors"):
                L.starring.append(x.get("name"))
                L.starring_list.append({"name": x.get("name"), "iqiyi_id": x.get("id"), "avatar": x.get(
                    "imageUrl"), "iqiyi_userId": x.get("userId"), "iqiyi_circleId": x.get("circleId")})
            L.starring = ",".join(L.starring)

        """编剧"""
        if info.get("cast") and info.get("cast").get("writers"):
            L.screenwriters = []
            L.screenwriter_list = []
            for x in info.get("cast").get("writers"):
                L.screenwriters.append(x.get("name"))
                L.screenwriter_list.append({"name": x.get("name"), "iqiyi_id": x.get("id"), "avatar": x.get(
                    "imageUrl"), "iqiyi_userId": x.get("userId"), "iqiyi_circleId": x.get("circleId")})
            L.screenwriters = ",".join(L.screenwriters)

        if info.get("cast") and info.get("cast").get("actors"):
            L.actors = []
            L.actors_list = []
            for x in info.get("cast").get("actors"):
                L.actors.append(x.get("name"))
                L.actors_list.append({"name": x.get("name"), "iqiyi_id": x.get("id"), "avatar": x.get(
                    "imageUrl"), "iqiyi_userId": x.get("userId"), "iqiyi_circleId": x.get("circleId")})
            L.actors = ",".join(L.actors)

        """嘉宾"""
        if info.get("cast") and info.get("cast").get("guests"):
            L.guests = []
            L.guests_list = []
            for x in info.get("cast").get("guests"):
                L.guests.append(x.get("name"))
                L.guests_list.append({"name": x.get("name"), "iqiyi_id": x.get("id"), "avatar": x.get(
                    "imageUrl"), "iqiyi_userId": x.get("userId"), "iqiyi_circleId": x.get("circleId")})
            L.guests = ",".join(L.guests)

        if info.get("cast") and info.get("cast").get("hosts"):
            L.hosts = []
            L.hosts_list = []
            for x in info.get("cast").get("hosts"):
                L.hosts.append(x.get("name"))
                L.hosts_list.append({"name": x.get("name"), "iqiyi_id": x.get("id"), "avatar": x.get(
                    "imageUrl"), "iqiyi_userId": x.get("userId"), "iqiyi_circleId": x.get("circleId")})
            L.hosts = ",".join(L.hosts)

        if L.release_date and L.year is None:
            m = re.search(r'(\d{4})', L.release_date)
            if m:
                L.year = m.group(1)

        if not L.directors and not L.directors_list and L.hosts:
            L.directors = L.hosts
            L.directors_list = L.hosts_list

        L.focuses = info.get("focuses")
        L.iqiyi_rating = info.get("score")
        L.created_at = time.time()

        return L.__dict__
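The nine cast blocks above differ only in the source key and the target attribute names; a minimal refactor sketch (assuming Contents attributes are plain instance attributes, as merge_fields treats them):

def set_cast(L, cast, key, names_attr, list_attr):
    """Collapse one cast group into an 'a,b,c' string plus a detail list (sketch)."""
    people = (cast or {}).get(key) or []
    if not people:
        return
    details = [{"name": x.get("name"), "iqiyi_id": x.get("id"),
                "avatar": x.get("imageUrl"), "iqiyi_userId": x.get("userId"),
                "iqiyi_circleId": x.get("circleId")} for x in people]
    setattr(L, names_attr, ",".join(x.get("name") or "" for x in people))
    setattr(L, list_attr, details)

# e.g. set_cast(L, info.get("cast"), "directors", "directors", "directors_list")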
Example #3
	def baike_parser(r,url=None):
		try:
			r = re.sub(u'\xa0', u'', r)  # strip non-breaking spaces before parsing
			page = etree.HTML(r)
		except Exception as e:
			return False
		L = Contents()
		summary = page.xpath(u'//div[@class="lemmaWgt-lemmaSummary lemmaWgt-lemmaSummary-light"]')
		if len(summary) > 0:
			L.summary = summary[0].text

		title = page.xpath(u'//dt[contains(text(),"中文名")]')
		if len(title) > 0:
			L.title = parse_simple(title[0].getnext().text)

		foreign_title = page.xpath(u'//dt[contains(text(),"外文名")]')
		if len(foreign_title) > 0:
			L.foreign_title = parse_simple(foreign_title[0].getnext().text)

		production_company = page.xpath(u'//dt[contains(text(),"出品公司")]')
		if len(production_company) > 0:
			L.production_company = parse_simple(production_company[0].getnext().text)

		producer_country = page.xpath(u'//dt[contains(text(),"制片地区")]')
		if len(producer_country) > 0:
			L.producer_country = area_process(parse_simple(producer_country[0].getnext().text))

		directors_list = page.xpath(u'//dt[contains(text(),"导演")]')
		if len(directors_list) > 0:
			a_tag = directors_list[0].getnext().findall('a')
			if len(a_tag) > 0:
				L.directors_list = []
				directors = []
				for x in a_tag:
					L.directors_list.append({"name":parse_simple(x.text),"baike_id":x.get("data-lemmaid"),"baike_url":u'https://baike.baidu.com'+x.get("href")})
					directors.append(parse_simple(x.text))
				L.directors = ",".join(set(directors))
			else:
				L.directors = area_process(parse_simple(directors_list[0].getnext().text))

		screenwriter_list = page.xpath(u'//dt[contains(text(),"编剧")]')
		if len(screenwriter_list) > 0:
			a_tag = screenwriter_list[0].getnext().findall('a')
			if len(a_tag) > 0:
				L.screenwriter_list = []
				screenwriters = []
				for x in a_tag:
					L.screenwriter_list.append({"name":parse_simple(x.text),"baike_id":x.get("data-lemmaid"),"baike_url":u'https://baike.baidu.com'+x.get("href")})
					screenwriters.append(parse_simple(x.text))
				L.screenwriters = ",".join(set(screenwriters))
			else:
				L.screenwriters = area_process(parse_simple(screenwriter_list[0].getnext().text))

		starring_list = page.xpath(u'//dt[contains(text(),"主演")]')
		if len(starring_list) > 0:
			a_tag = starring_list[0].getnext().findall('a')
			if len(a_tag) > 0:
				L.starring_list = []
				starring = []
				for x in a_tag:
					L.starring_list.append({"name":parse_simple(x.text),"baike_id":x.get("data-lemmaid"),"baike_url":u'https://baike.baidu.com'+x.get("href")})
					starring.append(parse_simple(x.text))
				L.starring = ",".join(set(starring))
			else:
				L.starring = area_process(parse_simple(starring_list[0].getnext().text))

		alias = page.xpath(u'//dt[contains(text(),"其它译名")]')
		if len(alias) > 0:
			a_tag = alias[0].getnext().findall("a")
			if len(a_tag) > 0:
				L.alias = ",".join([parse_simple(x.text) for x in a_tag if parse_simple(x.text)])
			else:
				L.alias = parse_simple(alias[0].getnext().text)

		types = page.xpath(u'//dt[contains(text(),"类型")]')
		if len(types) > 0:
			L.type = area_process(parse_simple(types[0].getnext().text))

		duration = page.xpath(u'//dt[contains(text(),"片长")]')
		if len(duration) > 0:
			L.duration = area_process(parse_simple(duration[0].getnext().text))

		release_date = page.xpath(u'//dt[contains(text(),"上映时间")]')
		if len(release_date) > 0:
			L.release_date = area_process(parse_simple(release_date[0].getnext().text))

		release_date = page.xpath(u'//dt[contains(text(),"语言")]')
		if len(release_date) > 0:
			L.language = language_process(parse_simple(release_date[0].getnext().text))

		douban_rating = page.xpath(u'//span[contains(@class,"star-text")]')
		if len(douban_rating) > 0:
			L.douban_rating = douban_rating[0].text

		poster = page.xpath(u'//img[@alt="词条图片"]')
		if len(poster) > 0:
			L.poster = [{"url":poster[0].get("src"),"name":poster[0].get("alt")}]
			L.img_url = poster[0].get("src")

		actor_list = page.xpath(u'//ul[@class="actorList"]/li')
		if len(actor_list) > 0:
			starring = L.starring.split(',')
			L.actor_list = []
			starring_list = []
			for x in actor_list:
				_temp = {"avatar":x.find('img').get("src"),"name":x.xpath(u'//dl[@class="info"]/a')[0].text,"baike_id":x.xpath(u'//dl[@class="info"]/a')[0].get("data-lemmaid"),"baidu_url":"https://baike.baidu.com"+x.xpath(u'//dl[@class="info"]/a')[0].get("href")}
				if _temp['name'] in starring:
					starring_list.append(_temp)
				else:
					L.actor_list.append(_temp)
			if starring_list:
				L.starring_list = starring_list

		L.created_at = time.time()
		return L.__dict__
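A hypothetical call, assuming baike_parser is reachable as a plain function, requests is available, and r is the HTML of a Baidu Baike film page (the URL below is a placeholder):

import requests

html = requests.get(u'https://baike.baidu.com/item/placeholder').text
record = baike_parser(html)  # dict of Contents fields, or False when parsing fails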
Example #4
    def process(self, task=None):
        '''Process a job task.'''
        if task.get("name") is None or task.get("name") == "":
            pass

        # Testing: temporarily disabled
        r_mongo = mongo_conn.contents.find({
            'relationship': {
                '$elemMatch': {
                    'mediaId': task.get("code"),
                    "platform": "gd"
                }
            }
        })
        if r_mongo.count() > 0:
            return self.after_mongo_succ(r_mongo[0]['_id'], task)

        ct = check_title(task['name'])
        if ct:
            return self.after_search_failed(task, category=ct)

        if task.get("actor") and task.get("director"):
            # When the record is complete, match against MongoDB first; fall back to Baidu search on a miss
            print("task:--name:%s------director:%s---actor:%s---area:%s-" %
                  (task.get("name"), task.get("director"), task.get("actor"),
                   task.get("region")))
            regx = {}
            if task.get("director") and task.get("director") != "":
                regx["directors"] = re.compile(
                    u"(" +
                    "|".join(process_actor(task.get("director")).split(',')) +
                    ")", re.IGNORECASE)
            if task.get("actor") and task.get("actor") != "":
                regx['actors'] = re.compile(
                    u"(" +
                    "|".join(process_actor(task.get("actor")).split(',')) +
                    ")", re.IGNORECASE)
            # if task.get("year"):
            #     regx['year'] = re.compile(u"("+ "|".join(process_actor(task.get("year")).split(','))+")",re.IGNORECASE)
            if task.get("region"):
                regx['area'] = re.compile(
                    u"(" + "|".join(area_process(task.get("region"))) + ")",
                    re.IGNORECASE)
            regx_name = title_preprocess_seed(task.get("name"))
            regx_name = parse_regx_char(regx_name)
            regx_name = u'.*' + regx_name.replace(u'-', '.*') + ".*"
            print('---%s------%s' % (regx_name, task.get("name")))
            regx['title'] = re.compile(regx_name, re.IGNORECASE)
            contents = mongo_conn.contents.find(regx)
            '''Records missing from the metadata store go to Baidu search'''
            if contents.count() == 0:
                return self.baidu(task)
            else:
                ls = []
                lss = {}
                for mon in contents:
                    sim = lst.ratio(title_preprocess(task.get("name")),
                                    title_preprocess(mon['title']))
                    print(
                        "-sim:%s--title:%s--name:%s------director:%s---actor:%s---region:%s-"
                        % (sim, mon['title'], task.get("name"),
                           task.get("director"), task.get("actor"),
                           task.get("region")))
                    if sim < 0.80:
                        continue
                    ls.append(sim)
                    lss[sim] = mon.get("_id")
                if len(ls) == 0:
                    return self.baidu(task)
                ls.sort()
                try:
                    _id = lss[ls[-1]]
                except Exception as e:
                    _id = lss[ls[0]]
                print(
                    "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^%s"
                    % regx_name)
                return self.after_mongo_succ(_id, task)
        else:
            return self.baidu(task)
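The sort-plus-try block above only ever needs the highest similarity, so an equivalent, simpler selection would be (a sketch reusing the same ls/lss names):

best = max(ls)   # highest title similarity among the surviving candidates
_id = lss[best]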
Example #5
    def vdetail_parser(self, r):
        data = Contents()
        try:
            page = etree.HTML(r)
        except Exception as e:
            return False
        year = re.search(u'<span class="year">\((\d{4})\)</span>', r)
        if year:
            data.year = year.group(1)
        title = re.search(u'<span property="v\:itemreviewed">(.*)</span>', r)
        if title:
            data.title = title.group(1)
        screenwriter_el = page.xpath(u'//span[contains(text(),"编剧")]')
        if len(screenwriter_el) > 0:
            screenwriter_a = screenwriter_el[0].getnext()
            if screenwriter_a is not None:
                screenwriter_a = screenwriter_a.findall('a')
                data.screenwriter_list = []
                screenwriters = ''
                for x in screenwriter_a:
                    screenwriters = screenwriters + parse_simple(x.text) + ","
                    _temp = {}
                    m = re.search(r'/celebrity/(\d*)/', x.get("href") or '')
                    if m:
                        _temp["doubanid"] = m.group(1)
                    if x.get("href"):
                        _temp["douban_url"] = "https://movie.douban.com" + x.get("href")
                    _temp["name"] = parse_simple(x.text)
                    data.screenwriter_list.append(_temp)
                data.screenwriters = screenwriters.strip(',')

        directors_el = page.xpath(u'//span[contains(text(),"导演")]')
        if len(directors_el) > 0:
            directors_a = directors_el[0].getnext()
            if directors_a is not None:
                directors_a = directors_a.findall('a')
                data.directors_list = []
                directors = ""
                for x in directors_a:
                    directors = directors + parse_simple(x.text) + ","
                    _temp = {}
                    m = re.search(r'/celebrity/(\d*)/', x.get("href") or '')
                    if m:
                        _temp["doubanid"] = m.group(1)
                    if x.get("href"):
                        _temp["douban_url"] = "https://movie.douban.com" + x.get("href")
                    _temp["name"] = parse_simple(x.text)
                    data.directors_list.append(_temp)
                data.directors = directors.strip(',')

        starring_el = page.xpath(u'//span[contains(text(),"主演")]')
        if len(starring_el) > 0:
            starring_a = starring_el[0].getnext()
            if starring_a is not None:
                starring_a = starring_a.findall('a')
                data.starring_list = []
                starring = ""
                for x in starring_a:
                    starring = starring + parse_simple(x.text) + ","
                    _temp = {}
                    m = re.search(r'/celebrity/(\d*)/', x.get("href") or '')
                    if m:
                        _temp["doubanid"] = m.group(1)
                    if x.get("href"):
                        _temp["douban_url"] = "https://movie.douban.com" + x.get("href")
                    _temp["name"] = parse_simple(x.text)
                    data.starring_list.append(_temp)
                data.starring = starring.strip(',')
        type_el = page.xpath(u'//span[@property="v:genre"]')  # genres
        mvtype = []
        if len(type_el) > 0:
            for x in type_el:
                mvtype.append(parse_simple(x.text))

        tags = page.xpath(u'//div[@class="tags-body"]/a')
        _temp = []
        for x in tags:
            _temp.append(parse_simple(x.text))
        _temp = _temp + mvtype
        data.tags = ",".join(set(_temp))

        producer_country_el = page.xpath(
            u'//span[contains(text(),"制片国家/地区:")]')
        if len(producer_country_el) > 0:
            producer_country = page.xpath(
                u'//span[contains(text(),"制片国家/地区:")]/following::text()[1]')[0]
            data.producer_country = area_process(
                split_space(producer_country.replace('/', ',')))

        language_el = page.xpath(u'//span[contains(text(),"语言:")]')
        if len(language_el) > 0:
            language = page.xpath(
                u'//span[contains(text(),"语言:")]/following::text()[1]')[0]
            data.language = language_process(
                split_space(language.replace('/', ',')))

        all_episode = page.xpath(u'//span[contains(text(),"集数:")]')
        if len(all_episode) > 0:
            all_episode = page.xpath(
                u'//span[contains(text(),"集数:")]/following::text()[1]')[0]
            m = re.search(u'(\d{1,})', all_episode.replace(" ", ""))
            if m:
                data.all_episode = m.group(1)

        episode_time = page.xpath(u'//span[contains(text(),"单集片长:")]')
        if len(episode_time) > 0:
            episode = page.xpath(
                u'//span[contains(text(),"单集片长:")]/following::text()[1]')[0]
            m = re.search(u'(\d{1,})', episode.replace(" ", ""))
            if m:
                data.duration = m.group(1)

        season = page.xpath(
            u'//select[@id="season"]/option[@selected="selected"]')  # season number
        if len(season) > 0:
            data.season = season[0].text

        release_date_el = page.xpath(
            u'//span[@property="v:initialReleaseDate"]')  # first air/release dates
        if len(release_date_el) > 0:
            release_date = ""
            for x in release_date_el:
                release_date = release_date + parse_simple(x.text) + "|"
            release_date = release_date.strip('|')
            m = re.search(u'(\d{4}-\d{2}-\d{2})',
                          release_date.replace(" ", ""))
            if m:
                data.release_date = m.group(1)
            else:
                data.release_date = release_date
        duration_el = page.xpath(u'//span[@property="v:runtime"]')
        if len(duration_el) > 0:
            m = re.search(u'(\d{1,})', duration_el[0].text.replace(" ", ''))
            if m:
                data.duration = m.group(1)  # runtime

        alias_al = page.xpath(u'//span[contains(text(),"又名:")]')
        if len(alias_al) > 0:
            alias = page.xpath(
                u'//span[contains(text(),"又名:")]/following::text()[1]')[0]
            data.alias = split_space(alias.replace('/', ','))

        IMDb_el = page.xpath(u'//span[contains(text(),"IMDb链接")]')
        if len(IMDb_el) > 0:
            data.IMDb = IMDb_el[0].getnext().get("href")

        rating = re.search(u'property="v\:average">(\d*\.\d*)</strong>', r)
        if rating:
            data.douban_rating = rating.group(1)

        rating_sum = page.xpath(u'//span[@property="v:votes"]')
        if len(rating_sum) > 0:
            data.douban_rating_sum = rating_sum[0].text

        summary_all = page.xpath(u'//span[@class="all hidden"]')
        summary = page.xpath(u'//span[@property="v:summary"]')
        if len(summary_all) > 0:
            data.summary = ''.join(
                page.xpath(u'//span[@class="all hidden"]/text()'))
            data.summary = parse_simple(data.summary)
        elif len(summary) > 0:
            data.summary = ''.join(
                page.xpath(u'//span[@property="v:summary"]/text()'))
            data.summary = parse_simple(data.summary)

        img_url = page.xpath(u'//img[@title="点击看更多海报"]')
        nbgnbg = page.xpath(u'//a[@title="点击看大图" and @class="nbgnbg"]')
        if len(img_url) > 0:
            data.img_url = page.xpath(u'//img[@title="点击看更多海报"]')[0].get("src")
        elif len(nbgnbg) > 0:
            data.img_url = nbgnbg[0].get("href")

        # all_episode was parsed as a string; compare it numerically
        episode_count = int(data.all_episode or 0)
        if episode_count > 1 and (u"动漫" in data.tags or u"动画" in data.tags):
            data.category = u"动漫"
        elif episode_count > 1 and (u"综艺" in data.tags
                                    or u'真人秀' in data.tags):
            data.category = u'综艺'
        elif episode_count > 1:
            data.category = u"电视剧"
        elif u"动漫" in data.tags or u"动画" in data.tags:
            data.category = u'动漫'
        elif u"短片" in data.tags:
            data.category = u'短片'
        else:
            data.category = u'电影'

        m = re.search(u"SUBJECT_ID: *'(\d*)'", r)
        if m:
            data.doubanid = m.group(1)

        print(
            "oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo"
        )
        print(data.__dict__)
        print(
            "oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo"
        )
        return data.__dict__
Example #6
    def process(self, task=None):
        '''Process a job task.'''
        if task.get("contentName") is None or task.get("contentName") == "":
            pass

        # Testing: temporarily disabled
        r_mongo = mongo_conn.contents.find({
            'relationship': {
                '$elemMatch': {
                    'mediaId': task.get("mediaId"),
                    "platform": task.get("platform")
                }
            }
        })
        if r_mongo.count() > 0:
            return self.after_mongo_succ(r_mongo[0]["_id"], task)
        """过滤新闻等短视频"""
        ct = check_title(task['contentName'])
        if ct:
            return self.after_search_failed(task, category=ct)

        if task.get("actor") and task.get("director"):
            # When the record is complete, match against MongoDB first; fall back to Baidu search on a miss
            print(
                "task:--contentName:%s---contentType:%s---director:%s---actor:%s---area:%s-"
                % (task.get("contentName"), task.get("contentType"),
                   task.get("director"), task.get("actor"), task.get("area")))
            regx = {}
            if task.get("director") and task.get("director") != "":
                regx["directors"] = re.compile(
                    u"(" +
                    "|".join(process_actor(task.get("director")).split(',')) +
                    ")", re.IGNORECASE)  # 匹配至少有一个directors相交的
            if task.get("actor") and task.get("actor") != "":
                regx['actors'] = re.compile(
                    u"(" +
                    "|".join(process_actor(task.get("actor")).split(',')) +
                    ")", re.IGNORECASE)  # 匹配至少有一个starring相交的
            if task.get("year"):
                regx['year'] = re.compile(
                    u"(" +
                    "|".join(process_actor(task.get("year")).split(',')) + ")",
                    re.IGNORECASE)  # match at least one year
            if task.get("area"):
                regx['area'] = re.compile(
                    u"(" + "|".join(area_process(task.get("area"))) + ")",
                    re.IGNORECASE)  # match at least one area
            regx_name = title_preprocess_seed(task.get("contentName"))
            regx_name = parse_regx_char(regx_name)
            regx_name = u'.*' + regx_name.replace(u'-', '.*') + ".*"
            regx_name = re.compile(regx_name, re.IGNORECASE)
            regx['title'] = regx_name  # match titles against the generated pattern
            contents = mongo_conn.contents.find(regx)
            '''Records missing from the metadata store go to Baidu search'''
            if contents.count() == 0:
                return self.baidu(task)
            else:
                # mongo_rs = mongo_conn.contents.find({"title": regx_name})
                ls = []
                lss = {}
                # 100: regular VOD, 110: series parent, 110: series child
                for mon in contents:
                    sim = lst.ratio(title_preprocess(task.get("contentName")),
                                    title_preprocess(mon['title']))
                    print(
                        "-sim:%s--title:%s--contentName:%s---contentType:%s---director:%s---actor:%s---area:%s-"
                        % (sim, mon['title'], task.get("contentName"),
                           task.get("contentType"), task.get("director"),
                           task.get("actor"), task.get("area")))
                    if sim < 0.80:
                        continue
                    ls.append(sim)
                    lss[sim] = mon.get("_id")
                if len(ls) == 0:
                    return self.baidu(task)
                ls.sort()
                try:
                    _id = lss[ls[-1]]
                except Exception as e:
                    _id = lss[ls[0]]
                print(
                    "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^%s"
                    % regx_name)
                return self.after_mongo_succ(_id, task)
        else:
            return self.baidu(task)
Example #7
    def vdetail_parser(self, r):
        try:
            page = etree.HTML(r)
        except Exception as e:
            return False
        L = Contents()
        title = page.xpath(u'//a[@_stat="info:title"]')
        m = re.search(u'\{"id":"(\w*\d*)"', r)
        if m:
            L.qq_id = m.group(1)
        m = re.search(u'&vid=(\w*\d*)&', r)
        if m:
            L.qq_vid = m.group(1)
        if len(title) > 0:
            L.title = title[0].text

        category = page.xpath(u'//span[@class="type"]')
        if len(category) > 0:
            L.category = category[0].text

        area = page.xpath(u'.//span[contains(text(),"地 区:")]')
        if len(area) > 0:
            L.area = area_process(area[0].getnext().text)

        foreign_title = page.xpath(u'//span[@class="title_en"]')
        if len(foreign_title) > 0:
            L.foreign_title = foreign_title[0].text

        qq_play = page.xpath(u'.//a[@_stat="info:playbtn"]')
        if len(qq_play) > 0:
            L.qq_play = qq_play[0].get("href")

        language = page.xpath(u'//span[contains(text(),"语 言:")]')
        if len(language) > 0:
            L.language = language_process(language[0].getnext().text)

        year = page.xpath(u'.//span[contains(text(),"上映时间")]')
        if len(year) > 0 and year[0].getnext().text:
            m = re.search(u'(\d{4})', year[0].getnext().text)
            if m:
                L.year = m.group(1)

        all_episode = page.xpath(u'//span[contains(text(),"总集数:")]')
        if len(all_episode) > 0:
            L.all_episode = all_episode[0].getnext().text

        release_date = page.xpath(u'//span[contains(text(),"出品时间:")]')
        if len(release_date) > 0:
            L.release_date = release_date[0].getnext().text
            if L.release_date and L.year is None:
                try:
                    m = re.search(u'(\d{4})', L.release_date)
                    if m:
                        L.year = m.group(1)
                except Exception as e:
                    pass
        year = page.xpath(u'.//span[contains(text(),"首播时间")]')
        if len(year) > 0 and L.year is None:
            m = re.search(u'(\d{4})', year[0].getnext().text)
            if m:
                L.year = m.group(1)

        alias = page.xpath(u'//span[contains(text(),"别 名")]')
        if len(alias) > 0:
            L.alias = alias[0].getnext().text

        tags = page.xpath(u'//a[@class="tag"]')
        if len(tags) > 0:
            _temp = [x.text for x in tags]
            L.tags = ",".join(set(_temp))

        summary = page.xpath(
            u'//span[@class="desc_txt"]/span[@class="txt _desc_txt_lineHight"]'
        )
        if len(summary) > 0:
            L.summary = parse_simple(summary[0].text)

        qq_rating = page.xpath(u'//div[@class="score_v"]/span[@class="score"]')
        if len(qq_rating) > 0:
            L.qq_rating = qq_rating[0].text

        douban_rating = page.xpath(
            u'//a[@class="score_db"]/span[@class="score"]')
        if len(douban_rating) > 0:
            L.douban_rating = douban_rating[0].text

        poster = page.xpath(u'//img[@_stat="info:poster"]')
        if len(poster) > 0:
            L.poster = []
            if poster[0].get("src"):
                L.poster.append({
                    "url": self.parse_imgurl(poster[0].get("src")),
                    "name": poster[0].get("alt")
                })
                L.img_url = self.parse_imgurl(poster[0].get("src"))

        # directors and cast
        actor_list = page.xpath(u'//ul[contains(@class,"actor_list")]/li')
        starring_list = []
        starring = []
        directors_list = []
        directors = []
        if len(actor_list) > 0:
            _temp = []
            for actor in actor_list:
                _dic = {}
                actor_avatar = actor.find(u'a')
                if actor_avatar is not None:
                    if actor_avatar.find('img') is not None:
                        _dic["avatar"] = self.parse_imgurl(
                            actor_avatar.find('img').get("src"))
                    _dic["qq_id"] = actor.get("data-id")
                    if actor.find("span") is not None:
                        _dic["name"] = actor.find("span").text
                    _dic["qq_home_page"] = actor_avatar.get("href")
                    actor_detail = actor.xpath(
                        u'.//div[@class="actor_detail"]')
                    if actor_detail:
                        # occupation
                        occupation = actor_detail[0].xpath(
                            u'.//span[contains(text(),"职业")]')
                        if occupation:
                            _dic['occupation'] = occupation[0].getnext().text

                        # area
                        area = actor_detail[0].xpath(
                            u'.//span[contains(text(),"地区")]')
                        if len(area) > 0:
                            _dic['area'] = area[0].getnext().text

                    # bio
                    intro = actor.xpath(u'.//span[@itemprop="description"]')
                    if intro:
                        _dic["intro"] = intro[0].text
                    # director
                    if actor_avatar.xpath(u'.//span[@class="director"]'):
                        directors_list.append(_dic)
                        directors.append(_dic['name'])
                    else:
                        # actor
                        starring_list.append(_dic)
                        starring.append(_dic['name'])
        if starring_list:
            L.starring = ','.join(starring)
            L.starring_list = starring_list
        if directors_list:
            L.directors = ','.join(directors)
            L.directors_list = directors_list

        if L.title is None:
            return False
        L.created_at = time.time()
        return L.__dict__
Example #8
def sync(args):
    print("haha")
    offset = args["offset"]
    size = args['size']
    mongo_contents = mongo_conn.contents
    mongo_posters = mongo_conn.posters
    succ = 0
    total = 0
    this_limit = 3000
    this_offset = offset
    step = (size - offset) // this_limit  # number of batches
    print('this_offset:', this_offset, size, size // this_limit)
    for x in xrange(1, step + 1):
        print("offset:", offset + x * this_limit, this_limit)
        db_session = scoped_session(DBSession)
        print(offset + (x - 1) * this_limit, x * this_limit + offset)
        # fetch one batch of this_limit rows (matching the offsets printed above)
        contents = db_session.query(Vod).filter(
            Vod.d_downfrom != "1").order_by(asc(Vod.id)).offset(
                offset + (x - 1) * this_limit).limit(this_limit).all()
        for item in contents:
            if not item.d_name or not item.d_pic:
                item.d_downfrom = '1'
                db_session.add(item)
                db_session.commit()
                continue
            total += 1
            """没有地区或者导演主演都没有就pass"""
            if not item.d_area or (not item.d_directed
                                   and not item.d_starring):
                item.d_downfrom = '1'
                db_session.add(item)
                db_session.commit()
                continue
            regx_name = title_preprocess_seed(item.d_name)
            regx_name = parse_regx_char(regx_name)
            regx_name = u'.*' + regx_name.replace(u'-', '.*') + ".*"
            print("-----%s-----%s----%s--%s----%s---" %
                  (item.d_name, regx_name, item.d_area, item.d_directed,
                   item.d_starring))
            regx = {}
            # regx["title"] = re.compile(regx_name, re.IGNORECASE)
            regx["title"] = regx_name
            if item.d_directed and item.d_directed != "":
                # match records sharing at least one director
                regx["directors"] = u"(" + "|".join(
                    process_actor(item.d_directed).split(',')) + ")"
            if item.d_starring and item.d_starring != "":
                # match records sharing at least one starring actor
                regx['actors'] = u"(" + "|".join(
                    process_actor(item.d_starring).split(',')) + ")"
            if item.d_area:
                regx['area'] = u"(" + "|".join(area_process(item.d_area)) + ")"
            print(json.dumps(regx))
            mongo_rs = mongo_contents.find(regx)
            ls = []
            lss = {}
            # 100: regular VOD, 110: series parent, 110: series child
            print("mongo_rs", mongo_rs.count())
            for mon in mongo_rs:
                sim = lst.ratio(title_preprocess(item.d_name),
                                title_preprocess(mon['title']))
                print("-sim:----%s----%s---%s----regx_name:%s----" %
                      (sim, item.d_name, mon['title'], regx_name))
                if sim < 0.80:
                    continue
                ls.append(sim)
                lss[sim] = mon.get("_id")
                if sim == 1.0:
                    break
            if len(ls) == 0:
                item.d_downfrom = '1'
                db_session.add(item)
                db_session.commit()
                continue
            ls.sort()
            try:
                _id = lss[ls[-1]]
            except Exception as e:
                _id = lss[ls[0]]
            _id = mongo_posters.insert(
                {
                    "url": item.d_pic,
                    "content_id": str(_id)
                }, check_keys=False)
            print(_id)
            # print("---%s--------%s----"%(item.d_name,c[0]['title']))
            print("-------done--------")
            # return True
            item.d_downfrom = '1'
            db_session.add(item)
            db_session.commit()
        db_session.close()
    #db_session.commit()
    db_session.close()
    print(size, succ)