Пример #1
0
    def firstSpider(self, seed):
        name = ""
        poster = ""
        star = ""
        director = ""
        ctype = ""
        programLanguage = ""
        intro = ""
        mainId = ""
        area = ""
        shootYear = ""

        doc = spiderTool.getHtmlBody(seed)
        tvId = re.search(r'data-player-tvid="([^"]+?)"', doc)
        videoId = re.search(r'data-player-videoid="([^"]+?)"', doc)
        if tvId and videoId:
            newUrl = 'http://cache.video.qiyi.com/vi/%s/%s/' % (tvId.group(1), videoId.group(1))
            doc = spiderTool.getHtmlBody(newUrl)
        else:
            return

        try:
            json_data = json.loads(doc)
            name = json_data["shortTitle"]
            poster = json_data["apic"]
            star = json_data["ma"].replace("|", ",")
            director = json_data["d"].replace("|", ",")
            ctype = json_data["tg"].replace(" ", ",")
            programLanguage = json_data["ar"]
            intro = json_data["info"]
        except:
            return



        self.program["name"] = spiderTool.changeName(name)
        self.program['poster'] = spiderTool.listStringToJson('url',poster)
        self.program['star'] = spiderTool.listStringToJson('name',star)
        self.program['director'] = spiderTool.listStringToJson('name',director)
        self.program['ctype'] = spiderTool.listStringToJson('name',ctype)
        self.program['programLanguage'] = programLanguage
        self.program['intro'] = intro
        if re.match(r'http://www\.iqiyi\.com/v_(\w+)\.html', seed):
            mainId = re.match(r'http://www\.iqiyi\.com/v_(\w+)\.html', seed).group(1)
        elif re.match(r'http://www\.iqiyi\.com/dianying/(\d{8})/', seed):
            mainId = re.match(r'http://www\.iqiyi\.com/dianying/(\d{8})/', seed).group(1)
        self.program["mainId"] = mainId
        self.program_sub = copy.deepcopy(PROGRAM_SUB)
        self.program_sub['setNumber'] = 1
        self.program_sub['setName'] = name
        self.program_sub['webUrl'] = seed
        self.program['programSub'].append(self.program_sub)
Пример #2
0
    def firstSpider(self, seed):
        name = ""
        poster = ""
        star = ""
        director = ""
        ctype = ""
        programLanguage = ""
        intro = ""
        mainId = ""
        area = ""

        doc = spiderTool.getHtmlBody(seed)
        soup = BeautifulSoup(doc, from_encoding="utf8")

        poster_p = soup.find("img", attrs={'width': '195'})
        poster_p_1 = soup.find(
            'div', attrs={'class': 'album-picCon album-picCon-onePic'})
        poster_p_2 = soup.find('ul', attrs={'class': 'album-imgs'})
        if poster_p is not None:
            poster = poster_p.get("src")
            name = poster_p.get("alt")
        elif poster_p_1 is not None:
            img_tag = poster_p_1.find("img")
            if img_tag is not None:
                poster = img_tag.get("src")
                name = img_tag.get("alt")
        elif poster_p_2 is not None:
            poster = poster_p_2.find('img').get('src')

        if name == "":
            name_p = soup.find('div',
                               attrs={'class': 'album-playArea clearfix'})
            if name_p is not None:
                name = name_p.find('h1').get_text()

        detail = soup.find("div", attrs={"class": "result_detail-minH"})
        detail_1 = soup.find("div", attrs={"class": "msg-hd-lt fl"})
        if detail is not None:
            for div_p in detail.find_all(
                    "div", attrs={"class": "topic_item clearfix"}):
                for each in div_p.find_all("div"):
                    a_list = []
                    for a_tag in each.find_all('a'):
                        a_list.append(a_tag.get_text())
                    a_str = ",".join(a_list)
                    if re.search("主持人:".decode("utf8"), each.get_text()):
                        star = a_str
                    if re.search("类型:".decode("utf8"), each.get_text()):
                        ctype = a_str
                    if re.search("语言:".decode("utf8"), each.get_text()):
                        programLanguage = a_str
                    if re.search("地区:".decode("utf8"), each.get_text()):
                        area = a_str
        elif detail_1:
            for each in detail_1.find_all("p"):
                a_list = []
                for a_tag in each.find_all('a'):
                    a_list.append(a_tag.get_text())
                a_str = ",".join(a_list)
                if re.search("主持人:".decode("utf8"), each.get_text()):
                    star = a_str
                if re.search("类型:".decode("utf8"), each.get_text()):
                    ctype = a_str
                if re.search("语言:".decode("utf8"), each.get_text()):
                    programLanguage = a_str
                if re.search("地区:".decode("utf8"), each.get_text()):
                    area = a_str

        content_p = soup.find('span',
                              attrs={
                                  "class": "showMoreText",
                                  "data-moreorless": "moreinfo",
                                  "style": "display: none;"
                              })
        content_p_1 = soup.find("span", attrs={"class": "bigPic-b-jtxt"})
        if content_p is not None:
            if content_p.find("span"):
                content_p = content_p.find("span")
            if re.search("简介:".decode("utf8"), content_p.get_text()):
                intro = content_p.get_text().split(
                    "简介:".decode("utf8"))[1].strip()
        elif content_p_1 is not None:
            intro = content_p_1.get_text()

        self.program["name"] = spiderTool.changeName(name)
        self.program['poster'] = spiderTool.listStringToJson('url', poster)
        self.program['star'] = spiderTool.listStringToJson('name', star)
        self.program['director'] = spiderTool.listStringToJson(
            'name', director)
        self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
        self.program['programLanguage'] = programLanguage
        self.program['area'] = spiderTool.listStringToJson('name', area)
        self.program['intro'] = intro

        seed_P = re.search(
            r'sourceId:\s*(?P<sourceId>\d+),\s+cid:\s*(?P<cid>\d+)', doc)
        if seed_P:
            sourceId = seed_P.group('sourceId')
            cid = seed_P.group('cid')
            self.program['mainId'] = sourceId
            seed_sub = 'http://cache.video.iqiyi.com/jp/sdvlst/%s/%s/' % (
                cid, sourceId)
            self.secondSpider(seed_sub)
Пример #3
0
    def firstSpider(self, seed):
        name = ""
        pcUrl = ""
        poster = ""
        star = ""
        director = ""
        ctype = ""
        programLanguage = ""
        intro = ""
        point = ""
        mainId = ""
        area = ""

        if re.search(r'http://v.pptv.com/show/\w+\.html', seed):
            seed_post = seed.replace("/show/", "/page/")
        else:
            seed_post = seed

        doc = spiderTool.getHtmlBody(seed_post)

        soup = BeautifulSoup(doc, from_encoding="utf8")
        poster_P = soup.find("a", attrs={"class": "cover-a"})
        poster_P1 = soup.find("div", attrs={"class": "bd cf"})
        if poster_P is not None:
            img_tag = poster_P.find("img")
            if img_tag is not None:
                img = img_tag.get('data-src2')
                if img is not None:
                    poster = img
        elif poster_P1 is not None:
            img_tag = poster_P1.find("img")
            if img_tag is not None:
                img = img_tag.get('src')
                if img is not None:
                    poster = img

        if re.search(r'http://v.pptv.com/page/\w+\.html', seed):
            seed = seed.replace("/page/", "/show/")
        doc = spiderTool.getHtmlBody(seed)

        pid_P = re.search(r'"pid":\s*(?P<pid>\d+),'.decode('utf8'), doc)
        if pid_P:
            pid = pid_P.group('pid')
            seed_sub = 'http://v.pptv.com/show/videoList?&cb=videoList&pid=' + pid
            mainId = pid
        else:
            return

        id_P = re.search(r'"id":\s*(?P<id>\d+),'.decode('utf8'), doc)
        if id_P:
            id = id_P.group('id')
            seed = 'http://svcdn.pptv.com/show/v1/meta.json?cid=%s&sid=%s&psize=50' % (
                id, pid)
        else:
            return

        doc = spiderTool.getHtmlBody(seed)
        json_data = json.loads(doc)
        if json_data.has_key("data"):
            data = json_data["data"]
        else:
            return

        if data.has_key("set"):
            name_P = data["set"]
            if type(name_P) is types.ListType:
                if len(name_P) == 2:
                    name = name_P[0]
                    pcUrl = name_P[1]

        def getListName(dict_P):
            dict_list = []
            if type(dict_P) is types.ListType:
                for each in dict_P:
                    if type(each) is not types.DictionaryType:
                        continue
                    name = ""
                    if each.has_key("name"):
                        name = each["name"]
                    elif each.has_key("text"):
                        name = each["text"]
                    if re.search(r'未知'.decode('utf8'), name) or name == '':
                        continue
                    dict_list.append(name)
            return ",".join(dict_list)

        if data.has_key("directors"):
            star = getListName(data["directors"])

        if data.has_key("ct"):
            ctype = getListName(data["ct"])

        if data.has_key("area"):
            area = getListName(data["area"])

        if data.has_key("score"):
            point = data["score"]
        try:
            point = float(point)
        except:
            point = 0.0

        if data.has_key("summary"):
            intro = data["summary"]

        self.program["name"] = spiderTool.changeName(name)
        self.program['pcUrl'] = pcUrl
        self.program['poster'] = spiderTool.listStringToJson('url', poster)
        self.program['star'] = spiderTool.listStringToJson('name', star)
        self.program['director'] = spiderTool.listStringToJson(
            'name', director)
        self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
        self.program['programLanguage'] = programLanguage
        self.program['area'] = spiderTool.listStringToJson('name', area)
        self.program['intro'] = intro
        self.program['point'] = point
        self.program['mainId'] = mainId

        self.secondSpider(seed_sub)
Пример #4
0
    def firstSpider(self, seed):
        name = ""
        poster = ""
        point = 0.0
        shootYear = ""
        star = ""
        director = ""
        ctype = ""
        programLanguage = ""
        intro = ""
        mainId = ""
        area = ""

        pid = ""
        if re.search(r'http://www\.le\.com/jilu/(?P<pid>\d+)\.html', seed):
            pid = re.search(r'http://www\.le\.com/jilu/(?P<pid>\d+)\.html',
                            seed).group('pid')
        elif re.search(r'http://www\.letv\.com/jilu/(?P<pid>\d+)\.html', seed):
            pid = re.search(r'http://www\.letv\.com/jilu/(?P<pid>\d+)\.html',
                            seed).group('pid')
        if pid != '' or pid != '0':
            seed = 'http://static.app.m.letv.com/android/mod/mob/ctl/album/act/detail/id/%s/pcode/010110014/version/5.2.3.mindex.html' % (
                pid)
        else:
            return
        mainId = pid
        doc = spiderTool.getHtmlBody(seed)
        try:
            json_data = json.loads(doc)
        except:
            json_data = {}
        if json_data.has_key("body"):
            if type(json_data["body"]) is types.DictionaryType:
                json_data = json_data["body"]
            else:
                return

        if json_data.has_key("nameCn"):
            name = json_data["nameCn"]

        if json_data.has_key("nameCn"):
            name = json_data["nameCn"]
        if json_data.has_key("picCollections"):
            poster_dict = json_data["picCollections"]
            if type(poster_dict) is types.DictionaryType:
                if poster_dict.has_key('400*300'):
                    poster = poster_dict['400*300']
                if poster == "":
                    for each in poster_dict:
                        if poster_dict[each] != "":
                            poster = poster_dict[each]
        if self.images.has_key(pid):
            poster = self.images[pid]

        if json_data.has_key("score"):
            try:
                point = float(json_data["score"])
            except:
                point = 0.0

        if json_data.has_key("releaseDate"):
            shootYear_P = json_data["releaseDate"]
            if re.search(r'(\d{4})-\d{2}-\d{2}', shootYear_P):
                shootYear = re.search(r'(\d{4})-\d{2}-\d{2}',
                                      shootYear_P).group(1)
            elif re.search(r'^\d{4}$', shootYear_P):
                shootYear = shootYear_P

        if json_data.has_key("directory"):
            director_P = json_data["directory"]
            if type(director_P) is types.UnicodeType:
                director = director_P.strip().replace(" ", ",")

        if json_data.has_key("starring"):
            star_P = json_data["starring"]
            if type(star_P) is types.UnicodeType:
                star = star_P.strip().replace(" ", ",")

        if json_data.has_key("area"):
            area_P = json_data["area"]
            if type(area_P) is types.UnicodeType:
                area = area_P.strip().replace(" ", ",")

        if json_data.has_key("subCategory"):
            ctype_P = json_data["subCategory"]
            if type(ctype_P) is types.UnicodeType:
                ctype = ctype_P.strip().replace(" ", ",")

        if json_data.has_key('description'):
            intro = json_data['description']

        if json_data.has_key('language'):
            programLanguage = json_data['language']

        self.program["name"] = spiderTool.changeName(name)
        self.program['poster'] = spiderTool.listStringToJson('url', poster)
        self.program['point'] = point
        self.program['shootYear'] = shootYear
        self.program['star'] = spiderTool.listStringToJson('name', star)
        self.program['director'] = spiderTool.listStringToJson(
            'name', director)
        self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
        self.program['programLanguage'] = programLanguage
        self.program['area'] = spiderTool.listStringToJson('name', area)
        self.program['intro'] = intro
        self.program['mainId'] = mainId

        seed_sub = "http://static.app.m.letv.com/android/mod/mob/ctl/videolist/act/detail/id/%s/vid/25520328/b/1/s/60/o/-1/m/0/pcode/010110014/version/5.2.3.mindex.html" % (
            pid)
        self.secondSpider(seed_sub)
Пример #5
0
    def firstSpider(self, seed):
        name = ""
        poster = ""
        star = ""
        director = ""
        ctype = ""
        programLanguage = ""
        intro = ""
        mainId = ""
        area = ""

        doc = spiderTool.getHtmlBody(seed)
        soup = BeautifulSoup(doc, from_encoding="utf8")

        poster_p = soup.find("ul", attrs={'class': 'focus_img_list'})
        poster_p_1 = soup.find("div", attrs={"class": "result_pic pr"})
        if poster_p is not None:
            poster_p_sub = poster_p.find('li').get('style')
            if re.search(r'\((.*?)\)'.decode('utf8'), poster_p_sub):
                poster = re.search(r'\((.*?)\)'.decode('utf8'), poster_p_sub).group(1)
        elif poster_p_1 is not None:
            img_tag = poster_p_1.find("img")
            if img_tag is not None:
                poster = img_tag.get("src")
                name = img_tag.get("alt")

        name_p = soup.find('a', attrs={'class': 'white'})
        if name_p is not None:
            name = name_p.get_text()

        detail = soup.find("div", attrs={"class": "result_detail-minH"})
        detail_1 = soup.find("div", attrs={"class": "msg-hd-lt fl"})
        if detail is not None:
            for div_p in detail.find_all("div", attrs={"class": "topic_item clearfix"}):
                for each in div_p.find_all("div"):
                    a_list = []
                    for a_tag in each.find_all('a'):
                        a_list.append(a_tag.get_text())
                    a_str = ",".join(a_list)
                    if re.search("主演:".decode("utf8"),each.get_text()):
                        star = a_str
                    if re.search("导演:".decode("utf8"),each.get_text()):
                        director = a_str
                    if re.search("类型:".decode("utf8"),each.get_text()):
                        ctype = a_str
                    if re.search("语言:".decode("utf8"),each.get_text()):
                        programLanguage = a_str
                    if re.search("地区:".decode("utf8"),each.get_text()):
                        area = a_str
        elif detail_1 is not None:
            for p_tag in detail_1.find_all("p"):
                a_list = []
                for a_tag in p_tag.find_all('a'):
                    a_list.append(a_tag.get_text())
                a_str = ','.join(a_list)
                if re.search("导演:".decode("utf8"),p_tag.get_text()):
                    director = a_str
                if re.search("类型:".decode("utf8"),p_tag.get_text()):
                    ctype = a_str
                if re.search("语言:".decode("utf8"),p_tag.get_text()):
                    programLanguage = a_str
                if re.search("地区:".decode("utf8"),p_tag.get_text()):
                    area = a_str
                if re.search("主演:".decode("utf8"),p_tag.get_text()):
                    star = a_str

        content_p = soup.find('span',  attrs={"class": "showMoreText", "data-moreorless":"moreinfo", "style":"display: none;"})
        content_p_1 = soup.find('div',  attrs={"data-moreorless":"moreinfo"})
        if content_p is not None:
            if content_p.find("span"):
                content_p = content_p.find("span")
            if re.search("简介:".decode("utf8"),content_p.get_text()):
                intro = content_p.get_text().split("简介:".decode("utf8"))[1].strip()
        elif content_p_1 is not None:
            if re.search("简介:".decode("utf8"),content_p_1.get_text()):
                intro = content_p_1.get_text().split("简介:".decode("utf8"))[1].strip()


        self.program["name"] = spiderTool.changeName(name)
        self.program['poster'] = spiderTool.listStringToJson('url',poster)
        self.program['star'] = spiderTool.listStringToJson('name',star)
        self.program['director'] = spiderTool.listStringToJson('name',director)
        self.program['ctype'] = spiderTool.listStringToJson('name',ctype)
        self.program['programLanguage'] = programLanguage
        self.program['area'] = spiderTool.listStringToJson('name', area)
        self.program['intro'] = intro

        seed_P = re.search(r'albumId:\s*(?P<albumId>\d+)[^\\]*?cid:\s*(?P<cid>\d+)'.decode('utf8'), doc)
        if seed_P:
            albumId = seed_P.group('albumId')
            cid = seed_P.group('cid')
            self.program['mainId'] = cid + "_" +albumId
            seed_sub = 'http://cache.video.iqiyi.com/jp/avlist/%s/' %(albumId)
            allNum = 0
            doc = spiderTool.getHtmlBody(seed_sub)
            try:
                json_data = json.loads(doc.split('=')[1])
                data = json_data["data"]["vlist"]
                allNum = json_data["data"]["allNum"]
            except:
                data = []
            for i in range(1,(int(allNum)/50 + 2)):
                seed_sub = 'http://cache.video.iqiyi.com/jp/avlist/%s/%s/50/' %(albumId, str(i))
                self.secondSpider(seed_sub)
Пример #6
0
    def firstSpider(self, seed):
        name = ""
        poster = ""
        point = 0.0
        shootYear = ""
        star = ""
        director = ""
        ctype = ""
        programLanguage = ""
        intro = ""
        mainId = ""
        area = ""

        pid = ""
        pid_P = re.search(
            r'http://www.mgtv.com/\w/\d+/(?P<mainId>\d+)/\w/(?P<pid>\d+)\.html',
            seed)
        if pid_P:
            pid = pid_P.group('pid')
            mainId = pid_P.group('mainId')

        seedJson = ""
        if pid != '':
            seedJson = 'http://m.api.hunantv.com/video/getbyid?videoId=%s' % (
                pid)
        else:
            return
        if mainId == "":
            mainId = pid

        doc = spiderTool.getHtmlBody(seedJson)
        if re.search(r'<html>', doc):
            doc = spiderTool.getHtmlBody(seedJson)
        json_data = json.loads(doc.strip())

        if json_data.has_key("data"):
            if type(json_data["data"]) is types.DictionaryType:
                json_data = json_data["data"]
                if json_data.has_key("detail"):
                    json_data = json_data["detail"]
                else:
                    return
            else:
                return

        if json_data.has_key("collectionName"):
            name = json_data["collectionName"]

        if json_data.has_key("image"):
            poster = json_data["image"]

        if json_data.has_key("year"):
            shootYear = json_data["year"]

        if json_data.has_key("director"):
            director_P = json_data["director"]
            if type(director_P) is types.UnicodeType:
                director = director_P.strip().replace(" / ", ",")

        if json_data.has_key("player"):
            star_P = json_data["player"]
            if type(star_P) is types.UnicodeType:
                star = star_P.strip().replace(" / ", ",")

        if json_data.has_key("area"):
            area_P = json_data["area"]
            if type(area_P) is types.UnicodeType:
                area = area_P.strip().replace(" ", ",")

        if json_data.has_key('desc'):
            intro = json_data['desc']

        self.program["name"] = spiderTool.changeName(name)
        self.program['poster'] = spiderTool.listStringToJson('url', poster)
        self.program['point'] = point
        self.program['shootYear'] = shootYear
        self.program['star'] = spiderTool.listStringToJson('name', star)
        self.program['director'] = spiderTool.listStringToJson(
            'name', director)
        self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
        self.program['programLanguage'] = programLanguage
        self.program['area'] = spiderTool.listStringToJson('name', area)
        self.program['intro'] = intro
        self.program['mainId'] = mainId

        if json_data.has_key('typeId'):
            typeId = json_data['typeId']
        if json_data.has_key('collectionId'):
            collectionId = json_data['collectionId']

        pageNum = 20
        if json_data.has_key("totalvideocount"):
            pageNum = json_data["totalvideocount"]
        seed_sub = "http://m.api.hunantv.com/video/getListV2?videoId=%s&pageId=0&pageNum=%s" % (
            pid, pageNum)
        try:
            self.secondSpider(seed_sub, seed, pid)
        except:
            print "spider error: " + seed
Пример #7
0
    def firstSpider(self, seed):
        point = 0.0
        poster = ""
        name = ""
        shootYear = ""
        alias = ""
        area = ""
        star = ""
        director = ""
        ctype = ""
        playTimes = 0
        intro = ""
        mainId = ""

        doc = spiderTool.getHtmlBody(seed)
        soup = BeautifulSoup(doc, from_encoding="utf8")
        seed_Re = re.search(r'http://www\.youku\.com/show_page/id', seed)
        if not seed_Re:
            seed_P = soup.find('h1', attrs={'class': 'title'})
            if seed_P is not None:
                seed_aTag = seed_P.find('a')
                if seed_aTag is not None:
                    seed = seed_aTag.get('href')
                    doc = spiderTool.getHtmlBody(seed)
                    soup = BeautifulSoup(doc, from_encoding="utf8")

        point_P = soup.find('span', attrs={'class': 'rating'})
        if point_P is not None:
            if point_P.find('em', attrs={'class': 'num'}) is not None:
                point = point_P.find('em', attrs={'class': 'num'}).get_text()

        poster_p = soup.find("li", attrs={'class': 'thumb'})
        if poster_p is not None:
            poster = poster_p.find('img').get("src")

        name_P = soup.find('h1', attrs={'class': 'title'})
        if name_P is not None:
            name_span = name_P.find('span', attrs={'class': 'name'})
            if name_span is not None:
                name = name_span.get_text()
            if re.search(r'\d{4}', name):
                shootYear = re.search(r'\d+', name).group()

        area_P = soup.find('span', attrs={'class': 'area'})
        if area_P is not None:
            area_list = []
            for each in area_P.find_all("a"):
                area_list.append(each.get_text())
            area = ",".join(area_list)

        ctype_all = soup.find_all("span", attrs={"class": "type"})
        for ctype_P in ctype_all:
            if re.search(r'类型:'.decode('utf8'), ctype_P.get_text()):
                ctype_list = []
                for each in ctype_P.find_all('a'):
                    ctype_list.append(each.get_text())
                ctype = ",".join(ctype_list)

        star_P = soup.find("span", attrs={"class": "actor"})
        if star_P is not None:
            star_list = []
            for each in star_P.find_all('a'):
                star_list.append(each.get_text())
            star = ",".join(star_list)

        director_P = soup.find("span", attrs={"class": "director"})
        if director_P is not None:
            director_list = []
            for each in director_P.find_all('a'):
                director_list.append(each.get_text())
            director = ",".join(director_list)

        playTimes_P = soup.find('span', attrs={'class': 'play'})
        if playTimes_P is not None:
            playTimesStr = playTimes_P.get_text()
            playTimes_list = re.findall(r'(\d+)', playTimesStr)
            playTimes = long(''.join(playTimes_list))

        content_p = soup.find('span',
                              attrs={
                                  "class": "short",
                                  'id': 'show_info_short'
                              })
        if content_p is not None:
            intro = content_p.get_text().strip()

        if re.match(r'http://www\.youku\.com/show_page/(id_(\w+))\.html',
                    seed):
            mainId = re.match(
                r'http://www\.youku\.com/show_page/id_(\w+)\.html',
                seed).group(1)

        self.program["name"] = spiderTool.changeName(name)
        self.program["alias"] = spiderTool.changeName(alias)
        self.program["point"] = float(point)
        self.program['poster'] = spiderTool.listStringToJson('url', poster)
        self.program['star'] = spiderTool.listStringToJson('name', star)
        self.program['director'] = spiderTool.listStringToJson(
            'name', director)
        self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
        self.program['shootYear'] = shootYear
        self.program['area'] = spiderTool.listStringToJson('name', area)
        self.program['playTimes'] = long(playTimes)
        self.program['intro'] = intro
        self.program['mainId'] = mainId
        urlList = re.findall(
            r'<li\s+data="(reload_\d+)"\s*(class="current"){0,1}\s*>', doc)
        if urlList:
            for url in urlList:
                seed_sub = 'http://www.youku.com/show_episode/id_%s.html?dt=json&divid=%s' % (
                    mainId, url[0])
                self.secondSpider(seed_sub)
        else:
            seed_sub = 'http://www.youku.com/show_episode_id_%s.html?dt=json' % mainId
            self.secondSpider(seed_sub)
Пример #8
0
    def firstSpider(self, seed):
        poster = ""
        star = ""
        director = ""
        ctype = ""
        shootYear = ""
        intro = ""
        mainId = ""
        area = ""
        name = ""
        programLanguage = ""
        doc = spiderTool.getHtmlBody(seed)
        soup = BeautifulSoup(doc, from_encoding="utf8")
        seed = ""
        seed_p = soup.find('a', attrs={'class': 'album_title'})
        if seed_p is not None:
            if seed_p.get("href") is not None:
                seed = seed_p.get("href")
        if seed == "":
            return
        doc = spiderTool.getHtmlBody(seed)
        soup = BeautifulSoup(doc, from_encoding="utf8")

        poster_p = soup.find("img", attrs={'itemprop': 'image'})
        if poster_p is not None:
            if poster_p.get("src") is not None:
                poster = poster_p.get("src")
                if not re.search(r'http', poster):
                    poster = "http:%s" % poster
            if poster_p.get("alt") is not None:
                name = poster_p.get("alt")

        video_type = soup.find("div", attrs={"class": "video_type cf"})
        if video_type is not None:
            divs = video_type.find_all('div', attrs={"class": "type_item"})
            for div in divs:
                text = div.get_text()
                if re.search(r'地 区:'.decode("utf8"), text):
                    area_p = div.find('span', attrs={"class": "type_txt"})
                    if area_p is not None:
                        area = area_p.get_text()
                if re.search(r'语 言:'.decode("utf8"), text):
                    programLanguage_p = div.find('span',
                                                 attrs={"class": "type_txt"})
                    if programLanguage_p is not None:
                        programLanguage = programLanguage_p.get_text()
                if re.search(r'上映时间:'.decode("utf8"), text):
                    shootYear_p = div.find('span', attrs={"class": "type_txt"})
                    if shootYear_p is not None:
                        shootYear = shootYear_p.get_text().split("-")[0]

        ctype_P = soup.find("div", attrs={"class": "tag_list"})
        if ctype_P is not None:
            ctype_list = []
            for each in ctype_P.find_all('a'):
                ctype_list.append(each.get_text())
            ctype = ",".join(ctype_list)

        person_p = soup.find("ul", attrs={"class": "actor_list cf"})
        if person_p is not None:
            lis = person_p.find_all('li')
            director_list = []
            star_list = []
            for li in lis:
                person = ""
                star_p = li.find("span", attrs={'class': 'name'})
                if star_p is not None:
                    person = star_p.get_text()
                if person != "" and li.find("span",
                                            attrs={"class": "director"}):
                    director_list.append(person)
                elif person != "":
                    star_list.append(person)
            star = ",".join(star_list)
            director = ",".join(director_list)

        content_p = soup.find('span',
                              attrs={"class": "txt _desc_txt_lineHight"})
        if content_p is not None:
            intro = content_p.get_text().strip()

        if re.match(r'http://film\.qq\.com/(page|cover)/.*/(.*?).html', seed):
            mainId = re.match(
                r'http://film\.qq\.com/(page|cover)/.*/(.*?).html',
                seed).group(2)
        if re.match(r'http://v\.qq\.com/detail/[\w\d]/(.*?).html', seed):
            mainId = re.match(r'http://v\.qq\.com/detail/[\w\d]/(.*?).html',
                              seed).group(1)

        self.program["name"] = spiderTool.changeName(name)
        self.program['poster'] = spiderTool.listStringToJson('url', poster)
        self.program['star'] = spiderTool.listStringToJson('name', star)
        self.program['director'] = spiderTool.listStringToJson(
            'name', director)
        self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
        self.program['shootYear'] = shootYear
        self.program['area'] = spiderTool.listStringToJson('name', area)
        self.program['programLanguage'] = programLanguage
        self.program['intro'] = intro
        self.program['mainId'] = mainId
        if mainId != "":
            seed_sub = 'http://s.video.qq.com/loadplaylist?vkey=898_qqvideo_clpl_%s_qq&vtype=' % mainId
            self.secondSpider(seed_sub)