def firstSpider(self, seed):
    """Scrape an acfun.tv video page and fill self.program's metadata.

    seed: page URL, e.g. http://www.acfun.tv/v/<id>.
    Populates alias/point/poster/star/director/shootYear/area/playTimes/
    intro/mainId/name/ctype on self.program, then hands the per-episode
    videoList on to self.secondSpider().
    """
    poster = ""
    star = ""
    director = ""
    ctype = ""
    shootYear = ""
    intro = ""
    mainId = ""
    area = ""
    name = ""
    # NOTE(review): programLanguage/doubanPoint/pcUrl/duration are assigned
    # here but never used or stored below.
    programLanguage = ""
    point = -1.0
    doubanPoint = -1.0
    playTimes = 0
    pcUrl = ""
    duration = ""
    alias = ""
    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc, from_encoding="utf8")
    # Poster URL lives in a data attribute of the #pageInfo div.
    pageInfo = soup.find('div', attrs={'id': 'pageInfo'})
    if pageInfo is not None:
        # name = pageInfo.get('data-title')
        poster = pageInfo.get('data-pic')
        # intro = pageInfo.get('data-desc')
    introPage = soup.find('div', attrs={'class': 'introduction'})
    if introPage is not None:
        # get intro
        intro_p = introPage.find('div', attrs={'class': 'desc gheight'})
        if intro_p is not None:
            intro = intro_p.find('div').get_text()
    # get playtimes -- the real extraction is commented out, so playTimes
    # always stays 0 here.
    playTimes_p = soup.find('section',
                            attrs={'class': 'clearfix wp area crumb'})
    if playTimes_p is not None:
        # playTimes_p = playTimes_p.find('span', attrs={'class': 'sp2'}).get_text()
        playTimes = 0
    # mainId is the path component after /v/ in the seed URL.
    if re.match(r'http://www\.acfun\.tv/v/(.*)', seed):
        mainId = re.match(r'http://www\.acfun\.tv/v/(.*)', seed).group(1)
    self.program["alias"] = spiderTool.changeName(alias)
    self.program["point"] = float(point)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson(
        'name', director)
    self.program['shootYear'] = shootYear
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['playTimes'] = playTimes
    self.program['intro'] = intro
    self.program['mainId'] = mainId
    videoList = []
    # Re-parse the same document and scan inline <script> blocks for the
    # "pageInfo = {...}" JSON assignment carrying title/tags/videoList.
    soup = BeautifulSoup(doc, from_encoding="utf8")
    data = soup.find_all('script')
    for li in data:
        # ''.join(li) concatenates the script tag's string children.
        if re.match(r'.*pageInfo.*', ''.join(li)):
            pageInfo = li.get_text()
            pageInfo_p = re.findall(r'pageInfo\s+=\s+(.*)', pageInfo)
            if pageInfo_p:
                try:
                    pageInfo_json = json.loads(pageInfo_p[0])
                except:
                    # Malformed JSON -- stop scanning scripts.
                    break
                # get ctype
                ctype_list = []
                for tag in pageInfo_json['tagList']:
                    ctype_list.append(tag['name'])
                ctype = ','.join(ctype_list)
                # get name
                name = pageInfo_json['title']
                videoList = pageInfo_json['videoList']
    self.program["name"] = spiderTool.changeName(name)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.secondSpider(videoList)
def firstSpider(self, seed):
    """Scrape a Youku show page and fill self.program's metadata.

    seed: either a v.youku.com playback URL or a show-page URL; playback
    URLs are first redirected to the show page via the 'desc-link' anchor.
    Populates name/alias/point/poster/star/director/ctype/shootYear/area/
    playTimes/intro/mainId on self.program, then calls self.secondSpider().
    """
    point = 0.0
    poster = ""
    pcUrl = ""
    name = ""
    shootYear = ""
    alias = ""
    area = ""
    star = ""
    director = ""
    ctype = ""
    playTimes = 0
    intro = ""
    playLength = ""
    mainId = ""
    youkushootYear = ""
    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc)
    seed_v = re.search(r'http://v\.youku\.com/v_show/id_', seed)
    seed_Re = re.search(r'http://www\.youku\.com/show_page/id', seed)
    if seed_v:
        # Playback page: follow the protocol-relative href to the show page.
        seed_P = soup.find('a', attrs={'class': 'desc-link'})
        if seed_P is not None:
            seed = seed_P.get('href')
            seed = "http:%s" % seed
            doc = spiderTool.getHtmlBody(seed)
            soup = BeautifulSoup(doc)
    elif not seed_Re:
        # Unknown page type: try the title link instead.
        seed_P = soup.find('h1', attrs={'class': 'title'})
        if seed_P is not None:
            seed_aTag = seed_P.find('a')
            if seed_aTag is not None:
                seed = seed_aTag.get('href')
                seed = "http:%s" % seed
                doc = spiderTool.getHtmlBody(seed)
                soup = BeautifulSoup(doc)
    poster_p = soup.find("div", attrs={'class': 'p-thumb'})
    if poster_p is not None:
        poster = poster_p.find('img').get("src")
    # Metadata rows live under div.p-base; dispatch on each <li>'s class
    # or on labels found in its raw HTML.
    p_base_content = soup.find('div', attrs={'class': 'p-base'})
    if p_base_content is not None:
        for li in p_base_content.find_all('li'):
            li_p = str(li)
            if li.find('span', attrs={'class': 'star-num'}) is not None:
                point = li.find('span', attrs={
                    'class': 'star-num'
                }).get_text()
            elif li.get('class') == ['p-row', 'p-title']:
                # Title text sits between the closing </a> and a <span>.
                name_p = re.findall(r'/a>:(.*)<span'.decode('utf8'),
                                    li_p.decode('utf8'))
                if name_p:
                    name = name_p[0]
                    if re.search(r'<'.decode('utf8'), name):
                        name_p = re.findall(r'(.*)<span'.decode('utf8'),
                                            name)
                        if name_p:
                            name = name_p[0]
            elif li.get('class') == ['p-alias']:
                # NOTE(review): alias_p is a dead store -- alias itself is
                # never assigned from it.
                alias_p = li.get('title')
            elif li.find('span', attrs={'class': 'pub'}) is not None:
                # Release date; the Youku-exclusive date is kept separately
                # and only used as a fallback (see after the loop).
                shootYear_P = li.find('span', attrs={'class': 'pub'})
                if re.search(r'优酷'.decode('utf8'), li_p.decode('utf8')):
                    youkushootYear_text = re.findall(
                        r'/label>(.*)</span', str(shootYear_P))
                    if youkushootYear_text:
                        youkushootYear_text = youkushootYear_text[0]
                        youkushootYear = ''.join(
                            youkushootYear_text.split('-')[0])
                else:
                    shootYear_text = re.findall(r'/label>(.*)</span',
                                                str(shootYear_P))
                    if shootYear_text:
                        shootYear_text = shootYear_text[0]
                        shootYear = ''.join(shootYear_text.split('-')[0])
            elif re.search(r'<li>地区'.decode('utf8'), li_p.decode('utf8')):
                area_p = li.get_text()
                area_p = re.findall(r'地区:(.*)'.decode('utf8'), area_p)
                if area_p:
                    area = area_p[0]
                    area = area.replace('/', ',')
            elif re.search(r'<li>类型'.decode('utf8'), li_p.decode('utf8')):
                ctype_p = li.get_text()
                ctype_p = re.findall(r'类型:(.*)'.decode('utf8'), ctype_p)
                if ctype_p:
                    ctype = ctype_p[0]
                    ctype = ctype.replace('/', ',')
            elif re.search(r'<li>导演'.decode('utf8'), li_p.decode('utf8')):
                director_p = li.get_text()
                director_p = re.findall(r'导演:(.*)'.decode('utf8'),
                                        director_p)
                if director_p:
                    director = director_p[0]
            elif li.get('class') == ['p-performer']:
                star_list = []
                for each in li.find_all('a'):
                    star_list.append(each.get_text())
                star = ','.join(star_list)
            elif re.search(r'<li>总播放数'.decode('utf8'),
                           li_p.decode('utf8')):
                playTimesStr = li.get_text()
                playTimesStr = re.findall(r'总播放数:(.*)'.decode('utf8'),
                                          playTimesStr)
                if playTimesStr:
                    playTimesStr = playTimesStr[0]
                    # Strip thousands separators by keeping digit runs only.
                    playTimes_list = re.findall(r'(\d+)', playTimesStr)
                    playTimes = long(''.join(playTimes_list))
            elif li.get('class') == ['p-row', 'p-intro']:
                intro = li.find('span').get_text().strip()
            else:
                continue
    # Fall back to the Youku-exclusive release year when no general one.
    if shootYear == "":
        shootYear = youkushootYear
    if re.match(r'http://list\.youku\.com/show/id_(.+)\.html', seed):
        mainId = re.match(r'http://list\.youku\.com/show/id_(.+)\.html',
                          seed).group(1)
    if re.match(r'http://v\.youku\.com/v_show/id_(.+)\.html', seed):
        mainId = re.match(r'http://v\.youku\.com/v_show/id_(.+)\.html',
                          seed).group(1)
    self.program["name"] = spiderTool.changeName(name)
    self.program["alias"] = spiderTool.changeName(alias)
    self.program["point"] = float(point)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson(
        'name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['shootYear'] = shootYear
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['playTimes'] = long(playTimes)
    self.program['intro'] = intro
    self.program['mainId'] = mainId
    # NOTE(review): called with no arguments here, unlike the other
    # firstSpider variants -- confirm against this class's secondSpider.
    self.secondSpider()
def secondSpider(self, seed_sub):
    """Fetch one Youku episode-list AJAX page and append episode entries.

    seed_sub: URL of an endpoint returning JSON with an 'html' fragment.
    For each episode <li> in that fragment, builds a program_sub dict
    (setNumber/setName/webUrl/...) and appends it to
    self.program['programSub'].  Trailer entries ('预告') and entries
    missing a number, name or URL are skipped.
    """
    doc = spiderTool.getHtmlBody(seed_sub)
    try:
        data = json.loads(doc)
    except:
        # Response was not JSON (error page, empty body) -- nothing to do.
        #print("load json error22222!")
        return
    if data.get('html') is None:
        #print("get html error22222")
        return
    sub_soup = BeautifulSoup(data['html'])
    for li in sub_soup.find_all('li'):
        self.program_sub = copy.deepcopy(PROGRAM_SUB)
        setNumber = ""
        setName = ""
        webUrl = ""
        poster = ""
        playLength = ""
        setIntro = ""
        plot = []
        webUrl_p = li.find('a')
        if webUrl_p is not None:
            # hrefs are protocol-relative on the source page.
            webUrl = "http:%s" % webUrl_p.get('href')
        setName_p = li.find('a')
        if setName_p is not None:
            setName = setName_p.get('title')
            if setName is None:
                setName = setName_p.get_text()
        setNumber_p = li.find('dt')
        if setNumber_p is not None:
            setNumber = setNumber_p.get_text()
        # Normalize the episode number from the various label formats.
        if self.program['shootYear'] != "" and re.search(
                r'^\d{2}-\d{2}期$'.decode('utf8'), setNumber):
            # "MM-DD期" plus a known year -> YYYYMMDD.
            setNumber = setNumber.replace("-", "")
            setNumber = self.program['shootYear'] + re.findall(
                r'(\d+)', setNumber)[0]
        elif re.search(r'\d{2}-\d{2}期'.decode('utf8'), setNumber):
            setNumber = setNumber.replace("-", "")
            setNumber = re.findall(r'(\d+)', setNumber)[0]
        elif re.search(r'\d+'.decode('utf8'), setNumber):
            # BUGFIX: r'\d+' has no capture group, so the original
            # .group(1) raised "no such group" whenever this branch ran;
            # .group() returns the whole match.
            setNumber = re.search(r'\d+'.decode('utf8'),
                                  setNumber).group()
        # NOTE(review): the two setNumber branches below are unreachable --
        # any string matching 第(\d+)期/集 also matches the plain \d+
        # branch above.  Kept as-is to preserve structure.
        elif re.search(r'第(\d+)期'.decode('utf8'), setNumber):
            setNumber = re.search(r'第(\d+)期'.decode('utf8'),
                                  setNumber).group(1)
        elif re.search(r'第(\d+)期'.decode('utf8'), setName):
            setNumber = re.search(r'第(\d+)期'.decode('utf8'),
                                  setName).group(1)
        elif re.search(r'第(\d+)集'.decode('utf8'), setNumber):
            setNumber = re.search(r'第(\d+)集'.decode('utf8'),
                                  setNumber).group(1)
        elif re.search(r'第(\d+)集'.decode('utf8'), setName):
            setNumber = re.search(r'第(\d+)集'.decode('utf8'),
                                  setName).group(1)
        else:
            setNumber = ""
        # Fallbacks: first digit run anywhere in the <li>, then in the name.
        if setNumber == "":
            li_text = li.get_text()
            num_list = re.findall(r'(\d+)', li_text)
            if num_list:
                setNumber = num_list[0]
        if setNumber == "" and setName != "" and setName is not None and setNumber is not None:
            if re.search(r'\d+'.decode('utf8'), setName):
                setNumber = re.search(r'\d+'.decode('utf8'),
                                      setName).group(0)
        # Skip trailers and incomplete entries.
        if setNumber == "" or setName == "" or setNumber is None or setName is None or webUrl is None or webUrl == "" or \
                re.search(r'预告'.decode('utf8'), setName):
            continue
        self.program_sub['setNumber'] = setNumber
        self.program_sub['setName'] = setName
        self.program_sub['webUrl'] = webUrl
        self.program_sub['poster'] = poster
        self.program_sub['playLength'] = playLength
        self.program_sub['setIntro'] = setIntro
        self.program_sub['plot'] = plot
        self.program['programSub'].append(self.program_sub)
def firstSpider(self, seed):
    """Scrape a Youku show page, fill self.program, then crawl episode pages.

    Same metadata extraction as the other Youku firstSpider variant, but
    afterwards extracts the PageConfig showid from inline scripts, fetches
    the show-info module JSON, collects the distinct "reload_<n>" stage ids
    from it and calls self.secondSpider() once per stage (in numeric order).
    """
    point = 0.0
    poster = ""
    name = ""
    shootYear = ""
    alias = ""
    area = ""
    star = ""
    director = ""
    ctype = ""
    playTimes = 0
    intro = ""
    mainId = ""
    youkushootYear = ""
    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc)
    seed_v = re.search(r'http://v\.youku\.com/v_show/id_', seed)
    seed_Re = re.search(r'http://www\.youku\.com/show_page/id', seed)
    if seed_v:
        # Playback page: follow the protocol-relative href to the show page.
        seed_P = soup.find('a', attrs={'class': 'desc-link'})
        if seed_P is not None:
            seed = seed_P.get('href')
            seed = "http:%s" % seed
            doc = spiderTool.getHtmlBody(seed)
            soup = BeautifulSoup(doc)
    elif not seed_Re:
        seed_P = soup.find('h1', attrs={'class': 'title'})
        if seed_P is not None:
            seed_aTag = seed_P.find('a')
            if seed_aTag is not None:
                seed = seed_aTag.get('href')
                seed = "http:%s" % seed
                doc = spiderTool.getHtmlBody(seed)
                soup = BeautifulSoup(doc)
    poster_p = soup.find("div", attrs={'class': 'p-thumb'})
    if poster_p is not None:
        poster = poster_p.find('img').get("src")
    # Metadata rows under div.p-base, dispatched per <li> as in the other
    # Youku variant.
    p_base_content = soup.find('div', attrs={'class': 'p-base'})
    if p_base_content is not None:
        for li in p_base_content.find_all('li'):
            li_p = str(li)
            if li.find('span', attrs={'class': 'star-num'}) is not None:
                point = li.find('span', attrs={
                    'class': 'star-num'
                }).get_text()
            elif li.get('class') == ['p-row', 'p-title']:
                name_p = re.findall(r'/a>:(.*)<span'.decode('utf8'),
                                    li_p.decode('utf8'))
                if name_p:
                    name = name_p[0]
                    if re.search(r'<'.decode('utf8'), name):
                        name_p = re.findall(r'(.*)<span'.decode('utf8'),
                                            name)
                        if name_p:
                            name = name_p[0]
            elif li.get('class') == ['p-alias']:
                # NOTE(review): dead store -- alias is never set from this.
                alias_p = li.get('title')
            elif li.find('span', attrs={'class': 'pub'}) is not None:
                shootYear_P = li.find('span', attrs={'class': 'pub'})
                if re.search(r'优酷'.decode('utf8'), li_p.decode('utf8')):
                    youkushootYear_text = re.findall(
                        r'/label>(.*)</span', str(shootYear_P))
                    if youkushootYear_text:
                        youkushootYear_text = youkushootYear_text[0]
                        youkushootYear = ''.join(
                            youkushootYear_text.split('-')[0])
                else:
                    shootYear_text = re.findall(r'/label>(.*)</span',
                                                str(shootYear_P))
                    if shootYear_text:
                        shootYear_text = shootYear_text[0]
                        shootYear = ''.join(shootYear_text.split('-')[0])
            elif re.search(r'<li>地区'.decode('utf8'), li_p.decode('utf8')):
                area_p = li.get_text()
                area_p = re.findall(r'地区:(.*)'.decode('utf8'), area_p)
                if area_p:
                    area = area_p[0]
                    area = area.replace('/', ',')
            elif re.search(r'<li>类型'.decode('utf8'), li_p.decode('utf8')):
                ctype_p = li.get_text()
                ctype_p = re.findall(r'类型:(.*)'.decode('utf8'), ctype_p)
                if ctype_p:
                    ctype = ctype_p[0]
                    ctype = ctype.replace('/', ',')
            elif re.search(r'<li>导演'.decode('utf8'), li_p.decode('utf8')):
                director_p = li.get_text()
                director_p = re.findall(r'导演:(.*)'.decode('utf8'),
                                        director_p)
                if director_p:
                    director = director_p[0]
            elif li.get('class') == ['p-performer']:
                star_list = []
                for each in li.find_all('a'):
                    star_list.append(each.get_text())
                star = ','.join(star_list)
            elif re.search(r'<li>总播放数'.decode('utf8'),
                           li_p.decode('utf8')):
                playTimesStr = li.get_text()
                playTimesStr = re.findall(r'总播放数:(.*)'.decode('utf8'),
                                          playTimesStr)
                if playTimesStr:
                    playTimesStr = playTimesStr[0]
                    playTimes_list = re.findall(r'(\d+)', playTimesStr)
                    playTimes = long(''.join(playTimes_list))
            elif li.get('class') == ['p-row', 'p-intro']:
                intro = li.find('span').get_text().strip()
            else:
                continue
    if shootYear == "":
        shootYear = youkushootYear
    if re.match(r'http://list\.youku\.com/show/id_(.+)\.html', seed):
        mainId = re.match(r'http://list\.youku\.com/show/id_(.+)\.html',
                          seed).group(1)
    if re.match(r'http://v\.youku\.com/v_show/id_(.+)\.html', seed):
        mainId = re.match(r'http://v\.youku\.com/v_show/id_(.+)\.html',
                          seed).group(1)
    self.program["name"] = spiderTool.changeName(name)
    self.program["alias"] = spiderTool.changeName(alias)
    self.program["point"] = float(point)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson(
        'name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['shootYear'] = shootYear
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['playTimes'] = long(playTimes)
    self.program['intro'] = intro
    self.program['mainId'] = mainId
    # Locate the PageConfig script to recover the show id, then fetch the
    # show-info module JSON to enumerate the episode "reload" stages.
    showid = ""
    showid_url = ""
    p_list_p = soup.find_all('script', attrs={'type': 'text/javascript'})
    if p_list_p is not None:
        for each in p_list_p:
            if re.search(r'PageConfig', str(each)):
                showid_p = re.findall(r'showid:"(.*)", videoId', str(each))
                if showid_p:
                    showid = showid_p[0]
                    showid_url = "http://list.youku.com/show/module?id=%s&tab=showInfo" % showid
    if showid_url != "":
        sub_doc = spiderTool.getHtmlBody(showid_url)
        try:
            data = json.loads(sub_doc)
        except:
            #print("load json error1111!")
            return
        if data.get('html') is None:
            #print("get html error1111")
            return
        sub_soup = BeautifulSoup(data['html'])
        # Distinct stage ids like id="reload_3" inside the html fragment.
        reload_list_p = re.findall(r'id="reload_(\d+)"', data['html'])
        reload_list = list(set(reload_list_p))
        if reload_list:
            # Python 2 cmp-style comparator: sort stage ids numerically.
            def numeric_compare(x, y):
                x = int(x)
                y = int(y)
                if x > y:
                    return 1
                elif x == y:
                    return 0
                else:  # x<y
                    return -1
            reload_list.sort(numeric_compare)
            # print(reload_list)
            for reload in reload_list:
                sub_seed = "http://list.youku.com/show/episode?id=%s&stage=reload_%s" % (
                    mainId, reload)
                self.secondSpider(sub_seed)
def firstSpider(self, seed):
    """Scrape an iQiyi album page and fill self.program's metadata.

    Handles three poster layouts and two detail layouts (search-result vs
    message-header markup).  mainId is "<cid>_<albumId>" recovered from an
    inline "albumId: ..., cid: ..." script, after which the cached avlist
    JSON endpoints are crawled page by page via self.secondSpider().
    """
    name = ""
    poster = ""
    star = ""
    director = ""
    ctype = ""
    programLanguage = ""
    intro = ""
    mainId = ""
    area = ""
    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc, from_encoding="utf8")
    # Three alternative poster containers, tried in order.
    poster_p = soup.find("ul", attrs={'class': 'focus_img_list'})
    poster_p_1 = soup.find("div", attrs={"class": "result_pic pr"})
    poster_p_2 = soup.find('div',
                           attrs={'class': 'album-picCon album-picCon-onePic'})
    if poster_p is not None:
        # Poster URL is embedded in the li's inline style: url(...).
        poster_p_sub = poster_p.find('li').get('style')
        if re.search(r'\((.*?)\)'.decode('utf8'), poster_p_sub):
            poster = re.search(r'\((.*?)\)'.decode('utf8'),
                               poster_p_sub).group(1)
    elif poster_p_1 is not None:
        img_tag = poster_p_1.find("img")
        if img_tag is not None:
            poster = img_tag.get("src")
            name = img_tag.get("alt")
    elif poster_p_2 is not None:
        poster = poster_p_2.find('img').get('src')
    name_p = soup.find('a', attrs={'class': 'white'})
    if name_p is not None:
        name = name_p.get_text()
    # Two alternative detail layouts; each labelled row yields a
    # comma-joined list of its anchor texts.
    detail = soup.find("div", attrs={"class": "result_detail-minH"})
    detail_1 = soup.find("div", attrs={"class": "msg-hd-lt fl"})
    if detail is not None:
        for div_p in detail.find_all("div",
                                     attrs={"class": "topic_item clearfix"}):
            for each in div_p.find_all("div"):
                a_list = []
                for a_tag in each.find_all('a'):
                    a_list.append(a_tag.get_text())
                a_str = ",".join(a_list)
                if re.search("主演:".decode("utf8"),each.get_text()):
                    star = a_str
                if re.search("导演:".decode("utf8"),each.get_text()):
                    director = a_str
                if re.search("类型:".decode("utf8"),each.get_text()):
                    ctype = a_str
                if re.search("语言:".decode("utf8"),each.get_text()):
                    programLanguage = a_str
                if re.search("地区:".decode("utf8"),each.get_text()):
                    area = a_str
    elif detail_1 is not None:
        for p_tag in detail_1.find_all("p"):
            a_list = []
            for a_tag in p_tag.find_all('a'):
                a_list.append(a_tag.get_text())
            a_str = ','.join(a_list)
            if re.search("导演:".decode("utf8"),p_tag.get_text()):
                director = a_str
            if re.search("类型:".decode("utf8"),p_tag.get_text()):
                ctype = a_str
            if re.search("语言:".decode("utf8"),p_tag.get_text()):
                programLanguage = a_str
            if re.search("地区:".decode("utf8"),p_tag.get_text()):
                area = a_str
            if re.search("主演:".decode("utf8"),p_tag.get_text()):
                star = a_str
    # Intro text follows a "简介:" label in one of two containers.
    content_p = soup.find('span', attrs={"class": "showMoreText",
                                         "data-moreorless":"moreinfo",
                                         "style":"display: none;"})
    content_p_1 = soup.find('div', attrs={"data-moreorless":"moreinfo"})
    if content_p is not None:
        if content_p.find("span"):
            content_p = content_p.find("span")
        if re.search("简介:".decode("utf8"),content_p.get_text()):
            intro = content_p.get_text().split("简介:".decode("utf8"))[1].strip()
    elif content_p_1 is not None:
        if re.search("简介:".decode("utf8"),content_p_1.get_text()):
            intro = content_p_1.get_text().split("简介:".decode("utf8"))[1].strip()
    self.program["name"] = spiderTool.changeName(name)
    self.program['poster'] = spiderTool.listStringToJson('url',poster)
    self.program['star'] = spiderTool.listStringToJson('name',star)
    self.program['director'] = spiderTool.listStringToJson('name',director)
    self.program['ctype'] = spiderTool.listStringToJson('name',ctype)
    self.program['programLanguage'] = programLanguage
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['intro'] = intro
    # albumId/cid come from an inline script; they also drive pagination of
    # the cached episode-list (avlist) JSON endpoint, 50 entries per page.
    seed_P = re.search(
        r'albumId:\s*(?P<albumId>\d+)[^\\]*?cid:\s*(?P<cid>\d+)'.decode('utf8'),
        doc)
    if seed_P:
        albumId = seed_P.group('albumId')
        cid = seed_P.group('cid')
        self.program['mainId'] = cid + "_" +albumId
        seed_sub = 'http://cache.video.iqiyi.com/jp/avlist/%s/' %(albumId)
        allNum = 0
        doc = spiderTool.getHtmlBody(seed_sub)
        try:
            # Response is "var x = {...}"; parse the JSON after the '='.
            json_data = json.loads(doc.split('=')[1])
            data = json_data["data"]["vlist"]
            allNum = json_data["data"]["allNum"]
        except:
            data = []
        # Python 2 integer division: enough 50-item pages to cover allNum.
        for i in range(1,(int(allNum)/50 + 2)):
            seed_sub = 'http://cache.video.iqiyi.com/jp/avlist/%s/%s/50/' %(albumId, str(i))
            self.secondSpider(seed_sub)
def firstSpider(self, seed):
    """Scrape a bilibili video page and fill self.program's metadata.

    seed: page URL, e.g. http://www.bilibili.com/video/<id>/.
    Extracts poster, title, intro and tag list, then passes the <option>
    list of the multi-part selector to self.secondSpider().
    """
    poster = ""
    star = ""
    director = ""
    ctype = ""
    shootYear = ""
    intro = ""
    mainId = ""
    area = ""
    name = ""
    # NOTE(review): programLanguage/doubanPoint/pcUrl/duration are assigned
    # but never used; poster is initialized twice in the original.
    programLanguage = ""
    point = -1.0
    doubanPoint = -1.0
    poster = ""
    playTimes = 0
    pcUrl = ""
    duration = ""
    alias = ""
    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc, from_encoding="utf8")
    #get poster
    poster_p = soup.find('img', attrs={'class': 'cover_image'})
    if poster_p is not None:
        poster = poster_p.get('src')
        # Cover URLs may be protocol-relative.
        if not re.match(r'http://', poster):
            poster = "http:%s" % poster
    #get name
    page_body = soup.find('div', attrs={'class': 'b-page-body'})
    if page_body is not None:
        name_p = page_body.find('div', attrs={'class': 'v-title'})
        if name_p is not None:
            name = name_p.get_text()
    v_info = soup.find('div', attrs={'class': 'v_info'})
    if v_info is not None:
        #get intro
        intro_p = v_info.find('div', attrs={'class': 'intro'})
        if intro_p is not None:
            intro = intro_p.get_text().strip()
        #get ctype -- the tag list items, comma-joined
        ctype_list = []
        ctype_p = v_info.find('div', attrs={'class': 's_tag'})
        if ctype_p is not None:
            ctype_p = ctype_p.find_all('li')
            for li in ctype_p:
                ctype_list.append(li.get_text())
            ctype = ','.join(ctype_list)
    # mainId is the path component between /video/ and the trailing slash.
    if re.match(r'http://www\.bilibili\.com/video/(.*)/', seed):
        mainId = re.match(r'http://www\.bilibili\.com/video/(.*)/',
                          seed).group(1)
    self.program['name'] = spiderTool.changeName(name)
    self.program["alias"] = spiderTool.changeName(alias)
    self.program["point"] = float(point)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson(
        'name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['shootYear'] = shootYear
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['playTimes'] = long(playTimes)
    self.program['intro'] = intro
    self.program['mainId'] = mainId
    # Multi-part videos expose their parts as <option>s of a <select>.
    videoList = []
    detailpage_p = soup.find('select', attrs={'id': 'dedepagetitles'})
    if detailpage_p is not None:
        videoList = detailpage_p.find_all('option')
    self.secondSpider(videoList)
def firstSpider(self, seed):
    """Parse an XML metadata feed (a <video> element) into self.program.

    seed: feed URL containing a videoId=<pid> query parameter, which
    becomes mainId.  Every field is read twice: first from the parsed
    <video> child element, then -- as a fallback -- by regex over the raw
    document, since BeautifulSoup may mangle CDATA sections.  All values
    are stripped of CDATA wrappers.  Ends by handing the
    <videomergeinfolist> element to self.secondSpider().
    """
    name = ""
    poster = ""
    point = 0.0
    shootYear = ""
    star = ""
    director = ""
    ctype = ""
    programLanguage = ""
    intro = ""
    mainId = ""
    area = ""
    if re.search(r'videoId=(?P<pid>\d+)', seed):
        mainId = re.search(r'videoId=(?P<pid>\d+)', seed).group('pid')
    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc, from_encoding="utf8")
    data = soup.find('video')
    if data is None:
        return
    # name: element text first, raw-regex fallback second.
    name_p = data.find('name')
    name_r = re.search(r'<name>(.*?)</name>', doc)
    if name_p is not None:
        name = name_p.get_text().replace('<![CDATA[', '').replace(']]>',
                                                                  '').strip()
    elif name_r:
        name = name_r.group(1).replace('<![CDATA[', '').replace(']]>',
                                                                '').strip()
    # poster from <smallImg>.
    poster_p = data.find('smallImg')
    poster_r = re.search(r'<smallImg>(.*?)</smallImg>', doc)
    if poster_p is not None:
        poster = poster_p.get_text().replace('<![CDATA[', '').replace(
            ']]>', '').strip()
    elif poster_r:
        poster = poster_r.group(1).replace('<![CDATA[', '').replace(
            ']]>', '').strip()
    # release date from <screenTime>.
    shootYear_P = data.find('screenTime')
    shootYear_r = re.search(r'<screenTime>(.*?)</screenTime>', doc)
    if shootYear_P is not None:
        shootYear = shootYear_P.get_text().replace('<![CDATA[', '').replace(
            ']]>', '').strip()
    elif shootYear_r:
        shootYear = shootYear_r.group(1).replace('<![CDATA[', '').replace(
            ']]>', '').strip()
    # area from <area>.
    area_P = data.find('area')
    area_r = re.search(r'<area>(.*?)</area>', doc)
    if area_P is not None:
        area = area_P.get_text().replace('<![CDATA[', '').replace(
            ']]>', '').strip()
    elif area_r:
        area = area_r.group(1).replace('<![CDATA[', '').replace(
            ']]>', '').strip()
    # director from <director>.
    director_P = data.find('director')
    director_r = re.search(r'<director>(.*?)</director>', doc)
    if director_P is not None:
        director = director_P.get_text().replace('<![CDATA[', '').replace(
            ']]>', '').strip()
    elif director_r:
        director = director_r.group(1).replace('<![CDATA[', '').replace(
            ']]>', '').strip()
    # cast from <performer>.
    star_P = data.find('performer')
    star_r = re.search(r'<performer>(.*?)</performer>', doc)
    if star_P is not None:
        star = star_P.get_text().replace('<![CDATA[', '').replace(
            ']]>', '').strip()
    elif star_r:
        star = star_r.group(1).replace('<![CDATA[', '').replace(
            ']]>', '').strip()
    # category from <cate>.
    ctype_P = data.find('cate')
    ctype_r = re.search(r'<cate>(.*?)</cate>', doc)
    if ctype_P is not None:
        ctype = ctype_P.get_text().replace('<![CDATA[', '').replace(
            ']]>', '').strip()
    elif ctype_r:
        ctype = ctype_r.group(1).replace('<![CDATA[', '').replace(
            ']]>', '').strip()
    # synopsis from <annotation>.
    intro_P = data.find('annotation')
    intro_r = re.search(r'<annotation>(.*?)</annotation>', doc)
    if intro_P is not None:
        intro = intro_P.get_text().replace('<![CDATA[', '').replace(
            ']]>', '').strip()
    elif intro_r:
        intro = intro_r.group(1).replace('<![CDATA[', '').replace(
            ']]>', '').strip()
    self.program["name"] = spiderTool.changeName(name)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['point'] = point
    # Keep only the year part of a "YYYY-MM-DD" date.
    self.program['shootYear'] = shootYear.split("-")[0]
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson(
        'name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['programLanguage'] = programLanguage
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['intro'] = intro
    self.program['mainId'] = mainId
    detail_sub = data.find('videomergeinfolist')
    if detail_sub is not None:
        self.secondSpider(detail_sub)
def secondSpider(self, video_list):
    """Build program_sub entries from yehetang episode links.

    video_list: list of episode elements, each containing an <a> whose
    href is the episode path.  The first episode's page is fetched to
    extract a '#'-separated "VideoInfoList" crack string; when an entry in
    that list matches an episode's number, the URL-quoted crack segment is
    appended to the episode's webUrl after '###'.  Trailer entries and
    entries missing number/name/url are skipped.
    """
    # BUGFIX: the original indexed video_list[0] before the length check
    # below, raising IndexError on an empty list.
    if not video_list:
        return
    # for crack: fetch the first episode page to pull the VideoInfoList.
    videoinfo_list = []
    crack_seed = "http://www.yehetang.com%s" % video_list[0].find('a').get(
        'href')
    doc = spiderTool.getHtmlBody(crack_seed)
    soup = BeautifulSoup(doc, from_encoding="utf8")
    videoinfo = ""
    videoinfo_p = soup.find_all("script")
    for info in videoinfo_p:
        if re.search(r'VideoInfoList=\"(.*)\"', str(info)):
            videoinfo_p = re.search(r'VideoInfoList=\"(.*)\"', str(info))
            if videoinfo_p is not None:
                videoinfo = videoinfo_p.group(1)
                videoinfo = str(videoinfo)
                videoinfo_list = videoinfo.split('#')
    # check: crack entries must line up one-to-one with episodes.
    if len(video_list) != len(videoinfo_list) or len(
            video_list) == 0 or len(videoinfo_list) == 0:
        return
    subprog_count = 0
    for video in video_list:
        self.program_sub = copy.deepcopy(PROGRAM_SUB)
        setNumber = ""
        setName = ""
        webUrl = ""
        poster = ""
        webUrl = "http://www.yehetang.com%s" % video.find('a').get('href')
        setName = video.get_text()
        # Episode number from "N集" first, then any digit run.
        if re.search(r'(\d+)集'.decode('utf8'), setName):
            setNumber = re.search(r'(\d+)集'.decode('utf8'),
                                  setName).group(1)
        elif re.search(r'(\d+)'.decode('utf8'), setName):
            setNumber = re.search(r'(\d+)'.decode('utf8'),
                                  setName).group(1)
        subprog_count = subprog_count + 1
        # Skip trailers and incomplete entries.
        if re.search(r'预告'.decode('utf8'), setName) or setNumber == "" or setName == "" or webUrl == "" \
                or setNumber is None or setName is None or webUrl is None:
            continue
        # check: only attach the crack segment when its episode number
        # agrees with this episode's number.
        videoinfo_tmp = str(videoinfo_list[int(subprog_count - 1)])
        if re.search(r'(\d+)集'.decode('utf-8'),
                     videoinfo_tmp.decode('utf-8')):
            videoinfo_num = re.search(
                r'(\d+)集'.decode('utf-8'),
                videoinfo_tmp.decode('utf-8')).group(1)
            if videoinfo_num == setNumber:
                parm = urllib.quote(videoinfo_tmp)
                webUrl = "%s###%s" % (webUrl, parm)
        elif re.search(r'(\d+)'.decode('utf-8'),
                       videoinfo_tmp.decode('utf-8')):
            videoinfo_num = re.search(
                r'(\d+)'.decode('utf-8'),
                videoinfo_tmp.decode('utf-8')).group(1)
            if videoinfo_num == setNumber:
                parm = urllib.quote(videoinfo_tmp)
                webUrl = "%s###%s" % (webUrl, parm)
        self.program_sub['setNumber'] = setNumber.replace('-', '')
        self.program_sub['setName'] = setName
        self.program_sub['webUrl'] = webUrl
        self.program_sub['poster'] = poster
        self.program['programSub'].append(self.program_sub)
def firstSpider(self, seed):
    """Fetch mgtv (hunantv) program metadata from the mobile JSON API.

    seed: page URL of the form
    http://www.mgtv.com/<x>/<n>/<mainId>/<x>/<pid>.html.  The video id
    (pid) is used to query m.api.hunantv.com/video/getbyid; fields are
    read from the JSON response.  Finally builds the episode-list API URL
    and delegates to self.secondSpider(seed_sub, seed, pid).
    """
    name = ""
    poster = ""
    point = 0.0
    shootYear = ""
    star = ""
    director = ""
    ctype = ""
    programLanguage = ""
    intro = ""
    mainId = ""
    area = ""
    pid = ""
    pid_P = re.search(
        r'http://www.mgtv.com/\w/\d+/(?P<mainId>\d+)/\w/(?P<pid>\d+)\.html',
        seed)
    if pid_P:
        pid = pid_P.group('pid')
        mainId = pid_P.group('mainId')
    seedJson = ""
    if pid != '':
        seedJson = 'http://m.api.hunantv.com/video/getbyid?videoId=%s' % (
            pid)
    else:
        # No video id recoverable from the URL -- nothing to fetch.
        return
    if mainId == "":
        mainId = pid
    doc = spiderTool.getHtmlBody(seedJson)
    # An HTML body means the API answered with an error page; retry once.
    if re.search(r'<html>', doc):
        doc = spiderTool.getHtmlBody(seedJson)
    json_data = json.loads(doc.strip())
    # Unwrap data -> detail; bail out when the envelope is not as expected.
    # NOTE(review): nesting reconstructed from mangled source -- each else
    # assumed to pair with the nearest if; confirm against history.
    if json_data.has_key("data"):
        if type(json_data["data"]) is types.DictionaryType:
            json_data = json_data["data"]
            if json_data.has_key("detail"):
                json_data = json_data["detail"]
            else:
                return
        else:
            return
    if json_data.has_key("collectionName"):
        name = json_data["collectionName"]
    if json_data.has_key("image"):
        poster = json_data["image"]
    if json_data.has_key("year"):
        shootYear = json_data["year"]
    if json_data.has_key("director"):
        director_P = json_data["director"]
        if type(director_P) is types.UnicodeType:
            # API separates names with " / "; normalize to commas.
            director = director_P.strip().replace(" / ", ",")
    if json_data.has_key("player"):
        star_P = json_data["player"]
        if type(star_P) is types.UnicodeType:
            star = star_P.strip().replace(" / ", ",")
    if json_data.has_key("area"):
        area_P = json_data["area"]
        if type(area_P) is types.UnicodeType:
            area = area_P.strip().replace(" ", ",")
    if json_data.has_key('desc'):
        intro = json_data['desc']
    self.program["name"] = spiderTool.changeName(name)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['point'] = point
    self.program['shootYear'] = shootYear
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson(
        'name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['programLanguage'] = programLanguage
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['intro'] = intro
    self.program['mainId'] = mainId
    # NOTE(review): typeId and collectionId are read but never used below.
    if json_data.has_key('typeId'):
        typeId = json_data['typeId']
    if json_data.has_key('collectionId'):
        collectionId = json_data['collectionId']
    # Episode-list page size defaults to 20 unless the API reports a total.
    pageNum = 20
    if json_data.has_key("totalvideocount"):
        pageNum = json_data["totalvideocount"]
    seed_sub = "http://m.api.hunantv.com/video/getListV2?videoId=%s&pageId=0&pageNum=%s" % (
        pid, pageNum)
    self.secondSpider(seed_sub, seed, pid)
def firstSpider(self, seed):
    """Scrape an iQiyi lib page and fill self.program's metadata.

    seed: URL of the form .../lib/<mainId>.html.  Reads poster, score and
    labelled metadata rows (director / type / language / area / cast /
    alias / release year / runtime), then recovers sourceId+cid from the
    series-download data attributes and delegates the episode list to
    self.secondSpider().
    """
    name = ""
    pcUrl = ""
    poster = ""
    point = 0.0
    alias = ""
    shootYear = ""
    star = ""
    director = ""
    ctype = ""
    programLanguage = ""
    intro = ""
    mainId = ""
    area = ""
    playLength = ""
    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc, from_encoding="utf8")
    mainId_r = re.search(r'lib/(.*?)\.html'.decode('utf8'), seed)
    if mainId_r:
        mainId = mainId_r.group(1)
    poster_p = soup.find("div", attrs={'class': 'result_pic'})
    if poster_p is not None:
        poster_p_tag = poster_p.find('img')
        if poster_p_tag is not None:
            poster = poster_p_tag.get('src')
        name_p = poster_p.find('a')
        if name_p is not None:
            name = name_p.get('title')
            pcUrl = name_p.get('href')
    point_P = soup.find('span', attrs={'class': 'score_font'})
    if point_P is not None:
        # BUGFIX: dot escaped -- the original r'\d+.\d+' let any character
        # stand between the digit runs.
        point_r = re.search(r'\d+\.\d+'.decode('utf8'), point_P.get_text())
        if point_r:
            point = point_r.group()
    detail = soup.find("div", attrs={"class": "result_detail"})
    if detail is not None:
        for p_tag in detail.find_all(
                "div", attrs={"class": "topic_item clearfix"}):
            a_list = []
            for a_tag in p_tag.find_all('a'):
                a_list.append(a_tag.get_text())
            a_str = ','.join(a_list)
            if re.search("导演:".decode("utf8"), p_tag.get_text()):
                director = a_str
            if re.search("看点:".decode("utf8"), p_tag.get_text()):
                ctype = a_str
            if re.search("类型:".decode("utf8"), p_tag.get_text()):
                ctype = a_str
            if re.search("语言:".decode("utf8"), p_tag.get_text()):
                programLanguage = a_str
            if re.search("地区:".decode("utf8"), p_tag.get_text()):
                area = a_str
            # BUGFIX: duplicated 主演 check removed.
            if re.search("主演:".decode("utf8"), p_tag.get_text()):
                star = a_str
            # BUGFIX: re.search returns a match object or None, never the
            # True singleton, so the original "is not True" guard never
            # filtered the "暂无" (none) placeholder; and the non-greedy
            # trailing group r"别名:(.*?)" always captured the empty
            # string.  Use "is None" and a greedy capture so a real alias
            # is stored.
            if re.search(r"别名: 暂无".decode("utf8"), p_tag.get_text()) is None \
                    and re.search(r"别名:(.*)".decode("utf8"), p_tag.get_text()):
                alias = re.search(r"别名:(.*)".decode("utf8"),
                                  p_tag.get_text()).group(1)
            if re.search(r"上映时间:[^\\]*?(\d\d\d\d)".decode("utf8"),
                         p_tag.get_text()):
                shootYear = re.search(
                    r"上映时间:[^\\]*?(\d\d\d\d)".decode("utf8"),
                    p_tag.get_text()).group(1)
            # Runtime in minutes from the "片长" row.
            if re.search(r"片长:[^\\]*?(\d+)".decode("utf8"),
                         p_tag.get_text()):
                playLength = re.search(r"片长:[^\\]*?(\d+)".decode("utf8"),
                                       p_tag.get_text()).group(1)
    content_p = soup.find('p', attrs={"data-movlbshowmore-ele": "whole"})
    if content_p is not None:
        intro = content_p.get_text().strip()
    self.program["name"] = spiderTool.changeName(name)
    self.program["alias"] = spiderTool.changeName(alias)
    self.program['pcUrl'] = pcUrl
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program["point"] = float(point)
    self.program['shootYear'] = shootYear
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson(
        'name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['programLanguage'] = programLanguage
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['intro'] = intro
    self.program['mainId'] = mainId
    # Runtime stored as "MM:00" on the sub-program template.
    self.program_sub['playLength'] = playLength + ':' + '00'
    seed_P = re.search(
        r'data-seriesdownload-aid="(?P<sourceId>\d+)"\s*data-seriesdownload-cid="(?P<cid>\d+)"',
        doc)
    if seed_P:
        sourceId = seed_P.group('sourceId')
        cid = seed_P.group('cid')
        self.program['mainId'] = sourceId
        seed_sub = 'http://cache.video.iqiyi.com/jp/sdvlst/%s/%s/' % (
            cid, sourceId)
        self.secondSpider(seed_sub)
def firstSpider(self, seed):
    """Scrape a youku channel page (i.youku.com/i/<id>/videos) for channel
    metadata, then fan out one paged ajax seed per page to secondSpider."""
    name = ""
    poster = ""
    star = ""
    ctype = ""
    shootYear = ""
    intro = ""
    mainId = ""
    area = ""
    point = 0.0
    playTimes = 0
    mainId_p = re.search(r'http://i.youku.com/i/(.+)/videos', seed)
    if mainId_p:
        mainId = mainId_p.group(1)
    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc, from_encoding="utf8")
    # Two page layouts exist: "head-avatar" (link + img) or plain
    # "avatar" (img only, title on the img).
    poster_p = soup.find("div", attrs={'class': 'head-avatar'})
    poster_p1 = soup.find("div", attrs={'class': 'avatar'})
    if poster_p is not None:
        poster_a = poster_p.find('a')
        if poster_a is not None:
            name = poster_a.get("title")
        if poster_p.find('img') is not None:
            poster = poster_p.find("img").get("src")
    elif poster_p1 is not None:
        if poster_p1.find('img') is not None:
            poster = poster_p1.find("img").get("src")
            name = poster_p1.find("img").get("title")
    content_p = soup.find('div', attrs={"class": "userintro"})
    if content_p is not None:
        content = content_p.find('div', attrs={'class': 'desc'})
        if content is not None:
            # BUGFIX: the original indexed [1] unconditionally and raised
            # IndexError whenever the "自频道介绍:" marker was missing.
            intro_parts = content.get_text().strip().split(
                '自频道介绍:'.decode('utf8'))
            if len(intro_parts) > 1:
                intro = intro_parts[1]
    self.program["name"] = spiderTool.changeName(name)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['shootYear'] = shootYear
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['intro'] = intro
    self.program['mainId'] = mainId
    self.program['point'] = point
    self.program['playTimes'] = playTimes
    # Derive the page count from the "视频 (N)" title, falling back to the
    # "共N个视频" footer span.  40 videos per ajax page.
    pages = 0
    total_num_p = soup.find_all('div', attrs={'class': 'title'})
    for item in total_num_p:
        item_text = item.get_text().replace(',', '')
        if re.search(r'视频[^\\]*?\((\d+)\)'.decode('utf8'), item_text):
            total_num = re.search(r'\((\d+)\)', item_text).group(1)
            pages = int(total_num) // 40 + 2
    if pages == 0:
        total_num_p = soup.find('span', attrs={'class': 'append'})
        if total_num_p is not None:
            append_text = total_num_p.get_text().replace(',', '')
            total_r = re.search(r'共(\d+)个视频'.decode('utf8'), append_text)
            if total_r:
                pages = int(total_r.group(1)) // 40 + 2
    # Hard cap: never crawl more than 20 pages per channel.
    if pages > 20:
        pages = 20
    if pages != 0:
        for page in range(1, pages):
            sub_seed = seed + '/fun_ajaxload/?page_num=%d&page_order=0' % (page)
            self.secondSpider(sub_seed)
def secondSpider(self, seed):
    """Parse one youku video-list page (ajax fragment) and append a
    program_sub entry per video card.

    Cards live in "v va" divs, or "yk-col4" divs on the alternate layout.
    setNumber is derived from the upload date (YYYYMMDD); entries missing a
    number/name/url or titled as trailers ("预告") are skipped.
    """
    doc = spiderTool.getHtmlBody(seed)
    sub_soup = BeautifulSoup(doc, from_encoding="utf8")
    sub_pog = sub_soup.find_all("div", attrs={'class': 'v va'})
    if len(sub_pog) == 0:
        # Alternate page layout.
        sub_pog = sub_soup.find_all("div", attrs={'class': 'yk-col4'})
    for each in sub_pog:
        self.program_sub = copy.deepcopy(PROGRAM_SUB)
        setNumber = ""
        poster = ''
        setName = ""
        webUrl = ""
        playLength = ''
        # First pass: date taken from the card's dedicated upload-date span.
        setNumber_p = each.find('span', attrs={'class': 'v-upload-date'})
        if setNumber_p is not None:
            if re.search(r'\d+-\d+-\d+'.decode('utf8'),
                         setNumber_p.get_text()):
                # Full date: strip dashes -> YYYYMMDD.
                setNumber = re.search(
                    r'\d+-\d+-\d+'.decode('utf8'),
                    setNumber_p.get_text()).group().replace(
                        '-'.decode('utf8'), '')
            elif re.search(r'\d+-\d+'.decode('utf8'),
                           setNumber_p.get_text()):
                # Month-day only: prefix the current year.
                year = time.strftime('%Y', time.localtime(time.time()))
                setNumber = year + re.search(
                    r'\d+-\d+'.decode('utf8'),
                    each.get_text()).group().replace(
                        '-'.decode('utf8'), '')
        # Second pass: same extraction over the whole card text.  NOTE: this
        # runs unconditionally and overwrites the first pass whenever the
        # card text contains a date, so it normally wins.
        if re.search(r'\d+-\d+-\d+'.decode('utf8'), each.get_text()):
            setNumber = re.search(r'\d+-\d+-\d+'.decode('utf8'),
                                  each.get_text()).group().replace(
                                      '-'.decode('utf8'), '')
        elif re.search(r'\d+-\d+'.decode('utf8'), each.get_text()):
            year = time.strftime('%Y', time.localtime(time.time()))
            setNumber = year + re.search(r'\d+-\d+'.decode('utf8'),
                                         each.get_text()).group().replace(
                                             '-'.decode('utf8'), '')
        poster_p = each.find('img')
        if poster_p is not None:
            poster = poster_p.get('src')
        setName_p = each.find('div', attrs={'class': 'v-link'})
        if setName_p is not None:
            setName = setName_p.find('a').get('title')
            webUrl = setName_p.find('a').get('href')
        playLength_p = each.find('span', attrs={'class': 'v-time'})
        if playLength_p is not None:
            playLength = playLength_p.get_text()
        # Skip trailers and incomplete cards.
        if re.search(r'预告'.decode('utf8'), setName) or setNumber == "" or setName == "" or webUrl == ""\
                or setNumber is None or setName is None or webUrl is None:
            continue
        self.program_sub['setNumber'] = setNumber
        self.program_sub['setName'] = setName
        self.program_sub['webUrl'] = webUrl
        self.program_sub['poster'] = poster
        self.program_sub['playLength'] = playLength
        self.program['programSub'].append(self.program_sub)
def firstSpider(self, seed):
    """Scrape a bilibili bangumi page (bangumi.bilibili.com/anime/<id>)
    for program metadata, then pass the published episode <li> nodes to
    secondSpider."""
    poster = ""
    star = ""
    director = ""
    ctype = ""
    shootYear = ""
    intro = ""
    mainId = ""
    area = ""
    name = ""
    point = -1.0
    playTimes = 0
    alias = ""
    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc, from_encoding="utf8")
    introPage = soup.find('div', attrs={'class': 'info-content'})
    if introPage is not None:
        # Poster: the src may be protocol-relative ("//...").
        poster_p = introPage.find('div', attrs={'class': 'bangumi-preview'})
        if poster_p is not None:
            # BUGFIX: guard the <img> lookup and its src — the original
            # called .get('src') on a possibly-missing tag and then passed
            # a possibly-None poster into re.match (TypeError).
            poster_img = poster_p.find('img')
            if poster_img is not None and poster_img.get('src'):
                poster = poster_img.get('src')
                if not re.match(r'http://', poster):
                    poster = "http:%s" % poster
        info_p = introPage.find('div', attrs={'class': 'bangumi-info-r'})
        if info_p is not None:
            head = info_p.find('div', attrs={'class': 'b-head'})
            if head is not None:
                # Name is the <h1>; the header spans are the genre tags.
                name = head.find('h1').get_text()
                ctype_list = []
                for span in head.find_all('span'):
                    ctype_list.append(span.get_text())
                ctype = ','.join(ctype_list)
            info_count = info_p.find('div', attrs={'class': 'info-count'})
            if info_count is not None:
                # playTimes = info_count.find('em').get_text()
                playTimes = 0
            info_desc = info_p.find('div', attrs={'class': 'info-desc'})
            if info_desc is not None:
                intro = info_desc.get_text().strip()
    # mainId is the trailing path segment of the anime URL.
    if re.match(r'http://bangumi\.bilibili\.com/anime/(.*)', seed):
        mainId = re.match(r'http://bangumi\.bilibili\.com/anime/(.*)',
                          seed).group(1)
    self.program['name'] = spiderTool.changeName(name)
    self.program["alias"] = spiderTool.changeName(alias)
    self.program["point"] = float(point)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson(
        'name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['shootYear'] = shootYear
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['playTimes'] = long(playTimes)
    self.program['intro'] = intro
    self.program['mainId'] = mainId
    # Only episodes whose anchor carries class "v1-complete-text" are
    # actually published; collect those for the episode spider.
    videoList = []
    subprg_list = soup.find_all(
        'li', attrs={'class': 'v1-bangumi-list-part-child'})
    for li in subprg_list:
        if li.find('a').get('class') == ['v1-complete-text']:
            videoList.append(li)
    self.secondSpider(videoList)
def firstSpider(self, seed):
    """Scrape an mgtv episode page plus its episode-list JSON API for
    program metadata, then fan out one JSON seed per page to
    secondSpider."""
    name = ""
    poster = ""
    point = 0.0
    shootYear = ""
    star = ""
    director = ""
    ctype = ""
    programLanguage = ""
    intro = ""
    mainId = ""
    area = ""
    mainId_p = re.findall(r'http://www\.mgtv\.com/\w/\d+/(.*)\.html', seed)
    if mainId_p:
        mainId = mainId_p[0]
    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc, from_encoding="utf8")
    videoinfo = soup.find('div',
                          attrs={'class': 'v-panel-info v-panel-mod'})
    if videoinfo is not None:
        # One <p> per labelled field ("导演:", "主演:", ...).
        for pinfo in videoinfo.find_all('p'):
            pinfo_str = str(pinfo)
            if re.search(r'导演:'.decode('utf-8'), pinfo_str.decode('utf8')):
                director = pinfo.find('a').get_text()
                # "暂无"/"未知" are placeholders, not real directors.
                if re.search(r'暂无'.decode('utf-8'), director) or \
                        re.search(r'未知'.decode('utf-8'), director):
                    director = ""
            elif re.search(r'主演:'.decode('utf-8'),
                           pinfo_str.decode('utf8')):
                star_list = []
                for li in pinfo.find_all('a'):
                    # BUGFIX: the original wrote `not A or B`, which kept
                    # placeholder names whenever "未知" matched; reject both
                    # placeholder strings.
                    if not (re.search(r'暂无'.decode('utf-8'), li.get_text())
                            or re.search(r'未知'.decode('utf-8'),
                                         li.get_text())):
                        star_list.append(li.get_text())
                star = ','.join(star_list)
            elif re.search(r'地区:'.decode('utf-8'),
                           pinfo_str.decode('utf8')):
                area_list = []
                for li in pinfo.find_all('a'):
                    area_list.append(li.get_text())
                area = ','.join(area_list)
            elif re.search(r'类型:'.decode('utf-8'),
                           pinfo_str.decode('utf8')):
                # BUGFIX: the original appended to the tag itself
                # (`li.append(li.get_text())`), so ctype_list stayed empty
                # and ctype was always "".
                ctype_list = []
                for li in pinfo.find_all('a'):
                    ctype_list.append(li.get_text())
                ctype = ','.join(ctype_list)
            elif re.search(r'简介:'.decode('utf-8'),
                           pinfo_str.decode('utf8')):
                intro_p = pinfo.find('span', attrs={'class': 'details'})
                if intro_p is not None:
                    intro = intro_p.get_text()
    if mainId == '':
        return
    seedJson = "http://pcweb.api.mgtv.com/episode/list?video_id=%s&page=0&size=40" % mainId
    total_pages = 1
    doc = spiderTool.getHtmlBody(seedJson)
    try:
        data = json.loads(doc)
    except (ValueError, TypeError):
        # Malformed or missing JSON body — nothing to crawl.
        return
    if data.get('data') is None:
        return
    json_data = data['data']
    if json_data.get('total_page'):
        total_pages = json_data['total_page']
    if json_data.get('info'):
        name = json_data['info']['title']
        # Prefer the page intro; fall back to the API description.
        if intro == "":
            intro = json_data['info']['desc']
    self.program["name"] = spiderTool.changeName(name)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['point'] = point
    self.program['shootYear'] = shootYear
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson('name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['programLanguage'] = programLanguage
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['intro'] = intro
    self.program['mainId'] = mainId
    for pageNo in range(1, total_pages + 1):
        subseed = "http://pcweb.api.mgtv.com/episode/list?video_id=%s&page=%s&size=40" % (mainId, pageNo)
        self.secondSpider(subseed)
def secondSpider(self, seed_sub):
    """Parse one mgtv episode-list JSON page (optionally JSONP-wrapped)
    and append a program_sub entry per episode card."""
    doc = spiderTool.getHtmlBody(seed_sub)
    # Strip the JSONP wrapper when the endpoint answers via jQuery callback.
    if re.search(r'window\.jQuery', doc):
        sub_doc_p = re.search(r'\((.*)\);', doc)
        doc = sub_doc_p.group(1)
    try:
        data = json.loads(doc)
    except (ValueError, TypeError):
        return
    if data.get('html') is None:
        return
    sub_soup = BeautifulSoup(data['html'])
    for div in sub_soup.find_all('div', attrs={'class': 'p-item'}):
        self.program_sub = copy.deepcopy(PROGRAM_SUB)
        setNumber = ""
        setName = ""
        webUrl = ""
        poster = ""
        playLength = ""
        setIntro = ""
        plot = []
        # The card's single <a> carries href + title; the original looked
        # it up three separate times.
        a_tag = div.find('a')
        if a_tag is not None:
            webUrl = "http:%s" % a_tag.get('href')
            setName = a_tag.get('title')
            setNumber = a_tag.get('title')
        setIntro_p = div.find('div', attrs={'class': 'item-intro c999'})
        if setIntro_p is not None:
            setIntro = setIntro_p.get_text()
        poster_p = div.find('img')
        if poster_p is not None:
            poster = poster_p.get('src')
        playLength_p = div.find('span', attrs={'class': 'p-time'})
        if playLength_p is not None:
            playLength = playLength_p.get_text()
        # setNumber is the trailing digit run of the title; fall back to
        # the first digit run anywhere in the name.
        # BUGFIX: guard against a missing title attribute — the original
        # passed None straight into re.search (TypeError).
        if setNumber is not None and \
                re.search(r'\d+$'.decode('utf8'), setNumber):
            setNumber = re.search(r'\d+$'.decode('utf8'),
                                  setNumber).group(0)
        else:
            setNumber = ""
        if setNumber == "" and setName != "" and setName is not None:
            if re.search(r'\d+'.decode('utf8'), setName):
                setNumber = re.search(r'\d+'.decode('utf8'),
                                      setName).group(0)
        # Skip incomplete cards and trailers ("预告").
        if setNumber == "" or setName == "" or setNumber is None or \
                setName is None or webUrl is None or webUrl == "" or \
                re.search(r'预告'.decode('utf8'), setName):
            continue
        self.program_sub['setNumber'] = setNumber
        self.program_sub['setName'] = setName
        self.program_sub['webUrl'] = webUrl
        self.program_sub['poster'] = poster
        self.program_sub['playLength'] = playLength
        self.program_sub['setIntro'] = setIntro
        self.program_sub['plot'] = plot
        self.program['programSub'].append(self.program_sub)
def firstSpider(self, seed):
    """Resolve a sohu album page to its playlist id, pull the videolist
    JSONP feed, fill self.program, and hand the video entries to
    secondSpider."""
    point = 0.0
    poster = ""
    name = ""
    shootYear = ""
    alias = ""
    area = ""
    star = ""
    director = ""
    ctype = ""
    playTimes = 0
    intro = ""
    mainId = ""
    doc = spiderTool.getHtmlBody(seed)
    # The playlist id lives in one of two page-script variables, or failing
    # that in the album URL itself.
    id_match = re.search(r'playlistId\s*=\s*"(\d+)"', doc)
    if id_match is None:
        id_match = re.search(r'PLAYLIST_ID\s*=\s*"(\d+)"', doc)
    if id_match is None:
        id_match = re.search(r'http://film\.sohu\.com/album/(\d+)\.html',
                             seed)
    if id_match is None:
        return
    mainId = id_match.group(1)
    seed = "http://pl.hd.sohu.com/videolist?playlistid=%s&callback=__get_videolist" % (mainId)
    try:
        # The feed is GBK-encoded JSONP; strip the callback wrapper before
        # decoding the JSON payload.
        doc = spiderTool.getHtmlBody(seed).decode('gbk').encode('utf8')
        doc = doc.split('__get_videolist(')[1][:-2]
        data = json.loads(doc)
    except:
        return
    if 'albumName' in data:
        name = data['albumName']
    if 'mainActors' in data:
        star = ','.join(data['mainActors'])
    if 'categories' in data:
        ctype = ','.join(data['categories'])
    if 'publishYear' in data:
        shootYear = str(data['publishYear'])
    if 'albumDesc' in data:
        intro = data['albumDesc']
    if 'largeVerPicUrl' in data:
        poster = data['largeVerPicUrl']
    if 'directors' in data:
        director = ','.join(data['directors'])
    if 'area' in data:
        area = data['area']
    self.program["name"] = spiderTool.changeName(name)
    self.program["alias"] = spiderTool.changeName(alias)
    self.program["point"] = float(point)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson('name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['shootYear'] = shootYear
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['playTimes'] = long(playTimes)
    self.program['intro'] = intro
    self.program['mainId'] = mainId
    if 'videos' in data:
        self.secondSpider(data['videos'])
def firstSpider(self, seed):
    """Resolve a le.com/letv.com movie URL to its pid, fetch the mobile
    album-detail JSON, fill self.program, then seed secondSpider with the
    episode-list API."""
    name = ""
    poster = ""
    point = 0.0
    shootYear = ""
    star = ""
    director = ""
    ctype = ""
    programLanguage = ""
    intro = ""
    mainId = ""
    area = ""
    pid = ""
    pid_r = re.search(r'http://www\.le\.com/movie/(?P<pid>\d+)\.html', seed)
    if pid_r is None:
        pid_r = re.search(r'http://www\.letv\.com/movie/(?P<pid>\d+)\.html',
                          seed)
    if pid_r:
        pid = pid_r.group('pid')
    # BUGFIX: the original tested `pid != '' or pid != '0'`, which is
    # always true, so empty/zero pids were never rejected.
    if pid != '' and pid != '0':
        seed = 'http://static.app.m.letv.com/android/mod/mob/ctl/album/act/detail/id/%s/pcode/010110014/version/5.2.3.mindex.html' % (pid)
    else:
        return
    mainId = pid
    doc = spiderTool.getHtmlBody(seed)
    try:
        json_data = json.loads(doc)
    except (ValueError, TypeError):
        json_data = {}
    # The payload nests the album record under "body".
    if json_data.has_key("body"):
        if type(json_data["body"]) is types.DictionaryType:
            json_data = json_data["body"]
        else:
            return
    if json_data.has_key("nameCn"):
        name = json_data["nameCn"]
    if json_data.has_key("picCollections"):
        poster_dict = json_data["picCollections"]
        if type(poster_dict) is types.DictionaryType:
            # Prefer the 400*300 variant; otherwise take any non-empty one.
            if poster_dict.has_key('400*300'):
                poster = poster_dict['400*300']
            if poster == "":
                for each in poster_dict:
                    if poster_dict[each] != "":
                        poster = poster_dict[each]
    # NOTE(review): a locally cached image overrides the API poster; the
    # original's (collapsed) indentation was ambiguous about whether this
    # sat inside the picCollections branch — applied unconditionally here.
    if self.images.has_key(pid):
        poster = self.images[pid]
    if json_data.has_key("score"):
        try:
            point = float(json_data["score"])
        except (ValueError, TypeError):
            point = 0.0
    if json_data.has_key("releaseDate"):
        shootYear_P = json_data["releaseDate"]
        if re.search(r'(\d{4})-\d{2}-\d{2}', shootYear_P):
            shootYear = re.search(r'(\d{4})-\d{2}-\d{2}',
                                  shootYear_P).group(1)
        elif re.search(r'^\d{4}$', shootYear_P):
            shootYear = shootYear_P
    # Director/star/area/ctype come back space-separated; normalise to
    # comma-separated lists.
    if json_data.has_key("directory"):
        director_P = json_data["directory"]
        if type(director_P) is types.UnicodeType:
            director = director_P.strip().replace(" ", ",")
    if json_data.has_key("starring"):
        star_P = json_data["starring"]
        if type(star_P) is types.UnicodeType:
            star = star_P.strip().replace(" ", ",")
    if json_data.has_key("area"):
        area_P = json_data["area"]
        if type(area_P) is types.UnicodeType:
            area = area_P.strip().replace(" ", ",")
    if json_data.has_key("subCategory"):
        ctype_P = json_data["subCategory"]
        if type(ctype_P) is types.UnicodeType:
            ctype = ctype_P.strip().replace(" ", ",")
    if json_data.has_key('description'):
        intro = json_data['description']
    if json_data.has_key('language'):
        programLanguage = json_data['language']
    self.program["name"] = spiderTool.changeName(name)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['point'] = point
    self.program['shootYear'] = shootYear
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson('name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['programLanguage'] = programLanguage
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['intro'] = intro
    self.program['mainId'] = mainId
    seed_sub = "http://static.app.m.letv.com/android/mod/mob/ctl/videolist/act/detail/id/%s/vid/25520328/b/1/s/60/o/-1/m/0/pcode/010110014/version/5.2.3.mindex.html" % (pid)
    self.secondSpider(seed_sub)
def firstSpider(self, seed):
    """Scrape a yehetang movie page (www.yehetang.com/movie/<id>.html) for
    program metadata, then hand the play-list <li> nodes to secondSpider."""
    name = ""
    poster = ""
    star = ""
    director = ""
    ctype = ""
    programLanguage = ""
    intro = ""
    mainId = ""
    area = ""
    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc, from_encoding="utf8")
    summary_div = soup.find("div", attrs={'class': 'k_jianjie'})
    if summary_div is not None:
        # Poster block: the <img> carries both the picture URL (src) and
        # the program title (alt).
        pic_div = summary_div.find("div", attrs={'id': 'k_jianjie-2b'})
        if pic_div is not None and pic_div.find('img') is not None:
            poster = pic_div.find('img').get('src')
            name = pic_div.find('img').get('alt')
        # Detail block: one <ul> per labelled field; the label sits in the
        # first <li>, the values in the remaining ones.
        info_div = summary_div.find("div", attrs={'id': 'k_jianjie-3a'})
        if info_div is not None:
            ul_tags = info_div.find_all('ul')
            if ul_tags is not None:
                for ul in ul_tags:
                    ul_text = str(ul)
                    if ul.find('li').get('class') == [
                            'k_jianjie-3a-1-name'
                    ] and name == "":
                        name = ul.find('li').get_text()
                    elif re.search(r'状态:'.decode('utf8'),
                                   ul_text.decode('utf8')):
                        # Abort entirely when the program is only a trailer.
                        if re.search(r'预告'.decode('utf8'),
                                     ul_text.decode('utf8')):
                            return
                    elif re.search(r'别名:'.decode('utf8'),
                                   ul_text.decode('utf8')):
                        alias = ""
                        for li in ul.find_all('li'):
                            if not re.search(r'别名:'.decode('utf8'),
                                             (str(li)).decode('utf8')):
                                alias = li.get_text()
                    elif re.search(r'导演:'.decode('utf8'),
                                   ul_text.decode('utf8')):
                        for li in ul.find_all('li'):
                            if not re.search(r'导演:'.decode('utf8'),
                                             (str(li)).decode('utf8')):
                                cleaned = li.get_text().strip().replace('/', '')
                                director = ','.join(cleaned.split())
                    elif re.search(r'演员:'.decode('utf8'),
                                   ul_text.decode('utf8')):
                        for li in ul.find_all('li'):
                            if not re.search(r'演员:'.decode('utf8'),
                                             (str(li)).decode('utf8')):
                                cleaned = li.get_text().strip().replace('/', '')
                                star = ','.join(cleaned.split())
                    elif re.search(r'地区:'.decode('utf8'),
                                   ul_text.decode('utf8')):
                        for li in ul.find_all('li'):
                            if not re.search(r'地区:'.decode('utf8'),
                                             (str(li)).decode('utf8')):
                                area = li.get_text().strip()
                    elif re.search(r'语言:'.decode('utf8'),
                                   ul_text.decode('utf8')):
                        for li in ul.find_all('li'):
                            if not re.search(r'语言:'.decode('utf8'),
                                             (str(li)).decode('utf8')):
                                programLanguage = li.get_text().strip()
                    elif re.search(r'剧情:'.decode('utf8'),
                                   ul_text.decode('utf8')):
                        for li in ul.find_all('li'):
                            if not re.search(r'剧情:'.decode('utf8'),
                                             (str(li)).decode('utf8')):
                                intro = li.get_text().strip()
    # mainId is the slug in the movie URL.
    slug_match = re.match(r'http://www\.yehetang\.com/movie/(.*)\.html',
                          seed)
    if slug_match:
        mainId = slug_match.group(1)
    # A pre-seeded ctype for this URL wins over anything scraped.
    if self.ctype.get(seed):
        ctype = self.ctype[seed]
    self.program["name"] = spiderTool.changeName(name)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson('name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['programLanguage'] = programLanguage
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['intro'] = intro
    self.program['mainId'] = mainId
    video_p = soup.find('div', attrs={'id': 'play_1'})
    if video_p is not None:
        self.secondSpider(video_p.find_all('li'))
def firstSpider(self, seed):
    """Scrape an mgtv variety-show page plus its showlist JSON API for
    program metadata, then seed secondSpider once per listed month of the
    current collection."""
    name = ""
    poster = ""
    point = 0.0
    shootYear = ""
    star = ""
    director = ""
    ctype = ""
    programLanguage = ""
    intro = ""
    mainId = ""
    area = ""
    videoId = ""
    mainId_p = re.findall(r'http://www\.mgtv\.com/\w/(.*)/.*\.html', seed)
    if mainId_p:
        mainId = mainId_p[0]
    videoId_p = re.findall(r'http://www\.mgtv\.com/\w/\d+/(.*)\.html', seed)
    if videoId_p:
        videoId = videoId_p[0]
    doc = spiderTool.getHtmlBody(seed)
    soup = BeautifulSoup(doc, from_encoding="utf8")
    videoinfo = soup.find('div',
                          attrs={'class': 'v-panel-info v-panel-mod'})
    if videoinfo is not None:
        # One <p> per labelled field ("导演:", "主持:", ...).
        for pinfo in videoinfo.find_all('p'):
            pinfo_str = str(pinfo)
            if re.search(r'导演:'.decode('utf-8'), pinfo_str.decode('utf8')):
                director = pinfo.find('a').get_text()
                # "暂无"/"未知" are placeholders, not real directors.
                if re.search(r'暂无'.decode('utf-8'), director) or re.search(
                        r'未知'.decode('utf-8'), director):
                    director = ""
            elif re.search(r'主持:'.decode('utf-8'),
                           pinfo_str.decode('utf8')):
                star_list = []
                for li in pinfo.find_all('a'):
                    # BUGFIX: the original wrote `not A or B`, which kept
                    # placeholder names whenever "未知" matched; reject both
                    # placeholder strings.
                    if not (re.search(r'暂无'.decode('utf-8'), li.get_text())
                            or re.search(r'未知'.decode('utf-8'),
                                         li.get_text())):
                        star_list.append(li.get_text())
                star = ','.join(star_list)
            elif re.search(r'地区:'.decode('utf-8'),
                           pinfo_str.decode('utf8')):
                area_list = []
                for li in pinfo.find_all('a'):
                    area_list.append(li.get_text())
                area = ','.join(area_list)
            elif re.search(r'类型:'.decode('utf-8'),
                           pinfo_str.decode('utf8')):
                ctype_list = []
                for li in pinfo.find_all('a'):
                    ctype_list.append(li.get_text())
                ctype = ','.join(ctype_list)
            elif re.search(r'简介:'.decode('utf-8'),
                           pinfo_str.decode('utf8')):
                intro_p = pinfo.find('span', attrs={'class': 'details'})
                if intro_p is not None:
                    intro = intro_p.get_text()
    if mainId == '':
        return
    seedJson = "http://pcweb.api.mgtv.com/variety/showlist?video_id=%s" % videoId
    doc = spiderTool.getHtmlBody(seedJson)
    try:
        # OrderedDict keeps the month tabs in server order.
        data = json.loads(doc, object_pairs_hook=OrderedDict)
    except (ValueError, TypeError):
        return
    if data.get('data') is None:
        return
    json_data = data['data']
    if json_data.get('info'):
        name = json_data['info']['title']
    self.program["name"] = spiderTool.changeName(name)
    self.program['poster'] = spiderTool.listStringToJson('url', poster)
    self.program['point'] = point
    self.program['shootYear'] = shootYear
    self.program['star'] = spiderTool.listStringToJson('name', star)
    self.program['director'] = spiderTool.listStringToJson(
        'name', director)
    self.program['ctype'] = spiderTool.listStringToJson('name', ctype)
    self.program['programLanguage'] = programLanguage
    self.program['area'] = spiderTool.listStringToJson('name', area)
    self.program['intro'] = intro
    self.program['mainId'] = mainId
    # Only walk the month tabs of the currently-selected collection.
    curretId = ""
    if json_data.get('cur'):
        if json_data['cur'].get('cid'):
            curretId = json_data['cur']['cid']
    if json_data.get('tab_m') and curretId != "":
        for meach in json_data['tab_m']:
            subprgseed = "http://pcweb.api.mgtv.com/variety/showlist?collection_id=%s&month=%s" % (
                curretId, meach['m'])
            self.secondSpider(subprgseed)