def parse2(self, response):
    item = response.meta['item']
    movie = Selector(response).xpath('//*[@class="indent"]/span[1]/text()')
    # Fall back to the second span when the first one is empty
    if movie.extract_first().strip() == '':
        movie = Selector(response).xpath('//*[@class="indent"]/span[2]')
    item['content'] = movie.extract_first().strip()
    yield item
def parse2(self, response):
    title = Selector(response=response).xpath('//div[@class="movie-brief-container"]/h1/text()')
    movie_type = Selector(response=response).xpath('//li[@class="ellipsis"]/a/text()')
    movie_date = Selector(response=response).xpath('//li[@class="ellipsis"][last()]/text()')
    item = Spider2Item()
    item['title'] = title.extract_first().strip()
    item['type'] = movie_type.extract_first().strip()
    item['movie_date'] = movie_date.extract_first().strip()
    # print('------------')
    yield item
def parse_single_movie(self, response):
    item = response.meta['item']
    movie_title = Selector(response=response).xpath(
        '/html/body/div[3]/div/div[2]/div[1]/h1/text()')
    movie_type = Selector(response=response).xpath(
        '/html/body/div[3]/div/div[2]/div[1]/ul/li[1]/a[1]/text()')
    movie_release_date = Selector(response=response).xpath(
        '/html/body/div[3]/div/div[2]/div[1]/ul/li[3]/text()')
    item['title'] = movie_title.extract_first().strip()
    item['release_date'] = movie_release_date.extract_first().strip()
    item['type'] = movie_type.extract_first().strip()
    yield item
def parse2(self, response):
    item = response.meta['item']
    count = Selector(response=response).xpath(
        '//*[@id="score_list"]/div[1]/div[2]/text()')
    score = Selector(response=response).xpath(
        '//div[@class="box score-box"]/ul/li[1]/p/text()')
    # debug
    # print(count.extract_first().strip())
    # print(score.extract_first().strip())
    item['counts'] = count.extract_first().strip()
    item['scores'] = score.extract_first().strip()
    print(item)
    yield item
def parse2(self, response):
    item = response.meta['item']
    info = Selector(response=response).xpath('//*[@id="link-report"]/span/text()')
    content = info.extract_first().strip()
    item['content'] = content
    yield item
def parse2(self, response):
    item = response.meta['item']
    try:
        movies_name = Selector(response=response).xpath('//h1[@class="name"]/text()')
        # movies_tag = Selector(response=response).xpath('//h1[@class="name"]/ul/li/a[@class="text-link"]').getall()
        movies_tag1 = Selector(response=response).xpath(
            '//ul/li[1]/a[1][@class="text-link"]/text()').get()
        movies_tag2 = Selector(response=response).xpath(
            '//ul/li[1]/a[2][@class="text-link"]/text()').get()
        movies_tag3 = Selector(response=response).xpath(
            '//ul/li[1]/a[3][@class="text-link"]/text()').get()
        movies_time = Selector(response=response).xpath(
            '//ul/li[3][@class="ellipsis"]/text()').extract_first()
        movies_name = movies_name.extract_first()
        item['name'] = movies_name
        # item['tag'] = movies_tag
        item['tag1'] = movies_tag1
        item['tag2'] = movies_tag2
        item['tag3'] = movies_tag3
        item['time'] = movies_time
        # print(movies_tag1, movies_tag2, movies_tag3)
    except Exception as e:
        print(e)
    finally:
        yield item
def parseMovieDetail(self, response):
    movie_info = response.meta['movie_info']
    level_sel = Selector(response=response).xpath('//div[@class="level-item"]//img/@src')
    level = self.levelImgUrl2levelNum(level_sel.extract_first().strip())
    movie_info['movie_level'] = level
    brief_sel = Selector(response=response).xpath(
        '//div[@class="resource-desc"]/div[@class="con"]/span[1]//text()')
    brief = brief_sel.extract_first().strip()
    movie_info['brief_desc'] = brief
    browse_times_sel = Selector(response=response).xpath(
        '//li[@class="score-star"]//label/text()')
    browse_times = int(browse_times_sel.extract_first().strip())
    movie_info['browse_times'] = browse_times
    yield movie_info
def parse2(self, response):
    item = response.meta['item']
    # movie ranking
    seniority = Selector(response=response).xpath('//p[@class="f4"]/text()')
    # movie rating level
    mvrank = Selector(response=response).xpath('//div[@class="level-item"]/img/@src')
    # cover image
    cover = Selector(response=response).xpath('//div[@class="imglink"]/a/img/@src')
    item['seniority'] = seniority.extract_first().strip()
    item['rank'] = mvrank.extract_first().strip()
    item['cover'] = cover.extract_first().strip()
    # print(item['seniority'])
    # print(item['rank'])
    # print(item['cover'])
    bt_url = item['link'].replace('/resource', '/resource/index_json/rid') + '/channel/movie'
    yield scrapy.Request(url=bt_url, meta={'item': item}, callback=self.parse_views)
def parse2(self, response):
    item = response.meta['item']
    # print(item)
    # soup = BeautifulSoup(response.text, 'html.parser')
    filmtop = Selector(response=response).xpath(
        "//div[@id='score_star']/../p/text()").re(r'\d+')
    print(filmtop)
    film_top = filmtop[0]
    # print(film_top)
    item['film_top'] = film_top
    filmlevel = Selector(response=response).xpath("//div[@class='level-item']/img/@src")
    film_level = filmlevel.extract_first()
    # print(film_level)
    if film_level == 'http://js.jstucdn.com/images/level-icon/a-big-1.png':
        item['film_level'] = 'A级'
    elif film_level == 'http://js.jstucdn.com/images/level-icon/b-big-1.png':
        item['film_level'] = 'B级'
    elif film_level == 'http://js.jstucdn.com/images/level-icon/c-big-1.png':
        item['film_level'] = 'C级'
    elif film_level == 'http://js.jstucdn.com/images/level-icon/d-big-1.png':
        item['film_level'] = 'D级'
    elif film_level == 'http://js.jstucdn.com/images/level-icon/e-big-1.png':
        item['film_level'] = 'E级'
    film_views = Selector(response=response).xpath("//li[@id='score_list']/div[1]").re(r'\d+')
    item['film_views'] = film_views[1]
    film_covertinfo = Selector(response=response).xpath('//div[@class="imglink"]/a/img/@src')
    film_covertinfo = film_covertinfo.extract_first()
    print(film_covertinfo)
    item['film_covertinfo'] = film_covertinfo
    print(item)
    yield item
def movie_detail_parse(self, response):
    item = response.meta['item']
    rank_elmt = Selector(response=response).xpath(
        './/div[@class="box score-box"]//p[@class="f4"]/text()')
    rank_p = re.compile("[0-9]+")
    rank = int(rank_p.findall(rank_elmt.extract_first())[0])
    image_elmt = Selector(response=response).xpath('.//div[@class="imglink"]//a/@href')
    image_link = image_elmt.extract_first()
    grade_image_elmt = Selector(response=response).xpath(
        './/div[@class="level-item"]//img/@src')
    grade_image_link = grade_image_elmt.extract_first()
    grade = grade_image_link.split('/')[-1][0]
    view_selector = Selector(response=response).xpath(
        '//script[@type="text/javascript" and contains(@src,"rid")]/@src')
    view_uri = view_selector.extract_first()
    parsed_uri = urllib.parse.urlsplit(response.url)
    view_link = f'{parsed_uri.scheme}://{parsed_uri.netloc}{view_uri}'
    view_response = requests.get(view_link)
    view = None
    if view_response.status_code == 200:
        view = int(json.loads(view_response.text.split("index_info=")[1])['views'])
    item['rank'] = rank
    item['image'] = image_link
    item['grade'] = grade
    item['view'] = view
    print(item)
    yield item
def parse2(self, response): """ 解析函数2:获取单个电影的详细信息 """ item = response.meta['item'] Movie_class = Selector(response=response).xpath( '//div[@class="fl view-left"]//div[@class="level-item"]//img/@src') Cover_info = Selector(response=response).xpath( '//div[@class="fl view-left"]//div[@class="imglink"]//img/@src') browse_times = Selector( response=response).xpath('//li[@class="score-star"]//label/text()') Browse_times = int(browse_times.extract_first().strip()) item['Movie_class'] = Movie_class item['Cover_info'] = Cover_info item['Browse_times'] = Browse_times yield item
def parse2(self, response):
    print(response)
    item = response.meta['item']
    rank_selector = Selector(response=response).xpath('//p[@class="f4"]/text()')
    level_selector = Selector(response=response).xpath('//div[@class="level-item"]')
    cover_selector = Selector(response=response).xpath('//div[@class="imglink"]')
    rank = rank_selector.extract_first().strip()
    item['rank'] = rank
    level = level_selector.xpath('./img/@src').extract_first().strip()
    item['level'] = level
    cover = cover_selector.xpath('.//img/@src').extract_first().strip()
    item['cover'] = cover
    yield item
def parse2(self, response):
    item = response.meta['item']
    # movie = Selector(response=response).xpath('//div[@class="fl box top24"]//li')
    # //div[@class="fl view-left"]//div[@class="level-item"]//img/@src
    # //div[@class="fl view-left"]//div[@class="imglink"]//img/@src
    # content = movie.xpath('./a/@href').get_text().strip()
    # item['content'] = content
    classification = Selector(response=response).xpath(
        '//div[@class="fl view-left"]//div[@class="level-item"]//img/@src')
    cover_info = Selector(response=response).xpath(
        '//div[@class="fl view-left"]//div[@class="imglink"]//img/@src')
    browse_times = Selector(response=response).xpath(
        '//li[@class="score-star"]//label/text()')
    item['classification'] = classification.extract_first()
    item['coverInfo'] = cover_info.extract_first()
    item['browseTimes'] = int(browse_times.extract_first().strip())
    yield item
def parse(self, response):
    items = [dict(film_name='电影名称', film_type='电影类型', plan_date='上映日期')]
    print(response.encoding)
    response = response.text.replace("<dd>", "</dd><dd>")
    for i in range(1, 11):
        item = SpidersItem()
        film_name = Selector(text=response).xpath(
            f'//*[@id="app"]/div/div[2]/div[2]/dl/dd[{i}]/div[1]/div[2]/a/div/div[1]/span[1]/text()')
        film_type = Selector(text=response).xpath(
            f'//*[@id="app"]/div/div[2]/div[2]/dl/dd[{i}]/div[1]/div[2]/a/div/div[2]/text()')
        plan_date = Selector(text=response).xpath(
            f'//*[@id="app"]/div/div[2]/div[2]/dl/dd[{i}]/div[1]/div[2]/a/div/div[4]/text()')
        # populate the item fields
        item['film_name'] = film_name.extract_first().strip()
        item['film_type'] = film_type.extract()[1].strip()
        # print(film_type.extract()[1].strip())
        item['plan_date'] = plan_date.extract()[1].strip()
        # print(plan_date.extract()[1].strip())
        items.append(item)
    return items
def parse2(self, response):
    print('response.url: ', response.url)
    commonitems = Selector(response=response).xpath('//li[@class="comment-item"]')
    for ci in commonitems:
        short = ci.xpath(
            './div[@class="comment"]/p[@class="comment-content"]/span[@class="short"]/text()'
        ).extract_first().strip()
        shorttime = ci.xpath(
            './div[@class="comment"]//span[@class="comment-info"]/span[2]/text()'
        ).extract_first().strip()
        # Check whether this record has already been stored; if so, stop here
        sql = 'select count(*) from hlmshorts_new t where t.S_SHORTSTIME = "%s" and t.S_SHORTS = "%s"' % (
            shorttime, short)
        df = db.readtable(sql)
        cnt = df.iat[0, 0]
        if cnt > 0:
            return
        star = ci.xpath(
            './div[@class="comment"]/h3/span[@class="comment-info"]/span[1]/@title'
        ).extract_first().strip()
        vote = ci.xpath(
            './div[@class="comment"]/h3/span[@class="comment-vote"]/span[@class="vote-count"]/text()'
        ).extract_first().strip()
        # DoubanbookItem is defined in items.py
        item = DoubanbookItem()
        item['star'] = star
        item['vote'] = vote
        item['short'] = short
        item['shorttime'] = shorttime
        yield item
    # Fetch the next page of comments
    nextpage1 = Selector(response=response).xpath(
        '//div[@class="paginator-wrapper"]/ul[@class="comment-paginator"]/li[last()]/a/@href')
    if nextpage1:
        nextpage = nextpage1.extract_first().strip()
        print('nextpage: ', nextpage)
        url = f'{HongloumengSpider.start_urls[0]}{nextpage}'
        yield scrapy.Request(url=url, callback=self.parse2)
        time.sleep(5)
def parse_link(self, response):
    """Fetch the synopsis of the movie matching the title."""
    # anti-scraping check
    # print('-'*100)
    # print(response.text)
    # print('-'*100)
    item = response.meta['item']
    content = Selector(response=response).xpath('//div[@id="link-report"]/span/text()')
    m_id = Selector(response=response).xpath(
        '//span[@class="top250-no"]/text()').extract_first().strip().split('.')[-1]
    item['content'] = content.extract_first().strip()
    item['m_id'] = m_id
    # print('-'*100)
    # print(item)
    # print('-'*100)
    yield item
class ParseNombre:
    nombre_de_pila = str
    apellidos = str
    nombre_completo = str
    html_data = None

    def __init__(self, decoded_html=None):
        # first decide which extractor to use
        if decoded_html is not None:
            self.html_data = Selector(text=decoded_html, type="html").xpath("//b")
            if decoded_html.find("REGISTRO ELECTORAL - CONSULTA DE DATOS") > 0:
                self._extraer_nombre_html_cne()
            else:
                self._calc_nombre(self.html_data.extract_first())
            self.nombre_de_pila = self.nombre_de_pila.title()
            self.apellidos = self.apellidos.title()
            self.nombre_completo = self.nombre_completo.title()

    def _extraer_nombre_html_cne(self):
        nombre_html = self.html_data[3].extract()
        self._calc_nombre(nombre_html)

    def _extraer_nombre_html_registro_civil(self):
        nombre_html = self.html_data[3].extract()
        self._calc_nombre(nombre_html)

    def _calc_nombre(self, nombre_html_de_scrapy):
        self.nombre_completo = nombre_html_de_scrapy.replace('</b>', '').replace('<b>', '')
        nombre = self.nombre_completo.split()
        # True if the citizen has only one surname
        un_apellido_test = nombre_html_de_scrapy.find(' </b>') > 0
        # True if the citizen has only one given name
        un_nombre_test = nombre_html_de_scrapy.find(' ') > 0
        if len(nombre) == 4:
            self.nombre_de_pila = f"{nombre[0]} {nombre[1]}"
            self.apellidos = f"{nombre[-2]} {nombre[-1]}"
        elif len(nombre) == 3:
            if un_apellido_test:
                self.nombre_de_pila = f"{nombre[0]} {nombre[1]}"
                # take the surname from the last item (-1)
                self.apellidos = nombre[-1]
            if un_nombre_test:
                # place the given names and surnames in their slots
                self.nombre_de_pila = f"{nombre[0]} {nombre[1]}"
                self.apellidos = f"{nombre[1]} {nombre[2]}"
        elif len(nombre) == 2:
            self.nombre_de_pila = f"{nombre[0]}"
            self.apellidos = nombre[-1]
        elif len(nombre) == 5:
            self.nombre_de_pila = f"{nombre[0]} {nombre[1]} {nombre[2]}"
            self.apellidos = f"{nombre[-2]} {nombre[-1]}"
        elif "DE" in nombre or "DEL" in nombre:
            conectivos = []
            for k, v in enumerate(nombre):
                if v == "DE" or v == "DEL":
                    decision = len(nombre) / 2
                    if k <= decision:
                        print("LEADING CONNECTIVE FOUND >" + v)
                        conectivos.append([k, v])
                        offset = k + 1
                        self.nombre_de_pila = f"{nombre[0]} {nombre[offset - 1]} {nombre[offset]}"
                        self.apellidos = nombre[-1]
                    else:
                        print("TRAILING CONNECTIVE FOUND >" + v)
                        conectivos.append([k, v])
                        self.nombre_de_pila = f"{nombre[0]} {nombre[1]}"
                        self.apellidos = f"{nombre[-3]} {conectivos[0][1]} {nombre[conectivos[0][0] + 1]}"
            if len(conectivos) > 1:
                print("MORE THAN 1 CONNECTIVE> ")
                print(conectivos)
                self.nombre_de_pila = f"{nombre[0]} {nombre[1]} {nombre[2]}"
                self.apellidos = f"{nombre[-3]} {conectivos[-1][1]} {nombre[-1]}"
def parse_novel(self, response):
    soup = BeautifulSoup(response.body, "lxml")
    novel_id = response.meta.get('novel_id')
    novel = Novel()
    novel['id'] = novel_id
    novel['title'] = response.xpath('//span[@class="bigtext"]//span/text()').extract_first()
    novel['author'] = response.xpath('//span[@itemprop="author"]/text()').extract_first()
    novel['intro'] = soup.find('div', {"id": "novelintro"}).text
    novel_metadata = response.xpath('//ul[@name="printright"]/li').extract()
    # Row 1: genre
    novel['genre'] = Selector(text=novel_metadata[0]).xpath(
        '//span[@itemprop="genre"]/text()').extract_first().strip()
    # Row 2: narrative point of view
    novel['view'] = Selector(text=novel_metadata[1]).xpath('//li/text()').extract_first().strip()
    # Row 3: style
    novel['style'] = Selector(text=novel_metadata[2]).xpath('//li/text()').extract_first().strip()
    # Row 4: series
    novel['series'] = Selector(text=novel_metadata[3]).xpath(
        '//span[@itemprop="series"]/text()').extract_first().strip()
    # Row 5: update status
    if Selector(text=novel_metadata[4]).xpath('//span[@itemprop="updataStatus"]/font'):
        novel['updateStatus'] = Selector(text=novel_metadata[4]).xpath(
            '//span[@itemprop="updataStatus"]/font/text()').extract_first().strip()
    else:
        novel['updateStatus'] = Selector(text=novel_metadata[4]).xpath(
            '//span[@itemprop="updataStatus"]/text()').extract_first().strip()
    # Row 6: total word count
    novel['wordCount'] = Selector(text=novel_metadata[5]).xpath(
        '//span[@itemprop="wordCount"]/text()').extract_first().strip()
    # Row 7: whether the novel has been published
    published = BeautifulSoup(novel_metadata[6], "lxml").text
    novel['published'] = published.replace("是否出版:", "").replace("(联系出版)", "").strip()
    # Row 8: signing status
    novel['signed'] = Selector(text=novel_metadata[7]).xpath('//font/text()').extract_first().strip()
    comment = Selector(text=novel_metadata[8]).xpath('//div[@id="marknovel_message"]/text()')
    if comment:
        novel['comment'] = comment.extract_first()
    else:
        novel['comment'] = ''
    poster = parse.urlsplit(
        soup.find('img', {"itemprop": "image"})["src"]).path.split('/')[-1]
    poster_url = soup.find('img', {"itemprop": "image"})["src"]
    novel["poster"] = poster
    novel["images"] = [poster]
    novel["image_urls"] = [poster_url]
    tags = []
    html_tags = soup.findAll("div", {"class": "smallreadbody"})[-1].findAll("font")
    for html_tag in html_tags:
        tags.append(html_tag.text.strip())
    novel["tags"] = '|'.join(tags)
    html_keys = soup.findAll("div", {"class": "smallreadbody"})[-1].find(
        "span", {"class": "bluetext"}).text
    key_array = html_keys.replace("搜索关键字:", "").split("┃")
    key_leadings = key_array[0].replace("主角:", "").strip().split(",")
    key_supportings = key_array[1].replace("配角:", "").strip().split(",")
    key_other = key_array[2].replace("其它:", "").strip().split(",")
    novel["key_leadings"] = key_leadings
    novel["key_supportings"] = key_supportings
    novel["key_other"] = key_other
    soup_table = soup.find("table", {"id": "oneboolt"})
    soup_lines = soup_table.findAll('tr')[3:-1]
    current_group = ""
    for soup_line in soup_lines:
        soup_tds = soup_line.findAll("td")
        if len(soup_tds) == 1:
            current_group = soup_tds[0].text
        else:
            chapter = Chapter()
            if soup_tds[1].find('a').has_attr("href"):
                chapter['vip'] = "No"
                chapter['url'] = soup_tds[1].find('a')['href']
            else:
                chapter['vip'] = "Yes"
                chapter['url'] = soup_tds[1].find('a')['rel']
            chapter['novel_id'] = novel_id
            chapter['group'] = current_group
            chapter['novel_id'] = soup_tds[0].text.strip()
            chapter['title'] = soup_tds[1].text.strip()
            chapter['summary'] = soup_tds[2].text.strip()
            chapter['word_count'] = soup_tds[3].text.strip()
            if len(soup_tds) == 5:
                chapter['updated'] = soup_tds[4].text.strip()
            else:
                chapter['updated'] = soup_tds[5].text.strip()
            if chapter['vip'] == "No":
                yield scrapy.Request(url=chapter['url'],
                                     callback=self.parse_chapter,
                                     meta={'chapter': chapter})
    yield novel
def parse_detail(self, response):
    item = response.meta["item"]
    content = Selector(response=response).xpath('//div[@class="mod-content"]/span/text()')
    item["content"] = content.extract_first().strip()
    yield item
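# The detail callbacks above all expect response.meta['item'] to be filled in by a
# first-stage list parser, as snippet 8 does when it yields a Request with
# meta={'item': item}. Below is a minimal sketch of that caller side; the spider name,
# start URL, list-page XPath and the parse_detail callback name are hypothetical
# placeholders, not taken from any of the snippets above.
import scrapy
from scrapy.selector import Selector


class MovieListSpiderSketch(scrapy.Spider):
    name = 'movie_list_sketch'
    start_urls = ['http://example.com/movies']  # placeholder list-page URL

    def parse(self, response):
        # Build one item per entry on the list page, then hand it to the detail
        # callback through meta, the same way the snippets above consume it.
        for link in Selector(response=response).xpath(
                '//div[@class="movie-item"]/a/@href').extract():
            item = {'link': response.urljoin(link)}  # a dict stands in for a scrapy.Item subclass
            yield scrapy.Request(url=item['link'],
                                 meta={'item': item},
                                 callback=self.parse_detail)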