def parse_imdb(self, response):
    """Collect the page URL and the header title into an ImdbItem.

    NOTE(review): the item is populated but never yielded or returned, so
    it is discarded — confirm whether a ``yield item`` is missing.
    """
    item = ImdbItem()
    item['url'] = response.url
    title_parts = response.xpath(
        '//*[@class="fk-3"]/div[@class="hdd"]/h3/text()').extract()
    item['title'] = "".join(title_parts)
def parse(self, response):
    """Parse an IMDb chart page with BeautifulSoup.

    Yields one follow-up request per table row, carrying a partially
    filled ImdbItem (score, year, movie_id, movie_name) in ``meta`` for
    ``parse_2`` to complete.

    Fixes over the original: an explicit ``"html.parser"`` argument (the
    bare constructor emits a warning and picks whatever parser is
    installed), raw-string regexes, and patterns compiled once outside
    the row loop instead of on every iteration.
    """
    # Debug helper, kept for reference:
    # from scrapy.shell import inspect_response
    # inspect_response(response, self)
    soup = BeautifulSoup(response.text, "html.parser")
    movies = soup.find("tbody", {"class": "lister-list"})
    year_pattern = re.compile(r'\d{4}')
    id_pattern = re.compile(r'(?<=tt)\d+(?=/?)')
    for movie in movies.findAll("tr"):
        item = ImdbItem()
        poster = movie.find("td", {"class": "posterColumn"})
        item["score"] = poster.find("span", {"name": "ir"})["data-value"]
        movie_link = movie.find("td", {
            "class": "titleColumn"
        }).find('a')["href"]
        url = "http://www.imdb.com" + movie_link
        # The title cell text contains the 4-digit release year.
        year_str = movie.find("td", {"class": "titleColumn"}).text
        item["year"] = int(year_pattern.search(year_str).group())
        # Numeric part of the "ttNNNNNNN" id embedded in the href.
        item["movie_id"] = int(id_pattern.search(movie_link).group())
        item["movie_name"] = movie.select_one('.titleColumn').select_one(
            'a').string
        yield scrapy.Request(url, meta={'item': item}, callback=self.parse_2)
def parse(self, response):
    """Parse one page of an IMDb list and paginate up to 500 items.

    Loads position/name/genre/rating per ``.lister-item`` through an
    ItemLoader and yields the loaded items, then queues the next page.

    Fix: on the final page ``extract_first()`` can return ``None`` for
    both the item counter and the next link, which made ``int(...)`` /
    ``urljoin(None)`` raise; both are now guarded.
    """
    itens_number = response.css(
        '.lister-current-last-item::text').extract_first()
    next_page = response.css(
        '.lister-page-next::attr(href)').extract_first()
    for selector in response.css('.lister-item'):
        item = ItemLoader(item=ImdbItem(), selector=selector)
        item.add_value(
            'position',
            selector.css('.lister-item-index::text').extract_first(
                default=''))
        item.add_value(
            'name',
            selector.css('.lister-item-header > a::text').extract_first(
                default=''))
        item.add_value(
            'genre',
            selector.css('.genre::text').extract_first(default=''))
        item.add_value(
            'rating',
            selector.css('.ratings-imdb-rating > strong::text').
            extract_first(default=''))
        yield item.load_item()
    # Stop after 500 items or when there is no further page to follow.
    if itens_number is not None and next_page is not None \
            and int(itens_number) < 500:
        next_url = response.urljoin(next_page)
        yield Request(url=next_url, callback=self.parse)
def parse(self, response):
    """Yield one ImdbItem per review paragraph on a movie's review page."""
    # Second-to-last URL path segment identifies the movie.
    movie_id = response.url.split("/")[-2]
    paragraphs = response.xpath('//div[@id="tn15content"]//p')
    for paragraph in paragraphs:
        review_item = ImdbItem()
        review_item['movie'] = movie_id
        review_item['review'] = paragraph.xpath('text()').extract()
        yield review_item
def parse_item(self, response):
    """Scrape title/release/rating/genre/duration from a title page.

    Fix: the original ended each assignment with a trailing comma, which
    silently wrapped every field value in a 1-tuple (e.g. ``('8.8',)``)
    before it was stored on the item; the commas are removed so plain
    strings (or ``None``) are stored instead.
    """
    title = response.xpath(
        "normalize-space(//div[@class='title_wrapper']/h1/text())").get()
    realease = response.xpath(
        "//div[@class='title_wrapper']/h1/span/a/text()").get()
    rating = response.xpath(
        "//span[@itemprop='ratingValue']/text()").get()
    genre = response.xpath("//div[@class='subtext']/a[1]/text()").get()
    duration = response.xpath(
        "normalize-space(//div[@class='subtext']/time/text())").get()
    url = response.url
    imdb_item = ImdbItem(title=title,
                         realease=realease,
                         rating=rating,
                         genre=genre,
                         duration=duration,
                         url=url)
    yield imdb_item
def parse_movie_page(self, response):
    """Fill an ImdbItem from the Parse_Imdb helper, then follow the
    company-credits page with the item attached in ``meta``."""
    page = Parse_Imdb(response)
    item = ImdbItem()
    item['title'] = page.get_title()
    item['imdb_score'] = page.get_imdb_score()
    item['metascore'] = page.get_metascore()
    item['genres'] = page.get_genres()
    item['country'] = page.get_country()
    item['release_date'] = page.get_release_date()
    item['budget'] = page.get_budget()
    item['opening_usa'] = page.get_opening_usa()
    item['usa_gross'] = page.get_usa_gross()
    item['worldwide_gross'] = page.get_worldwide_gross()
    # Swap the last URL segment for the company-credits endpoint.
    segments = response.request.url.split('/')[:-1]
    segments.append('companycredits?ref_=tt_dt_co')
    url_companies = '/'.join(segments)
    yield Request(url_companies,
                  callback=self.parse_companies,
                  meta={'imdb_item': item})
def parse(self, response):
    """Yield one ImdbItem per ``td.title`` row of a search listing.

    Optional fields are normalised to the string "NA" via
    ``self.filterNA`` (helper defined elsewhere in the spider).
    """
    #pass
    bad_chars = "()"  # NOTE(review): unused — the strip below uses a literal "()"
    for sel in response.xpath('//td[@class="title"]'):
        #print len(title)
        item = ImdbItem()
        title = sel.xpath('a/text()').extract()
        title = ''.join(title)
        title_href = sel.xpath('a/@href').extract()
        title_href = ''.join(title_href)
        year_type = sel.xpath('span[@class="year_type"]/text()').extract()
        year_type = ''.join(year_type)
        year_type = year_type.strip("()")  # drop surrounding parentheses
        #year_type=self.filterNA(year_type)
        # The div's @title reads like "Users rated this ... (12,345 votes)";
        # slice out the vote count between '(' and ' votes'.
        user_rating = sel.xpath(
            'div[@class="user_rating"]/div/@title').extract()
        user_rating = ''.join(user_rating)
        user_rating = user_rating.replace(',', '')
        user_rating = user_rating[user_rating.find('(') +
                                  1:user_rating.find(' votes')]
        user_rating = self.filterNA(user_rating)
        rating_rating = sel.xpath(
            'div[@class="user_rating"]/div/span[@class="rating-rating"]/span/text()'
        ).extract()
        # Only a fully-rendered rating widget yields exactly 3 text nodes;
        # the first is the numeric rating.
        if len(rating_rating) == 3:
            rating_rating = rating_rating[0]
        else:
            rating_rating = "NA"
        outline = sel.xpath('span[@class="outline"]/text()').extract()
        outline = ''.join(outline)
        outline = self.filterNA(outline)
        # NOTE(review): credit_dir and credit_with use the identical xpath,
        # so both hold the same list — confirm whether one of them should
        # select a different credit span.
        credit_dir = sel.xpath('span[@class="credit"]/a/text()').extract()
        credit_dir = self.filterNA(credit_dir)
        credit_with = sel.xpath('span[@class="credit"]/a/text()').extract()
        credit_with = self.filterNA(credit_with)
        genre = sel.xpath('span[@class="genre"]/a/text()').extract()
        genre = self.filterNA(genre)
        mins = sel.xpath('span[@class="runtime"]/text()').extract()
        mins = ''.join(mins)
        if not mins:
            mins = "NA"
        else:
            # "142 mins." -> "142"
            mins = mins.split(' ')[0]
        item['title'] = title
        item['title_href'] = title_href
        item['year_type'] = year_type
        item['user_rating'] = user_rating
        item['rating_rating'] = rating_rating
        item['outline'] = outline
        item['credit_dir'] = credit_dir
        item['credit_with'] = credit_with
        item['genre'] = genre
        item['mins'] = mins
        yield item
def parse_detail(self, response):
    """Map page fields onto an ImdbItem via a field -> CSS-selector table."""
    css_by_field = {
        'name': '#prometer_container+.header .itemprop::text',
        'publish_date': '#prometer_container+.header .nobr a::text',
        'last_time': '.infobar time::text',
        'classifications': '.infobar>a>span::text',
        'score': '.star-box .titlePageSprite::text',
    }
    items = ImdbItem()
    for field, css in css_by_field.items():
        items[field] = response.css(css).extract()
    items['link'] = response.url
    yield items
def parse(self, response):
    """Scrape title, cast, synopsis, broadcast date and production
    company from a title page into a single ImdbItem.

    Fix: dropped the unused locals ``sites`` (a directory-listing xpath
    never read) and ``items`` (an empty list never appended to).
    """
    sel = Selector(response)
    item = ImdbItem()
    item['title'] = sel.xpath('//h1[@class="header"]/span[@class="itemprop"]/text()').extract()
    item['cast_information'] = sel.xpath('//div[@itemprop="actors"]/a/span/text()').extract()
    item['sypnosis'] = sel.xpath('//p[@itemprop="description"]/text()').extract()
    item['broadcast_date'] = sel.xpath('//div[@id="titleDetails"]/div/h4[text()="Release Date:"]/../text()').extract()
    item['production_company'] = sel.xpath('//div[@id="titleDetails"]/div/h4[text()="Production Co:"]/../span/a/span/text()').extract()
    return item
def parse(self, response):
    """For each cast-list row, request the actor's own page with a
    partially-filled ImdbItem (actor, character) carried in ``meta``."""
    rows = response.xpath("//tr[@class='odd'] | //tr[@class='even']")
    for row in rows:
        cast_item = ImdbItem()
        cast_item['actor'] = row.xpath(
            "./td[@itemprop='actor']//span/text()").extract()
        cast_item['character'] = row.xpath(
            "./td[@class='character']/div/a/text()").extract()
        actor_url = row.xpath(
            "./td[@itemprop='actor']/a/@href").extract_first()
        follow_up = scrapy.Request(response.urljoin(actor_url),
                                   callback=self.parse_actor_page)
        follow_up.meta['imdb_item'] = cast_item
        yield follow_up
def parse_imdb(self, response):
    """Extract title/year/level/genres from a title page; any scraping
    failure is logged instead of propagating."""
    item = ImdbItem()
    try:
        item['video_url'] = response.url
        xpath_by_field = {
            'video_title': '//*[@class="title_wrapper"]/h1/text()',
            'video_year': '//*[@id="titleYear"]/a/text()',
            'video_level': '//*[@class="subtext"]/text()',
            'video_genres': '//*[@class="subtext"]/a/text()',
        }
        for field, xp in xpath_by_field.items():
            item[field] = "".join(response.xpath(xp).extract())
        yield item
    except Exception as error:
        log(error)
def parse_result_page(self, response):
    """Harvest title ids from one search-results page, then either queue
    the next page or terminate the process once pagination catches up."""
    self.start += 50
    hrefs = response.xpath(
        '//div[@class="lister-list"]/div/div[2]/a/@href').extract()
    item = ImdbItem()
    # Keep only the "ttNNNN..." id embedded in each href.
    item['title'] = [re.findall(r'(tt\d+)', href)[0] for href in hrefs]
    yield item
    start, end, total = self.check_current_page(response)
    print(self.start, start)
    if self.start > start + 50:
        # Pagination overran the listing — hard-stop the crawl.
        exit(1)
    url = 'https://www.imdb.com/search/title/?title_type=feature,tv_movie,tv_series,tv_episode,tv_special,tv_miniseries,documentary,video_game,short,video,tv_short&release_date=1900-01-01,2019-12-31&start={}&ref_=adv_nxt'.format(
        self.start)
    yield Request(url=url, callback=self.parse_result_page)
def parse(self, response):
    """Walk the chart rows and request each movie's page, carrying the
    partially-filled item in ``meta`` for ``parsearInfoPelicula``."""
    rows = response.xpath(
        "//*[contains(@class,'chart full-width')]/tbody/tr")
    for row in rows:
        item = ImdbItem()
        item['titulo'] = row.xpath('td[2]/a/text()').extract()[0].strip()
        item['puntuacionIMDB'] = row.xpath(
            'td[3]/strong/text()').extract()[0].strip()
        # Leading digits of the cell text are the chart ranking.
        rank_text = str(row.xpath('td[2]/text()').extract()[0]).strip()
        item['ranking'] = re.match(r'(^[0-9]+)', rank_text).group(1)
        item['anyoEstreno'] = row.xpath(
            'normalize-space(td[2]/span/text())').extract()[0].strip()
        item['enlace'] = "http://imdb.com" + row.xpath(
            'td[2]/a/@href').extract()[0]
        request = scrapy.Request(item['enlace'],
                                 callback=self.parsearInfoPelicula)
        request.meta['item'] = item
        yield request
def parse_movie_page(self, response):
    """Build a complete ImdbItem (id, url, title, year, description,
    rating, votes, cover) from a legacy-layout movie page.

    Uses the old ``HtmlXPathSelector`` API; ``.pop(0)`` calls raise
    IndexError when a field is missing from the page.
    """
    hxs = HtmlXPathSelector(response)
    title_h1 = hxs.select('//h1[@class="header"]')
    year = title_h1.select('span/a/text()').extract().pop(0)
    ratings = hxs.select('//div[@class="star-box-details"]')
    item = ImdbItem()
    url = response.url
    # Make relative URLs absolute against imdb.com.
    if url[0:4] != 'http':
        url = 'http://www.imdb.com' + url
    # First run of digits in the URL is taken as the numeric title id.
    id = re.search('(\d+)', url).group()
    item['id'] = int(id) if id else 0
    item['url'] = url
    item['title'] = title_h1.select('text()').re('.*[^<]').pop(1)
    item['year'] = int(year)
    description = hxs.select(
        '//p[@itemprop="description"]/text()').extract()
    item['description'] = description.pop(0).strip() if description else ''
    cover = hxs.select('//td[@id="img_primary"]/a/img/@src').extract()
    # NOTE(review): if the page has no rating, .pop(0) raises here before
    # the "if ratings" fallback below can apply — the 0.00 branch looks
    # unreachable; confirm intended behavior.
    rating = ratings.select('.//span[@itemprop="ratingValue"]/text()'
                            ).extract().pop(0).strip()
    item['rating'] = float(rating) if ratings else 0.00
    votes = ratings.select('.//span[@itemprop="ratingCount"]/text()'
                           ).extract().pop(0).replace(',', '').strip()
    item['votes'] = int(votes) if votes else 0
    # for ImagePipeline
    item['image_urls'] = cover if cover else []
    return item
def parse_movies(self, response):
    """Return a list of ImdbItems, one per "detailed" result row."""
    xpath_by_field = {
        'url': "td[@class='title']/a/@href",
        'title': "td[@class='title']/a/text()",
        'year': "td[@class='title']/span[@class='year_type']/text()",
        'rating': "td[@class='title']/div[@class='user_rating']/div/span[@class='rating-rating']/span[@class='value']/text()",
        'votes': "td[@class='title']/div[@class='user_rating']/div/@title",
        'genre': "td[@class='title']/span[@class='genre']/a/text()",
        'length': "td[@class='title']/span[@class='runtime']/text()",
    }
    movies_list = []
    rows = response.xpath(
        "//tr[@class='odd detailed' or @class='even detailed']")
    for sel in rows:
        movie = ImdbItem()
        for field, xp in xpath_by_field.items():
            movie[field] = sel.xpath(xp).extract()
        movies_list.append(movie)
    return movies_list
def parse(self, response):
    """Scrape box-office rows, then follow the "Next" pagination link."""
    for movie in response.css('tr.detailed'):
        item = ImdbItem()
        image_link = movie.xpath('td[@class="image"]').xpath('a')
        item['uri'] = image_link.xpath('@href').extract_first()
        item['name'] = image_link.xpath('@title').extract_first()
        item['gross'] = movie.xpath(
            'td[@class="sort_col"]/text()').extract_first()
        title_cell = movie.xpath('td[@class="title"]')
        item['rating'] = title_cell.xpath('div/div/@title').extract_first()
        item['desc'] = title_cell.xpath('span[3]/text()').extract_first()
        item['duration'] = title_cell.xpath(
            'span[7]/text()').extract_first()
        # Credit and genre cells contain markup; strip it down to text.
        item['credit'] = remove_html(
            title_cell.xpath('span[4]').extract_first())
        item['genre'] = remove_html(
            title_cell.xpath('span[5]').extract_first())
        yield item
    next_path = '//div[@class="leftright"]/div[2]/span/a[contains(text(), "Next")]/@href'
    next_page = response.xpath(next_path)
    if next_page:
        url = response.urljoin(next_page.extract()[0])
        logging.info("Going for %s" % url)
        yield scrapy.Request(url, self.parse, dont_filter=True)
def parse2(self, response):
    """Scrape a full movie detail page into an ImdbItem, append the same
    record to the CSV file ``fileout`` (module-level name), and yield the
    item.

    Expects ``idx``, ``Id`` and ``Link`` in ``response.meta``. Every field
    falls back to '' when its selector matches nothing.
    """
    movie = []  # NOTE(review): never used
    main = response.css('div#content-2-wide')
    maintop = main.css(
        'div#main_top div.title-overview div#title-overview-widget')
    mainbottom = main.css('div#main_bottom')
    titlecast = main.css('div#titleCast')
    storyline = main.css('div#titleStoryLine')
    detail = main.css('div#titleDetails')
    titlebarwrapper = maintop.css(
        'div.vital div.title_block div.title_bar_wrapper')
    slatewrapper = maintop.css('div.vital div.slate_wrapper')
    # Poster: keep everything up to the '_V1_' size marker to get the
    # full-resolution image URL.
    poster = slatewrapper.css('div.poster a img::attr(src)').extract()
    poster = (poster[0].split('_V1_')[0] +
              '_V1_.jpg') if len(poster) else ''
    if poster == '':
        # Fallback layout used by pages without a slate wrapper.
        poster = maintop.css(
            'div.minPosterWithPlotSummaryHeight div.poster a img[itemprop="image"]::attr(src)'
        ).extract()
        poster = (poster[0].split('_V1_')[0] +
                  '_V1_.jpg') if len(poster) else ''
    slate = slatewrapper.css(
        'div.slate a.slate_button.prevent-ad-overlay.video-modal::attr(href)'
    ).extract()
    slate = 'http://www.imdb.com' + slate[0].strip() if len(slate) else ''
    title = titlebarwrapper.css(
        'div.titleBar div.title_wrapper h1[itemprop="name"]::text'
    ).extract()
    title = title[0].strip() if len(title) else ''
    year = titlebarwrapper.css(
        'div.titleBar div.title_wrapper h1[itemprop="name"] span#titleYear a::text'
    ).extract()
    year = year[0].strip() if len(year) else ''
    releasedate = titlebarwrapper.css(
        'div.titleBar div.title_wrapper div.subtext a[title="See more release dates"]::text'
    ).extract()
    releasedate = releasedate[0].strip() if len(releasedate) else ''
    releasedate1 = titlebarwrapper.css(
        'div.titleBar div.title_wrapper div.subtext a[title="See more release dates"] meta[itemprop="datePublished"]::attr(content)'
    ).extract()
    releasedate1 = releasedate1[0].strip() if len(releasedate1) else ''
    ratingbar = titlebarwrapper.css('div.ratings_wrapper div.imdbRating')
    rating = ratingbar.css(
        'div.ratingValue strong span[itemprop="ratingValue"]::text'
    ).extract()
    rating = rating[0].strip() if len(rating) else ''
    ratingcount = ratingbar.css(
        'a span[itemprop="ratingCount"]::text').extract()
    ratingcount = ratingcount[0].strip() if len(ratingcount) else ''
    plotsummary = maintop.css('div.plot_summary_wrapper div.plot_summary')
    description = plotsummary.css('div.summary_text::text').extract()
    description = description[0].strip() if len(description) else ''
    directorslist = plotsummary.css(
        'div.credit_summary_item span[itemprop="director"] a span[itemprop="name"]::text'
    ).extract()
    directors = ','.join(directorslist)
    writerslist = plotsummary.css(
        'div.credit_summary_item span[itemprop="creator"] a span[itemprop="name"]::text'
    ).extract()
    writers = ','.join(writerslist)
    titlereviewbar = maintop.css(
        'div.plot_summary_wrapper div.titleReviewBar ')
    metascore = titlereviewbar.css(
        'div.titleReviewBarItem a div.titleReviewBarSubItem span::text'
    ).extract()
    metascore = metascore[0].strip() if len(metascore) else ''
    # Popularity: first numeric run found in the subtext nodes; the list
    # variable is rebound to that string inside the loop.
    popularity = titlereviewbar.css(
        'div.titleReviewBarItem div.titleReviewBarSubItem div span.subText::text'
    ).extract()
    for pop in popularity:
        number = re.findall(r'\d+', pop)
        if len(number) > 0:
            popularity = number[0]
            break
    # If no numeric popularity was found, int() fails and '' is kept.
    try:
        test = int(popularity)
        pass
    except Exception as e:
        popularity = ''
    peoplealsolikelist = mainbottom.css(
        'div#titleRecs div#title_recs div.rec_const_picker div.rec_view div.rec_slide div.rec_page'
    )
    peoplemaylike = peoplealsolikelist[0].css(
        'div.rec_item::attr(data-tconst)').extract() if len(
            peoplealsolikelist) else ''
    peoplemaylike = ','.join(peoplemaylike)
    castslist = titlecast.css(
        'table.cast_list tr td.itemprop[itemprop="actor"] a span.itemprop::text'
    ).extract()
    actors = ','.join(castslist)
    keywordslist = storyline.css(
        'div.see-more.inline.canwrap[itemprop="keywords"] a span[itemprop="keywords"]::text'
    ).extract()
    keywords = ','.join(keywordslist)
    genreslist = storyline.css(
        'div.see-more.inline.canwrap[itemprop="genre"] a::text').extract()
    genreslist = ','.join(genreslist)
    # Walk the details blocks by their h4 label.
    itemslist = detail.css('div.txt-block')
    countrieslist = []
    countries = ''
    releasedate2 = ''
    runtime = ''
    for item in itemslist:
        itemname = item.css('h4.inline::text').extract()
        itemname = itemname[0].strip() if len(itemname) else ''
        if itemname == 'Country:':
            countrieslist = item.css('a[itemprop="url"]::text').extract()
            countries = ','.join(countrieslist)
        if itemname == 'Release Date:':
            texts = item.css('::text').extract()
            for text in texts:
                # First text node containing a digit is the date;
                # drop the trailing "(Country)" qualifier.
                if len(re.findall(r'\d+', text)) > 0:
                    releasedate2 = text.split('(')[0].strip()
                    break
        if itemname == 'Runtime:':
            runtime = item.css('time[itemprop="duration"]::text').extract()
            runtime = runtime[0].strip() if len(runtime) else ''
    if runtime == '':
        # Fallback: runtime shown in the subtext bar.
        runtime = titlebarwrapper.css(
            'div.titleBar div.title_wrapper div.subtext time[itemprop="duration"]::text'
        ).extract()
        runtime = runtime[0].strip() if len(runtime) else ''
    if year == '':
        # Fallback: last whitespace-separated token of the release date.
        texts = releasedate2.split(' ')
        year = texts[len(texts) - 1]
    imdbItem = ImdbItem()
    imdbItem['Idx'] = response.meta['idx']
    imdbItem['Id'] = response.meta['Id']
    imdbItem['Title'] = title
    imdbItem['Year'] = year
    imdbItem['Genres'] = genreslist
    imdbItem['Directors'] = directors
    imdbItem['Writers'] = writers
    imdbItem['Actors'] = actors
    imdbItem['Countries'] = countries
    imdbItem['ReleaseDate'] = releasedate
    imdbItem['ReleaseDate1'] = releasedate1
    imdbItem['ReleaseDate2'] = releasedate2
    imdbItem['Runtime'] = runtime
    imdbItem['Rating'] = rating
    imdbItem['RatingCount'] = ratingcount
    imdbItem['Popularity'] = popularity
    imdbItem['MetaScore'] = metascore
    imdbItem['PeopleMayLike'] = peoplemaylike
    imdbItem['Keywords'] = keywords
    imdbItem['Link'] = response.meta['Link']
    imdbItem['Description'] = description.replace('\"', '')
    # Media pipelines: poster via images, slate video via files.
    # ``crawl_image`` is a module-level flag — assumed bool; confirm.
    imdbItem['image_urls'] = [poster] if (len(poster.strip()) > 0
                                          and crawl_image) else []
    imdbItem['file_urls'] = [slate] if len(slate.strip()) > 0 else []
    # Duplicate the record into the CSV output file.
    with open(fileout, 'a', newline='') as csvfile:
        spamwriter = csv.writer(csvfile,
                                delimiter=',',
                                quotechar='"',
                                quoting=csv.QUOTE_ALL)
        spamwriter.writerow([
            response.meta['idx'], response.meta['Id'], title, year,
            genreslist, directors, writers, actors, countries, releasedate,
            releasedate1, releasedate2, runtime, rating, ratingcount,
            popularity, metascore, peoplemaylike, keywords,
            imdbItem['Link'], imdbItem['Description'], poster, slate
        ])
    yield imdbItem
def parse(self, response):
    """Scrape the main title page into an ImdbItem, then chain to the
    technical-specification page with the item and all sub-page URLs in
    ``meta``.

    The Language/Country/Genres lookups locate the h4 label's position in
    the extracted label list and use that position as an XPath index into
    the sibling div blocks — this assumes labels and blocks appear in the
    same order (TODO confirm against the live page layout).
    """
    fullcast_url = get_fullcast_url(response.url)
    company_url = get_company_url(response.url)
    location_url = get_location_url(response.url)
    technical_url = get_technical_url(response.url)
    parentguide_url = get_parent_guide_url(response.url)
    keyword_url = get_keyword_url(response.url)
    urls = {
        "fullcast": fullcast_url,
        "company": company_url,
        "location": location_url,
        "technical": technical_url,
        "parentguide": parentguide_url,
        "keyword": keyword_url
    }
    film = ImdbItem()
    # main
    # 5th path segment of the URL is the "ttNNNN" title id.
    film["ttid"] = response.url.split('/')[4]
    film["name"] = response.xpath(
        "//div[@class='title_block']/div[@class='title_bar_wrapper']/div[@class='titleBar']/div[@class='title_wrapper']/h1/text()"
    ).extract()[0].strip()
    release_year = response.xpath(
        "//div[@class='title_bar_wrapper']/div[@class='titleBar']/div[@class='title_wrapper']/h1/span[@id='titleYear']/a/text()"
    )
    if release_year and len(release_year.extract()) > 0:
        film["release_year"] = release_year.extract()[0].strip()
    rating = response.xpath(
        "//div[@class='imdbRating']/div[@class='ratingValue']/strong/span/text()"
    )
    if rating and len(rating.extract()) > 0:
        film["rating"] = response.xpath(
            "//div[@class='imdbRating']/div[@class='ratingValue']/strong/span/text()"
        ).extract()[0].strip()
    # NOTE(review): unguarded [0] — raises IndexError when the page has
    # no vote count; confirm whether that is acceptable.
    film["vote"] = response.xpath(
        "//div[@class='ratings_wrapper']/div[@class='imdbRating']/a/span[@itemprop='ratingCount']/text()"
    ).extract()[0].strip()
    detail_items = response.xpath(
        "//div[@id='main_bottom']/div[@id='titleDetails']/div[@class='txt-block']/h4/text()"
    ).extract()
    if "Language:" in detail_items:
        lang_index = detail_items.index("Language:") + 1
        languages = response.xpath(
            "//div[@id='main_bottom']/div[@id='titleDetails']/div[@class='txt-block']["
            + str(lang_index) + "]/a/text()").extract()
        if len(languages) > 0:
            film["primary_language"] = languages[0].strip()
    if "Country:" in detail_items:
        country_index = detail_items.index("Country:") + 1
        countries = response.xpath(
            "//div[@id='main_bottom']/div[@id='titleDetails']/div[@class='txt-block']["
            + str(country_index) + "]/a/text()").extract()
        film["country"] = [country.strip() for country in countries]
    storyline_items = response.xpath(
        "//div[@id='main_bottom']/div[@id='titleStoryLine']/div/h4/text()"
    ).extract()
    if "Genres:" in storyline_items:
        # +2 rather than +1: presumably skips an extra div in the
        # storyline section — TODO confirm.
        genre_index = storyline_items.index("Genres:") + 2
        genres = response.xpath(
            "//div[@id='main_bottom']/div[@id='titleStoryLine']/div[" +
            str(genre_index) + "]/a/text()").extract()
        film["genre"] = [genre.strip() for genre in genres]
    yield scrapy.Request(urls["technical"],
                         callback=self.tech_specification,
                         meta={
                             'item': film,
                             "urls": urls
                         })
def parse_imdb(self, response):
    """Scrape a movie detail page of a Chinese video site into an
    ImdbItem; any failure is logged rather than raised.

    The detail list (``ul/li``) has no stable order, so each field is
    located by checking which label (e.g. "片名" = title, "导演" =
    director, "主演" = cast, "上映时间" = release date, "国家" = country)
    appears at which list index, and the matching ``li[n]`` is then
    re-queried by 1-based position.
    """
    item = ImdbItem()
    try:
        item['video_title'] = "".join(
            response.xpath('//*[@class="fk-3"]/div[@class="hdd"]/h3/text()'
                           ).extract())
        item['video_rating'] = "".join(
            response.xpath(
                '//*[@class="fk-3"]/div[@class="hdd"]/span/i/text()').
            extract())
        content = response.xpath(
            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li').extract()
        for i in range(0, len(content)):
            # "片名" = film title
            if "片名" in content[i]:
                if i == 0:
                    item['video_name'] = "".join(
                        response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[1]/a/text()'
                        ).extract())
            # "别名" = alias / alternative titles
            if "别名" in content[i]:
                if i == 1:
                    item['video_alias'] = "|".join(
                        response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[2]/a/text()'
                        ).extract())
            # "导演" = director
            if "导演" in content[i]:
                if i == 1:
                    item['video_director'] = "|".join(
                        response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[2]/a/text()'
                        ).extract())
                elif i == 2:
                    item['video_director'] = "|".join(
                        response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[3]/a/text()'
                        ).extract())
            # "主演" = leading actors
            if "主演" in content[i]:
                if i == 2:
                    item['video_actor'] = "|".join(
                        response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[3]/a/text()'
                        ).extract())
                if i == 3:
                    item['video_actor'] = "|".join(
                        response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[4]/a/text()'
                        ).extract())
            # "上映时间" = release date; the same li also carries colour
            # and genre links: first anchor is the year, last is the
            # colour, the ones in between form the "type" list.
            if "上映时间" in content[i]:
                if i == 4:
                    item['video_year'] = "|".join(
                        response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[5]/a[1]/text()'
                        ).extract())
                    a = response.xpath(
                        '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[5]/a'
                    ).extract()
                    length = len(a) - 1
                    try:
                        item['video_color'] = "".join(
                            response.xpath(
                                '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[5]/a/text()'
                            ).extract()[length])
                    except Exception as e:
                        item['video_color'] = ""
                    try:
                        type = "|".join(
                            response.xpath(
                                '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[5]/a/text()'
                            ).extract()[1:length])
                        # Drop anything after a colon in the joined text.
                        maohao = type.split(":")
                        if len(maohao) > 0:
                            item['video_type'] = maohao[0]
                        else:
                            item['video_type'] = ""
                    except Exception as e:
                        item['video_type'] = ""
                if i == 5:
                    # Same extraction shifted one li down (li[6]).
                    item['video_year'] = "".join(
                        response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[6]/a[1]/text()'
                        ).extract())
                    a = response.xpath(
                        '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[6]/a'
                    ).extract()
                    length = len(a) - 1
                    try:
                        item['video_color'] = "".join(
                            response.xpath(
                                '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[6]/a/text()'
                            ).extract()[length])
                    except Exception as e:
                        item['video_color'] = ""
                    try:
                        type = "|".join(
                            response.xpath(
                                '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[6]/a/text()'
                            ).extract()[1:length])
                        maohao = type.split(":")
                        if len(maohao) > 0:
                            item['video_type'] = maohao[0]
                        else:
                            item['video_type'] = ""
                    except Exception as e:
                        item['video_type'] = ""
            # "国家" = country; the same li's second anchor is the voice
            # (dubbing) language.
            if "国家" in content[i]:
                if i == 5:
                    item['video_area'] = "|".join(
                        response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[6]/a[1]/text()'
                        ).extract())
                    item['video_voice'] = "|".join(
                        response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[6]/a[2]/text()'
                        ).extract())
                if i == 6:
                    item['video_area'] = "|".join(
                        response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[7]/a[1]/text()'
                        ).extract())
                    item['video_voice'] = "|".join(
                        response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[7]/a[2]/text()'
                        ).extract())
        # Fields below use class-based selectors and are independent of
        # the label positions scanned above.
        item['video_length'] = "".join(
            response.xpath(
                '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[@class="nolink"]/text()'
            ).extract()).replace(" ", "")
        item['video_language'] = "".join(
            response.xpath(
                '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[@class="nolink"]/a/text()'
            ).extract())
        item['video_summary'] = "".join(
            response.xpath(
                '//*[@class="fk-4 clear"]/div[@class="bdd clear"]/i/text()'
            ).extract()).lstrip().rstrip().replace("<br>", "")
        item['video_url'] = response.url
        yield item
    except Exception as error:
        log(error)
def parse_review_page(self, response):
    """Scrape the demographic ratings table of an IMDb title into an
    ImdbItem, merging in title metadata carried via ``response.meta``.

    The ``bigcell`` divs are read by fixed position: indices 6-9 are the
    male age buckets, 11-14 the female buckets, 17/18 the US / non-US
    averages — presumably matching the table's fixed row/column layout;
    TODO confirm. Unparseable cells become the empty string.
    """
    # run_time = response.meta['run_time']
    # genre = response.meta['genre']
    # imdb_rating = response.meta['imdb_rating']
    # meta_rating = response.meta['meta_rating']
    title = response.meta['title']
    MPAA_rating = response.meta['MPAA_rating']
    release_date = response.meta['release_date']
    director = response.meta['director']
    actors = response.meta['actors']
    #extracting review info
    try:
        male_teen_rating = float(
            response.xpath('//div[@class="bigcell"]/text()').extract()[6])
    except ValueError:
        male_teen_rating = ""
    try:
        male_youngAdult_rating = float(
            response.xpath('//div[@class="bigcell"]/text()').extract()[7])
    except ValueError:
        male_youngAdult_rating = ""
    try:
        male_adult_rating = float(
            response.xpath('//div[@class="bigcell"]/text()').extract()[8])
    except ValueError:
        male_adult_rating = ""
    try:
        male_elder_rating = float(
            response.xpath('//div[@class="bigcell"]/text()').extract()[9])
    except ValueError:
        male_elder_rating = ""
    try:
        male_ratingCount = int(
            response.xpath('//div[@class="smallcell"]/a/text()').extract()
            [5].strip().replace(',', ""))
    except IndexError:
        male_ratingCount = ""
    # (Per-age-bucket male vote counts — smallcell indices 6-9 — were
    # disabled; re-enable alongside the matching item fields if needed.)
    try:
        female_teen_rating = float(
            response.xpath('//div[@class="bigcell"]/text()').extract()[11])
    except ValueError:
        female_teen_rating = ""
    try:
        female_youngAdult_rating = float(
            response.xpath('//div[@class="bigcell"]/text()').extract()[12])
    except ValueError:
        female_youngAdult_rating = ""
    try:
        female_adult_rating = float(
            response.xpath('//div[@class="bigcell"]/text()').extract()[13])
    except ValueError:
        female_adult_rating = ""
    try:
        female_elder_rating = float(
            response.xpath('//div[@class="bigcell"]/text()').extract()[14])
    except ValueError:
        female_elder_rating = ""
    try:
        female_ratingCount = int(
            response.xpath('//div[@class="smallcell"]/a/text()').extract()
            [10].strip().replace(',', ""))
    except IndexError:
        female_ratingCount = ""
    # (Per-age-bucket female vote counts — smallcell indices 11-14 —
    # were disabled, as above.)
    try:
        us_users = float(
            response.xpath('//div[@class="bigcell"]/text()').extract()[17])
    except ValueError:
        us_users = ""
    # (us_count — smallcell index 17 — was disabled.)
    try:
        non_USusers = float(
            response.xpath('//div[@class="bigcell"]/text()').extract()[18])
    except ValueError:
        non_USusers = ""
    # (non_UScount — smallcell index 18 — was disabled.)
    item = ImdbItem()
    # item['run_time'] = run_time
    # item['genre'] = genre
    item['title'] = title
    # item['imdb_rating'] = imdb_rating
    # item['meta_rating'] = meta_rating
    item['MPAA_rating'] = MPAA_rating
    item['release_date'] = release_date
    item['director'] = director
    item['actors'] = actors
    item['male_teen_rating'] = male_teen_rating
    item['male_youngAdult_rating'] = male_youngAdult_rating
    item['male_adult_rating'] = male_adult_rating
    item['male_elder_rating'] = male_elder_rating
    item['male_ratingCount'] = male_ratingCount
    item['female_teen_rating'] = female_teen_rating
    item['female_youngAdult_rating'] = female_youngAdult_rating
    item['female_adult_rating'] = female_adult_rating
    item['female_elder_rating'] = female_elder_rating
    item['female_ratingCount'] = female_ratingCount
    item['non_USusers'] = non_USusers
    item['us_users'] = us_users
    yield item