def _parse_details(self, url):
    """Scrape extra film details (poster, trailer, CSFD/IMDb links)
    from the film's detail page.

    Returns a dict with any of the keys 'url_posters', 'url_trailer',
    'url_csfd', 'url_imdb' that could be found.
    """
    data = {}
    resp = self.session.get(url)
    html = parsers.html(resp.content, base_url=url)
    html.make_links_absolute()

    content = html.cssselect_first('.content_main')

    image = content.cssselect_first('.movie_image img')
    if image is not None:
        data['url_posters'] = [image.get('src')]

    for a in content.cssselect('a'):
        href = a.get('href')
        if not href:
            # anchors without href would crash the substring checks below
            continue
        if 'url_trailer' not in data:
            try:
                data['url_trailer'] = parsers.youtube_url(href)
                # this anchor is the trailer; keep scanning the remaining
                # anchors for CSFD/IMDb links instead of breaking out early
                continue
            except ValueError:
                pass
        if 'csfd.cz' in href:
            data['url_csfd'] = href
        if 'imdb.com' in href:
            data['url_imdb'] = href
    return data
def search(self, titles, year=None):
    """Search CSFD for the given candidate titles and return the first
    matching film (via :meth:`lookup`), or ``None`` when nothing matches.
    """
    year = int(year) if year else None
    for title in titles:
        query = urllib.quote_plus(unicode(title).encode('utf-8'))
        resp = self.session.get(
            'http://www.csfd.cz/hledat/complete-films/?q=' + query
        )

        # direct redirect to the film page
        try:
            CsfdFilmID.from_url(resp.url)
        except ValueError:
            pass
        else:
            return self.lookup(resp.url)

        # results page
        html = parsers.html(resp.content, base_url=resp.url)
        for result in self._iterparse_search_results(html, year):
            matched_title = self._parse_matched_title(result)
            if self._match_names(title, matched_title):
                return self.lookup(self._parse_film_url(result))
    return None  # there is no match
def _parse_details(self, url):
    """Scrape extra film details (poster, CSFD/IMDb links, trailer)
    from the film's detail page and return them as a dict.
    """
    resp = self.session.get(url)
    html = parsers.html(resp.content, base_url=resp.url)
    content = html.cssselect_first('#content .leftcol')

    data = {}

    image = content.cssselect_first('img.wp-post-image')
    if image is not None:
        data['url_posters'] = [self._parse_image_link(image)]

    # the site renders empty link buttons as href="http://" — skip those
    for key, selector in (('url_csfd', 'a.csfd'), ('url_imdb', 'a.imdb')):
        link = content.cssselect_first(selector)
        if link is not None and link.get('href') != 'http://':
            data[key] = link.get('href')

    if 'trailer' in content.text_content().lower():
        for iframe in content.cssselect('iframe'):
            try:
                data['url_trailer'] = parsers.youtube_url(iframe.get('src'))
            except ValueError:
                pass
    return data
def _parse_tags(self, row, tags=None):
    """Collect tags from the row's ``.movie a.tag`` links, resolving each
    short tag text to its full name by scraping the linked tag page.

    :param tags: optional mapping to update in place
    :returns: mapping of short tag text -> full tag name
    """
    # 'tags = tags or {}' would silently replace a caller-provided empty
    # dict, hiding subsequent mutations from the caller; test identity
    # against None instead
    tags = {} if tags is None else tags
    for a in row.cssselect('.movie a.tag'):
        resp = self.session.get(a.link())
        html = parsers.html(resp.content, base_url=resp.url)
        tags[a.text_content()] = html.cssselect_first('h1').text_content()
    return tags
def lookup(self, url):
    """Fetch a CSFD film page and build a ``Film`` from it.

    Returns ``None`` when the page does not exist (HTTP 404); any other
    HTTP error is re-raised.
    """
    try:
        resp = self.session.get(url)
    except http.HTTPError as e:
        if e.response.status_code != 404:
            raise
        return None  # there is no match

    html = parsers.html(resp.content, base_url=resp.url)
    titles = self._parse_titles(html)
    origin = self._parse_origin(html)

    film_kwargs = {
        'url_csfd': resp.url,
        'url_imdb': self._parse_imdb_url(html),
        'title_main': titles.main,
        'title_orig': titles.orig,
        'titles_search': titles.others,
        'year': origin.year,
        'directors': list(self._iterparse_directors(html)),
        'length': origin.length,
        'rating_csfd': self._parse_rating(html),
        'url_posters': [self._parse_poster_url(html)],
    }
    return Film(**film_kwargs)
def _parse_tag(self, el):
    """Resolve a tag element into ``(short_name, full_name)``.

    Full names are scraped from the linked tag page and cached in
    ``self.tags`` so each tag page is downloaded at most once.
    """
    name = el.text_content().strip(':')
    try:
        full_name = self.tags[name]
    except KeyError:
        resp = self.session.get(el.link())
        html = parsers.html(resp.content, base_url=resp.url)
        full_name = html.cssselect_first('#main h1').text_content()
        self.tags[name] = full_name
    return name, full_name
def __call__(self):
    """Yield a parsed showtime for every row of every program table."""
    resp = self.session.get(self.url)
    html = parsers.html(resp.content, base_url=resp.url)
    for table in html.cssselect('.program'):
        # each table is labelled by its title, used as a tag for all rows
        tag = table.cssselect_first('.title').text_content().lower()
        rows = table.cssselect('tr')
        for row in rows:
            yield self._parse_row(row, tags={tag: None})
def __call__(self):
    """Yield showtimes parsed from the news entries on the page."""
    resp = self.session.get(self.url)
    html = parsers.html(resp.content, base_url=resp.url)
    entries = html.cssselect('#content-in .aktuality')
    for entry in entries:
        showtime = self._parse_entry(entry)
        # entries that do not describe a screening parse to a falsy value
        if showtime:
            yield showtime
def _scrape_entries(self):
    """Downloads and scrapes text of HTML elements, each with film
    header line.
    """
    resp = self.session.get(self.url)
    html = parsers.html(resp.content, base_url=resp.url)
    entry_elements = (
        el for el in html.cssselect('.contentpaneopen strong')
        if self._is_entry(el)
    )
    for el in entry_elements:
        yield self._extract_entry_text(el)
def __call__(self):
    # Scrape the cinema's event listing and yield a Showtime per event.
    resp = self.session.get(self.url)
    html = parsers.html(resp.content, base_url=resp.url)
    for event in html.cssselect('.event'):
        header = event.cssselect_first('h2')
        url = header.link()
        title = header.text_content()

        # Titles appear as 'main title / original title'.
        title_parts = title.split('/')
        if len(title_parts) == 2:
            # naive, but for now good enough
            title_main, title_orig = title_parts
        else:
            title_main = title
            title_orig = None

        details = event.cssselect_first('.descshort').text_content()
        cat = event.cssselect_first('.title-cat').text_content().lower()

        # Detect tags via regexps; markers matched inside the titles are
        # stripped from the title text itself (details are left intact).
        tags = []
        for regexp, tag in self.tag_re:
            if regexp.search(title_main):
                tags.append(tag)
                title_main = regexp.sub('', title_main).strip()
            if title_orig and regexp.search(title_orig):
                tags.append(tag)
                title_orig = regexp.sub('', title_orig).strip()
            if regexp.search(details):
                tags.append(tag)
        if cat != 'filmy':
            # non-film categories become tags of their own
            tags.append(cat)

        # Combine the Czech-formatted date with the HH:MM time and
        # convert from the cinema's timezone to UTC.
        d = parsers.date_cs(
            event.cssselect_first('.nextdate strong').text
        )
        t = event.cssselect_first('.nextdate .evttime').text_content()
        t = time(*map(int, t.split(':')))
        starts_at = times.to_universal(datetime.combine(d, t), self.tz)

        yield Showtime(
            cinema=cinema,
            film_scraped=ScrapedFilm(
                title_main_scraped=title_main,
                title_orig=title_orig or None,
            ),
            starts_at=starts_at,
            url=url,
            url_booking=self.url_booking,
            tags={tag: None for tag in tags},
        )
def __call__(self):
    # Scrape the cinema's event listing and yield a Showtime per event.
    resp = self.session.get(self.url)
    html = parsers.html(resp.content, base_url=resp.url)
    for event in html.cssselect('.event'):
        header = event.cssselect_first('h2')
        url = header.link()
        title = header.text_content()

        # Titles appear as 'main title / original title'.
        title_parts = title.split('/')
        if len(title_parts) == 2:
            # naive, but for now good enough
            title_main, title_orig = title_parts
        else:
            title_main = title
            title_orig = None

        details = event.cssselect_first('.descshort').text_content()
        cat = event.cssselect_first('.title-cat').text_content().lower()

        # Detect tags via regexps; markers matched inside the titles are
        # stripped from the title text itself (details are left intact).
        tags = []
        for regexp, tag in self.tag_re:
            if regexp.search(title_main):
                tags.append(tag)
                title_main = regexp.sub('', title_main).strip()
            if title_orig and regexp.search(title_orig):
                tags.append(tag)
                title_orig = regexp.sub('', title_orig).strip()
            if regexp.search(details):
                tags.append(tag)
        if cat != 'filmy':
            # non-film categories become tags of their own
            tags.append(cat)

        # Combine the Czech-formatted date with the HH:MM time and
        # convert from the cinema's timezone to UTC.
        d = parsers.date_cs(event.cssselect_first('.nextdate strong').text)
        t = event.cssselect_first('.nextdate .evttime').text_content()
        t = time(*map(int, t.split(':')))
        starts_at = times.to_universal(datetime.combine(d, t), self.tz)

        yield Showtime(
            cinema=cinema,
            film_scraped=ScrapedFilm(
                title_main_scraped=title_main,
                title_orig=title_orig or None,
            ),
            starts_at=starts_at,
            url=url,
            url_booking=self.url_booking,
            tags={tag: None for tag in tags},
        )
def _parse_details(self, url):
    """Scrape the film's info table and return original title and length.

    Returns a dict with keys 'title_orig' and 'length' (values may be
    ``None`` when the corresponding row is missing).
    """
    resp = self.session.get(url)
    html = parsers.html(resp.content, base_url=resp.url)

    info = {}
    for row in html.cssselect('.feature_info_row'):
        label = row.cssselect_first('.pre_label')
        value = label.getnext().text_content()
        info[label.text_content()] = value

    # not taking the year, because they use year of Czech premiere,
    # instead of the year of the original release
    return {
        'title_orig': info.get(u'Name:'),
        'length': info.get(u'Délka (min.):'),
    }
def _scrape_rows(self):
    """Generates individual table rows of cinema's calendar.

    Follows the '#classic-paging a.forward' pagination link until there
    is no next page.
    """
    url = self.url
    while True:
        resp = self.session.get(url)
        # use resp.url (the final URL after any redirects) as the base
        # for absolutizing links, consistently with the other scrapers;
        # the originally requested url would break relative links after
        # a redirect
        html = parsers.html(resp.content, base_url=resp.url)
        for el in html.cssselect('#content table tr'):
            yield Row(el, resp.url)
        pagination = html.cssselect('#classic-paging a.forward')
        if not pagination:
            break
        url = pagination[0].link()
def _parse_details(self, url):
    """Scrape extra film details (poster, CSFD link) from the film's
    detail page and return them as a dict.
    """
    resp = self.session.get(url)
    html = parsers.html(resp.content, base_url=resp.url)
    content = html.cssselect_first('#content #right')

    data = {}

    poster = content.cssselect_first('a.rel')
    if poster is not None:
        data['url_posters'] = [poster.link()]

    csfd = content.cssselect_first('a[href^="http://www.csfd.cz"]')
    if csfd is not None:
        data['url_csfd'] = csfd.get('href')
    return data
def lookup(self, url):
    """Fetch an IMDb film page and build a ``Film`` from it.

    Returns ``None`` when the page does not exist (HTTP 404); any other
    HTTP error is re-raised.
    """
    try:
        resp = self.session.get(url)
    except http.HTTPError as e:
        if e.response.status_code != 404:
            raise
        return None  # there is no match

    html = parsers.html(resp.content, base_url=resp.url)
    title = self._parse_title(html)
    return Film(
        url_imdb=resp.url,
        title_main=title,
        year=self._parse_year(html),
        rating_imdb=self._parse_rating(html),
    )
def __call__(self):
    """Yield showtimes day by day, starting today, until the site
    reports there is no program for a day.
    """
    day = datetime.date.today()
    while True:
        url = self.url.format(location_id=self.location_id, date=day)
        resp = self.session.get(url)
        html = parsers.html(resp.content, base_url=resp.url)
        if u'Žádný program' in html.text_content():
            break  # the site says there's no program for this day
        table = html.cssselect('tr')
        labels, rows = table[0], table[1:]
        for row in rows:
            for showtime in self._parse_row(day, row, labels):
                yield showtime
        day += datetime.timedelta(days=1)
def __call__(self):
    """Yield showtimes for consecutive days starting today; stops at the
    first day the site reports as having no program.
    """
    day = datetime.date.today()
    while True:
        resp = self.session.get(
            self.url.format(location_id=self.location_id, date=day)
        )
        html = parsers.html(resp.content, base_url=resp.url)
        if u'Žádný program' in html.text_content():
            break  # no program for this day — stop scraping
        all_rows = html.cssselect('tr')
        labels = all_rows[0]  # first row carries the column labels
        for row in all_rows[1:]:
            for st in self._parse_row(day, row, labels):
                yield st
        day += datetime.timedelta(days=1)
def _scrape_html(self):
    """Download ``self.url`` and return it parsed as an HTML document."""
    response = self.session.get(self.url)
    return parsers.html(response.content, base_url=response.url)
def _scrape_rows(self):
    """Download the page and return its schedule table rows."""
    response = download(self.url)
    document = parsers.html(response.content, base_url=response.url)
    return document.cssselect('.content table tr')
def _scrape_rows(self):
    """Download the page and return its schedule table rows."""
    response = self.session.get(self.url)
    document = parsers.html(response.content, base_url=response.url)
    return document.cssselect('.content table tr')
def _scrape_table(self):
    """Download the page and return the rows of the film table."""
    response = download(self.url)
    document = parsers.html(response.content, base_url=response.url)
    return document.cssselect('#main .film_table tr')
def _scrape_entries(self):
    """Downloads and scrapes HTML elements, each with film header line."""
    response = download(self.url)
    document = parsers.html(response.content, base_url=response.url)
    elements = document.cssselect('.contentpaneopen strong')
    return (element for element in elements if self._is_entry(element))
def _scrape_rows(self):
    """Download the page and return its program item elements."""
    response = self.session.get(self.url)
    document = parsers.html(response.content, base_url=response.url)
    return document.cssselect('.program-item')