示例#1
0
    def _parse_details(self, url):
        """Scrape extra film details from a detail page.

        Returns a dict with any of the keys 'url_posters', 'url_trailer',
        'url_csfd' and 'url_imdb' that could be found on the page.
        """
        data = {}

        resp = self.session.get(url)
        html = parsers.html(resp.content, base_url=url)
        html.make_links_absolute()
        content = html.cssselect_first('.content_main')

        image = content.cssselect_first('.movie_image img')
        if image is not None:
            data['url_posters'] = [image.get('src')]

        for a in content.cssselect('a'):
            # anchors without an href attribute yield None; the original
            # 'in' membership tests below crashed on them with TypeError
            href = a.get('href')
            if href is None:
                continue

            try:
                data['url_trailer'] = parsers.youtube_url(href)
                # NOTE(review): stopping at the first trailer link also skips
                # any csfd.cz/imdb.com anchors that follow it — confirm this
                # is intended
                break
            except ValueError:
                pass

            if 'csfd.cz' in href:
                data['url_csfd'] = href

            if 'imdb.com' in href:
                data['url_imdb'] = href

        return data
示例#2
0
    def search(self, titles, year=None):
        """Try each of *titles* against the ČSFD search; return the first
        matching Film, or None when nothing matches.
        """
        year = int(year) if year else None

        base = 'http://www.csfd.cz/hledat/complete-films/?q='
        for title in titles:
            quoted = urllib.quote_plus(unicode(title).encode('utf-8'))
            resp = self.session.get(base + quoted)

            # direct redirect to the film page
            try:
                CsfdFilmID.from_url(resp.url)
            except ValueError:
                pass
            else:
                return self.lookup(resp.url)

            # results page
            html = parsers.html(resp.content, base_url=resp.url)
            for result in self._iterparse_search_results(html, year):
                matched = self._parse_matched_title(result)
                if self._match_names(title, matched):
                    return self.lookup(self._parse_film_url(result))

        return None  # there is no match
示例#3
0
    def _parse_details(self, url):
        """Scrape extra film details from a detail page.

        Returns a dict with any of the keys 'url_posters', 'url_trailer',
        'url_csfd' and 'url_imdb' that could be found on the page.
        """
        data = {}

        resp = self.session.get(url)
        html = parsers.html(resp.content, base_url=url)
        html.make_links_absolute()
        content = html.cssselect_first('.content_main')

        image = content.cssselect_first('.movie_image img')
        if image is not None:
            data['url_posters'] = [image.get('src')]

        for a in content.cssselect('a'):
            # anchors without an href attribute yield None; the original
            # 'in' membership tests below crashed on them with TypeError
            href = a.get('href')
            if href is None:
                continue

            try:
                data['url_trailer'] = parsers.youtube_url(href)
                # NOTE(review): stopping at the first trailer link also skips
                # any csfd.cz/imdb.com anchors that follow it — confirm this
                # is intended
                break
            except ValueError:
                pass

            if 'csfd.cz' in href:
                data['url_csfd'] = href

            if 'imdb.com' in href:
                data['url_imdb'] = href

        return data
示例#4
0
    def _parse_details(self, url):
        """Scrape poster, ČSFD/IMDb links, and trailer from a detail page.

        Returns a dict with whichever of 'url_posters', 'url_csfd',
        'url_imdb' and 'url_trailer' could be found.
        """
        response = self.session.get(url)
        document = parsers.html(response.content, base_url=response.url)
        main = document.cssselect_first('#content .leftcol')

        details = {}

        poster = main.cssselect_first('img.wp-post-image')
        if poster is not None:
            details['url_posters'] = [self._parse_image_link(poster)]

        # links whose href is a bare 'http://' are empty placeholders
        for key, selector in (('url_csfd', 'a.csfd'), ('url_imdb', 'a.imdb')):
            anchor = main.cssselect_first(selector)
            if anchor is not None and anchor.get('href') != 'http://':
                details[key] = anchor.get('href')

        if 'trailer' in main.text_content().lower():
            # keeps the last embeddable YouTube iframe found
            for frame in main.cssselect('iframe'):
                try:
                    details['url_trailer'] = parsers.youtube_url(
                        frame.get('src'))
                except ValueError:
                    pass

        return details
示例#5
0
 def _parse_tags(self, row, tags=None):
     """Resolve tag links within *row* into a {tag text: page heading}
     mapping, updating and returning *tags* when given.
     """
     collected = tags or {}
     for anchor in row.cssselect('.movie a.tag'):
         response = self.session.get(anchor.link())
         page = parsers.html(response.content, base_url=response.url)
         heading = page.cssselect_first('h1').text_content()
         collected[anchor.text_content()] = heading
     return collected
示例#6
0
    def lookup(self, url):
        """Fetch a ČSFD film page and turn it into a Film.

        Returns None when the page does not exist (HTTP 404); other HTTP
        errors propagate.
        """
        try:
            response = self.session.get(url)
        except http.HTTPError as exc:
            if exc.response.status_code != 404:
                raise
            return None  # there is no match

        page = parsers.html(response.content, base_url=response.url)

        titles = self._parse_titles(page)
        origin = self._parse_origin(page)

        return Film(
            url_csfd=response.url,
            url_imdb=self._parse_imdb_url(page),
            title_main=titles.main,
            title_orig=titles.orig,
            titles_search=titles.others,
            year=origin.year,
            directors=list(self._iterparse_directors(page)),
            length=origin.length,
            rating_csfd=self._parse_rating(page),
            url_posters=[self._parse_poster_url(page)],
        )
示例#7
0
 def _parse_tags(self, row, tags=None):
     """Resolve tag links within *row* into a {tag text: page heading}
     mapping, updating and returning *tags* when given.
     """
     collected = tags or {}
     for anchor in row.cssselect('.movie a.tag'):
         response = self.session.get(anchor.link())
         page = parsers.html(response.content, base_url=response.url)
         heading = page.cssselect_first('h1').text_content()
         collected[anchor.text_content()] = heading
     return collected
示例#8
0
    def _parse_details(self, url):
        """Scrape poster, ČSFD/IMDb links, and trailer from a detail page.

        Returns a dict with whichever of 'url_posters', 'url_csfd',
        'url_imdb' and 'url_trailer' could be found.
        """
        response = self.session.get(url)
        document = parsers.html(response.content, base_url=response.url)
        main = document.cssselect_first('#content .leftcol')

        details = {}

        poster = main.cssselect_first('img.wp-post-image')
        if poster is not None:
            details['url_posters'] = [self._parse_image_link(poster)]

        # links whose href is a bare 'http://' are empty placeholders
        for key, selector in (('url_csfd', 'a.csfd'), ('url_imdb', 'a.imdb')):
            anchor = main.cssselect_first(selector)
            if anchor is not None and anchor.get('href') != 'http://':
                details[key] = anchor.get('href')

        if 'trailer' in main.text_content().lower():
            # keeps the last embeddable YouTube iframe found
            for frame in main.cssselect('iframe'):
                try:
                    details['url_trailer'] = parsers.youtube_url(
                        frame.get('src'))
                except ValueError:
                    pass

        return details
 def _parse_tag(self, el):
     """Return (tag name, human-readable label) for a tag element,
     fetching and caching the label page on first sight.
     """
     name = el.text_content().strip(':')
     if name in self.tags:
         return name, self.tags[name]
     response = self.session.get(el.link())
     page = parsers.html(response.content, base_url=response.url)
     self.tags[name] = page.cssselect_first('#main h1').text_content()
     return name, self.tags[name]
示例#10
0
 def _parse_tag(self, el):
     """Return (tag name, human-readable label) for a tag element,
     fetching and caching the label page on first sight.
     """
     name = el.text_content().strip(':')
     if name in self.tags:
         return name, self.tags[name]
     response = self.session.get(el.link())
     page = parsers.html(response.content, base_url=response.url)
     self.tags[name] = page.cssselect_first('#main h1').text_content()
     return name, self.tags[name]
示例#11
0
    def __call__(self):
        """Yield parsed rows from every '.program' table on the page,
        tagging each row with the table's lowercased title.
        """
        response = self.session.get(self.url)
        page = parsers.html(response.content, base_url=response.url)

        for program in page.cssselect('.program'):
            heading = program.cssselect_first('.title')
            tag = heading.text_content().lower()
            for row in program.cssselect('tr'):
                yield self._parse_row(row, tags={tag: None})
示例#12
0
    def __call__(self):
        """Yield parsed rows from every '.program' table on the page,
        tagging each row with the table's lowercased title.
        """
        response = self.session.get(self.url)
        page = parsers.html(response.content, base_url=response.url)

        for program in page.cssselect('.program'):
            heading = program.cssselect_first('.title')
            tag = heading.text_content().lower()
            for row in program.cssselect('tr'):
                yield self._parse_row(row, tags={tag: None})
示例#13
0
    def __call__(self):
        """Yield parsed entries from the news listing, skipping those
        that parse to a falsy value.
        """
        response = self.session.get(self.url)
        page = parsers.html(response.content, base_url=response.url)

        for entry in page.cssselect('#content-in .aktuality'):
            parsed = self._parse_entry(entry)
            if parsed:
                yield parsed
示例#14
0
    def _scrape_entries(self):
        """Downloads and scrapes text of HTML elements, each with film
        header line.
        """
        response = self.session.get(self.url)
        page = parsers.html(response.content, base_url=response.url)

        elements = page.cssselect('.contentpaneopen strong')
        for element in elements:
            if self._is_entry(element):
                yield self._extract_entry_text(element)
示例#15
0
    def _scrape_entries(self):
        """Downloads and scrapes text of HTML elements, each with film
        header line.
        """
        response = self.session.get(self.url)
        page = parsers.html(response.content, base_url=response.url)

        elements = page.cssselect('.contentpaneopen strong')
        for element in elements:
            if self._is_entry(element):
                yield self._extract_entry_text(element)
示例#16
0
    def __call__(self):
        """Scrape the event listing page and yield a Showtime per event."""
        resp = self.session.get(self.url)
        html = parsers.html(resp.content, base_url=resp.url)

        for event in html.cssselect('.event'):
            header = event.cssselect_first('h2')

            url = header.link()
            title = header.text_content()

            # titles like 'A / B' split into main and original title;
            # anything else is kept whole as the main title
            title_parts = title.split('/')
            if len(title_parts) == 2:
                # naive, but for now good enough
                title_main, title_orig = title_parts
            else:
                title_main = title
                title_orig = None

            details = event.cssselect_first('.descshort').text_content()
            cat = event.cssselect_first('.title-cat').text_content().lower()

            # collect tags from the configured patterns; a pattern that hits
            # a title is also stripped from it (the description is only
            # searched, never modified) — order of self.tag_re matters here
            tags = []
            for regexp, tag in self.tag_re:
                if regexp.search(title_main):
                    tags.append(tag)
                    title_main = regexp.sub('', title_main).strip()
                if title_orig and regexp.search(title_orig):
                    tags.append(tag)
                    title_orig = regexp.sub('', title_orig).strip()
                if regexp.search(details):
                    tags.append(tag)
            # any non-film category becomes a tag of its own
            if cat != 'filmy':
                tags.append(cat)

            # date is Czech-formatted text, time is 'HH:MM'
            d = parsers.date_cs(
                event.cssselect_first('.nextdate strong').text
            )

            t = event.cssselect_first('.nextdate .evttime').text_content()
            t = time(*map(int, t.split(':')))

            # combine into local time and convert to universal via self.tz
            starts_at = times.to_universal(datetime.combine(d, t), self.tz)

            yield Showtime(
                cinema=cinema,
                film_scraped=ScrapedFilm(
                    title_main_scraped=title_main,
                    title_orig=title_orig or None,
                ),
                starts_at=starts_at,
                url=url,
                url_booking=self.url_booking,
                tags={tag: None for tag in tags},
            )
示例#17
0
    def __call__(self):
        """Scrape the event listing page and yield a Showtime per event."""
        resp = self.session.get(self.url)
        html = parsers.html(resp.content, base_url=resp.url)

        for event in html.cssselect('.event'):
            header = event.cssselect_first('h2')

            url = header.link()
            title = header.text_content()

            # titles like 'A / B' split into main and original title;
            # anything else is kept whole as the main title
            title_parts = title.split('/')
            if len(title_parts) == 2:
                # naive, but for now good enough
                title_main, title_orig = title_parts
            else:
                title_main = title
                title_orig = None

            details = event.cssselect_first('.descshort').text_content()
            cat = event.cssselect_first('.title-cat').text_content().lower()

            # collect tags from the configured patterns; a pattern that hits
            # a title is also stripped from it (the description is only
            # searched, never modified) — order of self.tag_re matters here
            tags = []
            for regexp, tag in self.tag_re:
                if regexp.search(title_main):
                    tags.append(tag)
                    title_main = regexp.sub('', title_main).strip()
                if title_orig and regexp.search(title_orig):
                    tags.append(tag)
                    title_orig = regexp.sub('', title_orig).strip()
                if regexp.search(details):
                    tags.append(tag)
            # any non-film category becomes a tag of its own
            if cat != 'filmy':
                tags.append(cat)

            # date is Czech-formatted text, time is 'HH:MM'
            d = parsers.date_cs(event.cssselect_first('.nextdate strong').text)

            t = event.cssselect_first('.nextdate .evttime').text_content()
            t = time(*map(int, t.split(':')))

            # combine into local time and convert to universal via self.tz
            starts_at = times.to_universal(datetime.combine(d, t), self.tz)

            yield Showtime(
                cinema=cinema,
                film_scraped=ScrapedFilm(
                    title_main_scraped=title_main,
                    title_orig=title_orig or None,
                ),
                starts_at=starts_at,
                url=url,
                url_booking=self.url_booking,
                tags={tag: None
                      for tag in tags},
            )
示例#18
0
    def _parse_details(self, url):
        """Scrape the feature-info table into extra film attributes."""
        response = self.session.get(url)
        page = parsers.html(response.content, base_url=response.url)

        info = {}
        for row in page.cssselect('.feature_info_row'):
            label = row.cssselect_first('.pre_label')
            value = label.getnext().text_content()
            info[label.text_content()] = value

        return {
            'title_orig': info.get(u'Name:'),
            'length': info.get(u'Délka (min.):'),
            # not taking the year, because they use year of Czech premiere,
            # instead of the year of the original release
        }
示例#19
0
    def _scrape_rows(self):
        """Generates individual table rows of cinema's calendar."""
        page_url = self.url

        while True:
            response = self.session.get(page_url)
            page = parsers.html(response.content, base_url=page_url)

            for element in page.cssselect('#content table tr'):
                yield Row(element, page_url)

            # follow the 'forward' pagination link, if any
            forward = page.cssselect('#classic-paging a.forward')
            if not forward:
                break
            page_url = forward[0].link()
示例#20
0
    def _parse_details(self, url):
        """Scrape the feature-info table into extra film attributes."""
        response = self.session.get(url)
        page = parsers.html(response.content, base_url=response.url)

        info = {}
        for row in page.cssselect('.feature_info_row'):
            label = row.cssselect_first('.pre_label')
            value = label.getnext().text_content()
            info[label.text_content()] = value

        return {
            'title_orig': info.get(u'Name:'),
            'length': info.get(u'Délka (min.):'),
            # not taking the year, because they use year of Czech premiere,
            # instead of the year of the original release
        }
示例#21
0
    def _scrape_rows(self):
        """Generates individual table rows of cinema's calendar."""
        page_url = self.url

        while True:
            response = self.session.get(page_url)
            page = parsers.html(response.content, base_url=page_url)

            for element in page.cssselect('#content table tr'):
                yield Row(element, page_url)

            # follow the 'forward' pagination link, if any
            forward = page.cssselect('#classic-paging a.forward')
            if not forward:
                break
            page_url = forward[0].link()
示例#22
0
    def _parse_details(self, url):
        """Scrape poster and ČSFD link from a film detail page."""
        response = self.session.get(url)
        page = parsers.html(response.content, base_url=response.url)
        main = page.cssselect_first('#content #right')

        details = {}

        poster = main.cssselect_first('a.rel')
        if poster is not None:
            details['url_posters'] = [poster.link()]

        csfd = main.cssselect_first('a[href^="http://www.csfd.cz"]')
        if csfd is not None:
            details['url_csfd'] = csfd.get('href')

        return details
示例#23
0
    def _parse_details(self, url):
        """Scrape poster and ČSFD link from a film detail page."""
        response = self.session.get(url)
        page = parsers.html(response.content, base_url=response.url)
        main = page.cssselect_first('#content #right')

        details = {}

        poster = main.cssselect_first('a.rel')
        if poster is not None:
            details['url_posters'] = [poster.link()]

        csfd = main.cssselect_first('a[href^="http://www.csfd.cz"]')
        if csfd is not None:
            details['url_csfd'] = csfd.get('href')

        return details
示例#24
0
    def lookup(self, url):
        """Fetch an IMDb film page and turn it into a Film.

        Returns None when the page does not exist (HTTP 404); other HTTP
        errors propagate.
        """
        try:
            response = self.session.get(url)
        except http.HTTPError as exc:
            if exc.response.status_code != 404:
                raise
            return None  # there is no match

        page = parsers.html(response.content, base_url=response.url)

        return Film(
            url_imdb=response.url,
            title_main=self._parse_title(page),
            year=self._parse_year(page),
            rating_imdb=self._parse_rating(page),
        )
示例#25
0
    def __call__(self):
        """Walk the program day by day, starting today, yielding showtimes
        until a day's page reports no program.
        """
        day = datetime.date.today()
        while True:
            page_url = self.url.format(location_id=self.location_id, date=day)
            response = self.session.get(page_url)
            page = parsers.html(response.content, base_url=response.url)

            # the site signals the end of the program with this phrase
            if u'Žádný program' in page.text_content():
                break

            all_rows = page.cssselect('tr')
            labels, data_rows = all_rows[0], all_rows[1:]

            for row in data_rows:
                for showtime in self._parse_row(day, row, labels):
                    yield showtime

            day += datetime.timedelta(days=1)
示例#26
0
    def __call__(self):
        """Walk the program day by day, starting today, yielding showtimes
        until a day's page reports no program.
        """
        day = datetime.date.today()
        while True:
            page_url = self.url.format(location_id=self.location_id, date=day)
            response = self.session.get(page_url)
            page = parsers.html(response.content, base_url=response.url)

            # the site signals the end of the program with this phrase
            if u'Žádný program' in page.text_content():
                break

            all_rows = page.cssselect('tr')
            labels, data_rows = all_rows[0], all_rows[1:]

            for row in data_rows:
                for showtime in self._parse_row(day, row, labels):
                    yield showtime

            day += datetime.timedelta(days=1)
示例#27
0
 def _scrape_html(self):
     """Download self.url and return it parsed as an HTML document."""
     response = self.session.get(self.url)
     return parsers.html(response.content, base_url=response.url)
 def _scrape_rows(self):
     """Download the page and return its content-table rows."""
     response = download(self.url)
     page = parsers.html(response.content, base_url=response.url)
     return page.cssselect('.content table tr')
 def _scrape_rows(self):
     """Download the page and return its content-table rows."""
     response = self.session.get(self.url)
     page = parsers.html(response.content, base_url=response.url)
     return page.cssselect('.content table tr')
示例#30
0
 def _scrape_table(self):
     """Download the page and return the rows of the film table."""
     response = download(self.url)
     page = parsers.html(response.content, base_url=response.url)
     return page.cssselect('#main .film_table tr')
示例#31
0
 def _scrape_entries(self):
     """Downloads and scrapes HTML elements, each with film header line."""
     response = download(self.url)
     page = parsers.html(response.content, base_url=response.url)
     candidates = page.cssselect('.contentpaneopen strong')
     return (element for element in candidates if self._is_entry(element))
示例#32
0
 def _scrape_rows(self):
     """Download the page and return its program items."""
     response = self.session.get(self.url)
     page = parsers.html(response.content, base_url=response.url)
     return page.cssselect('.program-item')
示例#33
0
 def _scrape_rows(self):
     """Download the page and return its content-table rows."""
     response = download(self.url)
     page = parsers.html(response.content, base_url=response.url)
     return page.cssselect('.content table tr')