Пример #1
0
    def parse(self, page):
        """Parse a person page and store the extracted details on the instance.

        Sets ``self.html`` to the parsed tree and, when the corresponding
        markup is present, ``alternative_name``, ``birth_date``,
        ``birth_place``, ``growth``, ``death_date`` and ``death_place``.
        Attributes whose markup is missing are left untouched.

        Args:
            page: HTML markup of the page; handed to ``fromstring``.
        """
        self.html = fromstring(page)

        alternative_span = self.html.xpath('//span[@itemprop="alternateName"]')
        if alternative_span:
            self.alternative_name = alternative_span[0].text_content()

        for tr in self.html.xpath('//table[@class="info"]//tr'):
            type_cells = tr.xpath('.//td[@class="type"]')
            if not type_cells:
                # Rows without a label cell give us nothing to dispatch on;
                # skip them instead of failing the whole parse.
                continue
            td = type_cells[0]
            value_cell = td.getnext()
            if value_cell is None:
                # Label without a following value cell.
                continue
            info_type = td.text_content()
            info = value_cell.text_content()

            if info_type == u'дата рождения':
                birth_cells = tr.xpath('.//td[@class="birth"]')
                if not birth_cells:
                    continue
                birth_date = birth_cells[0].get("birthdate")
                # A leading minus in the attribute is rewritten as a
                # trailing "BC" marker.
                if birth_date is not None and birth_date.startswith('-'):
                    birth_date = '%s BC' % birth_date[1:]
                self.birth_date = birth_date
                logger.warning('Birth date = %s', self.birth_date)
            elif info_type == u'место рождения':
                self.birth_place = info
            elif info_type == u'рост':
                m = re.search(r'(\d+)\.(\d+) м', info, re.UNICODE)
                if m is not None:
                    # Metres -> centimetres. The fraction is normalised to
                    # exactly two digits so "1.8" yields 180, not 108.
                    centimetres = (m.group(2) + '0')[:2]
                    self.growth = int(m.group(1)) * 100 + int(centimetres)
            elif info_type == u'дата смерти':
                logger.warning(info)
                # When the cell contains a bullet, only the text before the
                # last bullet is handed to get_date.
                m = re.search(u'^(.+)•', info, re.UNICODE)
                raw_date = info if m is None else m.group(1)
                date = get_date(raw_date.strip()).get('date')
                logger.warning('date = %s', date)
                self.death_date = date
            elif info_type == u'место смерти':
                self.death_place = info
Пример #2
0
    def get_dates(self):
        """Fetch the film's release-dates page and record every listed date.

        For each row found, appends a dict with the keys ``date``,
        ``country_id``, ``commentary`` and ``viewers`` to ``self.dates``.
        Countries whose id is not present in ``self.countries`` are queued in
        ``self.countries_to_save`` (at most once per id per call).

        Rows that lack a date cell or a country link are skipped. A missing
        commentary becomes ``''`` and a missing or unparsable viewer count
        becomes ``None``.

        Returns:
            None. If the page cannot be fetched, a warning is logged and
            nothing is recorded.
        """
        page = self.get_page('https://www.kinopoisk.ru/film/%s/dates/' % self.full_id)
        if page is None:
            logger.warning('There is no information about dates')
            return
        html = fromstring(page)

        # Built once instead of per row; ids queued during this call are added
        # too, so a country with several release dates is queued only once.
        known_country_ids = {country['id'] for country in self.countries}

        for div in html.xpath('//table//tr//div[contains(@class, "flag")]'):
            td_date = div.getparent().getnext()
            if td_date is None or len(td_date) == 0:
                # No date cell, or the cell has no child element to read.
                continue
            td_info = td_date.getnext()
            if td_info is None:
                continue
            td_country = td_info.xpath('.//a[contains(@class, "all")]')
            if not td_country:
                # Without a country link there is no id to attach the date to.
                continue
            td_small = td_info.xpath('.//small')
            td_viewers = td_info.getnext()
            td_count = td_viewers.xpath('.//small') if td_viewers is not None else []

            date = get_date(td_date[0].text_content().strip())
            country = td_country[0].text_content()
            country_id = self.extract_country_id_from_url(td_country[0].get('href'))
            small = td_small[0].text_content().strip() if td_small else ''

            count = None
            if td_count:
                m = re.search(u'(.+)чел.', td_count[0].text_content(), re.UNICODE)
                if m is not None:
                    # Drop spacing/separators and keep only the digits.
                    digits = re.sub(r'\D', '', m.group(1))
                    if digits:
                        count = int(digits)

            if country_id not in known_country_ids:
                known_country_ids.add(country_id)
                self.countries_to_save.append({'id': country_id, 'name': country})
            self.dates.append({'date': date, 'country_id': country_id,
                               'commentary': small, 'viewers': count})
Пример #3
0
 def get_premieres(self, elem):
     """Record the premiere described by the first ``prem_ical`` div in *elem*.

     The div's ``data-ical-date`` attribute is parsed with ``get_date`` and
     merged into a dict whose ``region`` comes from ``data-ical-type``; the
     dict is appended to ``self.premieres``. When the region is ``'world'``
     the parsed date is also stored in ``self.world_premiere``. Nothing
     happens if *elem* contains no such div.
     """
     matches = elem.xpath('.//div[@class="prem_ical"]')
     if matches is None or len(matches) == 0:
         return

     ical = matches[0]
     parsed = get_date(ical.get('data-ical-date').strip())
     entry = {'region': ical.get('data-ical-type')}
     # Keys from the parsed date are layered over the region entry.
     entry.update(parsed)
     self.premieres.append(entry)

     if entry['region'] == 'world':
         self.world_premiere = parsed['date']