예제 #1
0
    def parse(self):
        """Populate ``self.instance`` from the person's main page.

        The raw markup in ``self.content`` is first scanned with regular
        expressions for the birth date and for the tokens needed to request
        the extra "info" fragment. ``self.content`` is then replaced by the
        result of ``html.fromstring`` and the id, names, gender and career
        entries are read through ``self.extract``.

        Raises:
            IndexError: if the value returned by ``self.extract('id')`` holds
                no ``/name/<digits>/`` fragment, or a movie element has no
                ancestors.
        """
        row_re = re.compile(
            r'<tr\s*>\s*<td class="type">(.+?)</td>\s*<td class="birth"(.+?)>.+</td>\s*</tr>',
            re.S)
        date_re = re.compile(r'\d{4}-\d{2}-\d{2}')
        for label, birth_attrs in row_re.findall(self.content):
            # Under re.S the label group may run across several cells, so only
            # its tail is compared (presumably the reason for endswith).
            if not str(label).endswith('дата рождения'):
                continue
            dates = date_re.findall(birth_attrs)
            if dates:
                self.instance.year_birth = self.prepare_str(dates[0])

        # The "info" fragment is requested only when the id is already set
        # and both tokens are present in the page source.
        if self.instance.id:
            tokens = re.findall(r'xsrftoken = \'([^\']+)\'', self.content)
            obj_types = re.findall(r'objType: \'([^\']+)\'', self.content)
            if tokens and obj_types:
                info_url = self.instance.get_url('info', token=tokens[0], type=obj_types[0])
                info = self.request.get_content(info_url)
                if info:
                    self.instance.information = info.replace(' class="trivia"', '')

        # From here on self.content holds whatever html.fromstring returned,
        # not the original string.
        self.content = html.fromstring(self.content)

        id_matches = re.compile(r".+/name/(\d+)/").findall(self.extract('id'))
        person_id = id_matches[0]
        self.instance.id = self.prepare_int(person_id)
        for field in ('name', 'name_en', 'gender'):
            setattr(self.instance, field, self.extract(field, to_str=True))

        # movies
        from kinopoisk.person import Role
        for link in self.extract('movies'):
            # Roles are grouped by the data-work-type attribute of the first
            # ancestor yielded by iterancestors().
            work_type = [node.get('data-work-type') for node in link.iterancestors()][0]
            self.instance.career.setdefault(work_type, []).append(
                Role.get_parsed('role_link', link))

        self.instance.set_source('main_page')
예제 #2
0
    def parse(self):
        """Populate ``self.instance`` from the raw HTML of a person's main page.

        The id, name, English name and birth year are scraped from the string
        in ``self.content`` with regular expressions; each attribute is only
        assigned when its pattern matches. When the instance has an id and the
        page exposes both an ``xsrftoken`` and an ``objType`` value, the extra
        "info" fragment is requested and stored in ``instance.information``.
        Finally ``self.content`` is replaced by the result of
        ``html.fromstring`` and every element returned by
        ``self.extract('movies')`` is turned into a ``Role`` and appended to
        ``instance.career`` under its work type.

        Raises:
            IndexError: if a movie element has no ancestors.
        """
        # Dots in the host name are escaped so they match literally rather
        # than any character.
        person_id = re.compile(
            r"<link rel=\"canonical\" href=\"https?://www\.kinopoisk\.ru/name/(\d+)/\" />"
        ).findall(self.content)
        if person_id:
            self.instance.id = self.prepare_int(person_id[0])

        name = re.compile(
            r'<h1 class="moviename-big" itemprop="name">(.+?)</h1>').findall(
                self.content)
        if name:
            self.instance.name = self.prepare_str(name[0])

        name_en = re.compile(
            r'<span itemprop="alternateName">([A-Z]\'?[- a-zA-Z]+)</span>'
        ).findall(self.content)
        if name_en:
            self.instance.name_en = self.prepare_str(name_en[0])

        # re.S lets a cell's content span several lines; without it a row
        # whose value contains a newline is never matched.
        content_info = re.compile(
            r'<tr\s*>\s*<td class="type">(.+?)</td>\s*<td[^>]*>(.+?)</td>\s*</tr>',
            re.S).findall(self.content)
        year_pattern = re.compile(
            r'<a href="/lists/m_act%5Bbirthday%5D%5Byear%5D/\d{4}/">(\d{4})</a>')
        # Loop variables are named so they do not rebind ``name`` above.
        for label, value in content_info:
            if str(label) == 'дата рождения':
                year_birth = year_pattern.findall(value)
                if year_birth:
                    self.instance.year_birth = self.prepare_int(year_birth[0])

        if self.instance.id:
            token = re.findall(r'xsrftoken = \'([^\']+)\'', self.content)
            obj_type = re.findall(r'objType: \'([^\']+)\'', self.content)
            if token and obj_type:
                response = self.request.get(self.instance.get_url(
                    'info', token=token[0], type=obj_type[0]),
                                            headers=HEADERS)
                response.connection.close()
                if response.content:
                    # The body is decoded as windows-1251, dropping bytes that
                    # cannot be decoded.
                    self.instance.information = response.content.decode(
                        'windows-1251',
                        'ignore').replace(' class="trivia"', '')

        # movies
        from kinopoisk.person import Role
        # From here on self.content holds whatever html.fromstring returned,
        # not the original string.
        self.content = html.fromstring(self.content)
        for element in self.extract('movies'):
            # Named work_type to avoid shadowing the builtin ``type``; the
            # value comes from the first ancestor yielded by iterancestors().
            work_type = [t.get('data-work-type')
                         for t in element.iterancestors()][0]
            self.instance.career.setdefault(work_type, [])
            self.instance.career[work_type].append(
                Role.get_parsed('role_link', element))

        self.instance.set_source('main_page')
예제 #3
0
    def parse(self):
        """Fill ``self.instance`` with data scraped from a person's main page.

        Birth year and the optional "info" fragment are taken from the raw
        string in ``self.content``. The string is then replaced by the result
        of ``html.fromstring`` so that ``self.extract`` can supply the id,
        the names and the movie elements used to build the career mapping.

        Raises:
            IndexError: if the value returned by ``self.extract('id')`` holds
                no ``/name/<digits>/`` fragment, or a movie element has no
                ancestors.
        """
        rows = re.findall(
            r'<tr\s*>\s*<td class="type">(.+?)</td>\s*<td[^>]*>(.+?)</td>\s*</tr>',
            self.content,
            re.S)
        for caption, cell in rows:
            if str(caption) != 'дата рождения':
                continue
            years = re.findall(
                r'<a href="/lists/m_act%5Bbirthday%5D%5Byear%5D/\d{4}/">(\d{4})</a>',
                cell)
            if years:
                self.instance.year_birth = self.prepare_int(years[0])

        # The extra request is made only when the id is already set and the
        # page source contains both values needed to build the URL.
        if self.instance.id:
            tokens = re.findall(r'xsrftoken = \'([^\']+)\'', self.content)
            obj_types = re.findall(r'objType: \'([^\']+)\'', self.content)
            if tokens and obj_types:
                info_url = self.instance.get_url(
                    'info', token=tokens[0], type=obj_types[0])
                response = self.request.get(info_url, headers=HEADERS)
                response.connection.close()
                if response.content:
                    # Decoded as windows-1251, dropping undecodable bytes.
                    decoded = response.content.decode('windows-1251', 'ignore')
                    self.instance.information = decoded.replace(
                        ' class="trivia"', '')

        # From here on self.content holds whatever html.fromstring returned,
        # not the original string.
        self.content = html.fromstring(self.content)

        id_candidates = re.compile(r".+/name/(\d+)/").findall(
            self.extract('id'))
        person_id = id_candidates[0]
        self.instance.id = self.prepare_int(person_id)
        self.instance.name = self.extract('name', to_str=True)
        self.instance.name_en = self.extract('name_en', to_str=True)

        # movies
        from kinopoisk.person import Role
        for movie_link in self.extract('movies'):
            # The key is the data-work-type attribute of the first ancestor
            # yielded by iterancestors().
            work_types = [ancestor.get('data-work-type')
                          for ancestor in movie_link.iterancestors()]
            work_type = work_types[0]
            self.instance.career.setdefault(work_type, [])
            role = Role.get_parsed('role_link', movie_link)
            self.instance.career[work_type].append(role)

        self.instance.set_source('main_page')
예제 #4
0
    def parse(self):
        """Read a person's main page into ``self.instance``.

        Steps, in order: scrape the birth year from the raw markup, request
        the "info" fragment when the instance id is set and the page carries
        both an ``xsrftoken`` and an ``objType`` value, replace
        ``self.content`` with the result of ``html.fromstring``, then take the
        id, names and career roles from ``self.extract``.

        Raises:
            IndexError: if the value returned by ``self.extract('id')`` holds
                no ``/name/<digits>/`` fragment, or a movie element has no
                ancestors.
        """
        birth_label = 'дата рождения'
        year_link = re.compile(r'<a href="/lists/m_act%5Bbirthday%5D%5Byear%5D/\d{4}/">(\d{4})</a>')
        table_rows = re.findall(r'<tr\s*>\s*<td class="type">(.+?)</td>\s*<td[^>]*>(.+?)</td>\s*</tr>',
                                self.content, flags=re.S)
        for title, cell_html in table_rows:
            found = year_link.findall(cell_html) if str(title) == birth_label else []
            if found:
                self.instance.year_birth = self.prepare_int(found[0])

        if self.instance.id:
            token_match = re.search(r'xsrftoken = \'([^\']+)\'', self.content)
            type_match = re.search(r'objType: \'([^\']+)\'', self.content)
            if token_match is not None and type_match is not None:
                url = self.instance.get_url('info', token=token_match.group(1), type=type_match.group(1))
                response = self.request.get(url, headers=HEADERS)
                response.connection.close()
                if response.content:
                    # Decoded as windows-1251, dropping undecodable bytes.
                    text = response.content.decode('windows-1251', 'ignore')
                    self.instance.information = text.replace(' class="trivia"', '')

        # From here on self.content holds whatever html.fromstring returned,
        # not the original string.
        self.content = html.fromstring(self.content)

        ids = re.compile(r".+/name/(\d+)/").findall(self.extract('id'))
        first_id = ids[0]
        self.instance.id = self.prepare_int(first_id)
        for attr in ('name', 'name_en'):
            setattr(self.instance, attr, self.extract(attr, to_str=True))

        # movies
        from kinopoisk.person import Role
        for item in self.extract('movies'):
            # Grouping key: data-work-type of the first ancestor yielded by
            # iterancestors().
            kind = [parent.get('data-work-type') for parent in item.iterancestors()][0]
            self.instance.career.setdefault(kind, [])
            self.instance.career[kind].append(Role.get_parsed('role_link', item))

        self.instance.set_source('main_page')