def parse(self):
    """Fill ``self.instance`` from the page markup held in ``self.content``.

    The birth date and the extra "info" fragment are taken from the raw
    page text with regular expressions. After that ``self.content`` is
    replaced by the result of ``html.fromstring`` and the id, names,
    gender and movie roles are read through ``self.extract``. Finally the
    instance is marked with the ``'main_page'`` source.
    """
    page_text = self.content

    # Each match is (row label, remainder of the opening <td class="birth" tag).
    rows = re.findall(
        r'<tr\s*>\s*<td class="type">(.+?)</td>\s*<td class="birth"(.+?)>.+</td>\s*</tr>',
        page_text,
        re.S,
    )
    for label, birth_attrs in rows:
        # Only the row whose label ends with 'дата рождения' (birth date) matters.
        if not str(label).endswith('дата рождения'):
            continue
        dates = re.findall(r'\d{4}-\d{2}-\d{2}', birth_attrs)
        if dates:
            self.instance.year_birth = self.prepare_str(dates[0])

    # The info fragment is requested only when the instance already has an id
    # and both the token and the object type are present in the page text.
    if self.instance.id:
        tokens = re.findall(r'xsrftoken = \'([^\']+)\'', page_text)
        obj_types = re.findall(r'objType: \'([^\']+)\'', page_text)
        if tokens and obj_types:
            info_url = self.instance.get_url(
                'info', token=tokens[0], type=obj_types[0])
            info = self.request.get_content(info_url)
            if info:
                self.instance.information = info.replace(' class="trivia"', '')

    # From here on self.content is the parsed tree, not the raw text.
    self.content = html.fromstring(page_text)

    id_source = self.extract('id')
    # Raises IndexError when no "/name/<digits>/" fragment is present.
    person_id = re.findall(r".+/name/(\d+)/", id_source)[0]
    self.instance.id = self.prepare_int(person_id)

    self.instance.name = self.extract('name', to_str=True)
    self.instance.name_en = self.extract('name_en', to_str=True)
    self.instance.gender = self.extract('gender', to_str=True)

    # movies
    # NOTE(review): import kept function-local as in the original —
    # presumably to avoid a circular import; confirm before moving it.
    from kinopoisk.person import Role
    for link in self.extract('movies'):
        # The role type is the 'data-work-type' attribute of the first
        # ancestor yielded by iterancestors().
        ancestor_types = [
            node.get('data-work-type') for node in link.iterancestors()
        ]
        work_type = ancestor_types[0]
        self.instance.career.setdefault(work_type, []).append(
            Role.get_parsed('role_link', link))

    self.instance.set_source('main_page')
def parse(self):
    """Fill ``self.instance`` from the page markup held in ``self.content``.

    Id, name, alternate name and birth year are taken from the raw page
    text with regular expressions; each is assigned only when its pattern
    matches. When the instance has an id and the page exposes a token and
    object type, the "info" fragment is requested and stored as well.
    Movie roles are then read through ``self.extract`` once
    ``self.content`` has been replaced by the ``html.fromstring`` result,
    and the instance is marked with the ``'main_page'`` source.
    """
    page_text = self.content

    id_matches = re.findall(
        r"<link rel=\"canonical\" href=\"https?://www.kinopoisk.ru/name/(\d+)/\" />",
        page_text,
    )
    if id_matches:
        self.instance.id = self.prepare_int(id_matches[0])

    name_matches = re.findall(
        r'<h1 class="moviename-big" itemprop="name">(.+?)</h1>',
        page_text,
    )
    if name_matches:
        self.instance.name = self.prepare_str(name_matches[0])

    name_en_matches = re.findall(
        r'<span itemprop="alternateName">([A-Z]\'?[- a-zA-Z]+)</span>',
        page_text,
    )
    if name_en_matches:
        self.instance.name_en = self.prepare_str(name_en_matches[0])

    # Each match is (row label, cell content) of a two-cell table row.
    rows = re.findall(
        r'<tr\s*>\s*<td class="type">(.+?)</td>\s*<td[^>]*>(.+?)</td>\s*</tr>',
        page_text,
    )
    for label, cell in rows:
        # Only the 'дата рождения' (birth date) row is of interest.
        if str(label) != 'дата рождения':
            continue
        years = re.findall(
            r'<a href="/lists/m_act%5Bbirthday%5D%5Byear%5D/\d{4}/">(\d{4})</a>',
            cell,
        )
        if years:
            self.instance.year_birth = self.prepare_int(years[0])

    if self.instance.id:
        tokens = re.findall(r'xsrftoken = \'([^\']+)\'', page_text)
        obj_types = re.findall(r'objType: \'([^\']+)\'', page_text)
        if tokens and obj_types:
            info_url = self.instance.get_url(
                'info', token=tokens[0], type=obj_types[0])
            response = self.request.get(info_url, headers=HEADERS)
            response.connection.close()
            if response.content:
                # The body is decoded as windows-1251, dropping undecodable bytes.
                decoded = response.content.decode('windows-1251', 'ignore')
                self.instance.information = decoded.replace(
                    ' class="trivia"', '')

    # movies
    # NOTE(review): import kept function-local as in the original —
    # presumably to avoid a circular import; confirm before moving it.
    from kinopoisk.person import Role

    # From here on self.content is the parsed tree, not the raw text.
    self.content = html.fromstring(page_text)
    for link in self.extract('movies'):
        # The role type is the 'data-work-type' attribute of the first
        # ancestor yielded by iterancestors().
        ancestor_types = [
            node.get('data-work-type') for node in link.iterancestors()
        ]
        work_type = ancestor_types[0]
        self.instance.career.setdefault(work_type, [])
        self.instance.career[work_type].append(
            Role.get_parsed('role_link', link))

    self.instance.set_source('main_page')
def parse(self):
    """Fill ``self.instance`` from the page markup held in ``self.content``.

    Steps, in order:

    1. Read the birth year from the raw page text.
    2. If the instance already has an id and the page text contains a
       token and an object type, request the "info" fragment and store it.
    3. Replace ``self.content`` with the ``html.fromstring`` result and
       read the id, name and alternate name through ``self.extract``.
    4. Collect movie roles into ``self.instance.career``, keyed by the
       ``data-work-type`` attribute, and mark the source as
       ``'main_page'``.
    """
    row_pattern = re.compile(
        r'<tr\s*>\s*<td class="type">(.+?)</td>\s*<td[^>]*>(.+?)</td>\s*</tr>',
        re.S)
    year_pattern = re.compile(
        r'<a href="/lists/m_act%5Bbirthday%5D%5Byear%5D/\d{4}/">(\d{4})</a>')

    for label, cell in row_pattern.findall(self.content):
        # 'дата рождения' is the "birth date" row label.
        if str(label) == 'дата рождения':
            years = year_pattern.findall(cell)
            if years:
                self.instance.year_birth = self.prepare_int(years[0])

    if self.instance.id:
        tokens = re.findall(r'xsrftoken = \'([^\']+)\'', self.content)
        obj_types = re.findall(r'objType: \'([^\']+)\'', self.content)
        if tokens and obj_types:
            response = self.request.get(
                self.instance.get_url(
                    'info', token=tokens[0], type=obj_types[0]),
                headers=HEADERS)
            response.connection.close()
            if response.content:
                # The body is decoded as windows-1251, dropping undecodable bytes.
                info_text = response.content.decode('windows-1251', 'ignore')
                self.instance.information = info_text.replace(
                    ' class="trivia"', '')

    # From here on self.content is the parsed tree, not the raw text.
    self.content = html.fromstring(self.content)

    id_pattern = re.compile(r".+/name/(\d+)/")
    id_matches = id_pattern.findall(self.extract('id'))
    # Raises IndexError when no "/name/<digits>/" fragment is present.
    self.instance.id = self.prepare_int(id_matches[0])

    self.instance.name = self.extract('name', to_str=True)
    self.instance.name_en = self.extract('name_en', to_str=True)

    # movies
    # NOTE(review): import kept function-local as in the original —
    # presumably to avoid a circular import; confirm before moving it.
    from kinopoisk.person import Role
    for link in self.extract('movies'):
        # The first ancestor yielded by iterancestors() carries the role type.
        work_type = [
            node.get('data-work-type') for node in link.iterancestors()
        ][0]
        self.instance.career.setdefault(work_type, [])
        self.instance.career[work_type].append(
            Role.get_parsed('role_link', link))

    self.instance.set_source('main_page')
def parse(self):
    """Fill ``self.instance`` from the page markup held in ``self.content``.

    The birth year and the optional "info" fragment come from the raw page
    text. ``self.content`` is then replaced by the ``html.fromstring``
    result, after which id, name, alternate name and movie roles are read
    through ``self.extract``. The instance is finally marked with the
    ``'main_page'`` source.
    """
    raw_page = self.content

    # Each match is (row label, cell content) of a two-cell table row.
    table_rows = re.findall(
        r'<tr\s*>\s*<td class="type">(.+?)</td>\s*<td[^>]*>(.+?)</td>\s*</tr>',
        raw_page,
        re.S,
    )
    # Keep only the cells of rows labelled 'дата рождения' (birth date).
    birth_cells = (
        cell for label, cell in table_rows if str(label) == 'дата рождения'
    )
    for cell in birth_cells:
        years = re.findall(
            r'<a href="/lists/m_act%5Bbirthday%5D%5Byear%5D/\d{4}/">(\d{4})</a>',
            cell,
        )
        if years:
            self.instance.year_birth = self.prepare_int(years[0])

    # The info fragment needs an existing id plus the token and object type
    # found in the page text.
    if self.instance.id:
        tokens = re.findall(r'xsrftoken = \'([^\']+)\'', raw_page)
        obj_types = re.findall(r'objType: \'([^\']+)\'', raw_page)
        if tokens and obj_types:
            info_url = self.instance.get_url(
                'info', token=tokens[0], type=obj_types[0])
            response = self.request.get(info_url, headers=HEADERS)
            response.connection.close()
            if response.content:
                # The body is decoded as windows-1251, dropping undecodable bytes.
                self.instance.information = (
                    response.content
                    .decode('windows-1251', 'ignore')
                    .replace(' class="trivia"', '')
                )

    # From here on self.content is the parsed tree, not the raw text.
    self.content = html.fromstring(raw_page)

    id_source = self.extract('id')
    # Raises IndexError when no "/name/<digits>/" fragment is present.
    person_id = re.findall(r".+/name/(\d+)/", id_source)[0]
    self.instance.id = self.prepare_int(person_id)
    self.instance.name = self.extract('name', to_str=True)
    self.instance.name_en = self.extract('name_en', to_str=True)

    # movies
    # NOTE(review): import kept function-local as in the original —
    # presumably to avoid a circular import; confirm before moving it.
    from kinopoisk.person import Role
    for role_link in self.extract('movies'):
        # The role type is the 'data-work-type' attribute of the first
        # ancestor yielded by iterancestors().
        ancestor_types = [
            ancestor.get('data-work-type')
            for ancestor in role_link.iterancestors()
        ]
        work_type = ancestor_types[0]
        self.instance.career.setdefault(work_type, []).append(
            Role.get_parsed('role_link', role_link))

    self.instance.set_source('main_page')