def parse(self, instance, content):
    """Populate ``instance`` from the HTML of a person's main page.

    Sets ``id`` (from the canonical link), ``name``, ``name_original`` and
    ``year_birth`` whenever the corresponding markup is found. If an id is
    available afterwards, the 'info' page is requested and its decoded body
    is stored in ``instance.information``. ``instance.set_source('main_page')``
    is always called last.

    :param instance: object to fill. Its ``id`` attribute is read even when
        no canonical link is present, so it must already exist.
    :param content: page HTML; expected to be decoded text, because row
        labels are compared against a text literal.
    """
    person_ids = re.findall(r"<link rel=\"canonical\" href=\"http://www.kinopoisk.ru/name/(\d+)/\" />", content)
    if person_ids:
        instance.id = self.prepare_int(person_ids[0])

    names = re.findall(r'<h1 class="moviename-big" itemprop="name">(.+?)</h1>', content)
    if names:
        instance.name = self.prepare_str(names[0])

    original_names = re.findall(r'<span itemprop="alternativeHeadline">([\w\s]+)\s+</span>', content)
    if original_names:
        instance.name_original = self.prepare_str(original_names[0])

    # Each info-table row is captured as a (label, raw cell HTML) pair.
    info_rows = re.findall(r'<tr\s*>\s*<td class="type">(.+?)</td>\s*<td[^>]*>(.+?)</td>\s*</tr>', content)
    # Compiled once here instead of on every matching row.
    birth_year_re = re.compile(r'<a href="/lists/m_act%5Bbirthday%5D%5Byear%5D/\d{4}/">(\d{4})</a>')
    for label, value in info_rows:
        # Compare text with text. Encoding the label to bytes and comparing
        # it with a native string literal is always False on Python 3, which
        # silently dropped the birth year.
        if label == u'дата рождения':
            birth_years = birth_year_re.findall(value)
            if birth_years:
                instance.year_birth = self.prepare_int(birth_years[0])

    if instance.id:
        response = get_request(instance.get_url('info'))
        if response.content:
            # The response body is decoded as windows-1251, dropping
            # undecodable bytes.
            instance.information = response.content.decode('windows-1251', 'ignore').replace(' class="trivia"', '')

    instance.set_source('main_page')
def parse(self, instance, content):
    """Fill ``instance`` with person data scraped from a main-page document.

    Looks for the canonical-link id, the display name, the original name and
    (inside the section delimited by the two HTML marker comments) the year
    of birth. When ``instance.id`` is truthy afterwards, the 'info' page is
    fetched and its decoded body is stored as ``instance.information``.
    ``instance.set_source('main_page')`` is called at the end in every case.

    :param instance: object being populated; ``instance.id`` is read
        unconditionally, so the attribute must exist beforehand.
    :param content: page HTML as text.
    """
    canonical_ids = re.findall(r"<link rel=\"canonical\" href=\"http://www.kinopoisk.ru/name/(\d+)/\" />", content)
    if canonical_ids:
        instance.id = self.prepare_int(canonical_ids[0])

    headings = re.findall(r'<h1 style="padding:0px;margin:0px" class="moviename-big">(.+?)</h1>', content)
    if headings:
        instance.name = self.prepare_str(headings[0])

    subtitles = re.findall(r'<span style="font-size:13px;color:#666">(.+?)</span>', content)
    if subtitles:
        instance.name_original = self.prepare_str(subtitles[0])

    # Only rows between the two marker comments are considered. A missing
    # marker makes find() return -1, which is used as a slice bound as-is.
    section_start = content.find(u'<!-- инфа об актере -->')
    section_end = content.find(u'<!-- /инфа об актере -->')
    section = content[section_start:section_end]
    rows = re.findall(r'<tr\s*>\s*<td class="type">(.+?)</td>\s*<td[^>]*>(.+?)</td>\s*</tr>', section)

    # The pattern expects the square brackets in the link to be
    # percent-encoded (%5B / %5D).
    birth_year_link = re.compile(r'<a href="/level/10/m_act%5Bbirthday%5D%5Byear%5D/\d{4}/">(\d{4})</a>')
    for label, cell in rows:
        if label != u'дата рождения':
            continue
        years = birth_year_link.findall(cell)
        if years:
            instance.year_birth = self.prepare_int(years[0])

    if instance.id:
        response = get_request(instance.get_url('info'))
        if response.content:
            decoded = response.content.decode('windows-1251', 'ignore')
            instance.information = decoded.replace(' class="trivia"','')

    instance.set_source('main_page')
def all(self):
    """Return one parsed object per ``div.premier_item`` on the listing page.

    The page is requested with the URL and parameters supplied by
    ``self.get_url_with_params()``, and its body is decoded as windows-1251
    with undecodable bytes ignored. For every ``<div class="premier_item">``
    element a fresh ``self.kinopoisk_object()`` is created and asked to parse
    that element under the 'premier_link' source name.

    :return: list of the created objects, in document order.
    """
    # Imported lazily, so the dependency is only needed when this is called.
    from BeautifulSoup import BeautifulSoup

    url, params = self.get_url_with_params()
    page = get_request(url, params=params).content.decode('windows-1251', 'ignore')
    soup = BeautifulSoup(page)

    parsed_items = []
    for premier_block in soup.findAll('div', {'class': 'premier_item'}):
        item = self.kinopoisk_object()
        item.parse('premier_link', premier_block)
        parsed_items.append(item)
    return parsed_items
def parse(self, instance, content):
    """Extract person attributes from main-page HTML into ``instance``.

    Attributes are only assigned when their markup is found:

    * ``id`` - number taken from the canonical ``/name/<id>/`` link;
    * ``name`` - contents of the ``moviename-big`` heading;
    * ``name_original`` - contents of the ``alternativeHeadline`` span;
    * ``year_birth`` - year linked from the birth-date row of the info table.

    If ``instance.id`` is truthy afterwards, the 'info' page is requested and
    its decoded body is saved as ``instance.information``. The method always
    finishes by calling ``instance.set_source('main_page')``.

    :param instance: object to populate; must already have an ``id``
        attribute, because it is read even if no canonical link matched.
    :param content: page HTML; expected to be decoded text, since row labels
        are compared with a text literal.
    """
    id_match = re.search(r"<link rel=\"canonical\" href=\"http://www.kinopoisk.ru/name/(\d+)/\" />", content)
    if id_match:
        instance.id = self.prepare_int(id_match.group(1))

    name_match = re.search(r'<h1 class="moviename-big" itemprop="name">(.+?)</h1>', content)
    if name_match:
        instance.name = self.prepare_str(name_match.group(1))

    original_match = re.search(r'<span itemprop="alternativeHeadline">([\w\s]+)\s+</span>', content)
    if original_match:
        instance.name_original = self.prepare_str(original_match.group(1))

    # Loop-invariant pattern, compiled once rather than per table row.
    birth_year_re = re.compile(r'<a href="/lists/m_act%5Bbirthday%5D%5Byear%5D/\d{4}/">(\d{4})</a>')
    table_rows = re.findall(r'<tr\s*>\s*<td class="type">(.+?)</td>\s*<td[^>]*>(.+?)</td>\s*</tr>', content)
    for label, cell_html in table_rows:
        # The label is compared as text. The previous bytes-vs-literal
        # comparison can never be true on Python 3, so the birth year was
        # never picked up there.
        if label != u'дата рождения':
            continue
        year_match = birth_year_re.search(cell_html)
        if year_match:
            instance.year_birth = self.prepare_int(year_match.group(1))

    if instance.id:
        response = get_request(instance.get_url('info'))
        if response.content:
            info_html = response.content.decode('windows-1251', 'ignore')
            instance.information = info_html.replace(' class="trivia"', '')

    instance.set_source('main_page')