def parse_description(self, html_text):
    """Parse a forum post description page into self._dict.

    Extracts tagged metadata (title/genre/plot/...), the 'gold' flag,
    the IMDb id, a thumbnail and the optional KinoPoisk id, then calls
    self.make_movie_api().

    Returns False when more than one distinct IMDb link is found
    (ambiguous page), True otherwise.
    """
    self.soup = BeautifulSoup(clean_html(html_text), 'html.parser')
    tag = u''

    # 'gold' torrents are marked with a dedicated icon.
    self._dict['gold'] = False
    for a in self.soup.select('img[src="images/gold.gif"]'):
        self._dict['gold'] = True
        debug('gold')

    # Each "<span>Label:</span> value" pair inside the post body maps to
    # one metadata tag; get_tag() translates the label into a dict key.
    for span in self.soup.select('span.postbody span'):
        try:
            text = span.get_text()
            tag = self.get_tag(text)
            if tag != '':
                if tag != u'plot':
                    self._dict[tag] = base.striphtml(unicode(span.next_sibling).strip())
                else:
                    # The plot text sits one sibling further (after a <br>).
                    self._dict[tag] = base.striphtml(unicode(span.next_sibling.next_sibling).strip())
                debug('%s (%s): %s' % (text.encode('utf-8'), tag.encode('utf-8'), self._dict[tag].encode('utf-8')))
        except Exception:
            # Best-effort scraping: skip malformed entries.  Was a bare
            # 'except:' which also swallowed SystemExit/KeyboardInterrupt.
            pass

    if 'genre' in self._dict:
        self._dict['genre'] = self._dict['genre'].lower().replace('.', '')

    # Collect the IMDb id; more than one distinct link means the page is
    # ambiguous and is rejected.
    count_id = 0
    for a in self.soup.select('a[href*="www.imdb.com/title/"]'):
        try:
            href = a['href']
            components = href.split('/')
            if components[2] == u'www.imdb.com' and components[3] == u'title':
                self._dict['imdb_id'] = components[4]
                count_id += 1
        except Exception:
            pass
    if count_id > 1:
        return False

    # Thumbnails are carried in the 'title' attribute of <var class="postImg">.
    for img in self.soup.select('var.postImg'):  # ('img.postImg'):
        try:
            self._dict['thumbnail'] = img['title']
            debug('!!!!!!!!!!!!!!thumbnail: ' + self._dict['thumbnail'])
            break
        except Exception:
            pass

    self.parse_country_studio()

    if self.settings:
        if self.settings.use_kinopoisk:
            for kp_id in self.soup.select('#kp_id'):
                self._dict['kp_id'] = kp_id['href']

    self.make_movie_api(self.get_value('imdb_id'), self.get_value('kp_id'))
    return True
def parse_description(self, html_text):
    """Parse a forum post description page into self._dict.

    Extracts tagged metadata, the 'gold' flag, the IMDb id, a thumbnail
    (stripping a '?link=' redirect wrapper) and the KinoPoisk id
    (preferring 'div.kpi a' over '#kp_id'), then calls
    self.make_movie_api() with self.settings.

    Returns False when more than one distinct IMDb link is found
    (ambiguous page), True otherwise.
    """
    self.soup = BeautifulSoup(clean_html(html_text), 'html.parser')
    tag = u''

    # 'gold' torrents are marked with a dedicated icon.
    self._dict['gold'] = False
    for a in self.soup.select('img[src="images/gold.gif"]'):
        self._dict['gold'] = True
        debug('gold')

    # Each "<span>Label:</span> value" pair inside the post body maps to
    # one metadata tag; get_tag() translates the label into a dict key.
    for span in self.soup.select('.postbody span'):
        try:
            text = span.get_text()
            tag = self.get_tag(text)
            if tag != '':
                if tag != u'plot':
                    self._dict[tag] = base.striphtml(
                        unicode(span.next_sibling).strip())
                else:
                    # The plot text sits one sibling further (after a <br>).
                    self._dict[tag] = base.striphtml(
                        unicode(span.next_sibling.next_sibling).strip())
                debug('%s (%s): %s' % (text.encode('utf-8'), tag.encode('utf-8'),
                                       self._dict[tag].encode('utf-8')))
        except Exception:
            # Best-effort scraping: skip malformed entries.  Was a bare
            # 'except:' which also swallowed SystemExit/KeyboardInterrupt.
            pass

    if 'genre' in self._dict:
        self._dict['genre'] = self._dict['genre'].replace('.', '')

    # Collect the IMDb id; more than one distinct link means the page is
    # ambiguous and is rejected.
    count_id = 0
    for a in self.soup.select('a[href*="www.imdb.com/title/"]'):
        try:
            href = a['href']
            components = href.split('/')
            if components[2] == u'www.imdb.com' and components[3] == u'title':
                self._dict['imdb_id'] = components[4]
                count_id += 1
        except Exception:
            pass
    if count_id > 1:
        return False

    # Primary thumbnail source: 'title' attribute of <var class="postImg">;
    # '?link=' wraps the real URL in a redirect, keep only the target.
    img = self.soup.find('var', class_='postImg')
    if img:
        try:
            self._dict['thumbnail'] = img['title'].split('?link=')[-1]
            debug('!!!!!!!!!!!!!!thumbnail: ' + self._dict['thumbnail'])
        except Exception:
            pass

    # Fallback: first direct <img> child of the post body.
    if 'thumbnail' not in self._dict:
        imgs = self.soup.select('span.postbody > img')
        try:
            self._dict['thumbnail'] = imgs[0]['src'].split('?link=')[-1]
            debug('!!!!!!!!!!!!!!thumbnail: ' + self._dict['thumbnail'])
        except Exception:
            pass

    self.parse_country_studio()

    # KinoPoisk id: prefer the 'div.kpi a' link, fall back to '#kp_id'.
    # select_one may raise TypeError on malformed markup.
    try:
        kp = self.soup.select_one('div.kpi a')
    except TypeError:
        kp = None
    if not kp:
        try:
            kp = self.soup.select_one('#kp_id')
        except TypeError:
            kp = None
    if kp:
        self._dict['kp_id'] = kp['href']

    self.make_movie_api(self.get_value('imdb_id'), self.get_value('kp_id'),
                        self.settings)
    return True
def parse_description(self, html_text): from HTMLParser import HTMLParseError html_text = clean_html(html_text) try: self.soup = BeautifulSoup(html_text, 'html.parser') except HTMLParseError as e: log.print_tb(e) log.debug(html_text) return False tag = u'' for b in self.soup.select('#details b'): try: text = b.get_text() tag = self.get_tag(text) if tag == 'plot': plot = base.striphtml( unicode(b.next_sibling.next_sibling).strip()) if plot: self._dict[tag] = plot debug('%s (%s): %s' % (text.encode('utf-8'), tag.encode('utf-8'), self._dict[tag].encode('utf-8'))) elif tag == 'genre': genres = [] elements = b.findNextSiblings('a') for a in elements: if '/tag/' in a['href']: genres.append(a.get_text()) self._dict[tag] = u', '.join(genres) elif tag != '': self._dict[tag] = base.striphtml( unicode(b.next_sibling).strip()) debug('%s (%s): %s' % (text.encode('utf-8'), tag.encode('utf-8'), self._dict[tag].encode('utf-8'))) except: pass tags = [] for tag in [ u'title', u'year', u'genre', u'director', u'actor', u'plot' ]: if tag not in self._dict: tags.append(tag) if tags: try: details = self.soup.select_one('#details').get_text() lines = details.split('\n') for l in lines: if ':' in l: key, desc = l.split(':', 1) key = key.strip(u' \r\n\t✦═') desc = desc.strip(u' \r\n\t') tag = self.get_tag(key + ':') if tag and desc and tag not in self._dict: self._dict[tag] = desc except BaseException as e: debug('No parse #details') debug(e) pass if 'genre' in self._dict: self._dict['genre'] = self._dict['genre'].lower().replace('.', '') if 'video' in self._dict: self._dict['video'] = self._dict['video'].replace('|', ',') if self.settings.rutor_nosd: video = self._dict['video'] parts = video.split(',') for part in parts: part = part.strip() if 'XviD' in part: return False m = re.search(ur'(\d+)[xXхХ](\d+)', part) if m: w = int(m.group(1)) #h = int(m.group(2)) if w < 1280: return False else: pass count_id = 0 for a in self.soup.select('a[href*="www.imdb.com/title/"]'): try: href = a['href'] 
components = href.split('/') if components[2] == u'www.imdb.com' and components[ 3] == u'title': self._dict['imdb_id'] = components[4] count_id += 1 except: pass if count_id == 0: div_index = self.soup.select('#index') if div_index: for a in div_index[0].findAll('a', recursive=True): if '/torrent/' in a['href']: parts = a['href'].split('/') href = parts[0] + '/' + parts[1] + '/' + parts[2] html = urllib2.urlopen(real_url(href, self.settings)) soup = BeautifulSoup(clean_html(html.read()), 'html.parser') for a in soup.select('a[href*="www.imdb.com/title/"]'): try: href = a['href'] components = href.split('/') if components[ 2] == u'www.imdb.com' and components[ 3] == u'title': self._dict['imdb_id'] = components[4] count_id += 1 except: pass if 'imdb_id' in self._dict: break if count_id > 1: return False if 'imdb_id' not in self._dict: if not hasattr(self.settings, 'no_skip_by_imdb'): return False for det in self.soup.select('#details'): tr = det.find('tr', recursive=False) if tr: tds = tr.findAll('td', recursive=False) if len(tds) > 1: td = tds[1] img = td.find('img') try: self._dict['thumbnail'] = img['src'] debug('!!!!!!!!!!!!!!thumbnail: ' + self._dict['thumbnail']) break except: pass for kp_id in self.soup.select('a[href*="www.kinopoisk.ru/"]'): self._dict['kp_id'] = kp_id['href'] self.make_movie_api(self.get_value('imdb_id'), self.get_value('kp_id'), self.settings) return True
def parse_description(self, html_text):
    """Parse a rutor-style torrent page ('#details' table) into self._dict.

    Extracts tagged metadata from '#details b' labels, resolves the IMDb
    id (following the first '/torrent/' link on index pages when needed)
    and the KinoPoisk link, then calls self.make_movie_api().

    Returns False on parse failure, on any missing required tag, on an
    ambiguous IMDb id, or on a missing IMDb id; True on success.
    """
    from HTMLParser import HTMLParseError
    html_text = clean_html(html_text)
    try:
        self.soup = BeautifulSoup(html_text, 'html.parser')
    except HTMLParseError as e:
        log.print_tb(e)
        log.debug(html_text)
        return False

    tag = u''
    for b in self.soup.select('#details b'):
        try:
            text = b.get_text()
            tag = self.get_tag(text)
            if tag == 'plot':
                # The plot text sits one sibling further (after a <br>).
                self._dict[tag] = base.striphtml(unicode(b.next_sibling.next_sibling).strip())
                debug('%s (%s): %s' % (text.encode('utf-8'), tag.encode('utf-8'), self._dict[tag].encode('utf-8')))
            elif tag == 'genre':
                # Genres are '/tag/' links following the label.
                genres = []
                elements = b.findNextSiblings('a')
                for a in elements:
                    if '/tag/' in a['href']:
                        genres.append(a.get_text())
                self._dict[tag] = u', '.join(genres)
            elif tag != '':
                self._dict[tag] = base.striphtml(unicode(b.next_sibling).strip())
                debug('%s (%s): %s' % (text.encode('utf-8'), tag.encode('utf-8'), self._dict[tag].encode('utf-8')))
        except Exception:
            # Best-effort scraping: skip malformed entries.  Was a bare
            # 'except:' which also swallowed SystemExit/KeyboardInterrupt.
            pass

    if 'genre' in self._dict:
        self._dict['genre'] = self._dict['genre'].lower().replace('.', '')

    # All of these tags are required for a valid item.
    for tag in [u'title', u'year', u'genre', u'director', u'actor', u'plot']:
        if tag not in self._dict:
            return False

    # Collect the IMDb id from the page itself.
    count_id = 0
    for a in self.soup.select('a[href*="www.imdb.com/title/"]'):
        try:
            href = a['href']
            components = href.split('/')
            if components[2] == u'www.imdb.com' and components[3] == u'title':
                self._dict['imdb_id'] = components[4]
                count_id += 1
        except Exception:
            pass

    if count_id == 0:
        # Index page: follow the first '/torrent/' link and look for an
        # IMDb id on the linked detail page.
        div_index = self.soup.select('#index')
        if div_index:
            for a in div_index[0].findAll('a', recursive=True):
                if '/torrent/' in a['href']:
                    parts = a['href'].split('/')
                    href = parts[0] + '/' + parts[1] + '/' + parts[2]
                    # BUG FIX: original read real_url(href, settings) —
                    # 'settings' is undefined in this scope and raised
                    # NameError whenever this fallback ran; the sibling
                    # parser variant correctly passes self.settings.
                    html = urllib2.urlopen(real_url(href, self.settings))
                    soup = BeautifulSoup(clean_html(html.read()), 'html.parser')
                    for a in soup.select('a[href*="www.imdb.com/title/"]'):
                        try:
                            href = a['href']
                            components = href.split('/')
                            if components[2] == u'www.imdb.com' and components[3] == u'title':
                                self._dict['imdb_id'] = components[4]
                                count_id += 1
                        except Exception:
                            pass
                    if 'imdb_id' in self._dict:
                        break

    if count_id > 1:
        return False
    if 'imdb_id' not in self._dict:
        return False

    # Thumbnail: first <img> in the second cell of the '#details' table.
    for det in self.soup.select('#details'):
        tr = det.find('tr', recursive=False)
        if tr:
            tds = tr.findAll('td', recursive=False)
            if len(tds) > 1:
                td = tds[1]
                img = td.find('img')
                try:
                    self._dict['thumbnail'] = img['src']
                    debug('!!!!!!!!!!!!!!thumbnail: ' + self._dict['thumbnail'])
                    break
                except Exception:
                    pass

    if self.settings:
        if self.settings.use_kinopoisk:
            for kp_id in self.soup.select('a[href*="www.kinopoisk.ru/"]'):
                self._dict['kp_id'] = kp_id['href']

    self.make_movie_api(self.get_value('imdb_id'), self.get_value('kp_id'))
    return True