Python striphtml示例，base.striphtml Python示例

示例#1

0

显示文件

文件： nnmclub.py 项目： vadyur/script.media.aggregator

	def parse_description(self, html_text):
		self.soup = BeautifulSoup(clean_html(html_text), 'html.parser')

		tag = u''
		self._dict['gold'] = False
		for a in self.soup.select('img[src="images/gold.gif"]'):
			self._dict['gold'] = True
			debug('gold')

		for span in self.soup.select('span.postbody span'):
			try:
				text = span.get_text()
				tag = self.get_tag(text)
				if tag != '':
					if tag != u'plot':
						self._dict[tag] = base.striphtml(unicode(span.next_sibling).strip())
					else:
						self._dict[tag] = base.striphtml(unicode(span.next_sibling.next_sibling).strip())
					debug('%s (%s): %s' % (text.encode('utf-8'), tag.encode('utf-8'), self._dict[tag].encode('utf-8')))
			except:
				pass
		if 'genre' in self._dict:
			self._dict['genre'] = self._dict['genre'].lower().replace('.', '')

		count_id = 0
		for a in self.soup.select('a[href*="www.imdb.com/title/"]'):
			try:
				href = a['href']

				components = href.split('/')
				if components[2] == u'www.imdb.com' and components[3] == u'title':
					self._dict['imdb_id'] = components[4]
					count_id += 1
			except:
				pass

		if count_id > 1:
			return False

		for img in self.soup.select('var.postImg'):  # ('img.postImg'):
			try:
				self._dict['thumbnail'] = img['title']
				debug('!!!!!!!!!!!!!!thumbnail: ' + self._dict['thumbnail'])
				break
			except:
				pass

		self.parse_country_studio()

		if self.settings:
			if self.settings.use_kinopoisk:
				for kp_id in self.soup.select('#kp_id'):
					self._dict['kp_id'] = kp_id['href']

		self.make_movie_api(self.get_value('imdb_id'), self.get_value('kp_id'))

		return True

示例#2

0

显示文件

文件： nnmclub.py 项目： Enialixe/script.media.aggregator

    def parse_description(self, html_text):
        self.soup = BeautifulSoup(clean_html(html_text), 'html.parser')

        tag = u''
        self._dict['gold'] = False
        for a in self.soup.select('img[src="images/gold.gif"]'):
            self._dict['gold'] = True
            debug('gold')

        for span in self.soup.select('.postbody span'):
            try:
                text = span.get_text()
                tag = self.get_tag(text)
                if tag != '':
                    if tag != u'plot':
                        self._dict[tag] = base.striphtml(
                            unicode(span.next_sibling).strip())
                    else:
                        self._dict[tag] = base.striphtml(
                            unicode(span.next_sibling.next_sibling).strip())
                    debug('%s (%s): %s' %
                          (text.encode('utf-8'), tag.encode('utf-8'),
                           self._dict[tag].encode('utf-8')))
            except:
                pass
        if 'genre' in self._dict:
            self._dict['genre'] = self._dict['genre'].replace('.', '')

        count_id = 0
        for a in self.soup.select('a[href*="www.imdb.com/title/"]'):
            try:
                href = a['href']

                components = href.split('/')
                if components[2] == u'www.imdb.com' and components[
                        3] == u'title':
                    self._dict['imdb_id'] = components[4]
                    count_id += 1
            except:
                pass

        if count_id > 1:
            return False

        img = self.soup.find('var', class_='postImg')
        if img:
            try:
                self._dict['thumbnail'] = img['title'].split('?link=')[-1]
                debug('!!!!!!!!!!!!!!thumbnail: ' + self._dict['thumbnail'])
            except:
                pass

        if 'thumbnail' not in self._dict:
            imgs = self.soup.select('span.postbody > img')
            try:
                self._dict['thumbnail'] = imgs[0]['src'].split('?link=')[-1]
                debug('!!!!!!!!!!!!!!thumbnail: ' + self._dict['thumbnail'])
            except BaseException as e:
                pass

        self.parse_country_studio()

        try:
            kp = self.soup.select_one('div.kpi a')
        except TypeError:
            kp = None
        if not kp:
            try:
                kp = self.soup.select_one('#kp_id')
            except TypeError:
                kp = None
        if kp:
            self._dict['kp_id'] = kp['href']

        self.make_movie_api(self.get_value('imdb_id'), self.get_value('kp_id'),
                            self.settings)

        return True

示例#3

0

显示文件

    def parse_description(self, html_text):
        from HTMLParser import HTMLParseError

        html_text = clean_html(html_text)
        try:
            self.soup = BeautifulSoup(html_text, 'html.parser')
        except HTMLParseError as e:
            log.print_tb(e)
            log.debug(html_text)
            return False

        tag = u''

        for b in self.soup.select('#details b'):
            try:
                text = b.get_text()
                tag = self.get_tag(text)
                if tag == 'plot':
                    plot = base.striphtml(
                        unicode(b.next_sibling.next_sibling).strip())
                    if plot:
                        self._dict[tag] = plot
                        debug('%s (%s): %s' %
                              (text.encode('utf-8'), tag.encode('utf-8'),
                               self._dict[tag].encode('utf-8')))
                elif tag == 'genre':
                    genres = []
                    elements = b.findNextSiblings('a')
                    for a in elements:
                        if '/tag/' in a['href']:
                            genres.append(a.get_text())

                    self._dict[tag] = u', '.join(genres)

                elif tag != '':
                    self._dict[tag] = base.striphtml(
                        unicode(b.next_sibling).strip())
                    debug('%s (%s): %s' %
                          (text.encode('utf-8'), tag.encode('utf-8'),
                           self._dict[tag].encode('utf-8')))
            except:
                pass

        tags = []
        for tag in [
                u'title', u'year', u'genre', u'director', u'actor', u'plot'
        ]:
            if tag not in self._dict:
                tags.append(tag)

        if tags:
            try:
                details = self.soup.select_one('#details').get_text()
                lines = details.split('\n')
                for l in lines:
                    if ':' in l:
                        key, desc = l.split(':', 1)
                        key = key.strip(u' \r\n\t✦═')
                        desc = desc.strip(u' \r\n\t')

                        tag = self.get_tag(key + ':')
                        if tag and desc and tag not in self._dict:
                            self._dict[tag] = desc
            except BaseException as e:
                debug('No parse #details')
                debug(e)
                pass

        if 'genre' in self._dict:
            self._dict['genre'] = self._dict['genre'].lower().replace('.', '')

        if 'video' in self._dict:
            self._dict['video'] = self._dict['video'].replace('|', ',')

            if self.settings.rutor_nosd:
                video = self._dict['video']
                parts = video.split(',')

                for part in parts:
                    part = part.strip()

                    if 'XviD' in part:
                        return False

                    m = re.search(ur'(\d+)[xXхХ](\d+)', part)
                    if m:
                        w = int(m.group(1))
                        #h = int(m.group(2))
                        if w < 1280:
                            return False
        else:
            pass

        count_id = 0
        for a in self.soup.select('a[href*="www.imdb.com/title/"]'):
            try:
                href = a['href']

                components = href.split('/')
                if components[2] == u'www.imdb.com' and components[
                        3] == u'title':
                    self._dict['imdb_id'] = components[4]
                    count_id += 1
            except:
                pass

        if count_id == 0:
            div_index = self.soup.select('#index')
            if div_index:
                for a in div_index[0].findAll('a', recursive=True):
                    if '/torrent/' in a['href']:
                        parts = a['href'].split('/')
                        href = parts[0] + '/' + parts[1] + '/' + parts[2]
                        html = urllib2.urlopen(real_url(href, self.settings))
                        soup = BeautifulSoup(clean_html(html.read()),
                                             'html.parser')

                        for a in soup.select('a[href*="www.imdb.com/title/"]'):
                            try:
                                href = a['href']

                                components = href.split('/')
                                if components[
                                        2] == u'www.imdb.com' and components[
                                            3] == u'title':
                                    self._dict['imdb_id'] = components[4]
                                    count_id += 1
                            except:
                                pass

                    if 'imdb_id' in self._dict:
                        break

        if count_id > 1:
            return False

        if 'imdb_id' not in self._dict:
            if not hasattr(self.settings, 'no_skip_by_imdb'):
                return False

        for det in self.soup.select('#details'):
            tr = det.find('tr', recursive=False)
            if tr:
                tds = tr.findAll('td', recursive=False)
                if len(tds) > 1:
                    td = tds[1]
                    img = td.find('img')
                    try:
                        self._dict['thumbnail'] = img['src']
                        debug('!!!!!!!!!!!!!!thumbnail: ' +
                              self._dict['thumbnail'])
                        break
                    except:
                        pass

        for kp_id in self.soup.select('a[href*="www.kinopoisk.ru/"]'):
            self._dict['kp_id'] = kp_id['href']

        self.make_movie_api(self.get_value('imdb_id'), self.get_value('kp_id'),
                            self.settings)

        return True

示例#4

0

显示文件

文件： rutor.py 项目： vadyur/script.media.aggregator

	def parse_description(self, html_text):
		from HTMLParser import HTMLParseError

		html_text = clean_html(html_text)
		try:
			self.soup = BeautifulSoup(html_text, 'html.parser')
		except HTMLParseError as e:
			log.print_tb(e)
			log.debug(html_text)
			return False

		tag = u''

		for b in self.soup.select('#details b'):
			try:
				text = b.get_text()
				tag = self.get_tag(text)
				if tag == 'plot':
					self._dict[tag] = base.striphtml(unicode(b.next_sibling.next_sibling).strip())
					debug('%s (%s): %s' % (text.encode('utf-8'), tag.encode('utf-8'), self._dict[tag].encode('utf-8')))
				elif tag == 'genre':
					genres = []
					elements = b.findNextSiblings('a')
					for a in elements:
						if '/tag/' in a['href']:
							genres.append(a.get_text())

					self._dict[tag] = u', '.join(genres)

				elif tag != '':
					self._dict[tag] = base.striphtml(unicode(b.next_sibling).strip())
					debug('%s (%s): %s' % (text.encode('utf-8'), tag.encode('utf-8'), self._dict[tag].encode('utf-8')))
			except:
				pass
		if 'genre' in self._dict:
			self._dict['genre'] = self._dict['genre'].lower().replace('.', '')


		for tag in [u'title', u'year', u'genre', u'director', u'actor', u'plot']:
			if tag not in self._dict:
				return False
		
		count_id = 0
		for a in self.soup.select('a[href*="www.imdb.com/title/"]'):
			try:
				href = a['href']

				components = href.split('/')
				if components[2] == u'www.imdb.com' and components[3] == u'title':
					self._dict['imdb_id'] = components[4]
					count_id += 1
			except:
				pass

		if count_id == 0:
			div_index = self.soup.select('#index')
			if div_index:
				for a in div_index[0].findAll('a', recursive=True):
					if '/torrent/' in a['href']:
						parts = a['href'].split('/')
						href = parts[0] + '/' + parts[1] + '/' + parts[2]
						html = urllib2.urlopen(real_url(href, settings))
						soup = BeautifulSoup(clean_html(html.read()), 'html.parser')

						for a in soup.select('a[href*="www.imdb.com/title/"]'):
							try:
								href = a['href']

								components = href.split('/')
								if components[2] == u'www.imdb.com' and components[3] == u'title':
									self._dict['imdb_id'] = components[4]
									count_id += 1
							except:
								pass

					if 'imdb_id' in self._dict:
						break

		if count_id > 1:
			return False

		if 'imdb_id' not in self._dict:
			return False

		for det in self.soup.select('#details'):
			tr = det.find('tr', recursive=False)
			if tr:
				tds = tr.findAll('td', recursive=False)
				if len(tds) > 1:
					td = tds[1]
					img = td.find('img')
					try:
						self._dict['thumbnail'] = img['src']
						debug('!!!!!!!!!!!!!!thumbnail: ' + self._dict['thumbnail'])
						break
					except:
						pass

		if self.settings:
			if self.settings.use_kinopoisk:
				for kp_id in self.soup.select('a[href*="www.kinopoisk.ru/"]'):
					self._dict['kp_id'] = kp_id['href']

		self.make_movie_api(self.get_value('imdb_id'), self.get_value('kp_id'))

		return True