Python stripHTML 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: utils

메소드/함수: stripHTML

hotexamples.com에서의 예제들: 8

Python stripHTML - 예제 8개를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 Python utils.stripHTML의 실제 사용 예제 가운데 평점이 가장 높은 것들입니다. 예제를 평가해 주시면 예제의 품질을 높이는 데 도움이 됩니다.

예제 #1

파일 보기

파일: google.py 프로젝트: gipi/Richie

 def calculator(self, query):
     """Send ``query`` to the search endpoint and return the conversion text.

     A copy of ``self.calcopts`` with ``q`` set to ``query`` is passed to
     ``self.ua.openurl``. The fetched document must match
     ``self.reConversionDetected``; group 1 of ``self.reConversionResult``
     is then run through ``stripHTML`` and returned.

     Raises:
         Exception: if ``self.reConversionDetected`` finds nothing in the
             fetched document.
     """
     # Work on a copy; self.calcopts itself is left untouched.
     opts = dict(self.calcopts)
     opts['q'] = query
     doc = self.ua.openurl(self.search, opts=opts)
     if not self.reConversionDetected.search(doc):
         # Call form of raise: valid on both Python 2 and Python 3
         # (the old ``raise Exception, msg`` form is Python-2-only).
         raise Exception('no conversion detected')
     response = self.reConversionResult.search(doc).group(1)
     return stripHTML(response)

예제 #2

파일 보기

파일: google.py 프로젝트: gipi/Richie

 def spellcheck(self, query):
     """Return the suggested spelling for ``query``, or ``query`` itself.

     A copy of ``self.spellcheck_opts`` with ``q`` set to ``query`` is sent
     through ``self.ua.openurl`` (with ``self.baseurl`` as referer). Group 1
     of ``self.correct`` in the response is passed through ``stripHTML`` and
     returned. If extracting or cleaning the suggestion fails for any
     reason, the original ``query`` is returned unchanged.
     """
     opts = dict(self.spellcheck_opts)
     opts['q'] = query
     page = self.ua.openurl(self.search, opts=opts, referer=self.baseurl)
     try:
         suggestion = self.correct.search(page).group(1)
         return stripHTML(suggestion)
     except Exception:
         # Best effort: no suggestion found (or it could not be cleaned), so
         # fall back to the query. ``Exception`` rather than a bare except
         # lets KeyboardInterrupt/SystemExit propagate.
         return query

예제 #3

파일 보기

파일: google.py 프로젝트: compbrain/madcow

 def clock(self, query):
     """Use google to look up time in a given location.

     Fetches ``self.search`` with the query ``'time in <query>'``, parses the
     result with ``BeautifulSoup``, locates the ``<td>`` following the
     ``<img>`` whose ``src`` matches ``self.clock_re``, and returns that
     cell's contents with markup stripped and surrounding whitespace removed.

     Returns:
         The cleaned text, or ``None`` if any step of the lookup fails.
     """
     try:
         doc = self.ua.open(self.search, {'q': 'time in %s' % query})
         soup = BeautifulSoup(doc)
         # Named ``cell`` rather than ``time`` to avoid shadowing the
         # standard-library module name.
         cell = soup.find('img', src=self.clock_re).findNext('td')
         try:
             # Drop a nested table from the cell when there is one.
             cell.find('table').extract()
         except AttributeError:
             # No nested table to remove; carry on with the cell as it is.
             pass
         return stripHTML(cell.renderContents().decode('utf-8')).strip()
     except Exception:
         # Deliberate best effort: report "no answer" instead of raising, but
         # let KeyboardInterrupt/SystemExit through (a bare except would not).
         return None

예제 #4

파일 보기

    def get_summary(self, query):
        """Build a plain-text summary of the page found for ``query``.

        ``self.get_soup(query)`` supplies the parsed page and its title.

        * Disambiguation page (a ``div`` with id ``disambig`` is present):
          returns ``'<title> (Disambiguation) - '`` followed by the link
          titles of the first ``<ul>``, comma separated, for as long as they
          fit within ``self.summary_size``. If that layout cannot be read, a
          fixed placeholder message is returned.
        * Title equal to ``self.error``: returns a "No results found" message.
        * Otherwise: returns ``'<title> -'`` followed by as many sentences of
          the cleaned paragraph text as fit within ``self.summary_size``.
        """
        soup, title = self.get_soup(query)

        # Disambiguation pages follow no consistent style guide, so only the
        # most common format (a list of links) is attempted; anything else
        # yields a friendly failure message.
        if soup.find('div', attrs={'id': 'disambig'}):
            try:
                summary = '%s (Disambiguation) - ' % title
                for link in soup.find('ul').findAll('a'):
                    # Separate name so the page title is not overwritten.
                    link_title = str(link['title']).strip()
                    # +2 leaves room for the ', ' separator.
                    if len(summary) + len(link_title) + 2 > self.summary_size:
                        break
                    # The prefix ends with a space, so the first entry gets
                    # no leading separator.
                    if not summary.endswith(' '):
                        summary += ', '
                    summary += link_title
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit propagate.
                summary = 'Fancy, unsupported disambiguation page!'
            return summary

        # Massage into plain text by concatenating the paragraphs.
        content = ' '.join(str(para) for para in soup.findAll('p'))

        # Clean up the rendered text.
        content = stripHTML(content)  # strip markup
        content = Wiki._citations.sub('', content)  # remove citations
        content = Wiki._parens.sub('', content)  # remove parentheticals
        content = Wiki._whitespace.sub(' ', content)  # compress whitespace
        content = Wiki._fix_punc.sub(r'\1', content)  # fix punctuation
        content = content.strip()  # strip whitespace

        # Search error.
        if title == self.error:
            return 'No results found for "%s"' % query

        # Add as many sentences as possible before hitting the size limit;
        # the +1 accounts for the space placed before each sentence.
        summary = '%s -' % title
        for sentence in Wiki._sentence.findall(content):
            if len(summary) + 1 + len(sentence) > self.summary_size:
                break
            summary += ' %s' % sentence
        return summary

예제 #5

파일 보기

파일: wiki.py 프로젝트: gipi/Richie

    def get_summary(self, query):
        """Return a short plain-text summary for ``query``.

        The parsed page and its title come from ``self.get_soup(query)``.
        Three outcomes are possible:

        1. The page contains a ``div`` with id ``disambig``: the result is
           ``'<title> (Disambiguation) - '`` plus the comma-separated link
           titles from the first ``<ul>`` that fit in ``self.summary_size``,
           or a fixed placeholder string if reading the links fails.
        2. The title equals ``self.error``: a "No results found" message.
        3. Anything else: ``'<title> -'`` plus the leading sentences of the
           cleaned-up paragraph text that fit in ``self.summary_size``.
        """
        soup, title = self.get_soup(query)

        # There is no consistent style guide for disambiguation pages, so
        # only the common "list of links" shape is handled; any failure ends
        # in a friendly message rather than an error.
        is_disambiguation = soup.find('div', attrs={'id': 'disambig'})
        if is_disambiguation:
            try:
                summary = '%s (Disambiguation) - ' % title
                for anchor in soup.find('ul').findAll('a'):
                    # Kept apart from ``title`` so the page title survives.
                    entry = str(anchor['title']).strip()
                    # Reserve two characters for the ', ' separator.
                    if len(summary) + len(entry) + 2 > self.summary_size:
                        break
                    # Only entries after the first need a separator: the
                    # initial prefix already ends in a space.
                    if not summary.endswith(' '):
                        summary += ', '
                    summary += entry
            except Exception:
                # Narrower than a bare except, so KeyboardInterrupt and
                # SystemExit are not swallowed.
                summary = 'Fancy, unsupported disambiguation page!'
            return summary

        # Concatenate all paragraphs into one string of markup.
        paragraphs = [str(para) for para in soup.findAll('p')]
        content = ' '.join(paragraphs)

        # Clean up the rendered text.
        content = stripHTML(content)                 # strip markup
        content = Wiki._citations.sub('', content)   # remove citations
        content = Wiki._parens.sub('', content)      # remove parentheticals
        content = Wiki._whitespace.sub(' ', content) # compress whitespace
        content = Wiki._fix_punc.sub(r'\1', content) # fix punctuation
        content = content.strip()                    # strip whitespace

        # Search error.
        if title == self.error:
            return 'No results found for "%s"' % query

        # Append sentences until the next one would exceed the size limit
        # (the extra 1 is the space inserted before each sentence).
        summary = '%s -' % title
        for sentence in Wiki._sentence.findall(content):
            if len(summary) + 1 + len(sentence) > self.summary_size:
                break
            summary += ' %s' % sentence
        return summary

예제 #6

파일 보기

파일: google.py 프로젝트: compbrain/madcow

    def calculator(self, query):
        """Try to use google calculator for given query.

        A copy of ``self.calcopts`` with ``q`` set to ``query`` is passed to
        ``self.ua.open``. The fetched document must match
        ``self.reConversionDetected``; group 1 of ``self.reConversionResult``
        is then split on ``self.sup_re``, every piece that matches
        ``self.sup_re`` is rewritten with ``superscript``, and the re-joined
        text is returned after ``stripHTML``.

        Raises:
            Exception: if ``self.reConversionDetected`` finds nothing in the
                fetched document.
        """
        opts = dict(self.calcopts)
        opts[u'q'] = query
        doc = self.ua.open(self.search, opts=opts)
        if not self.reConversionDetected.search(doc):
            # Call form of raise: valid on both Python 2 and Python 3
            # (the old ``raise Exception, msg`` form is Python-2-only).
            raise Exception(u'no conversion detected')
        response = self.reConversionResult.search(doc).group(1)

        # Turn superscript pieces into their converted form, leaving every
        # other piece of the response untouched.
        parts = [
            superscript(part) if self.sup_re.match(part) else part
            for part in self.sup_re.split(response)
        ]
        response = u''.join(parts)

        return stripHTML(response)

예제 #7

파일 보기

 def extractTextFrom(self, soup):
     """Return the visible text of ``soup`` after ``utils.stripHTML``.

     All text nodes of ``soup`` are collected, those rejected by
     ``utils.isVisible`` are dropped, the remainder is joined with single
     spaces, encoded as UTF-8 and handed to ``utils.stripHTML``.
     """
     nodes = soup.findAll(text=True)
     visible = [node for node in nodes if utils.isVisible(node)]
     joined = " ".join(visible)
     return utils.stripHTML(joined.encode('utf-8'))

예제 #8

파일 보기

파일: PageParser.py 프로젝트: helgso/WebCrawler

 def extractTextFrom(self, soup):
     """Extract the text of ``soup`` that ``utils.isVisible`` accepts.

     The accepted text nodes are joined with a single space, the result is
     encoded as UTF-8, and the return value of ``utils.stripHTML`` on those
     bytes is passed back to the caller.
     """
     visible_text = " ".join(
         piece for piece in soup.findAll(text=True) if utils.isVisible(piece)
     )
     encoded = visible_text.encode('utf-8')
     return utils.stripHTML(encoded)