Python BeautifulSoup примеры использования

Язык программирования: Python

Пространство имен/Пакет: mypack.util.htmlproc.BeautifulSoup

Класс/Тип: BeautifulSoup

Примеров на hotexamples.com: 4

Python BeautifulSoup - 4 примера найдено. Это лучшие примеры Python кода для mypack.util.htmlproc.BeautifulSoup.BeautifulSoup, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

BeautifulSoup(1)

findAll(1)

Пример #1

Показать файл

Файл: parser.py Проект: chaoswork/web-classify

    def __init__(self, url):
        """Download ``url``, normalise it to UTF-8 and build the cleaned soup.

        The page is fetched through ``MyOpener``, its encoding is guessed with
        ``chardet`` and the markup is re-encoded as UTF-8 before it is handed
        to ``BeautifulSoup``. ``self.__clear()`` is then called on the new
        soup.

        NOTE(review): the decode/encode round trip followed by ``re.sub`` with
        a text pattern relies on Python 2 string semantics.

        Args:
            url: address of the page to fetch.

        Raises:
            Myerror: if the encoding cannot be detected, or if an ``IOError``
                occurs while the page is being fetched and processed.
        """
        self.ns_li = []
        self.ns_len = 0  # total length of all collected text blocks

        try:
            # Encoding detection and conversion.
            myopener = MyOpener()
            html = myopener.open(url).read()
            encoding = chardet.detect(html)['encoding']

            if not encoding:
                raise Myerror("Sorry,cannot recognize encoding of the page!")
            html = html.decode(encoding, 'ignore').encode('utf-8', 'ignore')
            # BeautifulSoup cannot cope with Chinese text inside alt
            # attributes, so every alt=... attribute is stripped first.
            html = re.sub(
                r'''alt=(("[^"]*")|('[^']*')|([^"'\s]+))''', '', html)
            self.soup = BeautifulSoup(html)
            self.__clear()
        except IOError:
            # The statement that used to follow this raise could never run,
            # so it has been dropped.
            raise Myerror("Sorry,cannot open this page!")

Пример #2

Показать файл

Файл: parser.py Проект: ICTBigDataBench/web-classify

 def __init__(self,url):
     """Fetch the page at ``url`` and prepare a cleaned BeautifulSoup tree.

     Steps: open the URL with ``MyOpener``, detect the byte encoding with
     ``chardet``, re-encode the markup as UTF-8, strip ``alt`` attributes,
     parse with ``BeautifulSoup`` and finally call ``self.__clear()``.

     NOTE(review): applying ``re.sub`` with a text pattern to the re-encoded
     markup relies on Python 2 string semantics.

     Args:
         url: address of the page to fetch.

     Raises:
         Myerror: when no encoding can be detected, or when an ``IOError``
             is raised while fetching/processing the page.
     """
     self.ns_li = []
     self.ns_len = 0  # total length of all collected text blocks

     try:
         # Encoding detection and conversion.
         myopener = MyOpener()
         html = myopener.open(url).read()
         encoding = chardet.detect(html)['encoding']

         if not encoding:
             raise Myerror("Sorry,cannot recognize encoding of the page!")
         html = html.decode(encoding,'ignore').encode('utf-8','ignore')
         # BeautifulSoup cannot cope with Chinese text inside alt
         # attributes, so every alt=... attribute is removed up front.
         html = re.sub(
             r'''alt=(("[^"]*")|('[^']*')|([^"'\s]+))''', '', html)
         self.soup = BeautifulSoup(html)
         self.__clear()
     except IOError:
         # A statement placed after this raise was unreachable and has
         # been removed.
         raise Myerror("Sorry,cannot open this page!")

Пример #3

Показать файл

Файл: parser.py Проект: ICTBigDataBench/web-classify

class Parser:
    """Fetch an HTML page and collect its non-blank text nodes.

    Attributes:
        soup: the ``BeautifulSoup`` tree built by ``__init__``.
        ns_li: text nodes collected by ``ns()``, in traversal order.
        ns_len: summed length of the strings collected by ``ns()``.
    """

    def __init__(self,url):
        """Download ``url``, normalise it to UTF-8 and build the cleaned soup.

        NOTE(review): the decode/encode round trip followed by ``re.sub``
        with a text pattern relies on Python 2 string semantics.

        Args:
            url: address of the page to fetch.

        Raises:
            Myerror: if the encoding cannot be detected, or if an
                ``IOError`` occurs while fetching/processing the page.
        """
        self.ns_li = []
        self.ns_len = 0  # total length of all collected text blocks

        try:
            # Encoding detection and conversion.
            myopener = MyOpener()
            html = myopener.open(url).read()
            encoding = chardet.detect(html)['encoding']

            if not encoding:
                raise Myerror("Sorry,cannot recognize encoding of the page!")
            html = html.decode(encoding,'ignore').encode('utf-8','ignore')
            # BeautifulSoup cannot cope with Chinese text inside alt
            # attributes, so every alt=... attribute is stripped first.
            html = re.sub(
                r'''alt=(("[^"]*")|('[^']*')|([^"'\s]+))''', '', html)
            self.soup = BeautifulSoup(html)
            self.__clear()
        except IOError:
            # The statement that used to follow this raise could never run,
            # so it has been dropped.
            raise Myerror("Sorry,cannot open this page!")

    def __clear(self):
        """Remove comments, script/style tags and hidden elements in place."""
        # Drop comments.
        for comment in self.soup.findAll(
                text=lambda text: isinstance(text, Comment)):
            comment.extract()
        # Drop <script> and <style> elements.
        for trival in self.soup.findAll(['script','style']):
            trival.extract()
        # Drop elements whose inline style has display:none or a font-size
        # starting with 0.
        # TODO: style="font-size: 0px;
        hidden_style = re.compile(
            r".*((display\s*:\s*none)|(font-size\s*:\s*0)).*")
        for trival in self.soup.findAll(style=hidden_style):
            trival.extract()

    def get_title(self):
        """Return the node that best represents the page title.

        Candidates are the text nodes in ``self.ns_li`` (filled by ``ns()``)
        whose stripped text is non-empty and occurs inside the stripped
        ``<title>`` text. A later candidate replaces the current pick only
        when its stripped length exceeds the length of the current pick's
        string by more than 3. With no candidate, the ``<title>`` tag itself
        is returned.

        Raises:
            Myerror: if an ``AttributeError`` occurs during the lookup
                (for instance when the soup has no ``<title>``).
        """
        try:
            t = self.soup.title
            tag_t = t.string.strip()

            title = None
            # Look for the title inside the body text.
            for ns in self.ns_li:
                s = ns.string.strip()
                if s != '' and s in tag_t:
                    if title is None or 3 < len(s) - len(title.string):
                        title = ns
            if title is None:
                title = t
        except AttributeError:
            raise Myerror("Sorry,cannot access this page for host limitation!")
        return title

    def text_len(self):
        """Return the accumulated length of the text collected by ``ns()``."""
        return self.ns_len

    def ns(self):
        """Collect the non-blank text nodes reachable from ``<body>``.

        The walk follows ``.next`` links starting at ``self.soup.body``.
        Every ``NavigableString`` that is not made up solely of whitespace
        and ``&nbsp;`` is appended to ``self.ns_li`` and its length is added
        to ``self.ns_len``.

        Returns:
            A list of ``{'node': <text node>, 'offset': <int>}`` dicts, where
            ``offset`` counts the tags listed in ``BLOCK_TAGS`` that were
            passed since the previously collected text node. The list is
            empty when the soup has no ``<body>``.
        """
        # Compiled once instead of re-parsing the pattern on every node.
        blank = re.compile(r"^(\s|&nbsp;)*$")
        ns_list = []
        offset = 0  # block-tag distance from the previous collected string
        cur = self.soup.body
        # Test against None explicitly: checking the truthiness of cur.next
        # skipped the final node and stopped early on an empty text node.
        while cur is not None:
            if isinstance(cur, NavigableString):
                # Skip nodes that contain nothing but blanks.
                if not blank.match(cur.string):
                    ns_list.append({'node': cur, 'offset': offset})
                    self.ns_li.append(cur)
                    self.ns_len += len(cur.string)  # running total length
                    offset = 0
            elif cur.name.lower() in BLOCK_TAGS:
                offset += 1
            cur = cur.next
        return ns_list

Пример #4

Показать файл

Файл: parser.py Проект: chaoswork/web-classify

class Parser:
    """Load a web page and extract its meaningful text nodes.

    Attributes:
        soup: the ``BeautifulSoup`` tree created in ``__init__``.
        ns_li: text nodes gathered by ``ns()``, in document-walk order.
        ns_len: total length of the strings gathered by ``ns()``.
    """

    def __init__(self, url):
        """Fetch ``url``, convert it to UTF-8 and parse it into ``self.soup``.

        NOTE(review): running ``re.sub`` with a text pattern over the
        re-encoded markup relies on Python 2 string semantics.

        Args:
            url: address of the page to fetch.

        Raises:
            Myerror: when the page encoding cannot be detected, or when an
                ``IOError`` is raised while fetching/processing the page.
        """
        self.ns_li = []
        self.ns_len = 0  # total length of all collected text blocks

        try:
            # Encoding detection and conversion.
            myopener = MyOpener()
            html = myopener.open(url).read()
            encoding = chardet.detect(html)['encoding']

            if not encoding:
                raise Myerror("Sorry,cannot recognize encoding of the page!")
            html = html.decode(encoding, 'ignore').encode('utf-8', 'ignore')
            # BeautifulSoup cannot cope with Chinese text inside alt
            # attributes, so every alt=... attribute is removed up front.
            html = re.sub(
                r'''alt=(("[^"]*")|('[^']*')|([^"'\s]+))''', '', html)
            self.soup = BeautifulSoup(html)
            self.__clear()
        except IOError:
            # A statement placed after this raise was unreachable and has
            # been removed.
            raise Myerror("Sorry,cannot open this page!")

    def __clear(self):
        """Detach comments, script/style tags and hidden elements."""
        # Remove comments.
        for comment in self.soup.findAll(
                text=lambda text: isinstance(text, Comment)):
            comment.extract()
        # Remove <script> and <style> elements.
        for trival in self.soup.findAll(['script', 'style']):
            trival.extract()
        # Remove elements whose inline style has display:none or a font-size
        # starting with 0.
        # TODO: style="font-size: 0px;
        hidden_style = re.compile(
            r".*((display\s*:\s*none)|(font-size\s*:\s*0)).*")
        for trival in self.soup.findAll(style=hidden_style):
            trival.extract()

    def get_title(self):
        """Pick the text node that best matches the ``<title>`` tag.

        Only nodes already stored in ``self.ns_li`` (by ``ns()``) are
        considered. A node qualifies when its stripped text is non-empty and
        is contained in the stripped ``<title>`` text; a later node replaces
        the current choice only if its stripped length is more than 3
        greater than the length of the current choice's string. If nothing
        qualifies, the ``<title>`` tag itself is returned.

        Raises:
            Myerror: if an ``AttributeError`` occurs during the lookup
                (for instance when the soup has no ``<title>``).
        """
        try:
            t = self.soup.title
            tag_t = t.string.strip()

            title = None
            # Search the body text for the title.
            for ns in self.ns_li:
                s = ns.string.strip()
                if s != '' and s in tag_t:
                    if title is None or 3 < len(s) - len(title.string):
                        title = ns
            if title is None:
                title = t
        except AttributeError:
            raise Myerror("Sorry,cannot access this page for host limitation!")
        return title

    def text_len(self):
        """Return the total length of the text gathered by ``ns()``."""
        return self.ns_len

    def ns(self):
        """Extract the basic text blocks of the page.

        Starting at ``self.soup.body`` the method follows ``.next`` links.
        Each ``NavigableString`` that is not purely whitespace/``&nbsp;`` is
        stored in ``self.ns_li`` and its length is added to ``self.ns_len``.

        Returns:
            A list of ``{'node': <text node>, 'offset': <int>}`` dicts;
            ``offset`` is the number of tags found in ``BLOCK_TAGS`` that
            were passed since the previous stored text node. An empty list
            is returned when the soup has no ``<body>``.
        """
        # Hoisted so the pattern is not re-parsed for every node.
        blank = re.compile(r"^(\s|&nbsp;)*$")
        ns_list = []
        offset = 0  # block-tag distance from the previous stored string
        cur = self.soup.body
        # Compare with None directly: relying on the truthiness of cur.next
        # dropped the last node and ended the walk at an empty text node.
        while cur is not None:
            if isinstance(cur, NavigableString):
                # Ignore nodes consisting only of blanks.
                if not blank.match(cur.string):
                    ns_list.append({'node': cur, 'offset': offset})
                    self.ns_li.append(cur)
                    self.ns_len += len(cur.string)  # running total length
                    offset = 0
            elif cur.name.lower() in BLOCK_TAGS:
                offset += 1
            cur = cur.next
        return ns_list