Example #1
 def get_lyrics_from_url(self, url):
     page = geturl(url, referer=self.baseurl)
     soup = BeautifulSoup(page)
     content = soup.find('div', attrs={'id': 'content'})

     # strip nested divs, links and scripts so only the lyric text is left
     for div in content.findAll('div'):
         div.extract()
     for link in content.findAll('a'):
         link.extract()
     for script in content.findAll('script'):
         script.extract()

     # flatten the remaining nodes and normalize the break markup
     lines = [str(line) for line in content.contents]
     data = ''.join(lines)
     data = self._newline.sub('', data)
     data = self._leadbreak.sub('', data)
     data = self._endbreak.sub('', data)
     lines = self._break.split(data)

     # split into verses on blank-line boundaries
     verses = []
     while True:
         try:
             i = lines.index('')
             verse, lines = lines[:i], lines[i + 1:]
             verses.append(verse)
         except ValueError:
             verses.append(lines)
             break

     # collapse each verse into one " / "-delimited line
     for i, verse in enumerate(verses):
         verse = ' / '.join(verse)
         verse = whitespace.sub(' ', verse)
         verses[i] = verse

     # drop the site's boilerplate blurb if it survived as a verse
     if self._spam in verses:
         verses.remove(self._spam)
     return verses
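
The blank-line splitting loop above is the classic "accumulate until separator" idiom; the same grouping falls out of itertools.groupby. A minimal standalone sketch of that alternative (split_verses is an illustrative name, not part of the original module):

    import itertools

    def split_verses(lines):
        # empty strings are falsy, so key=bool groups the lines into
        # runs of verse text separated by runs of blank lines
        return [list(group) for nonblank, group
                in itertools.groupby(lines, key=bool) if nonblank]

    # split_verses(['a', 'b', '', 'c']) -> [['a', 'b'], ['c']]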
Example #2
 def clock(self, query):
     """Use google to look up time in a given location"""
     try:
         doc = self.ua.open(self.search, {'q': 'time in %s' % query})
         soup = BeautifulSoup(doc)
         # the local time is in the cell following the clock icon
         time = soup.find('img', src=self.clock_re).findNext('td')
         try:
             # discard any nested table so only the time text remains
             time.find('table').extract()
         except AttributeError:
             pass
         return stripHTML(time.renderContents().decode('utf-8')).strip()
     except:
         # no result, or the page layout changed; caller gets None
         pass
Example #3
File: war.py Project: gipi/Richie
 def bodycount(self):
     try:
         doc = geturl(IraqWar._bodycount_url)
         soup = BeautifulSoup(doc)
         data = soup.find('td', attrs={'class': 'main-num'})
         data = data.find('a')
         data = str(data.contents[0])
         data = stripHTML(data)
         data = IraqWar._re_whitespace.sub(' ', data)
         data = data.strip()
         return data
     except Exception, e:
         log.warn('error in %s: %s' % (self.__module__, e))
         log.exception(e)
         return 'UNKNOWN'
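
For contrast, the same lookup written against BeautifulSoup 4 and Python 3 might look like the sketch below; bodycount_from_html is an illustrative helper, and get_text() stands in for the str()/stripHTML() steps above:

    import re
    from bs4 import BeautifulSoup

    def bodycount_from_html(html):
        # find the headline figure inside the "main-num" cell's link
        soup = BeautifulSoup(html, 'html.parser')
        link = soup.find('td', class_='main-num').find('a')
        # get_text() flattens any nested markup to plain text
        return re.sub(r'\s+', ' ', link.get_text()).strip()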
Example #4
File: stupid.py Project: gipi/Richie
    def get_comment(self):
        page = geturl(self.url)

        # remove high ascii since this is going to IRC
        page = self.utf8.sub('', page)

        # create BeautifulSoup document tree
        soup = BeautifulSoup(page)
        table = soup.find('table')
        rows = table.findAll('tr')
        row = rows[1]
        cells = row.findAll('td')
        source = cells[1].string
        comment = cells[2].string
        author = cells[3].string
        return '<%s@%s> %s' % (author, source, comment)
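
One caveat with this approach: BeautifulSoup's .string is None whenever a cell holds nested markup, in which case the format string would render the literal text 'None'. A more defensive sketch of the same cell extraction (BeautifulSoup 4; row is a tr tag and format_comment is an illustrative name):

    def format_comment(row):
        # get_text() flattens nested tags, where .string would be None
        cells = [cell.get_text(strip=True) for cell in row.find_all('td')]
        source, comment, author = cells[1], cells[2], cells[3]
        return '<%s@%s> %s' % (author, source, comment)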
Example #5
File: hugs.py Project: compbrain/madcow
 def response(self, nick, args, kwargs):
     try:
         doc = geturl(self.random, add_headers={'Accept': '*/*'})
         soup = BeautifulSoup(doc)
         main = soup.find(u'div', attrs={u'id': u'main'})
         confs = main.findAll(u'div', attrs={u'class': u'content'})
         conf = random.choice(confs)
         conf = [unicode(p) for p in conf.findAll(u'p')]
         conf = u' '.join(conf)
         conf = stripHTML(conf)
         conf = conf.strip()
         return conf
     except Exception, error:
         log.warn(u'error in module %s' % self.__module__)
         log.exception(error)
         return u'%s: I had some issues with that..' % nick
Example #6
    def forecast(self, location):
        page = geturl(url=self.search, opts={'query': location},
                referer=self.baseurl)
        soup = BeautifulSoup(page)

        # disambiguation page
        if 'Search Results' in str(soup):
            table = soup.find('table', attrs={'class': 'boxB full'})
            rows = table.findAll('tr')
            results = []
            match = None
            for row in rows:
                cells = row.findAll('td', attrs={'class': 'sortC'})
                for cell in cells:
                    link = cell.find('a')
                    if link is None or 'addfav' in str(link['href']):
                        continue
                    city = str(link.contents[0])
                    href = urljoin(self.baseurl, str(link['href']))
                    results.append(city)
                    if city.lower() == location.lower():
                        # href was already joined to baseurl above
                        match = href
                        break
                if match:
                    break
            if match:
                page = geturl(url=match)
                soup = BeautifulSoup(page)
            else:
                return 'Multiple results found: %s' % ', '.join(results)

        rss_url = soup.find('link', attrs=self._rss_link)['href']
        rss = rssparser.parse(rss_url)
        title = str(soup.find('h1').string).strip()
        conditions = stripHTML(rss['items'][0]['description'])
        fields = self._bar.split(conditions)
        data = {}
        for field in fields:
            try:
                key, val = self._keyval.search(field).groups()
                data[key] = val
            except:
                pass

        try:
            temp = float(self._tempF.search(data['Temperature']).group(1))
            blink = False
            # map the temperature to an mIRC color code, coldest to hottest
            if temp < 0:
                color = 6
            elif temp < 40:
                color = 2
            elif temp < 60:
                color = 10
            elif temp < 80:
                color = 3
            elif temp < 90:
                color = 7
            elif temp < 100:
                color = 5
            else:
                color = 5
                blink = True
            # \x03<n> starts mIRC color n, \x0F resets formatting
            data['Temperature'] = '\x03%s\x16\x16%s\x0F' % (color,
                    data['Temperature'])
            if blink:
                # ANSI blink escape for 100F and above
                data['Temperature'] = '\x1b[5m' + data['Temperature'] + \
                        '\x1b[0m'

        except:
            pass

        output = []
        for key, val in data.items():
            line = '%s: %s' % (key, val)
            output.append(line)

        output = ' | '.join(output)

        return '%s: %s' % (title, output)
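
The color ladder in the middle is really a table of band boundaries; as a sketch, the same mapping can be made data-driven (bounds and mIRC color codes copied from the code above, names illustrative):

    # (upper bound, mIRC color code), scanned coldest to hottest
    _BANDS = [(0, 6), (40, 2), (60, 10), (80, 3), (90, 7), (100, 5)]

    def temp_color(temp):
        for bound, color in _BANDS:
            if temp < bound:
                return color, False      # (color, blink)
        return 5, True                   # 100F and up blinks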
Example #7
class PageProcessor(object):

    def __init__(self, html):
        self.html = html
        self.soup = BeautifulSoup(self.html)

    def isFirstPage(self):
        # only the first page of a thread contains the original post div
        return self.soup.find('div', {'class': 'userMsg',
                                      'id': 'firstPostText'}) is not None

    def getTitle(self):
        if self.isFirstPage():
            title = self.soup.find('div', {'class': 'post_title'}).findAll(
                    lambda tag: tag.name == 'a', text=True)
            title = ''.join(title)
            # strip ad-region markers and boilerplate from the title text
            title = title.replace('google_ad_region_start=title', '')
            title = title.replace('google_ad_region_end=title', '')
            title = title.replace('Archived From: Hot Deals', '')
            title = title.replace('&amp', '')
            title = title.replace('\n', '')
            return title.strip()
        else:
            print >> sys.stderr, 'it is not the first page'

    def getRating(self):
        pass

    def getReplyNum(self):
        pass

    def getViewNum(self):
        pass

    def getPostTime(self):
        if self.isFirstPage():
            time = self.soup.find('div', {'class': 'post_date'}).findAll(
                    lambda tag: tag.name != 'b', text=True)
            time = ''.join(time)
            time = time.replace('posted:', '')
            time = time.replace('updated:', '')
            time = time.replace('\n', '')
            return time.strip()
        else:
            print >> sys.stderr, 'it is not the first page'

    def getDescription(self):
        if self.isFirstPage():
            content = self.soup.find('div', {'class': 'userMsg',
                                             'id': 'firstPostText'}).findAll(
                    lambda tag: tag.name == 'table', text=True)
            return ''.join(content[1:-1]).strip()
        else:
            print >> sys.stderr, 'it is not the first page'

    def getCategory(self):
        pass

    def getFeedback(self):
        pass

    def getUser(self):
        if self.isFirstPage():
            username = self.soup.find('li', {'class': 'user_name'}).findAll(
                    lambda tag: tag.name != 'span', text=True)
            return ''.join(username).strip()
        else:
            print >> sys.stderr, 'it is not the first page'
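
Each working getter repeats the same find / findAll(text=True) / join / strip sequence, so the class is a natural candidate for one shared helper; a sketch, with _text_of as an illustrative name:

    def _text_of(self, name, attrs, tag_filter):
        # gather the matching text nodes under the first matching element
        node = self.soup.find(name, attrs)
        return ''.join(node.findAll(tag_filter, text=True)).strip()

    # e.g. getUser() would become:
    #   return self._text_of('li', {'class': 'user_name'},
    #                        lambda tag: tag.name != 'span')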
Example #8
File: stockquote.py Project: gipi/Richie
    def get_quote(self, symbol):
        url = Yahoo._quote_url.replace('SYMBOL', symbol)
        page = geturl(url)
        soup = BeautifulSoup(page)
        company = ' '.join([str(item) for item in soup.find('h1').contents])
        company = stripHTML(company)
        tables = soup.findAll('table')
        table = tables[0]
        rows = table.findAll('tr')
        data = {}
        current_value = 0.0
        open_value = 0.0
        for row in rows:
            key, val = row.findAll('td')
            key = str(key.contents[0])
            if key == 'Change:':
                try:
                    img = val.find('img')
                    alt = str(img['alt'])
                    val = alt + stripHTML(str(val.contents[0]))
                except:
                    val = '0.00%'
            elif key == 'Ask:':
                continue
            else:
                val = stripHTML(str(val.contents[0]))

            val = val.replace(',', '')
            if Yahoo._isfloat.search(val):
                val = float(val)

            data[key] = val

            if key == 'Last Trade:' or key == 'Index Value:':
                current_value = val

            elif key == 'Prev Close:':
                open_value = val

        # see if we can calculate percentage
        try:
            change = 100 * (current_value - open_value) / open_value
            data['Change:'] += ' (%.2f%%)' % change
        except:
            pass

        # try and colorize the change field
        try:
            if 'Up' in data['Change:']:
                data['Change:'] = self._green + data['Change:'] + self._reset
            elif 'Down' in data['Change:']:
                data['Change:'] = self._red + data['Change:'] + self._reset
        except:
            pass

        # build friendly output
        output = []
        for key, val in data.items():
            if isinstance(val, float):
                val = '%.2f' % val
            output.append('%s %s' % (key, val))

        return '%s - ' % company + ' | '.join(output)
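
The percentage near the end is the standard relative-change formula, 100 * (current - open) / open, with the bare except guarding against a zero or non-numeric open value. Isolated, with an explicit guard (percent_change is an illustrative name):

    def percent_change(current, open_value):
        # undefined when the open value is zero (the except: case above)
        if not open_value:
            return None
        return 100.0 * (current - open_value) / open_value

    # percent_change(105.0, 100.0) -> 5.0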
Example #9
    def forecast(self, location):
        page = geturl(url=self.search, opts={u'query': location},
                      referer=self.baseurl)
        soup = BeautifulSoup(page)

        # disambiguation page
        if u'Search Results' in unicode(soup):
            table = soup.find(u'table', attrs={u'class': u'dataTable'})
            tbody = soup.find(u'tbody')
            results = [row.findAll(u'td')[0].find(u'a')
                       for row in tbody.findAll(u'tr')]
            results = [(normalize(unicode(result.contents[0])),
                        urljoin(Weather.baseurl, unicode(result[u'href'])))
                       for result in results]

            match = None
            for result in results:
                if result[0] == normalize(location):
                    match = result[1]
                    break
            if match is None:
                match = results[0][1]
            page = geturl(url=match, referer=self.search)
            soup = BeautifulSoup(page)

        title = soup.find(u'h1').string.strip()
        rss_url = soup.find(u'link', attrs=self._rss_link)[u'href']
        rss = feedparser.parse(rss_url)
        conditions = rss.entries[0].description

        # XXX ok, here's the deal. this page has raw utf-8 bytes encoded
        # as html entities, and in some cases latin1.  this demonstrates a
        # total misunderstanding of how unicode works on the part of the
        # authors, so we need to jump through some hoops to make it work
        conditions = conditions.encode(u'raw-unicode-escape')
        conditions = stripHTML(conditions)
        conditions = encoding.convert(conditions)
        fields = self._bar.split(conditions)
        data = {}
        for field in fields:
            try:
                key, val = self._keyval.search(field).groups()
                data[key] = val
            except:
                pass

        try:
            temp = float(self._tempF.search(data[u'Temperature']).group(1))
            blink = False
            if temp < 0:
                color = u'magenta'
            elif temp < 40:
                color = u'blue'
            elif temp < 60:
                color = u'cyan'
            elif temp < 80:
                color = u'green'
            elif temp < 90:
                color = u'yellow'
            elif temp < 100:
                color = u'red'
            else:
                color = u'red'
                blink = True
            data[u'Temperature'] = self.colorlib.get_color(color,
                    text=data[u'Temperature'])

            # XXX this seems ill-conceived
            if blink:
                data[u'Temperature'] = u'\x1b[5m' + data[u'Temperature'] + \
                        u'\x1b[0m'

        except Exception, error:
            log.exception(error)
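
The raw-unicode-escape trick described in the XXX comment is easier to see in isolation: the feed hands back unicode whose code points are really UTF-8 bytes, so encoding with raw-unicode-escape recovers those bytes for a proper decode. A toy round trip (Python 2, illustrative bytes; the real code also runs stripHTML and encoding.convert in between):

    mangled = u'caf\xc3\xa9'                    # utf-8 bytes posing as code points
    raw = mangled.encode('raw-unicode-escape')  # back to the raw byte string
    print raw.decode('utf-8')                   # u'caf\xe9', i.e. "café"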