Python BeautifulSoup 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: resources.lib.external.BeautifulSoup

클래스/타입: BeautifulSoup

hotexamples.com에서의 예제들: 4

Python BeautifulSoup - 4개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 Python resources.lib.external.BeautifulSoup.BeautifulSoup의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

BeautifulSoup(4)

find(2)

findAll(2)

renderContents(1)

예제 #1

파일 보기

파일: scraper.py 프로젝트: SleepyyNet/xbmc_gomtv

 def fetch_list(self, cat_id, page=1):
   """Scrape one page of a category's video listing.

   Downloads ``self.video_list_url % (cat_id, page)``, parses it with
   BeautifulSoup and builds one entry per cell returned by
   ``findAll('td', 'listOff')``. Thumbnails that are not yet on disk are
   downloaded to ``<self.base_data_path>/<cat_id>/<id>.tbn``.

   Args:
     cat_id: Category identifier; substituted into the list URL and used as
       the thumbnail sub-directory name.
     page: Page number substituted into the list URL.

   Returns:
     A dict keyed by the id string taken from each entry's link href. Each
     value is a dict with the keys 'id', 'date_string', 'posted_date',
     'year', 'title', 'description' and 'image_url' (the local thumbnail
     path). Cells whose link is missing or starts with 'javascript', or
     whose "Posted:" date cannot be located, are skipped.
   """
   vids = {}

   response = urllib2.urlopen(self.video_list_url % (cat_id, page))
   try:
     contents = response.read()
   finally:
     # Release the connection even if the read fails.
     response.close()
   # Named `soup` so the `page` argument is not overwritten.
   soup = BeautifulSoup(contents)

   # Same directory for every entry of this category; compute the path once.
   thumb_dir = os.path.join(self.base_data_path, cat_id)

   for cell in soup.findAll('td', 'listOff'):
     primary_link = cell.find('a', 'vodlink')
     if primary_link is None:
       continue
     vid_id = primary_link['href'].replace('./', '')
     if vid_id.startswith('javascript'):
       continue

     # The posting date lives in a sibling 'sect' cell of the same row;
     # skip the entry instead of crashing when it is absent or malformed.
     date_cell = cell.parent.find('td', 'sect')
     if date_cell is None:
       continue
     re_match = re.search(r'Posted: (\d+) (\d+)/(\d+)<', date_cell.renderContents())
     if re_match is None:
       continue
     year, month, day = re_match.groups()
     date_string = "%s-%s-%s" % (day, month, year)
     posted_date = datetime.date(int(year), int(month), int(day)).strftime(self.date_format)

     image_url = 'http://www.gomtv.net' + cell.find('img')['src']
     local_image_path = os.path.join(thumb_dir, str(vid_id) + '.tbn')
     if not os.path.exists(thumb_dir):
       os.makedirs(thumb_dir)
     # Only fetch thumbnails that are not already stored locally.
     if not os.path.isfile(local_image_path):
       urllib.urlretrieve(image_url, local_image_path)

     vids[vid_id] = {'id': int(vid_id),
                     'date_string': date_string,
                     'posted_date': posted_date,
                     'year': year,
                     'title': str(primary_link.string),
                     'description': str(cell.find('div', 'vodinfo').renderContents()),
                     'image_url': local_image_path}
   return vids

예제 #2

파일 보기

파일: scraper.py 프로젝트: SleepyyNet/xbmc_gomtv

 def fetch_page_count(self, cat_id):
   """Return the number of listing pages for a category.

   Downloads page 1 of ``self.video_list_url`` for ``cat_id`` and reads the
   page number out of the "Last >>" link inside the table with id 'bbsnum'.

   Args:
     cat_id: Category identifier substituted into the list URL.

   Returns:
     The page number captured from the "Last >>" link, as the string matched
     by the regex (callers have always received a string here). Returns the
     integer 0 when the pager table or the link cannot be found.
   """
   response = urllib2.urlopen(self.video_list_url % (cat_id, 1))
   try:
     contents = response.read()
   finally:
     # Release the connection even if the read fails.
     response.close()
   soup = BeautifulSoup(contents)

   link_table = soup.find('table', {'id': 'bbsnum'})
   if link_table is None:
     return 0

   last_link = re.search(r"<a href=\"\./\?page=(\d+)&[^>]*>Last >></a>", link_table.renderContents())
   if last_link is None:
     return 0
   return last_link.group(1)

예제 #3

파일 보기

파일: scraper.py 프로젝트: SleepyyNet/xbmc_gomtv

 def fetch_video(self, cat_id, vid_id):
   """Look up the stream details for a single video page.

   Downloads ``self.video_url % (cat_id, vid_id)`` and searches the rendered
   markup for a ``.swf?link=<digits>`` reference, from which the FLV URL is
   built. Creates ``<self.base_data_path>/<cat_id>`` if it does not exist.

   Args:
     cat_id: Category identifier; substituted into the video URL and used as
       the local sub-directory name.
     vid_id: Video identifier; substituted into the video URL and used as the
       local file name stem.

   Returns:
     None when no ``.swf?link=`` reference is found. Otherwise a dict with
     'title' (contents of the h3 inside the 'bbsDetail' div, or None when
     that element is missing), 'file_url' and 'local_vid_path'.
   """
   response = urllib2.urlopen(self.video_url % (cat_id, vid_id))
   try:
     contents = response.read()
   finally:
     # Release the connection even if the read fails.
     response.close()
   soup = BeautifulSoup(contents)

   re_match = re.search(r'\.swf\?link=(\d+)', soup.renderContents())
   if re_match is None:
     return None
   file_id = re_match.group(1)

   # A missing heading should not discard an otherwise usable video entry.
   detail = soup.find('div', {'id': 'bbsDetail'})
   heading = detail.h3 if detail is not None else None

   vid = {}
   vid['title'] = heading.string if heading is not None else None
   vid['file_url'] = 'http://flvdn.gomtv.net/viewer/%s.flv' % file_id

   video_dir = os.path.join(self.base_data_path, cat_id)
   if not os.path.exists(video_dir):
     os.makedirs(video_dir)
   vid['local_vid_path'] = os.path.join(video_dir, str(vid_id) + '.flv')

   return vid

예제 #4

파일 보기

파일: scraper.py 프로젝트: SleepyyNet/xbmc_gomtv

  def fetch(self):
    """Scrape the category (show) list from ``self.category_url``.

    Walks every ``dl`` inside each div with id 'Channels' and records the
    shows whose ``dt`` contains a link.

    Returns:
      A dict keyed by category id (the link href with its surrounding
      slashes removed by the regex below). Each value is a dict with the
      keys 'id', 'title', 'description' and 'image_url'; 'description' is
      '' when the show has no 'txt' dd element.
    """
    cats = {}

    response = urllib2.urlopen(self.category_url)
    try:
      contents = response.read()
    finally:
      # Release the connection even if the read fails.
      response.close()
    soup = BeautifulSoup(contents)

    for channel in soup.findAll('div', {'id': 'Channels'}):
      for show in channel.findAll('dl'):
        # Skip list entries that have no <dt> or no link inside it.
        if show.dt is None or show.dt.a is None:
          continue
        link = show.dt.a
        cat_id = re.sub(r'/(.*)/', r'\1', link['href'])

        image_url = show.find('dd', 'img').img['src']
        # Site-relative image paths need the host prepended.
        if image_url.startswith('/'):
          image_url = 'http://www.gomtv.net' + image_url

        # Look the description node up once and reuse it.
        text_node = show.find('dd', 'txt')
        description = text_node.renderContents() if text_node is not None else ''

        cats[cat_id] = {'id': cat_id, 'title': link.string, 'description': description, 'image_url': image_url}

    return cats