Exemplo n.º 1
0
 def getNewTalks(self):
     """Yield one metadata dict per new talk found in self.html.

     Only elements whose ``class`` attribute matches 'talkMedallion' are
     parsed. Each yielded dict has the keys 'url', 'Title' and 'Thumb'.
     """
     talkContainers = SoupStrainer(attrs={'class': re.compile('talkMedallion')})
     # Raw string: '\.' is an invalid escape sequence in a normal literal.
     # Compiled once here instead of on every loop iteration.
     jpgSrc = re.compile(r'.+?\.jpg')
     for talk in BeautifulSoup(self.html, parseOnlyThese=talkContainers):
         link = URLTED + talk.dt.a['href']
         title = cleanHTML(talk.dt.a['title'])
         # Thumbnail: the <img> whose src matches the .jpg pattern.
         pic = resizeImage(talk.find('img', attrs={'src': jpgSrc})['src'])
         yield {'url': link, 'Title': title, 'Thumb': pic}
Exemplo n.º 2
0
 def getNewTalks(self):
     """Yield one metadata dict per new talk found in self.html.

     Only elements whose ``class`` attribute matches 'talkMedallion' are
     parsed. Each yielded dict has the keys 'url', 'Title' and 'Thumb'.
     """
     talkContainers = SoupStrainer(attrs={'class': re.compile('talkMedallion')})
     # Raw string: '\.' is an invalid escape sequence in a normal literal.
     # Compiled once here instead of on every loop iteration.
     jpgSrc = re.compile(r'.+?\.jpg')
     for talk in BeautifulSoup(self.html, parseOnlyThese=talkContainers):
         link = URLTED + talk.dt.a['href']
         title = cleanHTML(talk.dt.a['title'])
         # Thumbnail: the <img> whose src matches the .jpg pattern.
         pic = resizeImage(talk.find('img', attrs={'src': jpgSrc})['src'])
         yield {'url': link, 'Title': title, 'Thumb': pic}
 def getPrograms(self):
     """Yield one metadata dict per program found in self.html.

     Each yielded dict has the keys 'url', 'Title' (UTF-8 encoded) and
     'Thumb'; 'url' and 'Thumb' are prefixed with URLASI.
     """
     # Couldn't parse properly the file using "'div', {'class':'bloc-contenu-8'}"
     # BeautifulSoup returns nothing in that class
     # So use 'contenu-descr-8 ' and find previous tag
     soup = BeautifulSoup(cleanHTML(self.html))
     # Use a group for the extension alternation: a character class such as
     # [png|jpg] matches a single character, not the words "png" or "jpg".
     # Raw string avoids the invalid '\.' escape; compiled once for the loop.
     thumbSrc = re.compile(r'.+?\.(?:png|jpg)')
     for media in soup.findAll('div', {'class': 'contenu-descr-8 '}):
         # The link wrapping the program precedes its description div.
         aTag = media.findPrevious('a')
         # Get link, title and thumb
         mediaLink = URLASI + aTag['href']
         mediaTitle = aTag['title'].encode('utf-8')
         mediaThumb = URLASI + aTag.find('img', attrs={'src': thumbSrc})['src']
         yield {'url': mediaLink, 'Title': mediaTitle, 'Thumb': mediaThumb}
Exemplo n.º 4
0
 def getTalks(self):
     """Yield one metadata dict per talk listed by the page's JSON data source.

     The URL of the JSON resource is extracted from self.html, fetched with
     getHTML, and each entry's 'markup' fragment is parsed for its link,
     title and thumbnail. Each yielded dict has the keys 'url', 'Title'
     and 'Thumb'.

     Raises IndexError if self.html contains no DataSource("...") reference.
     """
     # Themes are loaded with a JSON call. Prefer simplejson as before, but
     # fall back to the standard-library json module when it is not installed.
     try:
         from simplejson import loads
     except ImportError:
         from json import loads
     # search HTML for the link to tedtalk's "api".  It is easier to use regex here than BS.
     # Raw strings: '\(' and '\.' are invalid escapes in normal literals.
     jsonUrl = URLTED + re.findall(r'DataSource\("(.+?)"', self.html)[0]
     # make a dict from the json formatted string from above url
     talksMarkup = loads(getHTML(jsonUrl))
     # Compiled once instead of on every loop iteration.
     jpgSrc = re.compile(r'.+?\.jpg')
     # parse through said dict for all the metadata
     for markup in talksMarkup['resultSet']['result']:
         talk = BeautifulSoup(markup['markup'])
         link = URLTED + talk.dt.a['href']
         title = cleanHTML(talk.dt.a['title'])
         pic = resizeImage(talk.find('img', attrs={'src': jpgSrc})['src'])
         yield {'url': link, 'Title': title, 'Thumb': pic}
Exemplo n.º 5
0
 def getTalks(self):
     """Yield one metadata dict per talk listed by the page's JSON data source.

     The URL of the JSON resource is extracted from self.html, fetched with
     getHTML, and each entry's 'markup' fragment is parsed for its link,
     title and thumbnail. Each yielded dict has the keys 'url', 'Title'
     and 'Thumb'.

     Raises IndexError if self.html contains no DataSource("...") reference.
     """
     # Themes are loaded with a JSON call. Prefer simplejson as before, but
     # fall back to the standard-library json module when it is not installed.
     try:
         from simplejson import loads
     except ImportError:
         from json import loads
     # search HTML for the link to tedtalk's "api".  It is easier to use regex here than BS.
     # Raw strings: '\(' and '\.' are invalid escapes in normal literals.
     jsonUrl = URLTED + re.findall(r'DataSource\("(.+?)"', self.html)[0]
     # make a dict from the json formatted string from above url
     talksMarkup = loads(getHTML(jsonUrl))
     # Compiled once instead of on every loop iteration.
     jpgSrc = re.compile(r'.+?\.jpg')
     # parse through said dict for all the metadata
     for markup in talksMarkup['resultSet']['result']:
         talk = BeautifulSoup(markup['markup'])
         link = URLTED + talk.dt.a['href']
         title = cleanHTML(talk.dt.a['title'])
         pic = resizeImage(talk.find('img', attrs={'src': jpgSrc})['src'])
         yield {'url': link, 'Title': title, 'Thumb': pic}
 def getPrograms(self):
     """Yield one metadata dict per program found in self.html.

     Each yielded dict has the keys 'url', 'Title' (UTF-8 encoded) and
     'Thumb'; 'url' and 'Thumb' are prefixed with URLASI.
     """
     # Couldn't parse properly the file using "'div', {'class':'bloc-contenu-8'}"
     # BeautifulSoup returns nothing in that class
     # So use 'contenu-descr-8 ' and find previous tag
     soup = BeautifulSoup(cleanHTML(self.html))
     # Use a group for the extension alternation: a character class such as
     # [png|jpg] matches a single character, not the words "png" or "jpg".
     # Raw string avoids the invalid '\.' escape; compiled once for the loop.
     thumbSrc = re.compile(r'.+?\.(?:png|jpg)')
     for media in soup.findAll('div', {'class': 'contenu-descr-8 '}):
         # The link wrapping the program precedes its description div.
         aTag = media.findPrevious('a')
         # Get link, title and thumb
         mediaLink = URLASI + aTag['href']
         mediaTitle = aTag['title'].encode('utf-8')
         mediaThumb = URLASI + aTag.find(
             'img', attrs={'src': thumbSrc})['src']
         yield {
             'url': mediaLink,
             'Title': mediaTitle,
             'Thumb': mediaThumb
         }