def _get_video_details(self, html_data):
    """Extracts title, description, category and tags from a Dailymotion video page."""
    soup = BeautifulSoup(''.join(html_data),
                         convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    script = soup.find('script', text=re.compile('flashvars'))
    t = soup.find('h1', {'class': 'dmco_title'})
    title = t.string if t is not None else ''
    d = soup.find('div', id='video_description')
    description = d.string if d is not None else None
    c = soup.find('a', {'class': re.compile('fromchannel_link')})
    category = c.string if c is not None else None
    tags_el = soup.find('div', {'class': re.compile('tags_cont')}).findAll('a')
    tags = ','.join(a.string for a in tags_el)
    video = DailyMotionVideo()
    video.title = strip_accents(title)
    video.description = strip_accents(description) if description is not None else None
    video.category = strip_accents(category) if category is not None else None
    video.tags = strip_accents(tags)
    return video
def _get_metacafe_videos_from_content(self, xml_data, count):
    """Parses the Metacafe RSS feed and returns up to `count` MetacafeVideo objects."""
    # Load the XML in memory, dropping non-ASCII bytes that would break the parser.
    sanitized_xml_data = ''.join([c for c in xml_data if ord(c) < 128])
    tree = ElementTree.fromstring(sanitized_xml_data)
    videos = []
    n = 0
    for elem in tree.getiterator('item'):
        if n >= count:
            break
        n += 1
        try:
            video = MetacafeVideo()
            video.title = strip_accents(elem.find('title').text)
            video.description = strip_accents(elem.find('description').text)
            video.url = strip_accents(elem.find('link').text)
            # Some Metacafe items embed videos hosted elsewhere; rewrite those entries.
            other_source, source_id, new_url = self._verify_source_of_video(video.url)
            if other_source:
                video.url = new_url
                video.source = source_id
            video.category = strip_accents(elem.find('category').text)
            video.tags = strip_accents(
                elem.find('{http://search.yahoo.com/mrss/}keywords').text)
            videos.append(video)
            self._logger.info('Parsed metacafe video at url: %s', video.url)
        except Exception:
            self._logger.exception(
                'An error occurred while parsing a video... Moving on to the next video...')
            continue
    return videos
def _get_vimeo_videos_from_content(self, xml_data, count):
    """Parses the Vimeo API XML response and returns up to `count` VimeoVideo objects."""
    # Load the XML in memory, dropping non-ASCII bytes that would break the parser.
    sanitized_xml_data = ''.join([c for c in xml_data if ord(c) < 128])
    tree = ElementTree.fromstring(sanitized_xml_data)
    videos = []
    n = 0
    for elem in tree.getiterator('video'):
        if n >= count:
            break
        n += 1
        try:
            video = VimeoVideo()
            video.title = strip_accents(elem.find('title').text)
            video.description = strip_accents(elem.find('caption').text)
            urls = elem.find('urls')
            if urls is not None:
                for url in urls.findall('url'):
                    video.urls.append(strip_accents(url.text))
            tags = elem.find('tags')
            if tags is not None:
                for tag in tags.findall('tag'):
                    video.tags.append(strip_accents(tag.text))
            videos.append(video)
            self._logger.info('Parsed vimeo video at url: %s', video.urls)
        except Exception:
            self._logger.exception(
                'An error occurred while parsing a video... Moving on to the next video...')
            continue
    return videos
def _get_video_details(self, html_data):
    """Extracts title, description, category and tags from a Megavideo page's flashvars."""
    soup = BeautifulSoup(''.join(html_data),
                         convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    script = soup.find('script', text=re.compile('flashvars'))
    # The metadata is URL-encoded inside the flashvars assignments in the page script.
    title = re.compile(r'flashvars\.title = "(.+?)";').findall(script.string)
    description = re.compile(r'flashvars\.description = "(.+?)";').findall(script.string)
    tags = re.compile(r'flashvars\.tags = "(.+?)";').findall(script.string)
    category = re.compile(r'flashvars\.category = "(.+?)";').findall(script.string)
    video = MegaVideoVideo()
    video.title = strip_accents(urllib.unquote(title[0].replace('+', ' ')))
    video.description = strip_accents(urllib.unquote(description[0].replace('+', ' ')))
    video.category = strip_accents(urllib.unquote(category[0].replace('+', ' ')))
    video.tags = strip_accents(urllib.unquote(tags[0].replace('+', ' ')))
    return video
def _parse_entry(self, entry):
    """Collects the relevant metadata from a search result entry."""
    lq_url, hq_url, hd_url = self._parse_page(entry.media.player.url)
    item_meta = {'title': strip_accents(entry.media.title.text),
                 'description': strip_accents(entry.media.description.text),
                 'category': strip_accents(entry.media.category[0].text),
                 'tags': strip_accents(entry.media.keywords.text),
                 'page_url': entry.media.player.url,
                 'lq_url': lq_url,
                 'hq_url': hq_url,
                 'hd_url': hd_url,
                 'search-id': self.search_id,
                 'source': '1'}
    self._logger.info('Parsed youtube video at url: %s', entry.media.player.url)
    self._logger.debug('Video Metadata: %s', item_meta)
    return item_meta