def iter_videos(self):
    """Yield a YoujizzVideo for each thumbnail span on the page."""
    root = self.document.getroot()
    for miniature in self.parser.select(root, 'span#miniatura'):
        link = self.parser.select(miniature, 'a', 1)
        # The video id is the path component between /videos/ and .html.
        video_id = re.sub(r'/videos/(.+)\.html', r'\1', link.attrib['href'])
        video = YoujizzVideo(video_id)
        video.thumbnail = Thumbnail(unicode(miniature.find('.//img').attrib['src']))
        title_el = self.parser.select(miniature, 'span#title1', 1)
        video.title = to_unicode(title_el.text.strip())
        # Duration is rendered with ';' between minutes and seconds,
        # or the literal string "N/A" when unknown.
        raw_time = self.parser.select(miniature, 'span.thumbtime span', 1).text.strip().replace(';', ':')
        if raw_time == 'N/A':
            minutes = seconds = 0
        elif ':' in raw_time:
            minutes, seconds = (int(piece) for piece in raw_time.split(':'))
        else:
            raise BrokenPageError('Unable to parse the video duration: %s' % raw_time)
        video.duration = datetime.timedelta(minutes=minutes, seconds=seconds)
        yield video
def get_video(self, _id):
    """Build a QuviVideo for ``_id``, resolving stream metadata via libquvi."""
    video = QuviVideo(_id)
    quvi = LibQuvi()
    if not quvi.load():
        raise UserError('Make sure libquvi 0.4 is installed')
    try:
        info = quvi.get_info(video.page_url)
    except QuviError as error:
        raise UserError(error.message)
    video.url = to_unicode(info.get('url'))
    if not video.url:
        raise NotImplementedError()
    video.ext = to_unicode(info.get('suffix'))
    video.title = to_unicode(info.get('title'))
    video.page = to_unicode(info.get('page'))
    # libquvi reports the duration in milliseconds; 0/absent means unknown.
    duration_ms = int(info.get('duration', 0))
    if duration_ms:
        video.duration = datetime.timedelta(milliseconds=duration_ms)
    if info.get('thumbnail'):
        video.thumbnail = Thumbnail(info.get('thumbnail'))
    return video
def iter_videos(self):
    """Yield a PluzzVideo for each "li.vignette" entry of the listing.

    Fix: the URL and time regexes are now raw strings with literal dots
    escaped — previously '.' in 'www.pluzz.fr' matched any character.
    """
    for div in self.parser.select(self.document.getroot(), 'li.vignette'):
        title = self.parser.select(div, 'h4 a', 1)
        url = title.attrib['href']
        m = re.match(r'^http://www\.pluzz\.fr/([^/]+)\.html$', url)
        if not m:
            self.logger.debug('url %s does not match' % url)
            continue
        _id = m.group(1)
        video = PluzzVideo(_id)
        # Titles may embed the broadcast time as "... - HHhMM".
        m = re.match(r'^(.+) - ([0-2][0-9])h([0-5][0-9])$', title.text)
        if m:
            video.title = m.group(1)
            hour = int(m.group(2))
            minute = int(m.group(3))
        else:
            video.title = title.text
            hour = 0
            minute = 0
        # The date element is "dd/mm/yyyy"; combine it with the time above.
        m = re.match(r'(\d+)/(\d+)/(\d+)', self.parser.select(div, 'p.date', 1).text)
        if m:
            video.date = datetime.datetime(int(m.group(3)), int(m.group(2)),
                                           int(m.group(1)), hour, minute)
        url = self.parser.select(div, 'img.illustration', 1).attrib['src']
        video.thumbnail = Thumbnail(u'http://www.pluzz.fr/%s' % url)
        yield video
def iter_videos(self):
    """Yield a PluzzVideo for each "article.rs-cell" result.

    Fix: the URL regex is now a raw string with literal dots escaped —
    previously '.' in 'pluzz.francetv.fr' and '(.+).html' matched any
    character.
    """
    for div in self.parser.select(self.document.getroot(), 'article.rs-cell'):
        title = self.parser.select(div, 'h3 a', 1)
        url = title.attrib['href']
        m = re.match(r'^http://pluzz\.francetv\.fr/videos/(.+)\.html$', url)
        if not m:
            self.logger.debug('url %s does not match' % url)
            continue
        _id = m.group(1)
        video = PluzzVideo(_id)
        video.title = unicode(title.text.strip())
        # "bientot" paragraphs carry an upcoming-broadcast label ("text | ...").
        for p in div.xpath('.//p[@class="bientot"]'):
            video.title += ' - %s' % p.text.split('|')[0].strip()
        video.date = parse_dt(div.find('span').attrib['data-date'])
        # Duration text looks like "... | HH:MM" or "... | MM'".
        duration = div.xpath('.//span[@class="type-duree"]')[0].text.split('|')[1].strip()
        if duration[-1:] == "'":
            t = [0, int(duration[:-1])]
        else:
            t = map(int, duration.split(':'))
        video.duration = datetime.timedelta(hours=t[0], minutes=t[1])
        url = self.parser.select(div, 'a.vignette img', 1).attrib['src']
        video.thumbnail = Thumbnail(url)
        yield video
def iter_videos(self):
    """Yield a NolifeTVVideo for each show listed in the emissions block."""
    root = self.document.getroot()
    for item in self.parser.select(root, 'div.data_emissions ul li'):
        # The numeric id is embedded in the element's class ("id-NNN").
        match = re.match('id-(\d+)', item.attrib.get('class', ''))
        if not match:
            continue
        img = self.parser.select(item, 'div.screenshot a img', 1)
        video = NolifeTVVideo(match.group(1))
        video.title = unicode(img.attrib['alt'])
        try:
            paragraphs = self.parser.select(item, 'div.tooltip div.border-bottom p, div.infos div.border-bottom p')
            video.description = unicode(paragraphs[-1].text)
        except IndexError:
            video.description = NotAvailable
        video.thumbnail = Thumbnail(unicode(img.attrib['src']))
        # Broadcast date is split over two spans: "dd/mm/yyyy" and "HHhMM".
        try:
            day_parts = self.parser.select(item, 'span.date_emission', 1).text.strip().split('/')
            hour_parts = self.parser.select(item, 'span.hour_emission', 1).text.strip().split('h')
            video.date = datetime(int(day_parts[-1]), int(day_parts[-2]), int(day_parts[-3]),
                                  int(hour_parts[0]), int(hour_parts[1]))
        except (BrokenPageError, ValueError):
            video.date = NotAvailable
        video.set_empty_fields(NotAvailable, ('url',))
        yield video
def iter_videos(self):
    """Yield an InaVideo for each search result; yields nothing when empty."""
    try:
        ul = self.parser.select(self.document.getroot(), 'div.container-videos ul', 1)
    except BrokenPageError:
        # Missing result list means there are no results at all.
        return
    for li in ul.findall('li'):
        href = li.find('a').attrib['href']
        result_id = re.sub(self.URL_REGEXP, r'\1', href)
        video = InaVideo('boutique.%s' % result_id)
        video.thumbnail = Thumbnail(u'http://boutique.ina.fr%s'
                                    % li.find('a').find('img').attrib['src'])
        video.title = unicode(self.parser.select(li, 'p.titre', 1).text)
        # Date is "dd/mm/yyyy".
        day, month, year = [int(piece) for piece in
                            self.parser.select(li, 'p.date', 1).text.split('/')]
        video.date = datetime.datetime(year, month, day)
        # Duration looks like "1h02min30s" with optional hour/minute parts.
        duration = self.parser.select(li, 'p.duree', 1).text
        match = re.match(r'((\d+)h)?((\d+)min)?(\d+)s', duration)
        if not match:
            raise BrokenPageError('Unable to match duration (%r)' % duration)
        video.duration = datetime.timedelta(hours=int(match.group(2) or 0),
                                            minutes=int(match.group(4) or 0),
                                            seconds=int(match.group(5)))
        yield video
def fill_gallery(self, gallery):
    """Populate title, cardinality, date, rating and thumbnail of ``gallery``."""
    doc = self.document
    gallery.title = doc.xpath("//h1[@id='gn']/text()")[0]
    length_text = doc.xpath(
        "//div[@id='gdd']//tr[td[@class='gdt1']/text()='Length:']/td[@class='gdt2']/text()"
    )[0]
    gallery.cardinality = int(re.match(r"\d+", length_text).group(0))
    posted_text = doc.xpath(
        "//div[@id='gdd']//tr[td[@class='gdt1']/text()='Posted:']/td[@class='gdt2']/text()"
    )[0]
    gallery.date = datetime.strptime(posted_text, "%Y-%m-%d %H:%M")
    rating_text = doc.xpath("//td[@id='rating_label']/text()")[0]
    rating_match = re.search(r"\d+\.\d+", rating_text)
    # The label may contain no numeric rating at all.
    gallery.rating = None if rating_match is None else float(rating_match.group(0))
    gallery.rating_max = 5
    # Thumbnail is either a plain <img> or a CSS background-image on a <div>.
    try:
        thumbnail_url = doc.xpath("//div[@class='gdtm']/a/img/attribute::src")[0]
    except IndexError:
        style = doc.xpath("//div[@class='gdtm']/div/attribute::style")[0]
        thumbnail_url = re.search(r"background:[^;]+url\((.+?)\)", style).group(1)
    gallery.thumbnail = Thumbnail(thumbnail_url)
def iter_videos(self):
    """Yield a GDCVaultVideo for each featured session link."""
    selector = 'section.conference ul.media_items li.featured a.session_item'
    for link in self.parser.select(self.document.getroot(), selector):
        match = re.match('/play/(\d+)/.*', link.attrib.get('href', ''))
        if not match:
            continue
        video = GDCVaultVideo(match.group(1))
        try:
            video.title = unicode(self.parser.select(link, 'div.conference_info p strong', 1).text)
        except IndexError:
            video.title = NotAvailable
        try:
            video.description = unicode(self.parser.select(link, 'div.conference_info p', 1).text)
        except IndexError:
            video.description = NotAvailable
        img = self.parser.select(link, 'div.featured_image img', 1)
        if img is not None:
            video.thumbnail = Thumbnail(unicode(img.attrib['src']))
        else:
            video.thumbnail = NotAvailable
        # FIXME: only id/title/description/thumbnail are extracted here.
        yield video
def create_video_from_songs_result(self, songs):
    """Build and yield GroovesharkVideo objects for a song-search result.

    Also records them in ``self.VIDEOS_FROM_SONG_RESULTS``.
    """
    self.VIDEOS_FROM_SONG_RESULTS = []
    for song in songs:
        video = GroovesharkVideo(song['SongID'])
        # Non-ASCII characters are replaced to keep fields plain ASCII.
        video.title = u'Song - %s' % song['SongName'].encode('ascii', 'replace')
        video.author = u'%s' % song['ArtistName'].encode('ascii', 'replace')
        video.description = u'%s - %s - %s' % (
            video.author,
            song['AlbumName'].encode('ascii', 'replace'),
            song['Year'].encode('ascii', 'replace'))
        video.thumbnail = Thumbnail(u'http://images.gs-cdn.net/static/albums/40_'
                                    + song['CoverArtFilename'])
        video.duration = datetime.timedelta(seconds=int(float(song['EstimateDuration'])))
        video.rating = float(song['AvgRating'])
        # 'Year' may be empty or out of range for datetime.date.
        try:
            video.date = datetime.date(year=int(song['Year']), month=1, day=1)
        except ValueError:
            video.date = NotAvailable
        self.VIDEOS_FROM_SONG_RESULTS.append(video)
        yield video
def iter_videos(self):
    """Yield a DailymotionVideo for each item of the listing page."""
    for item in self.parser.select(self.document.getroot(), 'div.dmpi_video_item'):
        _id = item.attrib.get('data-id', None)
        if _id is None:
            self.browser.logger.warning('Unable to find the ID of a video')
            continue
        video = DailymotionVideo(_id)
        video.title = unicode(self.parser.select(item, 'h3 a', 1).text).strip()
        author_span = self.parser.select(item, 'div.dmpi_user_login', 1).find('a').find('span')
        video.author = unicode(author_span.text).strip()
        description_el = self.parser.select(item, 'div.dmpi_video_description', 1)
        video.description = html2text(self.parser.tostring(description_el)).strip() or unicode()
        try:
            parts = self.parser.select(item, 'div.duration', 1).text.split(':')
        except BrokenPageError:
            # No duration element: it's probably a live stream.
            video.duration = NotAvailable
        else:
            if len(parts) == 1:
                hours = minutes = 0
                seconds = parts[0]
            elif len(parts) == 2:
                hours = 0
                minutes, seconds = parts
            elif len(parts) == 3:
                hours, minutes, seconds = parts
            else:
                raise BrokenPageError('Unable to parse duration %r'
                                      % self.parser.select(item, 'div.duration', 1).text)
            video.duration = datetime.timedelta(hours=int(hours),
                                                minutes=int(minutes),
                                                seconds=int(seconds))
        url = unicode(self.parser.select(item, 'img.dmco_image', 1).attrib['data-src'])
        # Drop the anti-caching query string and switch to the larger preview.
        url = re.sub('\?\d+', '', url)
        url = url.replace('jpeg_preview_medium.jpg', 'jpeg_preview_large.jpg')
        video.thumbnail = Thumbnail(unicode(url))
        stars = self.parser.select(item, 'div.small_stars', 1)
        video.rating_max = self.get_rate(stars)
        video.rating = self.get_rate(stars.find('div'))
        video.set_empty_fields(NotAvailable, ('url', ))
        yield video
def _entry2video(self, entry):
    """
    Parse an entry returned by gdata and return a Video object.
    """
    video = YoutubeVideo(to_unicode(entry.id.text.split('/')[-1].strip()))
    video.title = to_unicode(entry.media.title.text.strip())
    video.duration = datetime.timedelta(seconds=int(entry.media.duration.seconds.strip()))
    video.thumbnail = Thumbnail(to_unicode(entry.media.thumbnail[0].url.strip()))
    if entry.author[0].name.text:
        video.author = to_unicode(entry.author[0].name.text.strip())
    # When present, the media credit name overwrites the author set above.
    if entry.media.name:
        video.author = to_unicode(entry.media.name.text.strip())
    return video
def iter_videos(self):
    """Yield a CappedVideo for each search result block."""
    root = self.document.getroot()
    # When nothing matches, the site returns random results instead of an
    # empty page; the search box value is the only reliable marker.
    searchbox = self.parser.select(root, 'div.search form input.searchbox', 1)
    if searchbox.value == 'No Results Found':
        return
    for backdrop in self.parser.select(root, 'div.vidBackdrop '):
        href = self.parser.select(backdrop, 'a', 1).attrib['href']
        _id = href[2:]
        video = CappedVideo(_id)
        video.set_empty_fields(NotAvailable, ('url', ))
        video.title = to_unicode(self.parser.select(backdrop, 'div.vidTitle a', 1).text)
        video.author = to_unicode(self.parser.select(backdrop, 'div.vidAuthor a', 1).text)
        video.thumbnail = Thumbnail('http://cdn.capped.tv/pre/%s.png' % _id)
        # vidInfo starts with tabs/spaces; keep only the leading time token.
        info = self.parser.select(backdrop, 'div.vidInfo', 1)
        time_token = info.text[7:].split(' ')[0]
        parts = time_token.split(':')
        if len(parts) == 1:
            hours = minutes = 0
            seconds = parts[0]
        elif len(parts) == 2:
            hours = 0
            minutes, seconds = parts
        elif len(parts) == 3:
            hours, minutes, seconds = parts
        else:
            raise BrokenPageError('Unable to parse duration %r' % info)
        video.duration = datetime.timedelta(hours=int(hours),
                                            minutes=int(minutes),
                                            seconds=int(seconds))
        yield video
def iter_videos(self):
    """Yield an ArteVideo for each "div.video" result block."""
    for block in self.document.getroot().cssselect("div[class=video]"):
        anchor = block.find('h2').find('a')
        match = re.match(r'/(fr|de|en)/videos/(.*)\.html', anchor.attrib['href'])
        _id = match.group(2) if match else ''
        # Star rating: count lit stars and the total number displayed.
        rating = rating_max = 0
        stars = self.parser.select(block, 'div[class=rateContainer]', 1)
        for star in stars.findall('div'):
            if 'star-rating-on' in star.attrib['class']:
                rating += 1
            rating_max += 1
        video = ArteVideo(_id)
        video.title = unicode(anchor.text)
        video.rating = rating
        video.rating_max = rating_max
        thumb = self.parser.select(block, 'img[class=thumbnail]', 1)
        video.thumbnail = Thumbnail(u'http://videos.arte.tv' + thumb.attrib['src'])
        # NOTE(review): the BrokenPageError raised for an unparseable
        # duration below is swallowed by this same handler, so such
        # videos simply keep no duration.
        try:
            parts = self.parser.select(block, 'div.duration_thumbnail', 1).text.split(':')
            if len(parts) == 2:
                hours = 0
                minutes, seconds = parts
            elif len(parts) == 3:
                hours, minutes, seconds = parts
            else:
                raise BrokenPageError('Unable to parse duration %r' % parts)
        except BrokenPageError:
            pass
        else:
            video.duration = datetime.timedelta(hours=int(hours),
                                                minutes=int(minutes),
                                                seconds=int(seconds))
        video.set_empty_fields(NotAvailable, ('url', ))
        yield video
def create_video(self, song):
    """Build a GroovesharkVideo from ``song``; None when it has no duration."""
    if not song['EstimateDuration']:
        return None
    video = GroovesharkVideo(song['SongID'])
    # Non-ASCII characters are replaced to keep fields plain ASCII.
    video.title = u'Song - %s' % song['Name'].encode('ascii', 'replace')
    video.author = u'%s' % song['ArtistName'].encode('ascii', 'replace')
    video.description = u'%s - %s' % (video.author,
                                      song['AlbumName'].encode('ascii', 'replace'))
    if song['CoverArtFilename']:
        video.thumbnail = Thumbnail(u'http://images.gs-cdn.net/static/albums/40_'
                                    + song['CoverArtFilename'])
    video.duration = datetime.timedelta(seconds=int(float(song['EstimateDuration'])))
    video.date = NotAvailable
    return video
def parse_video(self, el, video=None, quality=None):
    """Fill ``video`` from a canalplus XML element; None when not found."""
    _id = el.find('ID').text
    if _id == '-1':
        # An ID of -1 means the video does not exist.
        return None
    if not video:
        video = CanalplusVideo(_id)
    infos = el.find('INFOS')
    # The title is split over several TITRAGE parts; join non-empty ones.
    pieces = []
    for part in infos.find('TITRAGE'):
        text = part.text.strip()
        if text:
            pieces.append(text)
    video.title = u' — '.join(pieces)
    video.description = unicode(infos.find('DESCRIPTION').text)
    media = el.find('MEDIA')
    thumb_url = media.find('IMAGES').find('PETIT').text
    if thumb_url:
        video.thumbnail = Thumbnail(unicode(thumb_url))
    else:
        video.thumbnail = NotAvailable
    # Pick the stream matching the requested quality, remembering the last
    # non-empty entry as a fallback.
    last_format = None
    for fmt in media.find('VIDEOS'):
        if fmt.text is None:
            continue
        if fmt.tag == quality:
            video.url = unicode(fmt.text)
            break
        last_format = fmt
    if not video.url and last_format is not None:
        video.url = unicode(last_format.text)
    day, month, year = map(int, infos.find('PUBLICATION').find('DATE').text.split('/'))
    hour, minute, second = map(int, infos.find('PUBLICATION').find('HEURE').text.split(':'))
    video.date = datetime(year, month, day, hour, minute, second)
    return video
def iter_videos(self, lang='fr'):
    """Return a list of ArteLiveVideo objects parsed from the linked feed."""
    videos = []
    feed_url = (self.document.xpath('//link')[0]).attrib['href']
    feed = self.browser.readurl(feed_url)
    # Extract each raw <item>...</item> chunk and parse it separately.
    item_re = re.compile("(<item>.*?</item>)", re.DOTALL)
    for raw_item in re.findall(item_re, feed):
        element = self.get_element(raw_item, lang)
        if not element:
            continue
        video = ArteLiveVideo(element['ID'])
        video.title = element['title']
        video.description = element['pitch']
        video.author = element['author']
        if element['pict']:
            video.thumbnail = Thumbnail(element['pict'])
        video.set_empty_fields(NotAvailable, ('url', ))
        videos.append(video)
    return videos
def create_video(self, item):
    """Build an ArteVideo from a JSON item dict."""
    video = ArteVideo(item['VID'])
    # VSU is an optional subtitle appended to the title.
    if 'VSU' in item:
        video.title = u'%s : %s' % (item['VTI'], item['VSU'])
    else:
        video.title = u'%s' % (item['VTI'])
    video.rating = int(item['VRT'])
    video.thumbnail = Thumbnail(u'%s' % item['programImage'])
    video.duration = datetime.timedelta(seconds=int(item['videoDurationSeconds']))
    video.set_empty_fields(NotAvailable, ('url', ))
    video.description = u'%s' % item['VDE']
    # VDA starts with "DD MM YYYY"; only the date part is used.
    match = re.match('(\d{2})\s(\d{2})\s(\d{4})(.*?)', item['VDA'])
    if match:
        video.date = datetime.date(int(match.group(3)),
                                   int(match.group(2)),
                                   int(match.group(1)))
    return video
def iter_videos(self):
    """Yield an ArretSurImagesVideo for each content block."""
    for block in self.document.getroot().cssselect("div[class=bloc-contenu-8]"):
        # NOTE(review): the replace() below swaps what looks like a
        # non-breaking space for a plain space — confirm the first argument.
        title = self.parser.select(block, 'a.typo-titre', 1).text_content().replace(' ', ' ')
        match = re.match(r'/contenu.php\?id=(.*)', block.find('a').attrib['href'])
        _id = match.group(1) if match else ''
        video = ArretSurImagesVideo(_id)
        video.title = unicode(title)
        video.rating = None
        video.rating_max = None
        thumb = self.parser.select(block, 'img', 1)
        video.thumbnail = Thumbnail(u'http://www.arretsurimages.net' + thumb.attrib['src'])
        yield video
def iter_videos(self):
    """Yield a YoupornVideo for each videoBox list item."""
    for li in self.document.getroot().xpath('//ul/li[@class="videoBox"]'):
        thumb_link = li.find('div').find('a')
        if thumb_link is None or thumb_link.find('img') is None:
            continue
        thumbnail_url = thumb_link.find('img').attrib['src']
        title_link = self.parser.select(li, './/a[@class="videoTitle"]', 1, 'xpath')
        # The id is the first path segment after /watch/.
        remainder = title_link.attrib['href'][len('/watch/'):]
        remainder = remainder[:remainder.find('/')]
        video = YoupornVideo(int(remainder))
        video.title = unicode(title_link.text.strip())
        video.thumbnail = Thumbnail(unicode(thumbnail_url))
        hours = minutes = seconds = 0
        duration_divs = li.cssselect('div.duration')
        if len(duration_divs) > 0:
            chunks = [int(s) for s in duration_divs[0].text.strip().split(':')]
            if len(chunks) == 3:
                hours, minutes, seconds = chunks
            elif len(chunks) == 2:
                minutes, seconds = chunks
        video.duration = datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds)
        rating_divs = li.cssselect('div.rating')
        if rating_divs:
            # Rating is displayed as a percentage.
            video.rating = int(rating_divs[0].text.strip('% '))
            video.rating_max = 100
        video.set_empty_fields(NotAvailable, ('url', 'author'))
        yield video
def iter_videos(self):
    """Yield a TricTracTVVideo for each search result entry."""
    for entry in self.parser.select(self.document.getroot(), 'li#contentsearch'):
        title = self.parser.select(entry, '#titlesearch span', 1)
        url = self.parser.select(entry, 'a', 1).attrib['href']
        match = re.match('/video-(.*)', url)
        if not match:
            self.logger.debug('url %s does not match' % url)
            continue
        video = TricTracTVVideo(match.group(1))
        video.title = unicode(title.text)
        thumb_src = self.parser.select(entry, 'img', 1).attrib['src']
        # Rating is the number of lit stars out of a maximum of five.
        video.rating = len(self.parser.select(entry, '.etoile_on'))
        video.rating_max = 5
        video.thumbnail = Thumbnail(unicode('http://www.trictrac.tv/%s' % thumb_src))
        yield video
def get_video(self, video=None):
    """Fill (or create) a NolifeTVVideo for the id captured from the URL.

    Fixes:
    - The three POST bodies contained the mojibake "×tamp" — a garbled
      "&timestamp" where "&times" had been rendered as the HTML entity
      '×'. Restored to "&timestamp=%s".
    - The broadcast-date lookup selected from ``div``, which is unbound on
      this path (the only assignment is followed by an unconditional
      raise), so a NameError was always swallowed and the date was always
      NotAvailable. It now selects from the document root.
    """
    _id = to_unicode(self.group_dict['id'])
    if video is None:
        video = NolifeTVVideo(_id)
    # Check if video is external: in that case the page carries a link-out
    # message instead of a player.
    try:
        div = self.parser.select(self.document.getroot(), 'div#message_lien_ext', 1)
    except BrokenPageError:
        pass
    else:
        link = div.find('a').attrib['href']
        raise ForbiddenVideo('Video is only available here: %s' % link)
    meta = self.parser.select(self.document.getroot(), 'meta[property="og:title"]', 1)
    try:
        video.title = unicode(meta.attrib['content'])
    except BrokenPageError:
        video.title = NotAvailable
    meta = self.parser.select(self.document.getroot(), 'meta[property="og:description"]', 1)
    try:
        video.description = unicode(meta.attrib['content'])
    except BrokenPageError:
        video.description = NotAvailable
    meta = self.parser.select(self.document.getroot(), 'meta[property="og:image"]', 1)
    try:
        video.thumbnail = Thumbnail(unicode(meta.attrib['content']))
    except BrokenPageError:
        video.thumbnail = NotAvailable
    try:
        # Was: select(div, ...) with div unbound here — see docstring.
        video.date = parse_dt(self.parser.select(self.document.getroot(),
                                                 'div#infos_complementaires', 1)
                              .find('p').text.strip())
    except Exception:
        video.date = NotAvailable
    video.author = NotAvailable
    video.duration = NotAvailable
    video.rating = NotAvailable
    video.rating_max = NotAvailable
    if not video.url:
        # The player API needs a three-step handshake (MD5, EML, then the
        # actual URL request), each with a fresh key/timestamp pair.
        skey, timestamp = self.genkey()
        self.browser.readurl('http://online.nolife-tv.com/_nlfplayer/api/api_player.php',
                             'skey=%s&a=MD5&timestamp=%s' % (skey, timestamp))
        skey, timestamp = self.genkey()
        self.browser.readurl('http://online.nolife-tv.com/_nlfplayer/api/api_player.php',
                             'a=EML&skey=%s&id%%5Fnlshow=%s&timestamp=%s' % (skey, _id, timestamp))
        skey, timestamp = self.genkey()
        data = self.browser.readurl('http://online.nolife-tv.com/_nlfplayer/api/api_player.php',
                                    'quality=0&a=UEM%%7CSEM%%7CMEM%%7CCH%%7CSWQ&skey=%s&id%%5Fnlshow=%s&timestamp=%s'
                                    % (skey, _id, timestamp))
        values = dict([urllib.splitvalue(s) for s in data.split('&')])
        if not 'url' in values:
            raise ForbiddenVideo(values.get('message', 'Not available').decode('iso-8859-15'))
        video.url = unicode(values['url'])
    video.set_empty_fields(NotAvailable)
    return video
def set_details(self, v):
    """Fill video ``v`` from the vimeo page plus its JSON config descriptor.

    Fix: ``except HTTPError, e`` (comma form, removed in Python 3) is now
    ``except HTTPError as e`` — valid since Python 2.6 and forward-compatible.
    """
    # Try to get as much as possible from the page itself.
    obj = self.parser.select(self.document.getroot(), 'h1[itemprop=name]')
    if len(obj) > 0:
        v.title = unicode(obj[0].text)
    obj = self.parser.select(self.document.getroot(), 'meta[itemprop=dateCreated]')
    if len(obj) > 0:
        v.date = parse_dt(obj[0].attrib['content'])
    obj = self.parser.select(self.document.getroot(), 'meta[itemprop=thumbnailUrl]')
    if len(obj) > 0:
        v.thumbnail = Thumbnail(unicode(obj[0].attrib['content']))
    # For the rest, use the JSON config descriptor of the HTML5 player.
    json_data = self.browser.openurl(
        'http://%s/config/%s?type=%s&referrer=%s'
        % ("player.vimeo.com", int(v.id), "html5_desktop_local", ""))
    data = json.load(json_data)
    if data is None:
        raise BrokenPageError('Unable to get JSON config for id: %r' % int(v.id))
    if v.title is None:
        v.title = unicode(data['video']['title'])
    if v.thumbnail is None:
        v.thumbnail = Thumbnail(unicode(data['video']['thumbnail']))
    v.duration = datetime.timedelta(seconds=int(data['video']['duration']))
    # Determine available codec, preferring h264 > vp8 > vp6, and use the
    # highest quality offered for that codec.
    quality = 'sd'
    codec = None
    if 'vp6' in data['video']['files']:
        codec = 'vp6'
    if 'vp8' in data['video']['files']:
        codec = 'vp8'
    if 'h264' in data['video']['files']:
        codec = 'h264'
    if not codec:
        raise BrokenPageError('Unable to detect available codec for id: %r' % int(v.id))
    if 'hd' in data['video']['files'][codec]:
        quality = 'hd'
    v.url = unicode(
        "http://player.vimeo.com/play_redirect?quality=%s&codecs=%s&clip_id=%d&time=%s&sig=%s&type=html5_desktop_local"
        % (quality, codec, int(v.id), data['request']['timestamp'], data['request']['signature']))
    # Attempt to resolve the redirect target ourselves, since the target
    # server doesn't check the User-Agent, unlike the source one.
    # HACK: we use mechanize directly here for now... FIXME
    self.browser.set_handle_redirect(False)
    try:
        self.browser.open_novisit(v.url)
    except HTTPError as e:
        if e.getcode() == 302 and hasattr(e, 'hdrs'):
            v.url = unicode(e.hdrs['Location'])