def get_url(self, lang, quality):
    """Resolve the final video URL for a language and a quality.

    The page's player ``<object>`` has a ``movie`` param whose value ends
    with ``videorefFileUrl=<url-quoted address>``.  That address points to
    a document listing one ``<video>`` per language; the selected entry in
    turn points to a document listing one ``<url>`` per quality.

    When the requested ``lang`` (resp. ``quality``) is not offered, an
    arbitrary available entry is used instead.

    :param lang: value matched against the ``lang`` attribute of the
                 ``<video>`` elements
    :param quality: value matched against the ``quality`` attribute of
                    the ``<url>`` elements
    :returns: text content of the chosen ``<url>`` element
    """
    player = select(self.document.getroot(), 'object', 1)
    movie_param = select(player, 'param[name=movie]', 1)
    quoted_ref = movie_param.attrib['value'].split('videorefFileUrl=')[-1]
    ref_url = urllib.unquote(quoted_ref)

    # First hop: language -> address of the per-language description.
    ref_doc = self.browser.get_document(self.browser.openurl(ref_url))
    ref_by_lang = dict((node.attrib['lang'], node.attrib['ref'])
                       for node in select(ref_doc.getroot(), 'video'))
    if lang not in ref_by_lang:
        # Requested language unavailable: take whichever one is there.
        lang_url = ref_by_lang.popitem()[1]
    else:
        lang_url = ref_by_lang[lang]

    # Second hop: quality -> address of the stream itself.
    lang_doc = self.browser.get_document(self.browser.openurl(lang_url))
    urls_node = select(lang_doc.getroot(), 'urls', 1)
    url_by_quality = dict((node.attrib['quality'], node.text)
                          for node in select(urls_node, 'url'))
    if quality not in url_by_quality:
        # Requested quality unavailable: take whichever one is there.
        return url_by_quality.popitem()[1]
    return url_by_quality[quality]
def iter_videos(self):
    """Yield one ``InaVideo`` per ``<li>`` of the results list.

    Nothing is yielded when the ``div.container-videos ul`` list cannot be
    selected.  For each entry the id is taken from the link target, the
    date is read as ``day/month/year`` and the duration as
    ``[<h>h][<m>min]<s>s``.

    :raises SelectElementException: when the duration text does not match
                                    the expected pattern
    """
    try:
        results = select(self.document.getroot(), 'div.container-videos ul', 1)
    except SelectElementException:
        # It means there are no results.
        return

    for entry in results.findall('li'):
        link = entry.find('a')
        video_id = re.sub(r'/video/(.+)\.html', r'\1', link.attrib['href'])
        thumbnail = 'http://boutique.ina.fr%s' % link.find('img').attrib['src']

        title = select(entry, 'p.titre', 1).text

        date_text = select(entry, 'p.date', 1).text
        day, month, year = [int(part) for part in date_text.split('/')]
        date = datetime.datetime(year, month, day)

        duration = select(entry, 'p.duree', 1).text
        found = re.match(r'((\d+)h)?((\d+)min)?(\d+)s', duration)
        if not found:
            raise SelectElementException('Unable to match duration (%r)' % duration)
        # Hours and minutes are optional in the pattern; seconds are not.
        duration = datetime.timedelta(hours=int(found.group(2) or 0),
                                      minutes=int(found.group(4) or 0),
                                      seconds=int(found.group(5)))

        yield InaVideo(video_id,
                       title=title,
                       date=date,
                       duration=duration,
                       thumbnail_url=thumbnail,
                       )
def iter_videos(self):
    """Yield one ``YoujizzVideo`` per ``span#miniatura`` thumbnail block.

    The id is derived from the link target, the title from
    ``span#title1`` and the duration from ``span.thumbtime span``, whose
    text is either ``N/A`` (reported as a zero duration) or
    ``minutes:seconds``.

    :raises SelectElementException: when the duration text is neither
                                    ``N/A`` nor contains a ``:``
    """
    for block in select(self.document.getroot(), 'span#miniatura'):
        link = select(block, 'a', 1)
        href = link.attrib['href']
        _id = re.sub(r'/videos/(.+)\.html', r'\1', href)

        thumbnail_url = block.find('.//img').attrib['src']

        title = select(block, 'span#title1', 1).text.strip()

        time_txt = select(block, 'span.thumbtime span', 1).text.strip()
        if time_txt == 'N/A':
            minutes = seconds = 0
        elif ':' not in time_txt:
            raise SelectElementException('Unable to parse the video duration: %s' % time_txt)
        else:
            minutes, seconds = [int(part) for part in time_txt.split(':')]

        duration = datetime.timedelta(minutes=minutes, seconds=seconds)
        yield YoujizzVideo(_id,
                           title=title,
                           duration=duration,
                           thumbnail_url=thumbnail_url,
                           )
def get_current(self, radio):
    """Return ``(artist, title)`` of what the given radio is playing.

    The element ids are ``titre``/``artiste`` for the ``'general'`` radio
    and ``titre_<radio>``/``artiste_<radio>`` for any other one.

    :param radio: radio identifier, ``'general'`` or a suffix used in the
                  element ids
    :returns: tuple of two ``unicode`` strings, artist first
    """
    suffix = '' if radio == 'general' else '_%s' % radio

    def stripped_text(selector):
        # Exactly one matching element is requested from the page.
        return select(self.document.getroot(), selector, 1).text.strip()

    title = stripped_text('div#titre%s' % suffix)
    artist = stripped_text('div#artiste%s' % suffix)
    return unicode(artist), unicode(title)
def get_video(self, video=None):
    """Fill in (or create) a ``YoujizzVideo`` from the current page.

    Sets ``title`` from the ``<title>`` element, ``duration`` from the
    "Runtime" text (expected as ``minutes:seconds``) and ``url`` from the
    single quoted ``http://media....flv`` address found in the page.

    :param video: object to complete; when ``None`` a new ``YoujizzVideo``
                  is created from the ``id`` entry of ``self.group_dict``
    :returns: the completed video object
    :raises SelectElementException: when the duration cannot be found or
        parsed, or when zero or several video file URLs are found
    """
    _id = to_unicode(self.group_dict["id"])
    if video is None:
        video = YoujizzVideo(_id)

    title_el = select(self.document.getroot(), "title", 1)
    video.title = to_unicode(title_el.text.strip())

    # youjizz HTML is crap, we must parse it with regexps
    data = lxml.html.tostring(self.document.getroot())

    m = re.search(r"<strong>.*?Runtime.*?</strong> (.+?)<br.*>", data)
    if m is None:
        raise SelectElementException("Could not retrieve video duration")
    try:
        minutes, seconds = (int(v) for v in to_unicode(m.group(1).strip()).split(":"))
        video.duration = datetime.timedelta(minutes=minutes, seconds=seconds)
    except (ValueError, OverflowError):
        # ValueError: non-numeric field or not exactly two fields;
        # OverflowError: values too large for a timedelta.
        raise SelectElementException("Could not retrieve video duration")

    video_file_urls = re.findall(r'"(http://media[^ ,]+\.flv)"', data)
    if not video_file_urls:
        raise SelectElementException("Video URL not found")
    if len(video_file_urls) > 1:
        raise SelectElementException("Many video file URL found")
    video.url = video_file_urls[0]

    return video
def iter_videos(self):
    """Yield one ``ArteVideo`` per ``div[class=video]`` block of the page.

    The id is the part of the link target between ``/fr/videos/`` and
    ``.html`` (empty string when the target does not match).  The rating
    is the number of children of the rate container whose class contains
    ``star-rating-on``; ``rating_max`` is the total number of children.
    """
    for block in self.document.getroot().cssselect("div[class=video]"):
        link = block.find('h2').find('a')
        title = link.text

        found = re.match(r'/fr/videos/(.*)\.html', link.attrib['href'])
        _id = found.group(1) if found else ''

        stars = select(block, 'div[class=rateContainer]', 1).findall('div')
        rating = sum(1 for star in stars if 'star-rating-on' in star.attrib['class'])
        rating_max = len(stars)

        thumb = select(block, 'img[class=thumbnail]', 1)
        thumbnail_url = 'http://videos.arte.tv' + thumb.attrib['src']

        yield ArteVideo(_id,
                        title=title,
                        rating=rating,
                        rating_max=rating_max,
                        thumbnail_url=thumbnail_url)
def set_details(self, v):
    """Populate ``v`` from the ``<li>`` entries of the ``#details`` element.

    Each entry is read as ``<span>Label:</span> value``.  Recognised
    labels and the attributes they set on ``v``:

    * ``Duration:``  -> ``duration`` (from ``<n>min`` / ``<n>sec`` words)
    * ``Submitted:`` -> ``author`` (text of the ``<i>`` or ``<a>`` child,
      otherwise the plain value)
    * ``Rating:``    -> ``rating`` and ``rating_max`` (first and third
      whitespace-separated words of the value)
    * ``Date:``      -> ``date`` (only when ``self.DATE_REGEXP`` matches)

    Entries without a ``<span>`` are skipped, and a missing label text or
    missing text after the span is treated as an empty string instead of
    raising ``AttributeError``.

    :param v: video object to update in place
    """
    details_div = select(self.document.getroot(), '#details', 1)
    for li in details_div.getiterator('li'):
        span = li.find('span')
        if span is None:
            # Not a "label: value" entry, nothing to read here.
            continue
        # text/tail are None (not '') when there is no text at that
        # position, e.g. when the span is directly followed by a tag.
        name = (span.text or '').strip()
        value = (span.tail or '').strip()

        if name == 'Duration:':
            seconds = minutes = 0
            for word in value.split():
                if word.endswith('min'):
                    minutes = int(word[:word.find('min')])
                elif word.endswith('sec'):
                    seconds = int(word[:word.find('sec')])
            v.duration = datetime.timedelta(minutes=minutes, seconds=seconds)
        elif name == 'Submitted:':
            author = li.find('i')
            if author is None:
                author = li.find('a')
            if author is None:
                v.author = value
            else:
                v.author = author.text
        elif name == 'Rating:':
            r = value.split()
            v.rating = float(r[0])
            v.rating_max = float(r[2])
        elif name == 'Date:':
            m = self.DATE_REGEXP.match(value)
            if m:
                # The month number is the position of its name in MONTH2I.
                month = self.MONTH2I.index(m.group(1))
                day = int(m.group(2))
                hour = int(m.group(3))
                minute = int(m.group(4))
                second = int(m.group(5))
                year = int(m.group(6))
                v.date = datetime.datetime(year, month, day, hour, minute, second)
def get_title(self):
    """Return the stripped ``content`` of the ``meta[name=title]`` element."""
    meta = select(self.document.getroot(), "meta[name=title]", 1)
    raw_title = meta.attrib["content"]
    return to_unicode(raw_title.strip())
def get_author(self):
    """Return the stripped text of ``a.watch-description-username strong``."""
    name_node = select(self.document.getroot(), "a.watch-description-username strong", 1)
    author = name_node.text
    return author.strip()
def get_video(self, video=None):
    """Always fail, reporting the alert message displayed on the page.

    :param video: unused
    :raises ForbiddenVideo: with the stripped text of the
                            ``.yt-alert-content`` element
    """
    alert = select(self.document.getroot(), ".yt-alert-content", 1)
    reason = alert.text.strip()
    raise ForbiddenVideo(reason)
def get_source(self):
    """Return the text held by the ``textarea#content_text`` element."""
    textarea = select(self.document.getroot(), 'textarea#content_text', 1)
    return textarea.text
def get_title(self):
    """Return the title found in the ``#videoArea h1`` heading.

    The title is the text that follows the heading's first child element
    (its ``tail``), converted to ``unicode`` and stripped.
    """
    heading = select(self.document.getroot(), '#videoArea h1', 1)
    first_child = heading.getchildren()[0]
    title = unicode(first_child.tail)
    return title.strip()
def get_url(self):
    """Return the ``href`` of the link located inside the ``#download`` element."""
    container = select(self.document.getroot(), '#download', 1)
    link = select(container, 'a', 1)
    href = link.attrib['href']
    return href
def get_nb_remaining_free_sms(self):
    """Return the remaining free SMS count shown in ``#smsReminder``.

    The count is whatever text sits at the ``nb`` position of the
    reminder sentence; it is returned as captured, without conversion.
    """
    reminder = select(self.document.getroot(), '#smsReminder', 1)
    sentence = reminder.text.strip()
    found = re.match(u'Il vous reste (?P<nb>.+) Texto gratuits vers les numéros SFR à envoyer aujourd\'hui',
                     sentence)
    return found.group('nb')