def get_episodes(self, channel): url = 'http://www.youtube.com/user/CanalACulturaActiva/feed' j = 1 shows = {} html = urllib.urlopen(url).read() dom = lxml.html.document_fromstring(html) for item in dom.cssselect('.feed-item-main'): p = [x.strip() for x in item.cssselect('h4')[0].text_content().split('-')] show_title = p[0] episode_title = '-'.join(p[1:]) try: serie = Serie.objects.get(name=show_title) except Serie.DoesNotExist: serie = Serie(name=show_title) print ">> SERIE: %s" % show_title.encode('utf8') serie.channel = channel serie.save() serie.genres.add(Genre.objects.get_or_create(code='CULTURA',defaults={'name':'Cultura'})[0]) if Episode.objects.filter(name=episode_title).count() > 0: continue episode = Episode(serie=serie, name=episode_title, number=0) print "%s" % episode_title episode.description = item.cssselect('.description')[0].text_content() + "\n" + \ item.cssselect('.video-time')[0].text_content() episode.thumbnail = urllib.basejoin(self.BASE_URL, item.cssselect('.video-thumb img')[0].get('src')) episode.save() url2 = item.cssselect('a')[0].get('href') video_id = re.findall('v=([^&]+)', url2)[0] video_url = get_youtube_url(video_id) media = HttpMediaFile(width=640, height=480, mimetype='video/mp4', url=video_url) media.episode = episode media.save() serie.episode_set.add(episode)
def scrap_serie(self, serie): html = urllib.urlopen(serie.url).read() soup = BeautifulSoup(html, from_encoding='utf8') for cnt,article in enumerate(soup.find('div','primary-pane').ol('li')): episode = Episode() episode.thumbnail = article.img.get('src') episode.name = article.find('span','video-overview').span.text print episode.name.encode('utf8') dur = article.find('span','video-time').text.split(':') episode.duration = time(0, int(dur[0]), int(dur[1])) episode.serie = serie episode.number = cnt+1 episode.season = 1 episode.save() video_id = re.findall('v=([^&]+)', article.a.get('href'))[0] media = HttpMediaFile() media.url = self.get_real_url(video_id) media.episode = episode media.save()
def get_episode(self, serie, url): cnt = serie.episode_set.count()+1 html = urllib.urlopen(url).read() dom = lxml.html.document_fromstring(html) for elem in dom.cssselect("#ms-player-thumb-videos ul li"): episode = Episode() episode.serie = serie episode.number = cnt cnt += 1 episode.name = elem.cssselect(".ms-thumb-titulo")[0].text_content() print episode.name episode.thumbnail = elem.cssselect(".ms-thumb-img img")[0].get('src') episode.save() num = re.findall("\(([0-9])\)", elem.cssselect("a")[0].get('onclick'))[0] elemscript = dom.cssselect("#ms-player2-%s" % num)[0].getnext() sig = re.findall('"(.*?)"', elemscript.text_content())[1] media = HttpMediaFile() media.episode = episode media.url = "http://api.kewego.com/video/getHTML5Stream?playerKey=%s&sig=%s&format=normal" % (self.player_key,sig) media.save()
def scrap_episode(self, episode, url): html = urllib.urlopen(url).read() html = re.sub('<\?.*?\?>','',html) soup = BeautifulSoup(html, from_encoding='utf8') episode.air = date(*map(int, reversed(soup.find('td','fecha-hora').text.split('/')))) fileset = MediaFileSet() fileset.episode = episode fileset.save() for cnt,embed in enumerate(soup('embed')): video_id = re.findall('/([^/]*?)\?', embed.get('src'))[0] media = HttpMediaFile() media.url = self.get_real_url(video_id) if media.url is None: print "ERROR" continue media.precedence = cnt+1 print cnt+1 media.save() fileset.medias.add(media) fileset.save()
def get_episode(self, episode, url):
    """Fill in the thumbnail, description and media file of *episode*
    from the page at *url*."""
    page = urllib.urlopen(url).read()
    soup = BeautifulSoup(page, from_encoding='latin-1')
    episode.thumbnail = soup.find('div', 'capitulo_thumb').img.get('src')
    # Keep only the bare text nodes of the title block, then strip the
    # layout tabs/newlines out of them.
    text_nodes = [node for node in soup.find("div", "titCapitulo").children
                  if type(node) == bs4.element.NavigableString]
    episode.description = ''.join(text_nodes).replace('\t', '').replace('\n', '')
    episode.save()
    media = HttpMediaFile()
    media.url = re.findall("'file' : '(.*?)'", page)[0]
    media.width = 480
    media.height = 360
    media.episode = episode
    media.save()