示例#1
0
 def get_episodes(self, channel):
     url = 'http://www.youtube.com/user/CanalACulturaActiva/feed'
     j = 1
     shows = {}
     html = urllib.urlopen(url).read()
     dom = lxml.html.document_fromstring(html)
     for item in dom.cssselect('.feed-item-main'):
             p = [x.strip() for x in item.cssselect('h4')[0].text_content().split('-')]
             show_title = p[0]
             episode_title = '-'.join(p[1:])
             try:
                 serie = Serie.objects.get(name=show_title)
             except Serie.DoesNotExist:
                 serie = Serie(name=show_title)
                 print ">> SERIE: %s" % show_title.encode('utf8')
                 serie.channel = channel
                 serie.save()
                 serie.genres.add(Genre.objects.get_or_create(code='CULTURA',defaults={'name':'Cultura'})[0])
             
             if Episode.objects.filter(name=episode_title).count() > 0:
                 continue
             episode = Episode(serie=serie, name=episode_title, number=0)
             print "%s" % episode_title
             episode.description = item.cssselect('.description')[0].text_content() + "\n" + \
                 item.cssselect('.video-time')[0].text_content()
             episode.thumbnail = urllib.basejoin(self.BASE_URL, item.cssselect('.video-thumb img')[0].get('src'))
             episode.save()
             url2 = item.cssselect('a')[0].get('href')
             video_id = re.findall('v=([^&]+)', url2)[0]
             video_url = get_youtube_url(video_id)
             media = HttpMediaFile(width=640, height=480, mimetype='video/mp4', url=video_url)
             media.episode = episode
             media.save()
             serie.episode_set.add(episode)
示例#2
0
    def get_shows(self, channel, url, params):
        while True:
            print "PAGE %d" % params['pagina']
            real_url = "%s?%s" % (url,urllib.urlencode(params))
            html = urllib.urlopen(real_url).read()
            soup = BeautifulSoup(html,from_encoding='latin-1')
            answer = []
            found = False
            for dataitem  in soup('div','resBusqueda'):
                found = True
                name = dataitem.h1.a.text.strip()
                print "%s..." % name.encode('utf8')
                if channel.serie_set.filter(name=name).count() > 0:
                    print "EXIST"
                    continue

                serie = Serie()
                serie.channel = channel
                serie.name = name
                serie.thumbnail = dataitem.find('div','resBusqueda_thumb').img.get('src')
                serie.description = dataitem.p.text
                serie.url = urllib.basejoin(self.BASE_URL, dataitem.h1.a.get('href'))
                #self.get_episodes(serie, serie_url)
                serie.save()
                print "OK"

            if not found:
                break

            params['pagina'] += 1
示例#3
0
 def scrap_channel(self, channel):
     url = channel.urls[0]
     i = url.index('?')
     base_url = url[:i]
     params = dict(urlparse.parse_qsl(url[i+1:]))
     if 'page' not in params: params['page'] = 1
     params['page'] = int(params['page'])
     found = True
     while found:
         found = False
         print "PAGE %d" % params['page']
         real_url = "%s?%s" % (base_url,urllib.urlencode(params))
         html = urllib.urlopen(real_url).read()
         soup = BeautifulSoup(html, from_encoding='utf8')
         try:
             for elem in soup('div','playlist-metadata'):
                 found = True
                 serie = Serie()
                 serie.channel = channel
                 serie.name = elem.h3.a.text.strip()
                 print serie.name.encode("utf8")
                 serie.url = urllib.basejoin(self.BASE_URL, elem.h3.a.get('href'))
                 serie.save()
             params['page'] += 1
         except:
             pass
示例#4
0
 def scrap_channel(self, channel, url):
     """Register the single hard-coded 'Karlos Arguinano en tu cocina' serie and scrape its show page."""
     self.get_player_key()
     serie = Serie()
     serie.channel = channel
     serie.name = "Karlos Arguiñano en tu cocina"
     serie.thumbnail = "http://static.hogarutil.com/archivos/201109/logotipo-karlos-arguinano-2012-173x125x80xX.jpg?1"
     serie.save()
     # NOTE(review): the incoming 'url' argument is deliberately ignored;
     # this scraper always reads the same fixed page.
     url = "http://www.hogarutil.com/tv/programas/karlos-arguinano-cocina/index.html"
     self.get_show(serie, url)
示例#5
0
 def scrap_channel(self, channel):
     """Read the channel's first URL and save one Serie per 'item-menu' entry."""
     raw = urllib.urlopen(channel.urls[0]).read()
     # Strip processing-instruction tags (<? ... ?>) before parsing.
     cleaned = re.sub('<\?.*?\?>', '', raw)
     soup = BeautifulSoup(cleaned)
     for entry in soup('div', 'item-menu'):
         link = entry.a
         serie = Serie()
         serie.channel = channel
         serie.name = link.text
         serie.url = urllib.basejoin('http://www.tvpublica.com.ar/tvpublica/', link.get('href'))
         serie.save()
示例#6
0
    def get_serie(self, channel, genre, url):
            if Serie.objects.filter(url=url).count() > 0:
                print "EXISTS"
                return

            html = urllib.urlopen(url).read()
            soup = BeautifulSoup(html, from_encoding='utf-8')
            info = soup.find('article','info')
            serie_name = info.strong.text.strip()

            if Serie.objects.filter(name=serie_name).count() > 0:
                print "EXISTS"    
                return            

            serie = Serie(channel=channel)
            serie.name = serie_name

            serie.url = url
            serie.thumbnail = urllib.basejoin(self.BASE_URL, info.img.get('src'))
            serie.description = info.find('div','expandable').text
            serie.save()
            serie.genres.add(genre)