Exemplo n.º 1
0
    def _subtitles_urls(self, re_name, re_sub, re_lang, date=None, url=None):
        '''Yield subtitles urls from the current page, or from `url` if given.

        Table rows whose first span matches `re_lang` and whose second span
        matches `re_sub` (when set) are yielded directly. Links from the last
        '.box ul' list whose text matches `re_name`, and whose date is within
        one of `date` when both are known, are followed recursively.
        '''
        if url and not self.browser.open(url):
            return

        # Subtitles listed on this page
        for row in self.browser.cssselect('table tr', []):
            anchors = row.cssselect('.a1 a')
            if not anchors:
                continue
            anchor = anchors[0]
            spans = anchor.cssselect('span')
            if len(spans) < 2:
                continue
            lang_span, title_span = spans[0], spans[1]
            if not re_lang.search(clean(lang_span.text)):
                continue
            sub_title = clean(title_span.text)
            if re_sub and not re_sub.search(sub_title):
                continue
            yield urljoin(self.url, anchor.get('href'))

        # Related pages, listed in the last '.box ul' element
        lists = self.browser.cssselect('.box ul')
        if not lists:
            return
        for item in lists[-1]:
            for link in item.cssselect('.title a'):
                link_title = link.text
                if not re_name.search(link_title):
                    continue
                link_date = self._get_date(link_title)
                if date and link_date and abs(date - link_date) > 1:
                    continue
                page_url = urljoin(self.url, link.get('href'))
                for sub_url in self._subtitles_urls(re_name,
                        re_sub, re_lang, date, page_url):
                    yield sub_url
Exemplo n.º 2
0
    def _get_info(self, artist, pages_max):
        '''Get the artist info: name, url, genres and albums.

        Returns None when the artist url is not found or when the first
        albums link text does not match RE_ALBUMS. Returns the info without
        albums when the page has no albums link.
        '''
        artist_url = self._get_artist_url(artist)
        if not artist_url:
            return
        self.browser.open(artist_url)

        # The same list object is shared by the artist and each album
        genres = []
        info = {
            'name': clean(artist, 1),
            'url': artist_url,
            'genre': genres,
            'albums': [],
            }
        genres.extend(clean(tag.text)
                for tag in self.browser.cssselect('.tags li a', [])
                if not RE_MORE_TAGS.search(tag.text))

        # Get albums
        album_links = self.browser.cssselect('.artist-top-albums a')
        if not album_links:
            logger.debug('failed to find albums link for "%s" at %s', artist, artist_url)
            return info
        first_link = album_links[0]
        if not RE_ALBUMS.search(first_link.text):
            return

        albums_url = urljoin(self.url, first_link.get('href'))
        for album in self._artist_albums(albums_url, pages_max):
            album['genre'] = genres
            info['albums'].append(album)

        return info
Exemplo n.º 3
0
def get_searches(query):
    '''Build searches from a comma-separated query string.

    The first comma-separated part is resolved with get_category_info() into
    (category, is_artist) and the second part is the name. An empty list is
    returned when there are fewer than two parts, when the category is None
    or when the cleaned name is empty.

    For the 'music' category, a query that names an artist is handed over to
    get_music_searches(); otherwise the next part is used as the album name.

    Raises QueryError when get_music_searches() fails with InfoError or when
    the album name is empty.

    NOTE(review): the visible body stops after the album check, before
    `search` is used; the rest of the function is presumably outside this
    excerpt.
    '''
    parts = [v.strip() for v in query.split(',')]
    if len(parts) < 2:
        return []
    category, is_artist = get_category_info(clean(parts.pop(0)))
    if category is None:
        return []
    name = clean(parts.pop(0), 1)
    if not name:
        return []
    artist = name if is_artist else None

    # Languages configured for this category (empty list when none are set)
    langs = Settings.get_settings('media_langs').get(category, [])
    search = {
        'name': name,
        'category': category,
        'mode': 'once',
        'langs': langs,
        }

    if category == 'music':
        # Without an album part, the name is treated as an artist
        if not parts:
            artist = name
        if artist:
            try:
                return get_music_searches(artist)
            except InfoError, e:
                raise QueryError('failed to find albums from "%s": %s', artist, str(e))

        # Here `parts` is not empty: the next part is the album name
        search['album'] = clean(parts.pop(0), 1)
        if not search['album']:
            raise QueryError('failed to parse query "%s": album name is missing', query)
Exemplo n.º 4
0
    def get_info(self, query, category, artist=None):
        '''Search for `query` and return the info of the first matching result.

        Results are filtered by type (regex taken from CAT_DEF for
        `category`), by title (search regex built from `query`) and, when
        `artist` is given, by the artist returned by _get_media_info().

        Returns a dict with 'title' and 'url', a 'rating' when at least one
        score could be parsed (integer mean of the parsed scores), plus the
        keys added by _get_media_info(). Returns None when the category is
        unknown, when the search form cannot be submitted or when no result
        matches.
        '''
        re_cat = CAT_DEF.get(category)
        if not re_cat:
            logger.error('unknown category %s', category)
            return
        if not self.browser.submit_form(self.url,
                fields={'search_term': query}):
            return

        re_q = Title(query).get_search_re()
        re_artist = Title(artist).get_search_re() if artist else None
        for li in self.browser.cssselect('.search_results li.result', []):
            log = html.tostring(li, pretty_print=True)[:1000]

            type_ = li.cssselect('.result_type')
            if not type_:
                logger.error('failed to get type from %s', log)
                continue
            if not re_cat.search(clean(type_[0][0].text, 1)):
                continue

            title_ = li.cssselect('.product_title a')
            if not title_:
                logger.error('failed to get title from %s', log)
                continue

            # Use a fresh dict for each result, so the rating, artist or genre
            # of a rejected result can not leak into the one finally returned
            info = {'title': clean(title_[0].text, 1)}
            if not re_q.search(info['title']):
                continue
            info['url'] = urljoin(self.url, title_[0].get('href'))

            scores = []
            rating_ = li.cssselect('.metascore')
            if rating_:
                try:
                    scores.append(int(rating_[0].text))
                except ValueError:
                    if not RE_NA_SCORE.search(rating_[0].text):
                        logger.error('failed to get metascore from "%s"', log)
            rating_ = li.cssselect('.textscore')
            if rating_:
                try:
                    # The user score is multiplied by 10 before averaging
                    scores.append(int(float(rating_[0].text) * 10))
                except ValueError:
                    if not RE_NA_SCORE.search(rating_[0].text):
                        logger.error('failed to get user score from %s', html.tostring(rating_[0]))
            if scores:
                # Explicit floor division: integer rating on python 2 and 3
                info['rating'] = sum(scores) // len(scores)

            info.update(self._get_media_info(info['url']))

            if re_artist and not re_artist.search(info.get('artist', '')):
                continue

            return info
Exemplo n.º 5
0
    def get_track(self, artist, album):
        '''Get the first complete result whose title matches the artist.

        The search query is made of the cleaned artist and album names.
        Results without a title, a watch url or thumbnails are ignored.
        '''
        artist, album = clean(artist), clean(album)

        re_title = Title(artist).get_search_re(mode='__all__')
        query = '%s %s' % (artist, album)
        for result in self.results(query):
            is_complete = (result['title']
                    and result['url_watch']
                    and result['urls_thumbnails'])
            if is_complete and re_title.search(result['title']):
                return result
Exemplo n.º 6
0
    def _get_name_url_info(self, url):
        '''Get the info of a name page: "known for" titles and filmography.

        Returns None when the page can not be opened, else a dict with the
        keys 'url', 'titles_known_for', 'titles_director' and 'titles_actor'.
        '''
        if not self.browser.open(url):
            return

        # Get "known for" titles
        known_for = []
        for div in self.browser.cssselect('div#knownfor > div', []):
            anchors = div.cssselect('a')
            if not anchors:
                continue
            anchor = anchors[-1]
            raw_title = anchor.text
            match = RE_TITLE.search(raw_title)
            if not match:
                logger.error('failed to get title and date from "%s"', raw_title)
                continue
            name, year = match.groups()
            known_for.append({
                    'title': clean(name, 1),
                    'date': int(year),
                    'url': urljoin(self.url, anchor.get('href')),
                    })

        info = {
            'url': url,
            'titles_known_for': known_for,
            }

        # Get filmography
        sections = (
            ('titles_director', 'filmo-head-director'),
            ('titles_actor', 'filmo-head-actor'),
            )
        for category, el_id in sections:
            entries = info.setdefault(category, [])

            selector = '#%s + .filmo-category-section div' % el_id
            for el in self.browser.cssselect(selector):
                anchors = el.cssselect('a')
                if not anchors:
                    continue
                # Entries are excluded based on their own text nodes
                if RE_TITLES_EXCL.search(''.join(el.xpath("text()"))):
                    continue

                entry = {
                    'title': clean(anchors[0].text, 1),
                    'url': urljoin(self.url, anchors[0].get('href')),
                    }
                years = el.cssselect('.year_column')
                if years:
                    found = RE_DATE.findall(years[0].text)
                    if found:
                        entry['date'] = int(found[0])

                entries.append(entry)

        return info
Exemplo n.º 7
0
    def _has_unrelated(self, name, path):
        '''Check unrelated media in the given directory.

        Returns True as soon as another file has no display name or a
        different one, unless both files are videos and the other one is
        smaller than a tenth of this file size. Returns None otherwise.
        '''
        max_size = get_size(self.file) / 10.0
        for other in files(path, types=self.TYPES):
            if other.file == self.file:
                continue

            other_name = other.get_file_info().get('display_name')
            if other_name and clean(other_name, 9) == clean(name, 9):
                continue    # same media
            if other.type == self.type == 'video' \
                    and get_size(other.file) < max_size:
                continue    # small video next to this video
            return True
Exemplo n.º 8
0
    def _releases(self, type):
        '''Yield the releases info for the given release type.

        Each yielded dict has the keys 'title', 'url', 'rating' and 'date',
        plus 'artist' when `type` starts with 'music_'. Items without a
        title, a numeric rating or a usable date are skipped (and so are
        music items without an artist).

        Only the month and the day are read from the page: the year is the
        current one, unless that date is in the future, in which case the
        previous year is used.
        '''
        url = URLS.get(type)
        if not url:
            logger.error('unhandled release type "%s"', type)
            return
        self.browser.open(url)

        now = datetime.utcnow()

        for li in self.browser.cssselect('li.product', []):
            log = html.tostring(li, pretty_print=True)[:1000]

            info = {}

            title_ = li.cssselect('.product_title a')
            if not title_:
                continue
            info['title'] = clean(title_[0].text, 1)
            info['url'] = urljoin(self.url, title_[0].get('href'))

            if type.startswith('music_'):
                artist_ = li.cssselect('.product_artist .data')
                if not artist_:
                    continue
                info['artist'] = clean(artist_[0].text, 1)

            rating_ = li.cssselect('.metascore')
            if not rating_:
                continue
            try:
                info['rating'] = int(rating_[0].text)
            except ValueError:
                if not RE_NA_SCORE.search(rating_[0].text):
                    logger.error('failed to get rating from "%s"', log)
                continue

            date_ = li.cssselect('.release_date .data')
            if not date_:
                continue
            res = RE_DATE.search(date_[0].text)
            if not res:
                logger.error('failed to get date from "%s"', log)
                continue
            month, day = res.group(1).lower(), int(res.group(2))

            # Try the current year first, then the previous one. A day that
            # does not exist in a given year (e.g. "feb 29") is skipped for
            # that year instead of raising out of the generator.
            date = None
            for year in (now.year, now.year - 1):
                date_str = '%s %s %02d' % (year, month, day)
                try:
                    candidate = datetime.strptime(date_str, '%Y %b %d')
                except ValueError:
                    continue
                if candidate <= now:
                    date = candidate
                    break
            if date is None:
                logger.error('failed to get date from "%s"', log)
                continue
            info['date'] = date

            yield info
Exemplo n.º 9
0
    def get_trailer(self, title, date=None):
        '''Get the first result matching the title, with a watch url and
        thumbnails.

        Queries are tried from the most to the least specific:
        "<title> <date> trailer" (when a date is given), "<title> trailer",
        then the title alone.
        '''
        title = clean(title)
        re_title = Title(title).get_search_re(mode='__all__')

        queries = []
        if date:
            queries.append('%s %s trailer' % (title, date))
        queries.extend(['%s trailer' % title, title])

        for query in queries:
            for result in self.results(query):
                if (re_title.search(clean(result['title']))
                        and result['url_watch']
                        and result['urls_thumbnails']):
                    return result
Exemplo n.º 10
0
    def _get_media_info(self, url):
        '''Get the artist and genres from a media page.

        The page is opened with a new Browser instance. The returned dict
        only holds the keys ('artist', 'genre') found on the page.
        '''
        page = Browser()
        page.open(url)

        media = {}

        bands = page.cssselect('.band_name')
        if bands:
            media['artist'] = clean(bands[0].text, 1)

        genres = page.cssselect('.product_genre .data')
        if genres:
            # Genres are given as a comma-separated list
            names = genres[0].text.split(',')
            media['genre'] = [clean(name, 1) for name in names]

        return media
Exemplo n.º 11
0
 def get_similar(self, artist):
     '''Get similar artists.

     Returns a list of dicts with the keys 'name' and 'url', built from the
     first 'p.alt2' element labelled "similar bands". The list is empty when
     the band url is not found or when there is no such element.
     '''
     if not self._get_band_url(artist):
         return []

     for tag in self.browser.cssselect('p.alt2', []):
         if clean(tag[0][0].text, 1) != 'similar bands':
             continue
         # The first child is the label, the others are the band links
         return [{
                 'name': clean(band.text, 1),
                 'url': urljoin(self.url, band.get('href'))
                 } for band in tag[1:]]
     return []
Exemplo n.º 12
0
    def reviews(self):
        '''Yield the reviews info found on the reviews page.

        Each yielded dict has the keys 'artist', 'album', 'rating' and
        'date'. The keys 'url_review' and 'url_thumbnail' are only set when
        they could be read. Nothing is yielded when self.url is not set or
        when the reviews url is not found.
        '''
        if not self.url:
            return
        url = self._get_reviews_url()
        if not url:
            logger.error('failed to get reviews url at %s', self.url)
            return

        self.browser.open(url)
        for td in self.browser.cssselect('tr.alt1 td', []):
            # Truncated markup of the cell, only used in log messages
            log = html.tostring(td, pretty_print=True)[:1000]

            info = {}
            links = td.cssselect('a')
            if not links:
                logger.error('failed to get release from %s', log)
                continue

            # Mandatory fields: the cell is skipped when one of them is missing.
            # Artist and album are the first and last children of the second
            # link first child; a missing second link is caught by the broad
            # except below.
            try:
                info['artist'] = clean(links[1][0][0].text, 1)
            except Exception:
                logger.error('failed to get artist from %s', log)
                continue
            try:
                info['album'] = clean(links[1][0][-1].text, 1)
            except Exception:
                logger.error('failed to get album from %s', log)
                continue
            # A missing or non numeric rating skips the cell without logging
            try:
                info['rating'] = float(td[-1][-1].text)
            except Exception:
                continue
            try:
                y, m, d = RE_DATE_REVIEW.search(td[-1].text).groups()
                info['date'] = datetime(int(y), int(m), int(d))
            except Exception:
                logger.debug('failed to get date from %s', log)
                continue
            # Optional fields: failures are logged but the review is still yielded
            try:
                info['url_review'] = urljoin(self.url, links[0].get('href'))
            except Exception:
                logger.error('failed to get review url from %s', log)
            try:
                info['url_thumbnail'] = urljoin(self.url, links[0][0].get('src'))
            except Exception:
                logger.error('failed to get thumbnail url from %s', log)

            yield info
Exemplo n.º 13
0
    def get_info(self, query):
        '''Search for `query` and return the info of the first matching result.

        Only results with a parsable rating and a title matching the query
        are considered. Returns a dict with the keys 'rating', 'title' and
        'url', plus 'url_thumbnail' when a thumbnail is found. Returns None
        when the form can not be submitted or when no result matches.
        '''
        if not self.browser.submit_form(self.url,
                fields={'search': query}):
            return

        re_q = Title(query).get_search_re()
        for li in self.browser.cssselect('#movie_results_ul li', []):
            log = html.tostring(li, pretty_print=True)[:1000]

            scores = li.cssselect('.tMeterScore')
            if not scores:
                continue
            match = RE_RATING.search(scores[0].text)
            if not match:
                logger.error('failed to get rating from "%s"', log)
                continue
            rating = int(match.group(1))

            anchors = li.cssselect('.nomargin a')
            if not anchors:
                logger.error('failed to get title from %s', log)
                continue
            title = clean(anchors[0].text, 1)
            if not re_q.search(title):
                continue

            info = {
                'rating': rating,
                'title': title,
                'url': urljoin(self.url, anchors[0].get('href')),
                }
            thumbnail_url = self._get_thumbnail_url(info['url'])
            if thumbnail_url:
                info['url_thumbnail'] = thumbnail_url
            return info
Exemplo n.º 14
0
def _get_filename(remote):
    '''Get the cleaned filename from the response Content-Disposition header.

    Returns None when there is no response, no header or no filename in it.
    '''
    if not remote:
        return
    disposition = remote.info().get('Content-Disposition')
    if not disposition:
        return
    filenames = RE_CONTENT_FILENAME.findall(disposition)
    if filenames:
        return clean(filenames[0])
Exemplo n.º 15
0
    def results(self, query, pages_max=1):
        '''Yield the results of a query, up to `pages_max` pages.

        Each result is a dict with the keys 'title', 'url' and 'page'.
        Results whose url has no scheme or whose title is empty are skipped.
        '''
        for page in range(1, pages_max + 1):
            if page == 1:
                self.browser.submit_form(self.url, fields={'q': query})
            elif not self._next(page):
                break

            for li in self.browser.cssselect('li.g', []):
                log = html.tostring(li, pretty_print=True)[:1000]

                anchors = li.cssselect('a')
                if not anchors:
                    logger.error('failed to get links from %s', log)
                    continue
                anchor = anchors[0]
                href = anchor.get('href')
                if not (href and urlparse(href).scheme):
                    continue
                title = clean(self.get_link_text(html.tostring(anchor)))
                if title:
                    yield {
                        'title': title,
                        'url': href,
                        'page': page,
                        }
Exemplo n.º 16
0
    def releases(self):
        '''Yield the releases found by following each RE_RELEASES_URLS link.

        Each release is a dict with the keys 'title' and 'url', plus 'rating'
        when the rating text is a valid float. Items without a link or a
        rating element are skipped.
        '''
        for release_type, re_release in RE_RELEASES_URLS.items():
            if not self.browser.follow_link(text_regex=re_release):
                logger.error('failed to get %s releases', release_type)
                continue

            for item in self.browser.cssselect('.list_item', []):
                log = html.tostring(item, pretty_print=True)[:1000]

                anchors = item.cssselect('.info a')
                if not anchors:
                    logger.error('failed to get link from %s', log)
                    continue
                anchor = anchors[0]

                release = {
                    'title': clean(anchor.text, 1),
                    'url': urljoin(self.url, anchor.get('href')),
                    }
                ratings = item.cssselect('.rating-rating .value')
                if not ratings:
                    logger.error('failed to get rating from %s', log)
                    continue
                try:
                    release['rating'] = float(ratings[0].text)
                except ValueError:
                    # The release is still yielded, without a rating
                    logger.error('failed to get rating from %s', log)

                yield release
Exemplo n.º 17
0
    def _subtitles_urls(self, re_name, date=None, url=None):
        '''Yield subtitles urls from the current page, or from `url` if given.

        A page without the '#search_results' element is yielded as is. On a
        results page, each row whose link text matches `re_name`, and whose
        date is within one of `date` when both are known, is followed
        recursively.
        '''
        if url and not self.browser.open(url):
            return

        rows = self.browser.cssselect('#search_results tr[id]')
        if not rows:
            # A results page without rows is skipped (tvshow whole season page)
            if not self.browser.cssselect('#search_results'):
                yield self.browser.geturl()
            return

        for row in rows:
            anchors = row.cssselect('a')
            if not anchors:
                continue
            anchor = anchors[0]
            title = clean(anchor.text)
            if not re_name.search(title):
                continue
            found_date = self._get_date(title)
            if date and found_date and abs(date - found_date) > 1:
                continue

            page_url = urljoin(self.url, anchor.get('href'))
            for sub_url in self._subtitles_urls(re_name=re_name,
                    date=date, url=page_url):
                yield sub_url
Exemplo n.º 18
0
def create_similar():
    '''Create a similar search from the request JSON data.

    The data must hold a 'recurrence' and either the 'id' (with an optional
    'type') of an existing object, or a 'name' and a 'category'. An optional
    'langs' list is passed along.

    Returns a JSON response: `result=True` on success, or an `error` message
    when the data is missing or invalid, when the object does not exist or
    when the similar search can not be added.
    '''
    data = request.json
    # A missing or non-object JSON body is handled like empty data instead
    # of failing on `.get()`
    if not isinstance(data, dict):
        data = {}
    if not data.get('recurrence'):
        return jsonify(error='missing recurrence')
    try:
        recurrence = int(data['recurrence'])
    except (TypeError, ValueError):
        return jsonify(error='invalid recurrence')

    if 'id' in data:
        # NOTE(review): ObjectId() presumably raises on a malformed id; this
        # is not handled here - confirm the expected behavior
        obj_id = ObjectId(data['id'])
        obj_type = data.get('type')
        search = _get_object_search(obj_id, obj_type)
        if not search:
            return jsonify(error='%s %s does not exist' % (obj_type, obj_id))
        similar = {
            'name': search['name'],
            'category': search['category'],
            }
    else:
        if not data.get('name'):
            return jsonify(error='missing name')
        if not data.get('category'):
            return jsonify(error='missing category')
        similar = {
            'name': clean(data['name'], 1),
            'category': data['category'],
            }

    similar['recurrence'] = recurrence
    similar['langs'] = data.get('langs') or []
    if not SimilarSearch.add(**similar):
        return jsonify(error='failed to create similar %s' % similar)

    return jsonify(result=True)
Exemplo n.º 19
0
 def _next(self, page):
     '''Open the results page with the given number.

     Returns the result of the browser open() call for the first '#nav'
     link whose text is the page number, or None when there is no such link.
     '''
     for anchor in self.browser.cssselect('#nav a'):
         try:
             number = int(clean(self.get_link_text(html.tostring(anchor))))
         except ValueError:
             # Not a page number link
             continue
         if number != page:
             continue
         return self.browser.open(urljoin(self.url, anchor.get('href')))
Exemplo n.º 20
0
    def _get_title_url_info(self, url):
        '''Get the info of a title page.

        Returns None when the page can not be opened or has no title, else a
        dict with the keys 'url' and 'title', plus the keys found on the page
        among 'date', 'url_thumbnail', 'rating', 'details', 'director',
        'stars', 'country', 'genre' and 'runtime'.
        '''
        if not self.browser.open(url):
            return

        headers = self.browser.cssselect('.header')
        if not headers:
            logger.error('failed to get title from %s', url)
            return
        header = headers[0]
        names = header.cssselect('[itemprop="name"]')
        if not names:
            return
        info = {
            'url': url,
            'title': clean(names[0].text, 1),
            }

        dates = header.cssselect('.nobr')
        if dates:
            match = RE_DATE.search(clean(html.tostring(dates[0]), 1))
            if match:
                info['date'] = int(match.group(1))

        images = self.browser.cssselect('#img_primary img')
        if images:
            info['url_thumbnail'] = images[0].get('src')

        stars = self.browser.cssselect('div.star-box-giga-star')
        if stars:
            info['rating'] = float(clean(stars[0].text))

        infobars = self.browser.cssselect('.infobar')
        if infobars:
            info['details'] = clean(infobars[0].text, 1)

        blocks = self.browser.cssselect('div.txt-block', []) \
                + self.browser.cssselect('div.inline', [])
        for block in blocks:
            if block is None or not len(block):
                continue

            # The first child text is the label of the block
            label = clean(block[0].text, 1)
            if label.startswith('director'):
                key, selector, is_names = 'director', 'a span', True
            elif label == 'stars':
                key, selector, is_names = 'stars', 'a span', True
            elif label == 'country':
                key, selector, is_names = 'country', 'a', False
            elif label == 'genres':
                key, selector, is_names = 'genre', 'a', False
            elif label == 'runtime':
                info['runtime'] = block[1].text
                continue
            else:
                continue

            # Only people names are filtered with RE_NAMES_EXCL
            info[key] = [clean(el.text, 1) for el in block.cssselect(selector)
                    if not is_names or not RE_NAMES_EXCL.search(el.text)]

        return info
Exemplo n.º 21
0
 def results(self, query):
     '''Run a video query against the YouTube service.

     The cleaned query is searched with results ordered by relevance and
     racy content included. A failing query is logged and ends the method.

     NOTE(review): `feed` is not used in the visible part of the method; the
     rest of the body is presumably outside this excerpt.
     '''
     yt_query = gdata.youtube.service.YouTubeVideoQuery()
     yt_query.vq = clean(query)
     yt_query.orderby = 'relevance'
     yt_query.racy = 'include'
     try:
         feed = self.yt_service.YouTubeQuery(yt_query)
     except Exception, e:
         logger.error('failed to process query "%s": %s', query, str(e))
         return
Exemplo n.º 22
0
 def get_results_count(self, query):
     '''Get the results count for a query.

     Returns None when the '#resultStats' element is missing or when its
     text does not match RE_NB_RESULTS.
     '''
     self.browser.submit_form(self.url, fields={'q': query})
     stats = self.browser.cssselect('#resultStats')
     if not stats:
         return
     counts = RE_NB_RESULTS.findall(clean(stats[0].text))
     if not counts:
         return
     # Keep the digits only
     return int(re.sub(r'\D+', '', counts[0]))
Exemplo n.º 23
0
def get_query(query, category=None):
    '''Get a search query from a name.

    The name is cleaned and, for the 'tv' and 'anime' categories, replaced
    by the Title name or display name. Non-word characters and a few small
    words are then replaced by spaces, and a leading "the" and leading or
    trailing separators are removed.
    '''
    query = clean(query, 1)
    if category in ('tv', 'anime'):
        title = Title(query)
        query = title.name if category == 'tv' else title.display_name

    substitutions = (
        (r'[\W_]+|\s+s\s+|\sand\s|\sor\s|\snot\s', ' '),
        (r'^the\s+|^[\W_]+|[\W_]+$', ''),
        )
    for pattern, replacement in substitutions:
        query = re.sub(pattern, replacement, query)
    return query
Exemplo n.º 24
0
def get_info(file):
    '''Get main info by category.

    The 'general' category gives the 'duration' (seconds), the 'bitrate'
    (bps) and the tags ('artist', 'album', 'date', 'title' and
    'track_number'). Each other category gives '<category>_bitrate',
    '<category>_codec' and '<category>_codec_id', the 'audio #1' category
    being stored as 'audio'.

    Numeric keys are only set when the parsed value is a valid integer.
    '''
    res = {}

    def store_int(key, raw, divisor=None):
        '''Store `raw` as an integer, optionally floor-divided by `divisor`.
        '''
        try:
            value = int(raw)
        except (TypeError, ValueError):
            # Missing or non numeric value: the key is left unset
            return
        res[key] = value // divisor if divisor else value

    for cat, info in parse(file).items():

        if cat == 'general':
            store_int('duration', info.get('duration'), 1000)   # seconds
            store_int('bitrate', info.get('overall bit rate'))  # bps

            # Tags
            res['artist'] = clean(info.get('performer', ''), 1)
            res['album'] = clean(info.get('album', ''), 1)
            store_int('date', info.get('recorded date'))
            res['title'] = clean(info.get('track name', ''), 1)
            store_int('track_number', info.get('track name/position'))

        else:
            if cat == 'audio #1':
                cat = 'audio'

            store_int('%s_bitrate' % cat, info.get('bit rate'))    # bps
            res['%s_codec' % cat] = info.get('codec')
            res['%s_codec_id' % cat] = info.get('codec id')

    return res
Exemplo n.º 25
0
    def _artist_albums(self, url, pages_max):
        '''Yield the albums info from the albums pages, up to `pages_max`
        pages.

        Each album is a dict with the keys 'title' and 'date', plus 'url' and
        'url_thumbnail' when they are found. Albums without a title, a date
        or with less than MIN_ALBUM_TRACKS tracks are skipped.
        '''
        for page in range(pages_max):
            if page:
                url = self._get_next_page_url()
                if not url:
                    return

            self.browser.open(url)
            for item in self.browser.cssselect('.album-item', []):
                log = html.tostring(item, pretty_print=True)[:1000]

                names = item.cssselect('[itemprop="name"]')
                if not names:
                    continue
                title = clean(names[0].get('content', ''), 1)
                if not title:
                    continue
                album = {'title': title}

                anchors = item.cssselect('a')
                if not anchors:
                    logger.error('failed to get album url from %s', log)
                else:
                    album['url'] = urljoin(self.url, anchors[0].get('href'))

                covers = item.cssselect('.album-item-cover img')
                if not covers:
                    logger.error('failed to get album thumbnail url from %s', log)
                else:
                    src = covers[0].get('src')
                    # Thumbnails matching RE_THUMBNAIL_UNKNOWN are ignored
                    if not RE_THUMBNAIL_UNKNOWN.search(urlparse(src).path):
                        album['url_thumbnail'] = src

                times = item.cssselect('time')
                if not times:
                    continue
                try:
                    match = RE_DATE_ALBUM.search(times[0].get('datetime'))
                    album['date'] = int(match.group(1))
                except Exception:
                    continue

                # Check nb tracks
                counts = item.cssselect('[itemprop="numTracks"]')
                if not counts:
                    continue
                try:
                    nb_tracks = int(counts[0].text)
                except ValueError:
                    continue
                if nb_tracks >= MIN_ALBUM_TRACKS:
                    yield album
Exemplo n.º 26
0
 def _get_artist_url(self, artist):
     '''Get the url of the first listed artist matching the given name.

     Returns None when the results url is not found or when no listed
     artist matches.
     '''
     results_url = self._get_results_url(artist)
     if not results_url:
         return
     re_name = Title(artist).get_search_re()
     self.browser.open(results_url)
     for item in self.browser.cssselect('.artistsWithInfo li', []):
         anchors = item.cssselect('a')
         if not anchors:
             continue
         anchor = anchors[0]
         name = clean(self.get_link_text(html.tostring(anchor)))
         if re_name.search(name):
             href = self._clean_url(anchor.get('href'))
             return urljoin(self.url, href)
Exemplo n.º 27
0
    def _get_torrent_url(self, query, url):
        '''Get the first torrent url whose name matches the query.

        The torrent urls of each mirror are parsed as magnet urls, and the
        cleaned first 'dn' value is matched against the query regex. Returns
        None when no torrent matches.
        '''
        re_q = Title(query).get_search_re(mode='__lazy__')

        for mirror_url in self._mirror_urls(url):
            for torrent_url in self._torrent_urls(mirror_url):
                params = parse_magnet_url(torrent_url)
                if params and 'dn' in params:
                    name = clean(params['dn'][0])
                    if re_q.match(name):
                        return torrent_url
Exemplo n.º 28
0
 def get_file_info(self):
     '''Get the file info.

     Adds to the info returned by get_info() the keys 'full_name'
     ("artist album"), 'display_name' ("artist - album - date", falling back
     to the cleaned directory name when empty) and 'subtype' ('music').
     An empty info is returned unchanged.
     '''
     info = get_info(self.file)
     if info:
         # The tags may be missing from the info: read them with a default
         # instead of failing with a KeyError
         artist = info.get('artist') or ''
         album = info.get('album') or ''
         has_both = artist and album

         info['full_name'] = '%s%s%s' % (artist, ' ' if has_both else '', album)
         info['display_name'] = '%s%s%s' % (artist, ' - ' if has_both else '', album)
         if info.get('date'):
             info['display_name'] = '%s%s%s' % (info['display_name'], ' - ' if info['display_name'] else '', info['date'])
         info['subtype'] = 'music'
         if not info.get('display_name'):
             info['display_name'] = clean(self.dir, 1)
     return info
Exemplo n.º 29
0
    def _get_urls(self, query, type='title'):
        '''Get the urls of the given type matching the query.

        When the search lands on a url matching RE_URLS[type], that url is
        the only one returned. Otherwise the urls of the result links whose
        text matches the query are returned. The list is empty when the form
        can not be submitted.
        '''
        self.browser.addheaders = [('Accept-Language', 'en-US,en')]
        if not self.browser.submit_form(self.url, fields={'q': query}):
            return []

        current_url = self.browser.geturl()
        re_url = RE_URLS[type]
        if re_url.search(current_url):
            return [current_url]

        re_name = Title(query).get_search_re()
        urls = []
        for anchor in self.browser.cssselect('.result_text a', []):
            if not re_name.search(clean(anchor.text)):
                continue
            candidate = urljoin(self.url, anchor.get('href'))
            if re_url.search(candidate):
                urls.append(candidate)
        return urls
Exemplo n.º 30
0
    def _similar_artists(self, url, pages_max):
        '''Yield the similar artists from the given page and the next ones,
        up to `pages_max` pages.

        Each artist is a dict with the keys 'name' and 'url'. Items without a
        link or a name element are skipped.
        '''
        for page in range(pages_max):
            if page:
                url = self._get_next_page_url()
                if not url:
                    return

            self.browser.open(url)
            for item in self.browser.cssselect('.similar-artists li', []):
                anchors = item.cssselect('a')
                if not anchors:
                    continue
                headings = item.cssselect('.link-reference h3')
                if headings:
                    yield {
                        'name': clean(headings[0].text, 1),
                        'url': urljoin(self.url, anchors[0].get('href')),
                        }