Example #1
    def parse(self, data=None):
        """Parse movie name. Populates name, year, quality and proper_count attributes"""

        # Reset before parsing, so the parser can be reused.
        self.reset()

        if data is None:
            data = self.data

        # Move anything in leading brackets to the end
        data = re.sub(r'^\[(.*?)\](.*)', r'\2 \1', data)

        for char in '[]()_,.':
            data = data.replace(char, ' ')

        # if there are no spaces
        if data.find(' ') == -1:
            data = data.replace('-', ' ')

        # remove unwanted words (imax, ..); remove_words returns the cleaned string
        data = self.remove_words(data, self.remove)

        data = self.strip_spaces(data)

        # split to parts
        parts = data.split(' ')
        cut_part = 256
        all_caps = True
        for part_pos, part in enumerate(parts):
            cut = False
            # Don't let the first word be cutoff word
            if part_pos < 1:
                continue
            # check for year
            num = str_to_int(part)
            if num is not None:
                if 1930 < num < 2050:
                    self.year = num
                    cut = True
            # Don't consider all caps words cut words if the whole title has been all caps
            if not part.isupper():
                all_caps = False
            # if length > 3 and whole word in uppers, consider as cut word (most likely a group name)
            if len(part) > 3 and part.isupper() and part.isalpha() and not all_caps:
                cut = True
            # check for cutoff words
            if part.lower() in self.cutoffs:
                cut = True
            # check for propers
            if part.lower() in self.propers:
                self.proper_count += 1
                cut = True
            # update cut position (use the loop index; parts.index() would match an earlier duplicate)
            if cut and part_pos < cut_part:
                cut_part = part_pos

        if cut_part != 256:
            log.debug('parts: %s, cut is: %s', parts, parts[cut_part])

        # calculate cut position from cut_part
        abs_cut = len(' '.join(parts[:cut_part]))

        log.debug('after parts check, cut data would be: `%s` abs_cut: %i', data[:abs_cut], abs_cut)

        # parse quality
        quality = qualities.Quality(data)
        if quality:
            self.quality = quality
            # remaining string is same as data but quality information removed
            # find out position where there is first difference, this is earliest
            # quality bit, anything after that has no relevance to the movie name
            dp = diff_pos(data, quality.clean_text)
            if dp is not None:
                log.debug('quality start: %s', dp)
                if dp < abs_cut:
                    log.debug('quality cut is even shorter')
                    abs_cut = dp

        # make cut
        data = data[:abs_cut].strip()
        log.debug('data cut to `%s` - this will be the name', data)

        # save results
        self.name = data
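
A minimal usage sketch for the parser above. The attribute names (data, name, year, quality, proper_count) suggest FlexGet's MovieParser, but the class name and import path below are assumptions, not shown in the snippet itself:

    # Hypothetical harness; class and import path are assumed, not part of the example above.
    from flexget.utils.titles.movie import MovieParser

    parser = MovieParser()
    parser.parse('The.Matrix.1999.1080p.BluRay.x264-GROUP')
    print(parser.name)          # expected: 'The Matrix'
    print(parser.year)          # expected: 1999
    print(parser.quality)       # parsed quality, e.g. 1080p bluray
    print(parser.proper_count)  # 0 for this release name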
Example #2
    def parse(self, imdb_id):
        self.imdb_id = extract_id(imdb_id)
        url = make_url(self.imdb_id)
        self.url = url
        page = requests.get(url)
        soup = get_soup(page.text)

        title_overview = soup.find('div', attrs={'class': 'title-overview'})
        if not title_overview:
            raise PluginError('IMDB parser needs updating, imdb format changed. Please report on Github.')

        # Parse stuff from the title-overview section
        name_elem = title_overview.find('h1', attrs={'itemprop': 'name'})
        if name_elem:
            self.name = name_elem.contents[0].strip()
        else:
            log.error('IMDB parser possibly needs updating, please report on Github.')
            raise PluginError('Unable to set imdb_name for %s from %s' % (self.imdb_id, self.url))

        year = title_overview.find('span', attrs={'id': 'titleYear'})
        if year:
            m = re.search(r'([0-9]{4})', year.text)
            if m:
                self.year = int(m.group(1))

        if not self.year:
            log.debug('No year found for %s' % self.imdb_id)

        mpaa_rating_elem = title_overview.find(itemprop='contentRating')
        if mpaa_rating_elem:
            self.mpaa_rating = mpaa_rating_elem['content']
        else:
            log.debug('No rating found for %s' % self.imdb_id)

        photo_elem = title_overview.find(itemprop='image')
        if photo_elem:
            self.photo = photo_elem['src']
        else:
            log.debug('No photo found for %s' % self.imdb_id)

        original_name_elem = title_overview.find(attrs={'class': 'originalTitle'})
        if original_name_elem:
            self.original_name = original_name_elem.contents[0].strip().strip('"')
        else:
            log.debug('No original title found for %s' % self.imdb_id)

        votes_elem = title_overview.find(itemprop='ratingCount')
        if votes_elem:
            self.votes = str_to_int(votes_elem.text)
        else:
            log.debug('No votes found for %s' % self.imdb_id)

        score_elem = title_overview.find(itemprop='ratingValue')
        if score_elem:
            self.score = float(score_elem.text)
        else:
            log.debug('No score found for %s' % self.imdb_id)

        # get director(s)
        for director in title_overview.select('[itemprop="director"] > a'):
            director_id = extract_id(director['href'])
            director_name = director.text
            # tag instead of name
            if isinstance(director_name, Tag):
                director_name = None
            self.directors[director_id] = director_name

        # Details section
        title_details = soup.find('div', attrs={'id': 'titleDetails'})
        if title_details:
            # get languages
            for link in title_details.find_all('a', href=re.compile('/language/')):
                lang = link.text.strip().lower()
                if lang not in self.languages:
                    self.languages.append(lang.strip())

        # Storyline section
        storyline = soup.find('div', attrs={'id': 'titleStoryLine'})
        if storyline:
            plot_elem = storyline.find('p')
            if plot_elem:
                # Remove the "Written By" part.
                if plot_elem.em:
                    plot_elem.em.replace_with('')
                self.plot_outline = plot_elem.text.strip()
            else:
                log.debug('No storyline found for %s' % self.imdb_id)
            self.genres = [i.text.strip().lower() for i in storyline.select('[itemprop="genre"] > a')]

        # Cast section
        cast = soup.find('div', attrs={'id': 'titleCast'})
        if cast:
            for actor in cast.select('[itemprop="actor"] > a'):
                actor_id = extract_id(actor['href'])
                actor_name = actor.text.strip()
                # tag instead of name
                if isinstance(actor_name, Tag):
                    actor_name = None
                self.actors[actor_id] = actor_name
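
The director and cast loops above use BeautifulSoup's CSS selector support. A tiny standalone illustration of the same select() pattern on fabricated HTML (the markup below is invented for the demo, not taken from IMDB):

    from bs4 import BeautifulSoup

    html = '<span itemprop="director"><a href="/name/nm0905154/">Lana Wachowski</a></span>'
    soup = BeautifulSoup(html, 'html.parser')
    # attribute selector plus child combinator, same shape as title_overview.select() above
    for link in soup.select('[itemprop="director"] > a'):
        print(link['href'], link.text)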
Example #3
    def parse(self, imdb_id):
        self.imdb_id = extract_id(imdb_id)
        url = make_url(self.imdb_id)
        self.url = url
        page = requests.get(url)
        soup = get_soup(page.text)

        # get photo
        tag_photo = soup.find('td', attrs={'id': 'img_primary'})
        if tag_photo:
            tag_img = tag_photo.find('img')
            if tag_img:
                self.photo = tag_img.get('src')
                log.debug('Detected photo: %s' % self.photo)

        # get rating. contentRating <span> in infobar.
        tag_infobar_div = soup.find('div', attrs={'class': 'infobar'})
        if tag_infobar_div:
            tag_mpaa_rating = tag_infobar_div.find(
                'span', attrs={'itemprop': 'contentRating'})
            if tag_mpaa_rating:
                if not tag_mpaa_rating.get('class') or not tag_mpaa_rating[
                        'class'][0].startswith('us_'):
                    log.warning('Could not determine mpaa rating for %s' % url)
                else:
                    rating_class = tag_mpaa_rating['class'][0]
                    if rating_class == 'us_not_rated':
                        self.mpaa_rating = 'NR'
                    else:
                        self.mpaa_rating = rating_class.lstrip('us_').replace(
                            '_', '-').upper()
                log.debug('Detected mpaa rating: %s' % self.mpaa_rating)
            else:
                log.debug('Unable to match signature of mpaa rating for %s - '
                          'could be a TV episode, or plugin needs update?' %
                          url)
        else:
            # We should match the infobar, it's an integral part of the IMDB page.
            log.warning(
                'Unable to get infodiv class for %s - plugin needs update?' %
                url)

        # get name
        tag_name = soup.find('h1').find('span', attrs={'itemprop': 'name'})
        if tag_name:
            self.name = tag_name.text
            log.debug('Detected name: %s' % self.name)
        else:
            log.warning('Unable to get name for %s - plugin needs update?' %
                        url)

        tag_original_title_i = soup.find('i',
                                         text=re.compile(r'original title'))
        if tag_original_title_i:
            span = tag_original_title_i.parent
            tag_original_title_i.decompose()
            self.original_name = span.text.strip()
            log.debug('Detected original name: %s' % self.original_name)
        else:
            # if title is already in original language, it doesn't have the tag
            log.debug(
                'Unable to get original title for %s - it probably does not exist'
                % url)

        # detect if movie is eligible for ratings
        rating_ineligible = soup.find('div',
                                      attrs={'class': 'rating-ineligible'})
        if rating_ineligible:
            log.debug('movie is not eligible for ratings')
        else:
            # get votes
            tag_votes = soup.find(itemprop='ratingCount')
            if tag_votes:
                self.votes = str_to_int(tag_votes.string) or 0
                log.debug('Detected votes: %s' % self.votes)
            else:
                log.warning(
                    'Unable to get votes for %s - plugin needs update?' % url)

            # get score - find the ratingValue item that contains a numerical value
            span_score = soup.find(itemprop='ratingValue',
                                   text=re.compile(r'[\d.]+'))
            if span_score:
                try:
                    self.score = float(span_score.string)
                except (ValueError, TypeError):
                    log.debug('tag_score %r is not valid float' %
                              span_score.string)
                log.debug('Detected score: %s' % self.score)
            else:
                log.warning(
                    'Unable to get score for %s - plugin needs update?' % url)

        # get genres
        genres = soup.find('div', itemprop='genre')
        if genres:
            for link in genres.find_all('a'):
                self.genres.append(link.text.strip().lower())
        else:
            log.warning(
                'Unable to find genres section for %s - plugin needs update?' %
                url)

        # get languages
        for link in soup.find_all('a', href=re.compile('/language/.*')):
            # skip non-primary languages "(a few words)", etc.
            m = re.search(r'(?x) \( [^()]* \b few \b', link.next_sibling)
            if not m:
                lang = link.text.lower()
                if lang not in self.languages:
                    self.languages.append(lang.strip())

        # get year
        tag_year = soup.find('a', attrs={'href': re.compile(r'^/year/\d+')})
        if tag_year:
            self.year = int(tag_year.text)
            log.debug('Detected year: %s' % self.year)
        elif soup.head.title:
            m = re.search(r'(\d{4})\)', soup.head.title.string)
            if m:
                self.year = int(m.group(1))
                log.debug('Detected year: %s' % self.year)
            else:
                log.warning(
                    'Unable to get year for %s (regexp mismatch) - plugin needs update?'
                    % url)
        else:
            log.warning(
                'Unable to get year for %s (missing title) - plugin needs update?'
                % url)

        # get main cast
        tag_cast = soup.find('table', 'cast_list')
        if tag_cast:
            for actor in tag_cast.find_all('a', href=re.compile('/name/nm')):
                actor_id = extract_id(actor['href'])
                actor_name = actor.text.strip()
                # tag instead of name
                if isinstance(actor_name, Tag):
                    actor_name = None
                self.actors[actor_id] = actor_name

        # get director(s)
        h4_director = soup.find('h4', text=re.compile('Director'))
        if h4_director:
            for director in h4_director.parent.find_all(
                    'a', href=re.compile('/name/nm')):
                director_id = extract_id(director['href'])
                director_name = director.text
                # tag instead of name
                if isinstance(director_name, Tag):
                    director_name = None
                self.directors[director_id] = director_name

        log.debug('Detected genres: %s' % self.genres)
        log.debug('Detected languages: %s' % self.languages)
        log.debug('Detected director(s): %s' % ', '.join(self.directors))
        log.debug('Detected actors: %s' % ', '.join(self.actors))

        # get plot
        h2_plot = soup.find('h2', text='Storyline')
        if h2_plot:
            p_plot = h2_plot.find_next('p')
            if p_plot:
                self.plot_outline = p_plot.next.string.strip()
                log.debug('Detected plot outline: %s' % self.plot_outline)
            else:
                log.debug('Plot does not have p-tag')
        else:
            log.debug('Failed to find plot')
Example #4
    def parse(self, imdb_id):
        self.imdb_id = extract_id(imdb_id)
        url = make_url(self.imdb_id)
        self.url = url
        page = requests.get(url)
        soup = get_soup(page.text)

        title_overview = soup.find('div', attrs={'class': 'title-overview'})
        if not title_overview:
            raise PluginError(
                'IMDB parser needs updating, imdb format changed. Please report on Github.'
            )

        # Parse stuff from the title-overview section
        name_elem = title_overview.find('h1', attrs={'itemprop': 'name'})
        if name_elem:
            self.name = name_elem.contents[0].strip()
        else:
            log.error(
                'IMDB parser possibly needs updating, please report on Github.'
            )
            raise PluginError('Unable to set imdb_name for %s from %s' %
                              (self.imdb_id, self.url))

        year = title_overview.find('span', attrs={'id': 'titleYear'})
        if year:
            m = re.search(r'([0-9]{4})', year.text)
            if m:
                self.year = int(m.group(1))

        if not self.year:
            log.debug('No year found for %s', self.imdb_id)

        mpaa_rating_elem = title_overview.find(itemprop='contentRating')
        if mpaa_rating_elem:
            self.mpaa_rating = mpaa_rating_elem['content']
        else:
            log.debug('No rating found for %s', self.imdb_id)

        photo_elem = title_overview.find(itemprop='image')
        if photo_elem:
            self.photo = photo_elem['src']
        else:
            log.debug('No photo found for %s', self.imdb_id)

        original_name_elem = title_overview.find(
            attrs={'class': 'originalTitle'})
        if original_name_elem:
            self.original_name = original_name_elem.contents[0].strip().strip(
                '"')
        else:
            log.debug('No original title found for %s', self.imdb_id)

        votes_elem = title_overview.find(itemprop='ratingCount')
        if votes_elem:
            self.votes = str_to_int(votes_elem.text)
        else:
            log.debug('No votes found for %s', self.imdb_id)

        score_elem = title_overview.find(itemprop='ratingValue')
        if score_elem:
            self.score = float(score_elem.text)
        else:
            log.debug('No score found for %s', self.imdb_id)

        meta_score_elem = title_overview.find(
            attrs={'class': 'metacriticScore'})
        if meta_score_elem:
            self.meta_score = str_to_int(meta_score_elem.text)
        else:
            log.debug('No Metacritic score found for %s', self.imdb_id)

        # get director(s)
        for director in title_overview.select('[itemprop="director"] > a'):
            director_id = extract_id(director['href'])
            director_name = director.text
            # tag instead of name
            if isinstance(director_name, Tag):
                director_name = None
            self.directors[director_id] = director_name

        # get writer(s)
        for writer in title_overview.select('[itemprop="creator"] > a'):
            writer_id = extract_id(writer['href'])
            writer_name = writer.text
            # tag instead of name
            if isinstance(writer_name, Tag):
                writer_name = None
            self.writers[writer_id] = writer_name

        # Details section
        title_details = soup.find('div', attrs={'id': 'titleDetails'})
        if title_details:
            # get languages
            for link in title_details.find_all(
                    'a',
                    href=re.compile(r'^/search/title\?title_type=feature'
                                    r'&primary_language=')):
                lang = link.text.strip().lower()
                if lang not in self.languages:
                    self.languages.append(lang.strip())

        # Storyline section
        storyline = soup.find('div', attrs={'id': 'titleStoryLine'})
        if storyline:
            plot_elem = storyline.find('p')
            if plot_elem:
                # Remove the "Written By" part.
                if plot_elem.em:
                    plot_elem.em.replace_with('')
                self.plot_outline = plot_elem.text.strip()
            else:
                log.debug('No storyline found for %s', self.imdb_id)
            self.genres = [
                i.text.strip().lower()
                for i in storyline.select('[itemprop="genre"] > a')
            ]

        # Cast section
        cast = soup.find('div', attrs={'id': 'titleCast'})
        if cast:
            for actor in cast.select('[itemprop="actor"] > a'):
                actor_id = extract_id(actor['href'])
                actor_name = actor.text.strip()
                # tag instead of name
                if isinstance(actor_name, Tag):
                    actor_name = None
                self.actors[actor_id] = actor_name
Example #5
    def parse(self, data=None):
        """Parse movie name. Populates name, year, quality and proper_count attributes"""

        # Reset before parsing, so the parser can be reused.
        self.reset()

        if data is None:
            data = self.data

        # Move anything in leading brackets to the end
        data = re.sub(r'^\[(.*?)\](.*)', r'\2 \1', data)

        for char in '[]()_,.':
            data = data.replace(char, ' ')

        # if there are no spaces
        if data.find(' ') == -1:
            data = data.replace('-', ' ')

        # remove unwanted words (imax, ..); remove_words returns the cleaned string
        data = self.remove_words(data, self.remove)

        data = self.strip_spaces(data)

        # split to parts
        parts = data.split(' ')
        cut_part = 256
        all_caps = True
        for part_pos, part in enumerate(parts):
            cut = False
            # Don't let the first word be cutoff word
            if part_pos < 1:
                continue
            # check for year
            num = str_to_int(part)
            if num is not None:
                if 1930 < num <= datetime.now().year:
                    if self.year_pos == cut_part:
                        # Looks like a year, but we already set the cutpoint to a year, let's move it forward
                        cut_part = part_pos
                        
                    self.year = num
                    self.year_pos = part_pos
                    cut = True
            # Don't consider all caps words cut words if the whole title has been all caps
            if not part.isupper():
                all_caps = False
            # if length > 3 and whole word in uppers, consider as cut word (most likely a group name)
            if len(part) > 3 and part.isupper() and part.isalpha() and not all_caps:
                cut = True
            # check for cutoff words
            if part.lower() in self.cutoffs:
                cut = True
            # check for propers
            if part.lower() in self.propers:
                # 'real' and 'final' are too common in movie titles, only cut if it comes after year
                if part.lower() not in ['real', 'final'] or self.year:
                    self.proper_count += 1
                    cut = True
            # update cut position (use the loop index; parts.index() would match an earlier duplicate)
            if cut and part_pos < cut_part:
                cut_part = part_pos

        if cut_part != 256:
            log.debug('parts: %s, cut is: %s', parts, parts[cut_part])

        # calculate cut position from cut_part
        abs_cut = len(' '.join(parts[:cut_part]))

        log.debug('after parts check, cut data would be: `%s` abs_cut: %i', data[:abs_cut], abs_cut)

        # parse quality
        quality = qualities.Quality(data)
        if quality:
            self.quality = quality
            # remaining string is same as data but quality information removed
            # find out position where there is first difference, this is earliest
            # quality bit, anything after that has no relevance to the movie name
            dp = diff_pos(data, quality.clean_text)
            if dp is not None:
                log.debug('quality start: %s', dp)
                if dp < abs_cut:
                    log.debug('quality cut is even shorter')
                    abs_cut = dp

        # make cut
        data = data[:abs_cut].strip()
        log.debug('data cut to `%s` - this will be the name', data)

        # save results
        self.name = data
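
Compared with Example #1, this version caps candidate years at the current year and tracks year_pos, so a year-like number that is part of the title no longer truncates the name when a real release year follows. A hedged illustration, reusing the hypothetical harness from the sketch after Example #1 (expected values follow from tracing the logic above, not from a test run):

    parser = MovieParser()  # assumed FlexGet class, as in the earlier sketch
    parser.parse('Death Race 2000 1975 720p BluRay')
    print(parser.name)  # expected: 'Death Race 2000' - the cut point is moved past the in-title year
    print(parser.year)  # expected: 1975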
Example #6
    def parse(self, imdb_id, soup=None):
        self.imdb_id = extract_id(imdb_id)
        url = make_url(self.imdb_id)
        self.url = url

        if not soup:
            page = requests.get(url)
            soup = get_soup(page.text)

        title_wrapper = soup.find('div', attrs={'class': 'title_wrapper'})
        if not title_wrapper:
            # New layout, transitional
            title_wrapper = soup.find(
                'div', {"class": re.compile("^TitleBlock__TitleContainer.?")})

        if not title_wrapper:
            raise plugin.PluginError(
                'IMDB parser needs updating, imdb format changed. Please report on Github.'
            )

        data = json.loads(
            soup.find('script', {
                'type': 'application/ld+json'
            }).text)

        if not data:
            raise plugin.PluginError(
                'IMDB parser needs updating, imdb format changed. Please report on Github.'
            )

        # Parse stuff from the title-overview section
        name_elem = data['name']
        if name_elem:
            self.name = name_elem.strip()
        else:
            logger.error(
                'IMDB parser possibly needs updating, please report on Github.'
            )
            raise plugin.PluginError('Unable to set imdb_name for %s from %s' %
                                     (self.imdb_id, self.url))

        year = soup.find('span', attrs={'id': 'titleYear'})
        if not year:
            # Test new layout
            year = title_wrapper.find(
                'span',
                {"class": re.compile("^TitleBlockMetaData__ListItemText.?")})

        if year:
            m = re.search(r'([0-9]{4})', year.text)
            if m:
                self.year = int(m.group(1))

        if not self.year:
            logger.debug('No year found for {}', self.imdb_id)

        mpaa_rating_elem = data.get('contentRating')
        if mpaa_rating_elem:
            self.mpaa_rating = mpaa_rating_elem
        else:
            logger.debug('No rating found for {}', self.imdb_id)

        photo_elem = data.get('image')
        if photo_elem:
            self.photo = photo_elem
        else:
            logger.debug('No photo found for {}', self.imdb_id)

        strip_pre_text = False
        original_name_elem = title_wrapper.find('div',
                                                {'class': 'originalTitle'})
        if not original_name_elem:
            # Test new layout
            strip_pre_text = True
            original_name_elem = title_wrapper.find(
                'div', {"class": re.compile("^OriginalTitle.?")})

        if original_name_elem:
            self.name = title_wrapper.find('h1').contents[0].strip()
            self.original_name = original_name_elem.contents[0].strip().strip(
                '"')
            if strip_pre_text:
                # re.search returns None when the pattern is absent; guard before using groups
                stripped_text = re.search(r"([^:]*):? (.*)", self.original_name)
                if stripped_text and len(stripped_text.groups()) == 2:
                    self.original_name = stripped_text.group(2)

        if not original_name_elem:
            logger.debug('No original title found for {}', self.imdb_id)

        votes_elem = data.get('aggregateRating', {}).get('ratingCount')
        if votes_elem:
            self.votes = str_to_int(votes_elem) if not isinstance(
                votes_elem, int) else votes_elem
        else:
            logger.debug('No votes found for {}', self.imdb_id)

        score_elem = data.get('aggregateRating', {}).get('ratingValue')
        if score_elem:
            self.score = float(score_elem)
        else:
            logger.debug('No score found for {}', self.imdb_id)

        meta_score_elem = soup.find(attrs={'class': 'metacriticScore'})
        if not meta_score_elem:
            # Test new layout
            meta_score_elem = soup.find('span', attrs={'class': 'score-meta'})

        if meta_score_elem:
            self.meta_score = str_to_int(meta_score_elem.text)
        else:
            logger.debug('No Metacritic score found for {}', self.imdb_id)

        # get director(s)
        directors = data.get('director', [])
        if not isinstance(directors, list):
            directors = [directors]

        for director in directors:
            if director['@type'] != 'Person':
                continue
            director_id = extract_id(director['url'])
            director_name = director['name']
            self.directors[director_id] = director_name

        # get writer(s)
        writers = data.get('creator', [])
        if not isinstance(writers, list):
            writers = [writers]

        for writer in writers:
            if writer['@type'] != 'Person':
                continue
            writer_id = extract_id(writer['url'])
            writer_name = writer['name']
            self.writers[writer_id] = writer_name

        # Details section
        title_details = soup.find('div', attrs={'id': 'titleDetails'})
        if not title_details:
            # Test new layout
            title_details = soup.find(
                'div', attrs={'data-testid': 'title-details-section'})

        if title_details:
            # get languages
            for link in title_details.find_all(
                    'a',
                    href=re.compile(r'^/search/title\?title_type=feature'
                                    '&primary_language=')):
                lang = link.text.strip().lower()
                if lang not in self.languages:
                    self.languages.append(lang.strip())

        # Storyline section
        storyline = soup.find('div', attrs={'id': 'titleStoryLine'})
        if storyline:
            plot_elem = storyline.find('p')
            if plot_elem:
                # Remove the "Written By" part.
                if plot_elem.em:
                    plot_elem.em.replace_with('')
                self.plot_outline = plot_elem.text.strip()
            else:
                logger.debug('No storyline found for {}', self.imdb_id)

            keyword_heading = storyline.find('h4')
            if keyword_heading:
                keyword_elem = keyword_heading.parent
                # The last "a" tag is a link to the full list
                self.plot_keywords = [
                    link.text.strip()
                    for link in keyword_elem.find_all("a")[:-1]
                ]
        else:
            # Test new layout
            storyline = soup.find(
                'div', attrs={'data-testid': 'storyline-plot-summary'})
            if storyline:
                self.plot_outline = storyline.text

            keyword_elem = soup.find(
                'div', attrs={'data-testid': 'storyline-plot-keywords'})
            if keyword_elem:
                self.plot_keywords = [
                    link.text.strip()
                    for link in keyword_elem.find_all("a")[:-1]
                ]

        genres = data.get('genre', [])
        if not isinstance(genres, list):
            genres = [genres]

        self.genres = [g.strip().lower() for g in genres]

        # Cast section
        cast = soup.find('table', attrs={'class': 'cast_list'})
        if cast:
            for actor in cast.select('tr > td:nth-of-type(2) > a'):
                actor_id = extract_id(actor['href'])
                actor_name = actor.text.strip()
                # tag instead of name
                if isinstance(actor_name, Tag):
                    actor_name = None
                self.actors[actor_id] = actor_name
        else:
            # Test new layout
            cast = soup.find_all(
                'a', attrs={'data-testid': 'title-cast-item__actor'})
            if cast:
                for actor in cast:
                    actor_id = extract_id(actor['href'])
                    actor_name = actor.text.strip()
                    # tag instead of name
                    if isinstance(actor_name, Tag):
                        actor_name = None
                    self.actors[actor_id] = actor_name
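
Example #6 stops scraping itemprop markup for the core fields and instead reads the JSON-LD blob IMDB embeds in each title page. A minimal standalone sketch of that technique, independent of FlexGet's get_soup/extract_id helpers (the URL, User-Agent header, and printed fields are illustrative; IMDB's markup and blocking behaviour may change):

    import json

    import requests
    from bs4 import BeautifulSoup

    page = requests.get('https://www.imdb.com/title/tt0133093/',
                        headers={'User-Agent': 'Mozilla/5.0'})  # IMDB tends to reject bare clients
    soup = BeautifulSoup(page.text, 'html.parser')
    ld_json = soup.find('script', {'type': 'application/ld+json'})
    data = json.loads(ld_json.text) if ld_json else {}
    print(data.get('name'))
    print(data.get('aggregateRating', {}).get('ratingValue'))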
Example #7
    def parse(self, imdb_id):
        self.imdb_id = extract_id(imdb_id)
        url = make_url(self.imdb_id)
        self.url = url
        page = requests.get(url)
        soup = get_soup(page.text)

        # get photo
        tag_photo = soup.find('td', attrs={'id': 'img_primary'})
        if tag_photo:
            tag_img = tag_photo.find('img')
            if tag_img:
                self.photo = tag_img.get('src')
                log.debug('Detected photo: %s' % self.photo)

        # get rating. contentRating <span> in infobar.
        tag_infobar_div = soup.find('div', attrs={'class': 'infobar'})
        if tag_infobar_div:
            tag_mpaa_rating = tag_infobar_div.find('span', attrs={'itemprop': 'contentRating'})
            if tag_mpaa_rating:
                if not tag_mpaa_rating.get('class') or not tag_mpaa_rating['class'][0].startswith('us_'):
                    log.warning('Could not determine mpaa rating for %s' % url)
                else:
                    rating_class = tag_mpaa_rating['class'][0]
                    if rating_class == 'us_not_rated':
                        self.mpaa_rating = 'NR'
                    else:
                        self.mpaa_rating = rating_class.lstrip('us_').replace('_', '-').upper()
                log.debug('Detected mpaa rating: %s' % self.mpaa_rating)
            else:
                log.debug('Unable to match signature of mpaa rating for %s - '
                          'could be a TV episode, or plugin needs update?' % url)
        else:
            # We should match the infobar, it's an integral part of the IMDB page.
            log.warning('Unable to get infodiv class for %s - plugin needs update?' % url)

        # get name
        tag_name = soup.find('h1')
        if tag_name:
            tag_name = tag_name.find('span', attrs={'itemprop': 'name'})
        if tag_name:
            self.name = tag_name.text
            log.debug('Detected name: %s' % self.name)
        else:
            log.warning('Unable to get name for %s - plugin needs update?' % url)

        tag_original_title_i = soup.find('i', text=re.compile(r'original title'))
        if tag_original_title_i:
            span = tag_original_title_i.parent
            tag_original_title_i.decompose()
            self.original_name = span.text.strip().strip('"')
            log.debug('Detected original name: %s' % self.original_name)
        else:
            # if title is already in original language, it doesn't have the tag
            log.debug('Unable to get original title for %s - it probably does not exist' % url)

        star_box = soup.find('div', attrs={'class': 'star-box giga-star'})
        if star_box:
            # detect if movie is eligible for ratings
            rating_ineligible = star_box.find('div', attrs={'class': 'rating-ineligible'})
            if rating_ineligible:
                log.debug('movie is not eligible for ratings')
            else:
                # get votes
                tag_votes = star_box.find(itemprop='ratingCount')
                if tag_votes:
                    self.votes = str_to_int(tag_votes.string) or 0
                    log.debug('Detected votes: %s' % self.votes)
                else:
                    log.warning('Unable to get votes for %s - plugin needs update?' % url)

                # get score - find the ratingValue item that contains a numerical value
                span_score = star_box.find(itemprop='ratingValue', text=re.compile(r'[\d.]+'))
                if span_score:
                    try:
                        self.score = float(span_score.string)
                    except (ValueError, TypeError):
                        log.debug('tag_score %r is not valid float' % span_score.string)
                    log.debug('Detected score: %s' % self.score)
                else:
                    log.warning('Unable to get score for %s - plugin needs update?' % url)
        else:
            log.warning('Unable to find score/vote section for %s - plugin needs update?' % url)

        # get genres
        genres = soup.find('div', itemprop='genre')
        if genres:
            for link in genres.find_all('a'):
                self.genres.append(link.text.strip().lower())
        else:
            log.warning('Unable to find genres section for %s - plugin needs update?' % url)

        # get languages
        for link in soup.find_all('a', href=re.compile('/language/.*')):
            # skip non-primary languages "(a few words)", etc.
            m = re.search(r'(?x) \( [^()]* \b few \b', link.next_sibling)
            if not m:
                lang = link.text.lower()
                if lang not in self.languages:
                    self.languages.append(lang.strip())

        # get year
        tag_year = soup.find('a', attrs={'href': re.compile(r'^/year/\d+')})
        if tag_year:
            self.year = int(tag_year.text)
            log.debug('Detected year: %s' % self.year)
        elif soup.head.title:
            m = re.search(r'(\d{4})\)', soup.head.title.string)
            if m:
                self.year = int(m.group(1))
                log.debug('Detected year: %s' % self.year)
            else:
                log.warning('Unable to get year for %s (regexp mismatch) - plugin needs update?' % url)
        else:
            log.warning('Unable to get year for %s (missing title) - plugin needs update?' % url)

        # get main cast
        tag_cast = soup.find('table', 'cast_list')
        if tag_cast:
            for actor in tag_cast.find_all('a', href=re.compile('/name/nm')):
                actor_id = extract_id(actor['href'])
                actor_name = actor.text.strip()
                # tag instead of name
                if isinstance(actor_name, Tag):
                    actor_name = None
                self.actors[actor_id] = actor_name

        # get director(s)
        h4_director = soup.find('h4', text=re.compile('Director'))
        if h4_director:
            for director in h4_director.parent.find_all('a', href=re.compile('/name/nm')):
                director_id = extract_id(director['href'])
                director_name = director.text
                # tag instead of name
                if isinstance(director_name, Tag):
                    director_name = None
                self.directors[director_id] = director_name

        log.debug('Detected genres: %s' % self.genres)
        log.debug('Detected languages: %s' % self.languages)
        log.debug('Detected director(s): %s' % ', '.join(self.directors))
        log.debug('Detected actors: %s' % ', '.join(self.actors))

        # get plot
        h2_plot = soup.find('h2', text='Storyline')
        if h2_plot:
            p_plot = h2_plot.find_next('p')
            if p_plot and p_plot.next.string:
                self.plot_outline = p_plot.next.string.strip()
                log.debug('Detected plot outline: %s' % self.plot_outline)
            else:
                log.debug('Plot does not have p-tag')
        else:
            log.debug('Failed to find plot')
Example #8
    def parse(self, imdb_id):
        self.imdb_id = extract_id(imdb_id)
        url = make_url(self.imdb_id)
        self.url = url
        page = requests.get(url)
        soup = get_soup(page.text)

        title_overview = soup.find('div', attrs={'class': 'title-overview'})
        if not title_overview:
            log.error('IMDB parser needs updating, imdb format changed.')
            return

        # Parse the year from the page title, no good places in the body (in current format)
        year_match = re.search(r'\((\d{4})\) - IMDb', soup.title.text)
        if year_match:
            self.year = int(year_match.group(1))

        # Parse stuff from the title-overview section
        self.name = title_overview.find('h1', itemprop='name').text

        mpaa_rating_elem = title_overview.find(itemprop='contentRating')
        if mpaa_rating_elem:
            self.mpaa_rating = mpaa_rating_elem['content']
        else:
            log.debug('No rating found for %s' % self.imdb_id)

        photo_elem = title_overview.find(itemprop='image')
        if photo_elem:
            self.photo = photo_elem['src']
        else:
            log.debug('No photo found for %s' % self.imdb_id)

        original_name_elem = title_overview.find(attrs={'class': 'originalTitle'})
        if original_name_elem:
            self.original_name = original_name_elem.find(text=True, recursive=False)
        else:
            log.debug('No original title found for %s' % self.imdb_id)

        votes_elem = title_overview.find(itemprop='ratingCount')
        if votes_elem:
            self.votes = str_to_int(votes_elem.text)
        else:
            log.debug('No votes found for %s' % self.imdb_id)

        score_elem = title_overview.find(itemprop='ratingValue')
        if score_elem:
            self.score = float(score_elem.text)
        else:
            log.debug('No score found for %s' % self.imdb_id)

        # get director(s)
        for director in title_overview.select('[itemprop="director"] > a'):
            director_id = extract_id(director['href'])
            director_name = director.text
            # tag instead of name
            if isinstance(director_name, Tag):
                director_name = None
            self.directors[director_id] = director_name

        # Details section
        title_details = soup.find('div', attrs={'id': 'titleDetails'})
        if title_details:
            # get languages
            for link in title_details.find_all('a', href=re.compile('/language/')):
                lang = link.text.strip().lower()
                if lang not in self.languages:
                    self.languages.append(lang.strip())

        # Storyline section
        storyline = soup.find('div', attrs={'id': 'titleStoryLine'})
        if storyline:
            plot_elem = storyline.find('p')
            if plot_elem:
                self.plot_outline = plot_elem.find(text=True, recursive=False).strip()
            else:
                log.debug('No storyline found for %s' % self.imdb_id)
            self.genres = [i.text.strip().lower() for i in storyline.select('[itemprop="genre"] > a')]

        # Cast section
        cast = soup.find('div', attrs={'id': 'titleCast'})
        if cast:
            for actor in cast.select('[itemprop="actor"] > a'):
                actor_id = extract_id(actor['href'])
                actor_name = actor.text.strip()
                # tag instead of name
                if isinstance(actor_name, Tag):
                    actor_name = None
                self.actors[actor_id] = actor_name
Example #9
    def search(self, name):
        """Return array of movie details (dict)"""
        logger.debug('Searching: {}', name)
        url = 'https://www.imdb.com/find'
        # This may include Shorts and TV series in the results
        params = {'q': name, 's': 'tt'}

        logger.debug('Search query: {}', repr(url))
        page = requests.get(url, params=params)
        actual_url = page.url

        movies = []
        soup = get_soup(page.text)
        # in case we got redirected to movie page (perfect match)
        re_m = re.match(r'.*\.imdb\.com/title/tt\d+/', actual_url)
        if re_m:
            actual_url = re_m.group(0)
            imdb_id = extract_id(actual_url)
            movie_parse = ImdbParser()
            movie_parse.parse(imdb_id, soup=soup)
            logger.debug('Perfect hit. Search got redirected to {}',
                         actual_url)
            movie = {
                'match': 1.0,
                'name': movie_parse.name,
                'imdb_id': imdb_id,
                'url': make_url(imdb_id),
                'year': movie_parse.year,
            }
            movies.append(movie)
            return movies

        section_table = soup.find('table', 'findList')
        if not section_table:
            logger.debug('results table not found')
            return

        rows = section_table.find_all('tr')
        if not rows:
            logger.debug('Titles section does not have links')
        for count, row in enumerate(rows):
            # Title search gives a lot of results, only check the first ones
            if count > self.max_results:
                break

            result_text = row.find('td', 'result_text')
            movie = {}
            additional = re.findall(r'\((.*?)\)', result_text.text)
            if len(additional) > 0:
                if re.match(r'^\d{4}$', additional[-1]):
                    movie['year'] = str_to_int(additional[-1])
                elif len(additional) > 1:
                    movie['year'] = str_to_int(additional[-2])
                    if additional[-1] not in ['TV Movie', 'Video']:
                        logger.debug('skipping {}', result_text.text)
                        continue
            primary_photo = row.find('td', 'primary_photo')
            movie['thumbnail'] = primary_photo.find('a').find('img').get('src')

            link = result_text.find_next('a')
            movie['name'] = link.text
            movie['imdb_id'] = extract_id(link.get('href'))
            movie['url'] = make_url(movie['imdb_id'])
            logger.debug('processing name: {} url: {}', movie['name'],
                         movie['url'])

            # calc & set best matching ratio
            seq = difflib.SequenceMatcher(lambda x: x == ' ',
                                          movie['name'].title(), name.title())
            ratio = seq.ratio()

            # check if some of the akas have better ratio
            for aka in link.parent.find_all('i'):
                aka = aka.next.string
                match = re.search(r'".*"', aka)
                if not match:
                    logger.debug('aka `{}` is invalid', aka)
                    continue
                aka = match.group(0).replace('"', '')
                logger.trace('processing aka {}', aka)
                seq = difflib.SequenceMatcher(lambda x: x == ' ', aka.title(),
                                              name.title())
                aka_ratio = seq.ratio()
                if aka_ratio > ratio:
                    ratio = aka_ratio * self.aka_weight
                    logger.debug(
                        '- aka `{}` matches better to `{}` ratio {} (weighted to {})',
                        aka,
                        name,
                        aka_ratio,
                        ratio,
                    )

            # prioritize items by position
            position_ratio = (self.first_weight - 1) / (count + 1) + 1
            logger.debug('- prioritizing based on position {} `{}`: {}', count,
                         movie['url'], position_ratio)
            ratio *= position_ratio

            # store ratio
            movie['match'] = ratio
            movies.append(movie)

        movies.sort(key=lambda x: x['match'], reverse=True)
        return movies
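
The match scores above come from difflib.SequenceMatcher with spaces treated as junk and both strings title-cased; aka titles and result position then adjust the ratio. A self-contained sketch of just the scoring step:

    import difflib

    def title_ratio(candidate, query):
        # same comparison as in search(): spaces are junk, casing normalised via .title()
        seq = difflib.SequenceMatcher(lambda x: x == ' ', candidate.title(), query.title())
        return seq.ratio()

    print(title_ratio('The Matrix', 'the matrix'))           # 1.0
    print(title_ratio('The Matrix Reloaded', 'the matrix'))  # partial match, below 1.0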
Example #10
    def parse(self, imdb_id):
        self.imdb_id = extract_id(imdb_id)
        url = make_url(self.imdb_id)
        self.url = url
        page = requests.get(url)
        soup = get_soup(page.text)

        title_overview = soup.find('div', attrs={'class': 'title-overview'})
        if not title_overview:
            log.error('IMDB parser needs updating, imdb format changed.')
            return

        # Parse stuff from the title-overview section
        name_elem = title_overview.find('h1', itemprop='name')
        self.name = name_elem.find(text=True, recursive=False).strip()

        year = name_elem.find('a')
        if year:
            self.year = int(year.text)
        else:
            log.debug('No year found for %s' % self.imdb_id)

        mpaa_rating_elem = title_overview.find(itemprop='contentRating')
        if mpaa_rating_elem:
            self.mpaa_rating = mpaa_rating_elem['content']
        else:
            log.debug('No rating found for %s' % self.imdb_id)

        photo_elem = title_overview.find(itemprop='image')
        if photo_elem:
            self.photo = photo_elem['src']
        else:
            log.debug('No photo found for %s' % self.imdb_id)

        original_name_elem = title_overview.find(
            attrs={'class': 'originalTitle'})
        if original_name_elem:
            self.original_name = original_name_elem.find(text=True,
                                                         recursive=False)
        else:
            log.debug('No original title found for %s' % self.imdb_id)

        votes_elem = title_overview.find(itemprop='ratingCount')
        if votes_elem:
            self.votes = str_to_int(votes_elem.text)
        else:
            log.debug('No votes found for %s' % self.imdb_id)

        score_elem = title_overview.find(itemprop='ratingValue')
        if score_elem:
            self.score = float(score_elem.text)
        else:
            log.debug('No score found for %s' % self.imdb_id)

        # get director(s)
        for director in title_overview.select('[itemprop="director"] > a'):
            director_id = extract_id(director['href'])
            director_name = director.text
            # tag instead of name
            if isinstance(director_name, Tag):
                director_name = None
            self.directors[director_id] = director_name

        # Details section
        title_details = soup.find('div', attrs={'id': 'titleDetails'})
        if title_details:
            # get languages
            for link in title_details.find_all('a',
                                               href=re.compile('/language/')):
                lang = link.text.strip().lower()
                if lang not in self.languages:
                    self.languages.append(lang.strip())

        # Storyline section
        storyline = soup.find('div', attrs={'id': 'titleStoryLine'})
        if storyline:
            plot_elem = storyline.find('p')
            if plot_elem:
                self.plot_outline = plot_elem.find(text=True,
                                                   recursive=False).strip()
            else:
                log.debug('No storyline found for %s' % self.imdb_id)
            self.genres = [
                i.text.strip().lower()
                for i in storyline.select('[itemprop="genre"] > a')
            ]

        # Cast section
        cast = soup.find('div', attrs={'id': 'titleCast'})
        if cast:
            for actor in cast.select('[itemprop="actor"] > a'):
                actor_id = extract_id(actor['href'])
                actor_name = actor.text.strip()
                # tag instead of name
                if isinstance(actor_name, Tag):
                    actor_name = None
                self.actors[actor_id] = actor_name
Example #11
    def parse(self, imdb_id, soup=None):
        self.imdb_id = extract_id(imdb_id)
        url = make_url(self.imdb_id)
        self.url = url

        if not soup:
            page = requests.get(url)
            soup = get_soup(page.text)

        title_wrapper = soup.find('div', attrs={'class': 'title_wrapper'})

        data = json.loads(
            soup.find('script', {
                'type': 'application/ld+json'
            }).text)

        if not data:
            raise PluginError(
                'IMDB parser needs updating, imdb format changed. Please report on Github.'
            )

        # Parse stuff from the title-overview section
        name_elem = data['name']
        if name_elem:
            self.name = name_elem.strip()
        else:
            log.error(
                'IMDB parser possibly needs updating, please report on Github.'
            )
            raise PluginError('Unable to set imdb_name for %s from %s' %
                              (self.imdb_id, self.url))

        year = soup.find('span', attrs={'id': 'titleYear'})
        if year:
            m = re.search(r'([0-9]{4})', year.text)
            if m:
                self.year = int(m.group(1))

        if not self.year:
            log.debug('No year found for %s', self.imdb_id)

        mpaa_rating_elem = data.get('contentRating')
        if mpaa_rating_elem:
            self.mpaa_rating = mpaa_rating_elem
        else:
            log.debug('No rating found for %s', self.imdb_id)

        photo_elem = data.get('image')
        if photo_elem:
            self.photo = photo_elem
        else:
            log.debug('No photo found for %s', self.imdb_id)

        original_name_elem = title_wrapper.find('div',
                                                {'class': 'originalTitle'})
        if original_name_elem:
            self.name = title_wrapper.find('h1').contents[0].strip()
            self.original_name = original_name_elem.contents[0].strip().strip(
                '"')
        else:
            log.debug('No original title found for %s', self.imdb_id)

        votes_elem = data.get('aggregateRating', {}).get('ratingCount')
        if votes_elem:
            self.votes = str_to_int(votes_elem) if not isinstance(
                votes_elem, int) else votes_elem
        else:
            log.debug('No votes found for %s', self.imdb_id)

        score_elem = data.get('aggregateRating', {}).get('ratingValue')
        if score_elem:
            self.score = float(score_elem)
        else:
            log.debug('No score found for %s', self.imdb_id)

        meta_score_elem = soup.find(attrs={'class': 'metacriticScore'})
        if meta_score_elem:
            self.meta_score = str_to_int(meta_score_elem.text)
        else:
            log.debug('No Metacritic score found for %s', self.imdb_id)

        # get director(s)
        directors = data.get('director', [])
        if not isinstance(directors, list):
            directors = [directors]

        for director in directors:
            if director['@type'] != 'Person':
                continue
            director_id = extract_id(director['url'])
            director_name = director['name']
            self.directors[director_id] = director_name

        # get writer(s)
        writers = data.get('creator', [])
        if not isinstance(writers, list):
            writers = [writers]

        for writer in writers:
            if writer['@type'] != 'Person':
                continue
            writer_id = extract_id(writer['url'])
            writer_name = writer['name']
            self.writers[writer_id] = writer_name

        # Details section
        title_details = soup.find('div', attrs={'id': 'titleDetails'})
        if title_details:
            # get languages
            for link in title_details.find_all(
                    'a',
                    href=re.compile(r'^/search/title\?title_type=feature'
                                    r'&primary_language=')):
                lang = link.text.strip().lower()
                if lang not in self.languages:
                    self.languages.append(lang.strip())

        # Storyline section
        storyline = soup.find('div', attrs={'id': 'titleStoryLine'})
        if storyline:
            plot_elem = storyline.find('p')
            if plot_elem:
                # Remove the "Written By" part.
                if plot_elem.em:
                    plot_elem.em.replace_with('')
                self.plot_outline = plot_elem.text.strip()
            else:
                log.debug('No storyline found for %s', self.imdb_id)

        genres = data.get('genre', [])
        if not isinstance(genres, list):
            genres = [genres]

        self.genres = [g.strip().lower() for g in genres]

        # Cast section
        cast = soup.find('table', attrs={'class': 'cast_list'})
        if cast:
            for actor in cast.select('tr > td:nth-of-type(2) > a'):
                actor_id = extract_id(actor['href'])
                actor_name = actor.text.strip()
                # tag instead of name
                if isinstance(actor_name, Tag):
                    actor_name = None
                self.actors[actor_id] = actor_name
Example #12
    def parse(self, imdb_id):
        self.imdb_id = extract_id(imdb_id)
        url = make_url(self.imdb_id)
        self.url = url
        page = requests.get(url)
        soup = get_soup(page.content)

        # get photo
        tag_photo = soup.find('div', attrs={'class': 'photo'})
        if tag_photo:
            tag_img = tag_photo.find('img')
            if tag_img:
                self.photo = tag_img.get('src')
                log.debug('Detected photo: %s' % self.photo)

        # get rating. Always the first absmiddle.
        tag_infobar_div = soup.find('div', attrs={'class': 'infobar'})
        if tag_infobar_div:
            tag_mpaa_rating = tag_infobar_div.find('img', attrs={'class': 'absmiddle'})
            if tag_mpaa_rating:
                if tag_mpaa_rating['alt'] != tag_mpaa_rating['title']:
                    # If we've found something of class absmiddle in the infobar,
                    # it should be mpaa_rating, since that's the only one in there.
                    log.warning("MPAA rating alt and title don't match for URL %s - plugin needs an update?" % url)
                else:
                    self.mpaa_rating = tag_mpaa_rating['alt']
                    log.debug('Detected mpaa rating: %s' % self.mpaa_rating)
            else:
                log.debug('Unable to match signature of mpaa rating for %s - could be a TV episode, or plugin needs update?' % url)
        else:
            # We should match the infobar, it's an integral part of the IMDB page.
            log.warning('Unable to get infodiv class for %s - plugin needs update?' % url)

        # get name
        tag_name = soup.find('h1')
        if tag_name:
            if tag_name.next:
                # Handle a page not found in IMDB. tag_name.string is
                # "<br/> Page Not Found" and there is no next tag. Thus, None.
                if tag_name.next.string is not None:
                    self.name = tag_name.next.string.strip()
                    log.debug('Detected name: %s' % self.name)
        else:
            log.warning('Unable to get name for %s - plugin needs update?' % url)

        # detect if movie is eligible for ratings
        rating_ineligible = soup.find('div', attrs={'class': 'rating-ineligible'})
        if rating_ineligible:
            log.debug('movie is not eligible for ratings')
        else:
            # get votes
            tag_votes = soup.find(itemprop='ratingCount')
            if tag_votes:
                self.votes = str_to_int(tag_votes.string) or 0
                log.debug('Detected votes: %s' % self.votes)
            else:
                log.warning('Unable to get votes for %s - plugin needs update?' % url)

            # get score
            span_score = soup.find(itemprop='ratingValue')
            if span_score:
                try:
                    self.score = float(span_score.string)
                    log.debug('Detected score: %s' % self.score)
                except ValueError:
                    log.debug('tag_score %s is not valid float' % span_score.contents[0])
            else:
                log.warning('Unable to get score for %s - plugin needs update?' % url)

        # get genres
        for link in soup.find_all('a', attrs={'itemprop': 'genre'}):
            self.genres.append(unicode(link.contents[0].lower()))

        # get languages
        for link in soup.find_all('a', attrs={'itemprop': 'inLanguage'}):
            # skip non-primary languages "(a few words)", etc.
            m = re.search(r'(?x) \( [^()]* \b few \b', unicode(link.next_sibling))
            if not m:
                lang = unicode(link.contents[0].lower())
                if lang not in self.languages:
                    self.languages.append(lang.strip())

        # get year
        tag_year = soup.find('a', attrs={'href': re.compile(r'^/year/\d+')})
        if tag_year:
            self.year = int(tag_year.contents[0])
            log.debug('Detected year: %s' % self.year)
        elif soup.head.title:
            m = re.search(r'(\d{4})\)', unicode(soup.head.title.string))
            if m:
                self.year = int(m.group(1))
                log.debug('Detected year: %s' % self.year)
            else:
                log.warning('Unable to get year for %s (regexp mismatch) - plugin needs update?' % url)
        else:
            log.warning('Unable to get year for %s (missing title) - plugin needs update?' % url)

        # get main cast
        tag_cast = soup.find('table', 'cast_list')
        if tag_cast:
            for actor in tag_cast.find_all('a', href=re.compile('/name/nm')):
                actor_id = extract_id(actor['href'])
                actor_name = unicode(actor.contents[0])
                # tag instead of name
                if isinstance(actor_name, Tag):
                    actor_name = None
                self.actors[actor_id] = actor_name

        # get director(s)
        h4_director = soup.find('h4', text=re.compile('Director'))
        if h4_director:
            for director in h4_director.parent.parent.find_all('a', href=re.compile('/name/nm')):
                director_id = extract_id(director['href'])
                director_name = unicode(director.contents[0])
                # tag instead of name
                if isinstance(director_name, Tag):
                    director_name = None
                self.directors[director_id] = director_name

        log.debug('Detected genres: %s' % self.genres)
        log.debug('Detected languages: %s' % self.languages)
        log.debug('Detected director(s): %s' % ', '.join(self.directors))
        log.debug('Detected actors: %s' % ', '.join(self.actors))

        # get plot
        h2_plot = soup.find('h2', text='Storyline')
        if h2_plot:
            p_plot = h2_plot.find_next('p')
            if p_plot:
                self.plot_outline = p_plot.next.string.strip()
                log.debug('Detected plot outline: %s' % self.plot_outline)
            else:
                log.debug('Plot does not have p-tag')
        else:
            log.debug('Failed to find plot')
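
The legacy markup-based parser above leans on microdata (itemprop) attributes and CSS classes. A small self-contained sketch of the BeautifulSoup lookups it depends on, run against illustrative markup rather than real IMDB HTML:

# Sketch of the lookups used above (bs4 only; the HTML is illustrative).
from bs4 import BeautifulSoup

html = '''
<div class="infobar"><img class="absmiddle" alt="PG-13" title="PG-13"/></div>
<span itemprop="ratingCount">123,456</span>
<span itemprop="ratingValue">7.9</span>
'''
soup = BeautifulSoup(html, 'html.parser')

infobar = soup.find('div', attrs={'class': 'infobar'})
mpaa_img = infobar.find('img', attrs={'class': 'absmiddle'})
print(mpaa_img['alt'])                                   # PG-13
print(soup.find(itemprop='ratingCount').string)          # 123,456
print(float(soup.find(itemprop='ratingValue').string))   # 7.9
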
Exemplo n.º 13
0
    def parse(self, imdb_id, soup=None):
        self.imdb_id = extract_id(imdb_id)
        url = make_url(self.imdb_id)
        self.url = url

        if not soup:
            page = requests.get(url)
            soup = get_soup(page.text)

        data = json.loads(
            soup.find('script', {
                'type': 'application/ld+json'
            }).string)
        if not data:
            raise plugin.PluginError(
                'IMDB parser needs updating, imdb format changed. Please report on Github.'
            )

        props_data = json.loads(
            soup.find('script', {
                'type': 'application/json'
            }).string)
        if (not props_data or not props_data.get('props')
                or not props_data.get('props').get('pageProps')):
            raise plugin.PluginError(
                'IMDB parser needs updating, imdb props_data format changed. Please report on Github.'
            )

        above_the_fold_data = props_data['props']['pageProps'].get(
            'aboveTheFoldData')
        if not above_the_fold_data:
            raise plugin.PluginError(
                'IMDB parser needs updating, imdb above_the_fold_data format changed. Please report on Github.'
            )

        title = above_the_fold_data.get('titleText')
        if title:
            self.name = title.get('text')
        if not self.name:
            raise plugin.PluginError(
                'IMDB parser needs updating, imdb above_the_fold_data format changed for title. Please report on Github.'
            )

        original_name = above_the_fold_data.get('originalTitleText')
        if original_name:
            self.original_name = original_name.get('text')

        if not self.original_name:
            logger.debug('No original title found for {}', self.imdb_id)

        # NOTE: We cannot rely on the .get(x, {}) default approach here,
        # because the data returned by IMDB has explicit null values for fields that do not exist.
        if above_the_fold_data.get('releaseYear'):
            self.year = above_the_fold_data['releaseYear'].get('year')
        if not self.year:
            logger.debug('No year found for {}', self.imdb_id)

        self.mpaa_rating = data.get('contentRating')
        if not self.mpaa_rating:
            logger.debug('No rating found for {}', self.imdb_id)

        self.photo = data.get('image')
        if not self.photo:
            logger.debug('No photo found for {}', self.imdb_id)

        rating_data = data.get('aggregateRating')
        if rating_data:
            rating_count = rating_data.get('ratingCount')
            if rating_count:
                self.votes = (str_to_int(rating_count)
                              if not isinstance(rating_count, int) else
                              rating_count)
            else:
                logger.debug('No votes found for {}', self.imdb_id)

            score = rating_data.get('ratingValue')
            if score:
                self.score = float(score)
            else:
                logger.debug('No score found for {}', self.imdb_id)

        meta_critic = above_the_fold_data.get('metacritic')
        if meta_critic:
            meta_score = meta_critic.get('metascore')
            if meta_score:
                self.meta_score = meta_score.get('score')
        if not self.meta_score:
            logger.debug('No Metacritic score found for {}', self.imdb_id)

        # get director(s)
        directors = data.get('director', [])
        if not isinstance(directors, list):
            directors = [directors]

        for director in directors:
            if director['@type'] != 'Person':
                continue
            director_id = extract_id(director['url'])
            director_name = director['name']
            self.directors[director_id] = director_name

        # get writer(s)
        writers = data.get('creator', [])
        if not isinstance(writers, list):
            writers = [writers]

        for writer in writers:
            if writer['@type'] != 'Person':
                continue
            writer_id = extract_id(writer['url'])
            writer_name = writer['name']
            self.writers[writer_id] = writer_name

        # Details section
        main_column_data = props_data['props']['pageProps'].get(
            'mainColumnData')
        if not main_column_data:
            raise plugin.PluginError(
                'IMDB parser needs updating, imdb main_column_data format changed. Please report on Github.'
            )

        for language in (main_column_data.get('spokenLanguages')
                         or {}).get('spokenLanguages', []):
            self.languages.append(language['text'].lower())

        # Storyline section
        # NOTE: We cannot rely on the .get(x, {}) default approach here,
        # because the data returned by IMDB has explicit null values for fields that do not exist.
        summaries = main_column_data.get('summaries') or {}
        summary_edges = summaries.get('edges') or []
        if len(summary_edges) > 0:
            edge_node = summary_edges[0].get('node') or {}
            plot_text = edge_node.get('plotText') or {}
            # Strip out html
            plot_html = get_soup(plot_text.get('plaidHtml'))
            if plot_html:
                self.plot_outline = plot_html.text
        if not self.plot_outline:
            logger.debug('No storyline found for {}', self.imdb_id)

        storyline_keywords = main_column_data.get('storylineKeywords') or {}
        for keyword_node in storyline_keywords.get('edges') or []:
            keyword = keyword_node.get('node') or {}
            if keyword.get('text'):
                self.plot_keywords.append(keyword['text'].lower())

        genres = (above_the_fold_data.get('genres', {}) or {}).get('genres') or []
        self.genres = [g['text'].lower() for g in genres]

        # Cast section
        cast_data = main_column_data.get('cast', {}) or {}
        for cast_node in cast_data.get('edges') or []:
            actor_node = (cast_node.get('node') or {}).get('name') or {}
            actor_id = actor_node.get('id')
            actor_name = (actor_node.get('nameText') or {}).get('text')
            if actor_id and actor_name:
                self.actors[actor_id] = actor_name

        principal_cast_data = main_column_data.get('principalCast', []) or []
        if principal_cast_data:
            for cast_node in principal_cast_data[0].get('credits') or []:
                actor_node = cast_node.get('name') or {}
                actor_id = actor_node.get('id')
                actor_name = (actor_node.get('nameText') or {}).get('text')
                if actor_id and actor_name:
                    self.actors[actor_id] = actor_name
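
The JSON-driven parser above reads everything from two embedded script blocks instead of scraping markup. A hedged sketch of that extraction step against illustrative payloads (real IMDB pages embed far larger documents):

# Sketch of the two-script extraction the parser builds on (illustrative JSON).
import json
from bs4 import BeautifulSoup

html = '''
<script type="application/ld+json">{"@type": "Movie", "name": "Example Movie", "contentRating": "PG"}</script>
<script type="application/json">{"props": {"pageProps": {"aboveTheFoldData": {"releaseYear": {"year": 1999}}}}}</script>
'''
soup = BeautifulSoup(html, 'html.parser')

ld_data = json.loads(soup.find('script', {'type': 'application/ld+json'}).string)
props_data = json.loads(soup.find('script', {'type': 'application/json'}).string)

print(ld_data['name'], ld_data['contentRating'])
print(props_data['props']['pageProps']['aboveTheFoldData']['releaseYear']['year'])
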
Exemplo n.º 14
0
    def parse(self, imdb_id, soup=None):
        self.imdb_id = extract_id(imdb_id)
        url = make_url(self.imdb_id)
        self.url = url

        if not soup:
            page = requests.get(url)
            soup = get_soup(page.text)

        title_wrapper = soup.find('div', attrs={'class': 'title_wrapper'})

        data = json.loads(soup.find('script', {'type': 'application/ld+json'}).text)

        if not data:
            raise PluginError('IMDB parser needs updating, imdb format changed. Please report on Github.')

        # Parse stuff from the title-overview section
        name_elem = data['name']
        if name_elem:
            self.name = name_elem.strip()
        else:
            log.error('Possible IMDB parser needs updating, Please report on Github.')
            raise PluginError('Unable to set imdb_name for %s from %s' % (self.imdb_id, self.url))

        year = soup.find('span', attrs={'id': 'titleYear'})
        if year:
            m = re.search(r'([0-9]{4})', year.text)
            if m:
                self.year = int(m.group(1))

        if not self.year:
            log.debug('No year found for %s', self.imdb_id)

        mpaa_rating_elem = data.get('contentRating')
        if mpaa_rating_elem:
            self.mpaa_rating = mpaa_rating_elem
        else:
            log.debug('No rating found for %s', self.imdb_id)

        photo_elem = data.get('image')
        if photo_elem:
            self.photo = photo_elem
        else:
            log.debug('No photo found for %s', self.imdb_id)

        original_name_elem = title_wrapper.find('div', {'class': 'originalTitle'})
        if original_name_elem:
            self.name = title_wrapper.find('h1').contents[0].strip()
            self.original_name = original_name_elem.contents[0].strip().strip('"')
        else:
            log.debug('No original title found for %s', self.imdb_id)

        votes_elem = data.get('aggregateRating', {}).get('ratingCount')
        if votes_elem:
            self.votes = str_to_int(votes_elem) if not isinstance(votes_elem, int) else votes_elem
        else:
            log.debug('No votes found for %s', self.imdb_id)

        score_elem = data.get('aggregateRating', {}).get('ratingValue')
        if score_elem:
            self.score = float(score_elem)
        else:
            log.debug('No score found for %s', self.imdb_id)

        meta_score_elem = soup.find(attrs={'class': 'metacriticScore'})
        if meta_score_elem:
            self.meta_score = str_to_int(meta_score_elem.text)
        else:
            log.debug('No Metacritic score found for %s', self.imdb_id)

        # get director(s)
        directors = data.get('director', [])
        if not isinstance(directors, list):
            directors = [directors]

        for director in directors:
            if director['@type'] != 'Person':
                continue
            director_id = extract_id(director['url'])
            director_name = director['name']
            self.directors[director_id] = director_name

        # get writer(s)
        writers = data.get('creator', [])
        if not isinstance(writers, list):
            writers = [writers]

        for writer in writers:
            if writer['@type'] != 'Person':
                continue
            writer_id = extract_id(writer['url'])
            writer_name = writer['name']
            self.writers[writer_id] = writer_name

        # Details section
        title_details = soup.find('div', attrs={'id': 'titleDetails'})
        if title_details:
            # get languages
            for link in title_details.find_all('a', href=re.compile(r'^/search/title\?title_type=feature'
                                                                    r'&primary_language=')):
                lang = link.text.strip().lower()
                if lang not in self.languages:
                    self.languages.append(lang.strip())

        # Storyline section
        storyline = soup.find('div', attrs={'id': 'titleStoryLine'})
        if storyline:
            plot_elem = storyline.find('p')
            if plot_elem:
                # Remove the "Written By" part.
                if plot_elem.em:
                    plot_elem.em.replace_with('')
                self.plot_outline = plot_elem.text.strip()
            else:
                log.debug('No storyline found for %s', self.imdb_id)

        genres = data.get('genre', [])
        if not isinstance(genres, list):
            genres = [genres]

        self.genres = [g.strip().lower() for g in genres]

        # Cast section
        cast = soup.find('table', attrs={'class': 'cast_list'})
        if cast:
            for actor in cast.select('tr > td:nth-of-type(2) > a'):
                actor_id = extract_id(actor['href'])
                actor_name = actor.text.strip()
                # tag instead of name
                if isinstance(actor_name, Tag):
                    actor_name = None
                self.actors[actor_id] = actor_name
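
The cast lookup above uses a CSS selector through BeautifulSoup's select(). A short sketch of the same pattern against illustrative table markup (the name ids are placeholders):

# Sketch of the CSS-selector cast lookup (illustrative markup).
from bs4 import BeautifulSoup

html = '''
<table class="cast_list">
  <tr><td class="primary_photo"></td><td><a href="/name/nm0000001/">First Actor</a></td></tr>
  <tr><td class="primary_photo"></td><td><a href="/name/nm0000002/">Second Actor</a></td></tr>
</table>
'''
soup = BeautifulSoup(html, 'html.parser')
cast = soup.find('table', attrs={'class': 'cast_list'})

# the second <td> in each row holds the actor link, as in the selector above
for actor in cast.select('tr > td:nth-of-type(2) > a'):
    print(actor['href'], actor.text.strip())
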
Exemplo n.º 15
0
    def search(self, name):
        """Return array of movie details (dict)"""
        log.debug('Searching: %s', name)
        url = u'https://www.imdb.com/find'
        # This may include Shorts and TV series in the results
        params = {'q': name, 's': 'tt', }

        log.debug('Search query: %s', repr(url))
        page = requests.get(url, params=params)
        actual_url = page.url

        movies = []
        soup = get_soup(page.text)
        # in case we got redirected to movie page (perfect match)
        re_m = re.match(r'.*\.imdb\.com/title/tt\d+/', actual_url)
        if re_m:
            actual_url = re_m.group(0)
            imdb_id = extract_id(actual_url)
            movie_parse = ImdbParser()
            movie_parse.parse(imdb_id, soup=soup)
            log.debug('Perfect hit. Search got redirected to %s', actual_url)
            movie = {
                'match': 1.0,
                'name': movie_parse.name,
                'imdb_id': imdb_id,
                'url': make_url(imdb_id),
                'year': movie_parse.year
            }
            movies.append(movie)
            return movies

        section_table = soup.find('table', 'findList')
        if not section_table:
            log.debug('results table not found')
            return

        rows = section_table.find_all('tr')
        if not rows:
            log.debug('Titles section does not have links')
        for count, row in enumerate(rows):
            # Title search gives a lot of results, only check the first ones
            if count > self.max_results:
                break

            result_text = row.find('td', 'result_text')
            movie = {}
            additional = re.findall(r'\((.*?)\)', result_text.text)
            if len(additional) > 0:
                if re.match(r'^\d{4}$', additional[-1]):
                    movie['year'] = str_to_int(additional[-1])
                elif len(additional) > 1:
                    movie['year'] = str_to_int(additional[-2])
                    if additional[-1] not in ['TV Movie', 'Video']:
                        log.debug('skipping %s', result_text.text)
                        continue
            primary_photo = row.find('td', 'primary_photo')
            movie['thumbnail'] = primary_photo.find('a').find('img').get('src')

            link = result_text.find_next('a')
            movie['name'] = link.text
            movie['imdb_id'] = extract_id(link.get('href'))
            movie['url'] = make_url(movie['imdb_id'])
            log.debug('processing name: %s url: %s' % (movie['name'], movie['url']))

            # calc & set best matching ratio
            seq = difflib.SequenceMatcher(lambda x: x == ' ', movie['name'].title(), name.title())
            ratio = seq.ratio()

            # check if some of the akas have better ratio
            for aka in link.parent.find_all('i'):
                aka = aka.next.string
                match = re.search(r'".*"', aka)
                if not match:
                    log.debug('aka `%s` is invalid' % aka)
                    continue
                aka = match.group(0).replace('"', '')
                log.trace('processing aka %s' % aka)
                seq = difflib.SequenceMatcher(lambda x: x == ' ', aka.title(), name.title())
                aka_ratio = seq.ratio()
                if aka_ratio > ratio:
                    ratio = aka_ratio * self.aka_weight
                    log.debug('- aka `%s` matches better to `%s` ratio %s (weighted to %s)' %
                              (aka, name, aka_ratio, ratio))

            # prioritize items by position
            position_ratio = (self.first_weight - 1) / (count + 1) + 1
            log.debug('- prioritizing based on position %s `%s`: %s' % (count, movie['url'], position_ratio))
            ratio *= position_ratio

            # store ratio
            movie['match'] = ratio
            movies.append(movie)

        movies.sort(key=lambda x: x['match'], reverse=True)
        return movies
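
The ranking above combines a difflib similarity ratio with a down-weighted aka match and a position boost. A sketch of that scoring math with illustrative weights; in the plugin they come from self.aka_weight and self.first_weight:

# Sketch of the match-scoring math used above (stdlib only; weights are illustrative).
import difflib

def match_score(result_name, query, position, aka_names=(), aka_weight=0.95, first_weight=1.1):
    # base similarity, treating spaces as junk like the search code above
    ratio = difflib.SequenceMatcher(lambda x: x == ' ', result_name.title(), query.title()).ratio()
    # an aka title may match better, but its ratio is down-weighted
    for aka in aka_names:
        aka_ratio = difflib.SequenceMatcher(lambda x: x == ' ', aka.title(), query.title()).ratio()
        if aka_ratio > ratio:
            ratio = aka_ratio * aka_weight
    # earlier results get a boost: (first_weight - 1) / (position + 1) + 1
    return ratio * ((first_weight - 1) / (position + 1) + 1)

print(match_score('The Matrix', 'Matrix', position=0))
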
Exemplo n.º 16
0
    def parse(self, imdb_id):
        self.imdb_id = extract_id(imdb_id)
        url = make_url(self.imdb_id)
        self.url = url
        page = requests.get(url)
        soup = get_soup(page.content)

        # get photo
        tag_photo = soup.find('div', attrs={'class': 'photo'})
        if tag_photo:
            tag_img = tag_photo.find('img')
            if tag_img:
                self.photo = tag_img.get('src')
                log.debug('Detected photo: %s' % self.photo)

        # get rating. Always the first absmiddle.
        tag_infobar_div = soup.find('div', attrs={'class': 'infobar'})
        if tag_infobar_div:
            tag_mpaa_rating = tag_infobar_div.find(
                'img', attrs={'class': 'absmiddle'})
            if tag_mpaa_rating:
                if tag_mpaa_rating['alt'] != tag_mpaa_rating['title']:
                    # If we've found something of class absmiddle in the infobar,
                    # it should be mpaa_rating, since that's the only one in there.
                    log.warning(
                        "MPAA rating alt and title don't match for URL %s - plugin needs an update?"
                        % url)
                else:
                    self.mpaa_rating = tag_mpaa_rating['alt']
                    log.debug('Detected mpaa rating: %s' % self.mpaa_rating)
            else:
                log.debug(
                    'Unable to match signature of mpaa rating for %s - could be a TV episode, or plugin needs update?'
                    % url)
        else:
            # We should match the infobar, it's an integral part of the IMDB page.
            log.warning(
                'Unable to get infodiv class for %s - plugin needs update?' %
                url)

        # get name
        tag_name = soup.find('h1')
        if tag_name:
            if tag_name.next:
                # Handle a page not found in IMDB. tag_name.string is
                # "<br/> Page Not Found" and there is no next tag. Thus, None.
                if tag_name.next.string is not None:
                    self.name = tag_name.next.string.strip()
                    log.debug('Detected name: %s' % self.name)
        else:
            log.warning('Unable to get name for %s - plugin needs update?' %
                        url)

        # detect if movie is eligible for ratings
        rating_ineligible = soup.find('div',
                                      attrs={'class': 'rating-ineligible'})
        if rating_ineligible:
            log.debug('movie is not eligible for ratings')
        else:
            # get votes
            tag_votes = soup.find(itemprop='ratingCount')
            if tag_votes:
                self.votes = str_to_int(tag_votes.string) or 0
                log.debug('Detected votes: %s' % self.votes)
            else:
                log.warning(
                    'Unable to get votes for %s - plugin needs update?' % url)

            # get score
            span_score = soup.find(itemprop='ratingValue')
            if span_score:
                try:
                    self.score = float(span_score.string)
                    log.debug('Detected score: %s' % self.score)
                except ValueError:
                    log.debug('tag_score %s is not valid float' %
                              span_score.contents[0])
            else:
                log.warning(
                    'Unable to get score for %s - plugin needs update?' % url)

        # get genres
        for link in soup.find_all('a', attrs={'itemprop': 'genre'}):
            self.genres.append(unicode(link.contents[0].lower()))

        # get languages
        for link in soup.find_all('a', attrs={'itemprop': 'inLanguage'}):
            # skip non-primary languages "(a few words)", etc.
            m = re.search(r'(?x) \( [^()]* \b few \b',
                          unicode(link.next_sibling))
            if not m:
                lang = unicode(link.contents[0].lower())
                if lang not in self.languages:
                    self.languages.append(lang.strip())

        # get year
        tag_year = soup.find('a', attrs={'href': re.compile(r'^/year/\d+')})
        if tag_year:
            self.year = int(tag_year.contents[0])
            log.debug('Detected year: %s' % self.year)
        elif soup.head.title:
            m = re.search(r'(\d{4})\)', unicode(soup.head.title.string))
            if m:
                self.year = int(m.group(1))
                log.debug('Detected year: %s' % self.year)
            else:
                log.warning(
                    'Unable to get year for %s (regexp mismatch) - plugin needs update?'
                    % url)
        else:
            log.warning(
                'Unable to get year for %s (missing title) - plugin needs update?'
                % url)

        # get main cast
        tag_cast = soup.find('table', 'cast_list')
        if tag_cast:
            for actor in tag_cast.find_all('a', href=re.compile('/name/nm')):
                actor_id = extract_id(actor['href'])
                actor_name = unicode(actor.contents[0])
                # tag instead of name
                if isinstance(actor_name, Tag):
                    actor_name = None
                self.actors[actor_id] = actor_name

        # get director(s)
        h4_director = soup.find('h4', text=re.compile('Director'))
        if h4_director:
            for director in h4_director.parent.parent.find_all(
                    'a', href=re.compile('/name/nm')):
                director_id = extract_id(director['href'])
                director_name = unicode(director.contents[0])
                # tag instead of name
                if isinstance(director_name, Tag):
                    director_name = None
                self.directors[director_id] = director_name

        log.debug('Detected genres: %s' % self.genres)
        log.debug('Detected languages: %s' % self.languages)
        log.debug('Detected director(s): %s' % ', '.join(self.directors))
        log.debug('Detected actors: %s' % ', '.join(self.actors))

        # get plot
        h2_plot = soup.find('h2', text='Storyline')
        if h2_plot:
            p_plot = h2_plot.find_next('p')
            if p_plot:
                self.plot_outline = p_plot.next.string.strip()
                log.debug('Detected plot outline: %s' % self.plot_outline)
            else:
                log.debug('Plot does not have p-tag')
        else:
            log.debug('Failed to find plot')