Пример #1
0
 def localTest_scoreMediaTitleMatch(self):
     """ Checks scoreMediaTitleMatch against two known title/year combinations.
     """
     # Each case is (expected score, positional arguments for the scorer).
     scoreCases = (
         (92, ('Gladiatory Rima', '2012',
               u'Гладиаторы Рима',
               'Gladiatori di Roma', '2012', 3)),
         (94, (u'Кавказская пленница', '1966',
               u'Кавказская пленница, или Новые приключения Шурика', None,
               '1966', 0)),
     )
     for expectedScore, scorerArgs in scoreCases:
         actualScore = common.scoreMediaTitleMatch(*scorerArgs)
         self._assertEquals(expectedScore, actualScore, 'Wrong score.')
  def queryKinoPoisk(self, mediaName, mediaYear):
    """ Searches for a movie on KinoPoisk.

        Returns title results as they are returned (no sorting is done here!).

        @param mediaName: media title to search for.
        @param mediaYear: media year, passed on to the title scorer.
        @return: list of [kinoPoiskId, title, year, score] items; empty when
                 nothing was found.
    """
    results = []
    try:
      encodedName = urllib.quote(mediaName.encode(S.ENCODING_KINOPOISK_PAGE))
    except (UnicodeError, LookupError):
      # The name does not fit the page encoding (or the codec is unknown):
      # fall back to UTF-8.
      encodedName = urllib.quote(mediaName.encode('utf-8'))
    page = self.httpUtils.requestAndParseHtmlPage(S.KINOPOISK_SEARCH_SIMPLE % encodedName)
    if page is None:
      self.log.Warn(' ### nothing was found on kinopoisk for media name "%s"' % mediaName)
      return results

    # The page was fetched; collect every movie entry from it.
    self.log.Debug('got a KinoPoisk query results page to parse...')

    reobj = re.compile(r'<span><a href="http://m\.kinopoisk\.ru/movie/(\d+)/">(.+?)</a><br />(.+?)</span>')
    result = reobj.findall(page)

    # Inspect query results titles and score them.
    self.log.Debug('found %d results (div info tags)' % len(result))
    # Only a trailing ", YYYY" is treated as the release year; a comma that is
    # part of the title itself must stay in the title.
    titleYearMatcher = re.compile(r'^(.+?), (\d\d\d\d)$')
    for itemIndex, (itemKinoPoiskId, itemTitleitemYear, itemAltTitle) in enumerate(result):
      itemAltTitle = itemAltTitle.replace('&nbsp;', '')
      titleYearMatch = titleYearMatcher.match(itemTitleitemYear.strip())
      if titleYearMatch is None:
        itemYear = None
        itemTitle = itemTitleitemYear
      else:
        itemTitle, itemYear = titleYearMatch.groups()
      itemScore = common.scoreMediaTitleMatch(mediaName, mediaYear, itemTitle, itemAltTitle, itemYear, itemIndex)
      results.append([itemKinoPoiskId, itemTitle, itemYear, itemScore])

    return results
Пример #3
0
def searchForImdbTitles(mediaName, mediaYear, lang):
  """ Looks for media title matches on TMDb and scores each of them.

      @param mediaName: media title to search for (lower-cased before use).
      @param mediaYear: media year, used when scoring the candidates.
      @param lang: not used by this implementation.
      @return: list of {'id', 'name', 'year', 'score'} dicts in the order the
               movies appear in the response; an empty list when nothing was
               found.
  """
  mediaName = mediaName.lower()
  page = common.getElementFromHttpRequest(TMDB_GETINFO % mediaName.replace(' ', '%20'), TMDB_PAGE_ENCODING)
  matches = []
  if page is None:
    Log.Warn('nothing was found on tmdb for media name "%s"' % mediaName)
    # Always hand back a list so that callers can iterate unconditionally.
    return matches

  # The index only advances for movies that were parsed successfully.
  itemIndex = 0
  for movieElem in page.xpath('//movies/movie'):
    try:
      imdbId = common.getXpathRequiredText(movieElem, './imdb_id/text()')
      title = common.getXpathRequiredText(movieElem, './name/text()')
      altTitle = common.getXpathOptionalText(movieElem, './alternative_name/text()')
      releaseDate = common.getXpathOptionalText(movieElem, './released/text()')
      year = common.getReOptionalGroup(MATCHER_RELEASED, releaseDate, 0)
      score = common.scoreMediaTitleMatch(mediaName, mediaYear, title, altTitle, year, itemIndex)
      matches.append({'id': imdbId, 'name': title, 'year': year, 'score': score})
      itemIndex += 1
    except Exception:
      Log.Warn('failed to parse movie element')

  return matches
Пример #4
0
    def searchForImdbTitles(self,
                            mediaName,
                            mediaYear,
                            lang='ru',
                            searchByTitleOnly=False):
        """ Queries TMDb for the given media name and returns scored matches.

        @param mediaName: CGI escaped media name string.
        @param mediaYear: (optional) Filter the results release dates to
                   matches that include this value.
        @param lang: (optional) ISO 639-1 code.
        @param searchByTitleOnly: (optional) when True, an empty year is sent
                   with the query instead of mediaYear.
        @return: list of dicts with 'id', 'title', 'year' and 'score' keys,
                   sorted by score in descending order.
        """
        mediaName = mediaName.lower()
        yearSearch = '' if searchByTitleOnly else mediaYear
        search = Search(self.httpUtils)
        response = search.movie(
            query=mediaName.encode(S.TMDB_PAGE_ENCODING),
            year=yearSearch,
            language=lang)

        results = []
        if response is not None:
            # The index only advances for entries that were parsed successfully.
            itemIndex = 0
            for movie in search.results:
                try:
                    movieId = str(movie['id'])
                    movieTitle = movie['title']
                    originalTitle = movie['original_title']
                    releaseYear = common.getReOptionalGroup(
                        MATCHER_RELEASED, movie['release_date'], 0)
                    matchScore = common.scoreMediaTitleMatch(
                        mediaName, mediaYear, movieTitle, originalTitle,
                        releaseYear, itemIndex)
                    results.append({'id': movieId,
                                    'title': movieTitle,
                                    'year': releaseYear,
                                    'score': matchScore})
                    itemIndex += 1
                except:
                    self.log.Warn('failed to parse movie element')
        else:
            self.log.Warn('nothing was found on tmdb for media name "%s"' %
                          mediaName)

        # Best score first.
        orderedResults = list(results)
        orderedResults.sort(key=lambda entry: entry['score'], reverse=True)
        if self.isDebug:
            self.log.Debug('Search produced %d results:' % len(orderedResults))
            for position, entry in enumerate(orderedResults):
                self.log.Debug(
                    ' ... %d: id="%s", title="%s", year="%s", score="%d".' %
                    (position, entry['id'], entry['title'],
                     str(entry['year']), entry['score']))
        return orderedResults
Пример #5
0
    def queryKinoPoisk(self, mediaName, mediaYear):
        """ Searches for a movie on KinoPoisk.

            Returns title results as they are returned (no sorting is done here!).

            @param mediaName: media title to search for.
            @param mediaYear: media year, passed on to the title scorer.
            @return: list of [kinoPoiskId, title, year, score] items; empty
                     when nothing was found.
        """
        results = []
        try:
            encodedName = urllib.quote(
                mediaName.encode(S.ENCODING_KINOPOISK_PAGE))
        except (UnicodeError, LookupError):
            # The name does not fit the page encoding (or the codec is
            # unknown): fall back to UTF-8.
            encodedName = urllib.quote(mediaName.encode('utf-8'))
        page = self.httpUtils.requestAndParseHtmlPage(
            S.KINOPOISK_SEARCH_SIMPLE % encodedName)
        if page is None:
            self.log.Warn(
                ' ### nothing was found on kinopoisk for media name "%s"' %
                mediaName)
            return results

        # The page was fetched; collect every movie entry from it.
        self.log.Debug('**** Finding %s (%s) got a KinoPoisk...' %
                       (mediaName, mediaYear))

        reobj = re.compile(
            r'<span><a href="http://m\.kinopoisk\.ru/movie/(\d+)/">(.+?)</a><br />(.+?)</span>'
        )
        result = reobj.findall(page)

        # Inspect query results titles and score them.
        self.log.Debug('found %d results (div info tags)' % len(result))
        # Compiled once for the whole loop; a trailing ", YYYY" is the year.
        titleYearMatcher = re.compile(r'^(.+?), (\d\d\d\d)$')
        for itemIndex, (itemKinoPoiskId, itemTitleitemYear,
                        itemAltTitle) in enumerate(result):
            itemAltTitle = itemAltTitle.replace('&nbsp;', '')
            titleYearMatch = titleYearMatcher.match(itemTitleitemYear.strip())
            if titleYearMatch is None:
                # No year suffix: keep the text as the title, untouched.
                itemYear = None
                itemTitle = itemTitleitemYear
            else:
                itemTitle, itemYear = titleYearMatch.groups()
            itemScore = common.scoreMediaTitleMatch(mediaName, mediaYear,
                                                    itemTitle, itemAltTitle,
                                                    itemYear, itemIndex)
            results.append([itemKinoPoiskId, itemTitle, itemYear, itemScore])

        return results
Пример #6
0
  def searchForImdbTitles(self, mediaName, mediaYear, lang='ru', searchByTitleOnly=False):
    """ Searches TMDb for titles matching the media name and scores each match.
        @param mediaName: CGI escaped media name string.
        @param mediaYear: (optional) Filter the results release dates to matches that
                   include this value.
        @param lang: (optional) ISO 639-1 code.
        @param searchByTitleOnly: (optional) when True, an empty year is sent with
                   the query instead of mediaYear.
        @return: list of dicts with 'id', 'title', 'year' and 'score' keys, sorted
                   by score in descending order.
    """
    mediaName = mediaName.lower()
    yearSearch = '' if searchByTitleOnly else mediaYear
    search = Search(self.httpUtils)
    response = search.movie(query=mediaName.encode(S.TMDB_PAGE_ENCODING), year=yearSearch, language=lang)

    results = []
    if response is not None:
      # The index only advances for entries that were parsed successfully.
      itemIndex = 0
      for movie in search.results:
        try:
          movieId = str(movie['id'])
          movieTitle = movie['title']
          originalTitle = movie['original_title']
          releaseYear = common.getReOptionalGroup(MATCHER_RELEASED, movie['release_date'], 0)
          matchScore = common.scoreMediaTitleMatch(
            mediaName, mediaYear, movieTitle, originalTitle, releaseYear, itemIndex)
          results.append({'id': movieId, 'title': movieTitle, 'year': releaseYear, 'score': matchScore})
          itemIndex += 1
        except:
          self.log.Warn('failed to parse movie element')
    else:
      self.log.Warn('nothing was found on tmdb for media name "%s"' % mediaName)

    # Best score first.
    orderedResults = list(results)
    orderedResults.sort(key=lambda entry: entry['score'], reverse=True)
    if self.isDebug:
      self.log.Debug('Search produced %d results:' % len(orderedResults))
      for position, entry in enumerate(orderedResults):
        self.log.Debug(' ... %d: id="%s", title="%s", year="%s", score="%d".' %
                       (position, entry['id'], entry['title'], str(entry['year']), entry['score']))
    return orderedResults
Пример #7
0
def searchForImdbTitles(mediaName, mediaYear, lang):
    """ Looks for media title matches on TMDb and scores each of them.

    @param mediaName: media title to search for (lower-cased before use).
    @param mediaYear: media year, used when scoring the candidates.
    @param lang: not used by this implementation.
    @return: list of {'id', 'name', 'year', 'score'} dicts in the order the
             movies appear in the response; an empty list when nothing was
             found.
    """
    mediaName = mediaName.lower()
    page = common.getElementFromHttpRequest(
        TMDB_GETINFO % mediaName.replace(' ', '%20'), TMDB_PAGE_ENCODING)
    matches = []
    if page is None:
        Log.Warn('nothing was found on tmdb for media name "%s"' % mediaName)
        # Always hand back a list so that callers can iterate unconditionally.
        return matches

    # The index only advances for movies that were parsed successfully.
    itemIndex = 0
    for movieElem in page.xpath('//movies/movie'):
        try:
            imdbId = common.getXpathRequiredText(movieElem,
                                                 './imdb_id/text()')
            title = common.getXpathRequiredText(movieElem, './name/text()')
            altTitle = common.getXpathOptionalText(
                movieElem, './alternative_name/text()')
            releaseDate = common.getXpathOptionalText(
                movieElem, './released/text()')
            year = common.getReOptionalGroup(MATCHER_RELEASED, releaseDate, 0)
            score = common.scoreMediaTitleMatch(mediaName, mediaYear, title,
                                                altTitle, year, itemIndex)
            matches.append({
                'id': imdbId,
                'name': title,
                'year': year,
                'score': score
            })
            itemIndex += 1
        except Exception:
            Log.Warn('failed to parse movie element')

    return matches
Пример #8
0
 def localTest_scoreMediaTitleMatch(self):
   """ Verifies scoreMediaTitleMatch results for two known title/year pairs.
   """
   # Latin-script media name scored against a result that has both a Russian
   # title and an alternative title.
   latinNameScore = common.scoreMediaTitleMatch(
     'Gladiatory Rima', '2012',
     u'Гладиаторы Рима', 'Gladiatori di Roma', '2012', 3)
   self._assertEquals(92, latinNameScore, 'Wrong score.')

   # Media name that is only the first part of a longer result title; the
   # result has no alternative title.
   partialTitleScore = common.scoreMediaTitleMatch(
     u'Кавказская пленница', '1966',
     u'Кавказская пленница, или Новые приключения Шурика', None, '1966', 0)
   self._assertEquals(94, partialTitleScore, 'Wrong score.')
Пример #9
0
    def queryKinoPoisk(self, mediaName, mediaYear):
        """ Searches for a movie on KinoPoisk.

        Returns title results in the order they are found on the page (no
        sorting is done here!).

        @param mediaName: media title to search for.
        @param mediaYear: media year, passed on to the title scorer.
        @return: list of [kinoPoiskId, title, year, score] items; empty when
                 nothing could be fetched or parsed.
        """
        results = []
        encodedName = urllib.quote(mediaName.encode(S.ENCODING_KINOPOISK_PAGE))
        page = self.httpUtils.requestAndParseHtmlPage(
            S.KINOPOISK_SEARCH_SIMPLE % encodedName)
        if page is None:
            self.log.Warn(
                ' ### nothing was found on kinopoisk for media name "%s"' %
                mediaName)
            return results

        # The page was fetched; collect all movie title entries from it.
        self.log.Debug('got a KinoPoisk query results page to parse...')
        # Select every div.info that contains a p.name with a film link.
        divInfoElems = page.xpath(
            '//div[@class="info"][p[@class="name"]/a[contains(@href,"/level/1/film/")]]'
        )

        # If no title entries were found, the site has (hopefully) answered
        # with the movie page itself instead of a list of results.
        # TODO(zhenya): investigate if we need this clause at all (haven't seen this happening).
        if not len(divInfoElems):
            self.log.Warn(
                'nothing was found on kinopoisk for media name "%s"' %
                mediaName)
            try:
                itemTitle = common.getXpathOptionalText(
                    page, '//h1[@class="moviename-big"]/text()')
                if itemTitle is not None:
                    # The movie id is taken from the first <link> whose href
                    # contains "/film/".
                    itemKinoPoiskId = re.search(
                        '\/film\/(.+?)\/',
                        page.xpath(
                            './/link[contains(@href, "/film/")]/attribute::href'
                        )[0]).groups(0)[0]
                    itemYear = common.parseYearFromString(
                        page.xpath('//a[contains(@href,"year")]/text()')[0])
                    itemAltTitle = None  # TODO: parse original title.
                    # A single-movie page holds one candidate, hence index 0.
                    itemScore = common.scoreMediaTitleMatch(
                        mediaName, mediaYear, itemTitle, itemAltTitle,
                        itemYear, 0)
                    results.append(
                        [itemKinoPoiskId, itemTitle, itemYear, itemScore])
            except:
                self.logException(
                    'failed to parse a KinoPoisk query results page')
            return results

        # Inspect query results titles and score them.
        # itemIndex is incremented for every entry, including skipped ones.
        itemIndex = -1
        self.log.Debug('found %d results (div info tags)' % len(divInfoElems))
        for divInfoElem in divInfoElems:
            itemIndex += 1
            try:
                anchorFilmElem = divInfoElem.xpath(
                    './/a[contains(@href,"/level/1/film/")]/attribute::href')
                if not len(anchorFilmElem):
                    self.log.Warn(
                        'unable to find film anchor elements for title "%s"' %
                        mediaName)
                    continue

                # Parse kinopoisk movie title id, title and year.
                match = re.search('\/film\/(.+?)\/', anchorFilmElem[0])
                if match is None:
                    self.log.Error('unable to parse movie title id')
                    continue

                itemKinoPoiskId = match.groups(1)[0]
                itemTitle = common.getXpathRequiredText(
                    divInfoElem,
                    './/a[contains(@href,"/level/1/film/")]/text()')
                itemYear = common.parseYearFromString(
                    common.getXpathOptionalText(
                        divInfoElem, './/span[@class="year"]/text()'))
                itemAltTitle = None
                try:
                    # Try to parse the alternative (original) title. Ignore failures.
                    # This is a <span> below the title <a> tag.
                    altTitleCandidate = common.getXpathOptionalText(
                        divInfoElem, './/span[@class="gray"]/text()')
                    if altTitleCandidate is not None:
                        # Strip any non alpha character in front (unfortunately, this may also remove a leading part
                        # of a movie title if it starts with a digit).
                        altTitleCandidate = MATCHER_LEADING_NONALPHA.sub(
                            '', altTitleCandidate).rstrip()
                        if len(altTitleCandidate) > 0:
                            itemAltTitle = altTitleCandidate
                except:
                    # Best effort: the entry is still scored without an
                    # alternative title.
                    pass


                # self.log.Debug(' ... kinoPoiskId="%s"; title="%s"; year="%s"...' % (itemKinoPoiskId, itemTitle, str(itemYear)))
                itemScore = common.scoreMediaTitleMatch(
                    mediaName, mediaYear, itemTitle, itemAltTitle, itemYear,
                    itemIndex)
                results.append(
                    [itemKinoPoiskId, itemTitle, itemYear, itemScore])
            except:
                # A broken entry must not abort parsing of the remaining ones.
                self.logException('failed to parse div.info container')

        return results
Пример #10
0
  def search(self, results, media, lang, manual=False):
    """ Searches for matches on KinoPoisk using the title and year
        passed via the media object. All matches are saved in a list of results
        as MetadataSearchResult objects. For each results, we determine a
        page id, title, year, and the score (how good we think the match
        is on the scale of 1 - 100).

        @param results: container the MetadataSearchResult objects are appended
                        to; it is sorted by descending score before returning.
        @param media: object providing the name, year, guid and hash values.
        @param lang: language stored on every MetadataSearchResult.
        @param manual: not used by this method.
    """
    Log.Debug('SEARCH START <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
    mediaName = media.name
    mediaYear = media.year
    Log.Debug('searching for name="%s", year="%s", guid="%s", hash="%s"...' %
        (str(mediaName), str(mediaYear), str(media.guid), str(media.hash)))
    # Fetch the search results page.
    Log.Debug('quering kinopoisk...')

    # Quote the name once and log it before the request is actually made.
    encodedName = urllib.quote(mediaName.encode(ENCODING_KINOPOISK_PAGE))
    Log.Debug('Loading page "%s"' % encodedName)
    page = common.getElementFromHttpRequest(KINOPOISK_SEARCH % encodedName, ENCODING_KINOPOISK_PAGE)

    if page is None:
      Log.Warn('nothing was found on kinopoisk for media name "%s"' % mediaName)
    else:
      # The page was fetched; collect all movie title entries from it.
      Log.Debug('got a kinopoisk page to parse...')
      # NOTE: the trailing '/..' selects the parent of each matched <a>, i.e.
      # the <p class="name"> element, so the paths below are relative to it.
      divInfoElems = page.xpath('//self::div[@class="info"]/p[@class="name"]/a[contains(@href,"/level/1/film/")]/..')
      itemIndex = 0
      if len(divInfoElems):
        Log.Debug('found %d results' % len(divInfoElems))
        for divInfoElem in divInfoElems:
          try:
            anchorFilmElem = divInfoElem.xpath('./a[contains(@href,"/level/1/film/")]/attribute::href')
            if len(anchorFilmElem):
              # Parse kinopoisk movie title id, title and year.
              match = re.search(r'/film/(.+?)/', anchorFilmElem[0])
              if match is None:
                Log.Error('unable to parse movie title id')
              else:
                kinoPoiskId = match.group(1)
                title = common.getXpathRequiredNode(divInfoElem, './/a[contains(@href,"/level/1/film/")]/text()')
                year = common.getXpathOptionalNode(divInfoElem, './/span[@class="year"]/text()')
                # Try to parse the alternative (original) title. Ignore failures.
                # This is a <span> below the title <a> tag.
                # Reset for every item so that a failed parse cannot reuse the
                # alternative title of a previous result.
                altTitle = None
                try:
                  altTitleCandidate = common.getXpathOptionalNode(divInfoElem, '../span[1]/text()')
                  if altTitleCandidate is not None:
                    altTitle = altTitleCandidate.split(',')[0].strip()
                except Exception:
                  # Best effort: score the item without an alternative title.
                  pass
                score = common.scoreMediaTitleMatch(mediaName, mediaYear, title, altTitle, year, itemIndex)
                results.Append(MetadataSearchResult(id=kinoPoiskId, name=title, year=year, lang=lang, score=score))
            else:
              Log.Warn('unable to find film anchor elements for title "%s"' % mediaName)
          except Exception:
            common.logException('failed to parse div.info container')
          itemIndex += 1
      else:
        Log.Warn('nothing was found on kinopoisk for media name "%s"' % mediaName)
        # TODO(zhenya): investigate if we need this clause at all (haven't seen this happening).
        # If no title entries were found, the site has (hopefully) answered
        # with the movie page itself. Disabled fallback kept for reference:
        #try:
        #  title = page.xpath('//h1[@class="moviename-big"]/text()')[0].strip()
        #  kinoPoiskId = re.search('\/film\/(.+?)\/', page.xpath('.//link[contains(@href, "/film/")]/attribute::href')[0]).groups(0)[0]
        #  year = page.xpath('//a[contains(@href,"year")]/text()')[0].strip()
        #  score = common.scoreMediaTitleMatch(mediaName, mediaYear, title, None, year, itemIndex)
        #  results.Append(MetadataSearchResult(id=kinoPoiskId, name=title, year=year, lang=lang, score=score))
        #except:
        #  common.logException('failed to parse a KinoPoisk page')

    # Sort results according to their score.
    results.Sort('score', descending=True)
    if IS_DEBUG:
      common.printSearchResults(results)
    Log.Debug('SEARCH END <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
Пример #11
0
 def fetchAndParseSearchResults(self, mediaName, mediaYear):
     """ Looks up movie titles on KinoPoisk and scores every candidate found.
     @param mediaName Movie title parsed from a filename.
     @param mediaYear Movie year parsed from a filename.
     @return Array of tuples: [kinoPoiskId, title, year, score]
     """
     self.log.Info('Quering kinopoisk...')
     results = []
     encodedName = urllib.quote(mediaName.encode(S.ENCODING_KINOPOISK_PAGE))
     self.log.Debug('Loading page "%s"' % encodedName)
     searchUrl = S.KINOPOISK_SEARCH_SIMPLE % encodedName
     page = self.httpUtils.requestAndParseHtmlPage(searchUrl)
     if page is None:
         self.log.Warn(
             ' ### nothing was found on kinopoisk for media name "%s"' %
             mediaName)
         return results

     # The page was fetched; collect all movie title entries from it.
     self.log.Debug('got a kinopoisk results page to parse...')
     # Pick all divs with class "info" that have specific children (/p/a/ etc).
     divInfoElems = page.xpath(
         '//div[@class="info"][p[@class="name"]/a[contains(@href,"/level/1/film/")]]'
     )

     if not len(divInfoElems):
         self.log.Warn(
             'nothing was found on kinopoisk for media name "%s"' %
             mediaName)
         # TODO(zhenya): investigate if we need this clause at all (haven't seen this happening).
         # No title entries: the site has (hopefully) answered with the
         # movie page itself, so try to read the single movie from it.
         try:
             title = common.getXpathOptionalText(
                 page, '//h1[@class="moviename-big"]/text()')
             if title is not None:
                 linkHrefs = page.xpath(
                     './/link[contains(@href, "/film/")]/attribute::href')
                 kinoPoiskId = re.search('\/film\/(.+?)\/',
                                         linkHrefs[0]).groups(0)[0]
                 yearTexts = page.xpath('//a[contains(@href,"year")]/text()')
                 year = yearTexts[0].strip()
                 # TODO: parse original title.
                 score = common.scoreMediaTitleMatch(
                     mediaName, mediaYear, title, None, year, 0)
                 results.append([kinoPoiskId, title, year, score])
         except:
             self.logException('failed to parse a KinoPoisk page')
         return results

     self.log.Debug('found %d results (div info tags)' % len(divInfoElems))
     # The index counts every entry on the page, including skipped ones.
     for itemIndex, divInfoElem in enumerate(divInfoElems):
         try:
             anchorFilmElem = divInfoElem.xpath(
                 './/a[contains(@href,"/level/1/film/")]/attribute::href')
             if not len(anchorFilmElem):
                 self.log.Warn(
                     'unable to find film anchor elements for title "%s"'
                     % mediaName)
                 continue

             # Parse kinopoisk movie title id, title and year.
             match = re.search('\/film\/(.+?)\/', anchorFilmElem[0])
             if match is None:
                 self.log.Error('unable to parse movie title id')
                 continue

             kinoPoiskId = match.groups(1)[0]
             title = common.getXpathRequiredText(
                 divInfoElem, './/a[contains(@href,"/level/1/film/")]/text()')
             year = common.getXpathOptionalText(
                 divInfoElem, './/span[@class="year"]/text()')

             # Try to parse the alternative (original) title. Ignore failures.
             # This is a <span> below the title <a> tag.
             altTitle = None
             try:
                 rawAltTitle = common.getXpathOptionalText(
                     divInfoElem, './/span[@class="gray"]/text()')
                 if rawAltTitle is not None:
                     # Strip any non alpha character in front (unfortunately,
                     # this may also remove a leading part of a movie title
                     # if it starts with a digit).
                     cleanedAltTitle = MATCHER_LEADING_NONALPHA.sub(
                         '', rawAltTitle).rstrip()
                     if len(cleanedAltTitle) > 0:
                         altTitle = cleanedAltTitle
             except:
                 pass

             self.log.Debug(
                 ' ... kinoPoiskId="%s"; title="%s"; year="%s"...'
                 % (kinoPoiskId, title, str(year)))
             score = common.scoreMediaTitleMatch(
                 mediaName, mediaYear, title, altTitle, year, itemIndex)
             results.append([kinoPoiskId, title, year, score])
         except:
             self.logException('failed to parse div.info container')
     return results
Пример #12
0
    def fetchAndParseSearchResultsFull(self, mediaName, mediaYear):
        """ Searches for movie titles on KinoPoisk via the S.KINOPOISK_SEARCH page.

        @param mediaName: movie title to search for.
        @param mediaYear: movie year, passed on to the title scorer.
        @return: list of [kinoPoiskId, title, year, score] items in page
                 order; empty when nothing could be fetched or parsed.
        """
        self.log.Info('Quering kinopoisk...')
        results = []
        encodedName = urllib.quote(mediaName.encode(S.ENCODING_KINOPOISK_PAGE))
        self.log.Debug('Loading page "%s"' % encodedName)
        page = self.httpUtils.requestAndParseHtmlPage(S.KINOPOISK_SEARCH %
                                                      encodedName)

        if page is None:
            self.log.Warn(
                ' ### nothing was found on kinopoisk for media name "%s"' %
                mediaName)
            return results

        # The page was fetched; collect all movie title entries from it.
        self.log.Debug('got a kinopoisk page to parse...')
        # NOTE: the trailing '/..' selects the parent of each matched <a>,
        # i.e. the <p class="name"> element, so the paths used inside the
        # loop are relative to it.
        divInfoElems = page.xpath(
            '//self::div[@class="info"]/p[@class="name"]/a[contains(@href,"/level/1/film/")]/..'
        )
        itemIndex = 0
        if len(divInfoElems):
            self.log.Debug('found %d results' % len(divInfoElems))
            for divInfoElem in divInfoElems:
                try:
                    anchorFilmElem = divInfoElem.xpath(
                        './a[contains(@href,"/level/1/film/")]/attribute::href'
                    )
                    if len(anchorFilmElem):
                        # Parse kinopoisk movie title id, title and year.
                        match = re.search(r'/film/(.+?)/', anchorFilmElem[0])
                        if match is None:
                            self.log.Error('unable to parse movie title id')
                        else:
                            kinoPoiskId = match.group(1)
                            title = common.getXpathRequiredText(
                                divInfoElem,
                                './/a[contains(@href,"/level/1/film/")]/text()'
                            )
                            year = common.getXpathOptionalText(
                                divInfoElem, './/span[@class="year"]/text()')
                            # Try to parse the alternative (original) title.
                            # Ignore failures.
                            # This is a <span> below the title <a> tag.
                            # Reset for every item so that a failed parse
                            # cannot reuse the alternative title of a
                            # previous result.
                            altTitle = None
                            try:
                                altTitleCandidate = common.getXpathOptionalText(
                                    divInfoElem, '../span[1]/text()')
                                if altTitleCandidate is not None:
                                    altTitle = altTitleCandidate.split(
                                        ',')[0].strip()
                            except Exception:
                                # Best effort: score the item without an
                                # alternative title.
                                pass
                            score = common.scoreMediaTitleMatch(
                                mediaName, mediaYear, title, altTitle, year,
                                itemIndex)
                            results.append([kinoPoiskId, title, year, score])
                    else:
                        self.log.Warn(
                            'unable to find film anchor elements for title "%s"'
                            % mediaName)
                except Exception:
                    self.logException('failed to parse div.info container')
                itemIndex += 1
        else:
            self.log.Warn(
                'nothing was found on kinopoisk for media name "%s"' %
                mediaName)
            # TODO(zhenya): investigate if we need this clause at all (haven't seen this happening).
            # No title entries: the site has (hopefully) answered with the
            # movie page itself, so try to read the single movie from it.
            try:
                title = page.xpath(
                    '//h1[@class="moviename-big"]/text()')[0].strip()
                kinoPoiskId = re.search(
                    r'/film/(.+?)/',
                    page.xpath(
                        './/link[contains(@href, "/film/")]/attribute::href'
                    )[0]).group(1)
                year = page.xpath(
                    '//a[contains(@href,"year")]/text()')[0].strip()
                altTitle = None  # TODO: parse original title.
                score = common.scoreMediaTitleMatch(
                    mediaName, mediaYear, title, altTitle, year, itemIndex)
                results.append([kinoPoiskId, title, year, score])
            except Exception:
                self.logException('failed to parse a KinoPoisk page')
        return results
Пример #13
0
 def fetchAndParseSearchResults(self, mediaName, mediaYear):
     """ Searches for movie titles on KinoPoisk.

     @param mediaName Movie title parsed from a filename.
     @param mediaYear Movie year parsed from a filename.
     @return List of [kinoPoiskId, title, year, score] entries in page order
             (no sorting is done here); empty when no page was returned or
             nothing could be parsed from it.
     """
     self.log.Info("Quering kinopoisk...")
     results = []
     encodedName = urllib.quote(mediaName.encode(S.ENCODING_KINOPOISK_PAGE))
     self.log.Debug('Loading page "%s"' % encodedName)
     page = self.httpUtils.requestAndParseHtmlPage(S.KINOPOISK_SEARCH_SIMPLE % encodedName)
     if page is None:
         self.log.Warn(' ### nothing was found on kinopoisk for media name "%s"' % mediaName)
         return results

     # The page was fetched; collect every movie title listed on it.
     self.log.Debug("got a kinopoisk results page to parse...")
     # Pick all divs with class "info" that have specific children (/p/a/ etc).
     divInfoElems = page.xpath('//div[@class="info"][p[@class="name"]/a[contains(@href,"/level/1/film/")]]')

     if not len(divInfoElems):
         self.log.Warn('nothing was found on kinopoisk for media name "%s"' % mediaName)
         # TODO(zhenya): investigate if we need this clause at all (haven't seen this happening).
         # No result list on the page: hopefully the site sent us straight to the movie page.
         try:
             title = common.getXpathOptionalText(page, '//h1[@class="moviename-big"]/text()')
             if title is not None:
                 filmLinks = page.xpath('.//link[contains(@href, "/film/")]/attribute::href')
                 # A failed match raises AttributeError here, which is logged below.
                 kinoPoiskId = re.search(r"/film/(.+?)/", filmLinks[0]).group(1)
                 year = page.xpath('//a[contains(@href,"year")]/text()')[0].strip()
                 altTitle = None  # TODO: parse original title.
                 score = common.scoreMediaTitleMatch(mediaName, mediaYear, title, altTitle, year, 0)
                 results.append([kinoPoiskId, title, year, score])
         except Exception:
             self.logException("failed to parse a KinoPoisk page")
         return results

     self.log.Debug("found %d results (div info tags)" % len(divInfoElems))
     # itemIndex is the position of the result on the page; it is passed to the scorer.
     for itemIndex, divInfoElem in enumerate(divInfoElems):
         try:
             anchorFilmElem = divInfoElem.xpath('.//a[contains(@href,"/level/1/film/")]/attribute::href')
             if not len(anchorFilmElem):
                 self.log.Warn('unable to find film anchor elements for title "%s"' % mediaName)
                 continue

             # Parse kinopoisk movie title id, title and year.
             match = re.search(r"/film/(.+?)/", anchorFilmElem[0])
             if match is None:
                 self.log.Error("unable to parse movie title id")
                 continue

             kinoPoiskId = match.group(1)
             title = common.getXpathRequiredText(
                 divInfoElem, './/a[contains(@href,"/level/1/film/")]/text()'
             )
             year = common.getXpathOptionalText(divInfoElem, './/span[@class="year"]/text()')

             # Try to parse the alternative (original) title; a failure here must not
             # drop the whole result, so it is logged and the title stays None.
             # This is a <span> below the title <a> tag.
             altTitle = None
             try:
                 altTitleCandidate = common.getXpathOptionalText(
                     divInfoElem, './/span[@class="gray"]/text()'
                 )
                 if altTitleCandidate is not None:
                     # Strip any non alpha character in front (unfortunately, this may also remove
                     # a leading part of a movie title if it starts with a digit).
                     altTitleCandidate = MATCHER_LEADING_NONALPHA.sub("", altTitleCandidate).rstrip()
                     if len(altTitleCandidate) > 0:
                         altTitle = altTitleCandidate
             except Exception:
                 self.log.Debug("unable to parse an alternative title; ignoring it")

             self.log.Debug(
                 ' ... kinoPoiskId="%s"; title="%s"; year="%s"...' % (kinoPoiskId, title, str(year))
             )
             score = common.scoreMediaTitleMatch(
                 mediaName, mediaYear, title, altTitle, year, itemIndex
             )
             results.append([kinoPoiskId, title, year, score])
         except Exception:
             self.logException("failed to parse div.info container")
     return results
Пример #14
0
    def fetchAndParseSearchResultsFull(self, mediaName, mediaYear):
        """ Searches for movie titles on KinoPoisk using the S.KINOPOISK_SEARCH URL.

        @param mediaName Movie title parsed from a filename.
        @param mediaYear Movie year parsed from a filename.
        @return List of [kinoPoiskId, title, year, score] entries in page order
                (no sorting is done here); empty when no page was returned or
                nothing could be parsed from it.
        """
        self.log.Info("Quering kinopoisk...")
        results = []
        encodedName = urllib.quote(mediaName.encode(S.ENCODING_KINOPOISK_PAGE))
        self.log.Debug('Loading page "%s"' % encodedName)
        page = self.httpUtils.requestAndParseHtmlPage(S.KINOPOISK_SEARCH % encodedName)

        if page is None:
            self.log.Warn(' ### nothing was found on kinopoisk for media name "%s"' % mediaName)
            return results

        # The page was fetched; collect every movie title listed on it.
        self.log.Debug("got a kinopoisk page to parse...")
        # The trailing "/.." selects the parent of each matching film anchor.
        divInfoElems = page.xpath(
            '//self::div[@class="info"]/p[@class="name"]/a[contains(@href,"/level/1/film/")]/..'
        )

        if not len(divInfoElems):
            self.log.Warn('nothing was found on kinopoisk for media name "%s"' % mediaName)
            # TODO(zhenya): investigate if we need this clause at all (haven't seen this happening).
            # No result list on the page: hopefully the site sent us straight to the movie page.
            try:
                title = page.xpath('//h1[@class="moviename-big"]/text()')[0].strip()
                filmLinks = page.xpath('.//link[contains(@href, "/film/")]/attribute::href')
                # A failed match raises AttributeError here, which is logged below.
                kinoPoiskId = re.search(r"/film/(.+?)/", filmLinks[0]).group(1)
                year = page.xpath('//a[contains(@href,"year")]/text()')[0].strip()
                # The alternative (original) title is not parsed on this path.
                score = common.scoreMediaTitleMatch(mediaName, mediaYear, title, None, year, 0)
                results.append([kinoPoiskId, title, year, score])
            except Exception:
                self.logException("failed to parse a KinoPoisk page")
            return results

        self.log.Debug("found %d results" % len(divInfoElems))
        # itemIndex is the position of the result on the page; it is passed to the scorer.
        for itemIndex, divInfoElem in enumerate(divInfoElems):
            try:
                anchorFilmElem = divInfoElem.xpath('./a[contains(@href,"/level/1/film/")]/attribute::href')
                if not len(anchorFilmElem):
                    self.log.Warn('unable to find film anchor elements for title "%s"' % mediaName)
                    continue

                # Parse kinopoisk movie title id, title and year.
                match = re.search(r"/film/(.+?)/", anchorFilmElem[0])
                if match is None:
                    self.log.Error("unable to parse movie title id")
                    continue

                kinoPoiskId = match.group(1)
                title = common.getXpathRequiredText(
                    divInfoElem, './/a[contains(@href,"/level/1/film/")]/text()'
                )
                year = common.getXpathOptionalText(divInfoElem, './/span[@class="year"]/text()')

                # Try to parse the alternative (original) title from the first sibling <span>.
                # It is reset for every result so that a parse failure can never leak the
                # previous result's alternative title into this result's score.
                altTitle = None
                try:
                    altTitleCandidate = common.getXpathOptionalText(divInfoElem, "../span[1]/text()")
                    if altTitleCandidate is not None:
                        # Keep only the text in front of the first comma.
                        altTitle = altTitleCandidate.split(",")[0].strip()
                except Exception:
                    self.log.Debug("unable to parse an alternative title; ignoring it")

                score = common.scoreMediaTitleMatch(
                    mediaName, mediaYear, title, altTitle, year, itemIndex
                )
                results.append([kinoPoiskId, title, year, score])
            except Exception:
                self.logException("failed to parse div.info container")
        return results
Пример #15
0
  def queryKinoPoisk(self, mediaName, mediaYear):
    """ Looks a movie up on KinoPoisk.
        Returns title results as they are returned (no sorting is done here!).
        Each result is a [kinoPoiskId, title, year, score] list.
    """
    found = []
    quotedName = urllib.quote(mediaName.encode(S.ENCODING_KINOPOISK_PAGE))
    page = self.httpUtils.requestAndParseHtmlPage(S.KINOPOISK_SEARCH_SIMPLE % quotedName)
    if page is None:
      self.log.Warn(' ### nothing was found on kinopoisk for media name "%s"' % mediaName)
      return found

    # We have a page; pull the list of movie titles out of it.
    self.log.Debug('got a KinoPoisk query results page to parse...')
    infoDivs = page.xpath('//div[@class="info"][p[@class="name"]/a[contains(@href,"/level/1/film/")]]')
    totalDivs = len(infoDivs)

    if totalDivs == 0:
      # No title list on the page, so the site (hopefully) sent us straight to the movie page.
      # TODO(zhenya): investigate if we need this clause at all (haven't seen this happening).
      self.log.Warn('nothing was found on kinopoisk for media name "%s"' % mediaName)
      try:
        pageTitle = common.getXpathOptionalText(page, '//h1[@class="moviename-big"]/text()')
        if pageTitle is not None:
          filmHrefs = page.xpath('.//link[contains(@href, "/film/")]/attribute::href')
          filmId = re.search('\/film\/(.+?)\/', filmHrefs[0]).group(1)
          yearTexts = page.xpath('//a[contains(@href,"year")]/text()')
          pageYear = common.parseYearFromString(yearTexts[0])
          # TODO: parse original title (None is passed as the alternative title for now).
          pageScore = common.scoreMediaTitleMatch(mediaName, mediaYear, pageTitle, None, pageYear, 0)
          found.append([filmId, pageTitle, pageYear, pageScore])
      except:
        self.logException('failed to parse a KinoPoisk query results page')
      return found

    # Walk over the listed titles and score each one; position is the zero-based place on the page.
    self.log.Debug('found %d results (div info tags)' % totalDivs)
    for position, infoDiv in enumerate(infoDivs):
      try:
        filmHrefs = infoDiv.xpath('.//a[contains(@href,"/level/1/film/")]/attribute::href')
        if len(filmHrefs) == 0:
          self.log.Warn('unable to find film anchor elements for title "%s"' % mediaName)
          continue

        # Extract the kinopoisk movie id from the anchor href.
        idMatch = re.search('\/film\/(.+?)\/', filmHrefs[0])
        if idMatch is None:
          self.log.Error('unable to parse movie title id')
          continue
        filmId = idMatch.group(1)

        filmTitle = common.getXpathRequiredText(infoDiv, './/a[contains(@href,"/level/1/film/")]/text()')
        yearText = common.getXpathOptionalText(infoDiv, './/span[@class="year"]/text()')
        filmYear = common.parseYearFromString(yearText)

        # The alternative (original) title lives in a <span> below the title <a> tag.
        # Parsing it is best effort: any failure leaves it as None.
        originalTitle = None
        try:
          rawAltTitle = common.getXpathOptionalText(infoDiv, './/span[@class="gray"]/text()')
          if rawAltTitle is not None:
            # Drop leading non alpha characters (sadly this can also eat the start
            # of a title that begins with a digit).
            trimmedAltTitle = MATCHER_LEADING_NONALPHA.sub('', rawAltTitle).rstrip()
            if len(trimmedAltTitle) != 0:
              originalTitle = trimmedAltTitle
        except:
          pass

        filmScore = common.scoreMediaTitleMatch(mediaName, mediaYear, filmTitle, originalTitle, filmYear, position)
        found.append([filmId, filmTitle, filmYear, filmScore])
      except:
        self.logException('failed to parse div.info container')

    return found