def _getitem(self, key):
    """Handle special keys.

    The values of the special keys are built on request from the
    'title' and 'episode of' entries of ``self.data``.

    :param key: the key being looked up.
    :returns: the value built for ``key``; None if ``key`` is not one of
              the special keys, or if the entry of ``self.data`` it is
              built from is missing.
    """
    # "key in mapping" replaces dict.has_key(), which does not exist on
    # Python 3; the two spellings are equivalent on Python 2.
    if 'episode of' in self.data:
        # Keys handled only when an 'episode of' entry is present.
        if key == 'long imdb episode title':
            return build_title(self.data)
        elif key == 'series title':
            return self.data['episode of']['title']
        elif key == 'canonical series title':
            ser_title = self.data['episode of']['title']
            return canonicalTitle(ser_title)
        elif key == 'smart canonical series title':
            ser_title = self.data['episode of']['title']
            return self.smartCanonicalTitle(ser_title)
        elif key == 'episode title':
            return self.data.get('title', u'')
        elif key == 'canonical episode title':
            return canonicalTitle(self.data.get('title', u''))
        elif key == 'smart canonical episode title':
            return self.smartCanonicalTitle(self.data.get('title', u''))
    # An unmatched key falls through to the generic title keys below.
    if 'title' in self.data:
        if key == 'title':
            return self.data['title']
        elif key == 'long imdb title':
            return build_title(self.data)
        elif key == 'canonical title':
            return canonicalTitle(self.data['title'])
        elif key == 'smart canonical title':
            return self.smartCanonicalTitle(self.data['title'])
        elif key == 'long imdb canonical title':
            return build_title(self.data, canonical=1)
        elif key == 'smart long imdb canonical title':
            return build_title(self.data, canonical=1,
                               lang=self.guessLanguage())
    return None
def _getitem(self, key):
    """Resolve the special (computed) keys.

    The 'episode of' keys are checked first, then the generic title
    keys, then 'full-size cover url'. Anything else yields None.

    :param key: the key being looked up.
    :returns: the value built for ``key``, or None when ``key`` is not
              handled here or the entry it is built from is missing.
    """
    info = self.data

    if 'episode of' in info:
        # Keys handled only when an 'episode of' entry is present.
        if key == 'long imdb episode title':
            return build_title(info)
        if key == 'series title':
            return self._getSeriesTitle(info['episode of'])
        if key == 'canonical series title':
            series_name = self._getSeriesTitle(info['episode of'])
            return canonicalTitle(series_name)
        if key == 'smart canonical series title':
            series_name = self._getSeriesTitle(info['episode of'])
            return self.smartCanonicalTitle(series_name)
        if key == 'episode title':
            return info.get('title', '')
        if key == 'canonical episode title':
            return canonicalTitle(info.get('title', ''))
        if key == 'smart canonical episode title':
            return self.smartCanonicalTitle(info.get('title', ''))

    if 'title' in info:
        if key == 'title':
            return info['title']
        if key == 'long imdb title':
            return build_title(info)
        if key == 'canonical title':
            return canonicalTitle(info['title'])
        if key == 'smart canonical title':
            return self.smartCanonicalTitle(info['title'])
        if key == 'long imdb canonical title':
            return build_title(info, canonical=True)
        if key == 'smart long imdb canonical title':
            return build_title(info, canonical=True,
                               lang=self.guessLanguage())

    if key == 'full-size cover url':
        return self.get_fullsizeURL()
    return None
def titleVariations(title, fromPtdf=0):
    """Build the title variations used by searches.

    If fromPtdf is true, the input is assumed to be in the plain text
    data files format.

    :param title: the title to build the variations of.
    :param fromPtdf: true if ``title`` comes from the plain text data files.
    :returns: a (title1, title2, title3) tuple: title1 is the canonical
              title, title2 is title1 without its trailing article (or
              title1 unchanged) and title3 is the long imdb canonical
              title, or an empty string when only a bare title was given.
    """
    title3 = u''
    if fromPtdf or re_year_index.search(title):
        # A (year[/imdbIndex]) indication seems to be present: assume
        # that a long imdb canonical name was provided.
        parsed = analyze_title(title, canonical=1)
        title1 = parsed['title']
        if parsed['kind'] == 'episode':
            title1 = normalizeTitle(title1)
            title3 = build_title(parsed, canonical=1, ptdf=1)
        elif fromPtdf:
            # The input already is the long imdb canonical name.
            title3 = title
        else:
            title3 = build_title(parsed, canonical=1, ptdf=1)
    else:
        # Just a title: only its canonical form can be computed.
        title1 = canonicalTitle(title)

    # title2 is title1 minus the last ', '-separated piece, when that
    # piece is a known article.
    title2 = u''
    if title1:
        pieces = title1.split(u', ')
        if pieces[-1].lower() in _unicodeArticles:
            title2 = u', '.join(pieces[:-1])
        else:
            title2 = title1
    return title1, title2, title3
def titleVariations(title, fromPtdf=0):
    """Build the title variations used by searches.

    If fromPtdf is true, the input is assumed to be in the plain text
    data files format.

    :param title: the title to build the variations of.
    :param fromPtdf: true if ``title`` comes from the plain text data files.
    :returns: a (title1, title2, title3) tuple: the canonical title, the
              same title without its trailing article (or unchanged), and
              the long imdb canonical title (empty for a bare title).
    """
    if not fromPtdf and not re_year_index.search(title):
        # Just a title: no long imdb canonical name can be built.
        title1 = canonicalTitle(title)
        title3 = u''
    else:
        # Either the data comes from the plain text data files or a
        # (year[/imdbIndex]) indication was found: treat the input as a
        # long imdb canonical name.
        info = analyze_title(title, canonical=1)
        title1 = info['title']
        is_episode = info['kind'] == 'episode'
        if is_episode:
            title1 = normalizeTitle(title1)
        if fromPtdf and not is_episode:
            # The input already is the long imdb canonical name.
            title3 = title
        else:
            title3 = build_title(info, canonical=1, ptdf=1)

    # title2: title1 without the article, or title1 unchanged.
    title2 = u''
    if title1:
        chunks = title1.split(u', ')
        has_article = chunks[-1].lower() in _articles
        title2 = u', '.join(chunks[:-1]) if has_article else title1
    return title1, title2, title3
def smartCanonicalTitle(self, title=None, lang=None):
    """Return the canonical title, guessing its language.

    :param title: title to convert; when None, the 'title' entry of
                  ``self.data`` is used (empty string if missing).
    :param lang: language to force; when None, it is obtained from
                 ``self.guessLanguage()``.
    :returns: the result of ``canonicalTitle`` for that title and language.
    """
    wanted_title = self.data.get('title', '') if title is None else title
    wanted_lang = self.guessLanguage() if lang is None else lang
    return canonicalTitle(wanted_title, lang=wanted_lang)
def smartCanonicalTitle(self, title=None, lang=None):
    """Return the canonical title, guessing its language.

    Both arguments are optional overrides: a missing title is read from
    the 'title' entry of ``self.data`` (empty string if absent) and a
    missing language is obtained from ``self.guessLanguage()``.

    :param title: title to convert, or None to use the stored one.
    :param lang: language to force, or None to auto-detect it.
    :returns: the result of ``canonicalTitle`` for that title and language.
    """
    if title is not None:
        used_title = title
    else:
        used_title = self.data.get('title', u'')
    if lang is not None:
        used_lang = lang
    else:
        used_lang = self.guessLanguage()
    return canonicalTitle(used_title, lang=used_lang)
def title_soundex(title):
    """Return the soundex code of a title, ignoring its article.

    The title is first converted with ``canonicalTitle``; if the piece
    after the last ', ' is a known article, it is dropped before the
    soundex code is computed.

    :param title: movie title
    :type title: str
    :returns: soundex of the title (without the article, if any), or
              None for an empty title
    :rtype: str
    """
    if not title:
        return None
    canonical = canonicalTitle(title)
    # rpartition gives the same pieces as split(', ')[-1] and
    # ', '.join(split(', ')[:-1]).
    head, _sep, last = canonical.rpartition(', ')
    if last.lower() in _unicodeArticles:
        canonical = head
    return soundex(canonical)
def _findRefs(self, o, trefs, nrefs):
    """Find titles or names references in strings.

    Strings are scanned with ``re_titleRef`` and ``re_nameRef``; lists,
    tuples and dictionaries (their values) are walked recursively; any
    other object is ignored.

    :param o: the object to scan.
    :param trefs: dictionary updated in place, mapping the text of a
                  title reference to a Movie object.
    :param nrefs: dictionary updated in place, mapping the text of a
                  name reference to a Person object.
    :returns: the (trefs, nrefs) tuple.
    """
    # type(u'') is 'unicode' on Python 2 and 'str' on Python 3, where the
    # 'unicode' builtin does not exist.
    if isinstance(o, (type(u''), str)):
        for title in re_titleRef.findall(o):
            a_title = analyze_title(title, canonical=0)
            rtitle = build_title(a_title, ptdf=1)
            # "in" replaces dict.has_key(), which is gone on Python 3.
            if rtitle in trefs:
                continue
            # Look up the rebuilt title first, then the text as matched.
            movieID = self._getTitleID(rtitle)
            if movieID is None:
                movieID = self._getTitleID(title)
            if movieID is None:
                continue
            m = Movie(title=rtitle, movieID=movieID,
                      accessSystem=self.accessSystem)
            trefs[rtitle] = m
            # Register the other spellings of the same title as well.
            rtitle2 = canonicalTitle(a_title.get('title', u''))
            if rtitle2 and rtitle2 != rtitle and rtitle2 != title:
                trefs[rtitle2] = m
            if title != rtitle:
                trefs[title] = m
        for name in re_nameRef.findall(o):
            a_name = analyze_name(name, canonical=1)
            rname = build_name(a_name, canonical=1)
            if rname in nrefs:
                continue
            # Look up the rebuilt name first, then the text as matched.
            personID = self._getNameID(rname)
            if personID is None:
                personID = self._getNameID(name)
            if personID is None:
                continue
            p = Person(name=rname, personID=personID,
                       accessSystem=self.accessSystem)
            nrefs[rname] = p
            # Register the other spellings of the same name as well.
            rname2 = normalizeName(a_name.get('name', u''))
            if rname2 and rname2 != rname:
                nrefs[rname2] = p
            if name != rname and name != rname2:
                nrefs[name] = p
    elif isinstance(o, (list, tuple)):
        for item in o:
            self._findRefs(item, trefs, nrefs)
    elif isinstance(o, dict):
        for value in o.values():
            self._findRefs(value, trefs, nrefs)
    return (trefs, nrefs)
def _findRefs(self, o, trefs, nrefs):
    """Find titles or names references in strings.

    Strings are scanned with ``re_titleRef`` and ``re_nameRef``; lists,
    tuples and dictionaries (their values) are walked recursively; any
    other object is ignored.

    :param o: the object to scan.
    :param trefs: dictionary updated in place, mapping the text of a
                  title reference to a Movie object.
    :param nrefs: dictionary updated in place, mapping the text of a
                  name reference to a Person object.
    :returns: the (trefs, nrefs) tuple.
    """
    # The builtins are used instead of the types.*Type aliases, which
    # were removed from the 'types' module in Python 3.  type(u'') is
    # 'unicode' on Python 2 and 'str' on Python 3.
    if isinstance(o, (type(u''), str)):
        for title in re_titleRef.findall(o):
            a_title = analyze_title(title, canonical=1)
            rtitle = build_title(a_title, canonical=1, ptdf=1)
            # "in" replaces dict.has_key(), which is gone on Python 3.
            if rtitle in trefs:
                continue
            # Look up the rebuilt title first, then the text as matched.
            movieID = self._getTitleID(rtitle)
            if movieID is None:
                movieID = self._getTitleID(title)
            if movieID is None:
                continue
            m = Movie(title=rtitle, movieID=movieID,
                      accessSystem=self.accessSystem)
            trefs[rtitle] = m
            # Register the other spellings of the same title as well.
            rtitle2 = canonicalTitle(a_title.get('title', u''))
            if rtitle2 and rtitle2 != rtitle and rtitle2 != title:
                trefs[rtitle2] = m
            if title != rtitle:
                trefs[title] = m
        for name in re_nameRef.findall(o):
            a_name = analyze_name(name, canonical=1)
            rname = build_name(a_name, canonical=1)
            if rname in nrefs:
                continue
            # Look up the rebuilt name first, then the text as matched.
            personID = self._getNameID(rname)
            if personID is None:
                personID = self._getNameID(name)
            if personID is None:
                continue
            p = Person(name=rname, personID=personID,
                       accessSystem=self.accessSystem)
            nrefs[rname] = p
            # Register the other spellings of the same name as well.
            rname2 = normalizeName(a_name.get('name', u''))
            if rname2 and rname2 != rname:
                nrefs[rname2] = p
            if name != rname and name != rname2:
                nrefs[name] = p
    elif isinstance(o, (list, tuple)):
        for item in o:
            self._findRefs(item, trefs, nrefs)
    elif isinstance(o, dict):
        for value in o.values():
            self._findRefs(value, trefs, nrefs)
    return (trefs, nrefs)
def scan_titles(titles_list, title1, title2, title3, results=0,
                searchingEpisode=0, onlyEpisodes=0, ro_thresold=None):
    """Scan a list of titles, searching for best matches against
    the given variations.

    :param titles_list: iterable of (id, title_data) pairs, where
                        title_data is a dictionary with a 'title' entry
                        and, optionally, a 'kind' entry.
    :param title1: first variation of the searched title.
    :param title2: second variation; when it differs from title1 the
                   searched title is considered to carry an article.
    :param title3: third variation (may be empty); if it ends with '}'
                   the search is restricted to episodes.
    :param results: if greater than zero, maximum number of matches returned.
    :param searchingEpisode: if true, only entries of kind 'episode' are
                             considered; otherwise episodes are skipped.
    :param onlyEpisodes: if true, only episodes are considered and they
                         are compared against title1 alone.
    :param ro_thresold: minimum ratio of a match; 0.6 if None.
    :returns: a list of (ratio, (id, title_data)) tuples, sorted from the
              highest to the lowest.
    """
    if ro_thresold is not None:
        RO_THRESHOLD = ro_thresold
    else:
        RO_THRESHOLD = 0.6
    sm1 = SequenceMatcher()
    sm2 = SequenceMatcher()
    sm3 = SequenceMatcher()
    sm1.set_seq1(title1.lower())
    # Seed the first sequence, like sm1 and sm3: this used to call
    # set_seq2(), leaving the first sequence of sm2 empty.
    # NOTE(review): ratcliff() is presumed to set only the second
    # sequence of the matcher it receives - confirm.
    sm2.set_seq1(title2.lower())
    if title3:
        sm3.set_seq1(title3.lower())
        if title3[-1] == '}':
            searchingEpisode = 1
    hasArt = 0
    if title2 != title1:
        hasArt = 1
    resd = {}
    for i, t_data in titles_list:
        if onlyEpisodes:
            if t_data.get('kind') != 'episode':
                continue
            til = t_data['title']
            # A slice (instead of til[-1]) does not fail on an empty title.
            if til[-1:] == ')':
                # Strip the trailing parenthesized part of the title.
                dateIdx = til.rfind('(')
                if dateIdx != -1:
                    til = til[:dateIdx].rstrip()
            if not til:
                continue
            ratio = ratcliff(title1, til, sm1)
            if ratio >= RO_THRESHOLD:
                resd[i] = (ratio, (i, t_data))
            continue
        if searchingEpisode:
            if t_data.get('kind') != 'episode':
                continue
        elif t_data.get('kind') == 'episode':
            continue
        til = t_data['title']
        # XXX: on Symbian, here we get a byte string; not sure this is the
        #      right place to fix it.
        # On Python 2 'bytes' is 'str', so this is the old
        # unicode(til, 'latin1', 'ignore') conversion; on Python 3 text
        # strings are left untouched.
        if isinstance(til, bytes):
            til = til.decode('latin1', 'ignore')
        # Distance with the canonical title (with or without article).
        #   titleS      -> titleR
        #   titleS, the -> titleR, the
        if not searchingEpisode:
            til = canonicalTitle(til)
            ratios = [ratcliff(title1, til, sm1) + 0.05]
            # til2 is til without the article, if present.
            til2 = til
            tils = til2.split(', ')
            matchHasArt = 0
            if tils[-1].lower() in _unicodeArticles:
                til2 = ', '.join(tils[:-1])
                matchHasArt = 1
            if hasArt and not matchHasArt:
                #   titleS[, the]  -> titleR
                ratios.append(ratcliff(title2, til, sm2))
            elif matchHasArt and not hasArt:
                #   titleS  -> titleR[, the]
                ratios.append(ratcliff(title1, til2, sm1))
        else:
            ratios = [0.0]
        if title3:
            # Distance with the long imdb canonical title.
            ratios.append(
                ratcliff(title3, build_title(t_data, canonical=1, ptdf=1),
                         sm3) + 0.1)
        ratio = max(ratios)
        if ratio >= RO_THRESHOLD:
            # Keep only the best ratio found for a given id.
            if i not in resd or ratio > resd[i][0]:
                resd[i] = (ratio, (i, t_data))
    # dict.values() cannot be sorted in place on Python 3.
    res = sorted(resd.values(), reverse=True)
    if results > 0:
        res = res[:results]
    return res
def strip_article(title):
    """Return the canonical form of a title, without its article.

    The title is converted with ``canonicalTitle``; if the piece after
    the last ', ' is one of ``_unicodeArticles`` (compared in lower
    case), that piece is removed.

    :param title: the title to process.
    :returns: the converted title, minus the trailing article if any.
    """
    canonical = canonicalTitle(title)
    pieces = canonical.split(', ')
    if pieces[-1].lower() not in _unicodeArticles:
        return canonical
    return ', '.join(pieces[:-1])
def scan_titles(titles_list, title1, title2, title3, results=0,
                searchingEpisode=0, onlyEpisodes=0, ro_thresold=None):
    """Scan a list of titles, searching for best matches against
    the given variations.

    :param titles_list: iterable of (id, title_data) pairs, where
                        title_data is a dictionary with a 'title' entry
                        and, optionally, a 'kind' entry.
    :param title1: first variation of the searched title.
    :param title2: second variation; when it differs from title1 the
                   searched title is considered to carry an article.
    :param title3: third variation (may be empty); if it ends with '}'
                   the search is restricted to episodes.
    :param results: if greater than zero, maximum number of matches returned.
    :param searchingEpisode: if true, only entries of kind 'episode' are
                             considered; otherwise episodes are skipped.
    :param onlyEpisodes: if true, only episodes are considered and they
                         are compared against title1 alone.
    :param ro_thresold: minimum ratio of a match; 0.6 if None.
    :returns: a list of (ratio, (id, title_data)) tuples, sorted from the
              highest to the lowest.
    """
    if ro_thresold is not None:
        RO_THRESHOLD = ro_thresold
    else:
        RO_THRESHOLD = 0.6
    sm1 = SequenceMatcher()
    sm2 = SequenceMatcher()
    sm3 = SequenceMatcher()
    sm1.set_seq1(title1.lower())
    # Seed the first sequence, like sm1 and sm3: this used to call
    # set_seq2(), leaving the first sequence of sm2 empty.
    # NOTE(review): ratcliff() is presumed to set only the second
    # sequence of the matcher it receives - confirm.
    sm2.set_seq1(title2.lower())
    if title3:
        sm3.set_seq1(title3.lower())
        if title3[-1] == '}':
            searchingEpisode = 1
    hasArt = 0
    if title2 != title1:
        hasArt = 1
    resd = {}
    for i, t_data in titles_list:
        if onlyEpisodes:
            if t_data.get('kind') != 'episode':
                continue
            til = t_data['title']
            # A slice (instead of til[-1]) does not fail on an empty title.
            if til[-1:] == ')':
                # Strip the trailing parenthesized part of the title.
                dateIdx = til.rfind('(')
                if dateIdx != -1:
                    til = til[:dateIdx].rstrip()
            if not til:
                continue
            ratio = ratcliff(title1, til, sm1)
            if ratio >= RO_THRESHOLD:
                resd[i] = (ratio, (i, t_data))
            continue
        if searchingEpisode:
            if t_data.get('kind') != 'episode':
                continue
        elif t_data.get('kind') == 'episode':
            continue
        til = t_data['title']
        # XXX: on Symbian, here we get a byte string; not sure this is the
        #      right place to fix it.
        # On Python 2 'bytes' is 'str', so this is the old
        # unicode(til, 'latin1', 'ignore') conversion; on Python 3 text
        # strings are left untouched.
        if isinstance(til, bytes):
            til = til.decode('latin1', 'ignore')
        # Distance with the canonical title (with or without article).
        #   titleS      -> titleR
        #   titleS, the -> titleR, the
        if not searchingEpisode:
            til = canonicalTitle(til)
            ratios = [ratcliff(title1, til, sm1) + 0.05]
            # til2 is til without the article, if present.
            til2 = til
            tils = til2.split(', ')
            matchHasArt = 0
            if tils[-1].lower() in _unicodeArticles:
                til2 = ', '.join(tils[:-1])
                matchHasArt = 1
            if hasArt and not matchHasArt:
                #   titleS[, the]  -> titleR
                ratios.append(ratcliff(title2, til, sm2))
            elif matchHasArt and not hasArt:
                #   titleS  -> titleR[, the]
                ratios.append(ratcliff(title1, til2, sm1))
        else:
            ratios = [0.0]
        if title3:
            # Distance with the long imdb canonical title.
            ratios.append(
                ratcliff(title3, build_title(t_data, canonical=1, ptdf=1),
                         sm3) + 0.1)
        ratio = max(ratios)
        if ratio >= RO_THRESHOLD:
            # Keep only the best ratio found for a given id.
            if i not in resd or ratio > resd[i][0]:
                resd[i] = (ratio, (i, t_data))
    # dict.values() cannot be sorted in place on Python 3.
    res = sorted(resd.values(), reverse=True)
    if results > 0:
        res = res[:results]
    return res