def _search_movie(self, title, results):
    """Search for movies matching ``title`` and return the parsed results.

    The search page is fetched with ``self._get_search_content``.  If the
    page ``<title>`` does not start with ``imdb title`` the server answered
    with the movie page itself (a direct hit) and a single result is built
    from the canonical link; otherwise every result cell is parsed.

    title   -- the title to search for.
    results -- the maximum number of results requested from the server.

    Returns a list of ``(movieID, title_data)`` tuples, where ``title_data``
    is whatever ``analyze_title`` returns (with an extra ``'akas'`` entry
    when alternative titles were found).  The list is empty on failure.
    """
    cont = subXMLRefs(self._get_search_content('tt', title, results))
    # Keep the searched title in ``title``: it is needed for the log
    # messages below (it used to be overwritten by the page title).
    page_title = _findBetween(cont, '<title>', '</title>', maxRes=1)
    res = []
    if not page_title:
        self._mobile_logger.error('no title tag searching for movie %s',
                                  title)
        return res
    if not page_title[0].lower().startswith('imdb title'):
        # A direct hit: the page is the movie page itself.
        hit_title = _unHtml(page_title[0])
        mid = None
        midtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
        if midtag:
            mid = _findBetween(midtag[0], '/title/tt', '/', maxRes=1)
        if not (mid and hit_title):
            self._mobile_logger.error('no direct hit title/movieID for'
                                      ' title %s', title)
            return res
        if cont.find('<span class="tv-extra">TV mini-series</span>') != -1:
            hit_title += ' (mini)'
        res[:] = [(str(mid[0]), analyze_title(hit_title))]
        return res
    # XXX: this results*3 prevents some recursion errors, but...
    #      it's not exactly understandable (i.e.: why 'results' is
    #      not enough to get all the results?)
    lis = _findBetween(cont, 'td valign="top">', '</td>',
                       maxRes=results*3)
    for li in lis:
        akas = []
        akaIdx = li.find('aka <em>')
        if akaIdx != -1:
            raw_akas = [_unHtml(x) for x in li[akaIdx:].split('<br>')]
            li = li[:akaIdx]
            for aka in raw_akas:
                aka = aka.replace('" - ', '::')
                if aka.startswith('aka "'):
                    aka = aka[5:]
                # endswith() is safe on empty strings, unlike aka[-1].
                if aka.endswith('"'):
                    aka = aka[:-1]
                # Splitting on '<br>' can leave empty fragments: drop them.
                if aka:
                    akas.append(aka)
        imdbid = re_imdbID.findall(li)
        mtitle = _unHtml(li)
        if not (imdbid and mtitle):
            self._mobile_logger.debug('no title/movieID parsing'
                                      ' %s searching for title %s',
                                      li, title)
            continue
        mtitle = mtitle.replace('(TV mini-series)', '(mini)')
        resd = analyze_title(mtitle)
        if akas:
            resd['akas'] = akas
        res.append((str(imdbid[0]), resd))
    return res
def _search_movie(self, title, results):
    """Search for movies matching ``title`` and return the parsed results.

    The search page is fetched with ``self._get_search_content``.  If the
    page ``<title>`` does not start with ``find - imdb`` the server answered
    with the movie page itself (a direct hit) and a single result is built
    from the canonical link; otherwise every ``result_text`` cell is parsed.

    title   -- the title to search for.
    results -- the maximum number of results requested from the server.

    Returns a list of ``(movieID, title_data)`` tuples, where ``title_data``
    is whatever ``analyze_title`` returns (with an extra ``'akas'`` entry
    when alternative titles were found).  The list is empty on failure.
    """
    cont = subXMLRefs(self._get_search_content('tt', title, results))
    # Keep the searched title in ``title``: it is needed for the log
    # messages below (it used to be overwritten by the page title).
    page_title = _findBetween(cont, '<title>', '</title>', maxRes=1)
    res = []
    if not page_title:
        self._mobile_logger.error('no title tag searching for movie %s',
                                  title)
        return res
    if not page_title[0].lower().startswith('find - imdb'):
        # A direct hit: the page is the movie page itself.
        hit_title = _unHtml(page_title[0])
        mid = None
        midtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
        if midtag:
            mid = _findBetween(midtag[0], '/title/tt', '/', maxRes=1)
        if not (mid and hit_title):
            self._mobile_logger.error('no direct hit title/movieID for'
                                      ' title %s', title)
            return res
        if cont.find('<span class="tv-extra">TV mini-series</span>') != -1:
            hit_title += ' (mini)'
        res[:] = [(str(mid[0]), analyze_title(hit_title))]
        return res
    # XXX: this results*3 prevents some recursion errors, but...
    #      it's not exactly understandable (i.e.: why 'results' is
    #      not enough to get all the results?)
    lis = _findBetween(cont, 'td class="result_text">', '</td>',
                       maxRes=results*3)
    for li in lis:
        akas = []
        for aka in re_makas.findall(li):
            aka = aka.replace('" - ', '::', 1)
            aka = _unHtml(aka)
            if aka.startswith('aka "'):
                aka = aka[5:].strip()
            # endswith() is safe on empty strings, unlike aka[-1].
            if aka.endswith('"'):
                aka = aka[:-1]
            # An AKA reduced to nothing carries no information: drop it.
            if aka:
                akas.append(aka)
        # The movieID must be extracted before the AKAs are stripped out.
        imdbid = re_imdbID.findall(li)
        li = re_makas.sub('', li)
        mtitle = _unHtml(li)
        if not (imdbid and mtitle):
            self._mobile_logger.debug('no title/movieID parsing'
                                      ' %s searching for title %s',
                                      li, title)
            continue
        mtitle = mtitle.replace('(TV mini-series)', '(mini)')
        resd = analyze_title(mtitle)
        if akas:
            resd['akas'] = akas
        res.append((str(imdbid[0]), resd))
    return res
def _search_movie(self, title, results):
    """Search for movies matching ``title`` and return the parsed results.

    The search page is fetched with ``self._get_search_content``.  If the
    page ``<title>`` does not start with ``find - imdb`` the server answered
    with the movie page itself (a direct hit) and a single result is built
    from the canonical link; otherwise every ``result_text`` cell is parsed.

    title   -- the title to search for.
    results -- the maximum number of results requested from the server.

    Returns a list of ``(movieID, title_data)`` tuples, where ``title_data``
    is whatever ``analyze_title`` returns (with an extra ``"akas"`` entry
    when alternative titles were found).  The list is empty on failure.
    """
    cont = subXMLRefs(self._get_search_content("tt", title, results))
    # Keep the searched title in ``title``: it is needed for the log
    # messages below (it used to be overwritten by the page title).
    page_title = _findBetween(cont, "<title>", "</title>", maxRes=1)
    res = []
    if not page_title:
        self._mobile_logger.error("no title tag searching for movie %s", title)
        return res
    if not page_title[0].lower().startswith("find - imdb"):
        # A direct hit: the page is the movie page itself.
        hit_title = _unHtml(page_title[0])
        mid = None
        midtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
        if midtag:
            mid = _findBetween(midtag[0], "/title/tt", "/", maxRes=1)
        if not (mid and hit_title):
            self._mobile_logger.error(
                "no direct hit title/movieID for" " title %s", title
            )
            return res
        if cont.find('<span class="tv-extra">TV mini-series</span>') != -1:
            hit_title += " (mini)"
        res[:] = [(str(mid[0]), analyze_title(hit_title))]
        return res
    # XXX: this results*3 prevents some recursion errors, but...
    #      it's not exactly understandable (i.e.: why 'results' is
    #      not enough to get all the results?)
    lis = _findBetween(
        cont, 'td class="result_text">', "</td>", maxRes=results * 3
    )
    for li in lis:
        akas = []
        for aka in re_makas.findall(li):
            aka = aka.replace('" - ', "::", 1)
            aka = _unHtml(aka)
            if aka.startswith('aka "'):
                aka = aka[5:].strip()
            # endswith() is safe on empty strings, unlike aka[-1].
            if aka.endswith('"'):
                aka = aka[:-1]
            # An AKA reduced to nothing carries no information: drop it.
            if aka:
                akas.append(aka)
        # The movieID must be extracted before the AKAs are stripped out.
        imdbid = re_imdbID.findall(li)
        li = re_makas.sub("", li)
        mtitle = _unHtml(li)
        if not (imdbid and mtitle):
            self._mobile_logger.debug(
                "no title/movieID parsing" " %s searching for title %s",
                li,
                title,
            )
            continue
        mtitle = mtitle.replace("(TV mini-series)", "(mini)")
        resd = analyze_title(mtitle)
        if akas:
            resd["akas"] = akas
        res.append((str(imdbid[0]), resd))
    return res
def _search_episode(self, title, results):
    """Search for episodes matching ``title``.

    The work is delegated to ``self._search_movie``; its ``_episodes``
    flag is switched off when ``analyze_title`` already recognizes
    ``title`` as an episode title, and switched on otherwise.

    title   -- the title to search for; surrounding whitespace is ignored.
    results -- passed through to ``self._search_movie``.

    Returns whatever ``self._search_movie`` returns, or an empty list for
    an empty title (it used to return None, unlike ``_search_movie``).
    """
    title = title.strip()
    if not title:
        return []
    only_episodes = analyze_title(title)['kind'] != 'episode'
    return self._search_movie(title, results, _episodes=only_episodes)
def titleVariations(title, fromPtdf=0):
    """Return three variations of ``title`` to be used in searches.

    The result is the tuple ``(title1, title2, title3)``:
    title1 -- the canonical title;
    title2 -- title1 without its trailing article (or title1 unchanged);
    title3 -- the long imdb canonical name, or an empty string.

    If fromPtdf is true, ``title`` is assumed to be in the format used by
    the plain text data files.
    """
    if fromPtdf or re_year_index.search(title):
        # There seems to be a (year[/imdbIndex]) indication: treat the
        # input as a long imdb canonical name.
        parsed = analyze_title(title, canonical=1)
        main_title = parsed['title']
        if parsed['kind'] == 'episode':
            main_title = normalizeTitle(main_title)
            long_title = build_title(parsed, canonical=1, ptdf=1)
        elif fromPtdf:
            long_title = title
        else:
            long_title = build_title(parsed, canonical=1, ptdf=1)
    else:
        # A bare title, without any other information.
        main_title = canonicalTitle(title)
        long_title = u''
    no_article = u''
    if main_title:
        no_article = main_title
        pieces = main_title.split(u', ')
        if pieces[-1].lower() in _unicodeArticles:
            no_article = u', '.join(pieces[:-1])
    return main_title, no_article, long_title
def do_br(self, attrs):
    """Handle a <br> tag, closing a pending series title or series info.

    attrs -- the attributes of the tag (unused).

    When a series title was being collected and a series ID is known, the
    'kind' and 'episode of' entries of ``self._result`` are set.  When
    series information was being collected, it is matched against
    ``self.re_airdate`` and 'original air date', 'season' and 'episode'
    are stored in ``self._result``.  The collected text is always reset.
    """
    if self._in_series_title:
        self._in_series_title = 0
        st = self._series_title.strip()
        if st and self.__seriesID:
            d_title = analyze_title(st, canonical=1)
            m = Movie(movieID=str(self.__seriesID), data=d_title,
                      accessSystem=self._as, modFunct=self._modFunct)
            self._result['kind'] = u'episode'
            self._result['episode of'] = m
        self._series_title = u''
    elif self._in_series_info:
        self._in_series_info = 0
        # Collapse every run of whitespace into a single space.
        si = ' '.join(self._series_info.split())
        if si:
            aid = self.re_airdate.findall(si)
            if aid and len(aid[0]) == 3:
                date, season, episode = aid[0]
                date = date.strip()
                # Non-numeric values are kept as they are.
                try:
                    season = int(season)
                except (TypeError, ValueError):
                    pass
                try:
                    episode = int(episode)
                except (TypeError, ValueError):
                    pass
                if date and date != '????':
                    self._result['original air date'] = date
                # Handle also "season 0" and "episode 0": a numeric zero
                # is a valid value, even if it's false.
                if season or isinstance(season, int):
                    self._result['season'] = season
                # This check used to test the type of 'season', so that
                # an episode 0 was lost whenever season wasn't a number.
                if episode or isinstance(episode, int):
                    self._result['episode'] = episode
        self._series_info = u''
def _search_episode(self, title, results):
    """Search for episodes matching ``title``.

    If ``analyze_title`` recognizes ``title`` as an episode title, only
    its 'title' part is searched for.  The content fetched with
    ``self._get_search_content`` is parsed by the search movie parser of
    ``self.smProxy``, and the 'data' entry of the result is returned.
    """
    parsed = analyze_title(title)
    query = parsed['title'] if parsed['kind'] == 'episode' else title
    page = self._get_search_content('ep', query, results)
    parser = self.smProxy.search_movie_parser
    return parser.parse(page, results=results)['data']
def titleVariations(title, fromPtdf=0):
    """Return three variations of ``title`` to be used in searches.

    The result is the tuple ``(title1, title2, title3)``:
    title1 -- the canonical title;
    title2 -- title1 without its trailing article (or title1 unchanged);
    title3 -- the long imdb canonical name, or an empty string.

    If fromPtdf is true, ``title`` is assumed to be in the format used by
    the plain text data files.
    """
    if fromPtdf or re_year_index.search(title):
        # There seems to be a (year[/imdbIndex]) indication: treat the
        # input as a long imdb canonical name.
        parsed = analyze_title(title, canonical=1)
        main_title = parsed['title']
        if parsed['kind'] == 'episode':
            main_title = normalizeTitle(main_title)
            long_title = build_title(parsed, canonical=1, ptdf=1)
        elif fromPtdf:
            long_title = title
        else:
            long_title = build_title(parsed, canonical=1, ptdf=1)
    else:
        # A bare title, without any other information.
        main_title = canonicalTitle(title)
        long_title = u''
    no_article = u''
    if main_title:
        no_article = main_title
        pieces = main_title.split(u', ')
        if pieces[-1].lower() in _articles:
            no_article = u', '.join(pieces[:-1])
    return main_title, no_article, long_title
def postprocess_data(self, data):
    """Turn the extracted chart entries into (movieID, info) tuples.

    data -- a dictionary whose ``self.label`` entry is a list of
            dictionaries; entries without the 'movieID', ``self.ranktext``
            or 'title' keys, or with an unparsable movieID, are skipped,
            and so are the entries for an already seen movieID.

    Returns a list of ``(movieID, info)`` tuples, where movieID is a
    string and info is the result of ``analyze_title``, enriched with the
    rank and - when they can be parsed - the 'votes' and 'rating' values.
    """
    if not data or self.label not in data:
        return []
    mlist = []
    # Avoid duplicates.  A real fix, using XPath, would be desirable.
    # XXX: probably this is no longer needed.
    seen_ids = set()
    for d in data[self.label]:
        if 'movieID' not in d or self.ranktext not in d or 'title' not in d:
            continue
        theID = analyze_imdbid(d['movieID'])
        if theID is None:
            continue
        theID = str(theID)
        if theID in seen_ids:
            continue
        seen_ids.add(theID)
        # The year is optional: don't fail on entries lacking it.
        full_title = d['title']
        if 'year' in d:
            full_title = full_title + " " + d['year']
        minfo = analyze_title(full_title)
        # The values below are parsed on a best-effort basis: anything
        # in an unexpected format is silently left out.
        try:
            minfo[self.ranktext] = int(d[self.ranktext].replace('.', ''))
        except (AttributeError, TypeError, ValueError):
            pass
        if 'votes' in d:
            try:
                votes = d['votes'].replace(' votes', '')
                votes = votes.split(' based on ')[1]
                minfo['votes'] = int(votes.replace(',', ''))
            except (AttributeError, IndexError, TypeError, ValueError):
                pass
        if 'rating' in d:
            try:
                minfo['rating'] = float(d['rating'])
            except (TypeError, ValueError):
                pass
        mlist.append((theID, minfo))
    return mlist
def _buildEpisodes(self, eps_list, parentID):
    """Build the episodes of the series identified by ``parentID``.

    eps_list -- a sequence of (episodeID, episodeTitle) pairs; the titles
                are decoded from latin_1.
    parentID -- the movieID of the series.

    Returns a dictionary mapping every season (or 'UNKNOWN') to a
    dictionary of ``{episode number: Movie instance}``; episodes without
    a number get the highest number of their season, plus one.
    """
    titles_index = '%stitles.index' % self.__db
    titles_key = '%stitles.key' % self.__db
    series_title = getLabel(parentID, titles_index, titles_key)
    series = Movie(title=series_title, movieID=parentID,
                   accessSystem='local')
    by_season = {}
    for episodeID, raw_title in eps_list:
        decoded = unicode(raw_title, 'latin_1', 'replace')
        info = analyze_title(decoded, canonical=1)
        episode = Movie(data=info, movieID=episodeID, accessSystem='local')
        episode['episode of'] = series
        if info.get('year') is None:
            # No year in the title: look for it in the movies data file.
            year = getFullIndex('%smovies.data' % self.__db,
                                key=episodeID, kind='moviedata', rindex=1)
            if year:
                episode['year'] = year
        season_eps = by_season.setdefault(info.get('season', 'UNKNOWN'), {})
        number = info.get('episode')
        if number is None:
            number = max((season_eps.keys() or [0])) + 1
        season_eps[number] = episode
    return by_season
def _search_movie(self, title, results, _episodes=False):
    """Search the local titles key file for ``title``.

    title     -- the title to search for; surrounding whitespace is ignored.
    results   -- the maximum number of results; values not greater than
                 zero mean no limit.
    _episodes -- if true the normalized title is searched as-is and the
                 flag is passed on to ``_scan_titles``.

    Returns a list of ``(movieID, title_data)`` tuples, without duplicated
    movieIDs; AKA titles are replaced by the real title, which gets the
    AKA in its 'akas' entry.  Adult titles are excluded unless
    ``self.doAdult`` is true.
    """
    title = title.strip()
    if not title:
        return []
    # The title variations to search for.
    if _episodes:
        title1, title2, title3 = normalizeTitle(title), '', ''
    else:
        title1, title2, title3 = titleVariations(title, fromPtdf=1)
    # XXX: only a guess: more results than requested are asked for, since
    #      Adult titles and duplicated entries will be removed.
    scanned = _scan_titles('%stitles.key' % self.__db,
                           title1, title2, title3, results * 3, _episodes)
    candidates = [item[1] for item in scanned]
    if not self.doAdult:
        # Discard the Adult movies.
        kept = []
        for entry in candidates:
            genres = getMovieMisc(
                movieID=entry[0],
                dataF='%s%s.data' % (self.__db, 'genres'),
                indexF='%s%s.index' % (self.__db, 'genres'),
                attrIF='%sattributes.index' % self.__db,
                attrKF='%sattributes.key' % self.__db)
            if 'Adult' in genres:
                continue
            kept.append(entry)
        candidates = kept
    # Replace the AKAs with the real titles.
    # XXX: duplicated code!
    merged = []
    seen = []
    for movieID, info in candidates:
        # Skip duplicates.
        # XXX: find a way to prefer titles with an AKA?  Or prefer
        #      the original title?
        if movieID in seen:
            continue
        seen.append(movieID)
        realMID = self._get_real_movieID(movieID)
        if movieID == realMID:
            merged.append((movieID, info))
            continue
        if realMID in seen:
            continue
        seen.append(realMID)
        aka_title = build_title(info, canonical=0)
        real_title = getLabel(realMID, '%stitles.index' % self.__db,
                              '%stitles.key' % self.__db)
        if aka_title == real_title:
            merged.append((realMID, info))
            continue
        real_info = analyze_title(real_title, canonical=1)
        real_info['akas'] = [aka_title]
        merged.append((realMID, real_info))
    if results > 0:
        merged = merged[:results]
    return merged
def custom_analyze_title(title):
    """Analyze ``title`` after dropping everything from the first ' aka '.

    If nothing precedes the first ' aka ', the whole title is analyzed.
    Returns the result of ``analyze_title``, or an empty dictionary for
    an empty title.
    """
    # XXX: a very crude way to get rid of the notes after the title.
    head = title.split(' aka ')[0]
    cleaned = head or title
    if not cleaned:
        return {}
    return analyze_title(cleaned)
class DOMHTMLSearchMovieParser(DOMParserBase):
    """Parser for the page listing the results of a title search.

    Every ``td`` cell of class ``result_text`` becomes a tuple
    ``(movieID, title data, akas, cover url)``, later reduced by
    ``postprocess_data`` to ``(movieID, title data)``.
    """
    rules = [
        Rule(
            key='data',
            extractor=Rules(
                foreach='//td[@class="result_text"]',
                rules=[
                    Rule(
                        key='link',
                        extractor=Path('./a/@href', reduce=reducers.first)
                    ),
                    Rule(
                        key='info',
                        extractor=Path('.//text()')
                    ),
                    Rule(
                        key='akas',
                        extractor=Path(foreach='./i', path='./text()')
                    ),
                    Rule(
                        key='cover url',
                        extractor=Path(
                            '../td[@class="primary_photo"]/a/img/@src'
                        )
                    )
                ],
                transform=lambda x: (
                    analyze_imdbid(x.get('link')),
                    analyze_title(x.get('info', '')),
                    x.get('akas'),
                    x.get('cover url')
                )
            )
        )
    ]

    def _init(self):
        """Set the initial state of the parser."""
        self.url = ''
        # The key under which the image URL is stored in the title data.
        self.img_type = 'cover url'

    def _reset(self):
        """Forget the URL of the previously parsed page."""
        self.url = ''

    def postprocess_data(self, data):
        """Limit, filter and flatten the list stored in ``data['data']``.

        The list is cut to ``self.results`` items (when that attribute is
        set and not None), the items lacking the movieID or the title
        data are removed, and the AKAs and the image URL are moved into
        the title data.  Returns the modified ``data``.
        """
        if 'data' not in data:
            data['data'] = []
        limit = getattr(self, 'results', None)
        if limit is not None:
            data['data'][:] = data['data'][:limit]
        # Horrible hack to support AKAs.
        data['data'] = [item for item in data['data'] if item[0] and item[1]]
        entries = data['data']
        with_extras = (data and entries and len(entries[0]) == 4
                       and isinstance(entries[0], tuple))
        if not with_extras:
            return data
        for idx, datum in enumerate(entries):
            if not isinstance(datum, tuple):
                continue
            if not datum[0] and datum[1]:
                continue
            title_data = datum[1]
            if datum[2] is not None:
                # Strip the quotes surrounding every AKA.
                title_data['akas'] = [aka[1:-1] for aka in datum[2]]
            if datum[3] is not None:
                title_data[self.img_type] = datum[3]
            entries[idx] = (datum[0], title_data)
        return data

    def add_refs(self, data):
        """Return ``data`` unchanged: no references are added."""
        return data
def _search_movie(self, title, results):
    """Search for movies matching ``title`` and return the parsed results.

    The search page is fetched with ``self._get_search_content``.  If the
    page ``<title>`` does not start with ``imdb title`` the server answered
    with the movie page itself (a direct hit) and a single result is
    built; otherwise every result cell is parsed.

    title   -- the title to search for.
    results -- the maximum number of results requested from the server.

    Returns a list of ``(movieID, title_data)`` tuples, where ``title_data``
    is whatever ``analyze_title`` returns; for results coming from the
    list, it also has an 'image' entry (possibly an empty string).
    """
    cont = subXMLRefs(self._get_search_content('tt', title, results))
    page_title = _findBetween(cont, '<title>', '</title>', maxRes=1)
    res = []
    if not page_title:
        return res
    if not page_title[0].lower().startswith('imdb title'):
        # XXX: a direct hit!
        hit_title = _unHtml(page_title[0])
        midtag = _getTagsWith(cont, 'name="arg"', maxRes=1)
        if not midtag:
            midtag = _getTagsWith(cont, 'name="auto"', maxRes=1)
        mid = None
        if midtag:
            mid = _findBetween(midtag[0], 'value="', '"', maxRes=1)
            if mid and not mid[0].isdigit():
                mid = re_imdbID.findall(mid[0])
        if not (mid and hit_title):
            return res
        if cont.find('<span class="tv-extra">TV mini-series</span>') != -1:
            hit_title += ' (mini)'
        res[:] = [(str(mid[0]), analyze_title(hit_title, canonical=1))]
        return res
    cont = _reAKAS.sub('</td>', cont)
    lis = _findBetween(cont, 'td valign="top">', ['</td>', '</small>'])
    # The image found in the last cell without a title; presumably the
    # cell with the image comes right before the one with the title.
    # It must be initialized here, or the first titled cell not preceded
    # by an image cell would raise a NameError.
    img = ''
    for li in lis:
        imdbid = re_imdbID.findall(li)
        mtitle = _unHtml(li)
        if not (imdbid and mtitle):
            found = _findBetween(li, '<img src="', ['" '])
            if found:
                img = found[0]
            else:
                img = ''
            continue
        mtitle = mtitle.replace('(TV mini-series)', '(mini)')
        movie = (str(imdbid[0]), analyze_title(mtitle, canonical=1))
        movie[1]['image'] = img
        # Don't let the following results inherit this image.
        img = ''
        res.append(movie)
    return res
def _scan_titles(keyFile, title1, title2, title3, results=0):
    """Search the given key file for the three title variations.

    The titles are encoded in latin_1 and handed to the ``search_title``
    function; every match ``x`` is returned as the tuple
    ``(x[0], (x[1], analyze_title(x[2] converted by latin2utf)))``.
    """
    encoded = [t.encode('latin_1', 'replace')
               for t in (title1, title2, title3)]
    title1, title2, title3 = encoded
    matches = search_title(keyFile, title1, title2, title3, results)
    return [(match[0], (match[1], analyze_title(latin2utf(match[2]))))
            for match in matches]
def custom_analyze_title4kwd(title, yearNote, outline):
    """Analyze a title, given along with its year note and plot outline.

    title    -- the title; surrounding whitespace is ignored.
    yearNote -- if not empty, its first space-separated word followed by
                a closing parenthesis is appended to the title.
    outline  -- if not empty, stored in the 'plot outline' entry.

    Returns the result of ``analyze_title``, or an empty dictionary for
    an empty title.
    """
    title = title.strip()
    if not title:
        return {}
    if yearNote:
        first_word = yearNote.split(' ')[0]
        title = title + ' ' + ('%s)' % first_word)
    info = analyze_title(title)
    if outline:
        info['plot outline'] = outline
    return info
def end_a(self):
    """Handle a closing </a> tag.

    After an episode title, the collection of the miscellaneous info is
    started; after a series title, a Movie instance for the series is
    stored in ``self._cur_series`` (when both the title and the series ID
    are available).
    """
    if self._in_episode_title:
        self._in_episode_title = 0
        self._in_misc_info = 1
        return
    if not self._in_series_title:
        return
    self._in_series_title = 0
    series_title = self._cur_series_title.strip()
    if not series_title or self._series_id is None:
        return
    self._cur_series = Movie(movieID=str(self._series_id),
                             data=analyze_title(series_title, canonical=1),
                             accessSystem=self._as,
                             modFunct=self._modFunct)
def _scan_titles(keyFile, title1, title2, title3, results=0,
                 _only_episodes=0):
    """Search the given key file for the three title variations.

    The titles are encoded in latin_1 and handed, along with ``results``
    and ``_only_episodes``, to the ``search_title`` function; every match
    ``x`` is returned as the tuple
    ``(x[0], (x[1], analyze_title(x[2] converted by latin2utf)))``.
    """
    encoded = [t.encode('latin_1', 'replace')
               for t in (title1, title2, title3)]
    title1, title2, title3 = encoded
    matches = search_title(keyFile, title1, title2, title3, results,
                           _only_episodes)
    return [(match[0], (match[1], analyze_title(latin2utf(match[2]))))
            for match in matches]
def _readTitlesKeyFile(keyFile, searchingEpisode=0):
    """Iterate over the given key file, yielding tuples suited for the
    common.locsql.scan_titles function.

    keyFile          -- path of a file made of ``title|hexadecimal ID``
                        lines.
    searchingEpisode -- if true, only the titles ending with '}' are
                        considered; otherwise, only the other ones.

    Yields ``(ID, title_data)`` tuples, where title_data is the result of
    ``analyze_title``.  Raises IMDbDataAccessError if the file can't be
    opened.
    """
    try:
        kf = open(keyFile, 'r')
    except IOError as e:
        raise IMDbDataAccessError(str(e))
    # The file must be closed even if the generator is not consumed to
    # the end, or if an exception is raised while parsing a line.
    try:
        for line in kf:
            ls = line.split('|')
            t = ls[0]
            if not t:
                continue
            is_episode = t[-1] == '}'
            if searchingEpisode:
                if not is_episode:
                    continue
            elif is_episode:
                continue
            titled = analyze_title(latin2utf(t))
            yield (long(ls[1], 16), titled)
    finally:
        kf.close()
class DOMBasicMovieParser(DOMParserBase):
    """Extract only the title and the imdbID of a movie.

    It's used by the DOMHTMLSearchMovieParser class to return a result
    for a direct match (when a search on IMDb results in a single movie,
    the web server sends directly the movie page).
    """
    # Kept generic enough to be reused by the other DOMBasic*Parser classes.
    _titleAttrPath = ".//text()"
    _linkPath = "//link[@rel='canonical']"

    def _titleFunct(self, x):
        """Analyze the extracted title (an empty string if missing)."""
        return analyze_title(x or '')

    # Remove 'More at IMDb Pro' links.
    preprocessors = [
        (re.compile(r'<span class="pro-link".*?</span>'), ''),
        (re.compile(r'<a href="http://ad.doubleclick.net.*?;id=(co[0-9]{7});'),
         r'<a href="http://pro.imdb.com/company/\1"></a>< a href="')
    ]

    def _init(self):
        """Add the mini-series preprocessor and define the extractors."""
        # NOTE(review): '+=' extends the list in place; whether that list
        # is the one shared at class level depends on the base class.
        self.preprocessors += [
            ('<span class="tv-extra">TV mini-series</span>',
             '<span class="tv-extra">(mini)</span>')
        ]
        title_extractor = Extractor(
            label='title',
            path="//h1",
            attrs=Attribute(key='title',
                            path=self._titleAttrPath,
                            postprocess=self._titleFunct))
        link_extractor = Extractor(
            label='link',
            path=self._linkPath,
            attrs=Attribute(
                key='link',
                path="./@href",
                postprocess=lambda x: analyze_imdbid(
                    (x or '').replace('http://pro.imdb.com', ''))))
        self.extractors = [title_extractor, link_extractor]

    def postprocess_data(self, data):
        """Return ``[(link, data)]``, or an empty list.

        The 'link' entry is removed from ``data``; the list is empty if
        that entry is missing or false, or if nothing else was extracted.
        """
        if 'link' not in data:
            return []
        link = data.pop('link')
        if link and data:
            return [(link, data)]
        return []
def end_li(self):
    """Handle a closing </li> tag, storing the episode just collected.

    Inside the episodes section, if a title and an ID were collected, a
    Movie instance for the episode is appended to the list that
    ``self._episodes`` keeps for the current series; the air date, the
    year, the role and the notes are taken from the miscellaneous info.
    The collected data is then reset.
    """
    self._in_li = 0
    if not self._in_episodes:
        return
    ep_title = self._cur_episode_title.strip()
    misc = self._misc_info.strip()
    if ep_title and self._episode_id:
        info = analyze_title(ep_title, canonical=1)
        info['kind'] = u'episode'
        episode = Movie(movieID=str(self._episode_id), data=info,
                        accessSystem=self._as, modFunct=self._modFunct)
        episode['episode of'] = self._cur_series
        # The info may begin with the air date, between parentheses.
        close_idx = -1
        if misc.startswith('('):
            close_idx = misc.find(')')
        if close_idx != -1:
            date = misc[1:close_idx]
            if date != '????':
                episode['original air date'] = date
                if info.get('year', '????') == '????':
                    # Use the last word of the date as the year.
                    last_word = date.split()[-1]
                    if last_word.isdigit():
                        episode['year'] = last_word
        # What follows the ' - ' separator is the role (with its notes).
        sep_idx = misc.find(' - ')
        if sep_idx != -1:
            tail = misc[sep_idx + 3:].strip()
            if not self._got_i_info:
                role = tail
                note = u''
                paren_idx = role.rfind('(')
                if paren_idx != -1 and role and role[-1] == ')':
                    note = role[paren_idx:]
                    role = role[:paren_idx].strip()
                episode.notes = note
                episode.currentRole = role
            else:
                words = tail.split()
                episode.notes = ('[%s]' % words[0]) + ' '.join(words[1:])
        self._episodes.setdefault(self._cur_series, []).append(episode)
    self._cur_episode_title = u''
    self._episode_id = None
    self._in_misc_info = 0
    self._misc_info = u''
def _findRefs(self, o, trefs, nrefs):
    """Collect the references to titles and names found in ``o``.

    o     -- a string, or a list, tuple or dictionary (searched
             recursively); anything else is ignored.
    trefs -- dictionary filled with {title: Movie instance} items.
    nrefs -- dictionary filled with {name: Person instance} items.

    Returns the ``(trefs, nrefs)`` tuple.
    """
    if isinstance(o, (list, tuple)):
        for item in o:
            self._findRefs(item, trefs, nrefs)
        return (trefs, nrefs)
    if isinstance(o, dict):
        for value in o.values():
            self._findRefs(value, trefs, nrefs)
        return (trefs, nrefs)
    if not isinstance(o, (unicode, str)):
        return (trefs, nrefs)
    for title in re_titleRef.findall(o):
        parsed = analyze_title(title, canonical=0)
        rtitle = build_title(parsed, ptdf=1)
        if rtitle in trefs:
            continue
        # Try the rebuilt title first, then the title as it was found.
        movieID = self._getTitleID(rtitle)
        if movieID is None:
            movieID = self._getTitleID(title)
        if movieID is None:
            continue
        movie = Movie(title=rtitle, movieID=movieID,
                      accessSystem=self.accessSystem)
        trefs[rtitle] = movie
        # Register the movie under the other forms of its title, too.
        short_title = canonicalTitle(parsed.get('title', u''))
        if short_title and short_title != rtitle and short_title != title:
            trefs[short_title] = movie
        if title != rtitle:
            trefs[title] = movie
    for name in re_nameRef.findall(o):
        parsed = analyze_name(name, canonical=1)
        rname = build_name(parsed, canonical=1)
        if rname in nrefs:
            continue
        # Try the rebuilt name first, then the name as it was found.
        personID = self._getNameID(rname)
        if personID is None:
            personID = self._getNameID(name)
        if personID is None:
            continue
        person = Person(name=rname, personID=personID,
                        accessSystem=self.accessSystem)
        nrefs[rname] = person
        # Register the person under the other forms of the name, too.
        plain_name = normalizeName(parsed.get('name', u''))
        if plain_name and plain_name != rname:
            nrefs[plain_name] = person
        if name != rname and name != plain_name:
            nrefs[name] = person
    return (trefs, nrefs)
def postprocess_data(self, data):
    """Group the extracted episodes by series.

    data -- a dictionary whose keys are markup fragments with the link to
            a series, and whose values are the episodes of that series.

    Returns ``{'episodes': {series: [episodes]}}`` - where series is a
    Movie instance, also stored in the 'episode of' entry of every
    episode - or an empty dictionary if there's no data.
    """
    if len(data) == 0:
        return {}
    by_series = {}
    for key in data.keys():
        dom = self.get_dom(key)
        link = self.xpath(dom, "//a/@href")[0]
        # The first and the last character of the text are dropped.
        title = self.xpath(dom, "//a/text()")[0][1:-1]
        series = Movie(movieID=analyze_imdbid(link),
                       data=analyze_title(title),
                       accessSystem=self._as, modFunct=self._modFunct)
        collected = []
        by_series[series] = collected
        for episode in data[key]:
            # XXX: should we create a copy of 'series', to avoid
            #      circular references?
            episode['episode of'] = series
            collected.append(episode)
    return {'episodes': by_series}
def _build_episode(link, title, minfo, role, roleA, roleAID):
    """Build a Movie object for a given episode of a series.

    link    -- analyzed with ``analyze_imdbid`` to get the movieID.
    title   -- the title of the episode.
    minfo   -- miscellaneous info: an optional ``(air date)``, optionally
               followed by ' -' and the role (with its notes).
    role    -- the role; if None, the one found in minfo or roleA is used.
    roleA   -- an alternative role.
    roleAID -- analyzed with ``analyze_imdbid`` to get the ID of the
               role; ignored if there's no role at all.

    Returns the Movie instance.
    """
    episode_id = analyze_imdbid(link)
    notes = ''
    # Sometimes, for some unknown reason, the role is left in minfo.
    dash_idx = minfo.find(' -')
    if dash_idx != -1:
        leftover = minfo[dash_idx + 3:].lstrip()
        minfo = minfo[:dash_idx].rstrip()
        if leftover.endswith(')'):
            paren_idx = leftover.rfind('(')
            if paren_idx != -1:
                notes = leftover[paren_idx:]
                leftover = leftover[:paren_idx]
        if leftover and role is None and roleA is None:
            role = leftover
    eps_data = analyze_title(title)
    eps_data['kind'] = 'episode'
    # FIXME: it's wrong for multiple characters (very rare on tv series?).
    if role is None:
        # At worst, it's still None.
        role = roleA
    if role is None:
        roleAID = None
    if roleAID is not None:
        roleAID = analyze_imdbid(roleAID)
    episode = Movie(movieID=episode_id, data=eps_data, currentRole=role,
                    roleID=roleAID, notes=notes)
    # XXX: are we missing some notes?
    # XXX: does it parse things as "Episode dated 12 May 2005 (12 May 2005)"?
    close_idx = -1
    if minfo.startswith('('):
        close_idx = minfo.find(')')
    if close_idx == -1:
        return episode
    date = minfo[1:close_idx]
    if date != '????':
        episode['original air date'] = date
        if eps_data.get('year', '????') == '????':
            # Use the last word of the date as the year.
            last_word = date.split()[-1]
            if last_word.isdigit():
                episode['year'] = int(last_word)
    return episode
def end_li(self):
    """Handle a closing </li> tag, storing the episode just collected.

    Inside the episodes section, if a title and an ID were collected, a
    Movie instance for the episode is appended to the list that
    ``self._episodes`` keeps for the current series; the air date, the
    year, the role and the notes are taken from the miscellaneous info.
    The collected data is then reset.
    """
    self._in_li = 0
    if not self._in_episodes:
        return
    ep_title = self._cur_episode_title.strip()
    misc = self._misc_info.strip()
    if ep_title and self._episode_id:
        info = analyze_title(ep_title, canonical=1)
        info['kind'] = u'episode'
        episode = Movie(movieID=str(self._episode_id), data=info,
                        accessSystem=self._as, modFunct=self._modFunct)
        episode['episode of'] = self._cur_series
        # The info may begin with the air date, between parentheses.
        close_idx = -1
        if misc.startswith('('):
            close_idx = misc.find(')')
        if close_idx != -1:
            date = misc[1:close_idx]
            if date != '????':
                episode['original air date'] = date
                if info.get('year', '????') == '????':
                    # Use the last word of the date as the year.
                    last_word = date.split()[-1]
                    if last_word.isdigit():
                        episode['year'] = last_word
        # What follows the ' - ' separator is the role (with its notes).
        sep_idx = misc.find(' - ')
        if sep_idx != -1:
            tail = misc[sep_idx+3:].strip()
            if not self._got_i_info:
                role = tail
                note = u''
                paren_idx = role.rfind('(')
                if paren_idx != -1 and role and role[-1] == ')':
                    note = role[paren_idx:]
                    role = role[:paren_idx].strip()
                episode.notes = note
                episode.currentRole = role
            else:
                words = tail.split()
                episode.notes = ('[%s]' % words[0]) + ' '.join(words[1:])
        self._episodes.setdefault(self._cur_series, []).append(episode)
    self._cur_episode_title = u''
    self._episode_id = None
    self._in_misc_info = 0
    self._misc_info = u''
def _findRefs(self, o, trefs, nrefs):
    """Collect the references to titles and names found in ``o``.

    o     -- a string, or a list, tuple or dictionary (searched
             recursively); anything else is ignored.
    trefs -- dictionary filled with {title: Movie instance} items.
    nrefs -- dictionary filled with {name: Person instance} items.

    Returns the ``(trefs, nrefs)`` tuple.
    """
    if isinstance(o, (ListType, TupleType)):
        for item in o:
            self._findRefs(item, trefs, nrefs)
        return (trefs, nrefs)
    if isinstance(o, DictType):
        for value in o.values():
            self._findRefs(value, trefs, nrefs)
        return (trefs, nrefs)
    if not isinstance(o, (UnicodeType, StringType)):
        return (trefs, nrefs)
    for title in re_titleRef.findall(o):
        parsed = analyze_title(title, canonical=1)
        rtitle = build_title(parsed, canonical=1, ptdf=1)
        if rtitle in trefs:
            continue
        # Try the rebuilt title first, then the title as it was found.
        movieID = self._getTitleID(rtitle)
        if movieID is None:
            movieID = self._getTitleID(title)
        if movieID is None:
            continue
        movie = Movie(title=rtitle, movieID=movieID,
                      accessSystem=self.accessSystem)
        trefs[rtitle] = movie
        # Register the movie under the other forms of its title, too.
        short_title = canonicalTitle(parsed.get('title', u''))
        if short_title and short_title != rtitle and short_title != title:
            trefs[short_title] = movie
        if title != rtitle:
            trefs[title] = movie
    for name in re_nameRef.findall(o):
        parsed = analyze_name(name, canonical=1)
        rname = build_name(parsed, canonical=1)
        if rname in nrefs:
            continue
        # Try the rebuilt name first, then the name as it was found.
        personID = self._getNameID(rname)
        if personID is None:
            personID = self._getNameID(name)
        if personID is None:
            continue
        person = Person(name=rname, personID=personID,
                        accessSystem=self.accessSystem)
        nrefs[rname] = person
        # Register the person under the other forms of the name, too.
        plain_name = normalizeName(parsed.get('name', u''))
        if plain_name and plain_name != rname:
            nrefs[plain_name] = person
        if name != rname and name != plain_name:
            nrefs[name] = person
    return (trefs, nrefs)
def _getTitleID(self, title):
    """Return the movieID for the given long imdb canonical title.

    The Title table is queried for a row with the same title, imdbIndex,
    kind and production year; for an episode, the row must also belong
    to one of the series matching its 'episode of' data (if any is
    found).  Returns None unless exactly one row matches.
    """
    td = analyze_title(title)

    def matching(info):
        # The conditions selecting the rows with the given title data.
        return [
            Title.q.title == self.toUTF8(info['title']),
            self._buildNULLCondition(Title.q.imdbIndex,
                                     info.get('imdbIndex')),
            Title.q.kindID == self._kindRev[info['kind']],
            self._buildNULLCondition(Title.q.productionYear,
                                     info.get('year')),
        ]

    condition = None
    if td['kind'] == 'episode':
        epof = td['episode of']
        series_rows = Title.select(AND(*matching(epof)))
        seriesID = [s.id for s in series_rows]
        if seriesID:
            condition = AND(IN(Title.q.episodeOfID, seriesID),
                            *matching(td))
    if condition is None:
        condition = AND(*matching(td))
    res = Title.select(condition)
    try:
        if res.count() != 1:
            return None
    except (UnicodeDecodeError, TypeError):
        return None
    return res[0].id
def postprocess_data(self, data):
    """Turn the extracted chart entries into (movieID, info) tuples.

    data -- a dictionary whose ``self.label`` entry is a list of
            dictionaries; entries without the 'movieID', ``self.ranktext``
            or 'title' keys, or with an unparsable movieID, are skipped,
            and so are the entries for an already seen movieID.

    Returns a list of ``(movieID, info)`` tuples, where movieID is a
    string and info is the result of ``analyze_title``, enriched with the
    rank and - when they can be parsed - the 'votes' and 'rating' values.
    """
    if not data or self.label not in data:
        return []
    mlist = []
    # Avoid duplicates.  A real fix, using XPath, would be desirable.
    # XXX: probably this is no longer needed.
    seen_ids = set()
    for d in data[self.label]:
        if 'movieID' not in d or self.ranktext not in d or 'title' not in d:
            continue
        theID = analyze_imdbid(d['movieID'])
        if theID is None:
            continue
        theID = str(theID)
        if theID in seen_ids:
            continue
        seen_ids.add(theID)
        # The year is optional: don't fail on entries lacking it.
        full_title = d['title']
        if 'year' in d:
            full_title = full_title + ' ' + d['year']
        minfo = analyze_title(full_title)
        try:
            minfo[self.ranktext] = int(d[self.ranktext].replace('.', ''))
        except ValueError:
            pass
        if 'votes' in d:
            try:
                votes = d['votes'].replace(' user ratings', '')
                # IndexError: the ' based on ' separator is missing.
                votes = votes.split(' based on ')[1]
                minfo['votes'] = int(votes.replace(',', ''))
            except (IndexError, ValueError):
                pass
        if 'rating' in d:
            try:
                minfo['rating'] = float(d['rating'])
            except ValueError:
                pass
        mlist.append((theID, minfo))
    return mlist
def postprocess_data(self, data):
    """Turn the extracted chart entries into (movieID, info) tuples.

    data -- a dictionary whose ``self.label`` entry is a list of
            dictionaries; entries without the "movieID", ``self.ranktext``
            or "title" keys, or with an unparsable movieID, are skipped,
            and so are the entries for an already seen movieID.

    Returns a list of ``(movieID, info)`` tuples, where movieID is a
    string and info is the result of ``analyze_title``, enriched with the
    rank and - when they can be parsed - the "votes" and "rating" values.
    """
    if not data or self.label not in data:
        return []
    mlist = []
    # Avoid duplicates.  A real fix, using XPath, would be desirable.
    # XXX: probably this is no longer needed.
    seen_ids = set()
    for d in data[self.label]:
        if "movieID" not in d or self.ranktext not in d or "title" not in d:
            continue
        theID = analyze_imdbid(d["movieID"])
        if theID is None:
            continue
        theID = str(theID)
        if theID in seen_ids:
            continue
        seen_ids.add(theID)
        # The year is optional: don't fail on entries lacking it.
        full_title = d["title"]
        if "year" in d:
            full_title = full_title + " " + d["year"]
        minfo = analyze_title(full_title)
        # The values below are parsed on a best-effort basis: anything
        # in an unexpected format is silently left out.
        try:
            minfo[self.ranktext] = int(d[self.ranktext].replace(".", ""))
        except (AttributeError, TypeError, ValueError):
            pass
        if "votes" in d:
            try:
                votes = d["votes"].replace(" votes", "")
                votes = votes.split(" based on ")[1]
                minfo["votes"] = int(votes.replace(",", ""))
            except (AttributeError, IndexError, TypeError, ValueError):
                pass
        if "rating" in d:
            try:
                minfo["rating"] = float(d["rating"])
            except (TypeError, ValueError):
                pass
        mlist.append((theID, minfo))
    return mlist
def _build_episode(link, title, minfo, role, roleA, roleAID):
    """Build a Movie object for a given episode of a series.

    link    -- analyzed with ``analyze_imdbid`` to get the movieID.
    title   -- the title of the episode.
    minfo   -- miscellaneous info: an optional ``(air date)``, optionally
               followed by ' -' and the role (with its notes).
    role    -- the role; if None, the one found in minfo or roleA is used.
    roleA   -- an alternative role.
    roleAID -- analyzed with ``analyze_imdbid`` to get the ID of the
               role; ignored if there's no role at all.

    Returns the Movie instance.
    """
    episode_id = analyze_imdbid(link)
    notes = u''
    # Sometimes, for some unknown reason, the role is left in minfo.
    dash_idx = minfo.find(' -')
    if dash_idx != -1:
        leftover = minfo[dash_idx+3:].lstrip()
        minfo = minfo[:dash_idx].rstrip()
        if leftover.endswith(')'):
            paren_idx = leftover.rfind('(')
            if paren_idx != -1:
                notes = leftover[paren_idx:]
                leftover = leftover[:paren_idx]
        if leftover and role is None and roleA is None:
            role = leftover
    eps_data = analyze_title(title)
    eps_data['kind'] = u'episode'
    # FIXME: it's wrong for multiple characters (very rare on tv series?).
    if role is None:
        # At worst, it's still None.
        role = roleA
    if role is None:
        roleAID = None
    if roleAID is not None:
        roleAID = analyze_imdbid(roleAID)
    episode = Movie(movieID=episode_id, data=eps_data, currentRole=role,
                    roleID=roleAID, notes=notes)
    # XXX: are we missing some notes?
    # XXX: does it parse things as "Episode dated 12 May 2005 (12 May 2005)"?
    close_idx = -1
    if minfo.startswith('('):
        close_idx = minfo.find(')')
    if close_idx == -1:
        return episode
    date = minfo[1:close_idx]
    if date != '????':
        episode['original air date'] = date
        if eps_data.get('year', '????') == '????':
            # Use the last word of the date as the year.
            last_word = date.split()[-1]
            if last_word.isdigit():
                episode['year'] = int(last_word)
    return episode
def _get_top_bottom_movies(self, kind):
    """Return the movies of the 'top' or 'bottom' list.

    kind -- 'top' or 'bottom'; an empty list is returned for anything
            else.

    Returns a list of ``(movieID, info)`` tuples, where info is the
    analyzed title of the movie, updated with the other data of the list
    entry.
    """
    rank_names = (('top', 'top 250 rank'), ('bottom', 'bottom 10 rank'))
    selected = [rank for name, rank in rank_names if kind == name]
    if not selected:
        return []
    entries = getTopBottomList(selected[0], '%stopbottom.db' % self.__db)
    if not entries:
        return []
    movies = []
    for entry in entries:
        if 'movieID' not in entry:
            continue
        # The movieID is taken out, so that it doesn't end in the info.
        movieID = entry['movieID']
        del entry['movieID']
        label = getLabel(movieID, '%stitles.index' % self.__db,
                         '%stitles.key' % self.__db)
        minfo = analyze_title(label)
        minfo.update(entry)
        movies.append((movieID, minfo))
    return movies
def postprocess_data(self, data):
    """Turn the entries of ``data['chart']`` into (movieID, info) tuples.

    Entries without the 'movieID', 'rank' or 'title' keys, or with an
    unparsable movieID, are skipped.  The others are modified in place:
    'movieID' is removed, 'rank' is moved to the ``self.ranktext`` key and
    the result of ``analyze_title`` (run on the title and the optional
    year) is merged in.
    """
    if (not data) or ('chart' not in data):
        return []
    required_keys = ('movieID', 'rank', 'title')
    movies = []
    for entry in data['chart']:
        if not all(key in entry for key in required_keys):
            continue
        movie_id = analyze_imdbid(entry['movieID'])
        if movie_id is None:
            continue
        del entry['movieID']
        # Assign before deleting: the two keys may be the same one.
        entry[self.ranktext] = entry['rank']
        del entry['rank']
        full_title = entry['title'] + ' ' + entry.get('year', '')
        entry.update(analyze_title(full_title))
        movies.append((movie_id, entry))
    return movies
def postprocess_data(self, data):
    """Turn the entries of ``data['chart']`` into (movieID, info) tuples.

    Entries without the 'movieID', 'rank' or 'title' keys, or with an
    unparsable movieID, are skipped.  The others are modified in place:
    'movieID' is removed and the result of ``analyze_title`` is merged in.
    """
    if (not data) or ('chart' not in data):
        return []
    required_keys = ('movieID', 'rank', 'title')
    movies = []
    for entry in data['chart']:
        if not all(key in entry for key in required_keys):
            continue
        # analyze_imdbid actually works as a URL parser, here, to filter
        # out the ID.
        movie_id = analyze_imdbid(entry['movieID'])
        if movie_id is None:
            continue
        del entry['movieID']
        entry.update(analyze_title(entry['title']))
        movies.append((movie_id, entry))
    return movies
def _get_keyword(self, keyword, results):
    """Return the movies tagged with the given keyword.

    Returns a list of ``(movieID, analyzed title)`` tuples for the movies
    returned by ``getKeywordMovies``, cut to the first ``results`` items.
    """
    found = []
    for movieID in getKeywordMovies(keyword,
                                    '%skeywords.data' % self.__db):
        label = getLabel(movieID, '%stitles.index' % self.__db,
                         '%stitles.key' % self.__db)
        found.append((movieID, analyze_title(label)))
    return found[:results]
def set_title(self, title):
    """Analyze ``title`` and merge the resulting information into
    ``self.data``."""
    # XXX: convert title to unicode, if it's a plain string?
    parsed = analyze_title(title)
    self.data.update(parsed)
def _readTitlesKeyFile(keyFile, searchingEpisode=0): """Iterate over the given file, returning tuples suited for the common.locsql.scan_titles function.""" try: kf = open(keyFile, 'r') except IOError, e: raise IMDbDataAccessError, str(e) for line in kf: ls = line.split('|') t = ls[0] if not t: continue if searchingEpisode: if t[-1] != '}': continue elif t[-1] == '}': continue titled = analyze_title(latin2utf(t)) yield (long(ls[1], 16), titled) kf.close() def _scan_titles(keyFile, title1, title2, title3, results=0, _only_episodes=0): """Scan the given file, using the common.locsql.scan_titles pure-Python function, for title variations.""" se = 0 if _only_episodes: se = 1 else:
def get_movie_main(self, movieID):
    """Collect the 'main' information about a movie from the local data files.

    The title label is parsed with analyze_title, then cast, other
    crew categories, vote details, miscellaneous information, AKA titles,
    episode-related data and MPAA info are added to the same dictionary.

    Returns a dictionary with the collected data under 'data' and the
    names of the provided information sets under 'info sets'.

    Raises IMDbDataAccessError if no title label is found for movieID.
    """
    # Information sets provided by this method.
    infosets = ('main', 'vote details')
    tl = getLabel(movieID, '%stitles.index' % self.__db,
                    '%stitles.key' % self.__db)
    # No title, no party.
    if tl is None:
        raise IMDbDataAccessError, 'unable to get movieID "%s"' % movieID
    res = analyze_title(tl)
    # Build the cast list.
    actl = []
    for castG in ('actors', 'actresses'):
        midx = getFullIndex('%s%s.titles' % (self.__db, castG),
                            movieID, multi=1)
        if midx is not None:
            params = {'movieID': movieID,
                        'dataF': '%s%s.data' % (self.__db, castG),
                        'indexF': '%snames.index' % self.__db,
                        'keyF': '%snames.key' % self.__db,
                        'attrIF': '%sattributes.index' % self.__db,
                        'attrKF': '%sattributes.key' % self.__db,
                        'charNF': '%scharacter2id.index' % self.__db,
                        'offsList': midx, 'doCast': 1}
            actl += getMovieCast(**params)
    if actl:
        # Actors and actresses end up in a single, sorted list.
        actl.sort()
        res['cast'] = actl
    # List of other workers.
    # The names below are turned into file names by appending an 's'
    # (hence 'miscellaneou').
    # NOTE(review): 'cinematographer' is listed twice, so it is processed
    # twice and the second pass overwrites res['cinematographer'].
    works = ('writer', 'cinematographer', 'composer',
            'costume-designer', 'director', 'editor', 'miscellaneou',
            'producer', 'production-designer', 'cinematographer')
    for i in works:
        index = getFullIndex('%s%ss.titles' % (self.__db, i),
                                movieID, multi=1)
        if index is not None:
            params = {'movieID': movieID,
                        'dataF': '%s%s.data' % (self.__db, i),
                        'indexF': '%snames.index' % self.__db,
                        'keyF': '%snames.key' % self.__db,
                        'attrIF': '%sattributes.index' % self.__db,
                        'attrKF': '%sattributes.key' % self.__db,
                        'offsList': index}
            # 'name' is the key used in the result, 'key' the base of the
            # data file name.
            name = key = i
            if '-' in name:
                name = name.replace('-', ' ')
            elif name == 'miscellaneou':
                name = 'miscellaneous crew'
                key = 'miscellaneou'
            elif name == 'writer':
                params['doWriters'] = 1
            # The 'dataF' set above lacks the trailing 's'; fix it here.
            params['dataF'] = '%s%ss.data' % (self.__db, key)
            data = getMovieCast(**params)
            if name == 'writer':
                data.sort()
            res[name] = data
    # Rating.
    rt = self.get_movie_vote_details(movieID)['data']
    if rt:
        res.update(rt)
    # Various information.
    # Pairs of (key in the result, base name of the data/index files).
    miscInfo = (('runtimes', 'running-times'), ('color info', 'color-info'),
                ('genres', 'genres'), ('distributors', 'distributors'),
                ('languages', 'language'), ('certificates', 'certificates'),
                ('special effects companies', 'special-effects-companies'),
                ('sound mix', 'sound-mix'), ('tech info', 'technical'),
                ('production companies', 'production-companies'),
                ('countries', 'countries'))
    for name, fname in miscInfo:
        params = {'movieID': movieID,
                    'dataF': '%s%s.data' % (self.__db, fname),
                    'indexF': '%s%s.index' % (self.__db, fname),
                    'attrIF': '%sattributes.index' % self.__db,
                    'attrKF': '%sattributes.key' % self.__db}
        data = getMovieMisc(**params)
        if name in ('distributors', 'special effects companies',
                    'production companies'):
            # Replace every company string with a Company instance.
            for nitem in xrange(len(data)):
                n, notes = split_company_name_notes(data[nitem])
                company = Company(name=n, companyID=getCompanyID(n,
                                    '%scompany2id.index' % self.__db),
                                    notes=notes,
                                    accessSystem=self.accessSystem)
                data[nitem] = company
        if data:
            res[name] = data
    if res.has_key('runtimes') and len(res['runtimes']) > 0:
        # Whatever re_episodes matches in the first runtime is removed from
        # it and the first match is stored as the number of episodes.
        rt = res['runtimes'][0]
        episodes = re_episodes.findall(rt)
        if episodes:
            res['runtimes'][0] = re_episodes.sub('', rt)
            res['number of episodes'] = episodes[0]
    # AKA titles.
    akas = getAkaTitles(movieID,
                '%saka-titles.data' % self.__db,
                '%stitles.index' % self.__db,
                '%stitles.key' % self.__db,
                '%sattributes.index' % self.__db,
                '%sattributes.key' % self.__db)
    if akas:
        # normalize encoding.
        for i in xrange(len(akas)):
            # Only entries made of exactly two '::'-separated parts are
            # processed; the others are left as they are.
            ts = akas[i].split('::')
            if len(ts) != 2:
                continue
            t = ts[0]
            n = ts[1]
            nt = self._changeAKAencoding(n, t)
            if nt is not None:
                akas[i] = '%s::%s' % (nt, n)
        res['akas'] = akas
    if res.get('kind') == 'episode':
        # Things to do if this is a tv series episode.
        episodeOf = res.get('episode of')
        if episodeOf is not None:
            # Replace the series data with a Movie instance carrying the
            # movieID returned by self._getTitleID.
            parentSeries = Movie(data=res['episode of'],
                                        accessSystem='local')
            seriesID = self._getTitleID(parentSeries.get(
                                        'long imdb canonical title'))
            parentSeries.movieID = seriesID
            res['episode of'] = parentSeries
        if not res.get('year'):
            # The title carried no year: look it up in the movies data file.
            year = getFullIndex('%smovies.data' % self.__db,
                                movieID, kind='moviedata', rindex=1)
            if year:
                res['year'] = year
    # MPAA info.
    mpaa = getMPAA(movieID, '%smpaa-ratings-reasons.index' % self.__db,
                    '%smpaa-ratings-reasons.data' % self.__db)
    if mpaa:
        res.update(mpaa)
    return {'data': res, 'info sets': infosets}
def _search_movie(self, title, results, _episodes=False):
    """Query the Title and AkaTitle tables for entries matching a title.

    The title is parsed with analyze_title, normalized and reduced to its
    soundex code, which is the value compared against the phoneticCode
    columns.  If _episodes is true only episodes are searched; if the
    title itself is parsed as an episode, the series is searched first
    (recursively) and the query is restricted to the found series.

    An empty list is returned for an empty title.

    Raises IMDbDataAccessError if the database queries raise
    NotFoundError.
    """
    title = title.strip()
    if not title:
        return []
    title_dict = analyze_title(title, canonical=1)
    s_title = title_dict['title']
    if not s_title:
        return []
    episodeOf = title_dict.get('episode of')
    if not episodeOf:
        if not _episodes:
            # Drop the last ', '-separated chunk if it is listed in
            # _articles (presumably a trailing article -- TODO confirm).
            s_title_split = s_title.split(', ')
            if len(s_title_split) > 1 and \
                    s_title_split[-1].lower() in _articles:
                s_title_rebuilt = ', '.join(s_title_split[:-1])
                if s_title_rebuilt:
                    s_title = s_title_rebuilt
    else:
        # The title already identifies an episode of a series: the
        # dedicated branch below takes care of it.
        _episodes = False
    s_title = normalizeTitle(s_title)
    if isinstance(s_title, UnicodeType):
        # Characters that cannot be encoded in ASCII are dropped.
        s_title = s_title.encode('ascii', 'ignore')
    soundexCode = soundex(s_title)
    # XXX: improve the search restricting the kindID if the
    #      "kind" of the input differs from "movie"?
    condition = conditionAka = None
    if _episodes:
        condition = AND(Title.q.phoneticCode == soundexCode,
                        Title.q.kindID == self._kindRev['episode'])
        conditionAka = AND(AkaTitle.q.phoneticCode == soundexCode,
                        AkaTitle.q.kindID == self._kindRev['episode'])
    elif title_dict['kind'] == 'episode' and episodeOf is not None:
        series_title = build_title(episodeOf, canonical=1)
        # XXX: is it safe to get "results" results?
        #      Too many?  Too few?
        serRes = results
        if serRes < 3 or serRes > 10:
            serRes = 10
        searchSeries = self._search_movie(series_title, serRes)
        seriesIDs = [result[0] for result in searchSeries]
        if seriesIDs:
            condition = AND(Title.q.phoneticCode == soundexCode,
                            IN(Title.q.episodeOfID, seriesIDs),
                            Title.q.kindID == self._kindRev['episode'])
            conditionAka = AND(
                            AkaTitle.q.phoneticCode == soundexCode,
                            IN(AkaTitle.q.episodeOfID, seriesIDs),
                            AkaTitle.q.kindID == self._kindRev['episode'])
        else:
            # XXX: bad situation: we have found no matching series;
            #      try searching everything (both episodes and
            #      non-episodes) for the title.
            # NOTE(review): seriesIDs is empty in this branch, yet it is
            # still used in the IN() clauses; this does not look like
            # "searching everything" -- confirm the intent.
            condition = AND(Title.q.phoneticCode == soundexCode,
                            IN(Title.q.episodeOfID, seriesIDs))
            conditionAka = AND(AkaTitle.q.phoneticCode == soundexCode,
                            IN(AkaTitle.q.episodeOfID, seriesIDs))
    if condition is None:
        # XXX: excludes episodes?
        condition = AND(Title.q.kindID != self._kindRev['episode'],
                        Title.q.phoneticCode == soundexCode)
        conditionAka = AND(AkaTitle.q.kindID != self._kindRev['episode'],
                        AkaTitle.q.phoneticCode == soundexCode)
    # Up to 3 variations of the title are searched, plus the
    # long imdb canonical title, if provided.
    if not _episodes:
        title1, title2, title3 = titleVariations(title)
    else:
        title1 = title
        title2 = ''
        title3 = ''
    try:
        # Pairs of (ID, data as returned by get_movie_data); the matches
        # found amongst the AKA titles are appended to the same list.
        qr = [(q.id, get_movie_data(q.id, self._kind))
                for q in Title.select(condition)]
        q2 = [(q.movieID, get_movie_data(q.id, self._kind, fromAka=1))
                for q in AkaTitle.select(conditionAka)]
        qr += q2
    except NotFoundError, e:
        raise IMDbDataAccessError, \
                'unable to search the database: "%s"' % str(e)
def end_title(self):
    """Handle the end of the page title.

    Resets the ``_reading_page_title`` flag; unless the stripped page title
    contains both 'IMDb Title' and 'Search', it is parsed with
    ``analyze_title`` (canonical form) and stored in ``self._result``.
    """
    self._reading_page_title = 0
    page_title = self._page_title.strip()
    looks_like_search_page = ('IMDb Title' in page_title
                              and 'Search' in page_title)
    if not looks_like_search_page:
        self._result = analyze_title(page_title, canonical=1)
def set_title(self, title):
    """Parse ``title`` with ``analyze_title`` and merge the result into ``self.data``."""
    parsed = analyze_title(title)
    self.data.update(parsed)
def run( self ): try: if self.strategy == 'film_db': #TODO self.chomik.connect() movies = [] file = open( 'db/movies.list', 'r' ) for line in file.readlines(): title = line.split('\t')[0].decode( 'latin1' ) analized = analyze_title( title ) title = analized['title'] if analized['kind'] in ( 'movie', 'tv series', 'tv movie', 'tv mini series', 'episode' ): movies.append( analized ) file.close() random.shuffle( movies ) for movie in movies: title = movie['title'].replace( '/', '\\') year = movie['year'] if movie.has_key( 'year' ) else '' FIRST_LETTER = title[0].upper() if movie['kind'] == 'episode': pattern1 = [ 'Seriale', 'Alfabetycznie', movie['episode of']['title'][0].upper(), movie['episode of']['title'] + ' (%s)' % movie['episode of']['year'] if movie['episode of'].has_key( 'year' ) else '', ] if movie.has_key( 'season' ): pattern1.append( 'Sezon %s' % movie['season'] ) if movie.has_key( 'episode' ): pattern1.append( 'Odcinek %s, %s' % ( movie['episode'], movie['title'] ) ) else: pattern1.append( title ) pattern1 = '/'.join( pattern1 ) patterns = [ pattern1 ] if movie['episode of'].has_key( 'year' ): pattern2 = [ 'Seriale', 'Chronologicznie', str( movie['episode of']['year'] ), movie['episode of']['title'] + ' (%s)' % movie['episode of']['year'] if movie['episode of'].has_key( 'year' ) else '', ] if movie.has_key( 'season' ): pattern2.append( 'Sezon %s' % movie['season'] ) if movie.has_key( 'episode' ): pattern2.append( 'Odcinek %s, %s' % ( movie['episode'], movie['title'] ) ) else: pattern2.append( title ) pattern2 = '/'.join( pattern2 ) patterns.append( pattern2 ) else: title = '%s (%s)' % ( title, year ) if year else title if movie['kind'] in ( 'tv series', 'tv mini series' ): folder = 'Seriale' else: folder = 'Filmy' full_title = ( "%s (%s)" % ( movie['title'], year ) ).decode( 'latin1' ) patterns = ( '%s/Alfabetycznie/%s/%s' % ( folder, FIRST_LETTER, title ), '%s/Chronologicznie/%s/%s' % ( folder, year, title ) ) if not Db.fetchone( "SELECT * FROM folders 
WHERE user_id=? AND name=?", ( self.id, title ) ): good = [] if movie['kind'] in ( 'movie', 'tv movie' ): # TODO search series sizes = [] self.chomik.logger.debug( 'searching: %s' % full_title ) items = self.chomik.search( full_title ) self.chomik.logger.debug( 'find: %d' % len( items ) ) for item in items: self.chomik.logger.debug( '%s, %s' % item['title'], item['size'] ) if item['title'].lower() == full_title.lower() or item['title'].lower().startswith( full_title.lower() ) or item['title'].lower() == movie['title'].lower() or item['title'].lower == ( '%s %s' % ( movie['title'], year ) ).lower(): if not item['size'] in sizes: good.append( item ) sizes.append( item['size'] ) for pattern in patterns: id, url = self.chomik.create_directory( pattern ) if id and url: Db.execute( "INSERT INTO folders VALUES (?, ?, ?, ?)", ( id, self.id, title, url ), commit=True ) for item in good: self.chomik.clone( item['id'], id ) elif self.strategy == 'smieciarz': if self.chomik.connect(): self.generate_other_users() users = Db.fetch( "SELECT login from other_users" ) random.shuffle( users ) for user in users: url = '/%s' % user full_url = 'http://chomikuj.pl/%s%s' % ( self.login, url ) if not self.chomik.check_directory( url )[0]: self.chomik.copy_directory_tree( url, timeout=self.server.timeout ) self.generate_other_users( 5 ) except Exception, e: print e self.chomik.logger.exception( e ) self.chomik.logger.info( "going to sleep for 60 seconds" ) time.sleep( 60 ) self.run()
def get_movie_main(self, movieID):
    """Scrape the main information about a movie from its 'maindetails' page.

    The page is fetched with ``self._mretrieve`` and searched for a set of
    textual markers; every piece of information found is stored in a
    dictionary that starts as the result of ``analyze_title`` on the page
    title.

    Returns a dictionary with that data under the ``"data"`` key.

    Raises IMDbDataAccessError if the page contains no ``<title>`` tag.
    """
    cont = self._mretrieve(self.urls["movie_main"] % movieID + "maindetails")
    title = _findBetween(cont, "<title>", "</title>", maxRes=1)
    if not title:
        raise IMDbDataAccessError('unable to get movieID "%s"' % movieID)
    title = _unHtml(title[0])
    if title.endswith(" - IMDb"):
        title = title[:-7]
    if cont.find('<span class="tv-extra">TV mini-series</span>') != -1:
        title += " (mini)"
    d = analyze_title(title)
    kind = d.get("kind")
    # A link to a TV series turns this title into an episode of that series.
    tv_series = _findBetween(cont, "TV Series:</h5>", "</a>", maxRes=1)
    if tv_series:
        mid = re_imdbID.findall(tv_series[0])
    else:
        mid = None
    if tv_series and mid:
        s_title = _unHtml(tv_series[0])
        s_data = analyze_title(s_title)
        m = Movie(movieID=str(mid[0]), data=s_data, accessSystem=self.accessSystem, modFunct=self._defModFunct)
        d["kind"] = kind = u"episode"
        d["episode of"] = m
    if kind in ("tv series", "tv mini series"):
        years = _findBetween(cont, "<h1>", "</h1>", maxRes=1)
        if years:
            years[:] = _findBetween(years[0], "TV series", "</span>", maxRes=1)
            if years:
                d["series years"] = years[0].strip()
    # Air date, season and episode number.
    air_date = _findBetween(cont, "Original Air Date:</h5>", "</div>", maxRes=1)
    if air_date:
        air_date = air_date[0]
        vi = air_date.find("(")
        if vi != -1:
            date = _unHtml(air_date[:vi]).strip()
            if date != "????":
                d["original air date"] = date
            air_date = air_date[vi:]
            season = _findBetween(air_date, "Season", ",", maxRes=1)
            if season:
                season = season[0].strip()
                try:
                    season = int(season)
                except (TypeError, ValueError):
                    # Not a number: keep the text as it is.
                    pass
                # The type check lets a season numbered 0 through.
                if season or type(season) is _inttype:
                    d["season"] = season
            episode = _findBetween(air_date, "Episode", ")", maxRes=1)
            if episode:
                episode = episode[0].strip()
                try:
                    episode = int(episode)
                except (TypeError, ValueError):
                    # Not a number: keep the text as it is.
                    pass
                # The episode itself must be checked here (the type of
                # 'season' used to be tested): an episode numbered 0 has
                # to be stored, an empty string must not.
                if episode or type(episode) is _inttype:
                    d["episode"] = episode
    direct = _findBetween(cont, "<h5>Director", ("</div>", "<br/> <br/>"), maxRes=1)
    if direct:
        direct = direct[0]
        h5idx = direct.find("/h5>")
        if h5idx != -1:
            direct = direct[h5idx + 4 :]
        direct = self._getPersons(direct)
        if direct:
            d["director"] = direct
    if kind in ("tv series", "tv mini series", "episode"):
        if kind != "episode":
            seasons = _findBetween(cont, "Seasons:</h5>", "</div>", maxRes=1)
            if seasons:
                d["number of seasons"] = seasons[0].count("|") + 1
        creator = _findBetween(cont, "Created by</h5>", ('class="tn15more"', "</div>", "<br/> <br/>"), maxRes=1)
        if not creator:
            # They change 'Created by' to 'Creator' and viceversa
            # from time to time...
            # XXX: is 'Creators' also used?
            creator = _findBetween(cont, "Creator:</h5>", ('class="tn15more"', "</div>", "<br/> <br/>"), maxRes=1)
        if creator:
            creator = creator[0]
            # NOTE(review): str.find returns -1 (a true value) when the text
            # is missing, so this test is false only if 'tn15more' is found
            # at index 0; left untouched, confirm the intent.
            if creator.find("tn15more"):
                creator = "%s>" % creator
            creator = self._getPersons(creator)
            if creator:
                d["creator"] = creator
    writers = _findBetween(cont, "<h5>Writer", ("</div>", "<br/> <br/>"), maxRes=1)
    if writers:
        writers = writers[0]
        h5idx = writers.find("/h5>")
        if h5idx != -1:
            writers = writers[h5idx + 4 :]
        writers = self._getPersons(writers)
        if writers:
            d["writer"] = writers
    cvurl = _getTagsWith(cont, 'name="poster"', toClosure=True, maxRes=1)
    if cvurl:
        cvurl = _findBetween(cvurl[0], 'src="', '"', maxRes=1)
        if cvurl:
            d["cover url"] = cvurl[0]
    genres = _findBetween(cont, 'href="/genre/', '"')
    if genres:
        d["genres"] = list(set(genres))
    # Rating and number of votes.
    ur = _findBetween(cont, 'id="star-bar-user-rate">', "</div>", maxRes=1)
    if ur:
        rat = _findBetween(ur[0], "<b>", "</b>", maxRes=1)
        if rat:
            d["rating"] = rat[0].strip()
        vi = ur[0].rfind('href="ratings"')
        if vi != -1 and ur[0][vi + 10 :].find("await") == -1:
            try:
                votes = _findBetween(ur[0][vi:], "title='", " IMDb", maxRes=1)
                votes = int(votes[0].replace(",", ""))
                d["votes"] = votes
            except (ValueError, IndexError):
                self._mobile_logger.warn("wrong votes: %s", ur)
    top250 = _findBetween(cont, 'href="/chart/top?', "</a>", maxRes=1)
    if top250:
        fn = top250[0].rfind("#")
        if fn != -1:
            try:
                td = int(top250[0][fn + 1 :])
                d["top 250 rank"] = td
            except ValueError:
                self._mobile_logger.warn("wrong top250: %s", top250)
    # The cast table can be introduced by different headers: try them in turn.
    castdata = _findBetween(cont, "Cast overview", "</table>", maxRes=1)
    if not castdata:
        castdata = _findBetween(cont, "Credited cast", "</table>", maxRes=1)
    if not castdata:
        castdata = _findBetween(cont, "Complete credited cast", "</table>", maxRes=1)
    if not castdata:
        castdata = _findBetween(cont, "Series Cast Summary", "</table>", maxRes=1)
    if not castdata:
        castdata = _findBetween(cont, "Episode Credited cast", "</table>", maxRes=1)
    if castdata:
        castdata = castdata[0]
        # Reintegrate the fist tag.
        fl = castdata.find("href=")
        if fl != -1:
            castdata = "<a " + castdata[fl:]
        # Exclude the 'rest of cast listed alphabetically' row.
        smib = castdata.find('<tr><td align="center" colspan="4"><small>')
        if smib != -1:
            smie = castdata.rfind("</small></td></tr>")
            if smie != -1:
                # 18 is the length of the closing '</small></td></tr>'.
                castdata = castdata[:smib].strip() + castdata[smie + 18 :].strip()
        castdata = castdata.replace("/tr> <tr", "/tr><tr")
        cast = self._getPersons(castdata, sep="</tr><tr")
        if cast:
            d["cast"] = cast
    akas = _findBetween(cont, "Also Known As:</h5>", "</div>", maxRes=1)
    if akas:
        # For some reason, here <br> is still used in place of <br/>.
        akas[:] = [x for x in akas[0].split("<br>") if x.strip()]
        akas = [_unHtml(x).replace('" - ', "::", 1).lstrip('"').strip() for x in akas]
        if "See more" in akas:
            akas.remove("See more")
        akas[:] = [x for x in akas if x]
        if akas:
            d["akas"] = akas
    mpaa = _findBetween(cont, "MPAA</a>:", "</div>", maxRes=1)
    if mpaa:
        d["mpaa"] = _unHtml(mpaa[0])
    runtimes = _findBetween(cont, "Runtime:</h5>", "</div>", maxRes=1)
    if runtimes:
        runtimes = runtimes[0]
        runtimes = [x.strip().replace(" min", "").replace(" (", "::(", 1) for x in runtimes.split("|")]
        d["runtimes"] = [_unHtml(x).strip() for x in runtimes]
    if kind == "episode":
        # number of episodes.
        epsn = _findBetween(cont, 'title="Full Episode List">', "</a>", maxRes=1)
        if epsn:
            epsn = epsn[0].replace(" Episodes", "").strip()
            if epsn:
                try:
                    epsn = int(epsn)
                except (TypeError, ValueError):
                    self._mobile_logger.warn("wrong episodes #: %s", epsn)
                # Stored even if not numeric: the text is kept in that case.
                d["number of episodes"] = epsn
    # The sections below share the same ' | '-separated layout; notes in
    # italic are attached to the value with '::'.
    country = _findBetween(cont, "Country:</h5>", "</div>", maxRes=1)
    if country:
        country[:] = country[0].split(" | ")
        country[:] = ["<a %s" % x for x in country if x]
        country[:] = [_unHtml(x.replace(" <i>", "::")) for x in country]
    if country:
        d["countries"] = country
    lang = _findBetween(cont, "Language:</h5>", "</div>", maxRes=1)
    if lang:
        lang[:] = lang[0].split(" | ")
        lang[:] = ["<a %s" % x for x in lang if x]
        lang[:] = [_unHtml(x.replace(" <i>", "::")) for x in lang]
    if lang:
        d["languages"] = lang
    col = _findBetween(cont, '"/search/title?colors=', "</div>")
    if col:
        col[:] = col[0].split(" | ")
        col[:] = ["<a %s" % x for x in col if x]
        col[:] = [_unHtml(x.replace(" <i>", "::")) for x in col]
    if col:
        d["color info"] = col
    sm = _findBetween(cont, "/search/title?sound_mixes=", "</div>", maxRes=1)
    if sm:
        sm[:] = sm[0].split(" | ")
        sm[:] = ["<a %s" % x for x in sm if x]
        sm[:] = [_unHtml(x.replace(" <i>", "::")) for x in sm]
    if sm:
        d["sound mix"] = sm
    cert = _findBetween(cont, "Certification:</h5>", "</div>", maxRes=1)
    if cert:
        cert[:] = cert[0].split(" | ")
        cert[:] = [_unHtml(x.replace(" <i>", "::")) for x in cert]
    if cert:
        d["certificates"] = cert
    plotoutline = _findBetween(cont, "Plot:</h5>", ["<a ", "</div>"], maxRes=1)
    if plotoutline:
        plotoutline = plotoutline[0].strip()
        plotoutline = plotoutline.rstrip("|").rstrip()
        if plotoutline:
            d["plot outline"] = _unHtml(plotoutline)
    aratio = _findBetween(cont, "Aspect Ratio:</h5>", ["<a ", "</div>"], maxRes=1)
    if aratio:
        aratio = aratio[0].strip().replace(" (", "::(", 1)
        if aratio:
            d["aspect ratio"] = _unHtml(aratio)
    return {"data": d}
def _search_episode(self, title, results):
    """Search for episodes matching ``title``.

    If ``analyze_title`` classifies the input as an episode, only its own
    title is used for the query.  The content returned by
    ``self._get_search_content`` for the "ep" kind is handed to the search
    movie parser, whose "data" entry is returned.
    """
    parsed = analyze_title(title)
    query = parsed["title"] if parsed["kind"] == "episode" else title
    page = self._get_search_content("ep", query, results)
    outcome = self.smProxy.search_movie_parser.parse(page, results=results)
    return outcome["data"]
def get_movie_main(self, movieID):
    """Scrape the main information about a movie from its 'maindetails' page.

    The page is fetched with self._mretrieve and searched for a set of
    textual markers; every piece of information found is stored in a
    dictionary that starts as the result of analyze_title on the page
    title.

    Returns a dictionary with that data under the 'data' key.

    Raises IMDbDataAccessError if the page contains no <title> tag.
    """
    cont = self._mretrieve(self.urls['movie_main'] % movieID + 'maindetails')
    title = _findBetween(cont, '<title>', '</title>', maxRes=1)
    if not title:
        raise IMDbDataAccessError('unable to get movieID "%s"' % movieID)
    title = _unHtml(title[0])
    if title.endswith(' - IMDb'):
        title = title[:-7]
    if cont.find('<span class="tv-extra">TV mini-series</span>') != -1:
        title += ' (mini)'
    d = analyze_title(title)
    kind = d.get('kind')
    # A link to a TV series turns this title into an episode of that series.
    tv_series = _findBetween(cont, 'TV Series:</h5>', '</a>', maxRes=1)
    if tv_series:
        mid = re_imdbID.findall(tv_series[0])
    else:
        mid = None
    if tv_series and mid:
        s_title = _unHtml(tv_series[0])
        s_data = analyze_title(s_title)
        m = Movie(movieID=str(mid[0]), data=s_data,
                    accessSystem=self.accessSystem,
                    modFunct=self._defModFunct)
        d['kind'] = kind = u'episode'
        d['episode of'] = m
    if kind in ('tv series', 'tv mini series'):
        years = _findBetween(cont, '<h1>', '</h1>', maxRes=1)
        if years:
            years[:] = _findBetween(years[0], 'TV series', '</span>',
                                    maxRes=1)
            if years:
                d['series years'] = years[0].strip()
    # Air date, season and episode number.
    air_date = _findBetween(cont, 'Original Air Date:</h5>', '</div>',
                            maxRes=1)
    if air_date:
        air_date = air_date[0]
        vi = air_date.find('(')
        if vi != -1:
            date = _unHtml(air_date[:vi]).strip()
            if date != '????':
                d['original air date'] = date
            air_date = air_date[vi:]
            season = _findBetween(air_date, 'Season', ',', maxRes=1)
            if season:
                season = season[0].strip()
                try:
                    season = int(season)
                except (TypeError, ValueError):
                    # Not a number: keep the text as it is.
                    pass
                # The type check lets a season numbered 0 through.
                if season or type(season) is _inttype:
                    d['season'] = season
            episode = _findBetween(air_date, 'Episode', ')', maxRes=1)
            if episode:
                episode = episode[0].strip()
                try:
                    episode = int(episode)
                except (TypeError, ValueError):
                    # Not a number: keep the text as it is.
                    pass
                # The episode itself must be checked here (the type of
                # 'season' used to be tested): an episode numbered 0 has
                # to be stored, an empty string must not.
                if episode or type(episode) is _inttype:
                    d['episode'] = episode
    direct = _findBetween(cont, '<h5>Director', ('</div>', '<br/> <br/>'),
                            maxRes=1)
    if direct:
        direct = direct[0]
        h5idx = direct.find('/h5>')
        if h5idx != -1:
            direct = direct[h5idx+4:]
        direct = self._getPersons(direct)
        if direct:
            d['director'] = direct
    if kind in ('tv series', 'tv mini series', 'episode'):
        if kind != 'episode':
            seasons = _findBetween(cont, 'Seasons:</h5>', '</div>',
                                    maxRes=1)
            if seasons:
                d['number of seasons'] = seasons[0].count('|') + 1
        creator = _findBetween(cont, 'Created by</h5>',
                                ('class="tn15more"', '</div>',
                                '<br/> <br/>'), maxRes=1)
        if not creator:
            # They change 'Created by' to 'Creator' and viceversa
            # from time to time...
            # XXX: is 'Creators' also used?
            creator = _findBetween(cont, 'Creator:</h5>',
                                    ('class="tn15more"', '</div>',
                                    '<br/> <br/>'), maxRes=1)
        if creator:
            creator = creator[0]
            # NOTE(review): str.find returns -1 (a true value) when the text
            # is missing, so this test is false only if 'tn15more' is found
            # at index 0; left untouched, confirm the intent.
            if creator.find('tn15more'):
                creator = '%s>' % creator
            creator = self._getPersons(creator)
            if creator:
                d['creator'] = creator
    writers = _findBetween(cont, '<h5>Writer', ('</div>', '<br/> <br/>'),
                            maxRes=1)
    if writers:
        writers = writers[0]
        h5idx = writers.find('/h5>')
        if h5idx != -1:
            writers = writers[h5idx+4:]
        writers = self._getPersons(writers)
        if writers:
            d['writer'] = writers
    cvurl = _getTagsWith(cont, 'name="poster"', toClosure=True, maxRes=1)
    if cvurl:
        cvurl = _findBetween(cvurl[0], 'src="', '"', maxRes=1)
        if cvurl:
            d['cover url'] = cvurl[0]
    genres = _findBetween(cont, 'href="/genre/', '"')
    if genres:
        d['genres'] = list(set(genres))
    # Rating and number of votes.
    ur = _findBetween(cont, 'id="star-bar-user-rate">', '</div>',
                        maxRes=1)
    if ur:
        rat = _findBetween(ur[0], '<b>', '</b>', maxRes=1)
        if rat:
            d['rating'] = rat[0].strip()
        vi = ur[0].rfind('href="ratings"')
        if vi != -1 and ur[0][vi+10:].find('await') == -1:
            try:
                votes = _findBetween(ur[0][vi:], "title='", " IMDb",
                                        maxRes=1)
                votes = int(votes[0].replace(',', ''))
                d['votes'] = votes
            except (ValueError, IndexError):
                self._mobile_logger.warn('wrong votes: %s', ur)
    top250 = _findBetween(cont, 'href="/chart/top?', '</a>', maxRes=1)
    if top250:
        fn = top250[0].rfind('#')
        if fn != -1:
            try:
                td = int(top250[0][fn+1:])
                d['top 250 rank'] = td
            except ValueError:
                self._mobile_logger.warn('wrong top250: %s', top250)
    # The cast table can be introduced by different headers: try them in turn.
    castdata = _findBetween(cont, 'Cast overview', '</table>', maxRes=1)
    if not castdata:
        castdata = _findBetween(cont, 'Credited cast', '</table>', maxRes=1)
    if not castdata:
        castdata = _findBetween(cont, 'Complete credited cast', '</table>',
                                maxRes=1)
    if not castdata:
        castdata = _findBetween(cont, 'Series Cast Summary', '</table>',
                                maxRes=1)
    if not castdata:
        castdata = _findBetween(cont, 'Episode Credited cast', '</table>',
                                maxRes=1)
    if castdata:
        castdata = castdata[0]
        # Reintegrate the fist tag.
        fl = castdata.find('href=')
        if fl != -1:
            castdata = '<a ' + castdata[fl:]
        # Exclude the 'rest of cast listed alphabetically' row.
        smib = castdata.find('<tr><td align="center" colspan="4"><small>')
        if smib != -1:
            smie = castdata.rfind('</small></td></tr>')
            if smie != -1:
                # 18 is the length of the closing '</small></td></tr>'.
                castdata = castdata[:smib].strip() + \
                            castdata[smie+18:].strip()
        castdata = castdata.replace('/tr> <tr', '/tr><tr')
        cast = self._getPersons(castdata, sep='</tr><tr')
        if cast:
            d['cast'] = cast
    akas = _findBetween(cont, 'Also Known As:</h5>', '</div>', maxRes=1)
    if akas:
        # For some reason, here <br> is still used in place of <br/>.
        akas[:] = [x for x in akas[0].split('<br>') if x.strip()]
        akas = [_unHtml(x).replace('" - ','::', 1).lstrip('"').strip()
                for x in akas]
        if 'See more' in akas:
            akas.remove('See more')
        akas[:] = [x for x in akas if x]
        if akas:
            d['akas'] = akas
    mpaa = _findBetween(cont, 'MPAA</a>:', '</div>', maxRes=1)
    if mpaa:
        d['mpaa'] = _unHtml(mpaa[0])
    runtimes = _findBetween(cont, 'Runtime:</h5>', '</div>', maxRes=1)
    if runtimes:
        runtimes = runtimes[0]
        runtimes = [x.strip().replace(' min', '').replace(' (', '::(', 1)
                    for x in runtimes.split('|')]
        d['runtimes'] = [_unHtml(x).strip() for x in runtimes]
    if kind == 'episode':
        # number of episodes.
        epsn = _findBetween(cont, 'title="Full Episode List">', '</a>',
                            maxRes=1)
        if epsn:
            epsn = epsn[0].replace(' Episodes', '').strip()
            if epsn:
                try:
                    epsn = int(epsn)
                except (TypeError, ValueError):
                    self._mobile_logger.warn('wrong episodes #: %s', epsn)
                # Stored even if not numeric: the text is kept in that case.
                d['number of episodes'] = epsn
    # The sections below share the same ' | '-separated layout; notes in
    # italic are attached to the value with '::'.
    country = _findBetween(cont, 'Country:</h5>', '</div>', maxRes=1)
    if country:
        country[:] = country[0].split(' | ')
        country[:] = ['<a %s' % x for x in country if x]
        country[:] = [_unHtml(x.replace(' <i>', '::')) for x in country]
    if country:
        d['countries'] = country
    lang = _findBetween(cont, 'Language:</h5>', '</div>', maxRes=1)
    if lang:
        lang[:] = lang[0].split(' | ')
        lang[:] = ['<a %s' % x for x in lang if x]
        lang[:] = [_unHtml(x.replace(' <i>', '::')) for x in lang]
    if lang:
        d['languages'] = lang
    col = _findBetween(cont, '"/search/title?colors=', '</div>')
    if col:
        col[:] = col[0].split(' | ')
        col[:] = ['<a %s' % x for x in col if x]
        col[:] = [_unHtml(x.replace(' <i>', '::')) for x in col]
    if col:
        d['color info'] = col
    sm = _findBetween(cont, '/search/title?sound_mixes=', '</div>',
                        maxRes=1)
    if sm:
        sm[:] = sm[0].split(' | ')
        sm[:] = ['<a %s' % x for x in sm if x]
        sm[:] = [_unHtml(x.replace(' <i>', '::')) for x in sm]
    if sm:
        d['sound mix'] = sm
    cert = _findBetween(cont, 'Certification:</h5>', '</div>', maxRes=1)
    if cert:
        cert[:] = cert[0].split(' | ')
        cert[:] = [_unHtml(x.replace(' <i>', '::')) for x in cert]
    if cert:
        d['certificates'] = cert
    plotoutline = _findBetween(cont, 'Plot:</h5>', ['<a ', '</div>'],
                                maxRes=1)
    if plotoutline:
        plotoutline = plotoutline[0].strip()
        plotoutline = plotoutline.rstrip('|').rstrip()
        if plotoutline:
            d['plot outline'] = _unHtml(plotoutline)
    aratio = _findBetween(cont, 'Aspect Ratio:</h5>', ['<a ', '</div>'],
                            maxRes=1)
    if aratio:
        aratio = aratio[0].strip().replace(' (', '::(', 1)
        if aratio:
            d['aspect ratio'] = _unHtml(aratio)
    return {'data': d}