def getFilmography(dataF, indexF, keyF, attrIF, attrKF, offset,
                   charNF=None, doCast=0, doWriters=0):
    """Return the filmography of the person entry stored at ``offset``.

    The raw entries come from getRawData(dataF, offset, doCast, doWriters).
    Each entry whose title can be resolved through getLabel() becomes a
    Movie (accessSystem 'local'); entries without a title are dropped.
    When ``charNF`` is given and the entry carries a role, the role is
    resolved with getCharactersIDs().  An 'attributeID', if present and
    resolvable, is stored as the movie's notes.
    """
    # The first element returned by getRawData() is not needed here.
    _unused, rawEntries = getRawData(dataF, offset, doCast, doWriters)
    movies = []
    for entry in rawEntries:
        movieID = entry['movieID']
        title = getLabel(movieID, indexF, keyF)
        if not title:
            continue
        role = entry.get('currentRole', u'')
        roleID = None
        if role and charNF:
            role, roleID = getCharactersIDs(role, charNF)
        movieObj = Movie(title=title, movieID=movieID, currentRole=role,
                         roleID=roleID, accessSystem='local')
        if entry.has_key('attributeID'):
            note = getLabel(entry['attributeID'], attrIF, attrKF)
            if note:
                movieObj.notes = note
        movies.append(movieObj)
    return movies
def _buildEpisodes(self, eps_list, parentID):
    """Return a {season: {episode number: Movie}} mapping for a series.

    ``eps_list`` is iterated as (episodeID, episodeTitle) pairs; titles are
    decoded from latin_1 and parsed with analyze_title().  Episodes without
    a season go under the 'UNKNOWN' key; episodes without a number get the
    highest number already used in their season plus one.
    """
    dbPrefix = self.__db
    seriesTitle = getLabel(parentID, '%stitles.index' % dbPrefix,
                           '%stitles.key' % dbPrefix)
    series = Movie(title=seriesTitle, movieID=parentID, accessSystem='local')
    bySeason = {}
    for epID, rawTitle in eps_list:
        decoded = unicode(rawTitle, 'latin_1', 'replace')
        epData = analyze_title(decoded, canonical=1)
        episode = Movie(data=epData, movieID=epID, accessSystem='local')
        episode['episode of'] = series
        if epData.get('year') is None:
            # The title carried no year: look it up in the movies data file.
            year = getFullIndex('%smovies.data' % dbPrefix, key=epID,
                                kind='moviedata', rindex=1)
            if year:
                episode['year'] = year
        seasonEps = bySeason.setdefault(epData.get('season', 'UNKNOWN'), {})
        number = epData.get('episode')
        if number is None:
            number = max(seasonEps.keys() or [0]) + 1
        seasonEps[number] = episode
    return bySeason
def _buildGuests(gl): """Return a list of Movie objects from a list of GA lines.""" rl = [] rlapp = rl.append for g in gl: # When used by the imdbpy2sql.py script, latin_1 strings are passed. if not isinstance(g, UnicodeType): g = unicode(g, 'latin_1', 'replace') titl = re_titleRef.findall(g) if len(titl) != 1: continue note = u'' if g[-1] == ')': opi = g.rfind('(episode') if opi == -1: opi = g.rfind('(') if opi != -1: note = g[opi:].replace('_', '"').strip() g = g[:opi].strip() cr = u'' cri = g.find('_ (qv), as ') if cri != -1: cr = g[cri + 11:].replace('[unknown]', u'').strip() if cr and cr[-1] == ')': opi = cr.rfind('(') if opi != -1: if note: note += ' ' note += cr[opi:] cr = cr[:opi].strip() # As you can see, we've no notion of the movieID, here. m = Movie(title=titl[0], currentRole=cr, notes=note, accessSystem='local') rlapp(m) return rl
def postprocess_data(self, data):
    """Turn the collected (text, link) pairs into reference objects.

    Returns a dictionary with the 'names refs', 'titles refs' and
    'characters refs' keys, each mapping the stripped text to a Person,
    Movie or Character.  Pairs with an empty text or link, or whose link
    does not end with '/', are ignored.
    """
    refs = {}
    for section in ('names refs', 'titles refs', 'characters refs'):
        bucket = refs[section] = {}
        for label, link in data.get(section, []):
            label = label.strip()
            link = link.strip()
            if not label or not link:
                continue
            if not link.endswith('/'):
                continue
            imdbID = analyze_imdbid(link)
            if section == 'names refs':
                ref = Person(personID=imdbID, name=label,
                             accessSystem=self._as, modFunct=self._modFunct)
            elif section == 'titles refs':
                ref = Movie(movieID=imdbID, title=label,
                            accessSystem=self._as, modFunct=self._modFunct)
            else:
                ref = Character(characterID=imdbID, name=label,
                                accessSystem=self._as,
                                modFunct=self._modFunct)
            # XXX: companies aren't handled: are they ever found in text,
            #      as links to their page?
            bucket[label] = ref
    return refs
def _base_person_info(self, personID, movies_cache=None, persons_cache=None): if movies_cache is None: movies_cache = {} if persons_cache is None: persons_cache = {} if personID in persons_cache: return persons_cache[personID] nb = self.T['name_basics'] person = nb.select(nb.c.nconst == personID).execute().fetchone() or {} data = self._rename('name_basics', dict(person)) movies = [] for movieID in split_array(data.get('known for') or ''): if not movieID: continue movieID = int(movieID) movie_data = self._base_title_info(movieID, movies_cache=movies_cache, persons_cache=persons_cache) movie = Movie(movieID=movieID, data=movie_data, accessSystem=self.accessSystem) movies.append(movie) data['known for'] = movies self._clean(data, ('ns_soundex', 'sn_soundex', 's_soundex', 'personID')) persons_cache[personID] = data return data
def get_movie_data(movieID, kindDict, fromAka=0):
    """Return a dictionary containing data about the given movieID.

    If ``fromAka`` is true the AkaTitle table is searched instead of the
    Title table.  ``kindDict`` maps the row's kindID to the kind string.

    The returned dictionary always has 'title' and 'kind'; 'year',
    'imdbIndex', 'season' and 'episode' are present only when the row
    provides them, and 'series years' only for rows of the Title table.
    For an episode, 'episode of' holds a Movie built recursively for the
    parent series.
    """
    if not fromAka:
        Table = Title
    else:
        Table = AkaTitle
    m = Table.get(movieID)
    mdict = {'title': m.title, 'kind': kindDict[m.kindID],
             'year': m.productionYear, 'imdbIndex': m.imdbIndex,
             'season': m.seasonNr, 'episode': m.episodeNr}
    if not fromAka:
        if m.seriesYears is not None:
            mdict['series years'] = unicode(m.seriesYears)
    if mdict['imdbIndex'] is None:
        del mdict['imdbIndex']
    if mdict['year'] is None:
        del mdict['year']
    else:
        try:
            mdict['year'] = int(mdict['year'])
        except (TypeError, ValueError):
            del mdict['year']
    # Season and episode are kept unconverted when they are not numeric;
    # only conversion errors are tolerated, nothing else is swallowed.
    for field in ('season', 'episode'):
        if mdict[field] is None:
            del mdict[field]
        else:
            try:
                mdict[field] = int(mdict[field])
            except (TypeError, ValueError):
                pass
    episodeOfID = m.episodeOfID
    if episodeOfID is not None:
        ser_dict = get_movie_data(episodeOfID, kindDict, fromAka)
        mdict['episode of'] = Movie(data=ser_dict, movieID=episodeOfID,
                                    accessSystem='sql')
        if fromAka:
            ser_note = AkaTitle.get(episodeOfID).note
            if ser_note:
                mdict['episode of'].notes = ser_note
    return mdict
def end_li(self):
    """Handle a closing ``li`` tag, finalising the episode being parsed.

    While an episode list is being parsed, and when both an episode title
    and an episode ID were collected, a Movie of kind 'episode' is built,
    linked to the current series, completed with the air date/year and
    the role/notes found in ``self._misc_info``, and appended to
    ``self._episodes[self._cur_series]``.  The per-episode state is then
    reset.
    """
    # NOTE(review): the nesting of the four reset statements at the end was
    # reconstructed from flattened source -- confirm against the original.
    self._in_li = 0
    if self._in_episodes:
        et = self._cur_episode_title.strip()
        minfo = self._misc_info.strip()
        if et and self._episode_id:
            eps_data = analyze_title(et, canonical=1)
            eps_data['kind'] = u'episode'
            e = Movie(movieID=str(self._episode_id), data=eps_data,
                      accessSystem=self._as, modFunct=self._modFunct)
            e['episode of'] = self._cur_series
            if minfo.startswith('('):
                pe = minfo.find(')')
                if pe != -1:
                    date = minfo[1:pe]
                    if date != '????':
                        e['original air date'] = date
                        if eps_data.get('year', '????') == '????':
                            # An empty date, "()", has no last word.
                            date_parts = date.split()
                            if date_parts and date_parts[-1].isdigit():
                                e['year'] = date_parts[-1]
            rolei = minfo.find(' - ')
            if rolei != -1:
                if not self._got_i_info:
                    role = minfo[rolei + 3:].strip()
                    notei = role.rfind('(')
                    note = u''
                    if notei != -1 and role and role[-1] == ')':
                        note = role[notei:]
                        role = role[:notei].strip()
                    e.notes = note
                    e.currentRole = role
                else:
                    randn = minfo[rolei + 3:].strip().split()
                    # Nothing after the separator: leave the notes alone.
                    if randn:
                        note = '[%s]' % randn[0]
                        note += ' '.join(randn[1:])
                        e.notes = note
            self._episodes.setdefault(self._cur_series, []).append(e)
        self._cur_episode_title = u''
        self._episode_id = None
        self._in_misc_info = 0
        self._misc_info = u''
def end_li(self):
    """Handle a closing ``li`` tag, finalising the episode being parsed.

    While an episode list is being parsed, and when both an episode title
    and an episode ID were collected, a Movie of kind 'episode' is built,
    linked to the current series, completed with the air date/year and
    the role/notes found in ``self._misc_info``, and appended to
    ``self._episodes[self._cur_series]``.  The per-episode state is then
    reset.
    """
    # NOTE(review): the nesting of the four reset statements at the end was
    # reconstructed from flattened source -- confirm against the original.
    self._in_li = 0
    if self._in_episodes:
        et = self._cur_episode_title.strip()
        minfo = self._misc_info.strip()
        if et and self._episode_id:
            eps_data = analyze_title(et, canonical=1)
            eps_data['kind'] = u'episode'
            e = Movie(movieID=str(self._episode_id), data=eps_data,
                      accessSystem=self._as, modFunct=self._modFunct)
            e['episode of'] = self._cur_series
            if minfo.startswith('('):
                pe = minfo.find(')')
                if pe != -1:
                    date = minfo[1:pe]
                    if date != '????':
                        e['original air date'] = date
                        if eps_data.get('year', '????') == '????':
                            # An empty date, "()", has no last word.
                            date_parts = date.split()
                            if date_parts and date_parts[-1].isdigit():
                                e['year'] = date_parts[-1]
            rolei = minfo.find(' - ')
            if rolei != -1:
                if not self._got_i_info:
                    role = minfo[rolei+3:].strip()
                    notei = role.rfind('(')
                    note = u''
                    if notei != -1 and role and role[-1] == ')':
                        note = role[notei:]
                        role = role[:notei].strip()
                    e.notes = note
                    e.currentRole = role
                else:
                    randn = minfo[rolei+3:].strip().split()
                    # Nothing after the separator: leave the notes alone.
                    if randn:
                        note = '[%s]' % randn[0]
                        note += ' '.join(randn[1:])
                        e.notes = note
            self._episodes.setdefault(self._cur_series, []).append(e)
        self._cur_episode_title = u''
        self._episode_id = None
        self._in_misc_info = 0
        self._misc_info = u''
def postprocess_data(self, data):
    """Return {'quotes': {movie: [quote lines, ...]}} from the parsed data.

    ``data`` maps a title to a (movieID, quotes) pair.  The key of the
    result is a Movie when a movieID is known, the plain title otherwise;
    every quote is split on '::' into its lines.  Empty input yields {}.
    """
    if not data:
        return {}
    quotesByMovie = {}
    for title, (movieID, quotes) in data.items():
        if movieID is not None:
            key = Movie(title=title, movieID=movieID,
                        accessSystem=self._as, modFunct=self._modFunct)
        else:
            key = title
        quotesByMovie[key] = [quote.split('::') for quote in quotes]
    return {'quotes': quotesByMovie}
def _add_items(self): self._quotes = [x.replace(':: ', '::').replace(' ::', '::').rstrip(':') for x in self._quotes] self._quotes = [x.replace(' ', ' ').replace(' ', ' ').strip() for x in self._quotes] self._quotes = filter(None, self._quotes) if not (self._cur_title and self._cur_titleID and self._quotes): self._quotes = [u''] return movie = Movie(title=self._cur_title, movieID=self._cur_titleID, accessSystem=self._as, modFunct=self._modFunct) self._tot_quotes[movie] = self._quotes[:] self._quotes = [u'']
def test_search(self):
    """Print the TV series found for "good place" and the episodes of one.

    This talks to the live IMDb service through IMDb(); it only prints
    what it retrieves and asserts nothing.
    """
    ia = IMDb()
    serials = ia.search_movie("good place")
    for serial in serials:
        if serial.data["kind"] == "tv series":
            print(serial.movieID, serial.data["title"], serial.data["kind"])
    print(serials)
    result = ia.get_movie_episodes("4955642")
    print(result)
    episodes = result["data"]["episodes"]
    for season in episodes:
        for index in episodes[season]:
            # Look the episode up in the season being iterated; a fixed
            # season number fails for any index that season does not have.
            item = Movie(episodes[season][index])
            print(f"{season}x{index}", item.myID)
def end_a(self):
    """Handle a closing ``a`` tag.

    After an episode title, switch to collecting the miscellaneous info.
    After a series title, build the Movie for the series (when a title
    and a series ID are available) and make it the current series.
    """
    if self._in_episode_title:
        self._in_episode_title = 0
        self._in_misc_info = 1
        return
    if not self._in_series_title:
        return
    self._in_series_title = 0
    seriesTitle = self._cur_series_title.strip()
    if not seriesTitle or self._series_id is None:
        return
    self._cur_series = Movie(movieID=str(self._series_id),
                             data=analyze_title(seriesTitle, canonical=1),
                             accessSystem=self._as,
                             modFunct=self._modFunct)
def getMovieLinks(movieID, dataF, movieTitlIF, movieTitlKF):
    """Return the movie connections as a {section: [Movie, ...]} dictionary.

    Entries whose title cannot be resolved, or whose link type has no
    section in ``_links_sect``, are left out.
    """
    linkRows = getFullIndex(dataF, movieID, kind='mlinks', rindex=None,
                            multi=1, default=[])
    connections = {}
    for row in linkRows:
        linkedID = row[2]
        linkedTitle = getLabel(linkedID, movieTitlIF, movieTitlKF)
        if linkedTitle:
            linked = Movie(title=linkedTitle, movieID=linkedID,
                           accessSystem='local')
            section = _links_sect.get(row[1])
            if section:
                connections.setdefault(section, []).append(linked)
    return connections
def _add_ref(self, kind):
    """Add a reference entry to the names and titles dictionaries.

    ``kind`` selects the reference type: 'tt' for titles, 'nm' for names
    and 'ch' for characters.  A new Movie/Person/Character is stored under
    the collected text only if that text is not already a key; an
    IMDbParserError raised while building the object is ignored.  The
    state collected for the reference is reset afterwards.
    """
    # NOTE(review): indentation was reconstructed from flattened source; in
    # particular the nesting of the reset statements should be confirmed.
    if kind == 'tt':
        if self._titleRefCID and self._titleCN:
            if not self._titlesRefs.has_key(self._titleCN):
                try:
                    movie = Movie(movieID=str(self._titleRefCID),
                                  title=self._titleCN, accessSystem=self._as,
                                  modFunct=self._modFunct)
                    self._titlesRefs[self._titleCN] = movie
                except IMDbParserError:
                    pass
        self._titleRefCID = u''
        self._titleCN = u''
        self._inTTRef = 0
        self._inLinkTTRef = 0
    elif kind == 'nm' and self._nameRefCID and self._nameCN:
        # XXX: 'Neo' and 'Keanu Reeves' are two separated
        #      entry in the dictionary.  Check the ID value instead
        #      of the key?
        if not self._namesRefs.has_key(self._nameCN):
            try:
                person = Person(name=self._nameCN,
                                personID=str(self._nameRefCID),
                                accessSystem=self._as,
                                modFunct=self._modFunct)
                self._namesRefs[self._nameCN] = person
            except IMDbParserError:
                pass
        self._nameRefCID = u''
        self._nameCN = u''
        self._inNMRef = 0
    elif kind == 'ch' and self._characterRefCID and self._characterCN:
        if not self._charactersRefs.has_key(self._characterCN):
            try:
                # Unlike the other two kinds, the access system is the
                # literal 'http' here and no modFunct is passed.
                character = Character(name=self._characterCN,
                                      characterID=str(
                                          self._characterRefCID),
                                      accessSystem='http')
                self._charactersRefs[self._characterCN] = character
            except IMDbParserError:
                pass
        self._characterRefCID = u''
        self._characterCN = u''
        self._inCHRef = 0
def _findRefs(self, o, trefs, nrefs):
    """Find titles or names references in strings.

    ``o`` may be a string, a list/tuple or a dictionary; containers are
    searched recursively (dictionaries by value).  References found are
    added in place to ``trefs`` (title -> Movie) and ``nrefs``
    (name -> Person), which are also returned as a tuple.
    """
    if isinstance(o, (unicode, str)):
        for title in re_titleRef.findall(o):
            a_title = analyze_title(title, canonical=0)
            rtitle = build_title(a_title, ptdf=1)
            if trefs.has_key(rtitle):
                continue
            # Try the rebuilt title first, then the title as found.
            movieID = self._getTitleID(rtitle)
            if movieID is None:
                movieID = self._getTitleID(title)
            if movieID is None:
                continue
            m = Movie(title=rtitle, movieID=movieID,
                      accessSystem=self.accessSystem)
            trefs[rtitle] = m
            # Register the same object under the alternative spellings.
            rtitle2 = canonicalTitle(a_title.get('title', u''))
            if rtitle2 and rtitle2 != rtitle and rtitle2 != title:
                trefs[rtitle2] = m
            if title != rtitle:
                trefs[title] = m
        for name in re_nameRef.findall(o):
            a_name = analyze_name(name, canonical=1)
            rname = build_name(a_name, canonical=1)
            if nrefs.has_key(rname):
                continue
            # Try the rebuilt name first, then the name as found.
            personID = self._getNameID(rname)
            if personID is None:
                personID = self._getNameID(name)
            if personID is None:
                continue
            p = Person(name=rname, personID=personID,
                       accessSystem=self.accessSystem)
            nrefs[rname] = p
            # Register the same object under the alternative spellings.
            rname2 = normalizeName(a_name.get('name', u''))
            if rname2 and rname2 != rname:
                nrefs[rname2] = p
            if name != rname and name != rname2:
                nrefs[name] = p
    elif isinstance(o, (list, tuple)):
        for item in o:
            self._findRefs(item, trefs, nrefs)
    elif isinstance(o, dict):
        for value in o.values():
            self._findRefs(value, trefs, nrefs)
    return (trefs, nrefs)
def postprocess_data(self, data):
    """Return {'episodes': {series Movie: [episodes, ...]}}.

    Every key of ``data`` is parsed with self.get_dom(); the first link
    in it provides the series ID and title (the first and last character
    of the link text are dropped).  Each episode listed under the key
    gets its 'episode of' entry set to the series.  Empty data yields {}.
    """
    if not len(data):
        return {}
    bySeries = {}
    for markup, episodes in data.items():
        dom = self.get_dom(markup)
        href = self.xpath(dom, "//a/@href")[0]
        seriesTitle = self.xpath(dom, "//a/text()")[0][1:-1]
        series = Movie(movieID=analyze_imdbid(href),
                       data=analyze_title(seriesTitle),
                       accessSystem=self._as, modFunct=self._modFunct)
        linked = bySeries[series] = []
        for episode in episodes:
            # XXX: should we create a copy of 'series', to avoid
            #      circular references?
            episode['episode of'] = series
            linked.append(episode)
    return {'episodes': bySeries}
def postprocess_data(self, data):
    """Turn the collected (text, link) pairs into Person/Movie references.

    Returns a dictionary with the 'names refs' and 'titles refs' keys,
    each mapping the stripped text to the object built for its link.
    Pairs with an empty text or link are ignored.
    """
    refs = {}
    for section in ('names refs', 'titles refs'):
        found = refs[section] = {}
        for label, target in data.get(section, []):
            label = label.strip()
            target = target.strip()
            if not label or not target:
                continue
            imdbID = analyze_imdbid(target)
            if section == 'names refs':
                ref = Person(personID=imdbID, name=label,
                             accessSystem=self._as, modFunct=self._modFunct)
            else:
                ref = Movie(movieID=imdbID, title=label,
                            accessSystem=self._as, modFunct=self._modFunct)
            found[label] = ref
    return refs
def __get_serial(imdb_id: str, title: str, year: str) -> Serial:
    """Build a Serial with all seasons and episodes of ``imdb_id``.

    The episodes are fetched with IMDb().get_movie_episodes().  Seasons
    are ordered by their season index: integer indexes in numeric order
    first, any other index afterwards ordered by its string form.

    Any exception is printed and None is returned instead of a Serial.
    """
    def season_order(entry):
        # Order on the index itself, not on the "Season N" title, so that
        # season 10 does not sort before season 2.
        index = entry[0]
        if isinstance(index, int):
            return (0, index, "")
        return (1, 0, str(index))

    try:
        ia = IMDb()
        info = ia.get_movie_episodes(imdb_id)
        episodes_by_season = info["data"]["episodes"]
        indexed_seasons = []
        for season_index in episodes_by_season:
            episodes = list()
            for episode_index in episodes_by_season[season_index]:
                movie = Movie(episodes_by_season[season_index][episode_index])
                episode = Episode(episode_index, movie.myID.data["title"],
                                  movie.myID.movieID)
                episodes.append(episode)
            season = Season(f"Season {season_index}", episodes)
            indexed_seasons.append((season_index, season))
        indexed_seasons.sort(key=season_order)
        seasons = [season for _index, season in indexed_seasons]
        return Serial(0, title, year, imdb_id, "", seasons)
    except Exception as ex:
        # Best effort: callers receive None when anything goes wrong.
        print("Exception", ex)
        return None
def _build_episode(link, title, minfo, role, roleA, roleAID):
    """Build a Movie object for a given episode of a series.

    ``link`` provides the episode ID and ``title`` its title.  ``minfo``
    may start with a parenthesised air date and may contain, after ' -',
    a role with optional trailing notes in parentheses.  ``role`` takes
    precedence over ``roleA``; ``roleAID`` is used only when a role is
    available.
    """
    episode_id = analyze_imdbid(link)
    notes = ''
    minidx = minfo.find(' -')
    # Sometimes, for some unknown reason, the role is left in minfo.
    if minidx != -1:
        slfRole = minfo[minidx + 3:].lstrip()
        minfo = minfo[:minidx].rstrip()
        if slfRole.endswith(')'):
            commidx = slfRole.rfind('(')
            if commidx != -1:
                notes = slfRole[commidx:]
                slfRole = slfRole[:commidx]
        if slfRole and role is None and roleA is None:
            role = slfRole
    eps_data = analyze_title(title)
    eps_data['kind'] = 'episode'
    # FIXME: it's wrong for multiple characters (very rare on tv series?).
    if role is None:
        role = roleA  # At worse, it's None.
    if role is None:
        roleAID = None
    if roleAID is not None:
        roleAID = analyze_imdbid(roleAID)
    e = Movie(movieID=episode_id, data=eps_data, currentRole=role,
              roleID=roleAID, notes=notes)
    # XXX: are we missing some notes?
    # XXX: does it parse things as "Episode dated 12 May 2005 (12 May 2005)"?
    if minfo.startswith('('):
        pe = minfo.find(')')
        if pe != -1:
            date = minfo[1:pe]
            if date != '????':
                e['original air date'] = date
                if eps_data.get('year', '????') == '????':
                    # An empty date, "()", has no last word to inspect.
                    date_parts = date.split()
                    if date_parts and date_parts[-1].isdigit():
                        e['year'] = int(date_parts[-1])
    return e
def do_br(self, attrs):
    """Handle a ``br`` tag, closing the series title or series info text.

    After the series title, a Movie for the series is stored in
    ``self._result['episode of']`` and the kind is set to 'episode'.
    After the series info, the first match of ``self.re_airdate`` (which
    must yield date, season and episode) fills 'original air date',
    'season' and 'episode'; a numeric 0 is kept for season and episode.
    """
    if self._in_series_title:
        self._in_series_title = 0
        st = self._series_title.strip()
        if st and self.__seriesID:
            d_title = analyze_title(st, canonical=1)
            m = Movie(movieID=str(self.__seriesID), data=d_title,
                      accessSystem=self._as, modFunct=self._modFunct)
            self._result['kind'] = u'episode'
            self._result['episode of'] = m
        self._series_title = u''
    elif self._in_series_info:
        self._in_series_info = 0
        si = ' '.join([x for x in self._series_info.split() if x])
        if si:
            aid = self.re_airdate.findall(si)
            if aid and len(aid[0]) == 3:
                date, season, episode = aid[0]
                date = date.strip()
                # Non-numeric values are kept as they were found.
                try:
                    season = int(season)
                except (TypeError, ValueError):
                    pass
                try:
                    episode = int(episode)
                except (TypeError, ValueError):
                    pass
                if date and date != '????':
                    self._result['original air date'] = date
                # Handle also "episode 0": each value is checked against
                # its own type, so that a 0 is not mistaken for "missing".
                if season or type(season) is type(0):
                    self._result['season'] = season
                if episode or type(episode) is type(0):
                    self._result['episode'] = episode
        self._series_info = u''
return None length = convBin(dfptr.read(2), 'longlength') # Skip character name. latin2utf(dfptr.read(length)) nrItems = convBin(dfptr.read(3), 'nrCharacterItems') if limit is not None and nrItems/2 > limit: nrItems = limit*2 filmography = [] for i in xrange(nrItems/2): personID = convBin(dfptr.read(3), 'personID') name = getLabel(personID, personIF, personKF) movieID = convBin(dfptr.read(3), 'movieID') title = getLabel(movieID, movieIF, movieKF) # XXX: notes are not retrieved: they can be found scanning # actors.list and acresses.list, but it will slow down everything. m = Movie(title=title, movieID=movieID, currentRole=name, roleID=personID, roleIsPerson=True, accessSystem='local') filmography.append(m) dfptr.close() return filmography def _convChID(characterID): """Return a numeric value for the given string, or None.""" if characterID is None: return None return convBin(characterID, 'characterID') def getCharactersIDs(names_string, charNF): """Returns a tuple (name, roleID) if the supplied string contains only one character, otherwise returns a tuple of lists:
# Complete cast/crew. compcast = [ (self._compcast[cc.subjectID], self._compcast[cc.statusID]) for cc in CompleteCast.select(CompleteCast.q.movieID == movieID) ] if compcast: for entry in compcast: val = unicode(entry[1]) res[u'complete %s' % entry[0]] = val # Movie connections. mlinks = [[ml.linkedMovieID, self._link[ml.linkTypeID]] for ml in MovieLink.select(MovieLink.q.movieID == movieID)] if mlinks: for ml in mlinks: lmovieData = get_movie_data(ml[0], self._kind) m = Movie(movieID=ml[0], data=lmovieData, accessSystem='sql') ml[0] = m res['connections'] = {} mlinks[:] = _groupListBy(mlinks, 1) for group in mlinks: lt = group[0][1] res['connections'][lt] = [i[0] for i in group] # Episodes. episodes = {} eps_list = list(Title.select(Title.q.episodeOfID == movieID)) eps_list.sort() if eps_list: ps_data = { 'title': res['title'], 'kind': res['kind'], 'year': res.get('year'),
def get_movie_main(self, movieID):
    """Return the main information about ``movieID`` from the local files.

    The result is ``{'data': ..., 'info sets': ('main', 'vote details')}``
    where the data dictionary is built from the analyzed title, the cast
    and crew lists, the vote details, the miscellaneous information files,
    the AKA titles, the parent series (for episodes) and the MPAA info.

    Raises IMDbDataAccessError if no title is found for ``movieID``.
    """
    # Information sets provided by this method.
    infosets = ('main', 'vote details')
    tl = getLabel(movieID, '%stitles.index' % self.__db,
                  '%stitles.key' % self.__db)
    # No title, no party.
    if tl is None:
        raise IMDbDataAccessError('unable to get movieID "%s"' % movieID)
    res = analyze_title(tl)
    # Build the cast list.
    actl = []
    for castG in ('actors', 'actresses'):
        midx = getFullIndex('%s%s.titles' % (self.__db, castG),
                            movieID, multi=1)
        if midx is not None:
            params = {'movieID': movieID,
                      'dataF': '%s%s.data' % (self.__db, castG),
                      'indexF': '%snames.index' % self.__db,
                      'keyF': '%snames.key' % self.__db,
                      'attrIF': '%sattributes.index' % self.__db,
                      'attrKF': '%sattributes.key' % self.__db,
                      'charNF': '%scharacter2id.index' % self.__db,
                      'offsList': midx, 'doCast': 1}
            actl += getMovieCast(**params)
    if actl:
        actl.sort()
        res['cast'] = actl
    # List of other workers; the names lack the final 's', which is added
    # when the file names are built.  Each category is listed once, so that
    # no file is read twice.
    works = ('writer', 'cinematographer', 'composer',
             'costume-designer', 'director', 'editor', 'miscellaneou',
             'producer', 'production-designer')
    for i in works:
        index = getFullIndex('%s%ss.titles' % (self.__db, i),
                             movieID, multi=1)
        if index is not None:
            params = {'movieID': movieID,
                      'dataF': '%s%s.data' % (self.__db, i),
                      'indexF': '%snames.index' % self.__db,
                      'keyF': '%snames.key' % self.__db,
                      'attrIF': '%sattributes.index' % self.__db,
                      'attrKF': '%sattributes.key' % self.__db,
                      'offsList': index}
            name = key = i
            if '-' in name:
                name = name.replace('-', ' ')
            elif name == 'miscellaneou':
                name = 'miscellaneous crew'
                key = 'miscellaneou'
            elif name == 'writer':
                params['doWriters'] = 1
            params['dataF'] = '%s%ss.data' % (self.__db, key)
            data = getMovieCast(**params)
            if name == 'writer':
                data.sort()
            res[name] = data
    # Rating.
    rt = self.get_movie_vote_details(movieID)['data']
    if rt:
        res.update(rt)
    # Various information: (key in the result, base name of the file).
    miscInfo = (('runtimes', 'running-times'), ('color info', 'color-info'),
                ('genres', 'genres'), ('distributors', 'distributors'),
                ('languages', 'language'), ('certificates', 'certificates'),
                ('special effects companies', 'special-effects-companies'),
                ('sound mix', 'sound-mix'), ('tech info', 'technical'),
                ('production companies', 'production-companies'),
                ('countries', 'countries'))
    for name, fname in miscInfo:
        params = {'movieID': movieID,
                  'dataF': '%s%s.data' % (self.__db, fname),
                  'indexF': '%s%s.index' % (self.__db, fname),
                  'attrIF': '%sattributes.index' % self.__db,
                  'attrKF': '%sattributes.key' % self.__db}
        data = getMovieMisc(**params)
        if name in ('distributors', 'special effects companies',
                    'production companies'):
            # Replace, in place, every company string with a Company.
            for nitem, rawCompany in enumerate(data):
                n, notes = split_company_name_notes(rawCompany)
                company = Company(name=n,
                                  companyID=getCompanyID(
                                      n, '%scompany2id.index' % self.__db),
                                  notes=notes,
                                  accessSystem=self.accessSystem)
                data[nitem] = company
        if data:
            res[name] = data
    if res.get('runtimes'):
        # The first runtime may embed the number of episodes.
        rt = res['runtimes'][0]
        episodes = re_episodes.findall(rt)
        if episodes:
            res['runtimes'][0] = re_episodes.sub('', rt)
            res['number of episodes'] = episodes[0]
    # AKA titles.
    akas = getAkaTitles(movieID, '%saka-titles.data' % self.__db,
                        '%stitles.index' % self.__db,
                        '%stitles.key' % self.__db,
                        '%sattributes.index' % self.__db,
                        '%sattributes.key' % self.__db)
    if akas:
        # normalize encoding.
        for i, aka in enumerate(akas):
            ts = aka.split('::')
            if len(ts) != 2:
                continue
            t, n = ts
            nt = self._changeAKAencoding(n, t)
            if nt is not None:
                akas[i] = '%s::%s' % (nt, n)
        res['akas'] = akas
    if res.get('kind') == 'episode':
        # Things to do if this is a tv series episode.
        episodeOf = res.get('episode of')
        if episodeOf is not None:
            parentSeries = Movie(data=res['episode of'],
                                 accessSystem='local')
            seriesID = self._getTitleID(parentSeries.get(
                'long imdb canonical title'))
            parentSeries.movieID = seriesID
            res['episode of'] = parentSeries
        if not res.get('year'):
            year = getFullIndex('%smovies.data' % self.__db,
                                movieID, kind='moviedata', rindex=1)
            if year:
                res['year'] = year
    # MPAA info.
    mpaa = getMPAA(movieID, '%smpaa-ratings-reasons.index' % self.__db,
                   '%smpaa-ratings-reasons.data' % self.__db)
    if mpaa:
        res.update(mpaa)
    return {'data': res, 'info sets': infosets}
def build_movie(txt, movieID=None, roleID=None, status=None,
                accessSystem='http', modFunct=None, _parsingCharacter=False,
                _parsingCompany=False, year=None, chrRoles=None,
                rolesNoChar=None, additionalNotes=None):
    """Given a string as normally seen on the "categorized" page of
    a person on the IMDb's web site, returns a Movie instance.

    ``txt`` is split into title and role on a separator that depends on
    ``_parsingCharacter``/``_parsingCompany``; trailing parenthesised
    chunks of the title that are neither a year nor a known kind are moved
    into the notes.  When ``year`` is given it is appended to the title.
    ``roleID`` may be a single ID or a list of IDs matching the roles
    separated by '/'; missing IDs are padded with None and extra IDs are
    dropped.  ``chrRoles`` ('@@'-separated) provides the roles when the
    text has none, ``rolesNoChar`` ('/'-separated) adds roles without an
    ID, and ``additionalNotes`` is appended to the notes.
    """
    # FIXME: Oook, lets face it: build_movie and build_person are now
    # two horrible sets of patches to support the new IMDb design.  They
    # must be rewritten from scratch.
    if _parsingCharacter:
        _defSep = ' Played by '
    elif _parsingCompany:
        _defSep = ' ... '
    else:
        _defSep = ' .... '
    title = re_spaces.sub(' ', txt).strip()
    # Split the role/notes from the movie title.
    tsplit = title.split(_defSep, 1)
    role = u''
    notes = u''
    roleNotes = []
    if len(tsplit) == 2:
        title = tsplit[0].rstrip()
        role = tsplit[1].lstrip()
    if title[-9:] == 'TV Series':
        title = title[:-9].rstrip()
    elif title[-14:] == 'TV mini-series':
        title = title[:-14] + ' (mini)'
    if title and title.endswith(_defSep.rstrip()):
        title = title[:-len(_defSep) + 1]
    # Try to understand where the movie title ends.
    while True:
        if year:
            break
        if title[-1:] != ')':
            # Ignore the silly "TV Series" notice.
            if title[-9:] == 'TV Series':
                title = title[:-9].rstrip()
                continue
            else:
                # Just a title: stop here.
                break
        # Try to match paired parentheses; yes: sometimes there are
        # parentheses inside comments...
        nidx = title.rfind('(')
        while (nidx != -1 and
               title[nidx:].count('(') != title[nidx:].count(')')):
            nidx = title[:nidx].rfind('(')
        # Unbalanced parentheses: stop here.
        if nidx == -1:
            break
        # The last item in parentheses seems to be a year: stop here.
        first4 = title[nidx + 1:nidx + 5]
        if (first4.isdigit() or first4 == '????') and \
                title[nidx+5:nidx+6] in (')', '/'):
            break
        # The last item in parentheses is a known kind: stop here.
        if title[nidx + 1:-1] in ('TV', 'V', 'mini', 'VG'):
            break
        # Else, in parentheses there are some notes.
        # XXX: should the notes in the role half be kept separated
        #      from the notes in the movie title half?
        if notes:
            notes = '%s %s' % (title[nidx:], notes)
        else:
            notes = title[nidx:]
        title = title[:nidx].rstrip()
    if year:
        year = year.strip()
        # A slice is used so that an empty title does not raise.
        if title[-1:] == ')':
            fpIdx = title.rfind('(')
            if fpIdx != -1:
                if notes:
                    notes = '%s %s' % (title[fpIdx:], notes)
                else:
                    notes = title[fpIdx:]
                title = title[:fpIdx].rstrip()
        title = u'%s (%s)' % (title, year)
    if _parsingCharacter and roleID and not role:
        roleID = None
    if not roleID:
        roleID = None
    elif len(roleID) == 1:
        roleID = roleID[0]
    if not role and chrRoles and isinstance(roleID, (str, unicode)):
        roleID = _re_chrIDs.findall(roleID)
        role = ' / '.join(filter(None, chrRoles.split('@@')))
    # Manages multiple roleIDs.
    if isinstance(roleID, list):
        tmprole = role.split('/')
        role = []
        for r in tmprole:
            nidx = r.find('(')
            if nidx != -1:
                role.append(r[:nidx].rstrip())
                roleNotes.append(r[nidx:])
            else:
                role.append(r)
                roleNotes.append(None)
        lr = len(role)
        lrid = len(roleID)
        if lr > lrid:
            # More roles than IDs: pad the IDs with None.
            roleID += [None] * (lr - lrid)
        elif lr < lrid:
            roleID = roleID[:lr]
        for i, rid in enumerate(roleID):
            if rid is not None:
                roleID[i] = str(rid)
        if lr == 1:
            role = role[0]
            roleID = roleID[0]
    elif roleID is not None:
        roleID = str(roleID)
    if movieID is not None:
        movieID = str(movieID)
    if (not title) or (movieID is None):
        _b_m_logger.error('empty title or movieID for "%s"', txt)
    if rolesNoChar:
        rolesNoChar = filter(None, [x.strip() for x in rolesNoChar.split('/')])
        if not role:
            role = []
        elif not isinstance(role, list):
            role = [role]
        role += rolesNoChar
    notes = notes.strip()
    if additionalNotes:
        additionalNotes = re_spaces.sub(' ', additionalNotes).strip()
        if notes:
            notes += u' '
        notes += additionalNotes
    m = Movie(title=title, movieID=movieID, notes=notes, currentRole=role,
              roleID=roleID, roleIsPerson=_parsingCharacter,
              modFunct=modFunct, accessSystem=accessSystem)
    # Per-role notes only make sense when a list of roles was handled;
    # roleID can be None or a string here, which have no usable length.
    if roleNotes and isinstance(roleID, list) and \
            len(roleNotes) == len(roleID):
        # zip() stops at the shorter sequence, so no index can be missing.
        for curRole, curNotes in zip(m.currentRole, roleNotes):
            if curNotes:
                curRole.notes = curNotes
    # Status can't be checked here, and must be detected by the parser.
    if status:
        m['status'] = status
    return m
def get_person_filmography(self, personID):
    """Return the filmography of ``personID`` read from the local files.

    The result is ``{'data': ..., 'info sets': ('filmography', 'episodes')}``.
    In the data dictionary every job name maps to the sorted list of
    non-episode titles; the episodes of all jobs are collected under the
    'episodes' key, grouped by series and sorted in reverse order.
    """
    # NOTE(review): indentation was reconstructed from flattened source.
    infosets = ('filmography', 'episodes')
    res = {}
    episodes = {}
    # Job names lack the final 's', which is added when the file names
    # are built.
    works = ('actor', 'actresse', 'producer', 'writer',
             'cinematographer', 'composer', 'costume-designer',
             'director', 'editor', 'miscellaneou', 'production-designer')
    for i in works:
        index = getFullIndex('%s%ss.names' % (self.__db, i), personID)
        if index is not None:
            params = {'offset': index,
                      'indexF': '%stitles.index' % self.__db,
                      'keyF': '%stitles.key' % self.__db,
                      'attrIF': '%sattributes.index' % self.__db,
                      'attrKF': '%sattributes.key' % self.__db,
                      'charNF': '%scharacter2id.index' % self.__db}
            # 'name' is the key used in the result, 'key' the base of the
            # data file name.
            name = key = i
            if '-' in name:
                name = name.replace('-', ' ')
            elif name == 'actresse':
                name = 'actress'
                params['doCast'] = 1
            elif name == 'miscellaneou':
                name = 'miscellaneous crew'
                key = 'miscellaneou'
            elif name == 'actor':
                params['doCast'] = 1
            elif name == 'writer':
                params['doWriters'] = 1
            params['dataF'] = '%s%ss.data' % (self.__db, key)
            data = getFilmography(**params)
            movies = []
            eps = []
            # Split normal titles from episodes.
            for d in data:
                if d.get('kind') != 'episode':
                    movies.append(d)
                else:
                    eps.append(d)
            movies.sort()
            if movies:
                res[name] = movies
            for e in eps:
                series = Movie(data=e['episode of'], accessSystem='local')
                seriesID = self._getTitleID(series.get(
                    'long imdb canonical title'))
                series.movieID = seriesID
                if not e.get('year'):
                    year = getFullIndex('%smovies.data' % self.__db,
                                        e.movieID, kind='moviedata', rindex=1)
                    if year:
                        e['year'] = year
                # Without a role, record the job in the notes (cast
                # entries excluded).
                if not e.currentRole and name not in ('actor', 'actress'):
                    if e.notes:
                        e.notes = ' %s' % e.notes
                    e.notes = '[%s]%s' % (name, e.notes)
                episodes.setdefault(series, []).append(e)
    if episodes:
        for k in episodes:
            episodes[k].sort()
            episodes[k].reverse()
        res['episodes'] = episodes
    return {'data': res, 'info sets': tuple(infosets)}
def build_movie(txt, movieID=None, roleID=None, status=None,
                accessSystem='http', modFunct=None, _parsingCharacter=False,
                _parsingCompany=False):
    """Given a string as normally seen on the "categorized" page of
    a person on the IMDb's web site, returns a Movie instance.

    ``txt`` is split into title and role on a separator that depends on
    ``_parsingCharacter``/``_parsingCompany``; trailing parenthesised
    chunks of the title that are neither a year nor a known kind are moved
    into the notes.  ``roleID`` may be a single ID or a list of IDs
    matching the roles separated by '/'; missing IDs are padded with None
    and extra IDs are dropped.
    """
    if _parsingCharacter:
        _defSep = ' Played by '
    elif _parsingCompany:
        _defSep = ' ... '
    else:
        _defSep = ' .... '
    title = re_spaces.sub(' ', txt).strip()
    # Split the role/notes from the movie title.
    tsplit = title.split(_defSep, 1)
    role = u''
    notes = u''
    roleNotes = []
    if len(tsplit) == 2:
        title = tsplit[0].rstrip()
        role = tsplit[1].lstrip()
    if title[-9:] == 'TV Series':
        title = title[:-9].rstrip()
    elif title[-14:] == 'TV mini-series':
        title = title[:-14] + ' (mini)'
    # Try to understand where the movie title ends.
    while True:
        if title[-1:] != ')':
            # Ignore the silly "TV Series" notice.
            if title[-9:] == 'TV Series':
                title = title[:-9].rstrip()
                continue
            else:
                # Just a title: stop here.
                break
        # Try to match paired parentheses; yes: sometimes there are
        # parentheses inside comments...
        nidx = title.rfind('(')
        while (nidx != -1 and
               title[nidx:].count('(') != title[nidx:].count(')')):
            nidx = title[:nidx].rfind('(')
        # Unbalanced parentheses: stop here.
        if nidx == -1:
            break
        # The last item in parentheses seems to be a year: stop here.
        first4 = title[nidx + 1:nidx + 5]
        if (first4.isdigit() or first4 == '????') and \
                title[nidx+5:nidx+6] in (')', '/'):
            break
        # The last item in parentheses is a known kind: stop here.
        if title[nidx + 1:-1] in ('TV', 'V', 'mini', 'VG'):
            break
        # Else, in parentheses there are some notes.
        # XXX: should the notes in the role half be kept separated
        #      from the notes in the movie title half?
        if notes:
            notes = '%s %s' % (title[nidx:], notes)
        else:
            notes = title[nidx:]
        title = title[:nidx].rstrip()
    if _parsingCharacter and roleID and not role:
        roleID = None
    if not roleID:
        roleID = None
    elif len(roleID) == 1:
        roleID = roleID[0]
    # Manages multiple roleIDs.
    if isinstance(roleID, list):
        tmprole = role.split('/')
        role = []
        for r in tmprole:
            nidx = r.find('(')
            if nidx != -1:
                role.append(r[:nidx].rstrip())
                roleNotes.append(r[nidx:])
            else:
                role.append(r)
                roleNotes.append(None)
        lr = len(role)
        lrid = len(roleID)
        if lr > lrid:
            # More roles than IDs: pad the IDs with None.
            roleID += [None] * (lr - lrid)
        elif lr < lrid:
            roleID = roleID[:lr]
        for i, rid in enumerate(roleID):
            if rid is not None:
                roleID[i] = str(rid)
        if lr == 1:
            role = role[0]
            roleID = roleID[0]
    elif roleID is not None:
        roleID = str(roleID)
    if movieID is not None:
        movieID = str(movieID)
    if (not title) or (movieID is None):
        _b_m_logger.error('empty title or movieID for "%s"', txt)
    m = Movie(title=title, movieID=movieID, notes=notes, currentRole=role,
              roleID=roleID, roleIsPerson=_parsingCharacter,
              modFunct=modFunct, accessSystem=accessSystem)
    # Per-role notes only make sense when a list of roles was handled;
    # roleID can be None or a string here, which have no usable length.
    if roleNotes and isinstance(roleID, list) and \
            len(roleNotes) == len(roleID):
        # zip() stops at the shorter sequence, so no index can be missing.
        for curRole, curNotes in zip(m.currentRole, roleNotes):
            if curNotes:
                curRole.notes = curNotes
    # Status can't be checked here, and must be detected by the parser.
    if status:
        m['status'] = status
    return m
# Yes: kindID values are hard-coded in the companies4local.py script. _kinds = { 0: 'distributors', 1: 'production companies', 2: 'special effect companies', 3: 'miscellaneous companies' } for i in xrange(nrItems): kind = _kinds.get(ord(dfptr.read(1))) if kind is None: import warnings warnings.warn('Unidentified kindID for a company.') break movieID = convBin(dfptr.read(3), 'movieID') title = getLabel(movieID, movieIF, movieKF) m = Movie(title=title, movieID=movieID, accessSystem='local') filmography.setdefault(kind, []).append(m) dfptr.close() return filmography def _convChID(companyID): """Return a numeric value for the given string, or None.""" if companyID is None: return None return convBin(companyID, 'companyID') def getCompanyID(name, compNF): """Return a companyID for a name.""" try:
def get_movie_main(self, movieID):
    """Retrieve and parse the main details page of a movie.

    The page is fetched with self._mretrieve, using the 'movie_main'
    entry of self.urls formatted with movieID, plus 'maindetails'.
    Every piece of information found in the page is stored in a
    dictionary, which is returned as {'data': <that dictionary>}.

    Raises IMDbDataAccessError if no <title> tag is found in the page.
    """
    cont = self._mretrieve(self.urls['movie_main'] % movieID + 'maindetails')
    title = _findBetween(cont, '<title>', '</title>', maxRes=1)
    if not title:
        raise IMDbDataAccessError('unable to get movieID "%s"' % movieID)
    title = _unHtml(title[0])
    if title.endswith(' - IMDb'):
        title = title[:-7]
    if cont.find('<span class="tv-extra">TV mini-series</span>') != -1:
        title += ' (mini)'
    d = analyze_title(title)
    kind = d.get('kind')
    # If the page links to a TV series, this title is an episode of it.
    tv_series = _findBetween(cont, 'TV Series:</h5>', '</a>', maxRes=1)
    if tv_series:
        mid = re_imdbID.findall(tv_series[0])
    else:
        mid = None
    if tv_series and mid:
        s_title = _unHtml(tv_series[0])
        s_data = analyze_title(s_title)
        m = Movie(movieID=str(mid[0]), data=s_data,
                  accessSystem=self.accessSystem,
                  modFunct=self._defModFunct)
        d['kind'] = kind = u'episode'
        d['episode of'] = m
    if kind in ('tv series', 'tv mini series'):
        years = _findBetween(cont, '<h1>', '</h1>', maxRes=1)
        if years:
            years[:] = _findBetween(years[0], 'TV series', '</span>',
                                    maxRes=1)
            if years:
                d['series years'] = years[0].strip()
    air_date = _findBetween(cont, 'Original Air Date:</h5>', '</div>',
                            maxRes=1)
    if air_date:
        air_date = air_date[0]
        vi = air_date.find('(')
        if vi != -1:
            date = _unHtml(air_date[:vi]).strip()
            if date != '????':
                d['original air date'] = date
            air_date = air_date[vi:]
            season = _findBetween(air_date, 'Season', ',', maxRes=1)
            if season:
                season = season[0].strip()
                try:
                    season = int(season)
                except ValueError:
                    # Not a number: keep the string as it is.
                    pass
            # The type check keeps a season numbered 0.
            if season or type(season) is _inttype:
                d['season'] = season
            episode = _findBetween(air_date, 'Episode', ')', maxRes=1)
            if episode:
                episode = episode[0].strip()
                try:
                    episode = int(episode)
                except ValueError:
                    # Not a number: keep the string as it is.
                    pass
            # The type check keeps an episode numbered 0; it must look at
            # the episode itself, not at the season.
            if episode or type(episode) is _inttype:
                d['episode'] = episode
    direct = _findBetween(cont, '<h5>Director', ('</div>', '<br/> <br/>'),
                          maxRes=1)
    if direct:
        direct = direct[0]
        h5idx = direct.find('/h5>')
        if h5idx != -1:
            direct = direct[h5idx+4:]
        direct = self._getPersons(direct)
        if direct:
            d['director'] = direct
    if kind in ('tv series', 'tv mini series', 'episode'):
        if kind != 'episode':
            seasons = _findBetween(cont, 'Seasons:</h5>', '</div>',
                                   maxRes=1)
            if seasons:
                d['number of seasons'] = seasons[0].count('|') + 1
        creator = _findBetween(cont, 'Created by</h5>',
                               ('class="tn15more"', '</div>',
                                '<br/> <br/>'), maxRes=1)
        if not creator:
            # They change 'Created by' to 'Creator' and viceversa
            # from time to time...
            # XXX: is 'Creators' also used?
            creator = _findBetween(cont, 'Creator:</h5>',
                                   ('class="tn15more"', '</div>',
                                    '<br/> <br/>'), maxRes=1)
        if creator:
            creator = creator[0]
            # NOTE(review): str.find returns -1 (a true value) when the
            # text is missing, so this test fails only if the string
            # starts with 'tn15more'.  Left as it is, since the parsing
            # below may rely on the added '>' - confirm the intent.
            if creator.find('tn15more'):
                creator = '%s>' % creator
            creator = self._getPersons(creator)
            if creator:
                d['creator'] = creator
    writers = _findBetween(cont, '<h5>Writer', ('</div>', '<br/> <br/>'),
                           maxRes=1)
    if writers:
        writers = writers[0]
        h5idx = writers.find('/h5>')
        if h5idx != -1:
            writers = writers[h5idx+4:]
        writers = self._getPersons(writers)
        if writers:
            d['writer'] = writers
    cvurl = _getTagsWith(cont, 'name="poster"', toClosure=True, maxRes=1)
    if cvurl:
        cvurl = _findBetween(cvurl[0], 'src="', '"', maxRes=1)
        if cvurl:
            d['cover url'] = cvurl[0]
    genres = _findBetween(cont, 'href="/genre/', '"')
    if genres:
        # Drop duplicated genres.
        d['genres'] = list(set(genres))
    ur = _findBetween(cont, 'id="star-bar-user-rate">', '</div>',
                      maxRes=1)
    if ur:
        rat = _findBetween(ur[0], '<b>', '</b>', maxRes=1)
        if rat:
            d['rating'] = rat[0].strip()
        vi = ur[0].rfind('href="ratings"')
        if vi != -1 and ur[0][vi+10:].find('await') == -1:
            try:
                votes = _findBetween(ur[0][vi:], "title='", " IMDb",
                                     maxRes=1)
                votes = int(votes[0].replace(',', ''))
                d['votes'] = votes
            except (ValueError, IndexError):
                self._mobile_logger.warn('wrong votes: %s', ur)
    top250 = _findBetween(cont, 'href="/chart/top?', '</a>', maxRes=1)
    if top250:
        fn = top250[0].rfind('#')
        if fn != -1:
            try:
                td = int(top250[0][fn+1:])
                d['top 250 rank'] = td
            except ValueError:
                self._mobile_logger.warn('wrong top250: %s', top250)
    # The cast table may be introduced by different headers: try them
    # in order, stopping at the first one found.
    castdata = _findBetween(cont, 'Cast overview', '</table>', maxRes=1)
    if not castdata:
        castdata = _findBetween(cont, 'Credited cast', '</table>', maxRes=1)
    if not castdata:
        castdata = _findBetween(cont, 'Complete credited cast', '</table>',
                                maxRes=1)
    if not castdata:
        castdata = _findBetween(cont, 'Series Cast Summary', '</table>',
                                maxRes=1)
    if not castdata:
        castdata = _findBetween(cont, 'Episode Credited cast', '</table>',
                                maxRes=1)
    if castdata:
        castdata = castdata[0]
        # Reintegrate the first tag.
        fl = castdata.find('href=')
        if fl != -1:
            castdata = '<a ' + castdata[fl:]
        # Exclude the 'rest of cast listed alphabetically' row.
        smib = castdata.find('<tr><td align="center" colspan="4"><small>')
        if smib != -1:
            smie = castdata.rfind('</small></td></tr>')
            if smie != -1:
                castdata = castdata[:smib].strip() + \
                            castdata[smie+18:].strip()
        castdata = castdata.replace('/tr> <tr', '/tr><tr')
        cast = self._getPersons(castdata, sep='</tr><tr')
        if cast:
            d['cast'] = cast
    akas = _findBetween(cont, 'Also Known As:</h5>', '</div>', maxRes=1)
    if akas:
        # For some reason, here <br> is still used in place of <br/>.
        akas[:] = [x for x in akas[0].split('<br>') if x.strip()]
        akas = [_unHtml(x).replace('" - ', '::', 1).lstrip('"').strip()
                for x in akas]
        if 'See more' in akas:
            akas.remove('See more')
        akas[:] = [x for x in akas if x]
        if akas:
            d['akas'] = akas
    mpaa = _findBetween(cont, 'MPAA</a>:', '</div>', maxRes=1)
    if mpaa:
        d['mpaa'] = _unHtml(mpaa[0])
    runtimes = _findBetween(cont, 'Runtime:</h5>', '</div>', maxRes=1)
    if runtimes:
        runtimes = runtimes[0]
        runtimes = [x.strip().replace(' min', '').replace(' (', '::(', 1)
                    for x in runtimes.split('|')]
        d['runtimes'] = [_unHtml(x).strip() for x in runtimes]
    if kind == 'episode':
        # number of episodes.
        epsn = _findBetween(cont, 'title="Full Episode List">', '</a>',
                            maxRes=1)
        if epsn:
            epsn = epsn[0].replace(' Episodes', '').strip()
            if epsn:
                try:
                    epsn = int(epsn)
                except ValueError:
                    self._mobile_logger.warn('wrong episodes #: %s', epsn)
                # Stored even when it is not a number (as a string).
                d['number of episodes'] = epsn
    country = _findBetween(cont, 'Country:</h5>', '</div>', maxRes=1)
    if country:
        country[:] = country[0].split(' | ')
        country[:] = ['<a %s' % x for x in country if x]
        country[:] = [_unHtml(x.replace(' <i>', '::')) for x in country]
        if country:
            d['countries'] = country
    lang = _findBetween(cont, 'Language:</h5>', '</div>', maxRes=1)
    if lang:
        lang[:] = lang[0].split(' | ')
        lang[:] = ['<a %s' % x for x in lang if x]
        lang[:] = [_unHtml(x.replace(' <i>', '::')) for x in lang]
        if lang:
            d['languages'] = lang
    col = _findBetween(cont, '"/search/title?colors=', '</div>')
    if col:
        col[:] = col[0].split(' | ')
        col[:] = ['<a %s' % x for x in col if x]
        col[:] = [_unHtml(x.replace(' <i>', '::')) for x in col]
        if col:
            d['color info'] = col
    sm = _findBetween(cont, '/search/title?sound_mixes=', '</div>',
                      maxRes=1)
    if sm:
        sm[:] = sm[0].split(' | ')
        sm[:] = ['<a %s' % x for x in sm if x]
        sm[:] = [_unHtml(x.replace(' <i>', '::')) for x in sm]
        if sm:
            d['sound mix'] = sm
    cert = _findBetween(cont, 'Certification:</h5>', '</div>', maxRes=1)
    if cert:
        cert[:] = cert[0].split(' | ')
        cert[:] = [_unHtml(x.replace(' <i>', '::')) for x in cert]
        if cert:
            d['certificates'] = cert
    plotoutline = _findBetween(cont, 'Plot:</h5>', ['<a ', '</div>'],
                               maxRes=1)
    if plotoutline:
        plotoutline = plotoutline[0].strip()
        plotoutline = plotoutline.rstrip('|').rstrip()
        if plotoutline:
            d['plot outline'] = _unHtml(plotoutline)
    aratio = _findBetween(cont, 'Aspect Ratio:</h5>', ['<a ', '</div>'],
                          maxRes=1)
    if aratio:
        aratio = aratio[0].strip().replace(' (', '::(', 1)
        if aratio:
            d['aspect ratio'] = _unHtml(aratio)
    return {'data': d}