class DOMHTMLSearchPersonParser(DOMHTMLSearchMovieParser):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, for persons."""
    _BaseParser = DOMBasicPersonParser
    _notDirectHitTitle = '<title>imdb name'
    _titleBuilder = lambda self, x: build_name(x, canonical=True)
    _linkPrefix = '/name/nm'

    _attrs = [
        Attribute(key='data',
                  multi=True,
                  path={
                      'link': "./a[1]/@href",
                      'name': "./a[1]/text()",
                      'index': "./text()[1]",
                      'akas': ".//div[@class='_imdbpyAKA']/text()"
                  },
                  postprocess=lambda x: (
                      analyze_imdbid(x.get('link') or u''),
                      analyze_name((x.get('name') or u'') +
                                   (x.get('index') or u''), canonical=1),
                      x.get('akas')
                  ))
    ]
    extractors = [
        Extractor(label='search',
                  path="//td[3]/a[starts-with(@href, '/name/nm')]/..",
                  attrs=_attrs)
    ]

    def preprocess_string(self, html_string):
        if self._notDirectHitTitle in html_string[:1024].lower():
            html_string = _reAKASp.sub(
                r'\1<div class="_imdbpyAKA">\2::</div>\3',
                html_string)
        return DOMHTMLSearchMovieParser.preprocess_string(self, html_string)
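# A minimal usage sketch.  `_example_search_persons` is a hypothetical helper,
# not part of the original module; it assumes the caller has already fetched
# the raw HTML of a name-search results page.
def _example_search_persons(person_search_html, max_results=20):
    """Return up to max_results (personID, person_info) pairs."""
    parser = DOMHTMLSearchPersonParser()
    parser.results = max_results  # read back via getattr() in postprocess_data
    return parser.parse(person_search_html)['data']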
class DOMHTMLSearchPersonParser(DOMHTMLSearchMovieParser):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, for persons."""
    _BaseParser = DOMBasicPersonParser
    _notDirectHitTitle = '<title>imdb name'
    _titleBuilder = lambda self, x: build_name(x, canonical=True)
    _linkPrefix = '/name/nm'

    _attrs = [
        Attribute(key='data',
                  multi=True,
                  path={
                      'link': "./a[1]/@href",
                      'name': "./a[1]/text()",
                      'index': "./text()[1]"
                  },
                  postprocess=lambda x: (
                      analyze_imdbid(x.get('link') or u''),
                      # Guard against a missing name, as the sibling
                      # parsers do, to avoid a TypeError on None + unicode.
                      analyze_name((x.get('name') or u'') +
                                   (x.get('index') or u''), canonical=1)
                  ))
    ]
    extractors = [
        Extractor(label='search',
                  path="//td[3]/a[starts-with(@href, '/name/nm')]/..",
                  attrs=_attrs)
    ]
    def postprocess_data(self, data):
        if not data or self.label not in data:
            return []
        mlist = []
        data = data[self.label]
        # Avoid duplicates.  A real fix, using XPath, would be desirable.
        # XXX: probably this is no longer needed.
        seenIDs = []
        for d in data:
            if 'movieID' not in d:
                continue
            if self.ranktext not in d:
                continue
            if 'title' not in d:
                continue
            theID = analyze_imdbid(d['movieID'])
            if theID is None:
                continue
            theID = str(theID)
            if theID in seenIDs:
                continue
            seenIDs.append(theID)
            minfo = analyze_title(d['title'] + ' ' + d['year'])
            try:
                minfo[self.ranktext] = int(d[self.ranktext].replace('.', ''))
            except:
                pass
            if 'votes' in d:
                try:
                    votes = d['votes'].replace(' votes', '')
                    votes = votes.split(' based on ')[1]
                    minfo['votes'] = int(votes.replace(',', ''))
                except:
                    pass
            if 'rating' in d:
                try:
                    minfo['rating'] = float(d['rating'])
                except:
                    pass
            mlist.append((theID, minfo))
        return mlist
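# A hedged illustration of consuming the (movieID, minfo) pairs built above.
# `_example_print_ranking` is a hypothetical helper, and 'top 250 rank' is one
# plausible value of self.ranktext.
def _example_print_ranking(mlist, ranktext='top 250 rank'):
    for movieID, minfo in mlist:
        print '%s. %s (%s)' % (minfo.get(ranktext), minfo.get('title'),
                               minfo.get('year'))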
class DOMHTMLSearchMovieKeywordParser(DOMHTMLSearchMovieParser):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, searching for movies with the given
    keyword."""
    _notDirectHitTitle = '<title>best'

    _attrs = [
        Attribute(key='data',
                  multi=True,
                  path={
                      'link': "./a[1]/@href",
                      'info': "./a[1]//text()",
                      'ynote': "./span[@class='desc']/text()",
                      'outline': "./span[@class='outline']//text()"
                  },
                  postprocess=lambda x: (
                      analyze_imdbid(x.get('link') or u''),
                      custom_analyze_title4kwd(x.get('info') or u'',
                                               x.get('ynote') or u'',
                                               x.get('outline') or u'')
                  ))
    ]
    extractors = [
        Extractor(label='search',
                  path="//td[3]/a[starts-with(@href, '/title/tt')]/..",
                  attrs=_attrs)
    ]
class DOMHTMLSearchCompanyParser(DOMHTMLSearchMovieParser):
    _BaseParser = DOMBasicCompanyParser
    _notDirectHitTitle = '<title>find - imdb'
    _titleBuilder = lambda self, x: build_company_name(x)
    _linkPrefix = '/company/co'

    _attrs = [
        Attribute(key='data',
                  multi=True,
                  path={
                      'link': "./a[1]/@href",
                      'name': "./a[1]/text()",
                      'notes': "./text()[1]"
                  },
                  postprocess=lambda x: (
                      analyze_imdbid(x.get('link')),
                      # Guard against a missing name, to avoid a
                      # TypeError on None + unicode.
                      analyze_company_name((x.get('name') or u'') +
                                           (x.get('notes') or u''),
                                           stripNotes=True)
                  ))
    ]
    extractors = [
        Extractor(label='search',
                  path="//td[@class='result_text']/"
                       "a[starts-with(@href, '/company/co')]/..",
                  attrs=_attrs)
    ]
def _build_episode(link, title, minfo, role, roleA, roleAID):
    """Build a Movie object for a given episode of a series."""
    episode_id = analyze_imdbid(link)
    notes = u''
    minidx = minfo.find(' -')
    # Sometimes, for some unknown reason, the role is left in minfo.
    if minidx != -1:
        slfRole = minfo[minidx + 3:].lstrip()
        minfo = minfo[:minidx].rstrip()
        if slfRole.endswith(')'):
            commidx = slfRole.rfind('(')
            if commidx != -1:
                notes = slfRole[commidx:]
                slfRole = slfRole[:commidx]
        if slfRole and role is None and roleA is None:
            role = slfRole
    eps_data = analyze_title(title)
    eps_data['kind'] = u'episode'
    # FIXME: it's wrong for multiple characters (very rare on tv series?).
    if role is None:
        role = roleA  # At worst, it's None.
    if role is None:
        roleAID = None
    if roleAID is not None:
        roleAID = analyze_imdbid(roleAID)
    e = Movie(movieID=episode_id, data=eps_data, currentRole=role,
              roleID=roleAID, notes=notes)
    # XXX: are we missing some notes?
    # XXX: does it parse things as "Episode dated 12 May 2005 (12 May 2005)"?
    if minfo.startswith('('):
        pe = minfo.find(')')
        if pe != -1:
            date = minfo[1:pe]
            if date != '????':
                e['original air date'] = date
                if eps_data.get('year', '????') == '????':
                    syear = date.split()[-1]
                    if syear.isdigit():
                        e['year'] = int(syear)
    return e
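# A hedged, made-up example of calling _build_episode (the values below are
# illustrative only; real inputs come from the scraped filmography HTML,
# where minfo is the parenthesized info string found next to the link).
def _example_build_episode():
    episode = _build_episode('/title/tt0959621/', u'Pilot (2008)',
                             u'(20 January 2008) - Himself', None, None, None)
    # A Movie with kind 'episode', currentRole u'Himself' and
    # 'original air date' set to '20 January 2008'.
    return episode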
class DOMCompanyParser(DOMParserBase):
    """Parser for the main page of a given company.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        cparser = DOMCompanyParser()
        result = cparser.parse(company_html_string)
    """
    _containsObjects = True

    extractors = [
        Extractor(label='name',
                  # Note the extra trailing space in the class attribute.
                  path="//h1/span[@class='display-title ']",
                  attrs=Attribute(key='name',
                                  path="./text()",
                                  postprocess=lambda x:
                                      analyze_company_name(x,
                                                           stripNotes=True))),

        Extractor(label='filmography',
                  group="//b/a[@name]",
                  group_key="./text()",
                  group_key_normalize=lambda x: x.lower(),
                  path="../following-sibling::ol[1]/li",
                  attrs=Attribute(key=None,
                                  multi=True,
                                  path={
                                      'link': "./a[1]/@href",
                                      'title': "./a[1]/text()",
                                      'year': "./text()[1]"
                                  },
                                  postprocess=lambda x: build_movie(
                                      u'%s %s' % (x.get('title'),
                                                  x.get('year').strip()),
                                      movieID=analyze_imdbid(
                                          x.get('link') or u''),
                                      _parsingCompany=True)))
    ]

    preprocessors = [
        (re.compile('(<b><a name=)', re.I), r'</p>\1')
    ]

    def postprocess_data(self, data):
        for key in data.keys():
            new_key = key.replace('company', 'companies')
            new_key = new_key.replace('other', 'miscellaneous')
            new_key = new_key.replace('distributor', 'distributors')
            if new_key != key:
                data[new_key] = data[key]
                del data[key]
        return data
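# A hedged illustration of the key renaming performed by postprocess_data
# above (`_example_company_keys` is a hypothetical helper; the exact section
# names depend on the scraped page).
def _example_company_keys():
    data = {'production company': [], 'distributor': [], 'other': []}
    return DOMCompanyParser().postprocess_data(data)
    # -> {'production companies': [], 'distributors': [], 'miscellaneous': []}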
    def _init(self):
        self.preprocessors += [
            ('<span class="tv-extra">TV mini-series</span>',
             '<span class="tv-extra">(mini)</span>')
        ]
        self.extractors = [
            Extractor(label='title',
                      path="//h1",
                      attrs=Attribute(key='title',
                                      path=self._titleAttrPath,
                                      postprocess=self._titleFunct)),
            Extractor(label='link',
                      path=self._linkPath,
                      attrs=Attribute(key='link',
                                      path="./@href",
                                      postprocess=lambda x:
                                          analyze_imdbid((x or u'').replace(
                                              'http://pro.imdb.com', ''))))
        ]
class DOMHTMLCharacterQuotesParser(DOMParserBase):
    """Parser for the "quotes" page of a given character.
    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        qparser = DOMHTMLCharacterQuotesParser()
        result = qparser.parse(character_quotes_html_string)
    """
    _defGetRefs = True

    extractors = [
        Extractor(label='charquotes',
                  group="//h5",
                  group_key="./a/text()",
                  path="./following-sibling::div[1]",
                  attrs=Attribute(key=None,
                                  path={
                                      'txt': ".//text()",
                                      'movieID': ".//a[1]/@href"
                                  },
                                  # Normalize runs of whitespace after the
                                  # speaker's colon, then split the quotes.
                                  postprocess=lambda x: (
                                      analyze_imdbid(x['movieID']),
                                      x['txt'].strip()
                                          .replace(':   ', ': ')
                                          .replace(':  ', ': ')
                                          .split('||'))))
    ]

    preprocessors = [
        (re.compile('(</h5>)', re.I), r'\1<div>'),
        (re.compile(r'\s*<br/><br/>\s*', re.I), r'||'),
        (re.compile(r'\|\|\s*(<hr/>)', re.I), r'</div>\1'),
        (re.compile(r'\s*<br/>\s*', re.I), r'::')
    ]

    def postprocess_data(self, data):
        if not data:
            return {}
        newData = {}
        for title in data:
            movieID, quotes = data[title]
            if movieID is None:
                movie = title
            else:
                movie = Movie(title=title, movieID=movieID,
                              accessSystem=self._as, modFunct=self._modFunct)
            newData[movie] = [quote.split('::') for quote in quotes]
        return {'quotes': newData}
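# A hedged usage sketch (`_example_iter_character_quotes` is a hypothetical
# helper; it assumes parse() wraps the dictionary returned by
# postprocess_data under a 'data' key, as DOMParserBase subclasses do here).
def _example_iter_character_quotes(quotes_html_string):
    result = DOMHTMLCharacterQuotesParser().parse(quotes_html_string)
    for movie, quotes in result.get('data', {}).get('quotes', {}).items():
        # Each quote is a list of '::'-separated lines of dialogue.
        for quote in quotes:
            print u' / '.join(quote)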
    def postprocess_data(self, data):
        if len(data) == 0:
            return {}
        nd = {}
        for key in data.keys():
            dom = self.get_dom(key)
            link = self.xpath(dom, "//a/@href")[0]
            title = self.xpath(dom, "//a/text()")[0][1:-1]
            series = Movie(movieID=analyze_imdbid(link),
                           data=analyze_title(title),
                           accessSystem=self._as, modFunct=self._modFunct)
            nd[series] = []
            for episode in data[key]:
                # XXX: should we create a copy of 'series', to avoid
                #      circular references?
                episode['episode of'] = series
                nd[series].append(episode)
        return {'episodes': nd}
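# A hedged illustration of consuming the {'episodes': ...} mapping built
# above (`_example_count_episodes` is hypothetical; `result` is the
# dictionary this method returns).
def _example_count_episodes(result):
    return dict((series, len(episodes))
                for series, episodes in result.get('episodes', {}).items())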
class DOMHTMLSearchCharacterParser(DOMHTMLSearchMovieParser):
    _BaseParser = DOMBasicCharacterParser
    _notDirectHitTitle = '<title>imdb search'
    _titleBuilder = lambda self, x: build_name(x, canonical=False)
    _linkPrefix = '/character/ch'

    _attrs = [
        Attribute(key='data',
                  multi=True,
                  path={
                      'link': "./a[1]/@href",
                      'name': "./a[1]/text()"
                  },
                  postprocess=lambda x: (
                      analyze_imdbid(x.get('link') or u''),
                      {'name': x.get('name')}
                  ))
    ]
    extractors = [
        Extractor(label='search',
                  path="//td[3]/a[starts-with(@href, '/character/ch')]/..",
                  attrs=_attrs)
    ]
class DOMHTMLPersonGenresParser(DOMParserBase):
    """Parser for the "by genre" and "by keywords" pages of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        gparser = DOMHTMLPersonGenresParser()
        result = gparser.parse(bygenre_html_string)
    """
    kind = 'genres'
    _containsObjects = True

    extractors = [
        Extractor(label='genres',
                  group="//b/a[@name]/following-sibling::a[1]",
                  group_key="./text()",
                  group_key_normalize=lambda x: x.lower(),
                  path="../../following-sibling::ol[1]/li//a[1]",
                  attrs=Attribute(key=None,
                                  multi=True,
                                  path={
                                      'link': "./@href",
                                      'title': "./text()",
                                      'info': "./following-sibling::text()"
                                  },
                                  postprocess=lambda x: build_movie(
                                      x.get('title') +
                                      x.get('info').split('[')[0],
                                      analyze_imdbid(x.get('link')))))
    ]

    def postprocess_data(self, data):
        if len(data) == 0:
            return {}
        return {self.kind: data}
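# A hedged usage sketch (`_example_person_genres` is a hypothetical helper;
# it assumes parse() wraps the postprocessed dictionary under a 'data' key).
def _example_person_genres(bygenre_html_string):
    result = DOMHTMLPersonGenresParser().parse(bygenre_html_string)
    return sorted(result.get('data', {}).get('genres', {}).keys())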
class DOMHTMLMaindetailsParser(DOMParserBase):
    """Parser for the "categorized" (maindetails) page of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        cparser = DOMHTMLMaindetailsParser()
        result = cparser.parse(categorized_html_string)
    """
    _containsObjects = True
    _name_imdb_index = re.compile(r'\([IVXLCDM]+\)')

    _birth_attrs = [
        Attribute(key='birth date',
                  path='.//time[@itemprop="birthDate"]/@datetime'),
        Attribute(key='birth place',
                  path=".//a[starts-with(@href, "
                       "'/search/name?birth_place=')]/text()")
    ]
    _death_attrs = [
        Attribute(key='death date',
                  path='.//time[@itemprop="deathDate"]/@datetime'),
        Attribute(key='death place',
                  path=".//a[starts-with(@href, "
                       "'/search/name?death_place=')]/text()")
    ]
    _film_attrs = [
        Attribute(key=None,
                  multi=True,
                  path={
                      'link': "./b/a[1]/@href",
                      'title': "./b/a[1]/text()",
                      'notes': "./b/following-sibling::text()",
                      'year': "./span[@class='year_column']/text()",
                      'status': "./a[@class='in_production']/text()",
                      'rolesNoChar': './/br/following-sibling::text()',
                      'chrRoles': "./a[@imdbpyname]/@imdbpyname",
                      'roleID': "./a[starts-with(@href, '/character/')]/@href"
                  },
                  postprocess=lambda x: build_movie(
                      x.get('title') or u'',
                      year=x.get('year'),
                      movieID=analyze_imdbid(x.get('link') or u''),
                      rolesNoChar=(x.get('rolesNoChar') or u'').strip(),
                      chrRoles=(x.get('chrRoles') or u'').strip(),
                      additionalNotes=x.get('notes'),
                      roleID=(x.get('roleID') or u''),
                      status=x.get('status') or None))
    ]

    extractors = [
        Extractor(label='name',
                  path="//h1[@class='header']",
                  attrs=Attribute(key='name',
                                  path=".//text()",
                                  postprocess=lambda x:
                                      analyze_name(x, canonical=1))),

        Extractor(label='name_index',
                  path="//h1[@class='header']/span[1]",
                  attrs=Attribute(key='name_index',
                                  path="./text()")),

        Extractor(label='birth info',
                  path="//div[h4='Born:']",
                  attrs=_birth_attrs),

        Extractor(label='death info',
                  path="//div[h4='Died:']",
                  attrs=_death_attrs),

        Extractor(label='headshot',
                  path="//td[@id='img_primary']/div[@class='image']/a",
                  attrs=Attribute(key='headshot',
                                  path="./img/@src")),

        Extractor(label='akas',
                  path="//div[h4='Alternate Names:']",
                  attrs=Attribute(key='akas',
                                  path="./text()",
                                  # Alternate names are separated by a run
                                  # of two spaces.
                                  postprocess=lambda x:
                                      x.strip().split('  '))),

        Extractor(label='filmography',
                  group="//div[starts-with(@id, 'filmo-head-')]",
                  group_key="./a[@name]/text()",
                  group_key_normalize=lambda x: x.lower().replace(': ', ' '),
                  path="./following-sibling::div[1]"
                       "/div[starts-with(@class, 'filmo-row')]",
                  attrs=_film_attrs),

        Extractor(label='indevelopment',
                  path="//div[starts-with(@class,'devitem')]",
                  attrs=Attribute(key='in development',
                                  multi=True,
                                  path={
                                      'link': './a/@href',
                                      'title': './a/text()'
                                  },
                                  postprocess=lambda x: build_movie(
                                      x.get('title') or u'',
                                      movieID=analyze_imdbid(
                                          x.get('link') or u''),
                                      roleID=(x.get('roleID') or
                                              u'').split('/'),
                                      status=x.get('status') or None)))
    ]

    preprocessors = [
        ('<div class="clear"/> </div>', ''),
        ('<br/>', '<br />'),
        (re.compile(r'(<a href="/character/ch[0-9]{7}")>(.*?)</a>'),
         r'\1 imdbpyname="\2@@">\2</a>')
    ]

    def postprocess_data(self, data):
        for what in 'birth date', 'death date':
            if what in data and not data[what]:
                del data[what]
        name_index = (data.get('name_index') or '').strip()
        if name_index:
            if self._name_imdb_index.match(name_index):
                data['imdbIndex'] = name_index[1:-1]
            del data['name_index']
        # XXX: the code below is for backwards compatibility
        #      and probably could be removed.
        for key in data.keys():
            if key.startswith('actor '):
                if 'actor' not in data:
                    data['actor'] = []
                data['actor'].extend(data[key])
                del data[key]
            if key.startswith('actress '):
                if 'actress' not in data:
                    data['actress'] = []
                data['actress'].extend(data[key])
                del data[key]
            if key.startswith('self '):
                if 'self' not in data:
                    data['self'] = []
                data['self'].extend(data[key])
                del data[key]
            if key == 'birth place':
                data['birth notes'] = data[key]
                del data[key]
            if key == 'death place':
                data['death notes'] = data[key]
                del data[key]
        return data
class DOMHTMLMaindetailsParser(DOMParserBase):
    """Parser for the "categorized" (maindetails) page of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        cparser = DOMHTMLMaindetailsParser()
        result = cparser.parse(categorized_html_string)
    """
    _containsObjects = True

    _birth_attrs = [
        Attribute(key='birth date',
                  path={
                      'day': "./a[starts-with(@href, "
                             "'/OnThisDay?')]/text()",
                      'year': "./a[starts-with(@href, "
                              "'/BornInYear?')]/text()"
                  },
                  postprocess=lambda x: build_date(x)),
        Attribute(key='birth notes',
                  path="./a[starts-with(@href, '/BornWhere?')]/text()")
    ]
    _death_attrs = [
        Attribute(key='death date',
                  path={
                      'day': "./a[starts-with(@href, "
                             "'/OnThisDay?')]/text()",
                      'year': "./a[starts-with(@href, "
                              "'/DiedInYear?')]/text()"
                  },
                  postprocess=lambda x: build_date(x)),
        Attribute(key='death notes',
                  path="./text()",
                  # TODO: check if this slicing is always correct.
                  postprocess=lambda x: x.strip()[2:])
    ]
    _film_attrs = [
        Attribute(key=None,
                  multi=True,
                  path={
                      'link': "./a[1]/@href",
                      'title': ".//text()",
                      'status': "./i/a//text()",
                      'roleID': "./div[@class='_imdbpyrole']/@roleid"
                  },
                  postprocess=lambda x: build_movie(
                      x.get('title') or u'',
                      movieID=analyze_imdbid(x.get('link') or u''),
                      roleID=(x.get('roleID') or u'').split('/'),
                      status=x.get('status') or None))
    ]

    extractors = [
        Extractor(label='page title',
                  path="//title",
                  attrs=Attribute(key='name',
                                  path="./text()",
                                  postprocess=lambda x:
                                      analyze_name(x, canonical=1))),

        Extractor(label='birth info',
                  path="//div[h5='Date of Birth:']",
                  attrs=_birth_attrs),

        Extractor(label='death info',
                  path="//div[h5='Date of Death:']",
                  attrs=_death_attrs),

        Extractor(label='headshot',
                  path="//a[@name='headshot']",
                  attrs=Attribute(key='headshot',
                                  path="./img/@src")),

        Extractor(label='akas',
                  path="//div[h5='Alternate Names:']",
                  attrs=Attribute(key='akas',
                                  path="./text()",
                                  postprocess=lambda x:
                                      x.strip().split(' | '))),

        Extractor(label='filmography',
                  group="//div[@class='filmo'][h5]",
                  group_key="./h5/a[@name]/text()",
                  group_key_normalize=lambda x: x.lower()[:-1],
                  path="./ol/li",
                  attrs=_film_attrs)
    ]

    preprocessors = [
        # XXX: check that this doesn't cut "status" or other info...
        (re.compile(r'<br>(\.\.\.| ?).+?</li>', re.I | re.M | re.S),
         '</li>'),
        (_reRoles, _manageRoles)
    ]
class DOMHTMLSearchMovieParser(DOMParserBase):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, for movies."""
    _BaseParser = DOMBasicMovieParser
    _notDirectHitTitle = '<title>imdb title'
    _titleBuilder = lambda self, x: build_title(x)
    _linkPrefix = '/title/tt'

    _attrs = [
        Attribute(key='data',
                  multi=True,
                  path={
                      'link': "./a[1]/@href",
                      'info': ".//text()",
                      #'akas': ".//div[@class='_imdbpyAKA']//text()"
                      'akas': ".//p[@class='find-aka']//text()"
                  },
                  postprocess=lambda x: (
                      analyze_imdbid(x.get('link') or u''),
                      custom_analyze_title(x.get('info') or u''),
                      x.get('akas')
                  ))
    ]
    extractors = [
        Extractor(label='search',
                  path="//td[3]/a[starts-with(@href, '/title/tt')]/..",
                  attrs=_attrs)
    ]

    def _init(self):
        self.url = u''

    def _reset(self):
        self.url = u''

    def preprocess_string(self, html_string):
        if self._notDirectHitTitle in html_string[:1024].lower():
            if self._linkPrefix == '/title/tt':
                # Only for movies.
                html_string = html_string.replace('(TV mini-series)',
                                                  '(mini)')
                html_string = html_string.replace('<p class="find-aka">',
                                                  '<p class="find-aka">::')
                #html_string = _reAKAStitles.sub(
                #        r'<div class="_imdbpyAKA">\1::</div>\2', html_string)
            return html_string
        # Direct hit!
        dbme = self._BaseParser(useModule=self._useModule)
        res = dbme.parse(html_string, url=self.url)
        if not res:
            return u''
        res = res['data']
        if not (res and res[0]):
            return u''
        link = '%s%s' % (self._linkPrefix, res[0][0])
        # # Tries to cope with companies for which links to pro.imdb.com
        # # are missing.
        # link = self.url.replace(imdbURL_base[:-1], '')
        title = self._titleBuilder(res[0][1])
        if not (link and title):
            return u''
        link = link.replace('http://pro.imdb.com', '')
        new_html = '<td></td><td></td><td><a href="%s">%s</a></td>' % (link,
                                                                       title)
        return new_html

    def postprocess_data(self, data):
        if 'data' not in data:
            data['data'] = []
        results = getattr(self, 'results', None)
        if results is not None:
            data['data'][:] = data['data'][:results]
        # Horrible hack to support AKAs.
        if data and data['data'] and len(data['data'][0]) == 3 and \
                isinstance(data['data'][0], tuple):
            for idx, datum in enumerate(data['data']):
                if datum[2] is not None:
                    akas = filter(None, datum[2].split('::'))
                    if self._linkPrefix == '/title/tt':
                        akas = [a.replace('" - ', '::').rstrip()
                                for a in akas]
                        akas = [a.replace('aka "', '', 1).lstrip()
                                for a in akas]
                    datum[1]['akas'] = akas
                    data['data'][idx] = (datum[0], datum[1])
                else:
                    data['data'][idx] = (datum[0], datum[1])
        return data

    def add_refs(self, data):
        return data
class DOMHTMLSearchMovieParser(DOMParserBase):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, for movies."""
    _BaseParser = DOMBasicMovieParser
    _notDirectHitTitle = '<title>find - imdb</title>'
    _titleBuilder = lambda self, x: build_title(x)
    _linkPrefix = '/title/tt'

    _attrs = [
        Attribute(key='data',
                  multi=True,
                  path={
                      'link': "./a[1]/@href",
                      'info': ".//text()",
                      'akas': "./i//text()"
                  },
                  postprocess=lambda x: (
                      analyze_imdbid(x.get('link') or u''),
                      custom_analyze_title(x.get('info') or u''),
                      x.get('akas')
                  ))
    ]
    extractors = [
        Extractor(label='search',
                  path="//td[@class='result_text']",
                  attrs=_attrs)
    ]

    def _init(self):
        self.url = u''

    def _reset(self):
        self.url = u''

    def preprocess_string(self, html_string):
        if self._notDirectHitTitle in html_string[:10240].lower():
            if self._linkPrefix == '/title/tt':
                # Only for movies.
                # XXX (HTU): does this still apply?
                html_string = html_string.replace('(TV mini-series)',
                                                  '(mini)')
            return html_string
        # Direct hit!
        dbme = self._BaseParser(useModule=self._useModule)
        res = dbme.parse(html_string, url=self.url)
        if not res:
            return u''
        res = res['data']
        if not (res and res[0]):
            return u''
        link = '%s%s' % (self._linkPrefix, res[0][0])
        # # Tries to cope with companies for which links to pro.imdb.com
        # # are missing.
        # link = self.url.replace(imdbURL_base[:-1], '')
        title = self._titleBuilder(res[0][1])
        if not (link and title):
            return u''
        link = link.replace('http://pro.imdb.com', '')
        new_html = '<td class="result_text"><a href="%s">%s</a></td>' % (
            link, title)
        return new_html

    def postprocess_data(self, data):
        if 'data' not in data:
            data['data'] = []
        results = getattr(self, 'results', None)
        if results is not None:
            data['data'][:] = data['data'][:results]
        # Horrible hack to support AKAs.
        if data and data['data'] and len(data['data'][0]) == 3 and \
                isinstance(data['data'][0], tuple):
            data['data'] = [x for x in data['data'] if x[0] and x[1]]
            for idx, datum in enumerate(data['data']):
                if not isinstance(datum, tuple):
                    continue
                if not datum[0] and datum[1]:
                    continue
                if datum[2] is not None:
                    #akas = filter(None, datum[2].split('::'))
                    if self._linkPrefix == '/title/tt':
                        # XXX (HTU): couldn't find a result with multiple akas
                        aka = datum[2]
                        akas = [aka[1:-1]]  # remove the quotes
                        #akas = [a.replace('" - ', '::').rstrip()
                        #        for a in akas]
                        #akas = [a.replace('aka "', '', 1).replace('aka "',
                        #        '', 1).lstrip() for a in akas]
                    datum[1]['akas'] = akas
                    data['data'][idx] = (datum[0], datum[1])
                else:
                    data['data'][idx] = (datum[0], datum[1])
        return data

    def add_refs(self, data):
        return data
class DOMHTMLSearchMovieParser(DOMParserBase):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, for movies."""
    _BaseParser = DOMBasicMovieParser
    _notDirectHitTitle = '<title>imdb title'
    _titleBuilder = lambda self, x: build_title(x, canonical=True)
    _linkPrefix = '/title/tt'

    _attrs = [
        Attribute(key='data',
                  multi=True,
                  path={
                      'link': "./a[1]/@href",
                      'info': ".//text()"
                  },
                  postprocess=lambda x: (
                      analyze_imdbid(x.get('link') or u''),
                      analyze_title(x.get('info') or u'', canonical=1)
                  ))
    ]
    extractors = [
        Extractor(label='search',
                  path="//td[3]/a[starts-with(@href, '/title/tt')]/..",
                  attrs=_attrs)
    ]

    def _init(self):
        self.url = u''

    def _reset(self):
        self.url = u''

    def preprocess_string(self, html_string):
        if self._notDirectHitTitle in html_string[:1024].lower():
            if self._linkPrefix == '/title/tt':
                # Only for movies.
                html_string = html_string.replace('(TV mini-series)',
                                                  '(mini)')
                html_string = _reAKAS.sub('</td>', html_string)
            return html_string
        # Direct hit!
        dbme = self._BaseParser(useModule=self._useModule)
        res = dbme.parse(html_string, url=self.url)
        if not res:
            return u''
        res = res['data']
        if not (res and res[0]):
            return u''
        link = '%s%s' % (self._linkPrefix, res[0][0])
        # # Tries to cope with companies for which links to pro.imdb.com
        # # are missing.
        # link = self.url.replace(imdbURL_base[:-1], '')
        title = self._titleBuilder(res[0][1])
        if not (link and title):
            return u''
        link = link.replace('http://pro.imdb.com', '')
        new_html = '<td></td><td></td><td><a href="%s">%s</a></td>' % (link,
                                                                       title)
        return new_html

    def postprocess_data(self, data):
        if 'data' not in data:
            data['data'] = []
        results = getattr(self, 'results', None)
        if results is not None:
            data['data'][:] = data['data'][:results]
        return data

    def add_refs(self, data):
        return data
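# A minimal usage sketch (`_example_search_movies` is a hypothetical helper,
# not part of the original module).  On a normal results page the HTML passes
# through preprocess_string largely unchanged; on a direct hit IMDb returns
# the title page itself, which preprocess_string reduces to a single
# synthetic row, so callers always receive a list of (movieID, title_info)
# pairs.
def _example_search_movies(movie_search_html, max_results=20):
    parser = DOMHTMLSearchMovieParser()
    parser.results = max_results  # read back via getattr() in postprocess_data
    return parser.parse(movie_search_html)['data']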
class DOMHTMLCharacterMaindetailsParser(DOMHTMLMaindetailsParser):
    """Parser for the "biography" page of a given character.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        bparser = DOMHTMLCharacterMaindetailsParser()
        result = bparser.parse(character_biography_html_string)
    """
    _containsObjects = True

    _film_attrs = [
        Attribute(key=None,
                  multi=True,
                  path={
                      'link': "./a[1]/@href",
                      'title': ".//text()",
                      'status': "./i/a//text()",
                      'roleID': "./a/@href"
                  },
                  postprocess=lambda x: build_movie(
                      x.get('title') or u'',
                      movieID=analyze_imdbid(x.get('link') or u''),
                      roleID=_personIDs.findall(x.get('roleID') or u''),
                      status=x.get('status') or None,
                      _parsingCharacter=True))
    ]

    extractors = [
        Extractor(label='title',
                  path="//title",
                  attrs=Attribute(key='name',
                                  path="./text()",
                                  postprocess=lambda x:
                                      x.replace(' (Character)', '').strip())),

        Extractor(label='headshot',
                  path="//a[@name='headshot']",
                  attrs=Attribute(key='headshot',
                                  path="./img/@src")),

        Extractor(label='akas',
                  path="//div[h5='Alternate Names:']",
                  attrs=Attribute(key='akas',
                                  path="./text()",
                                  postprocess=lambda x:
                                      x.strip().split(' / '))),

        Extractor(label='filmography',
                  path="//div[@class='filmo'][not(h5)]/ol/li",
                  attrs=_film_attrs),

        Extractor(label='filmography sections',
                  group="//div[@class='filmo'][h5]",
                  group_key="./h5/a/text()",
                  group_key_normalize=lambda x: x.lower()[:-1],
                  path="./ol/li",
                  attrs=_film_attrs),
    ]

    preprocessors = [
        # Check that this doesn't cut "status"...
        (re.compile(r'<br>(\.\.\.| ).+?</li>', re.I | re.M), '</li>')
    ]