def end_li(self):
    # Close a </li>: build a Movie object from the state accumulated
    # while parsing this list item and append it to the current
    # filmography section of the result.
    if self._stop_here or not self._in_content: return
    self._get_imdbID = False
    if not self._in_movie: return
    self._movie = self._movie.strip()
    self._cur_status = self._cur_status.strip()
    # Without a title, a movieID and a target section there is
    # nothing meaningful to store.
    if not (self._movie and self._last_imdbID and self._section): return
    # Normalize collected characterIDs: empty -> None,
    # single item -> scalar, otherwise keep the list.
    if not self._cids: self._cids = None
    elif len(self._cids) == 1: self._cids = self._cids[0]
    # Add this movie to the list.
    kwds = {'movieID': self._last_imdbID,
            'status': self._cur_status,
            'roleID': self._cids,
            'modFunct': self._modFunct,
            'accessSystem': self._as}
    if self.kind == 'character':
        # NOTE(review): for character pages the roles are presumably
        # person IDs (_last_nameIDs), normalized the same way —
        # confirm against build_movie's handling of _parsingCharacter.
        kwds['_parsingCharacter'] = True
        lnids = self._last_nameIDs
        if not lnids: lnids = None
        elif len(lnids) == 1: lnids = lnids[0]
        kwds['roleID'] = lnids
    movie = build_movie(self._movie, **kwds)
    self._data.setdefault(self._section, []).append(movie)
def end_li(self):
    """Handle a closing <li>: turn the accumulated per-item state
    into a Movie and file it under the current section."""
    if self._stop_here or not self._in_content:
        return
    self._get_imdbID = False
    if not self._in_movie:
        return
    self._movie = self._movie.strip()
    self._cur_status = self._cur_status.strip()
    if not (self._movie and self._last_imdbID and self._section):
        return
    # Collapse the characterID list: empty -> None, singleton -> scalar.
    if not self._cids:
        self._cids = None
    elif len(self._cids) == 1:
        self._cids = self._cids[0]
    # Parameters used to build the Movie object for this item.
    params = {
        'movieID': self._last_imdbID,
        'status': self._cur_status,
        'roleID': self._cids,
        'modFunct': self._modFunct,
        'accessSystem': self._as
    }
    if self.kind == 'character':
        params['_parsingCharacter'] = True
        # Same empty/singleton normalization, applied to the nameIDs.
        name_ids = self._last_nameIDs
        if not name_ids:
            name_ids = None
        elif len(name_ids) == 1:
            name_ids = name_ids[0]
        params['roleID'] = name_ids
    self._data.setdefault(self._section, []).append(
        build_movie(self._movie, **params))
def end_li(self):
    """Finish a list item on a company page: create a Movie from the
    collected title/movieID pair and file it under the current section."""
    self._cur_item = self._cur_item.strip()
    # Only build a movie when we're inside a <li> of a known section
    # and both a movieID and a non-empty title were collected.
    if self._section and self._in_li and self._last_movieid \
            and self._cur_item:
        self._in_li = False
        movie = build_movie(self._cur_item,
                            movieID=self._last_movieid,
                            modFunct=self._modFunct,
                            accessSystem=self._as,
                            _parsingCompany=True)
        if movie:
            self._data.setdefault(self._section, []).append(movie)
    # Reset the per-item state for the next <li>.
    self._cur_item = u''
    self._last_movieid = None
class DOMCompanyParser(DOMParserBase):
    """Parser for the main page of a given company.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        cparser = DOMCompanyParser()
        result = cparser.parse(company_html_string)
    """
    _containsObjects = True

    extractors = [
        Extractor(label='name',
                  # Note the extra trailing space in the class value.
                  path="//h1/span[@class='display-title ']",
                  attrs=Attribute(key='name',
                                  path="./text()",
                                  postprocess=lambda x:
                                      analyze_company_name(x, stripNotes=True))),

        Extractor(label='filmography',
                  group="//b/a[@name]",
                  group_key="./text()",
                  group_key_normalize=lambda x: x.lower(),
                  path="../following-sibling::ol[1]/li",
                  attrs=Attribute(key=None,
                                  multi=True,
                                  path={
                                      'link': "./a[1]/@href",
                                      'title': "./a[1]/text()",
                                      'year': "./text()[1]"
                                  },
                                  postprocess=lambda x:
                                      build_movie(u'%s %s' %
                                          (x.get('title'), x.get('year').strip()),
                                          movieID=analyze_imdbid(x.get('link') or u''),
                                          _parsingCompany=True))),
    ]

    preprocessors = [
        (re.compile('(<b><a name=)', re.I), r'</p>\1')
    ]

    def postprocess_data(self, data):
        """Rename section keys to their canonical plural form.

        Iterates over a snapshot of the keys (list(...)): the
        dictionary is modified inside the loop, and iterating a live
        keys view while deleting entries raises RuntimeError on
        Python 3.
        """
        for key in list(data.keys()):
            new_key = key.replace('company', 'companies')
            new_key = new_key.replace('other', 'miscellaneous')
            new_key = new_key.replace('distributor', 'distributors')
            if new_key != key:
                data[new_key] = data[key]
                del data[key]
        return data
def _add_info(self):
    # Store the title currently being parsed into self._info under the
    # current key, then reset the per-title state.
    self._cur_key = self._cur_key.strip()
    self._cur_title = self._cur_title.strip()
    # Incomplete state: discard and reset.
    if not (self._cur_key and self._cur_title and self._cur_movieID):
        self._cur_title = u''
        self._cur_movieID = None
        self._cur_characterID = None
        return
    # Everything from the first '[' onwards is treated as notes, not as
    # part of the title itself.
    ridx = self._cur_title.find('[')
    notes = u''
    if ridx != -1:
        notes = self._cur_title[ridx:].lstrip()
        self._cur_title = self._cur_title[:ridx].rstrip()
    m = build_movie(self._cur_title, movieID=self._cur_movieID,
                    roleID=self._cur_characterID,
                    modFunct=self._modFunct, accessSystem=self._as)
    m.notes = notes
    # 'X2D' in the key is mapped back to '-' (0x2D is ASCII '-');
    # presumably an escaping applied earlier in the pipeline.
    self._info.setdefault(self._cur_key.replace('X2D', '-'), []).append(m)
    self._cur_title = u''
    self._cur_movieID = None
    self._cur_characterID = None
class DOMHTMLPersonGenresParser(DOMParserBase):
    """Parser for the "by genre" and "by keywords" pages of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        gparser = DOMHTMLPersonGenresParser()
        result = gparser.parse(bygenre_html_string)
    """
    kind = 'genres'
    _containsObjects = True

    extractors = [
        Extractor(label='genres',
                  group="//b/a[@name]/following-sibling::a[1]",
                  group_key="./text()",
                  group_key_normalize=lambda x: x.lower(),
                  path="../../following-sibling::ol[1]/li//a[1]",
                  attrs=Attribute(key=None,
                                  multi=True,
                                  path={
                                      'link': "./@href",
                                      'title': "./text()",
                                      'info': "./following-sibling::text()"
                                  },
                                  # The trailing text up to the first '['
                                  # is appended to the title; the rest is
                                  # dropped.
                                  postprocess=lambda x:
                                      build_movie(x.get('title') +
                                                  x.get('info').split('[')[0],
                                                  analyze_imdbid(x.get('link')))))
    ]

    def postprocess_data(self, data):
        """Wrap the parsed sections under the self.kind key; an empty
        result stays an empty dictionary."""
        # Idiomatic truthiness test instead of len(data) == 0.
        if not data:
            return {}
        return {self.kind: data}
class DOMHTMLMaindetailsParser(DOMParserBase):
    """Parser for the "categorized" (maindetails) page of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        cparser = DOMHTMLMaindetailsParser()
        result = cparser.parse(categorized_html_string)
    """
    _containsObjects = True
    # Matches an imdbIndex like "(IV)": roman numerals in parentheses.
    _name_imdb_index = re.compile(r'\([IVXLCDM]+\)')

    _birth_attrs = [Attribute(key='birth date',
                              path='.//time[@itemprop="birthDate"]/@datetime'),
                    Attribute(key='birth place',
                              path=".//a[starts-with(@href, "
                                   "'/search/name?birth_place=')]/text()")]
    _death_attrs = [Attribute(key='death date',
                              path='.//time[@itemprop="deathDate"]/@datetime'),
                    Attribute(key='death place',
                              path=".//a[starts-with(@href, "
                                   "'/search/name?death_place=')]/text()")]
    _film_attrs = [Attribute(key=None,
                             multi=True,
                             path={
                                 'link': "./b/a[1]/@href",
                                 'title': "./b/a[1]/text()",
                                 'notes': "./b/following-sibling::text()",
                                 'year': "./span[@class='year_column']/text()",
                                 'status': "./a[@class='in_production']/text()",
                                 'rolesNoChar': './/br/following-sibling::text()',
                                 'chrRoles': "./a[@imdbpyname]/@imdbpyname",
                                 'roleID': "./a[starts-with(@href, '/character/')]/@href"
                             },
                             postprocess=lambda x: build_movie(
                                 x.get('title') or u'',
                                 year=x.get('year'),
                                 movieID=analyze_imdbid(x.get('link') or u''),
                                 rolesNoChar=(x.get('rolesNoChar') or u'').strip(),
                                 chrRoles=(x.get('chrRoles') or u'').strip(),
                                 additionalNotes=x.get('notes'),
                                 roleID=(x.get('roleID') or u''),
                                 status=x.get('status') or None))]

    extractors = [
        Extractor(label='name',
                  path="//h1[@class='header']",
                  attrs=Attribute(key='name',
                                  path=".//text()",
                                  postprocess=lambda x:
                                      analyze_name(x, canonical=1))),
        Extractor(label='name_index',
                  path="//h1[@class='header']/span[1]",
                  attrs=Attribute(key='name_index',
                                  path="./text()")),
        Extractor(label='birth info',
                  path="//div[h4='Born:']",
                  attrs=_birth_attrs),
        Extractor(label='death info',
                  path="//div[h4='Died:']",
                  attrs=_death_attrs),
        Extractor(label='headshot',
                  path="//td[@id='img_primary']/div[@class='image']/a",
                  attrs=Attribute(key='headshot',
                                  path="./img/@src")),
        Extractor(label='akas',
                  path="//div[h4='Alternate Names:']",
                  attrs=Attribute(key='akas',
                                  path="./text()",
                                  postprocess=lambda x: x.strip().split(' '))),
        Extractor(label='filmography',
                  group="//div[starts-with(@id, 'filmo-head-')]",
                  group_key="./a[@name]/text()",
                  group_key_normalize=lambda x: x.lower().replace(': ', ' '),
                  path="./following-sibling::div[1]"
                       "/div[starts-with(@class, 'filmo-row')]",
                  attrs=_film_attrs),
        Extractor(label='indevelopment',
                  path="//div[starts-with(@class,'devitem')]",
                  attrs=Attribute(key='in development',
                                  multi=True,
                                  path={
                                      'link': './a/@href',
                                      'title': './a/text()'
                                  },
                                  postprocess=lambda x: build_movie(
                                      x.get('title') or u'',
                                      movieID=analyze_imdbid(x.get('link') or u''),
                                      roleID=(x.get('roleID') or u'').split('/'),
                                      status=x.get('status') or None)))
    ]

    preprocessors = [
        ('<div class="clear"/> </div>', ''),
        ('<br/>', '<br />'),
        # Tag character links with an imdbpyname attribute so the role
        # name survives text extraction.
        (re.compile(r'(<a href="/character/ch[0-9]{7}")>(.*?)</a>'),
         r'\1 imdbpyname="\2@@">\2</a>')
    ]

    def postprocess_data(self, data):
        """Clean up the extracted data: drop empty dates, derive
        imdbIndex from the name_index span, merge the per-section
        acting keys and rename place keys (backward compatibility)."""
        for what in 'birth date', 'death date':
            if what in data and not data[what]:
                del data[what]
        name_index = (data.get('name_index') or '').strip()
        if name_index:
            if self._name_imdb_index.match(name_index):
                # Strip the surrounding parentheses from e.g. '(IV)'.
                data['imdbIndex'] = name_index[1:-1]
            del data['name_index']
        # XXX: the code below is for backwards compatibility
        #      probably could be removed
        # Iterate over a snapshot: the dictionary is modified inside
        # the loop (unsafe over a live keys view on Python 3).
        for key in list(data.keys()):
            # Merge 'actor ...', 'actress ...' and 'self ...' section
            # keys into a single list per category ('in' replaces the
            # deprecated dict.has_key).
            for prefix in ('actor ', 'actress ', 'self '):
                if key.startswith(prefix):
                    target = prefix.strip()
                    if target not in data:
                        data[target] = []
                    data[target].extend(data[key])
                    del data[key]
                    break
            if key == 'birth place':
                data['birth notes'] = data[key]
                del data[key]
            if key == 'death place':
                data['death notes'] = data[key]
                del data[key]
        return data
class DOMHTMLCharacterMaindetailsParser(DOMHTMLMaindetailsParser):
    """Parser for the "biography" page of a given character.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        bparser = DOMHTMLCharacterMaindetailsParser()
        result = bparser.parse(character_biography_html_string)
    """
    _containsObjects = True

    # Filmography rows; roleIDs are extracted with the module-level
    # _personIDs regexp and _parsingCharacter is set — presumably the
    # listed roles are person IDs rather than character IDs (confirm
    # against build_movie).
    _film_attrs = [Attribute(key=None,
                             multi=True,
                             path={
                                 'link': "./a[1]/@href",
                                 'title': ".//text()",
                                 'status': "./i/a//text()",
                                 'roleID': "./a/@href"
                             },
                             postprocess=lambda x: build_movie(x.get('title') or u'',
                                 movieID=analyze_imdbid(x.get('link') or u''),
                                 roleID=_personIDs.findall(x.get('roleID') or u''),
                                 status=x.get('status') or None,
                                 _parsingCharacter=True))]

    extractors = [
        Extractor(label='title',
                  path="//title",
                  attrs=Attribute(key='name',
                                  path="./text()",
                                  # Drop the ' (Character)' suffix from
                                  # the page title.
                                  postprocess=lambda x:
                                      x.replace(' (Character)', '').strip())),
        Extractor(label='headshot',
                  path="//a[@name='headshot']",
                  attrs=Attribute(key='headshot',
                                  path="./img/@src")),
        Extractor(label='akas',
                  path="//div[h5='Alternate Names:']",
                  attrs=Attribute(key='akas',
                                  path="./text()",
                                  postprocess=lambda x: x.strip().split(' / '))),
        # Un-sectioned filmography (no h5 header in the div).
        Extractor(label='filmography',
                  path="//div[@class='filmo'][not(h5)]/ol/li",
                  attrs=_film_attrs),
        # Sectioned filmography, grouped by the h5 header text (the
        # [:-1] slice drops the trailing colon).
        Extractor(label='filmography sections',
                  group="//div[@class='filmo'][h5]",
                  group_key="./h5/a/text()",
                  group_key_normalize=lambda x: x.lower()[:-1],
                  path="./ol/li",
                  attrs=_film_attrs),
    ]

    preprocessors = [
        # Check that this doesn't cut "status"...
        (re.compile(r'<br>(\.\.\.| ).+?</li>', re.I | re.M), '</li>')]
class DOMHTMLMaindetailsParser(DOMParserBase):
    """Parser for the "categorized" (maindetails) page of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        cparser = DOMHTMLMaindetailsParser()
        result = cparser.parse(categorized_html_string)
    """
    _containsObjects = True

    # Birth day/year come from the /OnThisDay and /BornInYear links and
    # are combined by build_date; the /BornWhere link holds the place.
    _birth_attrs = [Attribute(key='birth date',
                              path={
                                  'day': "./a[starts-with(@href, "
                                         "'/OnThisDay?')]/text()",
                                  'year': "./a[starts-with(@href, "
                                          "'/BornInYear?')]/text()"
                              },
                              postprocess=lambda x: build_date(x)),
                    Attribute(key='birth notes',
                              path="./a[starts-with(@href, '/BornWhere?')]/text()")]
    _death_attrs = [Attribute(key='death date',
                              path={
                                  'day': "./a[starts-with(@href, "
                                         "'/OnThisDay?')]/text()",
                                  'year': "./a[starts-with(@href, "
                                          "'/DiedInYear?')]/text()"
                              },
                              postprocess=lambda x: build_date(x)),
                    Attribute(key='death notes',
                              path="./text()",
                              # TODO: check if this slicing is always correct
                              postprocess=lambda x: x.strip()[2:])]
    # Filmography rows; roleid values are '/'-joined in the
    # _imdbpyrole div (presumably injected by the _manageRoles
    # preprocessor below — confirm).
    _film_attrs = [Attribute(key=None,
                             multi=True,
                             path={
                                 'link': "./a[1]/@href",
                                 'title': ".//text()",
                                 'status': "./i/a//text()",
                                 'roleID': "./div[@class='_imdbpyrole']/@roleid"
                             },
                             postprocess=lambda x: build_movie(
                                 x.get('title') or u'',
                                 movieID=analyze_imdbid(x.get('link') or u''),
                                 roleID=(x.get('roleID') or u'').split('/'),
                                 status=x.get('status') or None))]

    extractors = [
        Extractor(label='page title',
                  path="//title",
                  attrs=Attribute(key='name',
                                  path="./text()",
                                  postprocess=lambda x:
                                      analyze_name(x, canonical=1))),
        Extractor(label='birth info',
                  path="//div[h5='Date of Birth:']",
                  attrs=_birth_attrs),
        Extractor(label='death info',
                  path="//div[h5='Date of Death:']",
                  attrs=_death_attrs),
        Extractor(label='headshot',
                  path="//a[@name='headshot']",
                  attrs=Attribute(key='headshot',
                                  path="./img/@src")),
        Extractor(label='akas',
                  path="//div[h5='Alternate Names:']",
                  attrs=Attribute(key='akas',
                                  path="./text()",
                                  postprocess=lambda x: x.strip().split(' | '))),
        # Filmography grouped by section header; [:-1] drops the
        # trailing colon from the header text.
        Extractor(label='filmography',
                  group="//div[@class='filmo'][h5]",
                  group_key="./h5/a[@name]/text()",
                  group_key_normalize=lambda x: x.lower()[:-1],
                  path="./ol/li",
                  attrs=_film_attrs)
    ]

    preprocessors = [
        # XXX: check that this doesn't cut "status" or other info...
        (re.compile(r'<br>(\.\.\.| ?).+?</li>', re.I | re.M | re.S),
         '</li>'),
        (_reRoles, _manageRoles)
    ]