示例#1
0
 def end_li(self):
     """Finish the current list item and store the movie gathered for it.

     Does nothing when parsing has been stopped or the parser is outside
     the content area.  Otherwise the imdbID-collecting flag is cleared
     and, if a movie entry is open and a title, an imdbID and a section
     name are all available, a movie object is built and appended to
     self._data[self._section].
     """
     def collapse(ids):
         # Empty -> None, exactly one entry -> that entry, otherwise as is.
         if not ids:
             return None
         if len(ids) == 1:
             return ids[0]
         return ids

     if self._stop_here or not self._in_content:
         return
     self._get_imdbID = False
     if not self._in_movie:
         return
     self._movie = self._movie.strip()
     self._cur_status = self._cur_status.strip()
     if not self._movie or not self._last_imdbID or not self._section:
         return
     # The normalized value is written back to the instance.
     self._cids = collapse(self._cids)
     # Add this movie to the list.
     kwds = dict(movieID=self._last_imdbID, status=self._cur_status,
                 roleID=self._cids, modFunct=self._modFunct,
                 accessSystem=self._as)
     if self.kind == 'character':
         # On character pages the role IDs come from the collected name IDs.
         kwds['_parsingCharacter'] = True
         kwds['roleID'] = collapse(self._last_nameIDs)
     movie = build_movie(self._movie, **kwds)
     self._data.setdefault(self._section, []).append(movie)
示例#2
0
 def end_li(self):
     """Close the current list item, recording its movie if it is complete.

     Returns early when parsing is stopped, when outside the content
     area, when no movie entry is open, or when the title, the imdbID or
     the section name is missing.  On success the built movie is appended
     to the list stored under self._section in self._data.
     """
     if self._stop_here or not self._in_content:
         return
     self._get_imdbID = False
     if not self._in_movie:
         return
     self._movie = self._movie.strip()
     self._cur_status = self._cur_status.strip()
     if not (self._movie and self._last_imdbID and self._section):
         return

     # Normalize the collected IDs in place: nothing -> None, one -> scalar.
     if not self._cids:
         self._cids = None
     elif len(self._cids) == 1:
         self._cids = self._cids[0]

     role_id = self._cids
     extra = {}
     if self.kind == 'character':
         # Character pages take the role from the name IDs instead.
         extra['_parsingCharacter'] = True
         role_id = self._last_nameIDs
         if not role_id:
             role_id = None
         elif len(role_id) == 1:
             role_id = role_id[0]

     # Add this movie to the list.
     movie = build_movie(self._movie,
                         movieID=self._last_imdbID,
                         status=self._cur_status,
                         roleID=role_id,
                         modFunct=self._modFunct,
                         accessSystem=self._as,
                         **extra)
     self._data.setdefault(self._section, []).append(movie)
示例#3
0
 def end_li(self):
     """Finish the current list item and store its movie, if any.

     A movie is built only when a section, an open list item, a movie ID
     and a non-empty item text are all present; a falsy result from
     build_movie is discarded.  The item text and the last movie ID are
     always reset before returning.
     """
     self._cur_item = self._cur_item.strip()
     complete = (self._section and self._in_li and self._last_movieid
                 and self._cur_item)
     if complete:
         self._in_li = False
         movie = build_movie(self._cur_item,
                             movieID=self._last_movieid,
                             modFunct=self._modFunct,
                             accessSystem=self._as,
                             _parsingCompany=True)
         if movie:
             self._data.setdefault(self._section, []).append(movie)
     # Reset the per-item state whether or not a movie was stored.
     self._cur_item = u''
     self._last_movieid = None
示例#4
0
class DOMCompanyParser(DOMParserBase):
    """Parser for the main page of a given company.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        cparser = DOMCompanyParser()
        result = cparser.parse(company_html_string)
    """
    _containsObjects = True

    extractors = [
        Extractor(label='name',
                  # note the extra trailing space in the class value
                  path="//h1/span[@class='display-title ']",
                  attrs=Attribute(
                      key='name',
                      path="./text()",
                      postprocess=lambda x:
                          analyze_company_name(x, stripNotes=True))),

        Extractor(label='filmography',
                  group="//b/a[@name]",
                  group_key="./text()",
                  group_key_normalize=lambda x: x.lower(),
                  path="../following-sibling::ol[1]/li",
                  attrs=Attribute(
                      key=None,
                      multi=True,
                      path={
                          'link': "./a[1]/@href",
                          'title': "./a[1]/text()",
                          'year': "./text()[1]"
                      },
                      # Every field is defaulted to an empty string so that
                      # an item without a title or year text does not raise.
                      postprocess=lambda x: build_movie(
                          u'%s %s' % (x.get('title') or u'',
                                      (x.get('year') or u'').strip()),
                          movieID=analyze_imdbid(x.get('link') or u''),
                          _parsingCompany=True))),
    ]

    preprocessors = [
        (re.compile('(<b><a name=)', re.I), r'</p>\1')
    ]

    def postprocess_data(self, data):
        """Rename section keys in place and return the same dictionary.

        In every key 'company' becomes 'companies', 'other' becomes
        'miscellaneous' and 'distributor' becomes 'distributors'.
        """
        # Iterate over a snapshot: keys are added and removed in the loop,
        # which is an error when keys() is a live view of the dictionary.
        for key in list(data.keys()):
            new_key = key.replace('company', 'companies')
            new_key = new_key.replace('other', 'miscellaneous')
            new_key = new_key.replace('distributor', 'distributors')
            if new_key != key:
                data[new_key] = data[key]
                del data[key]
        return data
示例#5
0
 def _add_info(self):
     """Store the movie currently being collected, then reset the state.

     The key and title are stripped first.  When a key, a title and a
     movie ID are all present, any text from the first '[' onwards is
     split off the title and kept as the movie's notes, and the movie is
     appended to self._info under the key (with 'X2D' replaced by '-').
     The title, movie ID and character ID are cleared in every case.
     """
     self._cur_key = self._cur_key.strip()
     self._cur_title = self._cur_title.strip()
     if self._cur_key and self._cur_title and self._cur_movieID:
         bracket_at = self._cur_title.find('[')
         if bracket_at == -1:
             notes = u''
         else:
             notes = self._cur_title[bracket_at:].lstrip()
             self._cur_title = self._cur_title[:bracket_at].rstrip()
         m = build_movie(self._cur_title, movieID=self._cur_movieID,
                         roleID=self._cur_characterID,
                         modFunct=self._modFunct, accessSystem=self._as)
         m.notes = notes
         section = self._cur_key.replace('X2D', '-')
         self._info.setdefault(section, []).append(m)
     # Clear the per-entry state, whether or not a movie was stored.
     self._cur_title = u''
     self._cur_movieID = None
     self._cur_characterID = None
示例#6
0
 def _add_info(self):
     """Record the pending movie under the current key and reset the state.

     Nothing is recorded unless the stripped key, the stripped title and
     the movie ID are all set.  Text starting at the first '[' in the
     title is moved into the movie's notes.  The pending title, movie ID
     and character ID are always cleared afterwards.
     """
     self._cur_key = self._cur_key.strip()
     self._cur_title = self._cur_title.strip()
     if not self._cur_key or not self._cur_title or not self._cur_movieID:
         self._cur_title, self._cur_movieID = u'', None
         self._cur_characterID = None
         return

     notes = u''
     split_at = self._cur_title.find('[')
     if split_at >= 0:
         # Everything from the bracket onwards is treated as notes.
         title = self._cur_title
         notes = title[split_at:].lstrip()
         self._cur_title = title[:split_at].rstrip()

     m = build_movie(
         self._cur_title,
         movieID=self._cur_movieID,
         roleID=self._cur_characterID,
         modFunct=self._modFunct,
         accessSystem=self._as)
     m.notes = notes
     self._info.setdefault(self._cur_key.replace('X2D', '-'), []).append(m)

     self._cur_title, self._cur_movieID = u'', None
     self._cur_characterID = None
示例#7
0
class DOMHTMLPersonGenresParser(DOMParserBase):
    """Parser for the "by genre" and "by keywords" pages of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        gparser = DOMHTMLPersonGenresParser()
        result = gparser.parse(bygenre_html_string)
    """
    # Key under which postprocess_data() wraps the parsed sections.
    kind = 'genres'
    _containsObjects = True

    extractors = [
        Extractor(label='genres',
                  group="//b/a[@name]/following-sibling::a[1]",
                  group_key="./text()",
                  group_key_normalize=lambda x: x.lower(),
                  path="../../following-sibling::ol[1]/li//a[1]",
                  attrs=Attribute(
                      key=None,
                      multi=True,
                      path={
                          'link': "./@href",
                          'title': "./text()",
                          'info': "./following-sibling::text()"
                      },
                      # Missing fields default to an empty string, so a link
                      # with no title or trailing text does not raise.  Only
                      # the part of 'info' before the first '[' is used.
                      postprocess=lambda x: build_movie(
                          (x.get('title') or u'') +
                          (x.get('info') or u'').split('[')[0],
                          analyze_imdbid(x.get('link') or u''))))
    ]

    def postprocess_data(self, data):
        """Wrap the parsed sections under self.kind.

        Returns an empty dictionary when nothing was parsed.
        """
        if not data:
            return {}
        return {self.kind: data}
示例#8
0
class DOMHTMLMaindetailsParser(DOMParserBase):
    """Parser for the "categorized" (maindetails) page of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        cparser = DOMHTMLMaindetailsParser()
        result = cparser.parse(categorized_html_string)
    """
    _containsObjects = True
    # Matches a parenthesized run of roman-numeral letters, e.g. "(II)".
    _name_imdb_index = re.compile(r'\([IVXLCDM]+\)')

    _birth_attrs = [Attribute(key='birth date',
                        path='.//time[@itemprop="birthDate"]/@datetime'),
                    Attribute(key='birth place',
                        path=".//a[starts-with(@href, " \
                                "'/search/name?birth_place=')]/text()")]
    _death_attrs = [Attribute(key='death date',
                        path='.//time[@itemprop="deathDate"]/@datetime'),
                    Attribute(key='death place',
                        path=".//a[starts-with(@href, " \
                                "'/search/name?death_place=')]/text()")]
    _film_attrs = [
        Attribute(key=None,
                  multi=True,
                  path={
                      'link': "./b/a[1]/@href",
                      'title': "./b/a[1]/text()",
                      'notes': "./b/following-sibling::text()",
                      'year': "./span[@class='year_column']/text()",
                      'status': "./a[@class='in_production']/text()",
                      'rolesNoChar': './/br/following-sibling::text()',
                      'chrRoles': "./a[@imdbpyname]/@imdbpyname",
                      'roleID': "./a[starts-with(@href, '/character/')]/@href"
                  },
                  postprocess=lambda x: build_movie(
                      x.get('title') or u'',
                      year=x.get('year'),
                      movieID=analyze_imdbid(x.get('link') or u''),
                      rolesNoChar=(x.get('rolesNoChar') or u'').strip(),
                      chrRoles=(x.get('chrRoles') or u'').strip(),
                      additionalNotes=x.get('notes'),
                      roleID=(x.get('roleID') or u''),
                      status=x.get('status') or None))
    ]

    extractors = [
            Extractor(label='name',
                        path="//h1[@class='header']",
                        attrs=Attribute(key='name',
                            path=".//text()",
                            postprocess=lambda x: analyze_name(x,
                                                               canonical=1))),
            Extractor(label='name_index',
                        path="//h1[@class='header']/span[1]",
                        attrs=Attribute(key='name_index',
                            path="./text()")),

            Extractor(label='birth info',
                        path="//div[h4='Born:']",
                        attrs=_birth_attrs),

            Extractor(label='death info',
                        path="//div[h4='Died:']",
                        attrs=_death_attrs),

            Extractor(label='headshot',
                        path="//td[@id='img_primary']/div[@class='image']/a",
                        attrs=Attribute(key='headshot',
                            path="./img/@src")),

            Extractor(label='akas',
                        path="//div[h4='Alternate Names:']",
                        attrs=Attribute(key='akas',
                            path="./text()",
                            postprocess=lambda x: x.strip().split('  '))),

            Extractor(label='filmography',
                        group="//div[starts-with(@id, 'filmo-head-')]",
                        group_key="./a[@name]/text()",
                        group_key_normalize=lambda x: x.lower().replace(': ', ' '),
                        path="./following-sibling::div[1]" \
                                "/div[starts-with(@class, 'filmo-row')]",
                        attrs=_film_attrs),

            # NOTE(review): the path below only defines 'link' and 'title',
            # so the 'roleID' and 'status' lookups in the lambda always fall
            # back to their defaults.  Left as is to keep the arguments
            # passed to build_movie unchanged.
            Extractor(label='indevelopment',
                        path="//div[starts-with(@class,'devitem')]",
                        attrs=Attribute(key='in development',
                            multi=True,
                            path={
                                'link': './a/@href',
                                'title': './a/text()'
                                },
                                postprocess=lambda x:
                                    build_movie(x.get('title') or u'',
                                        movieID=analyze_imdbid(x.get('link') or u''),
                                        roleID=(x.get('roleID') or u'').split('/'),
                                        status=x.get('status') or None)))
            ]

    preprocessors = [
        ('<div class="clear"/> </div>', ''), ('<br/>', '<br />'),
        (re.compile(r'(<a href="/character/ch[0-9]{7}")>(.*?)</a>'),
         r'\1 imdbpyname="\2@@">\2</a>')
    ]

    def postprocess_data(self, data):
        """Clean up the parsed dictionary in place and return it.

        - Empty 'birth date' / 'death date' entries are removed.
        - A non-blank 'name_index' is removed; when it looks like a
          parenthesized roman numeral, its content (without the
          parentheses) is stored under 'imdbIndex'.
        - Sections whose key starts with 'actor ', 'actress ' or 'self '
          are merged into the 'actor', 'actress' and 'self' lists.
        - 'birth place' / 'death place' are renamed to 'birth notes' /
          'death notes'.
        """
        for what in 'birth date', 'death date':
            if what in data and not data[what]:
                del data[what]
        name_index = (data.get('name_index') or '').strip()
        if name_index:
            if self._name_imdb_index.match(name_index):
                data['imdbIndex'] = name_index[1:-1]
            del data['name_index']
        # XXX: the code below is for backwards compatibility
        # probably could be removed
        # Iterate over a snapshot of the keys, since entries are added and
        # deleted inside the loop.
        for key in list(data.keys()):
            for role in ('actor', 'actress', 'self'):
                if key.startswith(role + ' '):
                    data.setdefault(role, []).extend(data[key])
                    del data[key]
                    break
            if key == 'birth place':
                data['birth notes'] = data[key]
                del data[key]
            if key == 'death place':
                data['death notes'] = data[key]
                del data[key]
        return data
class DOMHTMLCharacterMaindetailsParser(DOMHTMLMaindetailsParser):
    """Parser for the "maindetails" page of a given character.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        cparser = DOMHTMLCharacterMaindetailsParser()
        result = cparser.parse(character_html_string)
    """
    _containsObjects = True

    # Replaces the parent's film attributes: role IDs are pulled out of the
    # item's link hrefs with _personIDs, and movies are built in
    # character-parsing mode.
    _film_attrs = [
        Attribute(
            key=None,
            multi=True,
            path={
                'link': "./a[1]/@href",
                'title': ".//text()",
                'status': "./i/a//text()",
                'roleID': "./a/@href"
            },
            postprocess=lambda x: build_movie(
                x.get('title') or u'',
                movieID=analyze_imdbid(x.get('link') or u''),
                roleID=_personIDs.findall(x.get('roleID') or u''),
                status=x.get('status') or None,
                _parsingCharacter=True))
    ]

    extractors = [
        Extractor(
            label='title',
            path="//title",
            attrs=Attribute(
                key='name',
                path="./text()",
                postprocess=lambda x: x.replace(' (Character)', '').strip())),

        Extractor(
            label='headshot',
            path="//a[@name='headshot']",
            attrs=Attribute(key='headshot', path="./img/@src")),

        Extractor(
            label='akas',
            path="//div[h5='Alternate Names:']",
            attrs=Attribute(
                key='akas',
                path="./text()",
                postprocess=lambda x: x.strip().split(' / '))),

        # Filmography blocks without an <h5> heading.
        Extractor(
            label='filmography',
            path="//div[@class='filmo'][not(h5)]/ol/li",
            attrs=_film_attrs),

        # Filmography blocks with an <h5> heading; the section key is the
        # lower-cased heading text minus its last character.
        Extractor(
            label='filmography sections',
            group="//div[@class='filmo'][h5]",
            group_key="./h5/a/text()",
            group_key_normalize=lambda x: x.lower()[:-1],
            path="./ol/li",
            attrs=_film_attrs),
    ]

    preprocessors = [
        # Check that this doesn't cut "status"...
        (re.compile(r'<br>(\.\.\.|   ).+?</li>', re.I | re.M), '</li>')
    ]
示例#10
0
class DOMHTMLMaindetailsParser(DOMParserBase):
    """Parser for the "categorized" (maindetails) page of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        cparser = DOMHTMLMaindetailsParser()
        result = cparser.parse(categorized_html_string)
    """
    _containsObjects = True

    # Birth date (day and year links combined by build_date) and birth place.
    _birth_attrs = [
        Attribute(
            key='birth date',
            path={
                'day': "./a[starts-with(@href, '/OnThisDay?')]/text()",
                'year': "./a[starts-with(@href, '/BornInYear?')]/text()"
            },
            postprocess=lambda x: build_date(x)),
        Attribute(
            key='birth notes',
            path="./a[starts-with(@href, '/BornWhere?')]/text()")
    ]

    # Death date (same shape as the birth date) and free-text death notes.
    _death_attrs = [
        Attribute(
            key='death date',
            path={
                'day': "./a[starts-with(@href, '/OnThisDay?')]/text()",
                'year': "./a[starts-with(@href, '/DiedInYear?')]/text()"
            },
            postprocess=lambda x: build_date(x)),
        Attribute(
            key='death notes',
            path="./text()",
            # Drops the first two characters of the stripped text.
            # TODO: check if this slicing is always correct
            postprocess=lambda x: x.strip()[2:])
    ]

    # One movie per filmography item; roleID is a '/'-separated attribute.
    _film_attrs = [
        Attribute(
            key=None,
            multi=True,
            path={
                'link': "./a[1]/@href",
                'title': ".//text()",
                'status': "./i/a//text()",
                'roleID': "./div[@class='_imdbpyrole']/@roleid"
            },
            postprocess=lambda x: build_movie(
                x.get('title') or u'',
                movieID=analyze_imdbid(x.get('link') or u''),
                roleID=(x.get('roleID') or u'').split('/'),
                status=x.get('status') or None))
    ]

    extractors = [
        Extractor(
            label='page title',
            path="//title",
            attrs=Attribute(
                key='name',
                path="./text()",
                postprocess=lambda x: analyze_name(x, canonical=1))),

        Extractor(
            label='birth info',
            path="//div[h5='Date of Birth:']",
            attrs=_birth_attrs),

        Extractor(
            label='death info',
            path="//div[h5='Date of Death:']",
            attrs=_death_attrs),

        Extractor(
            label='headshot',
            path="//a[@name='headshot']",
            attrs=Attribute(key='headshot', path="./img/@src")),

        Extractor(
            label='akas',
            path="//div[h5='Alternate Names:']",
            attrs=Attribute(
                key='akas',
                path="./text()",
                postprocess=lambda x: x.strip().split(' | '))),

        # The section key is the lower-cased heading text minus its last
        # character.
        Extractor(
            label='filmography',
            group="//div[@class='filmo'][h5]",
            group_key="./h5/a[@name]/text()",
            group_key_normalize=lambda x: x.lower()[:-1],
            path="./ol/li",
            attrs=_film_attrs)
    ]

    preprocessors = [
        # XXX: check that this doesn't cut "status" or other info...
        (re.compile(r'<br>(\.\.\.|    ?).+?</li>', re.I | re.M | re.S),
         '</li>'),
        (_reRoles, _manageRoles)
    ]