Пример #1
0
 def get_person_main(self, personID, _parseChr=False):
     """Retrieve and parse the main page of a person (or of a character).

     The page is fetched with ``self._mretrieve`` and scraped with plain
     string searches for: the name, birth/death dates and notes, alternate
     names, the headshot URL and the filmography, grouped by section.

     Args:
         personID: ID interpolated into the page URL template.
         _parseChr: when true, the character page URL is used instead of
             the person one and character-specific parsing rules apply
             (the ID is then reported as a characterID in errors).

     Returns:
         A dict of the form
         ``{'data': <parsed info>, 'info sets': ('main', 'filmography')}``.

     Raises:
         IMDbDataAccessError: if no ``<title>`` can be found in the page.
     """
     if not _parseChr:
         url = imdbURL_person_main % personID + 'maindetails'
     else:
         url = imdbURL_character_main % personID
     s = self._mretrieve(url)
     name = _findBetween(s, '<title>', '</title>', maxRes=1)
     if not name:
         w = 'characterID' if _parseChr else 'personID'
         raise IMDbDataAccessError('unable to get %s "%s"' % (w, personID))
     name = _unHtml(name[0])
     # Strip the decorations the site appends to the page title.
     if _parseChr:
         name = name.replace('(Character)', '').strip()
         name = name.replace('- Filmography by type', '').strip()
     else:
         name = name.replace('- Filmography by', '').strip()
     # The result container is whatever analyze_name returns; every other
     # piece of information is stored into it below.
     r = analyze_name(name, canonical=not _parseChr)
     for dKind in ('birth', 'death'):
         date = _findBetween(s,
                             '<h5>Date of %s:</h5>' % dKind.capitalize(),
                             ('<a class', '</div>', '<br/><br/>'),
                             maxRes=1)
         if date:
             date = _unHtml(date[0])
             if date:
                 date, notes = date_and_notes(date)
                 if date:
                     r['%s date' % dKind] = date
                 if notes:
                     r['%s notes' % dKind] = notes
     akas = _findBetween(s,
                         'Alternate Names:</h5>', ('</div>', '<br/><br/>'),
                         maxRes=1)
     if akas:
         akas = akas[0]
         # Alternate names are separated either by ' | ' or by ' / '.
         if akas.find(' | ') != -1:
             akas = _unHtml(akas).split(' | ')
         else:
             akas = _unHtml(akas).split(' / ')
         if akas:
             r['akas'] = akas
     hs = _findBetween(s, 'name="headshot"', '</a>', maxRes=1)
     if hs:
         hs[:] = _findBetween(hs[0], 'src="', '"', maxRes=1)
         if hs:
             r['headshot'] = hs[0]
     # Build a list of tuples such [('hrefLink', 'section name')]
     workkind = _findBetween(s,
                             '<div class="strip jump">',
                             '</div>',
                             maxRes=1)
     if workkind:
         workkind[:] = _findBetween(workkind[0], 'href="#', '</a>')
     else:
         # Assume there's only one section and/or there are no
         # section links, for some reason.
         workkind[:] = _findBetween(s, '<h5><a name=', '</a></h5>')
         workkind[:] = [x.lstrip('"').rstrip(':').lower() for x in workkind]
     ws = []
     for work in workkind:
         # Each entry looks like 'anchor">Section Name'.
         wsplit = work.split('">', 1)
         if len(wsplit) == 2:
             sect = wsplit[0]
             # Drop anything following the closing quote of the anchor.
             if '"' in sect:
                 sect = sect[:sect.find('"')]
             ws.append((sect, wsplit[1].lower()))
     # XXX: I think "guest appearances" are gone.
     if s.find('<a href="#guest-appearances"') != -1:
         ws.append(('guest-appearances', 'notable tv guest appearances'))
     for sect, sectName in ws:
         raws = u''
         # Everything between the current section link and the end
         # of the <ol> tag.
         if _parseChr and sect == 'filmography':
             inisect = s.find('<div class="filmo">')
         else:
             inisect = s.find('<a name="%s' % sect)
         if inisect != -1:
             endsect = s[inisect:].find('</ol>')
             if endsect != -1:
                 raws = s[inisect:inisect + endsect]
         if not raws:
             continue
         mlist = _findBetween(raws, '<li>', ('</li>', '<br>', '<br/>'))
         for m in mlist:
             # For every movie in the current section.
             movieID = re_imdbID.findall(m)
             if not movieID:
                 self._mobile_logger.debug('no movieID in %s', m)
                 continue
             if not _parseChr:
                 chrIndx = m.find(' .... ')
             else:
                 chrIndx = m.find(' Played by ')
             chids = []
             if chrIndx != -1:
                 # ' .... ' is 6 characters long; ' Played by ' is 11,
                 # hence the 5 extra characters skipped for characters.
                 chrtxt = m[chrIndx + 6:]
                 if _parseChr:
                     chrtxt = chrtxt[5:]
                 for ch in chrtxt.split(' / '):
                     chid = re_imdbID.findall(ch)
                     if not chid:
                         chids.append(None)
                     else:
                         chids.append(chid[-1])
             # Collapse the role IDs: None when there are none, a bare
             # value when there is exactly one, a list otherwise.
             if not chids:
                 chids = None
             elif len(chids) == 1:
                 chids = chids[0]
             movieID = str(movieID[0])
             # Search the status.
             stidx = m.find('<i>')
             status = u''
             if stidx != -1:
                 stendidx = m.rfind('</i>')
                 if stendidx != -1:
                     status = _unHtml(m[stidx + 3:stendidx])
                     # NOTE(review): str.replace removes every occurrence
                     # of the status text in m, not only the one between
                     # <i> and </i> -- confirm this is intended.
                     m = m.replace(m[stidx + 3:stendidx], '')
             m = _unHtml(m)
             if not m:
                 self._mobile_logger.warning('no title for movieID %s',
                                             movieID)
                 continue
             movie = build_movie(m,
                                 movieID=movieID,
                                 status=status,
                                 roleID=chids,
                                 modFunct=self._defModFunct,
                                 accessSystem=self.accessSystem,
                                 _parsingCharacter=_parseChr)
             r.setdefault(sectName, []).append(movie)
     # If available, take the always correct name from a form.
     itag = _getTagsWith(s, 'NAME="primary"', maxRes=1)
     if not itag:
         itag = _getTagsWith(s, 'name="primary"', maxRes=1)
     if itag:
         vtag = _findBetween(itag[0], 'VALUE="', ('"', '>'), maxRes=1)
         if not vtag:
             vtag = _findBetween(itag[0], 'value="', ('"', '>'), maxRes=1)
         if vtag:
             try:
                 vtag = unquote(str(vtag[0]))
                 vtag = unicode(vtag, 'latin_1')
                 r.update(analyze_name(vtag))
             except UnicodeEncodeError:
                 # Best effort: keep the name parsed from the page title
                 # when the form value cannot be converted.
                 pass
     return {'data': r, 'info sets': ('main', 'filmography')}
Пример #2
0
 def get_person_main(self, personID, _parseChr=False):
     """Retrieve and parse the main page of a person (or of a character).

     The page is fetched with ``self._mretrieve`` and scraped with plain
     string searches for: the name, birth/death dates and notes, alternate
     names, the headshot URL and the filmography, grouped by section.

     Args:
         personID: ID interpolated into the page URL template taken from
             ``self.urls``.
         _parseChr: when true, the character page URL is used instead of
             the person one and character-specific parsing rules apply
             (the ID is then reported as a characterID in errors).

     Returns:
         A dict of the form
         ``{'data': <parsed info>, 'info sets': ('main', 'filmography')}``.

     Raises:
         IMDbDataAccessError: if no ``<title>`` can be found in the page.
     """
     if not _parseChr:
         url = self.urls['person_main'] % personID + 'maindetails'
     else:
         url = self.urls['character_main'] % personID
     s = self._mretrieve(url)
     name = _findBetween(s, '<title>', '</title>', maxRes=1)
     if not name:
         w = 'characterID' if _parseChr else 'personID'
         raise IMDbDataAccessError('unable to get %s "%s"' % (w, personID))
     name = _unHtml(name[0].replace(' - IMDb', ''))
     # Strip the decorations the site appends to the page title.
     if _parseChr:
         name = name.replace('(Character)', '').strip()
         name = name.replace('- Filmography by type', '').strip()
     else:
         name = name.replace('- Filmography by', '').strip()
     # The result container is whatever analyze_name returns; every other
     # piece of information is stored into it below.
     r = analyze_name(name, canonical=not _parseChr)
     # (label shown in the page, prefix of the keys stored in the result)
     for dKind, dtitle in (('Born', 'birth'), ('Died', 'death')):
         date = _findBetween(s, '%s:</h4>' % dKind,
                             ('<div class', '</div>', '<br/><br/>'), maxRes=1)
         if date:
             date = _unHtml(date[0])
             if date:
                 # TODO: fix to handle real names.
                 # The text is split by hand on the first ' in ': what
                 # precedes it is the date, what follows goes to the notes.
                 date_notes = date.split(' in ', 1)
                 notes = u''
                 date = date_notes[0]
                 if len(date_notes) == 2:
                     notes = date_notes[1]
                 if date:
                     r['%s date' % dtitle] = date
                 if notes:
                     r['%s notes' % dtitle] = notes
     akas = _findBetween(s, 'Alternate Names:</h4>', ('</div>',
                         '<br/><br/>'), maxRes=1)
     if akas:
         akas = akas[0]
         if akas:
             akas = _unHtml(akas)
         # Alternate names are separated either by ' | ' or by ' / '.
         if akas.find(' | ') != -1:
             akas = akas.split(' | ')
         else:
             akas = akas.split(' / ')
         if akas:
             # Strip every name and drop the empty ones; an explicit list
             # is built so the stored value is never a lazy iterator.
             r['akas'] = [x for x in (y.strip() for y in akas) if x]
     # Look for the picture link, trying both quoting styles and then
     # falling back to the "headshot" anchor.
     hs = _findBetween(s, "rel='image_src'", '>', maxRes=1)
     if not hs:
         hs = _findBetween(s, 'rel="image_src"', '>', maxRes=1)
     if not hs:
         hs = _findBetween(s, '<a name="headshot"', '</a>', maxRes=1)
     if hs:
         hsl = _findBetween(hs[0], "href='", "'", maxRes=1)
         if not hsl:
             hsl = _findBetween(hs[0], 'href="', '"', maxRes=1)
         # Links containing 'imdb-share-logo' are not stored as headshots.
         if hsl and 'imdb-share-logo' not in hsl[0]:
             r['headshot'] = hsl[0]
     # Build a list of tuples such [('hrefLink', 'section name')]
     workkind = _findBetween(s, 'id="jumpto_', '</a>')
     ws = []
     for work in workkind:
         # The anchor and the section name are separated by '">' or,
         # when that is missing, by '" >'.
         sep = '" >'
         if '">' in work:
             sep = '">'
         wsplit = work.split(sep, 1)
         if len(wsplit) == 2:
             sect = wsplit[0]
             # Drop anything following the closing quote of the anchor.
             if '"' in sect:
                 sect = sect[:sect.find('"')]
             ws.append((sect, wsplit[1].lower()))
     # XXX: I think "guest appearances" are gone.
     if s.find('<a href="#guest-appearances"') != -1:
         ws.append(('guest-appearances', 'notable tv guest appearances'))
     for sect, sectName in ws:
         raws = u''
         if sectName == 'self':
             sect = 'Self'
         # Everything between the current section anchor and the start
         # of the next filmography header (or of the next article).
         if _parseChr and sect == 'filmography':
             inisect = s.find('<div class="filmo">')
         else:
             inisect = s.find('<a name="%s' % sect)
         if inisect != -1:
             endsect = s[inisect:].find('<div id="filmo-head-')
             if endsect == -1:
                 endsect = s[inisect:].find('<div class="article"')
             if endsect != -1:
                 raws = s[inisect:inisect+endsect]
         # NOTE(review): sections with an empty raws are not skipped
         # explicitly; presumably _findBetween yields no rows for them.
         mlist = _findBetween(raws, '<div class="filmo-row',
                 ('<div class="clear"/>',))
         for m in mlist:
             # Skip the remainder of the opening <div ...> tag.
             fCB = m.find('>')
             if fCB != -1:
                 m = m[fCB+1:].lstrip()
             m = re_filmo_episodes.sub('', m)
             # For every movie in the current section.
             movieID = re_imdbID.findall(m)
             if not movieID:
                 self._mobile_logger.debug('no movieID in %s', m)
                 continue
             # Turn the first line break into the ' .... ' separator
             # searched for below.
             m = m.replace('<br/>', ' .... ', 1)
             if not _parseChr:
                 chrIndx = m.find(' .... ')
             else:
                 chrIndx = m.find(' Played by ')
             chids = []
             if chrIndx != -1:
                 # ' .... ' is 6 characters long; ' Played by ' is 11,
                 # hence the 5 extra characters skipped for characters.
                 chrtxt = m[chrIndx+6:]
                 if _parseChr:
                     chrtxt = chrtxt[5:]
                 for ch in chrtxt.split(' / '):
                     chid = re_imdbID.findall(ch)
                     if not chid:
                         chids.append(None)
                     else:
                         chids.append(chid[-1])
             # Collapse the role IDs: None when there are none, a bare
             # value when there is exactly one, a list otherwise.
             if not chids:
                 chids = None
             elif len(chids) == 1:
                 chids = chids[0]
             movieID = str(movieID[0])
             # Search the status.
             stidx = m.find('<i>')
             status = u''
             if stidx != -1:
                 stendidx = m.rfind('</i>')
                 if stendidx != -1:
                     status = _unHtml(m[stidx+3:stendidx])
                     # NOTE(review): str.replace removes every occurrence
                     # of the status text in m, not only the one between
                     # <i> and </i> -- confirm this is intended.
                     m = m.replace(m[stidx+3:stendidx], '')
             # The year is passed separately to build_movie, so its span
             # is removed from the text the title is parsed from.
             year = _findBetween(m, 'year_column">', '</span>', maxRes=1)
             if year:
                 year = year[0]
                 m = m.replace('<span class="year_column">%s</span>' % year,
                         '')
             else:
                 year = None
             m = _unHtml(m)
             if not m:
                 self._mobile_logger.warning('no title for movieID %s',
                                             movieID)
                 continue
             movie = build_movie(m, movieID=movieID, status=status,
                                 roleID=chids, modFunct=self._defModFunct,
                                 accessSystem=self.accessSystem,
                                 _parsingCharacter=_parseChr, year=year)
             # Only the part of the section name before the first ':' is
             # used as key.
             sectName = sectName.split(':')[0]
             r.setdefault(sectName, []).append(movie)
     # If available, take the always correct name from a form.
     itag = _getTagsWith(s, 'NAME="primary"', maxRes=1)
     if not itag:
         itag = _getTagsWith(s, 'name="primary"', maxRes=1)
     if itag:
         vtag = _findBetween(itag[0], 'VALUE="', ('"', '>'), maxRes=1)
         if not vtag:
             vtag = _findBetween(itag[0], 'value="', ('"', '>'), maxRes=1)
         if vtag:
             try:
                 vtag = unquote(str(vtag[0]))
                 vtag = unicode(vtag, 'latin_1')
                 r.update(analyze_name(vtag))
             except UnicodeEncodeError:
                 # Best effort: keep the name parsed from the page title
                 # when the form value cannot be converted.
                 pass
     return {'data': r, 'info sets': ('main', 'filmography')}
Пример #3
0
 def get_person_main(self, personID, _parseChr=False):
     """Retrieve and parse the main page of a person (or of a character).

     The page is fetched with ``self._mretrieve`` and scraped with plain
     string searches for: the name, birth/death dates and notes, alternate
     names, the headshot URL, the filmography grouped by section and the
     URL of the picture in the "photo" box.

     Args:
         personID: ID interpolated into the page URL template.
         _parseChr: when true, the character page URL is used instead of
             the person one and character-specific parsing rules apply
             (the ID is then reported as a characterID in errors).

     Returns:
         A dict of the form
         ``{'data': <parsed info>, 'info sets': ('main', 'filmography')}``.
         The parsed info always carries an ``'image_url'`` key, set to an
         empty string when no picture URL is found.

     Raises:
         IMDbDataAccessError: if no ``<title>`` can be found in the page.
     """
     if not _parseChr:
         url = imdbURL_person_main % personID + 'maindetails'
     else:
         url = imdbURL_character_main % personID
     s = self._mretrieve(url)
     name = _findBetween(s, '<title>', '</title>', maxRes=1)
     if not name:
         w = 'characterID' if _parseChr else 'personID'
         raise IMDbDataAccessError('unable to get %s "%s"' % (w, personID))
     name = _unHtml(name[0])
     if _parseChr:
         name = name.replace('(Character)', '').strip()
     # The result container is whatever analyze_name returns; every other
     # piece of information is stored into it below.
     r = analyze_name(name, canonical=not _parseChr)
     for dKind in ('birth', 'death'):
         date = _findBetween(s, '<h5>Date of %s:</h5>' % dKind.capitalize(),
                             ('<a class', '</div>', '<br/><br/>'), maxRes=1)
         if date:
             date = _unHtml(date[0])
             if date:
                 date, notes = date_and_notes(date)
                 if date:
                     r['%s date' % dKind] = date
                 if notes:
                     r['%s notes' % dKind] = notes
     akas = _findBetween(s, 'Alternate Names:</h5>', ('</div>',
                         '<br/><br/>'), maxRes=1)
     if akas:
         akas = akas[0]
         # Alternate names are separated either by ' | ' or by ' / '.
         if akas.find(' | ') != -1:
             akas = _unHtml(akas).split(' | ')
         else:
             akas = _unHtml(akas).split(' / ')
         if akas:
             r['akas'] = akas
     hs = _findBetween(s, 'name="headshot"', '</a>', maxRes=1)
     if hs:
         hs[:] = _findBetween(hs[0], 'src="', '"', maxRes=1)
         if hs:
             r['headshot'] = hs[0]
     # Build a list of tuples such [('hrefLink', 'section name')]
     workkind = _findBetween(s, '<div class="strip jump">', '</div>',
                             maxRes=1)
     if workkind:
         workkind[:] = _findBetween(workkind[0], 'href="#', '</a>')
     else:
         # Assume there's only one section and/or there are no
         # section links, for some reason.
         workkind[:] = _findBetween(s, '<h5><a name=', '</a></h5>')
         workkind[:] = [x.lstrip('"').rstrip(':').lower() for x in workkind]
     ws = []
     for work in workkind:
         # Each entry looks like 'anchor">Section Name'.
         wsplit = work.split('">', 1)
         if len(wsplit) == 2:
             ws.append((wsplit[0], wsplit[1].lower()))
     # XXX: I think "guest appearances" are gone.
     if s.find('<a href="#guest-appearances"') != -1:
         ws.append(('guest-appearances', 'notable tv guest appearances'))
     if _parseChr:
         ws.append(('filmography', 'filmography'))
     for sect, sectName in ws:
         raws = u''
         # Everything between the current section link and the end
         # of the <ol> tag.
         if _parseChr and sect == 'filmography':
             inisect = s.find('<div class="filmo">')
         else:
             inisect = s.find('<a name="%s' % sect)
         if inisect != -1:
             endsect = s[inisect:].find('</ol>')
             if endsect != -1:
                 raws = s[inisect:inisect+endsect]
         if not raws:
             continue
         mlist = _findBetween(raws, '<li>', ('</li>', '<br>', '<br/>'))
         for m in mlist:
             # For every movie in the current section.
             movieID = re_imdbID.findall(m)
             if not movieID:
                 continue
             if not _parseChr:
                 chrIndx = m.find(' .... ')
             else:
                 chrIndx = m.find(' Played by ')
             chids = []
             if chrIndx != -1:
                 # ' .... ' is 6 characters long; ' Played by ' is 11,
                 # hence the 5 extra characters skipped for characters.
                 chrtxt = m[chrIndx+6:]
                 if _parseChr:
                     chrtxt = chrtxt[5:]
                 for ch in chrtxt.split(' / '):
                     chid = re_imdbID.findall(ch)
                     if not chid:
                         chids.append(None)
                     else:
                         chids.append(chid[-1])
             # Collapse the role IDs: None when there are none, a bare
             # value when there is exactly one, a list otherwise.
             if not chids:
                 chids = None
             elif len(chids) == 1:
                 chids = chids[0]
             movieID = str(movieID[0])
             # Search the status.
             stidx = m.find('<i>')
             status = u''
             if stidx != -1:
                 stendidx = m.rfind('</i>')
                 if stendidx != -1:
                     status = _unHtml(m[stidx+3:stendidx])
                     # NOTE(review): str.replace removes every occurrence
                     # of the status text in m, not only the one between
                     # <i> and </i> -- confirm this is intended.
                     m = m.replace(m[stidx+3:stendidx], '')
             m = _unHtml(m)
             if not m:
                 continue
             movie = build_movie(m, movieID=movieID, status=status,
                                 roleID=chids, modFunct=self._defModFunct,
                                 accessSystem=self.accessSystem,
                                 _parsingCharacter=_parseChr)
             r.setdefault(sectName, []).append(movie)
     # If available, take the always correct name from a form.
     itag = _getTagsWith(s, 'NAME="primary"', maxRes=1)
     if not itag:
         itag = _getTagsWith(s, 'name="primary"', maxRes=1)
     if itag:
         vtag = _findBetween(itag[0], 'VALUE="', ('"', '>'), maxRes=1)
         if not vtag:
             vtag = _findBetween(itag[0], 'value="', ('"', '>'), maxRes=1)
         if vtag:
             try:
                 vtag = unquote(str(vtag[0]))
                 vtag = unicode(vtag, 'latin_1')
                 r.update(analyze_name(vtag, canonical=0))
             except UnicodeEncodeError:
                 # Best effort: keep the name parsed from the page title
                 # when the form value cannot be converted.
                 pass

     # Picture URL from the "photo" box.  Every lookup is checked before
     # being indexed, so an <img> without a src attribute leaves the
     # default empty string instead of raising IndexError.
     image_url = ''
     photo = _findBetween(s, '<div class="photo">', '</div>', maxRes=1)
     if photo:
         img = _findBetween(photo[0], '<img', '/a>', maxRes=1)
         if img:
             src = _findBetween(img[0], ' src="', '"', maxRes=1)
             if src:
                 image_url = src[0]
     r['image_url'] = image_url

     return {'data': r, 'info sets': ('main', 'filmography')}