def get_person_main(self, personID, _parseChr=False):
    """Fetch and parse the main page of a person (or of a character).

    personID is interpolated into the person URL template, or into the
    character URL template when _parseChr is true.

    Returns a dictionary with two keys: 'data', holding the parsed
    information, and 'info sets', always ('main', 'filmography').
    'data' starts as the result of analyze_name() on the page title and
    may gain 'birth date', 'birth notes', 'death date', 'death notes',
    'akas', 'headshot' and one list of build_movie() results per
    filmography section (keyed by the lower-cased section name).

    Raises IMDbDataAccessError if the page has no <title> element.
    """
    if not _parseChr:
        url = imdbURL_person_main % personID + 'maindetails'
    else:
        url = imdbURL_character_main % personID
    s = self._mretrieve(url)
    name = _findBetween(s, '<title>', '</title>', maxRes=1)
    if not name:
        if _parseChr:
            w = 'characterID'
        else:
            w = 'personID'
        # Call form of raise: valid on both Python 2 and Python 3.
        raise IMDbDataAccessError('unable to get %s "%s"' % (w, personID))
    name = _unHtml(name[0])
    # Strip the boilerplate that the site adds around the name in the title.
    if _parseChr:
        name = name.replace('(Character)', '').strip()
        name = name.replace('- Filmography by type', '').strip()
    else:
        name = name.replace('- Filmography by', '').strip()
    r = analyze_name(name, canonical=not _parseChr)
    for dKind in ('birth', 'death'):
        date = _findBetween(s, '<h5>Date of %s:</h5>' % dKind.capitalize(),
                            ('<a class', '</div>', '<br/><br/>'), maxRes=1)
        if date:
            date = _unHtml(date[0])
            if date:
                date, notes = date_and_notes(date)
                if date:
                    r['%s date' % dKind] = date
                if notes:
                    r['%s notes' % dKind] = notes
    akas = _findBetween(s, 'Alternate Names:</h5>',
                        ('</div>', '<br/><br/>'), maxRes=1)
    if akas:
        akas = akas[0]
        # Two separators are handled: ' | ' is tried first, then ' / '.
        if akas.find(' | ') != -1:
            akas = _unHtml(akas).split(' | ')
        else:
            akas = _unHtml(akas).split(' / ')
        if akas:
            r['akas'] = akas
    hs = _findBetween(s, 'name="headshot"', '</a>', maxRes=1)
    if hs:
        hs[:] = _findBetween(hs[0], 'src="', '"', maxRes=1)
        if hs:
            r['headshot'] = hs[0]
    # Build a list of tuples such as [('hrefLink', 'section name')].
    workkind = _findBetween(s, '<div class="strip jump">', '</div>',
                            maxRes=1)
    if workkind:
        workkind[:] = _findBetween(workkind[0], 'href="#', '</a>')
    else:
        # Assume there's only one section and/or there are no
        # section links, for some reason.
        workkind[:] = _findBetween(s, '<h5><a name=', '</a></h5>')
        # NOTE(review): this clean-up (leading quote, trailing colon) is
        # kept inside the fallback branch, since that is the markup it
        # matches - confirm against the upstream layout.
        workkind[:] = [x.lstrip('"').rstrip(':').lower() for x in workkind]
    ws = []
    for work in workkind:
        wsplit = work.split('">', 1)
        if len(wsplit) == 2:
            sect = wsplit[0]
            # Drop anything following the anchor name (other attributes).
            if '"' in sect:
                sect = sect[:sect.find('"')]
            ws.append((sect, wsplit[1].lower()))
    # XXX: I think "guest appearances" are gone.
    if s.find('<a href="#guest-appearances"') != -1:
        ws.append(('guest-appearances', 'notable tv guest appearances'))
    #if _parseChr:
    #    ws.append(('filmography', 'filmography'))
    for sect, sectName in ws:
        raws = u''
        # Everything between the current section link and the end
        # of the <ol> tag.
        if _parseChr and sect == 'filmography':
            inisect = s.find('<div class="filmo">')
        else:
            inisect = s.find('<a name="%s' % sect)
        if inisect != -1:
            # Search from inisect instead of copying the tail of the page.
            endsect = s.find('</ol>', inisect)
            if endsect != -1:
                raws = s[inisect:endsect]
        if not raws:
            continue
        mlist = _findBetween(raws, '<li>', ('</li>', '<br>', '<br/>'))
        for m in mlist:
            # For every movie in the current section.
            movieID = re_imdbID.findall(m)
            if not movieID:
                self._mobile_logger.debug('no movieID in %s', m)
                continue
            if not _parseChr:
                chrIndx = m.find(' .... ')
            else:
                chrIndx = m.find(' Played by ')
            chids = []
            if chrIndx != -1:
                # ' .... ' is 6 characters long; ' Played by ' is 11,
                # hence the 5 extra characters skipped for characters.
                chrtxt = m[chrIndx + 6:]
                if _parseChr:
                    chrtxt = chrtxt[5:]
                for ch in chrtxt.split(' / '):
                    chid = re_imdbID.findall(ch)
                    if not chid:
                        chids.append(None)
                    else:
                        chids.append(chid[-1])
            # No role -> None; a single role -> its bare ID; else a list.
            if not chids:
                chids = None
            elif len(chids) == 1:
                chids = chids[0]
            movieID = str(movieID[0])
            # Search the status.
            stidx = m.find('<i>')
            status = u''
            if stidx != -1:
                stendidx = m.rfind('</i>')
                if stendidx != -1:
                    ststart = stidx + 3
                    status = _unHtml(m[ststart:stendidx])
                    # Cut out only the text between <i> and </i>;
                    # str.replace() would also delete every other
                    # occurrence of the same text (e.g. in the title).
                    if stendidx > ststart:
                        m = m[:ststart] + m[stendidx:]
            m = _unHtml(m)
            if not m:
                self._mobile_logger.warn('no title for movieID %s', movieID)
                continue
            movie = build_movie(m, movieID=movieID, status=status,
                                roleID=chids, modFunct=self._defModFunct,
                                accessSystem=self.accessSystem,
                                _parsingCharacter=_parseChr)
            r.setdefault(sectName, []).append(movie)
    # If available, take the always correct name from a form.
    itag = _getTagsWith(s, 'NAME="primary"', maxRes=1)
    if not itag:
        itag = _getTagsWith(s, 'name="primary"', maxRes=1)
    if itag:
        vtag = _findBetween(itag[0], 'VALUE="', ('"', '>'), maxRes=1)
        if not vtag:
            vtag = _findBetween(itag[0], 'value="', ('"', '>'), maxRes=1)
        if vtag:
            try:
                vtag = unquote(str(vtag[0]))
                vtag = unicode(vtag, 'latin_1')
                r.update(analyze_name(vtag))
            except UnicodeEncodeError:
                # Best effort: keep the name taken from the page title.
                pass
    return {'data': r, 'info sets': ('main', 'filmography')}
def get_person_main(self, personID, _parseChr=False):
    """Fetch and parse the main page of a person (or of a character).

    personID is interpolated into self.urls['person_main'], or into
    self.urls['character_main'] when _parseChr is true.

    Returns a dictionary with two keys: 'data', holding the parsed
    information, and 'info sets', always ('main', 'filmography').
    'data' starts as the result of analyze_name() on the page title and
    may gain 'birth date', 'birth notes', 'death date', 'death notes',
    'akas', 'headshot' and one list of build_movie() results per
    filmography section (keyed by the lower-cased section name, cut at
    the first colon).

    Raises IMDbDataAccessError if the page has no <title> element.
    """
    if not _parseChr:
        url = self.urls['person_main'] % personID + 'maindetails'
    else:
        url = self.urls['character_main'] % personID
    s = self._mretrieve(url)
    name = _findBetween(s, '<title>', '</title>', maxRes=1)
    if not name:
        if _parseChr:
            w = 'characterID'
        else:
            w = 'personID'
        raise IMDbDataAccessError('unable to get %s "%s"' % (w, personID))
    name = _unHtml(name[0].replace(' - IMDb', ''))
    # Strip the boilerplate that the site adds around the name in the title.
    if _parseChr:
        name = name.replace('(Character)', '').strip()
        name = name.replace('- Filmography by type', '').strip()
    else:
        name = name.replace('- Filmography by', '').strip()
    r = analyze_name(name, canonical=not _parseChr)
    for dKind in ('Born', 'Died'):
        date = _findBetween(s, '%s:</h4>' % dKind,
                            ('<div class', '</div>', '<br/><br/>'), maxRes=1)
        if date:
            date = _unHtml(date[0])
            if date:
                #date, notes = date_and_notes(date)
                # TODO: fix to handle real names.
                # Text after the first ' in ' is stored as the notes.
                date_notes = date.split(' in ', 1)
                notes = u''
                date = date_notes[0]
                if len(date_notes) == 2:
                    notes = date_notes[1]
                dtitle = 'birth'
                if dKind == 'Died':
                    dtitle = 'death'
                if date:
                    r['%s date' % dtitle] = date
                if notes:
                    r['%s notes' % dtitle] = notes
    akas = _findBetween(s, 'Alternate Names:</h4>',
                        ('</div>', '<br/><br/>'), maxRes=1)
    if akas:
        akas = akas[0]
        if akas:
            akas = _unHtml(akas)
        # Two separators are handled: ' | ' is tried first, then ' / '.
        if akas.find(' | ') != -1:
            akas = akas.split(' | ')
        else:
            akas = akas.split(' / ')
        # Build a real list (filter() is lazy on Python 3) and do not
        # store an empty 'akas' entry when every item is blank.
        akas = [x for x in (aka.strip() for aka in akas) if x]
        if akas:
            r['akas'] = akas
    # Headshot: try the image_src link (single- then double-quoted),
    # then fall back to the "headshot" anchor.
    hs = _findBetween(s, "rel='image_src'", '>', maxRes=1)
    if not hs:
        hs = _findBetween(s, 'rel="image_src"', '>', maxRes=1)
    if not hs:
        hs = _findBetween(s, '<a name="headshot"', '</a>', maxRes=1)
    if hs:
        hsl = _findBetween(hs[0], "href='", "'", maxRes=1)
        if not hsl:
            hsl = _findBetween(hs[0], 'href="', '"', maxRes=1)
        # URLs containing 'imdb-share-logo' are not stored as headshots.
        if hsl and 'imdb-share-logo' not in hsl[0]:
            r['headshot'] = hsl[0]
    # Build a list of tuples such as [('hrefLink', 'section name')].
    workkind = _findBetween(s, 'id="jumpto_', '</a>')
    ws = []
    for work in workkind:
        sep = '" >'
        if '">' in work:
            sep = '">'
        wsplit = work.split(sep, 1)
        if len(wsplit) == 2:
            sect = wsplit[0]
            # Drop anything following the anchor name (other attributes).
            if '"' in sect:
                sect = sect[:sect.find('"')]
            ws.append((sect, wsplit[1].lower()))
    # XXX: I think "guest appearances" are gone.
    if s.find('<a href="#guest-appearances"') != -1:
        ws.append(('guest-appearances', 'notable tv guest appearances'))
    #if _parseChr:
    #    ws.append(('filmography', 'filmography'))
    for sect, sectName in ws:
        raws = u''
        if sectName == 'self':
            sect = 'Self'
        # Key under which the movies of this section are stored; computed
        # once per section instead of once per movie.
        sectKey = sectName.split(':')[0]
        # Everything between the current section link and the start of
        # the next filmography header (or of the next article block).
        if _parseChr and sect == 'filmography':
            inisect = s.find('<div class="filmo">')
        else:
            inisect = s.find('<a name="%s' % sect)
        if inisect != -1:
            # Search from inisect instead of copying the tail of the page.
            endsect = s.find('<div id="filmo-head-', inisect)
            if endsect == -1:
                endsect = s.find('<div class="article"', inisect)
            if endsect != -1:
                raws = s[inisect:endsect]
        #if not raws: continue
        mlist = _findBetween(raws, '<div class="filmo-row',
                             ('<div class="clear"/>',))
        for m in mlist:
            # Skip the remainder of the opening filmo-row tag.
            fCB = m.find('>')
            if fCB != -1:
                m = m[fCB + 1:].lstrip()
            m = re_filmo_episodes.sub('', m)
            # For every movie in the current section.
            movieID = re_imdbID.findall(m)
            if not movieID:
                self._mobile_logger.debug('no movieID in %s', m)
                continue
            # Turn the first <br/> into the ' .... ' separator that the
            # role lookup below searches for.
            m = m.replace('<br/>', ' .... ', 1)
            if not _parseChr:
                chrIndx = m.find(' .... ')
            else:
                chrIndx = m.find(' Played by ')
            chids = []
            if chrIndx != -1:
                # ' .... ' is 6 characters long; ' Played by ' is 11,
                # hence the 5 extra characters skipped for characters.
                chrtxt = m[chrIndx + 6:]
                if _parseChr:
                    chrtxt = chrtxt[5:]
                for ch in chrtxt.split(' / '):
                    chid = re_imdbID.findall(ch)
                    if not chid:
                        chids.append(None)
                    else:
                        chids.append(chid[-1])
            # No role -> None; a single role -> its bare ID; else a list.
            if not chids:
                chids = None
            elif len(chids) == 1:
                chids = chids[0]
            movieID = str(movieID[0])
            # Search the status.
            stidx = m.find('<i>')
            status = u''
            if stidx != -1:
                stendidx = m.rfind('</i>')
                if stendidx != -1:
                    ststart = stidx + 3
                    status = _unHtml(m[ststart:stendidx])
                    # Cut out only the text between <i> and </i>;
                    # str.replace() would also delete every other
                    # occurrence of the same text (e.g. in the title).
                    if stendidx > ststart:
                        m = m[:ststart] + m[stendidx:]
            year = _findBetween(m, 'year_column">', '</span>', maxRes=1)
            if year:
                year = year[0]
                m = m.replace('<span class="year_column">%s</span>' % year,
                              '')
            else:
                year = None
            m = _unHtml(m)
            if not m:
                self._mobile_logger.warn('no title for movieID %s', movieID)
                continue
            movie = build_movie(m, movieID=movieID, status=status,
                                roleID=chids, modFunct=self._defModFunct,
                                accessSystem=self.accessSystem,
                                _parsingCharacter=_parseChr, year=year)
            r.setdefault(sectKey, []).append(movie)
    # If available, take the always correct name from a form.
    itag = _getTagsWith(s, 'NAME="primary"', maxRes=1)
    if not itag:
        itag = _getTagsWith(s, 'name="primary"', maxRes=1)
    if itag:
        vtag = _findBetween(itag[0], 'VALUE="', ('"', '>'), maxRes=1)
        if not vtag:
            vtag = _findBetween(itag[0], 'value="', ('"', '>'), maxRes=1)
        if vtag:
            try:
                vtag = unquote(str(vtag[0]))
                vtag = unicode(vtag, 'latin_1')
                r.update(analyze_name(vtag))
            except UnicodeEncodeError:
                # Best effort: keep the name taken from the page title.
                pass
    return {'data': r, 'info sets': ('main', 'filmography')}
def get_person_main(self, personID, _parseChr=False):
    """Fetch and parse the main page of a person (or of a character).

    personID is interpolated into the person URL template, or into the
    character URL template when _parseChr is true.

    Returns a dictionary with two keys: 'data', holding the parsed
    information, and 'info sets', always ('main', 'filmography').
    'data' starts as the result of analyze_name() on the page title and
    may gain 'birth date', 'birth notes', 'death date', 'death notes',
    'akas', 'headshot' and one list of build_movie() results per
    filmography section (keyed by the lower-cased section name).
    'image_url' is always set; it is the empty string when no photo
    URL is found.

    Raises IMDbDataAccessError if the page has no <title> element.
    """
    if not _parseChr:
        url = imdbURL_person_main % personID + 'maindetails'
    else:
        url = imdbURL_character_main % personID
    s = self._mretrieve(url)
    name = _findBetween(s, '<title>', '</title>', maxRes=1)
    if not name:
        if _parseChr:
            w = 'characterID'
        else:
            w = 'personID'
        # Call form of raise: valid on both Python 2 and Python 3.
        raise IMDbDataAccessError('unable to get %s "%s"' % (w, personID))
    name = _unHtml(name[0])
    if _parseChr:
        name = name.replace('(Character)', '').strip()
    r = analyze_name(name, canonical=not _parseChr)
    for dKind in ('birth', 'death'):
        date = _findBetween(s, '<h5>Date of %s:</h5>' % dKind.capitalize(),
                            ('<a class', '</div>', '<br/><br/>'), maxRes=1)
        if date:
            date = _unHtml(date[0])
            if date:
                date, notes = date_and_notes(date)
                if date:
                    r['%s date' % dKind] = date
                if notes:
                    r['%s notes' % dKind] = notes
    akas = _findBetween(s, 'Alternate Names:</h5>',
                        ('</div>', '<br/><br/>'), maxRes=1)
    if akas:
        akas = akas[0]
        # Two separators are handled: ' | ' is tried first, then ' / '.
        if akas.find(' | ') != -1:
            akas = _unHtml(akas).split(' | ')
        else:
            akas = _unHtml(akas).split(' / ')
        if akas:
            r['akas'] = akas
    hs = _findBetween(s, 'name="headshot"', '</a>', maxRes=1)
    if hs:
        hs[:] = _findBetween(hs[0], 'src="', '"', maxRes=1)
        if hs:
            r['headshot'] = hs[0]
    # Build a list of tuples such as [('hrefLink', 'section name')].
    workkind = _findBetween(s, '<div class="strip jump">', '</div>',
                            maxRes=1)
    if workkind:
        workkind[:] = _findBetween(workkind[0], 'href="#', '</a>')
    else:
        # Assume there's only one section and/or there are no
        # section links, for some reason.
        workkind[:] = _findBetween(s, '<h5><a name=', '</a></h5>')
        # NOTE(review): this clean-up (leading quote, trailing colon) is
        # kept inside the fallback branch, since that is the markup it
        # matches - confirm against the upstream layout.
        workkind[:] = [x.lstrip('"').rstrip(':').lower() for x in workkind]
    ws = []
    for work in workkind:
        wsplit = work.split('">', 1)
        if len(wsplit) == 2:
            ws.append((wsplit[0], wsplit[1].lower()))
    # XXX: I think "guest appearances" are gone.
    if s.find('<a href="#guest-appearances"') != -1:
        ws.append(('guest-appearances', 'notable tv guest appearances'))
    # Character pages get an extra section, read from the "filmo" div.
    if _parseChr:
        ws.append(('filmography', 'filmography'))
    for sect, sectName in ws:
        raws = u''
        # Everything between the current section link and the end
        # of the <ol> tag.
        if _parseChr and sect == 'filmography':
            inisect = s.find('<div class="filmo">')
        else:
            inisect = s.find('<a name="%s' % sect)
        if inisect != -1:
            # Search from inisect instead of copying the tail of the page.
            endsect = s.find('</ol>', inisect)
            if endsect != -1:
                raws = s[inisect:endsect]
        if not raws:
            continue
        mlist = _findBetween(raws, '<li>', ('</li>', '<br>', '<br/>'))
        for m in mlist:
            # For every movie in the current section.
            movieID = re_imdbID.findall(m)
            if not movieID:
                continue
            if not _parseChr:
                chrIndx = m.find(' .... ')
            else:
                chrIndx = m.find(' Played by ')
            chids = []
            if chrIndx != -1:
                # ' .... ' is 6 characters long; ' Played by ' is 11,
                # hence the 5 extra characters skipped for characters.
                chrtxt = m[chrIndx + 6:]
                if _parseChr:
                    chrtxt = chrtxt[5:]
                for ch in chrtxt.split(' / '):
                    chid = re_imdbID.findall(ch)
                    if not chid:
                        chids.append(None)
                    else:
                        chids.append(chid[-1])
            # No role -> None; a single role -> its bare ID; else a list.
            if not chids:
                chids = None
            elif len(chids) == 1:
                chids = chids[0]
            movieID = str(movieID[0])
            # Search the status.
            stidx = m.find('<i>')
            status = u''
            if stidx != -1:
                stendidx = m.rfind('</i>')
                if stendidx != -1:
                    ststart = stidx + 3
                    status = _unHtml(m[ststart:stendidx])
                    # Cut out only the text between <i> and </i>;
                    # str.replace() would also delete every other
                    # occurrence of the same text (e.g. in the title).
                    if stendidx > ststart:
                        m = m[:ststart] + m[stendidx:]
            m = _unHtml(m)
            if not m:
                continue
            movie = build_movie(m, movieID=movieID, status=status,
                                roleID=chids, modFunct=self._defModFunct,
                                accessSystem=self.accessSystem,
                                _parsingCharacter=_parseChr)
            r.setdefault(sectName, []).append(movie)
    # If available, take the always correct name from a form.
    itag = _getTagsWith(s, 'NAME="primary"', maxRes=1)
    if not itag:
        itag = _getTagsWith(s, 'name="primary"', maxRes=1)
    if itag:
        vtag = _findBetween(itag[0], 'VALUE="', ('"', '>'), maxRes=1)
        if not vtag:
            vtag = _findBetween(itag[0], 'value="', ('"', '>'), maxRes=1)
        if vtag:
            try:
                vtag = unquote(str(vtag[0]))
                vtag = unicode(vtag, 'latin_1')
                r.update(analyze_name(vtag, canonical=0))
            except UnicodeEncodeError:
                # Best effort: keep the name taken from the page title.
                pass
    # URL of the picture shown in the "photo" box, if any.
    photo = _findBetween(s, '<div class="photo">', '</div>', maxRes=1)
    image_url = ''
    if photo:
        img = _findBetween(photo[0], '<img', '/a>', maxRes=1)
        if img:
            src = _findBetween(img[0], ' src="', '"', maxRes=1)
            # An <img> without a src attribute used to raise IndexError;
            # leave image_url empty in that case.
            if src:
                image_url = src[0]
    r['image_url'] = image_url
    return {'data': r, 'info sets': ('main', 'filmography')}