def open_unknown_proxy(self, proxy, fullurl, data=None):
    """Reject the request by always raising IMDbDataAccessError.

    The exception payload is a dictionary holding the stringified proxy
    and data, the requested URL and the 'open_unknown_proxy' error type.
    """
    details = {}
    details['proxy'] = str(proxy)
    details['fullurl'] = fullurl
    details['error type'] = 'open_unknown_proxy'
    details['data'] = str(data)
    raise IMDbDataAccessError(details)
def open_unknown(self, fullurl, data=None):
    """Reject the request by always raising IMDbDataAccessError.

    The exception payload is a dictionary holding the requested URL, the
    stringified data, the 'open_unknown' error type and the proxy
    reported by self.get_proxy().
    """
    details = {}
    details['fullurl'] = fullurl
    details['data'] = str(data)
    details['error type'] = 'open_unknown'
    details['proxy'] = self.get_proxy()
    raise IMDbDataAccessError(details)
def retrieve_unicode(self, url, size=-1):
    """Retrieves the given URL, and returns a unicode string,
    trying to guess the encoding of the data (assuming utf8
    by default).

    url -- the address to fetch through self.open().
    size -- when different from -1, a 'Range: bytes=0-<size>' header is
            set for the duration of this request only.

    The charset is taken from the response headers or, failing that,
    from a 'text/html; charset=...' declaration found in the body; it
    is used only if lookup() knows the codec.

    Raises IMDbDataAccessError if an IOError occurs while fetching.
    """
    meta_marker = b'text/html; charset='
    encode = None
    try:
        if size != -1:
            self.set_header('Range', 'bytes=0-%d' % size)
        uopener = self.open(url)
        try:
            content = uopener.read()
            self._last_url = uopener.url
            # Maybe the server is so nice to tell us the charset...
            if PY2:
                server_encode = uopener.headers.getparam('charset')
            else:
                server_encode = (uopener.info().get_charsets() or [None])[0]
        finally:
            # Release the connection even when reading fails.
            uopener.close()
        self.close()
    except IOError as e:
        raise IMDbDataAccessError({
            'errcode': e.errno,
            'errmsg': str(e.strerror),
            'url': url,
            'proxy': self.get_proxy(),
            'exception type': 'IOError',
            'original exception': e
        })
    finally:
        if size != -1:
            # Ensure that the Range header is removed, whatever happened.
            self.del_header('Range')
    # Otherwise, look at the content-type HTML meta tag.  The scan is
    # done on bytes with bytes patterns: mixing str and bytes in find()
    # raises TypeError on Python 3.
    if server_encode is None and content and isinstance(content, bytes):
        begin_h = content.find(meta_marker)
        if begin_h != -1:
            start = begin_h + len(meta_marker)
            end_h = content.find(b'"', start)
            if end_h != -1:
                server_encode = content[start:end_h]
    if server_encode:
        if not isinstance(server_encode, str):
            # lookup() only accepts text on Python 3.
            server_encode = server_encode.decode('ascii', 'replace')
        try:
            if lookup(server_encode):
                encode = server_encode
        except (LookupError, ValueError, TypeError):
            pass
    if encode is None:
        encode = 'utf8'
        # The detection of the encoding is error prone...
        # The bracketed value identifies the page, so log the URL.
        self._logger.warning(
            'Unable to detect the encoding of the retrieved page [%s];'
            ' falling back to default utf8.', url)
    if isinstance(content, str):
        return content
    return str(content, encode, 'replace')
def http_error_default(self, url, fp, errcode, errmsg, headers):
    """Handle an HTTP error response.

    A 404 is logged and answered with a _FakeURLOpener built from the
    URL and headers; any other error code raises IMDbDataAccessError
    with the details of the failed request.  fp is not used.
    """
    if errcode != 404:
        details = {}
        details['url'] = 'http:%s' % url
        details['errcode'] = errcode
        details['errmsg'] = errmsg
        details['headers'] = headers
        details['error type'] = 'http_error_default'
        details['proxy'] = self.get_proxy()
        raise IMDbDataAccessError(details)
    self._logger.warn('404 code returned for %s: %s (headers: %s)',
                      url, errmsg, headers)
    return _FakeURLOpener(url, headers)
def retrieve_unicode(self, url, size=-1):
    """Retrieves the given URL, and returns a unicode string,
    trying to guess the encoding of the data (assuming latin_1
    by default).

    url -- the address to fetch through self.open().
    size -- when different from -1, a 'Range: bytes=0-<size>' header is
            set for the duration of this request only; it is also
            passed to read() when PY_VERSION > (2, 3) and not IN_GAE.

    Raises IMDbDataAccessError if an IOError occurs while fetching.
    """
    encode = None
    try:
        if size != -1:
            self.set_header('Range', 'bytes=0-%d' % size)
        uopener = self.open(url)
        try:
            kwds = {}
            if PY_VERSION > (2, 3) and not IN_GAE:
                kwds['size'] = size
            content = uopener.read(**kwds)
            self._last_url = uopener.url
            # Maybe the server is so nice to tell us the charset...
            server_encode = uopener.info().getparam('charset')
        finally:
            # Release the connection even when reading fails.
            uopener.close()
        self.close()
    except IOError as e:
        raise IMDbDataAccessError({
            'errcode': e.errno,
            'errmsg': str(e.strerror),
            'url': url,
            'proxy': self.get_proxy(),
            'exception type': 'IOError',
            'original exception': e
        })
    finally:
        if size != -1:
            # Ensure that the Range header is removed, whatever happened.
            self.del_header('Range')
    # Otherwise, look at the content-type HTML meta tag.
    if server_encode is None and content:
        first_bytes = content[:512]
        begin_h = first_bytes.find('text/html; charset=')
        if begin_h != -1:
            end_h = first_bytes[19 + begin_h:].find('"')
            if end_h != -1:
                server_encode = first_bytes[19 + begin_h:19 + begin_h + end_h]
    if server_encode:
        try:
            if lookup(server_encode):
                encode = server_encode
        except (LookupError, ValueError, TypeError):
            pass
    # The block used to stop here and implicitly return None, leaving
    # the detected encoding unused; honour the documented contract.
    if encode is None:
        encode = 'latin_1'
    if isinstance(content, unicode):
        return content
    return unicode(content, encode, 'replace')
def update(self, mop, info=None, override=0):
    """Complete a partially populated Movie, Person, Character or
    Company object with the requested sets of information.

    info names the information set(s) to fetch: None selects
    mop.default_info, 'all' selects every info set known for that kind
    of object, a single value is treated as a one-element sequence.
    With a true override, sets already listed in mop.current_info are
    fetched again.

    Raises IMDbError for unsupported objects and IMDbDataAccessError
    for objects without an ID; a Character without characterID is
    silently left untouched.
    """
    # XXX: should this be a method of the Movie/Person/Character/Company
    #      classes?  NO!  What for instances created by external functions?
    supported = (
        (Movie.Movie, 'movieID', 'movie'),
        (Person.Person, 'personID', 'person'),
        (Character.Character, 'characterID', 'character'),
        (Company.Company, 'companyID', 'company'),
    )
    for klass, id_attr, prefix in supported:
        if isinstance(mop, klass):
            mopID = getattr(mop, id_attr)
            break
    else:
        raise IMDbError(
            'object ' + repr(mop) +
            ' is not a Movie, Person, Character or Company instance')
    if mopID is None:
        # XXX: enough?  It's obvious that there are Characters
        #      objects without characterID, so I think they should
        #      just do nothing, when an i.update(character) is tried.
        if prefix == 'character':
            return
        raise IMDbDataAccessError(
            'the supplied object has null movieID, personID or companyID')
    aSystem = (self if mop.accessSystem == self.accessSystem
               else IMDb(mop.accessSystem))
    if info is None:
        info = mop.default_info
    elif info == 'all':
        # prefix already tells which kind of object this is.
        info = getattr(self, 'get_%s_infoset' % prefix)()
    if not isinstance(info, (tuple, list)):
        info = (info, )

    def _no_data(*_ignored):
        # Stand-in for an unknown information set.
        return {}

    res = {}
    for i in info:
        if (i in mop.current_info and not override) or not i:
            continue
        self._imdb_logger.debug('retrieving "%s" info set', i)
        try:
            method = getattr(aSystem,
                             'get_%s_%s' % (prefix, i.replace(' ', '_')))
        except AttributeError:
            self._imdb_logger.error('unknown information set "%s"', i)
            # Keeps going.
            method = _no_data
        try:
            ret = method(mopID)
        except Exception:
            self._imdb_logger.critical(
                'caught an exception retrieving '
                'or parsing "%s" info set for mopID '
                '"%s" (accessSystem: %s)',
                i, mopID, mop.accessSystem, exc_info=True)
            ret = {}
            # If requested by the user, reraise the exception.
            if self._reraise_exceptions:
                raise
        keys = None
        if 'data' in ret:
            data = ret['data']
            res.update(data)
            if isinstance(data, dict):
                keys = data.keys()
        if 'info sets' in ret:
            for ri in ret['info sets']:
                mop.add_to_current_info(ri, keys, mainInfoset=i)
        else:
            mop.add_to_current_info(i, keys)
        for refs in ('titlesRefs', 'namesRefs', 'charactersRefs'):
            if refs in ret:
                getattr(mop, 'update_' + refs)(ret[refs])
    mop.set_data(res, override=0)
def update_series_seasons(self, mop, season_nums, override=0):
    """Given a Movie object, retrieve only the episodes data of the
    requested seasons.

    season_nums is the list of the specific seasons to retrieve.
    Unless override is set, nothing is retrieved when the 'episodes'
    info set is already listed in mop.current_info.

    Raises IMDbError if mop is not a Movie instance and
    IMDbDataAccessError if it has no movieID.
    """
    if not isinstance(mop, Movie.Movie):
        raise IMDbError('object ' + repr(mop) + ' is not a Movie instance')
    mopID = mop.movieID
    if mopID is None:
        raise IMDbDataAccessError(
            'supplied object has null movieID, personID or companyID')
    aSystem = (self if mop.accessSystem == self.accessSystem
               else IMDb(mop.accessSystem))
    info = 'episodes'
    if info in mop.current_info and not override:
        return
    _imdb_logger.debug('retrieving "%s" info set', info)

    def _no_data(*_ignored):
        # Stand-in used when the access system cannot fetch episodes.
        return {}

    try:
        method = getattr(aSystem, 'get_movie_episodes')
    except AttributeError:
        _imdb_logger.error('unknown information set "%s"', info)
        # Keeps going.
        method = _no_data
    try:
        ret = method(mopID, season_nums)
    except Exception:
        _imdb_logger.critical(
            'caught an exception retrieving or parsing "%s" info set'
            ' for mopID "%s" (accessSystem: %s)',
            info, mopID, mop.accessSystem, exc_info=True)
        ret = {}
        # If requested by the user, reraise the exception.
        if self._reraise_exceptions:
            raise
    res = {}
    keys = None
    if 'data' in ret:
        data = ret['data']
        res.update(data)
        if isinstance(data, dict):
            keys = list(data.keys())
    if 'info sets' in ret:
        for ri in ret['info sets']:
            mop.add_to_current_info(ri, keys, mainInfoset=info)
    else:
        mop.add_to_current_info(info, keys)
    for refs in ('titlesRefs', 'namesRefs', 'charactersRefs'):
        if refs in ret:
            getattr(mop, 'update_' + refs)(ret[refs])
    mop.set_data(res, override=0)
def retrieve_unicode(self, url, size=-1):
    """Retrieves the given URL, and returns a unicode string,
    trying to guess the encoding of the data (assuming utf8
    by default).

    url -- the address to fetch with an opener built from
           self.https_handler and, when self.proxies has an 'http'
           entry, a ProxyHandler using it for both http and https.
    size -- when different from -1, a 'Range: bytes=0-<size>' header is
            set for the duration of this request only.

    The charset is taken from the response headers or, failing that,
    from a 'text/html; charset=...' declaration found in the body; it
    is used only if lookup() knows the codec.

    Raises IMDbDataAccessError if an IOError occurs while fetching.
    """
    meta_marker = b'text/html; charset='
    encode = None
    try:
        if size != -1:
            self.set_header('Range', 'bytes=0-%d' % size)
        handlers = []
        if 'http' in self.proxies:
            proxy_handler = ProxyHandler({
                'http': self.proxies['http'],
                'https': self.proxies['http']
            })
            handlers.append(proxy_handler)
        handlers.append(self.https_handler)
        uopener = build_opener(*handlers)
        uopener.addheaders = list(self.addheaders)
        response = uopener.open(url)
        try:
            content = response.read()
            self._last_url = response.url
            # Maybe the server is so nice to tell us the charset...
            if PY2:
                server_encode = response.headers.getparam('charset') or None
            else:
                server_encode = response.headers.get_content_charset(None)
        finally:
            # Release the connection even when reading fails.
            response.close()
    except IOError as e:
        raise IMDbDataAccessError({
            'errcode': e.errno,
            'errmsg': str(e.strerror),
            'url': url,
            'proxy': self.get_proxy(),
            'exception type': 'IOError',
            'original exception': e
        })
    finally:
        if size != -1:
            # Ensure that the Range header is removed, whatever happened.
            self.del_header('Range')
    # Otherwise, look at the content-type HTML meta tag.  The scan is
    # done on bytes with bytes patterns: mixing str and bytes in find()
    # raises TypeError on Python 3.
    if server_encode is None and content and isinstance(content, bytes):
        begin_h = content.find(meta_marker)
        if begin_h != -1:
            start = begin_h + len(meta_marker)
            end_h = content.find(b'"', start)
            if end_h != -1:
                server_encode = content[start:end_h]
    if server_encode:
        if not isinstance(server_encode, str):
            # lookup() only accepts text on Python 3.
            server_encode = server_encode.decode('ascii', 'replace')
        try:
            if lookup(server_encode):
                encode = server_encode
        except (LookupError, ValueError, TypeError):
            pass
    if encode is None:
        encode = 'utf8'
        # The detection of the encoding is error prone...
        # The bracketed value identifies the page, so log the URL.
        self._logger.warning(
            'Unable to detect the encoding of the retrieved page [%s];'
            ' falling back to default utf8.', url)
    if isinstance(content, str):
        return content
    return str(content, encode, 'replace')
def get_person_main(self, personID, _parseChr=False):
    """Scrape the main page of a person (or of a character).

    personID -- ID interpolated into self.urls['person_main'] (with
                'maindetails' appended) or, when _parseChr is true,
                into self.urls['character_main'].
    _parseChr -- true when the page being parsed is a character page;
                 it changes the URL, the title clean-up and how roles
                 are located inside every filmography row.

    Returns {'data': r, 'info sets': ('main', 'filmography')}, where r
    starts from analyze_name() of the page title and is enriched with
    birth/death date and notes, 'akas', 'headshot' and one list of
    movies per filmography section found in the page.

    Raises IMDbDataAccessError if the page has no <title>.
    """
    if not _parseChr:
        url = self.urls['person_main'] % personID + 'maindetails'
    else:
        url = self.urls['character_main'] % personID
    s = self._mretrieve(url)
    r = {}
    name = _findBetween(s, '<title>', '</title>', maxRes=1)
    if not name:
        if _parseChr:
            w = 'characterID'
        else:
            w = 'personID'
        raise IMDbDataAccessError('unable to get %s "%s"' % (w, personID))
    name = _unHtml(name[0].replace(' - IMDb', ''))
    if _parseChr:
        name = name.replace('(Character)', '').strip()
        name = name.replace('- Filmography by type', '').strip()
    else:
        name = name.replace('- Filmography by', '').strip()
    r = analyze_name(name, canonical=not _parseChr)
    # Birth and death: text before the first ' in ' is the date, the
    # remainder (if any) is stored as notes.
    for dKind in ('Born', 'Died'):
        date = _findBetween(s, '%s:</h4>' % dKind.capitalize(),
                            ('<div class', '</div>', '<br/><br/>'), maxRes=1)
        if date:
            date = _unHtml(date[0])
            if date:
                #date, notes = date_and_notes(date)
                # TODO: fix to handle real names.
                date_notes = date.split(' in ', 1)
                notes = u''
                date = date_notes[0]
                if len(date_notes) == 2:
                    notes = date_notes[1]
                dtitle = 'birth'
                if dKind == 'Died':
                    dtitle = 'death'
                if date:
                    r['%s date' % dtitle] = date
                if notes:
                    r['%s notes' % dtitle] = notes
    akas = _findBetween(s, 'Alternate Names:</h4>', ('</div>', '<br/><br/>'),
                        maxRes=1)
    if akas:
        akas = akas[0]
        if akas:
            akas = _unHtml(akas)
        # Alternate names are separated by ' | ' or, failing that, ' / '.
        if akas.find(' | ') != -1:
            akas = akas.split(' | ')
        else:
            akas = akas.split(' / ')
        if akas:
            r['akas'] = filter(None, [x.strip() for x in akas])
    # Headshot: try the image_src link (either quoting style) first,
    # then the "headshot" anchor.
    hs = _findBetween(s, "rel='image_src'", '>', maxRes=1)
    if not hs:
        hs = _findBetween(s, 'rel="image_src"', '>', maxRes=1)
    if not hs:
        hs = _findBetween(s, '<a name="headshot"', '</a>', maxRes=1)
    if hs:
        hsl = _findBetween(hs[0], "href='", "'", maxRes=1)
        if not hsl:
            hsl = _findBetween(hs[0], 'href="', '"', maxRes=1)
        # Links containing 'imdb-share-logo' are not stored as headshots.
        if hsl and 'imdb-share-logo' not in hsl[0]:
            r['headshot'] = hsl[0]
    # Build a list of tuples such [('hrefLink', 'section name')]
    workkind = _findBetween(s, 'id="jumpto_', '</a>')
    ws = []
    for work in workkind:
        sep = '" >'
        if '">' in work:
            sep = '">'
        wsplit = work.split(sep, 1)
        if len(wsplit) == 2:
            sect = wsplit[0]
            if '"' in sect:
                sect = sect[:sect.find('"')]
            ws.append((sect, wsplit[1].lower()))
    # XXX: I think "guest appearances" are gone.
    if s.find('<a href="#guest-appearances"') != -1:
        ws.append(('guest-appearances', 'notable tv guest appearances'))
    #if _parseChr:
    #    ws.append(('filmography', 'filmography'))
    for sect, sectName in ws:
        raws = u''
        if sectName == 'self':
            sect = 'Self'
        # Everything between the current section link and the end
        # of the <ol> tag.
        if _parseChr and sect == 'filmography':
            inisect = s.find('<div class="filmo">')
        else:
            inisect = s.find('<a name="%s' % sect)
        if inisect != -1:
            endsect = s[inisect:].find('<div id="filmo-head-')
            if endsect == -1:
                endsect = s[inisect:].find('<div class="article"')
            if endsect != -1:
                raws = s[inisect:inisect+endsect]
        #if not raws: continue
        mlist = _findBetween(raws, '<div class="filmo-row',
                             ('<div class="clear"/>',))
        for m in mlist:
            # Drop what is left of the opening tag of the row.
            fCB = m.find('>')
            if fCB != -1:
                m = m[fCB+1:].lstrip()
            m = re_filmo_episodes.sub('', m)
            # For every movie in the current section.
            movieID = re_imdbID.findall(m)
            if not movieID:
                self._mobile_logger.debug('no movieID in %s', m)
                continue
            m = m.replace('<br/>', ' .... ', 1)
            if not _parseChr:
                chrIndx = m.find(' .... ')
            else:
                chrIndx = m.find(' Played by ')
            chids = []
            if chrIndx != -1:
                # Skip the 6 characters of ' .... '; ' Played by ' is
                # 5 characters longer, hence the extra slice.
                chrtxt = m[chrIndx+6:]
                if _parseChr:
                    chrtxt = chrtxt[5:]
                for ch in chrtxt.split(' / '):
                    chid = re_imdbID.findall(ch)
                    if not chid:
                        chids.append(None)
                    else:
                        chids.append(chid[-1])
            # roleID is None, a single ID or a list of IDs.
            if not chids:
                chids = None
            elif len(chids) == 1:
                chids = chids[0]
            movieID = str(movieID[0])
            # Search the status.
            stidx = m.find('<i>')
            status = u''
            if stidx != -1:
                stendidx = m.rfind('</i>')
                if stendidx != -1:
                    status = _unHtml(m[stidx+3:stendidx])
                    m = m.replace(m[stidx+3:stendidx], '')
            year = _findBetween(m, 'year_column">', '</span>', maxRes=1)
            if year:
                year = year[0]
                m = m.replace('<span class="year_column">%s</span>' % year,
                              '')
            else:
                year = None
            m = _unHtml(m)
            if not m:
                self._mobile_logger.warn('no title for movieID %s', movieID)
                continue
            movie = build_movie(m, movieID=movieID, status=status,
                                roleID=chids, modFunct=self._defModFunct,
                                accessSystem=self.accessSystem,
                                _parsingCharacter=_parseChr, year=year)
            # Only the text before the first ':' names the section.
            sectName = sectName.split(':')[0]
            r.setdefault(sectName, []).append(movie)
    # If available, take the always correct name from a form.
    itag = _getTagsWith(s, 'NAME="primary"', maxRes=1)
    if not itag:
        itag = _getTagsWith(s, 'name="primary"', maxRes=1)
    if itag:
        vtag = _findBetween(itag[0], 'VALUE="', ('"', '>'), maxRes=1)
        if not vtag:
            vtag = _findBetween(itag[0], 'value="', ('"', '>'), maxRes=1)
        if vtag:
            try:
                vtag = unquote(str(vtag[0]))
                vtag = unicode(vtag, 'latin_1')
                r.update(analyze_name(vtag))
            except UnicodeEncodeError:
                # Keep the name parsed from the title.
                pass
    return {'data': r, 'info sets': ('main', 'filmography')}
def get_movie_main(self, movieID):
    """Scrape the main page of a title.

    movieID -- ID interpolated into self.urls['movie_main'] (with
               'maindetails' appended) to build the page URL.

    Returns {'data': d}, where d starts from analyze_title() of the
    page title and is enriched with whatever is found in the page:
    series/episode information, director, creator, writer, cover url,
    genres, rating, votes, top 250 rank, cast, akas, mpaa, runtimes,
    countries, languages, color info, sound mix, certificates,
    plot outline and aspect ratio.

    Raises IMDbDataAccessError if the page has no <title>.
    """
    cont = self._mretrieve(self.urls['movie_main'] % movieID + 'maindetails')
    title = _findBetween(cont, '<title>', '</title>', maxRes=1)
    if not title:
        raise IMDbDataAccessError('unable to get movieID "%s"' % movieID)
    title = _unHtml(title[0])
    if title.endswith(' - IMDb'):
        title = title[:-7]
    if cont.find('<span class="tv-extra">TV mini-series</span>') != -1:
        title += ' (mini)'
    d = analyze_title(title)
    kind = d.get('kind')
    tv_series = _findBetween(cont, 'TV Series:</h5>', '</a>', maxRes=1)
    if tv_series:
        mid = re_imdbID.findall(tv_series[0])
    else:
        mid = None
    if tv_series and mid:
        # A link to a series means this title is one of its episodes.
        s_title = _unHtml(tv_series[0])
        s_data = analyze_title(s_title)
        m = Movie(movieID=str(mid[0]), data=s_data,
                  accessSystem=self.accessSystem,
                  modFunct=self._defModFunct)
        d['kind'] = kind = u'episode'
        d['episode of'] = m
    if kind in ('tv series', 'tv mini series'):
        years = _findBetween(cont, '<h1>', '</h1>', maxRes=1)
        if years:
            years[:] = _findBetween(years[0], 'TV series', '</span>',
                                    maxRes=1)
            if years:
                d['series years'] = years[0].strip()
    air_date = _findBetween(cont, 'Original Air Date:</h5>', '</div>',
                            maxRes=1)
    if air_date:
        air_date = air_date[0]
        vi = air_date.find('(')
        if vi != -1:
            date = _unHtml(air_date[:vi]).strip()
            if date != '????':
                d['original air date'] = date
            air_date = air_date[vi:]
            season = _findBetween(air_date, 'Season', ',', maxRes=1)
            if season:
                season = season[0].strip()
                try:
                    season = int(season)
                except ValueError:
                    # Keep the raw text when it is not a number.
                    pass
                # The type check keeps a numeric 0.
                if season or type(season) is _inttype:
                    d['season'] = season
            episode = _findBetween(air_date, 'Episode', ')', maxRes=1)
            if episode:
                episode = episode[0].strip()
                try:
                    episode = int(episode)
                except ValueError:
                    # Keep the raw text when it is not a number.
                    pass
                # Test the episode itself (this used to test the type of
                # season): a numeric 0 is kept, an empty string is not.
                if episode or type(episode) is _inttype:
                    d['episode'] = episode
    direct = _findBetween(cont, '<h5>Director', ('</div>', '<br/> <br/>'),
                          maxRes=1)
    if direct:
        direct = direct[0]
        h5idx = direct.find('/h5>')
        if h5idx != -1:
            direct = direct[h5idx+4:]
        direct = self._getPersons(direct)
        if direct:
            d['director'] = direct
    if kind in ('tv series', 'tv mini series', 'episode'):
        if kind != 'episode':
            seasons = _findBetween(cont, 'Seasons:</h5>', '</div>',
                                   maxRes=1)
            if seasons:
                d['number of seasons'] = seasons[0].count('|') + 1
        creator = _findBetween(cont, 'Created by</h5>',
                               ('class="tn15more"', '</div>', '<br/> <br/>'),
                               maxRes=1)
        if not creator:
            # They change 'Created by' to 'Creator' and viceversa
            # from time to time...
            # XXX: is 'Creators' also used?
            creator = _findBetween(cont, 'Creator:</h5>',
                                   ('class="tn15more"', '</div>',
                                    '<br/> <br/>'), maxRes=1)
        if creator:
            creator = creator[0]
            # NOTE(review): str.find() returns -1 (truthy) when the text
            # is absent, so this is true unless 'tn15more' is at index 0.
            # Left unchanged: the appended '>' may be what closes a tag
            # truncated at 'class="tn15more"' - confirm before touching.
            if creator.find('tn15more'):
                creator = '%s>' % creator
            creator = self._getPersons(creator)
            if creator:
                d['creator'] = creator
    writers = _findBetween(cont, '<h5>Writer', ('</div>', '<br/> <br/>'),
                           maxRes=1)
    if writers:
        writers = writers[0]
        h5idx = writers.find('/h5>')
        if h5idx != -1:
            writers = writers[h5idx+4:]
        writers = self._getPersons(writers)
        if writers:
            d['writer'] = writers
    cvurl = _getTagsWith(cont, 'name="poster"', toClosure=True, maxRes=1)
    if cvurl:
        cvurl = _findBetween(cvurl[0], 'src="', '"', maxRes=1)
        if cvurl:
            d['cover url'] = cvurl[0]
    genres = _findBetween(cont, 'href="/genre/', '"')
    if genres:
        d['genres'] = list(set(genres))
    ur = _findBetween(cont, 'id="star-bar-user-rate">', '</div>', maxRes=1)
    if ur:
        rat = _findBetween(ur[0], '<b>', '</b>', maxRes=1)
        if rat:
            d['rating'] = rat[0].strip()
        vi = ur[0].rfind('href="ratings"')
        if vi != -1 and ur[0][vi+10:].find('await') == -1:
            try:
                votes = _findBetween(ur[0][vi:], "title='", " IMDb",
                                     maxRes=1)
                votes = int(votes[0].replace(',', ''))
                d['votes'] = votes
            except (ValueError, IndexError):
                self._mobile_logger.warning('wrong votes: %s', ur)
    top250 = _findBetween(cont, 'href="/chart/top?', '</a>', maxRes=1)
    if top250:
        fn = top250[0].rfind('#')
        if fn != -1:
            try:
                td = int(top250[0][fn+1:])
                d['top 250 rank'] = td
            except ValueError:
                self._mobile_logger.warning('wrong top250: %s', top250)
    # The cast table has had several different headings.
    castdata = _findBetween(cont, 'Cast overview', '</table>', maxRes=1)
    if not castdata:
        castdata = _findBetween(cont, 'Credited cast', '</table>', maxRes=1)
    if not castdata:
        castdata = _findBetween(cont, 'Complete credited cast', '</table>',
                                maxRes=1)
    if not castdata:
        castdata = _findBetween(cont, 'Series Cast Summary', '</table>',
                                maxRes=1)
    if not castdata:
        castdata = _findBetween(cont, 'Episode Credited cast', '</table>',
                                maxRes=1)
    if castdata:
        castdata = castdata[0]
        # Reintegrate the first tag.
        fl = castdata.find('href=')
        if fl != -1:
            castdata = '<a ' + castdata[fl:]
        # Exclude the 'rest of cast listed alphabetically' row.
        smib = castdata.find('<tr><td align="center" colspan="4"><small>')
        if smib != -1:
            smie = castdata.rfind('</small></td></tr>')
            if smie != -1:
                castdata = (castdata[:smib].strip() +
                            castdata[smie+18:].strip())
        castdata = castdata.replace('/tr> <tr', '/tr><tr')
        cast = self._getPersons(castdata, sep='</tr><tr')
        if cast:
            d['cast'] = cast
    akas = _findBetween(cont, 'Also Known As:</h5>', '</div>', maxRes=1)
    if akas:
        # For some reason, here <br> is still used in place of <br/>.
        akas[:] = [x for x in akas[0].split('<br>') if x.strip()]
        akas = [_unHtml(x).replace('" - ', '::', 1).lstrip('"').strip()
                for x in akas]
        if 'See more' in akas:
            akas.remove('See more')
        akas[:] = [x for x in akas if x]
        if akas:
            d['akas'] = akas
    mpaa = _findBetween(cont, 'MPAA</a>:', '</div>', maxRes=1)
    if mpaa:
        d['mpaa'] = _unHtml(mpaa[0])
    runtimes = _findBetween(cont, 'Runtime:</h5>', '</div>', maxRes=1)
    if runtimes:
        runtimes = runtimes[0]
        runtimes = [x.strip().replace(' min', '').replace(' (', '::(', 1)
                    for x in runtimes.split('|')]
        d['runtimes'] = [_unHtml(x).strip() for x in runtimes]
    if kind == 'episode':
        # number of episodes.
        epsn = _findBetween(cont, 'title="Full Episode List">', '</a>',
                            maxRes=1)
        if epsn:
            epsn = epsn[0].replace(' Episodes', '').strip()
            if epsn:
                try:
                    epsn = int(epsn)
                except ValueError:
                    self._mobile_logger.warning('wrong episodes #: %s', epsn)
                # Stored even when it could not be converted.
                d['number of episodes'] = epsn
    country = _findBetween(cont, 'Country:</h5>', '</div>', maxRes=1)
    if country:
        country[:] = country[0].split(' | ')
        country[:] = ['<a %s' % x for x in country if x]
        country[:] = [_unHtml(x.replace(' <i>', '::')) for x in country]
        if country:
            d['countries'] = country
    lang = _findBetween(cont, 'Language:</h5>', '</div>', maxRes=1)
    if lang:
        lang[:] = lang[0].split(' | ')
        lang[:] = ['<a %s' % x for x in lang if x]
        lang[:] = [_unHtml(x.replace(' <i>', '::')) for x in lang]
        if lang:
            d['languages'] = lang
    col = _findBetween(cont, '"/search/title?colors=', '</div>')
    if col:
        col[:] = col[0].split(' | ')
        col[:] = ['<a %s' % x for x in col if x]
        col[:] = [_unHtml(x.replace(' <i>', '::')) for x in col]
        if col:
            d['color info'] = col
    sm = _findBetween(cont, '/search/title?sound_mixes=', '</div>', maxRes=1)
    if sm:
        sm[:] = sm[0].split(' | ')
        sm[:] = ['<a %s' % x for x in sm if x]
        sm[:] = [_unHtml(x.replace(' <i>', '::')) for x in sm]
        if sm:
            d['sound mix'] = sm
    cert = _findBetween(cont, 'Certification:</h5>', '</div>', maxRes=1)
    if cert:
        cert[:] = cert[0].split(' | ')
        cert[:] = [_unHtml(x.replace(' <i>', '::')) for x in cert]
        if cert:
            d['certificates'] = cert
    plotoutline = _findBetween(cont, 'Plot:</h5>', ['<a ', '</div>'],
                               maxRes=1)
    if plotoutline:
        plotoutline = plotoutline[0].strip()
        plotoutline = plotoutline.rstrip('|').rstrip()
        if plotoutline:
            d['plot outline'] = _unHtml(plotoutline)
    aratio = _findBetween(cont, 'Aspect Ratio:</h5>', ['<a ', '</div>'],
                          maxRes=1)
    if aratio:
        aratio = aratio[0].strip().replace(' (', '::(', 1)
        if aratio:
            d['aspect ratio'] = _unHtml(aratio)
    return {'data': d}