class IMDbURLopener:
    """Fetch web pages and handle errors.

    Keeps a list of default headers (:attr:`addheaders`) sent with every
    request, and an optional HTTP proxy (used for both http and https).
    """
    _logger = logger.getChild('urlopener')

    def __init__(self, *args, **kwargs):
        """Initialize the opener.

        The optional ``languages`` keyword argument sets the
        Accept-Language header (default: 'en-us,en;q=0.5').
        """
        self._last_url = ''
        self.https_handler = IMDbHTTPSHandler(logger=self._logger)
        self.proxies = {}
        self.addheaders = []
        # Drop any pre-existing User-Agent header, whatever its capitalization.
        for header in ('User-Agent', 'User-agent', 'user-agent'):
            self.del_header(header)
        self.set_header('User-Agent', 'Mozilla/5.0')
        lang = kwargs.get('languages', 'en-us,en;q=0.5')
        self.set_header('Accept-Language', lang)

    def get_proxy(self):
        """Return the used proxy, or an empty string."""
        return self.proxies.get('http', '')

    def set_proxy(self, proxy):
        """Set the proxy; a false value removes any configured proxy."""
        if not proxy:
            if 'http' in self.proxies:
                del self.proxies['http']
        else:
            # Normalize to an http:// URL; the same proxy is also used for https.
            if not proxy.lower().startswith('http://'):
                proxy = 'http://%s' % proxy
            self.proxies['http'] = proxy

    def set_header(self, header, value, _overwrite=True):
        """Set a default header; if _overwrite is true, any previous
        value of the same header is removed first."""
        if _overwrite:
            self.del_header(header)
        self.addheaders.append((header, value))

    def get_header(self, header):
        """Return the first value of a header, or None if not present."""
        for name, value in self.addheaders:
            if name == header:
                return value
        return None

    def del_header(self, header):
        """Remove the first occurrence of a default header, if present."""
        for index, (name, _value) in enumerate(self.addheaders):
            if name == header:
                del self.addheaders[index]
                break

    def retrieve_unicode(self, url, size=-1):
        """Retrieve the given URL, and return a unicode string, trying to
        guess the encoding of the data (assuming utf8 by default).

        If size is not -1, only the first `size` bytes are requested
        (via a temporary Range header).

        Raises IMDbDataAccessError on IOError.
        """
        encode = None
        try:
            if size != -1:
                self.set_header('Range', 'bytes=0-%d' % size)
            handlers = []
            if 'http' in self.proxies:
                proxy_handler = ProxyHandler({
                    'http': self.proxies['http'],
                    'https': self.proxies['http']
                })
                handlers.append(proxy_handler)
            handlers.append(self.https_handler)
            uopener = build_opener(*handlers)
            uopener.addheaders = list(self.addheaders)
            response = uopener.open(url)
            content = response.read()
            self._last_url = response.url
            # Maybe the server is so nice to tell us the charset...
            if PY2:
                server_encode = response.headers.getparam('charset') or None
            else:
                server_encode = response.headers.get_content_charset(None)
            # Otherwise, look at the content-type HTML meta tag.
            if server_encode is None and content:
                begin_h = content.find(b'text/html; charset=')
                if begin_h != -1:
                    # 19 == len(b'text/html; charset=').
                    # BUGFIX: content is bytes, so the needle must be bytes
                    # too; the original str '"' raised TypeError on Python 3.
                    end_h = content[19 + begin_h:].find(b'"')
                    if end_h != -1:
                        server_encode = content[19 + begin_h:19 + begin_h + end_h]
            if server_encode:
                # BUGFIX: the meta-tag sniffing above yields bytes, but
                # codecs.lookup() requires str on Python 3; decode first so
                # a successfully sniffed charset is actually used.
                if isinstance(server_encode, bytes):
                    server_encode = server_encode.decode('ascii', 'replace')
                try:
                    if lookup(server_encode):
                        encode = server_encode
                except (LookupError, ValueError, TypeError):
                    pass
            if size != -1:
                self.del_header('Range')
            response.close()
        except IOError as e:
            if size != -1:
                # Ensure that the Range header is removed.
                self.del_header('Range')
            raise IMDbDataAccessError({
                'errcode': e.errno,
                'errmsg': str(e.strerror),
                'url': url,
                'proxy': self.get_proxy(),
                'exception type': 'IOError',
                'original exception': e
            })
        if encode is None:
            encode = 'utf8'
            # The detection of the encoding is error prone...
            # Logger.warn is deprecated; use warning().
            self._logger.warning(
                'Unable to detect the encoding of the retrieved page [%s];'
                ' falling back to default utf8.', encode)
        if isinstance(content, str):
            return content
        return str(content, encode, 'replace')
from imdb._exceptions import IMDbDataAccessError, IMDbParserError from . import (companyParser, movieParser, personParser, searchMovieParser, searchMovieAdvancedParser, searchPersonParser, searchCompanyParser, searchKeywordParser, topBottomParser, listParser) if PY2: from urllib import quote_plus from urllib2 import HTTPSHandler, ProxyHandler, build_opener else: from urllib.parse import quote_plus from urllib.request import HTTPSHandler, ProxyHandler, build_opener # Logger for miscellaneous functions. _aux_logger = logger.getChild('aux') class _ModuleProxy: """A proxy to instantiate and access parsers.""" def __init__(self, module, defaultKeys=None): """Initialize a proxy for the given module; defaultKeys, if set, muste be a dictionary of values to set for instanced objects.""" if defaultKeys is None: defaultKeys = {} self._defaultKeys = defaultKeys self._module = module def __getattr__(self, name): """Called only when no look-up is found.""" _sm = self._module
elif isinstance(d[i], (list, dict)): _putRefs(d[i], re_titles, re_names, lastKey=lastKey) elif isinstance(d, dict): for k, v in list(d.items()): lastKey = k if isinstance(v, str): if lastKey in _modify_keys: if re_names: d[k] = re_names.sub(r"'\1' (qv)", v) if re_titles: d[k] = re_titles.sub(r'_\1_ (qv)', v) elif isinstance(v, (list, dict)): _putRefs(d[k], re_titles, re_names, lastKey=lastKey) _b_p_logger = logger.getChild('build_person') def build_person(txt, personID=None, billingPos=None, roleID=None, accessSystem='http', modFunct=None, headshot=None): """Return a Person instance from the tipical <tr>...</tr> strings found in the IMDb's web site.""" # if personID is None # _b_p_logger.debug('empty name or personID for "%s"', txt) notes = '' role = ''
class DOMParserBase(object):
    """Base parser to handle HTML data from the IMDb's web server.

    Subclasses customize behavior through the class attributes below
    (``preprocessors``, ``rules``, ``_defGetRefs``, ``_containsObjects``)
    and by overriding the ``_init``/``_reset`` hooks and the
    ``preprocess_*``/``postprocess_data`` methods.
    """
    # If True, names/titles references are gathered by default.
    _defGetRefs = False
    # If True, parsed data contains Movie/Person/... instances whose
    # accessSystem/modFunct parameters must be set after parsing.
    _containsObjects = False
    # List of (pattern-or-str-or-callable, replacement) pairs applied to the
    # raw HTML string before parsing; see preprocess_string().
    preprocessors = []
    # Extraction rules, fed to Rules().extract() in parse_dom().
    rules = []
    _logger = logger.getChild('domparser')

    def __init__(self):
        """Initialize the parser."""
        self._modFunct = None
        self._as = 'http'
        # Class name, used in log messages to identify the concrete parser.
        self._cname = self.__class__.__name__
        self._init()
        self.reset()

    def reset(self):
        """Reset the parser."""
        # Names and titles references.
        self._namesRefs = {}
        self._titlesRefs = {}
        self._reset()

    def _init(self):
        """Subclasses can override this method, if needed."""
        pass

    def _reset(self):
        """Subclasses can override this method, if needed."""
        pass

    def parse(self, html_string, getRefs=None, **kwds):
        """Return the dictionary generated from the given html string;
        getRefs can be used to force the gathering of movies/persons
        references.

        The result is a dict with 'data', 'titlesRefs' and 'namesRefs'
        keys (see add_refs()).  Exceptions raised by the pre/post
        processing hooks are logged and swallowed, so a broken subclass
        hook degrades to partial data instead of failing the whole parse.
        """
        self.reset()
        if getRefs is not None:
            self.getRefs = getRefs
        else:
            self.getRefs = self._defGetRefs
        # Under Python 2, str is a byte string: decode it to unicode.
        if PY2 and isinstance(html_string, str):
            html_string = html_string.decode('utf-8')
        # Temporary fix: self.parse_dom must work even for empty strings.
        html_string = self.preprocess_string(html_string)
        if html_string:
            # NOTE(review): the first argument below was presumably a
            # non-breaking space (U+00A0) that may have been mangled into a
            # plain space in transit — confirm against upstream.
            html_string = html_string.replace(' ', ' ')
            dom = self.get_dom(html_string)
            try:
                dom = self.preprocess_dom(dom)
            except Exception:
                self._logger.error('%s: caught exception preprocessing DOM',
                                   self._cname, exc_info=True)
            if self.getRefs:
                try:
                    self.gather_refs(dom)
                except Exception:
                    self._logger.warn('%s: unable to gather refs',
                                      self._cname, exc_info=True)
            data = self.parse_dom(dom)
        else:
            data = {}
        try:
            data = self.postprocess_data(data)
        except Exception:
            self._logger.error('%s: caught exception postprocessing data',
                               self._cname, exc_info=True)
        if self._containsObjects:
            self.set_objects_params(data)
        data = self.add_refs(data)
        return data

    def get_dom(self, html_string):
        """Return a dom object, from the given string.

        Never raises: on any parsing error an empty DOM is returned
        (and the error is logged), so callers can keep going.
        """
        try:
            if not _USE_LXML:
                # The fallback (non-lxml) tree builder needs XHTML;
                # scripts are dropped to simplify the conversion.
                html_string = html_to_xhtml(html_string, omit_tags={"script"})
            dom = build_tree(html_string, force_html=True)
            if dom is None:
                dom = build_tree('')
                self._logger.error('%s: using a fake empty DOM', self._cname)
            return dom
        except Exception:
            self._logger.error('%s: caught exception parsing DOM',
                               self._cname, exc_info=True)
            return build_tree('')

    def xpath(self, element, path):
        """Return elements matching the given XPath.

        Errors are logged and an empty list is returned.
        """
        try:
            return piculet_xpath(element, path)
        except Exception:
            self._logger.error('%s: caught exception extracting XPath "%s"',
                               self._cname, path, exc_info=True)
            return []

    def tostring(self, element):
        """Convert the element to a string.

        NOTE(review): on Python 3 ElementTree.tostring with an encoding
        returns bytes, not str — verify callers handle that.
        """
        if isinstance(element, str):
            return str(element)
        else:
            try:
                return ElementTree.tostring(element, encoding='utf8')
            except Exception:
                self._logger.error('%s: unable to convert to string',
                                   self._cname, exc_info=True)
                return ''

    def clone(self, element):
        """Clone an element."""
        # Deep copy by serializing and re-parsing the element.
        return build_tree(self.tostring(element))

    def preprocess_string(self, html_string):
        """Here we can modify the text, before it's parsed.

        Each (src, sub) pair in self.preprocessors is applied in order:
        a compiled regex src is used via src.sub(sub, ...); a plain
        string src is replaced with sub; a callable src is called with
        the whole string (its exceptions are logged and skipped).
        """
        if not html_string:
            return html_string
        try:
            preprocessors = self.preprocessors
        except AttributeError:
            return html_string
        for src, sub in preprocessors:
            # re._pattern_type is present only since Python 2.5; detect a
            # regex-like object by its callable .sub attribute instead.
            if isinstance(getattr(src, 'sub', None), Callable):
                html_string = src.sub(sub, html_string)
            # NOTE(review): `unicode` only exists under Python 2; the
            # short-circuit hides it when src is a (Python 3) str, but a
            # non-str, non-regex, non-callable src would raise NameError
            # on Python 3 — confirm against upstream compat shims.
            elif isinstance(src, str) or isinstance(src, unicode):
                html_string = html_string.replace(src, sub)
            elif isinstance(src, Callable):
                try:
                    html_string = src(html_string)
                except Exception:
                    _msg = '%s: caught exception preprocessing html'
                    self._logger.error(_msg, self._cname, exc_info=True)
                    continue
        return html_string

    def gather_refs(self, dom):
        """Collect references.

        Delegates to a GatherRefs parser and stores the resulting
        names/titles reference dictionaries on this instance.
        """
        grParser = GatherRefs()
        grParser._as = self._as
        grParser._modFunct = self._modFunct
        refs = grParser.parse_dom(dom)
        refs = grParser.postprocess_data(refs)
        self._namesRefs = refs['names refs']
        self._titlesRefs = refs['titles refs']

    def preprocess_dom(self, dom):
        """Last chance to modify the dom, before the rules are applied."""
        return dom

    def parse_dom(self, dom):
        """Parse the given dom according to the rules specified
        in self.rules."""
        return Rules(self.rules).extract(dom)

    def postprocess_data(self, data):
        """Here we can modify the data."""
        return data

    def set_objects_params(self, data):
        """Set parameters of Movie/Person/... instances, since they are
        not always set in the parser's code."""
        for obj in flatten(data, yieldDictKeys=True, scalar=_Container):
            obj.accessSystem = self._as
            obj.modFunct = self._modFunct

    def add_refs(self, data):
        """Modify data according to the expected output.

        When getRefs is set, occurrences of known titles/names inside
        string values of `data` are marked up (via _putRefs) with the
        '(qv)' reference syntax.  Always returns the standard
        {'data', 'titlesRefs', 'namesRefs'} envelope.
        """
        if self.getRefs:
            # Build one alternation regex over every known title; an empty
            # pattern r'()' means there is nothing to mark up.
            titl_re = r'(%s)' % '|'.join(
                [re.escape(x) for x in list(self._titlesRefs.keys())])
            if titl_re != r'()':
                re_titles = re.compile(titl_re, re.U)
            else:
                re_titles = None
            nam_re = r'(%s)' % '|'.join(
                [re.escape(x) for x in list(self._namesRefs.keys())])
            if nam_re != r'()':
                re_names = re.compile(nam_re, re.U)
            else:
                re_names = None
            _putRefs(data, re_titles, re_names)
        return {
            'data': data,
            'titlesRefs': self._titlesRefs,
            'namesRefs': self._namesRefs
        }
from contextlib import contextmanager @contextmanager def redirect_stdout(new_stdout): """Context manager for temporarily redirecting stdout.""" old_stdout, sys.stdout = sys.stdout, new_stdout try: yield new_stdout finally: sys.stdout = old_stdout else: from contextlib import redirect_stdout from imdb.parser.http.logging import logger _logger = logger.getChild('piculet') ########################################################### # HTML OPERATIONS ########################################################### # TODO: this is too fragile _CHARSET_TAGS = [ b'<meta http-equiv="content-type" content="text/html; charset=', b'<meta charset="' ] def decode_html(content, charset=None, fallback_charset='utf-8'):