def get_character_biography(self, characterID):
    """Retrieve a character's 'bio' page and extract its textual sections.

    The page is fetched through ``self._mretrieve`` using the
    ``'character_main'`` URL template with ``'bio'`` appended.

    Returns a dict ``{'data': info}``.  ``info`` gets an
    ``'introduction'`` string when a non-empty intro is found, and a
    ``'biography'`` list with one cleaned string per ``<h4>`` section
    that is non-empty after cleaning.
    """
    url = self.urls['character_main'] % characterID
    page = self._mretrieve(url + 'bio')
    info = {}

    # Introduction: text following the "display" div, up to the first
    # <span> or <h4>; only one match is requested.
    intro_hits = _findBetween(page, '<div class="display">',
                              ('<span>', '<h4>'), maxRes=1)
    if intro_hits:
        intro_text = _unHtml(intro_hits[0]).strip()
        if intro_text:
            info['introduction'] = intro_text

    # Drop everything before the table-of-contents marker, when present.
    # NOTE(review): str.find matches this literally, including the two
    # trailing dots -- confirm that is what the page markup contains.
    toc_pos = page.find('<table id="toc..')
    if toc_pos >= 0:
        page = page[toc_pos:]

    sections = _findBetween(page, '<h4>', ('<h4>', '</div>'))
    entries = []
    for raw in sections or ():
        # Mark the end of the heading with '::' before the tags are
        # stripped; original newlines become spaces, <br> tags newlines.
        text = raw.replace('</h4>', '::').replace('\n', ' ')
        text = text.replace('<br>', '\n').replace('<br/>', '\n')
        text = subSGMLRefs(re_unhtmlsub('', text).strip())
        # Tighten spaces around every '::', then turn only the first
        # marker into a ': ' separator.
        text = text.replace(' ::', '::').replace(':: ', '::')
        text = text.replace('::', ': ', 1)
        if text:
            entries.append(text)
    if entries:
        info['biography'] = entries
    return {'data': info}
def get_character_biography(self, characterID):
    """Retrieve a character's 'bio' page and extract its textual sections.

    The page is fetched through ``self._mretrieve`` using the
    ``imdbURL_character_main`` template with ``'bio'`` appended.

    Returns a dict ``{'data': info}``.  ``info`` gets an
    ``'introduction'`` string when a non-empty intro is found, and a
    ``'biography'`` list with one cleaned string per ``<h4>`` section
    that is non-empty after cleaning.
    """
    page = self._mretrieve(imdbURL_character_main % characterID + 'bio')
    info = {}

    # Introduction: text following the "display" div, up to the first
    # <span> or <h4>; only one match is requested.
    intro_hits = _findBetween(page, '<div class="display">',
                              ('<span>', '<h4>'), maxRes=1)
    if intro_hits:
        intro_text = _unHtml(intro_hits[0]).strip()
        if intro_text:
            info['introduction'] = intro_text

    # Narrow the search to the span between the "display" and "history"
    # divs, then split the first such span into <h4>-delimited sections.
    display_area = _findBetween(page, '<div class="display">',
                                '<div class="history">')
    sections = None
    if display_area:
        sections = _findBetween(display_area[0], '<h4>', ('<h4>', '</div>'))

    entries = []
    for raw in sections or ():
        # Mark the end of the heading with '::' before the tags are
        # stripped; original newlines become spaces, <br> tags newlines.
        text = raw.replace('</h4>', '::')
        text = text.replace('\n', ' ')
        for line_break in ('<br>', '<br/>'):
            text = text.replace(line_break, '\n')
        text = subSGMLRefs(re_unhtmlsub('', text).strip())
        # Tighten spaces around every '::', then turn only the first
        # marker into a ': ' separator.
        text = text.replace(' ::', '::').replace(':: ', '::')
        text = text.replace('::', ': ', 1)
        if not text:
            continue
        entries.append(text)
    if entries:
        info['biography'] = entries
    return {'data': info}
def get_character_biography(self, characterID):
    """Download the character 'bio' page and pull out intro and biographies.

    The URL is built from ``self.urls["character_main"]`` formatted with
    *characterID*, followed by ``"bio"``, and fetched with
    ``self._mretrieve``.

    Returns ``{"data": result}`` where ``result`` may contain:
      * ``"introduction"`` -- the cleaned intro text, if non-empty;
      * ``"biography"`` -- a list of cleaned ``<h4>`` sections, if any
        is non-empty.
    """
    result = {}
    html = self._mretrieve(self.urls["character_main"] % characterID + "bio")

    # At most one intro match is requested (maxRes=1).
    intro_matches = _findBetween(
        html, '<div class="display">', ("<span>", "<h4>"), maxRes=1
    )
    if intro_matches:
        introduction = _unHtml(intro_matches[0]).strip()
        if introduction:
            result["introduction"] = introduction

    # Skip ahead to the table-of-contents marker when it is found.
    # NOTE(review): the search string is matched literally by str.find,
    # trailing ".." included -- verify against the real page markup.
    toc_start = html.find('<table id="toc..')
    if toc_start != -1:
        html = html[toc_start:]

    chunks = _findBetween(html, "<h4>", ("<h4>", "</div>"))
    if chunks:
        for chunk in chunks:
            # '::' temporarily marks where the heading's </h4> was, so
            # the position survives the tag stripping below.
            marked = chunk.replace("</h4>", "::").replace("\n", " ")
            marked = marked.replace("<br>", "\n").replace("<br/>", "\n")
            stripped = re_unhtmlsub("", marked).strip()
            cleaned = subSGMLRefs(stripped)
            cleaned = cleaned.replace(" ::", "::").replace(":: ", "::")
            # Only the first marker becomes the ": " separator.
            cleaned = cleaned.replace("::", ": ", 1)
            if cleaned:
                result.setdefault("biography", []).append(cleaned)
    return {"data": result}
def _unHtml(s):
    """Return *s* with tags removed and runs of spaces collapsed.

    The steps, in order: ``re_unhtmlsub`` replaces its matches with the
    empty string, ``re_spacessub`` replaces its matches with a single
    space, the result is stripped of surrounding whitespace, and
    ``subSGMLRefs`` is applied last (presumably resolving SGML/HTML
    references -- see its definition).
    """
    without_tags = re_unhtmlsub('', s)
    single_spaced = re_spacessub(' ', without_tags)
    trimmed = single_spaced.strip()
    return subSGMLRefs(trimmed)