def _extract_id_from_html(cls, lines, ident):
    """Search HTML lines for the person id belonging to ``ident``.

    The search pattern is ``cls._person_id_string`` filled in with
    ``rname(clean_name(ident))`` and an optional pattern for the suffix
    returned by ``extract_name_suffix(ident)``.  Every line is
    HTML-unescaped and searched case-insensitively; the first line that
    matches decides the result.

    :param lines: iterable of HTML text lines to search.
    :param ident: person identifier the name and suffix are derived from.
    :returns: ``int(match.group(1))`` for the first matching line, or
        ``None`` when no line matches.
    """
    parser = HTMLParser()

    suffix = extract_name_suffix(ident)
    if suffix:
        # The suffix is matched literally and is optional.  The group must be
        # non-capturing ("(?:...)", not "(:?...)") so that it neither demands
        # an optional colon nor adds a capture group to the pattern whose
        # group(1) is read below.
        suffix_pattern = r'(?:%s)?\s*' % re.escape(suffix)
    else:
        # Never interpolate a falsy suffix (e.g. None) into the pattern as
        # text; an absent suffix contributes nothing to the regex.
        suffix_pattern = ''

    # NOTE(review): the name is interpolated unescaped, as before; whether
    # rname()/clean_name() can return regex metacharacters is not visible
    # from here -- confirm before adding re.escape() to it.
    person_id_string = cls._person_id_string % (
        rname(clean_name(ident)),
        suffix_pattern,
    )
    re_person_id = re.compile(person_id_string, re.I)

    for raw_line in lines:
        id_match = re_person_id.search(parser.unescape(raw_line))
        if id_match:
            return int(id_match.group(1))
    return None
def produce_persons(cls, role_type, idents_only=False, sans_roles=False):
    """Generate persons parsed from the list file configured for ``role_type``.

    The file path is read from ``config.imdb['<role_type>_path']``.  All
    lines up to and including the ``----\\t\\t\\t------`` marker are skipped.
    After that, each person is a run of non-empty lines terminated by an
    empty line: the first line holds ``<person ident><tabs><role>``, and
    every following line holds one more role.  A line starting with
    ``---------`` ends the data.

    :param role_type: key selecting the input file and passed on to
        ``cls._parse_role_ident``.
    :param idents_only: when true, yield only the raw person ident strings.
    :param sans_roles: when true (and ``idents_only`` is false), remove the
        ``'roles'`` key from each yielded dict.
    :returns: a generator.  Unless ``idents_only`` is set, items are dicts
        with ``'name'``, ``'id'`` (``None`` if the ident is not in
        ``cls.person_ident_to_id``), ``'href'`` and, unless ``sans_roles``,
        ``'roles'``.  Persons that end up without any role are not yielded.
    """
    re_person_start = re.compile('^----\t\t\t------$')
    re_person_name = re.compile('^(.*?)\t+(.*)$')

    def new_person():
        # Fresh record for the person currently being accumulated.
        return {'name': None, 'id': None, 'roles': []}

    type_path = config.imdb['%s_path' % role_type]
    log.info('Loading roles for "%s" from %s' % (role_type, type_path))

    # "with" closes the file even if the consumer abandons the generator
    # early or an exception is raised while parsing.
    with open(type_path, 'r') as f:
        # Skip the preamble.  readline() returns '' at end of file, so stop
        # there rather than looping forever when the marker line is missing.
        for header_line in iter(f.readline, ''):
            if re_person_start.match(header_line):
                break
        else:
            log.info('No person list header found in %s' % type_path)
            return

        person_ident, person = None, new_person()
        for raw_line in f:
            line = raw_line.strip().decode('latin_1')

            if line.startswith('---------'):
                log.info('End of File, done importing %s' % role_type)
                break

            if not person_ident:
                # Expecting the first line of a person: "<ident>\t+<role>".
                name_match = re_person_name.match(line)
                if not name_match:
                    # Not a name line (e.g. an extra blank line): there is
                    # no role text to parse, so do not fall through with a
                    # stale or undefined role_ident.
                    continue
                person_ident = name_match.group(1)
                person['name'] = rname(clean_name(person_ident))
                role_ident = name_match.group(2)
            elif not line:
                # An empty line terminates the current person.
                if person_ident and person['roles']:
                    if idents_only:
                        yield person_ident
                    else:
                        person['id'] = cls.person_ident_to_id.get(person_ident)
                        person['href'] = cls._person_href(
                            person['id'], ident=person_ident)
                        if sans_roles:
                            del person['roles']
                        yield person
                # Reset even when nothing was yielded, so the next line is
                # read as a new person's name line.
                person_ident, person = None, new_person()
                continue
            else:
                # Continuation line: one more role for the current person.
                role_ident = line

            role = cls._parse_role_ident(role_ident, role_type)
            # NOTE(review): only roles whose title is known are kept; this is
            # what allows a person to reach the blank line with no roles.
            # The nesting was reconstructed from whitespace-collapsed source
            # -- confirm against the original layout.
            if role['title_ident'] in cls.title_ident_to_id:
                role['title_id'] = cls.title_ident_to_id[role['title_ident']]
                del role['title_ident']
                person['roles'].append(role)