示例#1
0
文件: imdb.py 项目: mrowl/filmdata
 def _extract_id_from_html(cls, lines, ident):
     """Return the numeric id found for ``ident`` in ``lines``, or None.

     A case-insensitive regex is built by %-formatting
     ``cls._person_id_string`` with two values: the cleaned, rearranged
     name derived from ``ident`` and an optional pattern for the name
     suffix. Each line is HTML-unescaped before being searched.

     Args:
         lines: iterable of HTML text lines to search.
         ident: person identifier string; any name suffix is obtained via
             ``extract_name_suffix``.

     Returns:
         ``int`` of the first capture group of the first matching line,
         or ``None`` when no line matches.
     """
     h = HTMLParser()
     suffix = extract_name_suffix(ident)
     if suffix:
         # re.escape neutralises every regex metacharacter in the suffix,
         # not just the first pair of parentheses.
         # NOTE(review): '(:?' is a capturing group with an optional leading
         # colon; '(?:' (non-capturing) may have been intended. Left as-is
         # because group numbering depends on cls._person_id_string, which
         # is not visible here.
         suffix = r'(:?%s)?\s*' % re.escape(suffix)
     else:
         # A falsy suffix (e.g. None) must contribute nothing to the pattern
         # rather than being interpolated as literal text.
         suffix = ''
     person_id_string = cls._person_id_string % (rname(clean_name(ident)),
                                                 suffix)
     re_person_id = re.compile(person_id_string, re.I)
     for line in imap(h.unescape, lines):
         id_match = re_person_id.search(line)
         if id_match:
             return int(id_match.group(1))
     return None
示例#2
0
文件: imdb.py 项目: mrowl/filmdata
    def produce_persons(cls, role_type, idents_only=False, sans_roles=False):
        """Generate persons parsed from the list file for ``role_type``.

        The file path is read from ``config.imdb['<role_type>_path']``.
        Everything up to and including the ``----\\t\\t\\t------`` marker
        line is skipped. After that, a person record starts with a line of
        the form ``<ident><tabs><role>``, continues with one role per line,
        and ends at an empty line. A line starting with nine dashes ends
        the listing.

        Only roles whose ``title_ident`` is present in
        ``cls.title_ident_to_id`` are kept, and only persons with at least
        one kept role are yielded.

        Args:
            role_type: role name used to look up the file path and passed
                to ``cls._parse_role_ident``.
            idents_only: when true, yield just the raw person ident string.
            sans_roles: when true (and ``idents_only`` is false), remove the
                ``'roles'`` key from each yielded dict.

        Yields:
            Either the person ident string, or a dict with ``'name'``,
            ``'id'``, ``'href'`` and (unless ``sans_roles``) ``'roles'``.
        """
        re_person_start = re.compile('^----\t\t\t------$')
        re_person_name = re.compile('^(.*?)\t+(.*)$')

        type_path = config.imdb['%s_path' % role_type]
        log.info('Loading roles for "%s" from %s' % (role_type, type_path))

        person_new = lambda: { 'name' : None, 'id' : None, 'roles' : [] }

        # The context manager closes the file even when the consumer stops
        # iterating this generator before it is exhausted.
        with open(type_path, 'r') as f:
            # Skip the preamble. readline() returns '' at EOF, so stop there
            # instead of looping forever when the marker line is missing.
            for header in iter(f.readline, ''):
                if re_person_start.match(header):
                    break
            else:
                log.info('No person list header found in %s' % type_path)
                return

            person_ident, person = None, person_new()
            for line in imap(lambda l: l.strip().decode('latin_1'), f):
                if line[:9] == '---------':
                    log.info('End of File, done importing %s' % role_type)
                    break

                if not person_ident:
                    name_match = re_person_name.match(line)
                    if not name_match:
                        # Not a name line (e.g. an extra blank line): there
                        # is no role to parse, so do not fall through with a
                        # stale or unbound role_ident.
                        continue
                    person_ident = name_match.group(1)
                    person['name'] = rname(clean_name(person_ident))
                    role_ident = name_match.group(2)
                elif not line:
                    # Blank line: the current person record is complete.
                    if person_ident and person['roles']:
                        if idents_only:
                            yield person_ident
                        else:
                            person['id'] = cls.person_ident_to_id.get(person_ident)
                            person['href'] = cls._person_href(person['id'],
                                                              ident=person_ident)
                            if sans_roles:
                                del person['roles']
                            yield person
                    person_ident, person = None, person_new()
                    continue
                else:
                    role_ident = line

                role = cls._parse_role_ident(role_ident, role_type)
                if role['title_ident'] in cls.title_ident_to_id:
                    # Replace the textual title ident with its numeric id.
                    role['title_id'] = cls.title_ident_to_id[role['title_ident']]
                    del role['title_ident']
                    person['roles'].append(role)