def _parse_dump(self, filename):
    """Stream a gzipped XML catalog dump and enqueue an Entity per usable title.

    The file is parsed incrementally with ``etree.iterparse``; every completed
    ``catalog_title`` element is converted into an ``Entity`` and handed to
    ``self._output.put``.  A title is skipped when:

    * it has no ``average_rating`` element,
    * its numeric id was already seen as a link under some earlier title's
      ``bonus_materials`` element, or its title text contains
      ``'bonus material'``,
    * none of its language categories is labelled ``english``.

    Any exception raised while converting a single title is reported through
    ``utils.printException`` and that title is dropped; parsing continues with
    the next one.

    Args:
        filename: path of the gzip-compressed XML dump.
    """
    nid_re = re.compile(r'.*\/([0-9]*)$')
    language_re = re.compile(r'.*\/languages$')
    genre_re = re.compile(r'.*\/genres$')
    ratings_re = re.compile(r'.*\/mpaa_ratings$')

    # ids of titles referenced as bonus material by an earlier title
    bonus_materials = set()

    def labels_for(scheme_re, categories):
        """Return the labels of the categories whose scheme matches scheme_re."""
        return [c.get('label') for c in categories
                if scheme_re.match(c.get('scheme')) is not None]

    def build_entity(elem):
        """Return an Entity built from elem, or None if the title is skipped."""
        rating_elem = elem.find('average_rating')
        if rating_elem is None:
            return None

        entity = Entity()
        nid = int(nid_re.match(elem.find('id').text).group(1))

        bonus_materials_elem = elem.find('.//bonus_materials')
        if bonus_materials_elem is not None:
            for link in bonus_materials_elem.findall('link'):
                href = link.get('href')
                bonus_materials.add(int(nid_re.match(href).group(1)))

        if nid in bonus_materials:
            return None

        title = elem.find('title').get('regular')
        if 'bonus material' in title.lower():
            return None

        entity.title = title
        entity.nid = nid
        # NOTE: a missing synopsis / box_art / languages_and_audio element
        # makes find() return None, so the attribute access raises and the
        # caller's handler drops the title.
        entity.desc = elem.find('.//synopsis').text
        entity.nrating = float(rating_elem.text)

        categories = elem.findall('category')
        genres = labels_for(genre_re, categories)
        entity.ngenres = genres

        if any('tv' in genre.lower() for genre in genres):
            entity.subcategory = 'tv'
        else:
            entity.subcategory = 'movie'

        ratings = labels_for(ratings_re, categories)
        if len(ratings) == 1:
            entity.mpaa_rating = ratings[0]

        images = elem.find('.//box_art').findall('link')
        if len(images) in (3, 4):
            entity.tiny = images[0].get('href')
            entity.small = images[1].get('href')
            entity.large = images[2].get('href')

            if len(images) == 4:
                entity.hd = images[3].get('href')

        links = [l for l in elem.findall('link')
                 if l.get('title') == 'web page']
        if len(links) == 1:
            entity.nurl = links[0].get('href')

        language_elem = elem.find('.//languages_and_audio')
        language_labels = labels_for(language_re,
                                     language_elem.findall('.//category'))

        release_year_elem = elem.find('release_year')
        if release_year_elem is not None:
            entity.original_release_date = release_year_elem.text

        duration = elem.find('runtime')
        if duration is not None:
            entity.track_length = duration.text

        languages = set(label.lower() for label in language_labels)
        if 'english' not in languages:
            return None

        return entity

    f = gzip.open(filename, 'rb')
    try:
        context = iter(etree.iterparse(f, events=("start", "end")))

        # the first event delivers the document root, which is cleared
        # periodically below so already-processed titles can be freed
        _, root = next(context)
        count = 0

        # loop through each XML catalog_title element and parse it as a movie Entity
        for event, elem in context:
            if event != "end" or elem.tag != "catalog_title":
                continue

            root.clear()

            try:
                entity = build_entity(elem)
                if entity is not None:
                    self._output.put(entity)
                    count += 1

                    # give the downstream consumer threads an occasional chance to work
                    if count % 512 == 0:
                        time.sleep(0.1)
            except Exception:
                utils.printException()
                # the failure may be the missing title itself, so do not
                # assume it can be looked up here
                title_elem = elem.find('title')
                if title_elem is not None:
                    utils.log(title_elem.get('regular'))
            finally:
                # release the element on every path, including skips and errors
                elem.clear()
    finally:
        f.close()