def build_collection_index(movies, tvshows):
    """Build a search index over collection ("set") names.

    Only entities that carry a non-empty ``'set'`` entry take part. Each
    collection name is accent-stripped and passed through
    ``parse_collection`` before being used as a term.

    :param movies: iterable of movie dicts.
    :param tvshows: iterable of TV show dicts.
    :return: tuple ``(index, mapped_entities)`` where ``index`` is an
        ``NGram`` built over the distinct collection terms (keyed on their
        lowercase form) and ``mapped_entities`` maps each term to the list
        of entities belonging to it.
    """
    start = time.time()
    entities = list(itertools.chain(movies, tvshows))

    # Single pass with a single guard: the term list and the lookup table
    # are derived from the same condition, so they cannot disagree, and a
    # falsy 'set' (including None) is skipped rather than raising on len().
    mapped_entities = {}
    for entity in entities:
        if 'set' not in entity or not entity['set']:
            continue
        value = parse_collection(strip_accents(entity['set']))
        mapped_entities.setdefault(value, []).append(entity)

    # The distinct terms are exactly the keys of the lookup table.
    values = list(mapped_entities)
    logger.debug(u'Iterating collection took {} ms'.format(int((time.time() - start) * 1000)))

    start = time.time()
    index = NGram(items=values, key=lambda x: x.lower())
    logger.debug(u'Building collection index took {} ms'.format(int((time.time() - start) * 1000)))
    return index, mapped_entities
def build_title_index(movies, tvshows):
    """Build a search index over titles plus a title-to-entities lookup.

    :param movies: iterable of movie dicts, each with a ``'title'`` entry.
    :param tvshows: iterable of TV show dicts, each with a ``'title'`` entry.
    :return: tuple ``(index, mapped_entities)``; ``index`` is an ``NGram``
        over the accent-stripped titles (keyed on their lowercase form) and
        ``mapped_entities`` maps each accent-stripped title to the list of
        entities carrying it.
    """
    began = time.time()
    entities = list(itertools.chain(movies, tvshows))
    values = [strip_accents(item['title']) for item in entities]

    # Group entities that share the same accent-stripped title.
    mapped_entities = {}
    for item in entities:
        mapped_entities.setdefault(strip_accents(item['title']), []).append(item)
    elapsed_ms = int((time.time() - began) * 1000)
    logger.debug(u'Iterating title took {} ms'.format(elapsed_ms))

    began = time.time()
    index = NGram(items=values, key=lambda term: term.lower())
    elapsed_ms = int((time.time() - began) * 1000)
    logger.debug(u'Building title index took {} ms'.format(elapsed_ms))
    return index, mapped_entities
def build_cast_index(movies, tvshows, key):
    """Build a search index over one field of every cast entry.

    :param movies: iterable of movie dicts, each with a ``'cast'`` list.
    :param tvshows: iterable of TV show dicts, each with a ``'cast'`` list.
    :param key: name of the cast-entry field to index; it is also used in
        the debug log messages.
    :return: tuple ``(index, mapped_entities)``; ``index`` is an ``NGram``
        over the distinct accent-stripped field values (keyed on their
        lowercase form) and ``mapped_entities`` maps each value to the list
        of entities whose cast contains it.
    """
    began = time.time()
    entities = list(itertools.chain(movies, tvshows))

    # Distinct accent-stripped values across every cast entry.
    distinct = {
        strip_accents(member[key])
        for item in entities
        for member in item['cast']
    }
    values = list(distinct)

    mapped_entities = {}
    for item in entities:
        for member in item['cast']:
            mapped_entities.setdefault(strip_accents(member[key]), []).append(item)
    elapsed_ms = int((time.time() - began) * 1000)
    logger.debug(u'Iterating {} took {} ms'.format(key, elapsed_ms))

    began = time.time()
    index = NGram(items=values, key=lambda term: term.lower())
    elapsed_ms = int((time.time() - began) * 1000)
    logger.debug(u'Building {} index took {} ms'.format(key, elapsed_ms))
    return index, mapped_entities
def _find_by(self, filter_value, value_type):
    """Look up entities whose indexed value is similar to ``filter_value``.

    :param filter_value: text to search for; it is accent-stripped and
        lowercased before being handed to the index.
    :param value_type: key into ``self.compose_index`` selecting the index
        (``'ix'``), lookup table (``'map'``) and score cut-off
        (``'threshold'``) to use.
    :return: flat list of ``(entity, score)`` tuples, one per entity mapped
        to each hit whose score is strictly above the threshold.
    """
    settings = self.compose_index[value_type]
    index = settings['ix']
    value_map = settings['map']
    threshold = settings['threshold']

    query = strip_accents(filter_value).lower()
    hits = [(term, score) for term, score in index.search(query) if score > threshold]
    logger.debug(hits)

    # Resolve every hit to its entity list first, then fan each list out
    # so that every entity is paired with the score of the term it matched.
    resolved = [(value_map[term], score) for term, score in hits]
    return [(entity, score) for group, score in resolved for entity in group]
def test_library_index(self):
    """An exact (accent-stripped, lowercased) title query scores 1.0 as the top hit."""
    titles = [
        "Padre no hay más que uno",
        "Élite",
        "Pequeñas mentirosas",
        "Capitán América: El primer vengador",
        "Alita: Ángel del combate",
        "Animales fantásticos y dónde encontrarlos",
        "Animales fantásticos: Los crímenes de Grindelwald",
        "Cafarnaúm",
        "El Camino: Una película de Breaking Bad",
        "Cómo entrenar a tu dragón",
    ]
    movies = [{'title': title} for title in titles]
    index, _ = build_title_index(movies, [])

    query = strip_accents("Capitán América: El primer vengador").lower()
    results = index.search(query)
    # The first result is a (value, score) pair; only its score is checked.
    top_score = results[0][1]
    self.assertEqual(top_score, 1.0)
def build_genre_index(movies, tvshows):
    """Build a search index over genre names plus a genre-to-entities lookup.

    :param movies: iterable of movie dicts, each with a ``'genre'`` list.
    :param tvshows: iterable of TV show dicts, each with a ``'genre'`` list.
    :return: tuple ``(index, mapped_entities)``; ``index`` is an ``NGram``
        over the distinct accent-stripped genre names (keyed on their
        lowercase form) and ``mapped_entities`` maps each accent-stripped
        genre name to the list of entities tagged with it.
    """
    start = time.time()
    entities = list(itertools.chain(movies, tvshows))

    mapped_entities = {}
    for entity in entities:
        for genre in entity['genre']:
            mapped_entities.setdefault(strip_accents(genre), []).append(entity)

    # The index must hold the *stripped* names so that every term it returns
    # is a key of mapped_entities. The previous filter(strip_accents, ...)
    # only used strip_accents as a predicate and kept the original accented
    # names, which are not keys of the lookup table. Genres that strip to an
    # empty string stay out of the index, as they did before.
    values = [genre for genre in mapped_entities if genre]
    logger.debug(u'Iterating genre took {} ms'.format(int((time.time() - start) * 1000)))

    start = time.time()
    index = NGram(items=values, key=lambda x: x.lower())
    logger.debug(u'Building genre index took {} ms'.format(int((time.time() - start) * 1000)))
    return index, mapped_entities
def test_strip_accent(self):
    """Check strip_accents against a table of (expected, raw) title pairs."""
    cases = (
        ("Padre no hay mas que uno", "Padre no hay más que uno"),
        ("Elite", "Élite"),
        ("Pequenas mentirosas", "Pequeñas mentirosas"),
        ("Capitan America: El primer vengador",
         "Capitán América: El primer vengador"),
        ("Aladdin", "Aladdín"),
        ("Alita: Angel del combate", "Alita: Ángel del combate"),
        ("Animales fantasticos y donde encontrarlos",
         "Animales fantásticos y dónde encontrarlos"),
        ("Animales fantasticos: Los crimenes de Grindelwald",
         "Animales fantásticos: Los crímenes de Grindelwald"),
        ("Cafarnaum", "Cafarnaúm"),
        ("El Camino: Una pelicula de Breaking Bad",
         "El Camino: Una película de Breaking Bad"),
        ("Como entrenar a tu dragon", "Cómo entrenar a tu dragón"),
        ("Erase una vez en Hollywood", "Érase una vez en… Hollywood"),
    )
    # Cases run in order and stop at the first mismatch.
    for expected, raw in cases:
        self.assertEqual(expected, strip_accents(raw))