def _extract_from_master_node(node, relationships_set):
    """Build a ``DiscogsMasterEntity`` from a Discogs master XML node.

    Also records ``(master id, artist id)`` pairs into *relationships_set*.

    :param node: the ``<master>`` XML element
    :param relationships_set: a set collecting master-artist relationships
    :return: the populated entity
    """
    entity = DiscogsMasterEntity()
    entity.catalog_id = node.attrib['id']
    genre_tokens = set()
    for element in node:
        tag = element.tag
        if tag == 'main_release':
            entity.main_release_id = element.text
        elif tag in ('genres', 'styles'):
            # Genres and styles are folded into a single bag of tokens
            for item in element:
                genre_tokens.update(text_utils.tokenize(item.text))
        elif tag == 'title':
            entity.name = element.text
            entity.name_tokens = ' '.join(text_utils.tokenize(element.text))
        elif tag == 'data_quality':
            entity.data_quality = element.text.lower()
        elif tag == 'year':
            try:
                entity.born = date(year=int(element.text), month=1, day=1)
                entity.born_precision = 9  # year-only precision
            except ValueError:
                LOGGER.debug(
                    'Master with id %s has an invalid year: %s',
                    entity.catalog_id,
                    element.text,
                )
        elif tag == 'artists':
            for artist in element:
                relationships_set.add(
                    (entity.catalog_id, artist.find('id').text))
    entity.genres = ' '.join(genre_tokens)
    return entity
def _denormalize_name_variation_entities(self, main_entity: DiscogsArtistEntity, name_variation_nodes):
    """Yield one entity per name variation, cloning base data from *main_entity*.

    Empty ``<name>`` tags are skipped with a debug log.

    :param main_entity: the artist entity whose data is replicated
    :param name_variation_nodes: the ``<name>`` XML nodes inside ``<namevariations>``
    """
    entity_class = type(main_entity)
    # Decide once which counter the loop will bump
    is_musician = 'Musician' in entity_class.__name__
    for variation_node in name_variation_nodes:
        label = variation_node.text
        if not label:
            LOGGER.debug(
                'Artist %s: skipping empty <name> tag in <namevariations>',
                main_entity.catalog_id,
            )
            continue
        variation = entity_class()
        variation.catalog_id = main_entity.catalog_id
        variation.name = label
        tokens = text_utils.tokenize(label)
        if tokens:
            variation.name_tokens = ' '.join(tokens)
        variation.real_name = main_entity.real_name
        variation.data_quality = main_entity.data_quality
        self.total_entities += 1
        if is_musician:
            self.musicians += 1
        else:
            self.bands += 1
        yield variation
def _alias_entities(entity: BaseEntity, aliases_class, aliases: list):
    """Yield one alias entity per alias label, replicating the vital data
    of the given main *entity*.

    Fixed: the *aliases* parameter was annotated with a list *instance*
    (``aliases: []``), which is not a valid type annotation; it is now
    annotated with the ``list`` type.

    :param entity: the main entity whose dates and places are copied
    :param aliases_class: the entity class to instantiate for each alias
    :param aliases: the alias labels
    """
    for alias_label in aliases:
        alias_entity = aliases_class()
        alias_entity.catalog_id = entity.catalog_id
        # Replicate vital data from the main entity
        alias_entity.born = entity.born
        alias_entity.born_precision = entity.born_precision
        alias_entity.died = entity.died
        alias_entity.died_precision = entity.died_precision
        alias_entity.birth_place = entity.birth_place
        alias_entity.death_place = entity.death_place
        alias_entity.name = alias_label
        name_tokens = text_utils.tokenize(alias_label)
        if name_tokens:
            alias_entity.name_tokens = ' '.join(name_tokens)
        yield alias_entity
def _populate_nlp_entity(self, entity_array, infos: dict, entity_class):
    """Build an NLP entity from the artist profile and append it to *entity_array*.

    Artists with an empty profile are skipped with a debug log.

    :param entity_array: an external list collecting populated entities
    :param infos: the parsed artist data
    :param entity_class: the NLP entity class to instantiate
    """
    profile = infos.get('profile')
    # Guard clause: nothing to do without a profile
    if not profile:
        LOGGER.debug('Artist %s has an empty <profile/> tag', infos['identifier'])
        return
    nlp_entity = entity_class()
    nlp_entity.catalog_id = infos['identifier']
    nlp_entity.description = profile
    tokens = text_utils.tokenize(profile)
    if tokens:
        nlp_entity.description_tokens = ' '.join(tokens)
    entity_array.append(nlp_entity)
    self.total_entities += 1
    if 'Musician' in entity_class.__name__:
        self.musician_nlp += 1
    else:
        self.band_nlp += 1
def _fill_entity(self, entity, info, areas):
    """Populate *entity* with the label, tokens, dates, and places found
    in the *info* record.

    :param entity: the entity to populate
    :param info: the raw record (keys: ``gid``, ``label``, date and place fields)
    :param areas: a lookup of area identifiers to place names
    """
    entity.catalog_id = info['gid']
    label = info['label']
    entity.name = label
    entity.tokens = " ".join(text_utils.tokenize(label))
    # _get_date_and_precision returns a (date, precision) pair
    entity.born, entity.born_precision = self._get_date_and_precision(
        info['b_year'], info['b_month'], info['b_day'])
    entity.died, entity.died_precision = self._get_date_and_precision(
        info['d_year'], info['d_month'], info['d_day'])
    # Places fall back to None when the area (or the key) is unknown
    for attribute, key in (('birth_place', 'b_place'),
                           ('death_place', 'd_place')):
        try:
            setattr(entity, attribute, areas[info[key]])
        except KeyError:
            setattr(entity, attribute, None)
def similar_name_match(source, target, tokenize) -> dict:
    """Given a ``{person_name: identifier}`` dictionary, a ``BaseEntity``,
    and a tokenization function, match similar names and return a
    ``{source_id: [target_ids]}`` dataset.

    This strategy only applies to people names.

    Fixed: removed the ``to_exclude`` set, which was populated on every
    iteration but never read (dead code); de-duplication of target ids is
    already performed via ``set()`` below. Also repaired the garbled
    docstring and a typo in a comment.

    :param source: a ``{person_name: identifier}`` dictionary
    :param target: the target ``BaseEntity`` to search in
    :param tokenize: a tokenization function returning a set of tokens
    :return: the matched ``{source_id: [target_ids]}`` pairs
    """
    matches = defaultdict(list)
    for label, qid in source.items():
        if not label:
            continue
        tokenized = tokenize(label)
        # NOTICE: sets of size 1 are always excluded
        if len(tokenized) <= 1:
            continue
        # Looks for sets equal or bigger containing our tokens
        for res in data_gathering.tokens_fulltext_search(
                target, True, tokenized):
            matches[qid].append(res.catalog_id)
        # Looks for sets contained in our set of tokens
        for res in data_gathering.tokens_fulltext_search(
                target, False, tokenized):
            res_tokenized = text_utils.tokenize(res.tokens)
            if len(res_tokenized) > 1 and res_tokenized.issubset(tokenized):
                matches[qid].append(res.catalog_id)
        # De-duplicate target ids; drop the key when nothing matched
        # (the defaultdict access above created an empty entry)
        if matches[qid]:
            matches[qid] = list(set(matches[qid]))
        else:
            del matches[qid]
    return matches
def _fill_entity(entity: DiscogsArtistEntity, infos):
    """Copy the base Discogs artist fields from *infos* onto *entity*.

    Optional fields (``realname``, ``data_quality``) are logged at debug
    level when empty.

    :param entity: the artist entity to populate
    :param infos: the parsed artist data
    """
    catalog_id = infos['identifier']
    entity.catalog_id = catalog_id
    entity.name = infos['name']
    tokens = text_utils.tokenize(infos['name'])
    if tokens:
        entity.name_tokens = ' '.join(tokens)
    # Real name (optional)
    if infos['realname']:
        entity.real_name = infos['realname']
    else:
        LOGGER.debug('Artist %s has an empty <realname/> tag', catalog_id)
    # Data quality (optional)
    if infos['data_quality']:
        entity.data_quality = infos['data_quality']
    else:
        LOGGER.debug(
            'Artist %s has an empty <data_quality/> tag',
            catalog_id,
        )
def _fill_entity(self, entity, info, areas):
    """Populate *entity* from the *info* record, resolving birth and death
    places through the *areas* lookup.

    Missing date keys in *info* leave the corresponding date and precision
    set to ``None``.

    :param entity: the entity to populate
    :param info: the raw record (keys: ``gid``, ``label``, date and place fields)
    :param areas: a lookup of area identifiers to place names
    """
    entity.catalog_id = info['gid']
    entity.name = info['label']
    tokens = text_utils.tokenize(info['label'])
    if tokens:
        entity.name_tokens = ' '.join(tokens)
    # Dates: _get_date_and_precision returns a (date, precision) pair;
    # a missing key means the date is unknown
    for date_attr, prefix in (('born', 'b'), ('died', 'd')):
        try:
            value, precision = self._get_date_and_precision(
                info[prefix + '_year'],
                info[prefix + '_month'],
                info[prefix + '_day'])
        except KeyError:
            value, precision = None, None
        setattr(entity, date_attr, value)
        setattr(entity, date_attr + '_precision', precision)
    # Only artist/band entities carry place attributes
    if isinstance(entity, (MusicbrainzArtistEntity, MusicbrainzBandEntity)):
        for place_attr, key in (('birth_place', 'b_place'),
                                ('death_place', 'd_place')):
            try:
                setattr(entity, place_attr, areas[info[key]])
            except KeyError:
                setattr(entity, place_attr, None)
def extract_and_populate(self, dump_file_paths: List[str], resolve: bool) -> None:
    """Extract the data in the dumps (person and movie) and process them,
    then add the appropriate data to the database.

    See the :ref:`soweego.importer.models.imdb_entity` module for the
    SQLAlchemy definition of the entities used to save IMDb data.

    :param dump_file_paths: the absolute paths of the already
        downloaded dump files.
    :param resolve: NOTE(review): not referenced anywhere in this method
        body — presumably consumed elsewhere; confirm before removing.
    """
    # the order of these files is specified in `self.get_dump_download_urls`
    person_file_path = dump_file_paths[0]
    movies_file_path = dump_file_paths[1]

    LOGGER.debug('Path to movie info dump: %s', movies_file_path)
    LOGGER.debug('Path to person info dump: %s', person_file_path)

    start = datetime.datetime.now()

    # All IMDb tables are dropped and re-created from scratch on each run
    tables = [
        imdb_entity.ImdbActorEntity,
        imdb_entity.ImdbDirectorEntity,
        imdb_entity.ImdbMovieEntity,
        imdb_entity.ImdbMusicianEntity,
        imdb_entity.ImdbProducerEntity,
        imdb_entity.ImdbWriterEntity,
        imdb_entity.ImdbMoviePersonRelationship,
    ]

    db_manager = DBManager()
    LOGGER.info('Connected to database: %s', db_manager.get_engine().url)

    db_manager.drop(tables)
    db_manager.create(tables)

    LOGGER.info(
        'SQL tables dropped and re-created: %s',
        [table.__tablename__ for table in tables],
    )

    LOGGER.info('Starting import of movies ...')

    # Here we open the movie dump file, and add everything to the DB
    for movie_info, entity_array in self._loop_through_entities(
            movies_file_path):

        # create the movie SQLAlchemy entity and populate it
        movie_entity = imdb_entity.ImdbMovieEntity()
        movie_entity.catalog_id = movie_info.get('tconst')
        movie_entity.title_type = movie_info.get('titleType')
        if movie_info.get('primaryTitle') is not None:
            movie_entity.name = movie_info.get('primaryTitle')
            movie_entity.name_tokens = ' '.join(
                text_utils.tokenize(movie_info.get('primaryTitle')))
        # IMDb encodes the adult flag as the string '1'
        movie_entity.is_adult = (True
                                 if movie_info.get('isAdult') == '1'
                                 else False)
        # Start/end years map onto born/died with year-only precision (9);
        # a missing value (None) raises TypeError in int() and is skipped
        try:
            movie_entity.born = datetime.date(year=int(
                movie_info.get('startYear')),
                month=1,
                day=1)
            movie_entity.born_precision = 9
        except (KeyError, TypeError):
            LOGGER.debug('No start year value for %s', movie_entity)

        try:
            movie_entity.died = datetime.date(year=int(
                movie_info.get('endYear')),
                month=1,
                day=1)
            movie_entity.died_precision = 9
        except (KeyError, TypeError):
            LOGGER.debug('No end year value for %s', movie_entity)

        movie_entity.runtime_minutes = movie_info.get('runtimeMinutes')

        if movie_info.get('genres'):  # if movie has a genre specified
            movie_entity.genres = ' '.join(
                text_utils.tokenize(movie_info.get('genres')))

        # Creates entity for alias: the original title becomes a
        # separate entity when it differs from the primary title
        alias = movie_info.get('originalTitle')
        if alias is not None and movie_entity.name != alias:
            alias_entity = copy.deepcopy(movie_entity)
            alias_entity.name = alias
            alias_entity.name_tokens = ' '.join(text_utils.tokenize(alias))
            entity_array.append(alias_entity)

        entity_array.append(movie_entity)
        self.n_movies += 1

    # mark end for movie import process
    end = datetime.datetime.now()
    LOGGER.info(
        'Movie import completed in %s. '
        'Total movies imported: %d',
        end - start,
        self.n_movies,
    )

    LOGGER.info('Starting import of people ...')

    # reset timer for persons import
    start = datetime.datetime.now()

    for person_info, entity_array in self._loop_through_entities(
            person_file_path):

        # IMDb saves the list of professions as a comma separated
        # string
        professions = person_info.get('primaryProfession')

        # if person has no professions then ignore it
        if not professions:
            LOGGER.debug('Person %s has no professions',
                         person_info.get('nconst'))
            continue

        professions = professions.split(',')

        # each person can be added to multiple tables in the DB,
        # each table stands for one of the main professions
        types_of_entities = []

        if 'actor' in professions or 'actress' in professions:
            self.n_actors += 1
            types_of_entities.append(imdb_entity.ImdbActorEntity())

        if 'director' in professions:
            self.n_directors += 1
            types_of_entities.append(imdb_entity.ImdbDirectorEntity())

        if 'producer' in professions:
            self.n_producers += 1
            types_of_entities.append(imdb_entity.ImdbProducerEntity())

        # several sound-related professions all count as "musician"
        if any(prof in [
                'sound_department',
                'composer',
                'music_department',
                'soundtrack',
        ] for prof in professions):
            self.n_musicians += 1
            types_of_entities.append(imdb_entity.ImdbMusicianEntity())

        if 'writer' in professions:
            self.n_writers += 1
            types_of_entities.append(imdb_entity.ImdbWriterEntity())

        # if the only profession a person has is `miscellaneous` then we
        # add it to all tables
        if professions == ['miscellaneous']:
            self.n_misc += 1
            types_of_entities = [
                imdb_entity.ImdbActorEntity(),
                imdb_entity.ImdbDirectorEntity(),
                imdb_entity.ImdbMusicianEntity(),
                imdb_entity.ImdbProducerEntity(),
                imdb_entity.ImdbWriterEntity(),
            ]

        # add person to every matching table
        for etype in types_of_entities:
            self._populate_person(etype, person_info, entity_array)

        # if person is known for any movies then add these to the
        # database as well
        if person_info.get('knownForTitles'):
            self.n_person_movie_links += 1
            self._populate_person_movie_relations(person_info,
                                                  entity_array)

        self.n_persons += 1

    # mark the end time for the person import process
    end = datetime.datetime.now()
    LOGGER.info(
        'Person import completed in %s. '
        'Total people imported: %d - '
        'Actors: %d - Directors: %d - Musicians: %d - '
        'Producers: %d - Writers: %d - Misc: %d',
        end - start,
        self.n_persons,
        self.n_actors,
        self.n_directors,
        self.n_musicians,
        self.n_producers,
        self.n_writers,
        self.n_misc,
    )
def _populate_person(
        self,
        person_entity: imdb_entity.ImdbPersonEntity,
        person_info: Dict,
        entity_array: object,
) -> None:
    """Populate the attributes of the given
    :ref:`soweego.importer.models.imdb_entity.ImdbPersonEntity`
    instance from the *person_info* dictionary, then append it to
    *entity_array*.

    :param person_entity: the entity which we want to populate
    :param person_info: the data we want to populate the entity with
    :param entity_array: an external array to which we'll add the
        entity once it is populated.
    """
    primary_professions = person_info.get('primaryProfession')

    person_entity.catalog_id = person_info.get('nconst')
    person_entity.name = person_info.get('primaryName')
    person_entity.name_tokens = ' '.join(
        text_utils.tokenize(person_entity.name))

    # `primaryProfession` is a comma separated string: the presence of
    # `actor` or `actress` lets us infer the gender
    if 'actor' in primary_professions or 'actress' in primary_professions:
        person_entity.gender = ('male'
                                if 'actor' in primary_professions
                                else 'female')

    # IMDb only provides birth and death *years*, so month and day
    # default to 1. The base `ImdbPersonEntity` declares a date
    # precision of 9, which (per `vocab.DATE_PRECISION`) means only
    # the year is meaningful.
    for source_key, target_attr in (('birthYear', 'born'),
                                    ('deathYear', 'died')):
        year = person_info.get(source_key)
        if year:
            # datetime.date(year, month, day)
            setattr(person_entity, target_attr,
                    datetime.date(int(year), 1, 1))

    # The array of primary professions gets translated to a list of
    # the QIDs that represent said professions in Wikidata
    if primary_professions:
        # get QIDs of occupations for person
        occupation_qids = self._translate_professions(
            primary_professions.split(','))
        # only save those occupations which are not the main
        # occupation of the entity type (ie, for ActorEntity don't
        # include 'actor' occupation since it is implicit)
        person_entity.occupations = ' '.join(
            qid for qid in occupation_qids
            if qid != person_entity.table_occupation)

    entity_array.append(person_entity)