# Standard-library and third-party imports needed by this excerpt.
# Project-internal names (LOGGER, count_num_lines_in_file, url_utils, keys,
# data_gathering, external_id_pids_and_urls, the Musicbrainz*Entity classes,
# BaseEntity, BaseLinkEntity, _birth_death_date_match) are assumed to be
# provided by the enclosing soweego modules.
import json
import os
from collections import defaultdict
from csv import DictReader
from typing import Callable, Iterable, Set, TextIO, Tuple, Union

from sqlalchemy.exc import SQLAlchemyError
from tqdm import tqdm


def _release_group_artist_relationship_generator(dump_path):
    release_group_path = os.path.join(dump_path, 'mbdump', 'release_group')

    # Map each artist credit to the release group GIDs it is credited on
    artist_credit_release = defaultdict(list)
    with open(release_group_path, 'r') as releasefile:
        n_rows = count_num_lines_in_file(releasefile)
        release_reader = DictReader(
            releasefile,
            delimiter='\t',
            fieldnames=['id', 'gid', 'label', 'artist_credit', 'type_id'],
        )
        for row in tqdm(release_reader, total=n_rows):
            artist_credit_release[row['artist_credit']].append(row['gid'])

    artist_credit_name_path = os.path.join(
        dump_path, 'mbdump', 'artist_credit_name'
    )

    # Map each artist ID to the release group GIDs of its credits
    artist_id_release = defaultdict(list)
    with open(artist_credit_name_path) as artistcreditfile:
        n_rows = count_num_lines_in_file(artistcreditfile)
        artist_credit_reader = DictReader(
            artistcreditfile,
            delimiter='\t',
            fieldnames=['id', 'nd', 'artist_id', 'artist_name'],
        )
        for row in tqdm(artist_credit_reader, total=n_rows):
            artist_id_release[row['artist_id']] = artist_credit_release[
                row['id']
            ]
            # Free the consumed entry to save memory
            del artist_credit_release[row['id']]

    artist_path = os.path.join(dump_path, 'mbdump', 'artist')
    with open(artist_path, 'r') as artistfile:
        n_rows = count_num_lines_in_file(artistfile)
        artist_link_reader = DictReader(
            artistfile,
            delimiter='\t',
            fieldnames=[
                'id', 'gid', 'label', 'sort_label',
                'b_year', 'b_month', 'b_day',
                'd_year', 'd_month', 'd_day',
                'type_id',
            ],
        )
        for artist in tqdm(artist_link_reader, total=n_rows):
            for release_id in artist_id_release[artist['id']]:
                yield release_id, artist['gid']
            # Free the consumed entry to save memory
            del artist_id_release[artist['id']]
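
# A toy illustration of the three-way join performed above, with made-up
# IDs rather than real MusicBrainz rows: a `release_group` row points to an
# artist credit, an `artist_credit_name` row links that credit to an artist
# ID, and the `artist` table finally supplies the artist GID.
def _relationship_join_example():
    artist_credit_release = {'42': ['rg-gid-1', 'rg-gid-2']}
    artist_id_release = {'3': artist_credit_release['42']}
    artist = {'id': '3', 'gid': 'artist-gid'}
    pairs = [
        (release_id, artist['gid'])
        for release_id in artist_id_release[artist['id']]
    ]
    assert pairs == [('rg-gid-1', 'artist-gid'), ('rg-gid-2', 'artist-gid')]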
def _get_urls_for_entity_id(
    self, dump_path: str, l_path: str, resolve: bool
) -> dict:
    """Given an `l_<entity>_url` relationship file, return a dictionary
    mapping each entity ID to its list of URLs."""
    LOGGER.info('Loading %s relationships', l_path)

    urlid_entityid_relationship = {}
    with open(l_path, 'r') as tsvfile:
        url_relationships = DictReader(
            tsvfile, delimiter='\t', fieldnames=list(range(6))
        )
        for relationship in tqdm(
            url_relationships, total=count_num_lines_in_file(tsvfile)
        ):
            # Match the URL ID with its entity ID
            if relationship[3] in urlid_entityid_relationship:
                LOGGER.warning(
                    'URL with ID %s has multiple entities, '
                    'only one will be stored',
                    relationship[3],
                )
            else:
                urlid_entityid_relationship[relationship[3]] = relationship[2]

    url_path = os.path.join(dump_path, 'mbdump', 'url')
    url_entityid = {}
    LOGGER.info('Checking URLs related to entity')
    # Translate URL IDs into the corresponding URLs
    with open(url_path, 'r') as tsvfile:
        urls = DictReader(tsvfile, delimiter='\t', fieldnames=list(range(5)))
        for url_record in tqdm(urls, total=count_num_lines_in_file(tsvfile)):
            urlid = url_record[0]
            if urlid in urlid_entityid_relationship:
                for candidate_url in url_utils.clean(url_record[2]):
                    if not url_utils.validate(candidate_url):
                        continue
                    if resolve and not url_utils.resolve(candidate_url):
                        continue
                    url_entityid[candidate_url] = urlid_entityid_relationship[
                        urlid
                    ]
                del urlid_entityid_relationship[urlid]

    # Invert the dictionary: entity ID -> list of URLs
    entityid_url = defaultdict(list)
    for url, entityid in url_entityid.items():
        entityid_url[entityid].append(url)

    return entityid_url
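
# A minimal sketch of the final inversion step above, with toy values: the
# url -> entity mapping is flipped into entity -> [urls], which is the shape
# the link generators below consume.
def _invert_url_mapping_example():
    url_entityid = {'https://a.example': '1', 'https://b.example': '1'}
    entityid_url = defaultdict(list)
    for url, entityid in url_entityid.items():
        entityid_url[entityid].append(url)
    assert entityid_url == {'1': ['https://a.example', 'https://b.example']}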
def _similar_tokens_linker(
    wd_dataset: TextIO,
    target_db_entity: Union[BaseEntity, BaseLinkEntity],
    fields: Tuple[str, str],
    catalog_pid: str,
    compare_dates: bool,
    tokenize: Callable[[str], Set[str]],
) -> Iterable[Tuple[str, str, str]]:
    wd_field, target_field = fields
    to_exclude = set()

    for row in tqdm(wd_dataset, total=count_num_lines_in_file(wd_dataset)):
        wd_item = json.loads(row)
        qid = wd_item[keys.QID]

        for wd_name in wd_item[wd_field]:
            if not wd_name:
                continue

            to_exclude.clear()
            wd_tokens = tokenize(wd_name)
            # Skip names that tokenize to a single token
            if len(wd_tokens) <= 1:
                continue

            try:
                # Check if a target token set is equal to, or larger than,
                # the Wikidata one
                for target in data_gathering.tokens_fulltext_search(
                    target_db_entity, True, wd_tokens
                ):
                    if not compare_dates or _birth_death_date_match(
                        wd_item, target
                    ):
                        yield qid, catalog_pid, target.catalog_id
                    to_exclude.add(target.catalog_id)

                # Check if a target token set is smaller than the Wikidata one
                where_clause = target_db_entity.catalog_id.notin_(to_exclude)
                for target in data_gathering.tokens_fulltext_search(
                    target_db_entity,
                    False,
                    wd_tokens,
                    where_clause=where_clause,
                ):
                    target_tokens = set(getattr(target, target_field).split())
                    if len(target_tokens) > 1 and target_tokens.issubset(
                        wd_tokens
                    ):
                        if not compare_dates or _birth_death_date_match(
                            wd_item, target
                        ):
                            yield qid, catalog_pid, target.catalog_id

            except SQLAlchemyError as error:
                LOGGER.warning(
                    'Skipping failed full-text search query due to %s. '
                    'You can enable the debug log with the CLI option '
                    "'-l soweego.linker.baseline DEBUG' for more details",
                    error.__class__.__name__,
                )
                LOGGER.debug(error)
                continue
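
# A minimal sketch of the subset rule applied in the second search above,
# with hypothetical names and a trivial whitespace tokenizer standing in
# for the real `tokenize` callable:
def _token_subset_example():
    def tokenize(name: str) -> set:
        return set(name.lower().split())

    wd_tokens = tokenize('John Ronald Reuel Tolkien')
    target_tokens = tokenize('John Tolkien')
    # The target token set is smaller than the Wikidata one, has more than
    # one token, and is fully contained in it, so it would count as a match
    assert len(target_tokens) > 1 and target_tokens.issubset(wd_tokens)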
def _artist_link_generator(self, dump_path: str, resolve: bool):
    l_artist_url_path = os.path.join(dump_path, 'mbdump', 'l_artist_url')

    # Load all the relationships between URLs and artist IDs
    artistid_url = self._get_urls_for_entity_id(
        dump_path, l_artist_url_path, resolve
    )

    LOGGER.info('Adding link entities to DB')

    # Translate artist IDs into the corresponding artists
    artist_path = os.path.join(dump_path, 'mbdump', 'artist')
    with open(artist_path, 'r') as artistfile:
        n_rows = count_num_lines_in_file(artistfile)
        artist_link_reader = DictReader(
            artistfile,
            delimiter='\t',
            fieldnames=[
                'id', 'gid', 'label', 'sort_label',
                'b_year', 'b_month', 'b_day',
                'd_year', 'd_month', 'd_day',
                'type_id',
            ],
        )
        for artist in tqdm(artist_link_reader, total=n_rows):
            if artist['id'] in artistid_url:
                for link in artistid_url[artist['id']]:
                    if self._check_person(artist['type_id']):
                        current_entity = MusicbrainzArtistLinkEntity()
                        self._fill_link_entity(
                            current_entity, artist['gid'], link
                        )
                        yield current_entity
                    if self._check_band(artist['type_id']):
                        current_entity = MusicbrainzBandLinkEntity()
                        self._fill_link_entity(
                            current_entity, artist['gid'], link
                        )
                        yield current_entity
def _release_group_link_generator(self, dump_path: str, resolve: bool):
    l_release_group_url_path = os.path.join(
        dump_path, 'mbdump', 'l_release_group_url'
    )
    release_group_id_urls = self._get_urls_for_entity_id(
        dump_path, l_release_group_url_path, resolve
    )

    release_group_path = os.path.join(dump_path, 'mbdump', 'release_group')
    with open(release_group_path) as rfile:
        n_rows = count_num_lines_in_file(rfile)
        releases = DictReader(
            rfile, delimiter='\t', fieldnames=['id', 'gid', 'label']
        )
        for release in tqdm(releases, total=n_rows):
            if release['id'] in release_group_id_urls:
                for link in release_group_id_urls[release['id']]:
                    entity = MusicbrainzReleaseGroupLinkEntity()
                    self._fill_link_entity(entity, release['gid'], link)
                    yield entity
def _release_group_generator(self, dump_path):
    release_group_datesprec = self._retrieve_release_group_dates(dump_path)

    release_group_path = os.path.join(dump_path, 'mbdump', 'release_group')
    with open(release_group_path, 'r') as releasefile:
        n_rows = count_num_lines_in_file(releasefile)
        release_reader = DictReader(
            releasefile,
            delimiter='\t',
            fieldnames=['id', 'gid', 'label', 'artist_credit', 'type_id'],
        )
        for row in tqdm(release_reader, total=n_rows):
            entity = MusicbrainzReleaseGroupEntity()
            self._fill_entity(entity, row, None)

            if row['id'] in release_group_datesprec:
                date, precision = release_group_datesprec[row['id']]
                # A precision of 0 means no usable date was found
                if precision != 0:
                    entity.born_precision = precision
                    entity.born = date

            yield entity
def _perfect_names_linker(
    wd_dataset: TextIO,
    target_db_entity: BaseEntity,
    catalog_pid: str,
    compare_dates: bool,
) -> Iterable[Tuple[str, str, str]]:
    bucket, bucket_names, bucket_size = [], set(), 100
    total = count_num_lines_in_file(wd_dataset)
    missing = total

    for row in tqdm(wd_dataset, total=total):
        wd_item = json.loads(row)
        bucket_names.update(wd_item[keys.NAME])
        bucket.append(wd_item)

        # Build a bucket of `bucket_size` Wikidata items.
        # The second condition flushes the trailing partial bucket
        if len(bucket) >= bucket_size or missing < bucket_size:
            missing -= len(bucket)

            # Look the names up in the target database
            for target in data_gathering.perfect_name_search_bucket(
                target_db_entity, bucket_names
            ):
                # Run an n^2 comparison and yield matches
                for wd in bucket:
                    # Wikidata items hold lists of names
                    for wd_name in wd[keys.NAME]:
                        if not wd_name:
                            continue
                        if wd_name.lower() == target.name.lower():
                            if not compare_dates or _birth_death_date_match(
                                wd, target
                            ):
                                yield (
                                    wd[keys.QID],
                                    catalog_pid,
                                    target.catalog_id,
                                )

            bucket.clear()
            bucket_names.clear()
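
# A sketch of the bucketing behaviour above: with `bucket_size` 100 and a
# 250-line dataset, buckets are flushed at rows 100 and 200, and each of the
# trailing 50 rows satisfies `missing < bucket_size`, so it is flushed as it
# arrives. Toy check with a bucket size of 2 over 5 items:
def _bucket_flush_example():
    flushes, bucket, missing, bucket_size = [], [], 5, 2
    for item in range(5):
        bucket.append(item)
        if len(bucket) >= bucket_size or missing < bucket_size:
            missing -= len(bucket)
            flushes.append(list(bucket))
            bucket.clear()
    assert flushes == [[0, 1], [2, 3], [4]]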
def _artist_generator(self, dump_path):
    artist_alias_path = os.path.join(dump_path, 'mbdump', 'artist_alias')
    artist_path = os.path.join(dump_path, 'mbdump', 'artist')
    area_path = os.path.join(dump_path, 'mbdump', 'area')

    aliases = defaultdict(list)
    areas = {}

    LOGGER.info('Getting artist aliases')
    # Key is the entity ID, value is its list of aliases
    with open(artist_alias_path, 'r') as aliasesfile:
        for alias in DictReader(
            aliasesfile,
            delimiter='\t',
            fieldnames=['id', 'parent_id', 'label'],
        ):
            aliases[alias['parent_id']].append(alias['label'])

    LOGGER.info('Getting area IDs and related names')
    # Key is the internal area ID, value is its name
    with open(area_path, 'r') as areafile:
        for area in DictReader(
            areafile, delimiter='\t', fieldnames=['id', 'gid', 'name']
        ):
            areas[area['id']] = area['name'].lower()

    LOGGER.info('Importing artist entities into DB')
    with open(artist_path, 'r') as artistfile:
        n_rows = count_num_lines_in_file(artistfile)
        artist_reader = DictReader(
            artistfile,
            delimiter='\t',
            fieldnames=[
                'id', 'gid', 'label', 'sort_label',
                'b_year', 'b_month', 'b_day',
                'd_year', 'd_month', 'd_day',
                'type_id', 'area', 'gender',
                'ND1', 'ND2', 'ND3', 'ND4',
                'b_place', 'd_place',
            ],
        )
        for artist in tqdm(artist_reader, total=n_rows):
            if self._check_person(artist['type_id']):
                current_entity = MusicbrainzArtistEntity()
                try:
                    self._fill_entity(current_entity, artist, areas)
                    current_entity.gender = self._artist_gender(
                        artist['gender']
                    )
                except KeyError:
                    LOGGER.error('Wrong gender code: %s', artist)
                    continue

                # Create an entity for each available alias
                for alias in self._alias_entities(
                    current_entity,
                    MusicbrainzArtistEntity,
                    aliases[artist['id']],
                ):
                    alias.gender = current_entity.gender
                    yield alias

                yield current_entity

            if self._check_band(artist['type_id']):
                current_entity = MusicbrainzBandEntity()
                try:
                    self._fill_entity(current_entity, artist, areas)
                except ValueError:
                    LOGGER.error('Wrong date: %s', artist)
                    continue

                # Create an entity for each available alias
                for alias in self._alias_entities(
                    current_entity,
                    MusicbrainzBandEntity,
                    aliases[artist['id']],
                ):
                    yield alias

                yield current_entity
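
# Note on the alias expansion above: an artist row with N aliases yields
# N + 1 entities, one per alias plus the main one. For instance, a
# hypothetical person with aliases 'Alias A' and 'Alias B' produces two
# MusicbrainzArtistEntity alias records (each inheriting the person's
# gender) followed by the main MusicbrainzArtistEntity record.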
def _isni_link_generator(self, dump_path: str, resolve: bool):
    isni_file_path = os.path.join(dump_path, 'mbdump', 'artist_isni')

    artist_link = {}
    done = False
    for result in external_id_pids_and_urls():
        if done:
            break
        for pid, formatter in result.items():
            # P213 is the Wikidata property for ISNI
            if pid != 'P213':
                continue
            for url_formatter, _ in formatter.items():
                with open(isni_file_path, 'r') as artistfile:
                    for artistid_isni in DictReader(
                        artistfile,
                        delimiter='\t',
                        fieldnames=['id', 'isni'],
                    ):
                        # If the ISNI is valid, generate a URL out of it
                        artistid = artistid_isni['id']
                        isni = artistid_isni['isni']
                        link = url_formatter.replace('$1', isni)
                        for candidate_url in url_utils.clean(link):
                            if not url_utils.validate(candidate_url):
                                continue
                            if resolve and not url_utils.resolve(
                                candidate_url
                            ):
                                continue
                            artist_link[artistid] = candidate_url
                done = True

    artist_path = os.path.join(dump_path, 'mbdump', 'artist')
    with open(artist_path, 'r') as artistfile:
        n_rows = count_num_lines_in_file(artistfile)
        artist_isni_reader = DictReader(
            artistfile,
            delimiter='\t',
            fieldnames=[
                'id', 'gid', 'label', 'sort_label',
                'b_year', 'b_month', 'b_day',
                'd_year', 'd_month', 'd_day',
                'type_id',
            ],
        )
        for artist in tqdm(artist_isni_reader, total=n_rows):
            try:
                # Check whether the artist has an ISNI
                link = artist_link[artist['id']]
            except KeyError:
                continue

            if self._check_person(artist['type_id']):
                current_entity = MusicbrainzArtistLinkEntity()
                self._fill_link_entity(current_entity, artist['gid'], link)
                yield current_entity

            if self._check_band(artist['type_id']):
                current_entity = MusicbrainzBandLinkEntity()
                self._fill_link_entity(current_entity, artist['gid'], link)
                yield current_entity
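
# A minimal sketch of the URL formatting above, with an illustrative
# formatter and a dummy identifier: Wikidata formatter URLs use '$1' as the
# placeholder for the external ID. The actual formatter comes from
# external_id_pids_and_urls at runtime; the one below is only an assumption
# for illustration.
def _isni_formatter_example():
    url_formatter = 'https://isni.org/isni/$1'
    isni = '0000000000000000'
    assert url_formatter.replace('$1', isni) == (
        'https://isni.org/isni/0000000000000000'
    )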