def _get_urls_for_entity_id(dump_path: str, l_path: str, resolve: bool) -> dict:
    """Given a l_{something}_url relationship file, return a dict of
    ``{entity_id: [url, ...]}``.

    :param dump_path: path of the MusicBrainz dump root directory
    :param l_path: path of the ``l_{entity}_url`` relationship TSV file
    :param resolve: when True, also drop URLs that do not resolve
    :return: dict mapping each entity ID to its list of clean, valid URLs
    """
    # NOTE: lazy %-style args, no f-string prefix (the original mixed both)
    LOGGER.info('Loading %s relationships', l_path)

    # First pass: map URL ID (column 3) -> entity ID (column 2)
    urlid_entityid_relationship = {}
    with open(l_path, "r") as tsvfile:
        url_relationships = DictReader(
            tsvfile, delimiter='\t', fieldnames=list(range(6)))
        for relationship in tqdm(
                url_relationships, total=count_num_lines_in_file(tsvfile)):
            # url id matched with its user id
            if relationship[3] in urlid_entityid_relationship:
                LOGGER.warning(
                    'Url with ID %s has multiple entities, only one will '
                    'be stored',
                    relationship[3],
                )
            else:
                urlid_entityid_relationship[relationship[3]] = relationship[2]

    url_path = os.path.join(dump_path, 'mbdump', 'url')
    url_entityid = {}
    LOGGER.info('Checking URLs related to entity')
    # Second pass: translate URL IDs to the actual URL strings, keeping
    # only clean, valid (and optionally resolvable) candidates
    with open(url_path, "r") as tsvfile:
        urls = DictReader(tsvfile, delimiter='\t', fieldnames=list(range(5)))
        for url_record in tqdm(urls, total=count_num_lines_in_file(tsvfile)):
            urlid = url_record[0]
            if urlid in urlid_entityid_relationship:
                for candidate_url in url_utils.clean(url_record[2]):
                    if not url_utils.validate(candidate_url):
                        continue
                    if resolve and not url_utils.resolve(candidate_url):
                        continue
                    url_entityid[candidate_url] = \
                        urlid_entityid_relationship[urlid]
                # Each URL ID is consumed exactly once
                del urlid_entityid_relationship[urlid]

    # Inverts dictionary: {url: entity_id} -> {entity_id: [urls]}
    entityid_url = defaultdict(list)
    for url, entityid in url_entityid.items():
        entityid_url[entityid].append(url)
    return entityid_url
def _isni_link_generator(self, dump_path):
    """Yield person and/or band link entities for artists with an ISNI.

    Reads the ``artist_isni`` dump file, builds an ISNI-based URL per
    artist via the Wikidata P213 URL formatter, then walks the ``artist``
    dump and yields a filled link entity for each matching artist.

    :param dump_path: path of the MusicBrainz dump root directory
    """
    isni_file_path = os.path.join(dump_path, 'mbdump', 'artist_isni')
    # artist id -> ISNI-derived URL (last valid candidate wins)
    artist_link = {}
    done = False
    for result in external_id_pids_and_urls_query():
        if done:
            # P213 already processed: stop consuming query results
            break
        for pid, formatter in result.items():
            if pid == 'P213':  # P213 is the Wikidata ISNI property
                for url_formatter, regex in formatter.items():
                    # NOTE(review): compiled regex is never used below, so
                    # the ISNI value is not actually validated against it
                    # -- confirm whether a match check was intended
                    r = re.compile(regex)
                    with open(isni_file_path, 'r') as artistfile:
                        for artistid_isni in DictReader(
                                artistfile,
                                delimiter='\t',
                                fieldnames=['id', 'isni']):
                            # If ISNI is valid, generates an url for the artist
                            artistid = artistid_isni['id']
                            isni = artistid_isni['isni']
                            link = url_formatter.replace('$1', isni)
                            for candidate_url in url_utils.clean(link):
                                if not url_utils.validate(candidate_url):
                                    continue
                                if not url_utils.resolve(candidate_url):
                                    continue
                                artist_link[artistid] = candidate_url
                    done = True
    artist_path = os.path.join(dump_path, 'mbdump', 'artist')
    with open(artist_path, 'r') as artistfile:
        for artist in DictReader(
                artistfile,
                delimiter='\t',
                fieldnames=['id', 'gid', 'label', 'sort_label',
                            'b_year', 'b_month', 'b_day',
                            'd_year', 'd_month', 'd_day',
                            'type_id']):
            try:
                # Checks if artist has isni; KeyError means it has none
                link = artist_link[artist['id']]
                if self._check_person(artist['type_id']):
                    current_entity = MusicbrainzArtistLinkEntity()
                    self._fill_link_entity(
                        current_entity, artist['gid'], link)
                    yield current_entity
                if self._check_band(artist['type_id']):
                    current_entity = MusicbrainzBandLinkEntity()
                    self._fill_link_entity(
                        current_entity, artist['gid'], link)
                    yield current_entity
            except KeyError:
                continue
def _check_link(self, link, resolve: bool):
    """Clean *link* and yield each surviving URL.

    Parts that fail validation bump ``self.dead_links``. When *resolve*
    is False, every valid URL is yielded as-is; when True, URLs that do
    not resolve also count as dead, and only living URLs are yielded
    (each one incrementing ``self.valid_links``).

    :param link: raw link string to clean and check
    :param resolve: whether to additionally require the URL to resolve
    """
    LOGGER.debug('Processing link <%s>', link)
    candidates = url_utils.clean(link)
    LOGGER.debug('Clean link: %s', candidates)
    for candidate in candidates:
        checked = url_utils.validate(candidate)
        if checked:
            LOGGER.debug('Valid URL: <%s>', checked)
            if resolve:
                living = url_utils.resolve(checked)
                if living:
                    LOGGER.debug('Living URL: <%s>', living)
                    self.valid_links += 1
                    yield living
                else:
                    self.dead_links += 1
            else:
                yield checked
        else:
            self.dead_links += 1
def _link_generator(self, dump_path):
    """Yield person and/or band link entities for every artist URL.

    Joins ``l_artist_url`` (URL ID -> artist ID), ``url`` (URL ID -> URL)
    and ``artist``, yielding one filled link entity per clean, valid,
    resolvable URL of each artist.

    :param dump_path: path of the MusicBrainz dump root directory
    """
    l_artist_url_path = os.path.join(dump_path, 'mbdump', 'l_artist_url')

    # Loads all the relationships between URL ID and ARTIST ID
    urlid_artistid_relationship = {}
    with open(l_artist_url_path, "r") as tsvfile:
        url_relationships = DictReader(
            tsvfile, delimiter='\t', fieldnames=list(range(6)))
        for relationship in url_relationships:
            # url id matched with its user id
            if relationship[3] in urlid_artistid_relationship:
                # Lazy %-args: the message is only formatted if emitted
                LOGGER.warning(
                    'Url with ID %s has multiple artists, only one will be stored',
                    relationship[3])
            else:
                urlid_artistid_relationship[relationship[3]] = relationship[2]

    url_artistid = {}
    url_path = os.path.join(dump_path, 'mbdump', 'url')
    # Translates URL IDs to the relative URL
    with open(url_path, "r") as tsvfile:
        urls = DictReader(tsvfile, delimiter='\t', fieldnames=list(range(5)))
        for url_record in urls:
            urlid = url_record[0]
            if urlid in urlid_artistid_relationship:
                for candidate_url in url_utils.clean(url_record[2]):
                    if not url_utils.validate(candidate_url):
                        continue
                    if not url_utils.resolve(candidate_url):
                        continue
                    url_artistid[candidate_url] = \
                        urlid_artistid_relationship[urlid]
                # Each URL ID is consumed exactly once
                del urlid_artistid_relationship[urlid]
    # Release the no-longer-needed mapping before the next big build
    urlid_artistid_relationship = None

    artistid_url = defaultdict(list)
    # Inverts dictionary: {url: artist_id} -> {artist_id: [urls]}
    for url, artistid in url_artistid.items():
        artistid_url[artistid].append(url)
    url_artistid = None

    # Translates ARTIST ID to the relative ARTIST
    artist_path = os.path.join(dump_path, 'mbdump', 'artist')
    with open(artist_path, 'r') as artistfile:
        for artist in DictReader(
                artistfile,
                delimiter='\t',
                fieldnames=['id', 'gid', 'label', 'sort_label',
                            'b_year', 'b_month', 'b_day',
                            'd_year', 'd_month', 'd_day',
                            'type_id']):
            if artist['id'] in artistid_url:
                for link in artistid_url[artist['id']]:
                    if self._check_person(artist['type_id']):
                        current_entity = MusicbrainzArtistLinkEntity()
                        self._fill_link_entity(
                            current_entity, artist['gid'], link)
                        yield current_entity
                    if self._check_band(artist['type_id']):
                        current_entity = MusicbrainzBandLinkEntity()
                        self._fill_link_entity(
                            current_entity, artist['gid'], link)
                        yield current_entity