def tracks_lyrics(**kwargs):
    """Run the ``track_lyrics`` feature over track-name / artist-name pairs.

    The track names are left-joined with the track/album/artist id table and
    then with the artist names, so each row of the track dataframe is kept
    even when no artist can be attached to it.

    Arguments:
        **kwargs -- forwarded unchanged to ``array_feature``
    """
    tracks = read_feature_dataframe('track_name')
    id_links = data.track_album_artists_id()
    artists = read_feature_dataframe('artist_name')

    # Both joins rely on the columns the dataframes have in common.
    tracks_with_ids = tracks.merge(id_links, how='left')
    joined = tracks_with_ids.merge(artists, how='left')

    array_feature(
        track_lyrics,
        argument_values=[joined.track_name.values, joined.artist_name.values],
        **kwargs)
def release_groups_id(**kwargs):
    """Run ``release_group_musicbrainz_id`` over album-name / artist-name pairs.

    Arguments:
        **kwargs -- forwarded unchanged to ``array_feature``
    """
    artists = rdf('artist_name')
    albums = rdf('album_name')

    # The track/album/artist table is track-indexed: for every track it gives
    # the album and the author. An album whose tracks have different authors
    # (e.g. a compilation) therefore shows up with several artists. Only one
    # artist per album is wanted here, so the first (alid, arid) pair found
    # for each album is kept.
    album_artist = data.track_album_artists_id()
    album_artist = album_artist[['alid', 'arid']].drop_duplicates()
    album_artist = album_artist.groupby('alid').head(1)

    joined = albums.merge(album_artist, how='left', on='alid')
    joined = joined.merge(artists, how='left', on='arid')
    # The joins must not duplicate or drop albums.
    assert len(joined) == len(albums)

    array_feature(
        release_group_musicbrainz_id,
        argument_values=[joined.album_name.values, joined.artist_name.values],
        **kwargs)
Arguments: artist_musicbrainz_id {str} -- Returns: list -- Record lables ids """ releases = mz.browse_releases(artist=artist_musicbrainz_id['value'], includes=['labels']) labels = set() for r in releases['release-list']: label_list = r['label-info-list'] for l in label_list: try: if l['label']['name'] != '[no label]': labels.add(l['label']['id']) except KeyError: continue labels = list(labels) if len(labels) > 0: return [{'value': l} for l in labels] else: logging.getLogger('root.features').warning( f"I was not able to find any label for which artist {artist_musicbrainz_id['value']} has recorded" ) return None if __name__ == "__main__": array_feature(artist_recorded_label, mp=True)
is_chorus = False phrase_chorus_repetition = {} for phrase in track_lyrics['value'].split('\n'): if re.search('\[.*.*\]', phrase): is_chorus = True if re.search(r'\[(.*Chorus.*)|(.*Hook.*)\]', phrase, re.I) else False continue if is_chorus: phrase_chorus_repetition[phrase] = phrase_chorus_repetition[phrase] + \ 1 if phrase in phrase_chorus_repetition else 1 # if no chorus, return None if len(phrase_chorus_repetition.values()) == 0: return None highest_frequency = max(phrase_chorus_repetition.values()) candidate_phrases = [ key for key in phrase_chorus_repetition.keys() if phrase_chorus_repetition[key] == highest_frequency ] assert len( candidate_phrases ) >= 1, "At this point, we should have at least one candidate phrase" candidate_phrases = sorted(candidate_phrases, key=len, reverse=True) return {'value': candidate_phrases[0]} if __name__ == '__main__': array_feature(track_chorus, mp=False)
str -- Year, formatted as a string with 4 digits. Eg. 1994 """ if artist_wikidata_id is not None and artist_type['value'] == 'Person': query = "select ?y where {" + \ artist_wikidata_id['value'] + " wdt:P2032 ?y .}" results = query_sparql(query) if len(results) > 0: if len(results) == 1: date = results[0]['y']['value'] # Check if satisfies the pattern if re.match(r"^\d{4}-\d{2}-\d{2}T00:00:00Z$", date): year = date.split('-')[0] return {'value': year} else: logging.getLogger('root.features').warning( f"Date {date} does not match pattern") else: logging.getLogger('root.features').warning( f"Found more than one value for work period (end) for entity {artist_wikidata_id['value']}, skipping" ) else: logging.getLogger('root.features').warning( f"No attribute work period (end) associated with entity {artist_wikidata_id['value']}" ) if __name__ == "__main__": array_feature(artist_solo_end_activity_year, mp=False)
import musicbrainzngs as mz


@musicbrainz_feature
@cached_feature
@timing_feature
def artist_self_releasing_records(
        artist_musicbrainz_id) -> 'artist_self_releasing_records':
    """Tell whether the artist has self-released records (i.e. without label)

    Arguments:
        artist_musicbrainz_id {dict} -- feature value; the MusicBrainz id is
            read from its 'value' key

    Returns:
        dict -- {'value': True} as soon as one release lists the '[no label]'
            placeholder; None otherwise (False is never returned)
    """
    response = mz.browse_releases(artist=artist_musicbrainz_id['value'],
                                  includes=['labels'])
    for release in response['release-list']:
        for label_info in release['label-info-list']:
            try:
                label_name = label_info['label']['name']
            except KeyError:
                # Label entry without a label or a name: nothing to check.
                continue
            if label_name == '[no label]':
                return {'value': True}
    return None


if __name__ == "__main__":
    array_feature(artist_self_releasing_records, mp=True)
elif 'American Music Awards' in ceremony_label: d['award_series'] = 'American Music Award' elif 'World Music Awards' in ceremony_label: d['award_series'] = 'World Music Award' elif 'Tony Award' in ceremony_label: d['award_series'] = 'Tony Award' elif 'Golden Raspberry Awards' in ceremony_label: d['award_series'] = 'Golden Raspberry Award' elif 'BRIT Awards' in ceremony_label or 'Brit Awards' in ceremony_label: d['award_series'] = 'Brit Award' elif 'BET' in ceremony_label: d['award_series'] = 'BET Award' elif "People's Choice Awards" in ceremony_label: d['award_series'] = "People's Choice Award" elif 'Academy Award' in ceremony_label: d['awards_series'] = "Oscar" else: logging.getLogger('root.features').warning( f"Not able to associate any award series to the award {ceremony_label}" ) except KeyError: pass awards.append(d) return [{'value': a} for a in awards] if __name__ == "__main__": array_feature(artist_awards, mp=False)
f"Artist {artist_musicbrainz_id['value']} do not have a type, skipping") return None if artist['artist']['type'] == 'Person': try: birth_date = artist['artist']['life-span']['begin'] except KeyError: logging.getLogger('root.features').warning( f"Artist {artist_musicbrainz_id['value']} do not have known a birth date") return None try: datetime.datetime.strptime(birth_date, '%Y-%m-%d') except ValueError: logging.getLogger('root.features').warning( f"Incorrect artist_birth_date format for {artist_musicbrainz_id['value']}, should be YYYY-MM-DD, but got {birth_date}") return None try: birth_date_pandas = pd.to_datetime(birth_date) except pd.errors.OutOfBoundsDatetime: logging.getLogger('root.features').warning( f"Invalid artist_birth_date for {artist_musicbrainz_id['value']}: {birth_date}. Skipping") return None return {'value': birth_date_pandas} if __name__ == "__main__": array_feature(artist_birth_date, mp=True)
try: date = release_group['first-release-date'] except KeyError: logging.getLogger('root.features').warning( f"Release-group {release_group_musicbrainz_id['value']} has not first-release-date attribute" ) return None try: datetime.datetime.strptime(date, '%Y-%m-%d') except ValueError: logging.getLogger('root.features').warning( f"Incorrect first release date format for {release_group_musicbrainz_id['value']}, should be YYYY-MM-DD, but got {date}" ) return None try: date_pandas = pd.to_datetime(date) except pd.errors.OutOfBoundsDatetime: logging.getLogger('root.features').warning( f"Invalid artist_date for {release_group_musicbrainz_id['value']}: {date}. Skipping" ) return None return {'value': date_pandas} if __name__ == "__main__": array_feature(album_release_date, mp=False)
""" artist = musicbrainzngs.get_artist_by_id(artist_musicbrainz_id['value'], includes=['tags'])['artist'] genres = [] try: tags = artist['tag-list'] except: logging.getLogger('root.features').warning( f"No tag list associated with artist {artist_musicbrainz_id['value']}" ) return None for tag in tags: try: musicbrainz_genre_id = genres_musicbrainz(tag['name']) genres.append(musicbrainz_genre_id) except KeyError: continue if len(genres) > 0: return [{'value': g} for g in genres] else: logging.getLogger('root.features').warning( f"No genres associated with artist {artist_musicbrainz_id['value']}" ) if __name__ == "__main__": array_feature(artist_genres, mp=True)
import musicbrainzngs
import logging


@musicbrainz_feature
@cached_feature
@timing_feature
def artist_birth_place_area(artist_musicbrainz_id) -> 'area_musicbrainz_id':
    """Extract the area where the artist was born

    Arguments:
        artist_musicbrainz_id {dict} -- feature value; the MusicBrainz id is
            read from its 'value' key. May be None.

    Returns:
        dict -- {'value': <id of the 'begin-area' in MusicBrainz>}, or None
            when the input is None or the artist has no 'begin-area'
    """
    if artist_musicbrainz_id is None:
        return None

    response = musicbrainzngs.get_artist_by_id(artist_musicbrainz_id['value'])
    try:
        area_id = response['artist']['begin-area']['id']
    except KeyError:
        logging.getLogger('root.features').warning(
            f"Artist {artist_musicbrainz_id} has not begin-area attribute")
        return None
    return {'value': area_id}


if __name__ == "__main__":
    array_feature(artist_birth_place_area, mp=True)
import musicbrainzngs
import logging


@musicbrainz_feature
@timing_feature
def record_label_area(record_label_musicbrainz_id) -> 'area_musicbrainz_id':
    """Extract the area the record label is based in

    Arguments:
        record_label_musicbrainz_id {dict} -- feature value; the MusicBrainz
            id is read from its 'value' key. May be None.

    Returns:
        dict -- {'value': <id of the label's 'area' in MusicBrainz>}, or None
            when the input is None or the label has no 'area'
    """
    if record_label_musicbrainz_id is None:
        return None

    response = musicbrainzngs.get_label_by_id(
        record_label_musicbrainz_id['value'])
    try:
        area_id = response['label']['area']['id']
    except KeyError:
        logging.getLogger('root.features').warning(
            f"Record label {record_label_musicbrainz_id} has not area attribute"
        )
        return None
    return {'value': area_id}


if __name__ == "__main__":
    array_feature(record_label_area, mp=False)
@cached_feature
@timing_feature
def artist_type(artist_musicbrainz_id) -> 'artist_type':
    """Report the MusicBrainz type of the artist.

    An artist can be a Person, Group, Choir, Orchestra or Character (a
    fictional character). The type 'Other' is discarded.

    Arguments:
        artist_musicbrainz_id {dict} -- feature value; the MusicBrainz id is
            read from its 'value' key. May be None.

    Returns:
        dict -- {'value': <artist type>}, or None when the input is None, the
            artist has no type, or the type is 'Other'
    """
    if artist_musicbrainz_id is None:
        return None

    response = musicbrainzngs.get_artist_by_id(artist_musicbrainz_id['value'])
    try:
        kind = response['artist']['type']
    except KeyError:
        logging.getLogger('root.features').warning(
            f"Artist {artist_musicbrainz_id} do not have a type, skipping")
        return None
    return None if kind == 'Other' else {'value': kind}


if __name__ == "__main__":
    array_feature(artist_type, mp=False)
Returns: str -- """ if record_label_musicbrainz_id is not None: label = musicbrainzngs.get_label_by_id( record_label_musicbrainz_id['value']) try: start_activity_year = label['label']['life-span']['end'] except KeyError: logging.getLogger('root.features').warning( f"Label {record_label_musicbrainz_id} does not have known dissolution date" ) return None if re.match(r"^\d{4}$", start_activity_year): return {'value': start_activity_year} elif re.match(r"^\d{4}-\d{2}$", start_activity_year): return {'value': start_activity_year.split('-')[0]} elif re.match(r"^\d{4}-\d{2}-\d{2}$", start_activity_year): return {'value': start_activity_year.split('-')[0]} else: logging.getLogger('root.features').warning( f"Record label dissolution year {start_activity_year} does not match pattern" ) if __name__ == "__main__": array_feature(record_label_dissolution_year, mp=False)
def artist_gender(artist_musicbrainz_id) -> 'artist_gender':
    """State whether the artist is male or female

    Arguments:
        artist_musicbrainz_id {dict} -- feature value; the MusicBrainz id is
            read from its 'value' key. May be None.

    Returns:
        dict -- {'value': 'Male'} or {'value': 'Female'}; None when the input
            is None, the artist has no gender, or the gender is any other
            value (a warning is logged in the last two cases)
    """
    if artist_musicbrainz_id is None:
        return None

    artist = musicbrainzngs.get_artist_by_id(artist_musicbrainz_id['value'])
    try:
        gender = artist['artist']['gender']
    except KeyError:
        logging.getLogger('root.features').warning(
            f"Artist {artist_musicbrainz_id} has no gender, skipping")
        return None

    if gender not in ('Male', 'Female'):
        # The message must go through .warning() on the features logger;
        # passing it to getLogger() only creates a logger with that name.
        logging.getLogger('root.features').warning(
            f"Artist {artist_musicbrainz_id} has unknown gender {gender}, skipping"
        )
        return None
    return {'value': gender}


if __name__ == "__main__":
    array_feature(artist_gender, mp=False)
if artist['artist']['type'] == 'Person': try: death_date = artist['artist']['life-span']['end'] except KeyError: logging.getLogger('root.features').warning( f"Artist {artist_musicbrainz_id['value']} do not have known a death date" ) return None try: datetime.datetime.strptime(death_date, '%Y-%m-%d') except ValueError: logging.getLogger('root.features').warning( f"Incorrect artist_death_date format for {artist_musicbrainz_id['value']}, should be YYYY-MM-DD, but got {death_date}" ) return None try: death_date_pandas = pd.to_datetime(death_date) except pd.errors.OutOfBoundsDatetime: logging.getLogger('root.features').warning( f"Invalid artist_death_date for {artist_musicbrainz_id['value']}: {death_date}. Skipping" ) return None return {'value': death_date_pandas} if __name__ == "__main__": array_feature(artist_death_date, mp=True)
This allows us to retrieve the artist page in the 89% of the cases Arguments: artist_name {str} - - Returns: uri - - uri """ artist = musicbrainzngs.get_artist_by_id( artist_musicbrainz_id['value'], includes=['url-rels'])['artist'] try: urls = artist['url-relation-list'] except KeyError: logging.getLogger('root.features').warning( f"No relations to external pages specified for {artist_musicbrainz_id['value']}") return None url_wikidata = [u for u in urls if u['type'] == 'wikidata' and u['direction'] == 'forward'] if len(url_wikidata) > 0: return {'value': f"wd:{url_wikidata[0]['target'].split('/')[-1]}"} else: logging.getLogger('root.features').warning( f"No wikidata page specified for artist {artist_musicbrainz_id['value']}") return None if __name__ == '__main__': array_feature(artist_wikidata_id, mp=True,)
str -- Year, formatted as a string with 4 digits. Eg. 1994 """ if artist_wikidata_id is not None and artist_type['value'] == 'Person': query = "select ?y where {" + \ artist_wikidata_id['value'] + " wdt:P2031 ?y .}" results = query_sparql(query) if len(results) > 0: if len(results) == 1: date = results[0]['y']['value'] # Check if satisfies the pattern if re.match(r"^\d{4}-\d{2}-\d{2}T00:00:00Z$", date): year = date.split('-')[0] return {'value': year} else: logging.getLogger('root.features').warning( f"Date {date} does not match pattern") else: logging.getLogger('root.features').warning( f"Found more than one value for work period (start) for entity {artist_wikidata_id['value']}, skipping" ) else: logging.getLogger('root.features').warning( f"No attribute work period (start) associated with entity {artist_wikidata_id['value']}" ) if __name__ == "__main__": array_feature(artist_solo_start_activity_year, mp=True)
import musicbrainzngs
import logging


@musicbrainz_feature
@cached_feature
@timing_feature
def artist_based_in_area(artist_musicbrainz_id) -> 'area_musicbrainz_id':
    """Extract the area the artist is based in

    Arguments:
        artist_musicbrainz_id {dict} -- feature value; the MusicBrainz id is
            read from its 'value' key. May be None.

    Returns:
        dict -- {'value': <id of the artist's 'area' in MusicBrainz>}, or
            None when the input is None or the artist has no 'area'
    """
    if artist_musicbrainz_id is None:
        return None

    response = musicbrainzngs.get_artist_by_id(artist_musicbrainz_id['value'])
    try:
        area_id = response['artist']['area']['id']
    except KeyError:
        logging.getLogger('root.features').warning(
            f"Artist {artist_musicbrainz_id} has not area attribute")
        return None
    return {'value': area_id}


if __name__ == "__main__":
    array_feature(artist_based_in_area, mp=True)
Returns: str -- """ if record_label_musicbrainz_id is not None: label = musicbrainzngs.get_label_by_id( record_label_musicbrainz_id['value']) try: start_activity_year = label['label']['life-span']['begin'] except KeyError: logging.getLogger('root.features').warning( f"Label {record_label_musicbrainz_id} does not have known fundation date" ) return None if re.match(r"^\d{4}$", start_activity_year): return {'value': start_activity_year} elif re.match(r"^\d{4}-\d{2}$", start_activity_year): return {'value': start_activity_year.split('-')[0]} elif re.match(r"^\d{4}-\d{2}-\d{2}$", start_activity_year): return {'value': start_activity_year.split('-')[0]} else: logging.getLogger('root.features').warning( f"Record label fundation year {start_activity_year} does not match pattern" ) if __name__ == "__main__": array_feature(record_label_fundation_year, mp=False)
def album_record_label(
        release_group_musicbrainz_id) -> 'record_label_musicbrainz_id':
    """Extract the record label associated with an album.

    The releases belonging to the release-group are scanned in the order
    MusicBrainz returns them and the first valid record label (any label
    whose name is not the '[no label]' placeholder) is returned.

    Arguments:
        release_group_musicbrainz_id {dict} -- feature value; the MusicBrainz
            id is read from its 'value' key. May be None.

    Returns:
        dict -- {'value': <record label musicbrainz id>}, or None when the
            input is None or no release carries a valid label
    """
    if release_group_musicbrainz_id is None:
        return None

    response = mz.browse_releases(
        release_group=release_group_musicbrainz_id['value'],
        includes=['labels'])
    for release in response['release-list']:
        for label_info in release['label-info-list']:
            try:
                label = label_info['label']
                if label['name'] != '[no label]':
                    return {'value': label['id']}
            except KeyError:
                # Incomplete label entry: try the next one.
                continue
        # Reached only when this release yielded no usable label.
        logging.getLogger('root.features').warning(
            f"Release {release['id']} has no associated record label")
    return None


if __name__ == "__main__":
    array_feature(album_record_label, mp=False)
if release_group_musicbrainz_id is not None: release_group = musicbrainzngs.get_release_group_by_id( release_group_musicbrainz_id['value'], includes=['tags'])['release-group'] try: tags = release_group['tag-list'] except KeyError: logging.getLogger('root.features').warning( f"Release-group {release_group_musicbrainz_id} has not tags") return None genres = [] for tag in tags: try: musicbrainz_genre_id = genres_musicbrainz(tag['name']) genres.append(musicbrainz_genre_id) except KeyError: continue if len(genres) > 0: return [{'value': g} for g in genres] else: logging.getLogger('root.features').warning( f"No genres associated with release-group {release_group_musicbrainz_id}" ) return None if __name__ == "__main__": array_feature(album_genres, mp=True)