def write_out_songs(song_names, outfile):
    """Write the lyrics of every valid verse of the given songs to ``outfile``.

    Each song is read from ``<verses_with_tokens>/<file name>.json``, where the
    file name is produced by ``name_to_file_name``. For every verse whose
    ``'valid'`` entry is truthy, its ``'lyrics'`` are written followed by a
    newline; other verses are skipped.

    Args:
        song_names: iterable of song names (iterated through ``tqdm``).
        outfile: object exposing ``write``; receives one chunk per valid verse.
    """
    for name in tqdm(song_names):
        song_path = "{}/{}.json".format(
            verses_with_tokens, name_to_file_name(name))
        with open(song_path) as handle:
            loaded = json.load(handle)
        # Only verses flagged as valid are emitted.
        kept = (entry for entry in loaded['verses'] if entry['valid'])
        for entry in kept:
            outfile.write(entry['lyrics'] + '\n')
def process_song(song, bar):
    """Load a song and run ``apply_bpe_to_string`` over its valid verses.

    The song is read from ``<verses_with_tokens>/<file name>.json``. For each
    verse whose ``'valid'`` entry is truthy, ``verse['lyrics']`` is replaced by
    ``apply_bpe_to_string(lyrics, bpe)``; other verses are left untouched.

    Args:
        song: song name, converted to a file name by ``name_to_file_name``.
        bar: not used by this function.

    Returns:
        The loaded song dict with the updated verses.
    """
    source_path = "{}/{}.json".format(
        verses_with_tokens, name_to_file_name(song))
    with open(source_path) as handle:
        data = json.load(handle)
    for entry in data['verses']:
        if not entry['valid']:
            continue
        entry['lyrics'] = apply_bpe_to_string(entry['lyrics'], bpe)
    return data
def process_song(song, bar):
    """Load a cleaned song and attach an ``'artists'`` entry to every verse.

    The song is read from ``<cleaned_verses_dir>/<file name>.json``. For each
    verse, ``get_artists_from_metadata`` is called with the song title, the
    verse metadata, the song artist and the featured artists, and its result
    is stored under ``verse['artists']``.

    Args:
        song: song name, converted to a file name by ``name_to_file_name``.
        bar: not used by this function.

    Returns:
        The loaded song dict with ``'artists'`` set on each verse.
    """
    source_path = "{}/{}.json".format(
        cleaned_verses_dir, name_to_file_name(song))
    with open(source_path) as handle:
        data = json.load(handle)
    for entry in data['verses']:
        title = data['title']
        metadata = entry['metadata']
        main_artist = data['artist']
        featured = data['featured_artists']
        entry['artists'] = get_artists_from_metadata(
            title, metadata, main_artist, featured)
    return data
def create_train_file(list_file, out_file):
    """Collect the valid verses of the listed songs into one JSON file.

    Song names come from ``read_list_from_file(list_file)``; each song is read
    from ``<bpe_songs_dir>/<file name>.json``. Every verse whose ``'valid'``
    entry is truthy contributes a dict holding its ``'artist_id'`` and
    ``'lyrics'``. The resulting list is dumped as JSON to ``out_file``.

    Args:
        list_file: path handed to ``read_list_from_file``.
        out_file: path of the JSON file to write (opened in ``'w'`` mode).
    """
    collected = []
    names = read_list_from_file(list_file)
    for name in tqdm(names):
        song_path = "{}/{}.json".format(
            bpe_songs_dir, name_to_file_name(name))
        with open(song_path) as handle:
            data = json.load(handle)
        collected.extend(
            {'artist_id': entry['artist_id'], 'lyrics': entry['lyrics']}
            for entry in data['verses']
            if entry['valid']
        )
    # The output is written only after every song has been read.
    with open(out_file, 'w') as sink:
        json.dump(collected, sink)
def process_song(song, bar):
    """Load a song and wrap every non-empty lyric line in ``S`` / ``L`` markers.

    The song is read from ``<marked_verses_dir>/<file name>.json``. Each
    verse's lyrics are split on newlines, every line is stripped, and empty
    lines are dropped. The remaining lines are rebuilt into a single string of
    the form ``"S <line> L S <line> L"`` (markers and lines separated by single
    spaces, no leading or trailing whitespace), which replaces
    ``verse['lyrics']``. A verse with no non-empty lines ends up with ``''``.

    Args:
        song: song name, converted to a file name by ``name_to_file_name``.
        bar: not used by this function.

    Returns:
        The loaded song dict with the rewritten verse lyrics.
    """
    source_path = "{}/{}.json".format(
        marked_verses_dir, name_to_file_name(song))
    with open(source_path) as song_file:
        song = json.load(song_file)
    for verse in song['verses']:
        stripped_lines = (raw.strip() for raw in verse['lyrics'].split('\n'))
        # Reconstruct the lines together, adding the start and end-of-line
        # tokens back. Joining with a single space produces the same text as
        # appending 'S <line> L ' repeatedly and stripping the result.
        verse['lyrics'] = ' '.join(
            'S ' + line + ' L' for line in stripped_lines if line != '')
    return song
def analyze_verses(song_list_path, song_dir):
    """Gather verse, line and word statistics for the listed songs.

    Song names come from ``read_list_from_file(song_list_path)`` and each song
    is read from ``<song_dir>/<file name>.json``. Counts are recorded through
    ``update_analysis``, ``update_artist`` and the callable returned by
    ``get_update_songs()``; invalid verses are skipped. Three JSON files are
    written relative to the current working directory:
    ``verse_analysis.json``, ``artist_analysis.json`` and
    ``song_analysis.json``.

    Args:
        song_list_path: path handed to ``read_list_from_file``.
        song_dir: directory containing the per-song JSON files.
    """
    names = read_list_from_file(song_list_path)
    analysis = {'verses': {}, 'lines': {}, 'words': {}, 'words_per_verse': {}}
    artists = {}
    songs = {}
    progress = tqdm(names)
    update_songs = get_update_songs()

    for name in progress:
        song_path = "{}/{}.json".format(song_dir, name_to_file_name(name))
        with open(song_path) as handle:
            song = json.load(handle)
        title = song['title']
        update_analysis(analysis, 'verses', len(song['verses']), title)
        update_artist(
            artists, clean_artist_names(song['artist']).strip(), 1, 0, 0)

        # handle verses
        for verse in song['verses']:
            if not verse['valid']:
                continue

            # handle lines: stripped, with blank lines discarded
            lines = [
                text for text in
                (raw.strip() for raw in verse['lyrics'].split('\n'))
                if text != ''
            ]
            line_count = len(lines)
            lead_artist = verse['artists'][0]
            update_analysis(analysis, 'lines', line_count, title)
            update_artist(artists, lead_artist, 0, 1, line_count)
            update_songs(
                songs,
                "{} || {}".format(song['artist'], song['title']),
                verse['artists'][0],
                line_count)

            # handle words: str.split() without arguments already yields
            # whitespace-free, non-empty tokens
            verse_word_total = 0
            for text in lines:
                word_count = len(text.split())
                verse_word_total += word_count
                update_analysis(analysis, 'words', word_count, title)
            update_analysis(
                analysis, 'words_per_verse', verse_word_total, title)

    # Turn each bucket's 'songs' collection into a list before dumping
    # (presumably these are sets built by update_analysis -- TODO confirm).
    for buckets in analysis.values():
        for bucket in buckets.values():
            bucket['songs'] = list(bucket['songs'])

    reports = (
        ("verse_analysis.json", analysis),
        ("artist_analysis.json", artists),
        ("song_analysis.json", songs),
    )
    for report_path, payload in reports:
        with open(report_path, "w") as outfile:
            json.dump(payload, outfile)
def analyze_characters(dir_path, list_file, input_format, out_file):
    """Record every unusual character found in the lyrics of the listed songs.

    Song names come from ``read_list_from_file("<dir_path>/<list_file>")`` and
    each song is read from ``<dir_path>/<file name>.json``. The lyric blocks
    returned by ``get_lyric_blocks(song, input_format)`` are scanned one
    character at a time. A character is recorded when it is not an ASCII
    letter or digit and is not in ``char_allow_list``. For each such character
    the result keeps a ``"count"`` and a ``"context"`` list of
    ``{"song": ..., "line": get_context(lyrics, index)}`` entries.

    The result is dumped to ``<out_file>.json`` and the elapsed time together
    with the number of songs is printed.

    Args:
        dir_path: directory holding both the list file and the song files.
        list_file: name of the song-list file inside ``dir_path``.
        input_format: forwarded unchanged to ``get_lyric_blocks``.
        out_file: output path without the ``.json`` extension.
    """
    song_list = read_list_from_file("{}/{}".format(dir_path, list_file))
    character_dict = {}
    start = time.time()
    # Compiled once instead of being looked up for every single character.
    non_alnum = re.compile(r'[^a-zA-Z0-9]+')

    for song_name in tqdm(song_list):
        song_file_name = name_to_file_name(song_name.strip())
        with open('{}/{}.json'.format(dir_path, song_file_name)) as jsonfile:
            song = json.load(jsonfile)

        for lyrics in get_lyric_blocks(song, input_format):
            for i, c in enumerate(lyrics):
                if non_alnum.search(c) is None or c in char_allow_list:
                    continue
                # First sighting creates the entry; every sighting then bumps
                # the count and appends its context.
                entry = character_dict.setdefault(
                    c, {"count": 0, "context": []})
                entry['count'] += 1
                entry['context'].append({
                    "song": song_name,
                    "line": get_context(lyrics, i)
                })

    with open("{}.json".format(out_file), "w") as openfile:
        json.dump(character_dict, openfile)

    time_taken = str(datetime.timedelta(seconds=time.time() - start))
    print("{} for {}".format(time_taken, len(song_list)))
def process_song(song, bar):
    """Load a verse-split song, clean each verse and drop the empty ones.

    The song is read from ``<verse_split_songs_dir>/<file name>.json``. Each
    verse's lyrics are replaced by ``clean_lyrics(lyrics)``. Verses whose
    cleaned lyrics are blank are removed from the song, and a
    ``"<title> || <metadata>"`` line is appended to
    ``<cleaned_verses_dir>/<removed_verse_metadata_file>`` for each of them.

    Args:
        song: song name, converted to a file name by ``name_to_file_name``.
        bar: not used by this function.

    Returns:
        The loaded song dict with only the non-empty verses kept.
    """
    source_path = "{}/{}.json".format(
        verse_split_songs_dir, name_to_file_name(song))
    with open(source_path) as handle:
        data = json.load(handle)

    kept = []
    for entry in data['verses']:
        cleaned = clean_lyrics(entry['lyrics'])
        entry['lyrics'] = cleaned
        if cleaned.strip():
            # still a good verse, keep it
            kept.append(entry)
            continue
        # empty after cleaning: leave it out and note which one was removed
        removed_log = "{}/{}".format(
            cleaned_verses_dir, removed_verse_metadata_file)
        with open(removed_log, 'a') as log:
            log.write('{} || {}\n'.format(data['title'], entry['metadata']))

    data['verses'] = kept
    return data
def _split_lyrics_into_verses(lyrics):
    """Split raw lyrics into ``{'metadata', 'lyrics'}`` dicts, one per header.

    A verse starts at a ``[`` character. Its metadata runs up to and including
    the first following ``]`` (or to the end of the text when there is none).
    Its lyrics are everything after that, up to the next ``[``. Text before
    the first ``[`` belongs to no verse and is discarded.
    """
    verses = []
    open_at = lyrics.find('[')
    while open_at != -1:
        close_at = lyrics.find(']', open_at)
        if close_at == -1:
            # Unterminated header: the rest of the text is all metadata.
            metadata = lyrics[open_at:]
            body = ''
            next_open = -1
        else:
            metadata = lyrics[open_at:close_at + 1]
            next_open = lyrics.find('[', close_at + 1)
            body_end = len(lyrics) if next_open == -1 else next_open
            body = lyrics[close_at + 1:body_end]
        verses.append({'metadata': metadata, 'lyrics': body})
        open_at = next_open
    return verses


def process_song(song, bar):
    """Load a cleaned song and split its lyrics into verses.

    The song is read from ``<cleaned_songs_dir>/<file name>.json``. Its
    ``'lyrics'`` text is cut at every ``[...]`` header: the bracketed text
    (brackets included) becomes the verse's ``'metadata'`` and the text up to
    the next header becomes its ``'lyrics'``. Text before the first header is
    dropped.

    Args:
        song: song name, converted to a file name by ``name_to_file_name``.
        bar: not used by this function.

    Returns:
        A new dict with ``'title'``, ``'verses'``, ``'artist'`` and
        ``'featured_artists'``; the last three of the song's own fields are
        copied from the loaded song.
    """
    source_path = "{}/{}.json".format(
        cleaned_songs_dir, name_to_file_name(song))
    with open(source_path) as song_file:
        song = json.load(song_file)
    return {
        'title': song['title'],
        'verses': _split_lyrics_into_verses(song['lyrics']),
        'artist': song['artist'],
        'featured_artists': song['featured_artists']
    }
def artist_to_raw_song_files(artists_file):
    """Hand each listed artist's songs to ``loop_and_process``.

    ``artists_file`` holds one artist name per line. For every artist, the
    JSON file ``<artist_lyric_dir>/<file name>`` is loaded and its ``"songs"``
    list is passed to ``loop_and_process`` together with a converter, the
    label ``"Song"``, a title getter and ``raw_songs_dir``.

    Args:
        artists_file: path of the text file listing the artist names.
    """
    def process_song(song):
        # Reduce a song record to title, primary artist name and lyrics.
        return {
            'title': song['title'],
            'artist': song['primary_artist']['name'],
            'lyrics': song['lyrics']
        }

    def get_song_name(song):
        return song['title']

    with open(artists_file) as listing:
        artist_names = [row.strip() for row in listing]

    for artist_name in tqdm(artist_names):
        artist_path = "{}/{}".format(
            artist_lyric_dir, name_to_file_name(artist_name))
        with open(artist_path) as handle:
            artist_record = json.load(handle)
        loop_and_process(
            artist_record["songs"],
            process_song,
            "Song",
            get_song_name,
            raw_songs_dir)
def process_song(song, bar):
    """Load a song, flag each verse as valid or not, and drop unusable songs.

    The song is read from ``<verse_artists_dir>/<file name>.json``. A song
    whose cleaned, stripped artist name is not in ``artist_list`` is rejected
    outright. Otherwise each verse gets a ``'valid'`` entry, which is truthy
    only when ``is_verse_artist_valid``, ``is_verse_type_valid`` and
    ``has_enough_lines`` all pass and the same lyrics have not already been
    seen earlier in this song. Valid verses also get an ``'artist_id'``: the
    1-based position of their first artist in ``artist_list``.

    Args:
        song: song name, converted to a file name by ``name_to_file_name``.
        bar: not used by this function.

    Returns:
        The loaded song dict when at least one verse is valid, otherwise
        ``False``.
    """
    source_path = "{}/{}.json".format(
        verse_artists_dir, name_to_file_name(song))
    with open(source_path) as handle:
        data = json.load(handle)

    # TODO: we just add this here for ease, but it should be moved
    # somewhere else
    # here we remove songs that are not by someone in our artist list
    main_artist = clean_artist_names(data['artist']).strip()
    if main_artist not in artist_list:
        return False

    seen_lyrics = set()
    for entry in data['verses']:
        entry['valid'] = (
            is_verse_artist_valid(entry, artist_list)
            and is_verse_type_valid(entry)
            and has_enough_lines(entry)
            and entry['lyrics'] not in seen_lyrics
        )
        # Recorded whether or not the verse was valid, so later repeats of
        # the same lyrics are flagged as duplicates.
        seen_lyrics.add(entry['lyrics'])
        if entry['valid']:
            entry['artist_id'] = artist_list.index(entry['artists'][0]) + 1

    # if all verses are invalid, remove the song
    if any(entry['valid'] for entry in data['verses']):
        return data
    return False
def process_song(song, bar):
    """Load a raw song and replace its lyrics with ``clean_lyrics(lyrics)``.

    The song is read from ``<raw_songs_dir>/<file name>.json``.

    Args:
        song: song name, converted to a file name by ``name_to_file_name``.
        bar: not used by this function.

    Returns:
        The loaded song dict with the cleaned ``'lyrics'``.
    """
    source_path = "{}/{}.json".format(raw_songs_dir, name_to_file_name(song))
    with open(source_path) as handle:
        data = json.load(handle)
    cleaned = clean_lyrics(data['lyrics'])
    data['lyrics'] = cleaned
    return data