def get_artist_albums(artist_name_object, number_of_albums):
    """
    Fetch the album ids of every artist, keeping at most `number_of_albums`
    per artist.

    :param artist_name_object: dict keyed by artist id (the values, the artist
        names, are not used here)
    :param number_of_albums: maximum number of album ids to keep per artist
    :return: dict mapping artist id -> list of album ids; an artist for which
        no album id was kept does not appear in the result
    """
    artist_album_ids = {}
    total_artists = len(artist_name_object)

    if VERBOSE:
        helper.log_highlight('Fetching albums of artists')

    for counter, artist_id in enumerate(artist_name_object, start=1):
        response = fetch_artist_albums(artist_id)
        status_code = response['message']['header']['status_code']

        if VERBOSE:
            print('Fetching albums of ' + str(artist_id) +
                  ' [' + str(counter) + ' of ' + str(total_artists) + ']')

        # Compare the status by value: `is 200` only works by accident of
        # CPython's small-integer cache.
        if status_code == 200 and len(response['message']['body']['album_list']) > 0:
            albums = response['message']['body']['album_list']

            for index, album in enumerate(albums):
                if index >= number_of_albums:
                    break
                album_id = album['album']['album_id']
                artist_album_ids.setdefault(artist_id, []).append(album_id)
        elif VERBOSE:
            # The original built this message but never printed it.
            print('Album ' + str(counter) + ' of ' + str(artist_id) + ' not found')

    return artist_album_ids
def lfm_save_user_characteristics(users):
    """
    Fetch the "getinfo" record of every user and write one line per user to
    USER_CHARACTERISTICS_FILE.

    Users whose lookup or formatting fails are reported on stdout and left
    out of the file; all other users are still processed.

    :param users: a sized collection of user names
    """
    lines = []

    if VERBOSE:
        helper.log_highlight("Saving user characteristics")

    # make sure the output directory exists
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    for index, user in enumerate(users, start=1):
        try:
            if VERBOSE:
                print("Fetch user characteristics [" + str(index) + " of " + str(len(users)) + "]")

            user_info = helper.api_user_call("getinfo", user, "")
            user_string = lfm_prepare_user_characteristics_string(user_info['user'])
            lines.append(user_string + "\n")
        except Exception:
            # Best effort: report and move on. The loop advances by itself;
            # the original also called next() on the iterator here, which
            # silently dropped the following user and raised StopIteration
            # when the failing user was the last one.
            print("EXCEPTION lfm_save_user_characteristics")

    # `with` closes the file even if the write fails
    with open(USER_CHARACTERISTICS_FILE, 'w') as text_file:
        text_file.write("".join(lines))

    if VERBOSE:
        print("\nSuccessfully created " + USER_CHARACTERISTICS_FILE + "\n")

    return
def get_lyrics_by_tracks(artist_tracks_id_object):
    """
    Fetch the lyrics of every track, store them per artist as JSON and return
    the concatenated lyrics per artist.

    An artist is skipped when SKIP_EXISTING_LYRICS is set and its JSON file
    already exists. Once any request answers with status 402 (treated as
    "limit reached"), no further JSON files are written.

    :param artist_tracks_id_object: dict mapping artist id -> list of track ids
    :return: dict mapping artist id -> all fetched lyrics of that artist joined
        into one string; artists without any fetched lyrics are absent
    """
    artist_tracks_object = {}
    # removes "**** This Lyrics is NOT... ***" at the end of the string
    musixmatch_regex = re.compile(r'\*.*\*\s*$')
    is_limit_reached = False
    total_artists = len(artist_tracks_id_object)

    helper.ensure_dir(OUTPUT_DIR_MUSIXMATCH_JSON)

    if VERBOSE:
        helper.log_highlight('Fetching lyrics of tracks')

    for counter, (artist_id, tracks) in enumerate(artist_tracks_id_object.items(), start=1):
        # str() so that integer ids work as well; the original concatenated
        # the raw id here but used str(artist_id) everywhere else.
        json_path = OUTPUT_DIR_MUSIXMATCH_JSON + str(artist_id) + '.json'
        artist_tracks = {artist_id: []}

        if VERBOSE:
            print('Fetching tracks of artist ' + str(artist_id) +
                  ' [' + str(counter) + ' of ' + str(total_artists) + ']')

        if SKIP_EXISTING_LYRICS and os.path.exists(json_path):
            if VERBOSE:
                print(" Tracks of artist already fetched: " + json_path)
            continue

        for index, track_id in enumerate(tracks, start=1):
            response = fetch_lyrics_by_track_id(track_id)
            status_code = response['message']['header']['status_code']

            if VERBOSE:
                print(' Fetching lyrics of track ' + str(track_id) +
                      ' [' + str(index) + ' of ' + str(len(tracks)) + ']')

            # compare by value, not identity
            if status_code == 200:
                lyrics = response['message']['body']['lyrics']['lyrics_body']
                lyrics_replaced = musixmatch_regex.sub('', lyrics)

                artist_tracks[artist_id].append(lyrics_replaced)
                artist_tracks_object[artist_id] = \
                    artist_tracks_object.get(artist_id, '') + lyrics_replaced
            elif status_code == 402:
                is_limit_reached = True

        if not is_limit_reached:
            if VERBOSE:
                print('\n Save JSON with lyrics\n')
            save_json(artist_tracks, json_path)

    return artist_tracks_object
def get_user_friends(all_users, limit_user):
    """
    Collect the friends of the given users and write the de-duplicated list
    of users plus friends to USER_LIST_FILE.

    Fetching stops after the first successfully processed user whose
    1-based position is greater than `limit_user`; otherwise it runs until
    `all_users` is exhausted. The file is written in both cases.

    :param all_users: list of user names
    :param limit_user: position after which no further users are queried
    """
    all_user_friends = []
    limit_reached = False

    if VERBOSE:
        helper.log_highlight("Fetching friends of user")

    for index, user in enumerate(all_users, start=1):
        user_get_friends = helper.api_user_call("getfriends", user, "")

        try:
            user_friends = user_get_friends['friends']['user']

            if VERBOSE:
                print("Fetching friends of " + user)

            for friend in user_friends:
                all_user_friends.append(friend['name'])
        except KeyError:
            # The loop moves on by itself; the original additionally called
            # next() on the iterator, which skipped the following user and
            # raised StopIteration when this was the last one.
            print("")
            print("SKIP: User has no friends")
            continue
        except Exception:
            print("")
            print("ERROR: ")
            print(traceback.format_exc())
            continue

        if index > limit_user:
            limit_reached = True
            break

    all_users_and_friends = all_users + all_user_friends
    all_unique_users = helper.get_unique_items(all_users_and_friends)
    np.savetxt(USER_LIST_FILE, all_unique_users, delimiter=",", fmt='%s')

    # as before, the success message is only shown when the limit ended the run
    if limit_reached and VERBOSE:
        print("\nSuccessfully created " + USER_LIST_FILE)
        print("Successfully fetched friends\n")

    return
def language_stats_musixmatch():
    """
    Count how many stored lyrics were detected per language.

    Walks the artists of ARTISTS_FILE (at most MUSIXMATCH_MAX_ARTISTS), loads
    the lyrics JSON of every generated Musixmatch artist with the same name
    and runs `detect` on each stored lyrics string.

    :return: dict mapping the value returned by `detect` -> number of lyrics
    """
    languages = {}
    musixmatch_artists = mf.read_txt(mf.GENERATED_ARTISTS_FILE)
    artists_file = Wikipedia_Fetcher.read_file(ARTISTS_FILE)[:MUSIXMATCH_MAX_ARTISTS]

    # Index the Musixmatch ids by artist name once, instead of scanning the
    # whole mapping again for every artist of the artist file.
    ids_by_name = {}
    for artist_mm_id, artist_mm_name in musixmatch_artists.items():
        ids_by_name.setdefault(artist_mm_name, []).append(artist_mm_id)

    if VERBOSE:
        helper.log_highlight('Generate lyrics content')

    for index, artist_name in enumerate(artists_file, start=1):
        if VERBOSE:
            print('Get lyrics of ' + artist_name +
                  ' [' + str(index) + ' of ' + str(len(artists_file)) + ']')

        for artist_mm_id in ids_by_name.get(artist_name, ()):
            lyrics_path = mf.OUTPUT_DIR_MUSIXMATCH_JSON + str(artist_mm_id) + '.json'

            try:
                with open(lyrics_path, 'r') as f:
                    data = json.load(f)

                for lyrics in data[artist_mm_id]:
                    try:
                        lang = detect(lyrics)
                    except Exception:
                        # skip lyrics whose language cannot be detected
                        continue
                    languages[lang] = languages.get(lang, 0) + 1
            except IOError:
                print('File ' + lyrics_path + ' not found')
            except Exception as e:
                # Still best effort, but no longer reported as a missing file
                # when the file exists and its content is the problem.
                print('File ' + lyrics_path + ' could not be processed: ' + str(e))

    return languages
def lfm_save_history_of_users(users):
    """
    Save the listening history of users to USER_LISTENING_HISTORY.

    For every user, MAX_RECENT_TRACK_PAGES pages of recent tracks are
    requested and each track becomes one line of the file. If anything fails
    for a user, the failure is reported on stdout, the lines collected for
    that user so far are kept and processing continues with the next user.

    :param users: a sized collection of users
    """
    lines = []

    if VERBOSE:
        helper.log_highlight("Saving listening history")

    # make sure the output directory exists
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    for user_index, user in enumerate(users, start=1):
        try:
            if VERBOSE:
                print("Fetch recent tracks from user [" + str(user_index) + " of " + str(len(users)) + "]")

            # Running position across all pages of this user. The original
            # printed index * page_number, which is not the track's position.
            tracks_seen = 0

            for page_number in range(1, MAX_RECENT_TRACK_PAGES + 1):
                recent_tracks = helper.api_user_call(
                    "getrecenttracks",
                    user,
                    "&limit=" + str(MAX_RECENT_TRACK_PER_PAGE) + "&page=" + str(page_number)
                )['recenttracks']['track']

                for recent_track in recent_tracks:
                    tracks_seen += 1

                    if VERBOSE and VERBOSE_DEPTH == 2:
                        print(" Fetch recent track [" + str(tracks_seen) + " of " +
                              str(len(recent_tracks) * MAX_RECENT_TRACK_PAGES) + "]")

                    listening_history = lfm_prepare_history_string(recent_track, user)
                    lines.append(listening_history + "\n")
        except Exception:
            # The loop advances by itself; the original also called next() on
            # the iterator here, which skipped the following user and raised
            # StopIteration when the failing user was the last one.
            print("EXCEPTION lfm_save_history_of_users")

    # `with` closes the file even if the write fails
    with open(USER_LISTENING_HISTORY, 'w') as text_file:
        text_file.write("".join(lines))

    if VERBOSE:
        print("\nSuccessfully created " + USER_LISTENING_HISTORY + "\n")

    return
def get_html_by_tracks(artist_tracks_id_object):
    """
    Download the lyrics page of every track that has lyrics and store it as
    <artist id>_<track id>.html below OUTPUT_DIR_MUSIXMATCH_HTML.

    Tracks whose request does not answer with status 200, or which report no
    lyrics, are skipped. Download or write errors (IOError) are reported in
    verbose mode and do not stop the run.

    :param artist_tracks_id_object: dict mapping artist id -> list of track ids
    """
    # local import: the module's import block is outside this function
    from contextlib import closing

    total_artists = len(artist_tracks_id_object)

    if VERBOSE:
        helper.log_highlight('Fetching lyrics HTML of tracks')

    helper.ensure_dir(OUTPUT_DIR_MUSIXMATCH_HTML)

    # enumerate() drives the progress counter; the original never advanced it
    for counter, (artist_id, tracks) in enumerate(artist_tracks_id_object.items(), start=1):
        if VERBOSE:
            print('Fetching tracks of artist ' + str(artist_id) +
                  ' [' + str(counter) + ' of ' + str(total_artists) + ']')

        for index, track_id in enumerate(tracks, start=1):
            response = fetch_html_lyrics_by_track_id(track_id)
            status_code = response['message']['header']['status_code']

            if VERBOSE:
                print(' Fetching lyrics of track ' + str(track_id) +
                      ' [' + str(index) + ' of ' + str(len(tracks)) + ']')

            # Only look into the body after the status check; the original
            # read body.track.has_lyrics unconditionally.
            if status_code != 200:
                continue

            track = response['message']['body']['track']
            if int(track['has_lyrics']) <= 0:
                continue

            track_url = track['track_share_url']
            filename = OUTPUT_DIR_MUSIXMATCH_HTML + str(artist_id) + '_' + str(track_id) + '.html'

            try:
                if VERBOSE:
                    print(' Storing and retrieving data from ' + track_url)

                # close the connection as soon as the page has been read
                with closing(urllib.urlopen(track_url)) as remote:
                    content = remote.read()

                with open(filename, 'w') as f:
                    f.write(content)
            except IOError:
                # skip this track in case some IO / socket error occurred
                if VERBOSE:
                    print(' Cannot retrieve data from ' + track_url)
def save_lfmb_c1ku_combined_file(c1ku_file, lfmb1_file, output_file, header_string):
    """
    Write the LFM-1b entries selected by an index file, in index-file order.

    The first column of every data row of `c1ku_file` is used as key into the
    mapping read from `lfmb1_file`; the looked-up values are written to
    `output_file`, one per line, below `header_string`.

    :param c1ku_file: tab-separated index file with one header row
    :param lfmb1_file: file passed to mf.read_txt to obtain the id -> line mapping
    :param output_file: path of the file to create
    :param header_string: first line of the output file
    :raises KeyError: if an id of the index file is missing in the mapping
    """
    helper.log_highlight('save ' + output_file)

    LFM1b_file = mf.read_txt(lfmb1_file)
    lines = [header_string]

    with open(c1ku_file, 'r') as f:
        reader = csv.reader(f, delimiter='\t')
        # Skip the header. next() with a default does not fail on an empty
        # file and, unlike reader.next(), is not tied to Python 2.
        next(reader, None)

        for row in reader:
            if not row:
                # the csv reader yields [] for blank lines
                continue
            lines.append(LFM1b_file[row[0]])

    helper.ensure_dir(OUTPUT_DIR)

    # `with` closes the file even if the write fails
    with open(output_file, 'w') as text_file:
        text_file.write("\n".join(lines) + "\n")
def get_artist_album_tracks(artist_album_object, number_of_tracks_per_album):
    """
    Fetch the track ids of every album, keeping at most
    `number_of_tracks_per_album` per album.

    :param artist_album_object: dict mapping artist id -> list of album ids
    :param number_of_tracks_per_album: maximum number of track ids per album
    :return: dict mapping artist id -> flat list of the kept track ids of all
        its albums; artists without any kept track are absent
    """
    artist_album_tracks = {}
    total_artists = len(artist_album_object)

    if VERBOSE:
        helper.log_highlight('Fetching tracks of albums')

    for counter, (artist_id, album_array) in enumerate(artist_album_object.items(), start=1):
        if VERBOSE:
            print('Fetching albums of artist ' + str(artist_id) +
                  ' [' + str(counter) + ' of ' + str(total_artists) + ']')

        for album_index, album_id in enumerate(album_array, start=1):
            response = fetch_artist_album_tracks(album_id)
            status_code = response['message']['header']['status_code']

            if VERBOSE:
                print(' Fetching tracks of album ' + str(album_id) +
                      ' [' + str(album_index) + ' of ' + str(len(album_array)) + ']')

            # compare by value, not identity
            if status_code == 200 and len(response['message']['body']['track_list']) > 0:
                tracks = response['message']['body']['track_list']

                # separate index name: the original reused `index` here and
                # clobbered the album index
                for track_index, track in enumerate(tracks):
                    if track_index >= number_of_tracks_per_album:
                        break
                    track_id = track['track']['track_id']
                    artist_album_tracks.setdefault(artist_id, []).append(track_id)
            elif VERBOSE:
                print(' Tracks of album ' + str(album_id) + ' not found')

    return artist_album_tracks
def get_artist_ids(artist_name_array):
    """
    Look up an artist id for every artist name.

    The first search result is taken for each name. Names that resolve to the
    same id overwrite each other, so the result can be shorter than the input.

    :param artist_name_array: a sized sequence of artist names
    :return: dict mapping artist id -> the artist name that was searched for
    """
    artists_with_id = {}
    # Use the size of the input for the progress output; the original printed
    # the global NUMBER_OF_MAX_ARTISTS, which need not match the input.
    total_artists = len(artist_name_array)

    if VERBOSE:
        helper.log_highlight('Fetching Artist IDs')

    for index, artist_name in enumerate(artist_name_array, start=1):
        response = fetch_artist_by_term(artist_name)
        status_code = response['message']['header']['status_code']

        if VERBOSE:
            print('Fetching ' + artist_name +
                  ' [' + str(index) + ' of ' + str(total_artists) + ']')

        # compare by value, not identity
        if status_code == 200 and len(response['message']['body']['artist_list']) > 0:
            # always get the first artist
            chosen_artist = response['message']['body']['artist_list'][0]['artist']
            artists_with_id[chosen_artist['artist_id']] = artist_name
        elif VERBOSE:
            print(artist_name + ' not found')

    return artists_with_id
return file_contents # /read_file # Main program if __name__ == '__main__': artists = helper.read_csv(ARTIST_FILE) if type(NUMBER_OF_MAX_ARTISTS) is bool and NUMBER_OF_MAX_ARTISTS is True: NUMBER_OF_MAX_ARTISTS = len(artists) artists = artists[:NUMBER_OF_MAX_ARTISTS] number_of_fetches = NUMBER_OF_MAX_ARTISTS * 2 + (NUMBER_OF_MAX_ARTISTS * NUMBER_OF_ALBUMS) * (1 + NUMBER_OF_MAX_TRACKS) if VERBOSE: helper.log_highlight('You will have ' + str(number_of_fetches) + ' queries to the musixmatch api') print '' print 'Artist queries: ' + str(NUMBER_OF_MAX_ARTISTS) print 'Album queries: ' + str(NUMBER_OF_MAX_ARTISTS) print 'Track queries: ' + str(NUMBER_OF_MAX_ARTISTS * NUMBER_OF_ALBUMS) print 'Lyrics queries: ' + str((NUMBER_OF_MAX_ARTISTS * NUMBER_OF_ALBUMS) * NUMBER_OF_MAX_TRACKS) print '' print 'These numbers can vary if an artists has less albums, tracks or tracks with lyrics' print '' helper.ensure_dir(OUTPUT_DIR_MUSIXMATCH) # live fetching # fetched_artist_ids = get_artist_ids(artists) # save_txt(fetched_artist_ids, 'artist_ids.txt')
def generate_musixmatch_AAM(): ps = PorterStemmer() lyrics_contents = {} terms_df = {} term_list = [] total_string = '' musixmatch_artists = mf.read_txt(mf.GENERATED_ARTISTS_FILE) artists_file = Wikipedia_Fetcher.read_file(ARTISTS_FILE)[:MUSIXMATCH_MAX_ARTISTS] ########################### ## keep artist structure ## ########################### if VERBOSE: helper.log_highlight('Generate lyrics content') # iterate over the same artist file and check # if the values are in the same order # so the later generated AAM is still in the same order for index, artist_name in enumerate(artists_file, start = 0): # make it short for debugging if VERBOSE: print 'Get lyrics of ' + artist_name + ' [' + str(index + 1) + ' of ' + str(len(artists_file)) + ']' if index < len(artists_file): for artist_mm_id, artist_mm_name in musixmatch_artists.items(): # if the name is in the musixmatch array # to checking it is still in the same order if artist_name == artist_mm_name: # check the lyrics and sort everything file = mf.OUTPUT_DIR_MUSIXMATCH_JSON + str(artist_mm_id) + '.json' try: with open(file, 'r') as f: data = json.load(f) # create reader data_by_artist = data[artist_mm_id] lyrics_content = '' for string in data_by_artist: # remove all non english try: lyrics = re.sub(r'\*.*\*(\s|\S)*$', '', string) lang = detect(lyrics) # translate non-english strings if lang != 'en': total_string += lyrics # translation = translate_client.translate(translated_string, target_language = 'en') # translated_string = translation['translatedText'].encode('utf-8') translated_string = lyrics else: translated_string = lyrics lyrics_content += translated_string except Exception, e: continue ##################################### ## sorting | stamming | stopwords ## ##################################### # remove dots content_no_dots = re.sub(r'\.', ' ', lyrics_content) # remove numbers content_no_numbers = re.sub(r'[0-9]+', ' ', content_no_dots) # Perform case-folding, i.e., convert to lower case 
content_casefolded = content_no_numbers.lower() # Tokenize stripped content at white space characters tokens = content_casefolded.split() # Remove all tokens containing non-alphanumeric characters; using a simple lambda function (i.e., anonymous function, can be used as parameter to other function) tokens_filtered = filter(lambda t: t.isalnum(), tokens) # Remove words in the stop word list tokens_filtered_stopped = filter(lambda t: t not in STOP_WORDS, tokens_filtered) tokens_stemmed = [] for w in tokens_filtered_stopped: tokens_stemmed.append(ps.stem(w)) if len(tokens_stemmed) > 0: lyrics_contents[index] = tokens_stemmed except Exception, e: print e print 'File ' + file + ' not found'
# Main program if __name__ == '__main__': # Initialize variables # artists = [] # artists # users = [] # users # UAM = [] # user-artist-matrix # Load metadata from provided files into lists artists = helper.read_csv(ARTISTS_FILE) users = helper.read_csv(USERS_FILE) recommender_users = {} # Load UAM - Konstruiert Matrix aus einem File if VERBOSE: helper.log_highlight('Loading UAM') UAM = np.loadtxt(UAM_FILE, delimiter='\t', dtype=np.float32) if VERBOSE: print '\nSuccessfully read UAM\n' # For all users if VERBOSE: helper.log_highlight('Initialize CF recommendation for users') for u in range(0, UAM.shape[0]): recommender = recommend_CF(UAM, u, users) recommender_users[users[u]] = recommender[users[u]] if VERBOSE:
all_artists = [] with open(LE_FILE, 'r') as f: reader = csv.reader(f, delimiter='\t') # create reader headers = reader.next() # skip header for row in reader: artist = row[2] all_artists.append(artist) return len(helper.get_unique_items(all_artists)) # /unique_artists_total # Main if __name__ == "__main__": helper.log_highlight('Users In Total') print len(helper.read_csv(USER_FILE)) print '' helper.log_highlight('Unique Tracks In Total') print unique_tracks_total() print '' helper.log_highlight('Unique Artists In Total') print unique_artists_total() print '' helper.log_highlight('Listening Events In Total') print le_total() print ''
print("%.3f, %.3f" % (avg_prec, avg_rec)) print("K neighbors " + str(K)) print("Recommendation: " + str(MIN_RECOMMENDED_ARTISTS)) # /run # Main program, for experimentation. if __name__ == '__main__': # Load metadata from provided files into lists artists = read_from_file(ARTISTS_FILE) users = read_from_file(USERS_FILE) if VERBOSE: helper.log_highlight('Read UAM file') UAM = np.loadtxt(UAM_FILE, delimiter='\t', dtype=np.float32)[:, :MAX_ARTISTS] if VERBOSE: print 'Successfully read UAM file\n' if VERBOSE: helper.log_highlight('Read AAM file') AAM = np.loadtxt(AAM_FILE, delimiter='\t', dtype=np.float32)[:MAX_ARTISTS, :MAX_ARTISTS] if VERBOSE: print 'Successfully read AAM file\n'
if (lyrics_contents[index]): continue; except: lyrics_contents[index] = '' if VERBOSE: print 'Stored lyrics into "lyrics_contents" object\n' ####################### ## termslist counter ## ####################### if VERBOSE: helper.log_highlight('Get termsweight of fetched lyrics') # get terms list # Iterate over all (key, value) tuples from dictionary just created to determine document frequency (DF) of all terms for aid, terms in lyrics_contents.items(): # convert list of terms to set of terms ("uniquify" words for each artist/document) for t in set(terms): # and iterate over all terms in this set # update number of artists/documents in which current term t occurs if t not in terms_df: terms_df[t] = 1 else: terms_df[t] += 1 # remove all values which are one terms_df = dict((k, v) for k, v in terms_df.iteritems() if v != 1)
def limit_user(all_users, min_amount_of_users, play_count, min_amount_of_artists_user, min_amount_of_unique_artists_all_users):
    """
    Data cleansing: walk through the users until enough satisfying users and
    enough unique artists have been collected.

    A user is satisfying when more than `min_amount_of_artists_user` of the
    artists returned by "gettopartists" have a playcount of at least
    `play_count`. Artist names that reach `play_count` are collected from
    every user, satisfying or not.

    As soon as both thresholds are met, the users and artist names are
    written to limited_user_list.csv and all_artist_names.csv in OUTPUT_DIR.

    :param all_users: a list of all users
    :param min_amount_of_users: how many users have to be collected
    :param play_count: minimum playcount for an artist to be counted
    :param min_amount_of_artists_user: a user needs more counted artists than this
    :param min_amount_of_unique_artists_all_users: min. amount of unique artists for all users
    :return: the list of satisfying users, or None (and no files written) if
        `all_users` is exhausted before both thresholds are met
    """
    limited_users = []
    all_artist_names = []
    counter = 1

    if VERBOSE:
        helper.log_highlight("Limit users - data cleansing")

    # Loop through list of all users
    for user in all_users:
        # Get artist-history from users via LastFM-API call
        top_artists = helper.api_user_call("gettopartists", user, "")

        # Without usable artist data, skip this user entirely. The original
        # fell through after reporting the error and evaluated the previous
        # user's artists again (NameError on the first user); it also called
        # next() on the iterator, which skipped the following user.
        try:
            artists = top_artists['topartists']['artist']
        except KeyError:
            print("")
            print("SKIP: User has no artists")
            continue
        except Exception:
            print("")
            print("ERROR:")
            print(traceback.format_exc())
            continue

        # Count the artists of this user that reach play_count and remember
        # their names
        artist_counter = 0
        for artist in artists:
            artist_playcount = int(artist['playcount'])

            if artist_playcount >= play_count:
                artist_counter += 1
                all_artist_names.append(artist['name'].encode('utf-8'))

        if VERBOSE and VERBOSE_DEPTH == 2:
            print(" Artists (not unique): " + str(len(all_artist_names)))

        # Data cleansing: only keep users with enough counted artists
        if artist_counter > min_amount_of_artists_user:
            if VERBOSE:
                print("Fetched satisfying user [" + str(counter) + " of " + str(min_amount_of_users) + "]")

            limited_users.append(user)
            counter += 1

        # Delete duplicates from all_artist_names
        all_artist_names = helper.get_unique_items(all_artist_names)

        if VERBOSE and VERBOSE_DEPTH == 2:
            print(" Artists (unique): " + str(len(all_artist_names)))

        # Stop as soon as both minimums are reached
        if len(all_artist_names) >= min_amount_of_unique_artists_all_users \
                and len(limited_users) >= min_amount_of_users:
            np.savetxt(OUTPUT_DIR + "/limited_user_list.csv", limited_users, delimiter=",", fmt='%s')
            np.savetxt(OUTPUT_DIR + "/all_artist_names.csv", all_artist_names, delimiter=",", fmt='%s')

            if VERBOSE:
                print("\nData cleansing successful\n")

            return limited_users

    # thresholds never reached: same result as before, now spelled out
    return None
data = {} data['avg_prec'] = avg_prec data['avg_rec'] = avg_rec data['f1_score'] = f1_score data['recommended'] = recommended_artists return data # /run # Main program, for experimentation. if __name__ == '__main__': artists = helper.read_csv(ARTISTS_FILE) users = helper.read_csv(USERS_FILE) if VERBOSE: helper.log_highlight('Loading UAM') UAM = np.loadtxt(UAM_FILE, delimiter='\t', dtype=np.float32) if VERBOSE: print 'Successfully loaded UAM' time_start = time.time() run_recommender(run, METHOD, [1]) # serial time_end = time.time() elapsed_time = (time_end - time_start) print elapsed_time
sorted_string = header_string + "\n" with open(c1ku_file, 'r') as f: reader = csv.reader(f, delimiter='\t') # create reader headers = reader.next() # skip header for index, row in enumerate(reader, start=1): the_id = row[0] sorted_string += LFM1b_file[the_id] + "\n" helper.ensure_dir(OUTPUT_DIR) text_file = open(output_file, 'w') text_file.write(sorted_string) text_file.close() # /save_lfmb_c1ku_combined_file # Main if __name__ == "__main__": # first get all artists from the LFM1b_artists and # just save artists which are saved in C1ku_idx_artists.txt save_lfmb_c1ku_combined_file(C1KU_ARTISTS_IDX, ARTISTS, CHOSEN_ARTISTS, 'artists') save_lfmb_c1ku_combined_file(C1KU_USERS_IDX, USERS, CHOSEN_USERS, 'users') helper.log_highlight("Done")