def test(features, ndigits):
    """Sanity-check an extracted feature matrix.

    Any row containing NaNs is assumed to come from a track with no audio;
    the set of such rows must match ``fma.FILES_NO_AUDIO`` exactly, and the
    matrix must agree with the reference ``data/features.csv`` to roughly
    ``ndigits`` significant digits.
    """
    failed = features.index[features.isnull().any(axis=1)]
    if len(failed):
        print('Failed tracks: {}'.format(', '.join(map(str, failed))))
    # Failed features extraction should be due to files without audio.
    assert {int(track_id) for track_id in failed} == set(fma.FILES_NO_AUDIO)
    reference = fma.load('data/features.csv')
    np.testing.assert_allclose(reference.values, features.values,
                               rtol=10**-ndigits)
#!/usr/bin/env python3
"""Adapted from https://github.com/mdeff/fma/blob/master/baselines.ipynb"""
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

import fma

# Single-column CSV squeezed into a Series of genre labels indexed by track.
# NOTE: `pd.read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and
# removed in 2.0; `.squeeze('columns')` is the supported, equivalent form.
y_train = pd.read_csv('data/train_labels.csv', index_col=0).squeeze('columns')
features = fma.load('data/features.csv')

# First 25000 tracks are the labeled training split; the rest is the test set.
X_train = features[:25000]
X_test = features[25000:]

# Data cleaning: drop tracks whose feature extraction is known to be faulty.
X_train = X_train.drop(fma.FAULTY_FILES)
y_train = y_train.drop(fma.FAULTY_FILES)

# The track IDs are integers for the training set.
X_train.index = pd.Index((int(i) for i in X_train.index), name='track_id')

# Should be done already, but better be sure: features and labels must be
# aligned row-for-row before training.
X_train.sort_index(inplace=True)
X_test.sort_index(inplace=True)
y_train.sort_index(inplace=True)
assert (X_train.index == y_train.index).all()
#!/usr/bin/env python
"""Export top-level genre labels for the small+medium FMA subsets to CSV."""
import fma

tracks = fma.load('data/fma_metadata/tracks.csv')

# Keep tracks belonging to the 'small' or 'medium' subsets (the subset
# column orders small <= medium <= large).
selected = tracks.index[tracks['set', 'subset'] <= 'medium']

genre_labels = tracks.loc[selected, ('track', 'genre_top')]
genre_labels.name = 'genre'
genre_labels.to_csv('data/train_labels.csv', header=True)