# Assumed imports, following the `from DatasetCreator import DatasetCreator`
# convention used elsewhere in this repository.
from DatasetCreator import DatasetCreator
from FeatureSelector import FeatureSelector
from Autotagger import Autotagger

def training_and_classification_with_kfold_cross_validation(collection_name, k):
    ''' Training and classification of an autotagger using k-fold cross validation '''
    _split_metadata_and_features(collection_name, k)
    for i in range(1, k+1):
        # Create a Gaia dataset with the training set
        print "----------------------- DATASET CREATION (FOLD %d) -----------------------" % i
        training_features = 'train/%s_features__fold%d.tsv' % (collection_name, i)
        chunk_size = 5000
        dataset_suffix = "fold%d" % i
        replace_dataset = True
        dataset_creator = DatasetCreator(collection_name)
        dataset_creator.create(training_features, chunk_size, dataset_suffix, replace_dataset)

        # Feature selection over the Gaia dataset
        print "----------------------- FEATURE SELECTION (FOLD %d) -----------------------" % i
        dataset = 'dbs/%s__fold%d.db' % (collection_name, i)
        pca_covered_variance = 75
        include_highlevel = True
        feature_selector = FeatureSelector()
        feature_selector.select(dataset, pca_covered_variance, include_highlevel)

        # Autotag a given test set
        print "----------------------- AUTOTAGGING (FOLD %d) -----------------------" % i
        dataset = 'transformed_dbs/%s__fold%d.db' % (collection_name, i)
        training_metadata = 'train/%s_metadata__fold%d.tsv' % (collection_name, i)
        test_features = 'test/%s_features__fold%d.tsv' % (collection_name, i)
        output_binary = 'test/%s_output_binary__fold%d.tsv' % (collection_name, i)
        output_affinity = 'test/%s_output_affinity__fold%d.tsv' % (collection_name, i)
        metric = 'LC'
        num_sim = 18
        threshold = 0.2
        autotagger = Autotagger()
        autotagger.train(dataset, training_metadata)
        autotagger.classify(test_features, output_binary, metric, num_sim, threshold, ranked=False)
        autotagger.classify(test_features, output_affinity, metric, num_sim, threshold, ranked=True)
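# A minimal usage sketch (an assumption, not part of the original module):
# runs 5-fold cross validation over an illustrative collection name. It
# assumes the collection's metadata/feature .tsv files are already in place
# for _split_metadata_and_features to split into per-fold train/ and test/
# files.
if __name__ == '__main__':
    training_and_classification_with_kfold_cross_validation('magnatagatune', 5)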
# You should have received a copy of the GNU General Public License
# along with music-autotagging-msordo. If not, see <http://www.gnu.org/licenses/>.
#
# Written by Mohamed Sordo (@neomoha)
# Email: mohamed ^dot^ sordo ^at^ gmail ^dot^ com
# Website: http://msordo.weebly.com

import os, sys, argparse

from DatasetCreator import DatasetCreator

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Create a Gaia dataset given a list of feature files')
    parser.add_argument('collection_name', help='Name of the collection')
    parser.add_argument('--training-features', default=None, help='A file containing paths to the features of the audios used for training (default="train/COLLECTIONNAME_features.tsv")')
    parser.add_argument('--chunk-size', type=int, default=5000, help='The dataset will be created in chunks of N songs at a time (default=5000)')
    parser.add_argument('--dataset-suffix', default=None, help='Suffix to add to the dataset filename (useful when doing k-fold cross validation, for example) (default=None)')
    parser.add_argument('-r', '--replace-dataset', help='Replace the old dataset (if it exists)', action="store_true")
    args = parser.parse_args()
    if args.training_features is None:
        args.training_features = "train/" + args.collection_name + "_features.tsv"
    if not os.path.exists(args.training_features):
        print "Training features file '%s' not found" % args.training_features
        sys.exit(-1)
    print args
    dataset_creator = DatasetCreator(args.collection_name)
    dataset_creator.create(args.training_features, args.chunk_size, args.dataset_suffix, args.replace_dataset)
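# Example invocations (an assumption; the script filename is illustrative,
# and 'mycollection' stands in for a real collection name whose features
# file lives at train/mycollection_features.tsv):
#
#   python dataset_creation.py mycollection
#   python dataset_creation.py mycollection --chunk-size 1000 --dataset-suffix fold1 -r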