def training_and_classification_with_kfold_cross_validation(collection_name, k):
    '''
    Train and evaluate an autotagger using k-fold cross validation.

    For each of the k folds, a Gaia dataset is built from the fold's training
    features, feature selection (PCA) is applied, and the autotagger is trained
    and used to classify the fold's test set, producing both binary and
    affinity (ranked) outputs.
    '''
    _split_metadata_and_features(collection_name, k)
    for i in range(1,k+1):
        # Create a gaia dataset with the training set
        print "----------------------- DATASET CREATION (FOLD %d)-----------------------" % i
        training_features='train/%s_features__fold%d.tsv' % (collection_name, i)
        chunk_size=5000
        dataset_suffix="fold%d" % i
        replace_dataset=True
        dataset_creator = DatasetCreator(collection_name)
        dataset_creator.create(training_features, chunk_size, dataset_suffix, replace_dataset)
            
        # Feature selection over the gaia dataset
        print "----------------------- FEATURE SELECTION (FOLD %d)-----------------------" % i
        dataset='dbs/%s__fold%d.db' % (collection_name, i)
        pca_covered_variance=75
        include_highlevel=True
        feature_selector = FeatureSelector()
        feature_selector.select(dataset, pca_covered_variance, include_highlevel)
        
        # Autotag a given test set
        print "----------------------- AUTOTAGGING (FOLD %d)-----------------------" % i
        dataset='transformed_dbs/%s__fold%d.db' % (collection_name, i)
        training_metadata='train/%s_metadata__fold%d.tsv' % (collection_name, i)
        test_features='test/%s_features__fold%d.tsv' % (collection_name, i)
        output_binary='test/%s_output_binary__fold%d.tsv' % (collection_name, i)
        output_affinity='test/%s_output_affinity__fold%d.tsv' % (collection_name, i)
        metric='LC'
        num_sim=18
        threshold=0.2
        autotagger = Autotagger()
        autotagger.train(dataset, training_metadata)
        autotagger.classify(test_features, output_binary, metric, num_sim, threshold, ranked=False)
        autotagger.classify(test_features, output_affinity, metric, num_sim, threshold, ranked=True)
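
# A minimal usage sketch, under the assumption that the helper
# _split_metadata_and_features (defined elsewhere in this repository) writes the
# per-fold train/ and test/ metadata and feature files used above. The
# collection name "my_collection" and k=5 are illustrative only:
#
#     training_and_classification_with_kfold_cross_validation("my_collection", 5)
#
# For each fold i this would leave the binary and affinity autotagging results
# in test/my_collection_output_binary__fold<i>.tsv and
# test/my_collection_output_affinity__fold<i>.tsv.
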
# You should have received a copy of the GNU General Public License
# along with music-autotagging-msordo.  If not, see <http://www.gnu.org/licenses/>.

# Written by Mohamed Sordo (@neomoha)
# Email: mohamed ^dot^ sordo ^at^ gmail ^dot^ com
# Website: http://msordo.weebly.com

import os, sys, argparse

from DatasetCreator import DatasetCreator
# FeatureSelector and Autotagger are assumed to live in same-named modules in
# this repository; they are needed by the k-fold cross validation function above.
from FeatureSelector import FeatureSelector
from Autotagger import Autotagger

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Create a Gaia Dataset given a list of feature files')
    parser.add_argument('collection_name', help='Name of the collection')
    parser.add_argument('--training-features', default=None, help='A file containing paths to the features of the audio files used for training (default="train/COLLECTIONNAME_features.tsv")')
    parser.add_argument('--chunk-size', type=int, default=5000, help='The dataset will be created in chunks of N songs at a time (default=5000)')
    parser.add_argument('--dataset-suffix', default=None, help='Suffix to add to the dataset filename (useful when doing k-fold cross validation, for example) (default=None)')
    parser.add_argument('-r', '--replace-dataset', help='Replace old dataset (if it exists)', action="store_true")
    args = parser.parse_args()
    
    if args.training_features is None:
        args.training_features = "train/"+args.collection_name+"_features.tsv"
    
    if not os.path.exists(args.training_features):
        print "Taining features file '%s' not found" % args.training_features
        sys.exit(-1)
    
    print args
    dataset_creator = DatasetCreator(args.collection_name)
    dataset_creator.create(args.training_features, args.chunk_size, args.dataset_suffix, args.replace_dataset)
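
# Example invocation (illustrative; the script filename and collection name are
# assumptions, not taken from the repository):
#
#   python create_gaia_dataset.py my_collection --chunk-size 2000 --dataset-suffix fold1 -r
#
# This reads train/my_collection_features.tsv, builds the Gaia dataset in chunks
# of 2000 songs with the suffix "fold1", and replaces any existing dataset with
# the same name.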