def run_test(username_='tathagata', samplesize=.10, K=5):
    """
    Cross validate the KNN predictions for one user and score them.

    username_ - user in smartplayer.user_rating db table
    samplesize - what fraction of the videos is held out as the test
                 set in each round; example:
                     .10 - 10% will be test and 90% will be train
    K - value of k handed to run_knn.run_all

    The list of '*.wav' files under videosloc_ is consumed in chunks of
    floor(len(videos) * samplesize) names (at least one per round).
    Each chunk is passed to run_knn.run_all as the test set, so for a
    samplesize of .10 the algorithm runs roughly 10 times, each time
    with a different subset as the testing set.

    RETURN:
        Return values are the RMSE, the number of correct predictions
        and the number of incorrect predictions, respectively.  The
        RMSE is taken over the predictions that were actually scored
        and is 0.0 when there were none.
    """
    print('--------- Running Tests and cross validation ----------')
    print('Size of k = {0}'.format(K))
    print('Cross validating with partition size: {0}'.format(samplesize))
    actual_d = get_data.get_user_rat(username_)
    video_list = glob.glob(videosloc_ + '*.wav')
    # Take at least one video per round: a chunk size of 0 would never
    # shrink video_list and the loop below would spin forever.
    chunk_size = max(1, int(floor(len(video_list) * samplesize)))
    correct = 0
    incorrect = 0
    squared_error = 0
    while video_list:
        # Move up to chunk_size names (basename without '.wav') from the
        # end of video_list into this round's test set.
        test_l = []
        for _ in range(chunk_size):
            test_l.append(video_list.pop().split('/')[-1].replace('.wav', ''))
            if not video_list:
                break
        # run tests
        pred_d = run_knn.run_all(username_, test_l, K)
        for song, pred in pred_d.items():
            print('Actual {0}: Predicted: {1}'.format(actual_d[song], pred))
            actual = int(actual_d[song])
            predicted = int(pred)
            if actual == predicted:
                correct += 1
            else:
                incorrect += 1
            squared_error += (predicted - actual) ** 2

    scored = correct + incorrect
    if scored == 0:
        # Nothing was predicted, so there is no error to average.
        return 0.0, correct, incorrect
    # float() forces true division; int / int truncates on Python 2.
    RMSE = sqrt(float(squared_error) / scored)
    return RMSE, correct, incorrect
def create_mf(userid_, querysongs_, dir_):
    """
    userid_ - userid for user in smartplayer database
    querysongs_ - list of song(s) that need ratings
    dir_ - directory that contains all of the songs/videos
           in .wav format

    Create an mf file named '<userid_>.mf' that lists these songs with
    either a like or a dislike next to them, one tab-separated
    "path<TAB>rating" line per song.
    This is what an mf file should look like:

    /User/Rob/files/rap1.wav    1
    /User/Rob/files/rap2.wav    1
    /User/Rob/files/rap3.wav    1
    /User/Rob/files/rap4.wav    0

    Query songs are written first with rating 0; the user's other rated
    songs follow with their stored rating.  Songs whose .wav file does
    not exist in dir_ are left out of the file.

    Returns - (name of file written to, dict of song -> rating as
              returned by get_data.get_user_rat, with every written
              query song set to 0)
    """
    new_filename = userid_ + '.mf'
    if not dir_.endswith('/'):
        dir_ += '/'

    # Query the database for all songs that the user liked or disliked.
    # Done before opening the file so a failed query does not leave a
    # truncated mf file behind.
    D_videos = get_data.get_user_rat(userid_)
    query_set = set(querysongs_)  # O(1) membership test in the loop below

    # The with-block guarantees the file is flushed and closed before
    # its name is handed back to the caller.
    with open(new_filename, 'w') as mffile_:
        for song in querysongs_:
            songpath = dir_ + song + '.wav'
            if os.path.isfile(songpath):
                mffile_.write(songpath + '\t0\n')
                D_videos[song] = 0  # test song has rating 0
        for song, rating in D_videos.items():
            if song in query_set:  # don't include test songs
                continue
            songpath = dir_ + song + '.wav'
            if os.path.isfile(songpath):
                mffile_.write(songpath + '\t' + str(rating) + '\n')

    return new_filename, D_videos
def run_all_ks(username_='tathagata', samplesize_=0.1):
    """
    Run run_test for k = 1 up to (but not including) int(sqrt(n)),
    where n is the number of ratings returned for username_, print the
    RMSE / correct / incorrect counts for every k and finally the k
    with the fewest incorrect predictions (first one wins on ties).

    username_ - user handed to get_data.get_user_rat and run_test
    samplesize_ - partition size handed to run_test

    NOTE(review): a later definition of run_all_ks in this file
    replaces this one when the module is loaded.
    """
    k_d = {}
    corr_d = {}
    incorr_d = {}
    actual_d = get_data.get_user_rat(username_)
    upto = int(sqrt(len(actual_d)))
    bestk = None
    low_incorr_num = None
    for k in range(1, upto):
        k_d[k], corr_d[k], incorr_d[k] = run_test(username_, samplesize_, k)
        # Track the running minimum together with its k; updating only
        # bestk would let a later, worse k overwrite the real best one.
        if bestk is None or incorr_d[k] < low_incorr_num:
            bestk = k
            low_incorr_num = incorr_d[k]
        print('for k = {0} the RMSE was {1}'.format(str(k), str(k_d[k])))
        print('for k = {0} the correct was {1}'.format(str(k), str(corr_d[k])))
        print('for k = {0} the incorrect was {1}'.format(str(k), str(incorr_d[k])))
    if bestk is None:
        # int(sqrt(n)) <= 1, i.e. fewer than 4 ratings: no k was tried.
        print('No value of k was tested; at least 4 ratings are needed')
        return
    print('Best value of k to use is {0}'.format(str(bestk)))
def run_all_ks(username_="tathagata", samplesize_=0.1):
    """
    Run run_test for k = 1 up to (but not including) int(sqrt(n)),
    where n is the number of ratings returned for username_, print the
    RMSE / correct / incorrect counts for every k, then report the k
    with the fewest incorrect predictions (first one wins on ties)
    together with its incorrect count and its RMSE.

    username_ - user handed to get_data.get_user_rat and run_test
    samplesize_ - partition size handed to run_test
    """
    k_d = {}
    corr_d = {}
    incorr_d = {}
    actual_d = get_data.get_user_rat(username_)
    upto = int(sqrt(len(actual_d)))
    bestk = None
    low_incorr_num = None
    best_rmse = None
    for k in range(1, upto):
        k_d[k], corr_d[k], incorr_d[k] = run_test(username_, samplesize_, k)
        if bestk is None or incorr_d[k] < low_incorr_num:
            bestk = k
            low_incorr_num = incorr_d[k]
            best_rmse = k_d[k]
        print("for k = {0} the RMSE was {1}".format(str(k), str(k_d[k])))
        print("for k = {0} the correct was {1}".format(str(k), str(corr_d[k])))
        print("for k = {0} the incorrect was {1}".format(str(k), str(incorr_d[k])))
    if bestk is None:
        # int(sqrt(n)) <= 1, i.e. fewer than 4 ratings: no k was tried,
        # so there is nothing to report.
        print("No value of k was tested; at least 4 ratings are needed")
        return
    print("Best value of k to use is {0}".format(str(bestk)))
    print("Number of incorrect for this k was {0}".format(str(low_incorr_num)))
    # NOTE(review): this is the RMSE of the k chosen above (fewest
    # incorrect), which is not necessarily the lowest RMSE over all k.
    print("Lowest RMSE was {0}".format(str(best_rmse)))