예제 #1
0
def main(datafile = '../data_by_cookie_slim.json', outputFolder = '.', iterations = 10, epochmult = 4):
    """Train an MLP on the first 10 values of each record and report d' statistics.

    Each record with at least 10 entries becomes one sample: the input is
    the normalized first 10 values, the target is 1 when the record has 20 or
    more entries and 0 otherwise. The value returned by ``network.learnLoop``
    is pickled to ``<outputFolder>/runMLP.p``, read back, and summarised on
    stdout.

    Args:
        datafile: Path handed to ``funcs.loadData``.
        outputFolder: Directory in which ``runMLP.p`` is written.
        iterations: Forwarded to ``network.learnLoop`` as ``iterations``.
        epochmult: Multiplier applied to the number of samples to obtain the
            ``epochs`` argument of ``network.learnLoop``.

    Returns:
        None. Results are written to disk and printed.
    """
    filename = 'runMLP'
    outputFile = '{}/{}.p'.format(outputFolder, filename)
    data = funcs.loadData(datafile)

    # Filter away bottom 75%
    data = funcs.filterByPercRank(data, 75)

    # Single-argument parenthesised print produces the same output under
    # Python 2's print statement and Python 3's print function.
    print('iterations: {}\nMultiplier Samplesize Epochs: {}\n output file: {}'.format(iterations, epochmult, outputFile))

    # Get first 10 values and try to decide whether people will keep on
    # playing past 20 games. Records shorter than 10 entries are skipped.
    samples = np.fromiter(((funcs.normalize(np.array(k[:10])), 0 if len(k) < 20 else 1) for k in data if len(k) >= 10),
                    dtype=[('input',  float, 10), ('output', float, 1)])
    print('Learning from {} samples...'.format(samples.size))
    network = MLP(10, 10, 10, 1)

    def processResults(network, results):
        """Turn raw test results into (hit rate, false-alarm rate, d', weights).

        Each entry of ``results`` is indexed as ``t[1]`` (target) and ``t[2]``
        (network output); the output is thresholded at 0.5 before scoring.
        """
        stepf = lambda x: 0 if x < .5 else 1
        test_data = [(t[0], t[1], stepf(t[2])) for t in results]
        # Fraction of positive targets predicted as 1 (hits).
        percHits = np.mean([1 if t[2] == 1 else 0 for t in test_data if t[1] == 1])
        # Fraction of negative targets predicted as 1 (false alarms).
        falseAlarm = np.mean([1 if t[2] == 1 else 0 for t in test_data if t[1] == 0])

        dPrime = funcs.dprime(percHits, falseAlarm)
        return (percHits, falseAlarm, dPrime, network.weights)

    # 40 million epochs for full dataset.. Too many?
    out = network.learnLoop(samples, iterations = iterations, epochs = epochmult * samples.size, processResults = processResults)

    # Close the handle before reading the file back so the pickled data is
    # guaranteed to be flushed to disk.
    with open(outputFile, 'wb') as resultFile:
        pickle.dump(out, resultFile)

    with open(outputFile, 'rb') as resultFile:
        dprimes = pickle.load(resultFile)

    # Set nan/inf to 0.
    # NOTE(review): this iterates over k[2], so each stored entry's third
    # field is presumably a sequence of d' values -- confirm against learnLoop.
    dprimes = [[0 if np.isnan(i) or np.isinf(i) else i for i in k[2]] for k in dprimes]

    print('')
    print('Results:')
    print('Mean d\' score for each quit opportunity: {}'.format([np.mean([k[i] for k in dprimes]) for i in range(1)]))
    print('Std : {}'.format([np.std([k[i] for k in dprimes]) for i in range(1)]))
    print('Max : {}'.format([np.max([k[i] for k in dprimes]) for i in range(1)]))
    print('')
    print('')
예제 #2
0
def runObs(data, outfolder ='.', rankFilter = 0, preprocess = False, processX = False, processY = False):
    windowSizes1 = range(5,30,5)    # Sizes of attempt group 1
    windowSizes2 = range(5,30,5)    # Sizes of attempt group 2
    if processX == False:
        processX = lambda x,x_plays: np.var(x,axis=0)
    if processY == False:
        processY = lambda x,x_plays: np.mean(x,axis=0)        
    #Run preprocessing if passed along
    if(preprocess != False):
        data = preprocess(data)
    
    for i1 in xrange(len(windowSizes1)):
        for i2 in xrange(len(windowSizes2)):
            window1 = windowSizes1[i1]       
            window2 = windowSizes2[i2]            
            first_plays = range(window1)
            second_plays = range(window1,window1+window2)
            total_attempts = window1 + window2            
                        
                        
        # --------------------------------------------
            # look at subsample of people who played more than x times   
            #print "organising data"
            big = [k for k in data if len(k) >= total_attempts]
            # --------------------------------------------
            #calc dict of maximum score for each player(=each key)
            #maxscore=[max(a) for a in big]
            
            #calc percentile ranking for each player (=each key)
            #prcentiles= np.percentile(maxscore,range(100))             
                        
                        
                        
            
            #construct vaiables dicts
            
            #print "calculating summary stats"
            #for each player make two lists, of plays 1-5 (first) and 6-10 (second)
            #and calculate summary stats av1,var1 and av2, var2
            
            if(rankFilter != 0):
                big = funcs.filterByPercRank(big,rankFilter)
                
            first = [k[0:window1] for k in big]
            second = [k[window1:window2+window1] for k in big]
            
            
            #av1=np.mean(first,axis=1)
            x = processX(zip(*first),first_plays)
            #x = processX(zip(*first), first_plays)   
            y = processY(zip(*second),second_plays) #processY(second, second_plays) 
            #var2 = np.var(second,axis=1)
            
            
                                     
                    
        
            #find percentile values                    
            #prcentiles_x=np.percentile(x,range(100))                    
            #prcentiles_y=np.percentile(y,range(100))
        
        
            #make dict of prcentile values for each statistic for each player
            #xlist=[bisect.bisect(prcentiles_x,k) for k in x]
            #ylist=[bisect.bisect(prcentiles_y,k) for k in y]
                   
            #print "saving data"
            pickle.dump(x, open(outfolder + '/save_a5_xlist' + str(window1) + "," + str(window2) +'.p', 'wb'))
            pickle.dump(y, open(outfolder + '/save_a5_ylist' + str(window1) + "," + str(window2) +'.p', 'wb'))
            
            #print "mean x: ",np.mean(x)
            #print "mean y: ",np.mean(y)