def run_ml_analysis(log_file="log.txt", splitfrac=0.1, nfolds=10,
                    feat_choice="ads", nfeat=5, verbose=False):
    """Build feature vectors from an experiment log and run the ML analysis.

    The log is parsed with ``converter.get_ads_from_log``, converted into
    feature vectors with ``converter.get_feature_vectors`` and handed to
    ``ml.run_ml_analysis``.  The function always returns ``None``; it bails
    out early (after printing a message) on an illegal ``feat_choice`` or
    when the log holds fewer blocks than ``nfolds``.

    Args:
        log_file: Path given to ``converter.get_ads_from_log``.
        splitfrac: Forwarded unchanged to ``ml.run_ml_analysis``.
        nfolds: Forwarded to ``ml.run_ml_analysis``; also the minimum number
            of blocks the log must contain.
        feat_choice: ``"ads"`` or ``"words"``.  The same value is used both
            to build the feature vectors and by ``ml.run_ml_analysis`` so the
            two stages cannot disagree.
        nfeat: Forwarded unchanged to ``ml.run_ml_analysis``.
        verbose: When true, print the feature-construction time and call
            ``stat.print_counts``; also forwarded to ``ml.run_ml_analysis``.
    """
    # Single-argument print(...) emits the same text under the Python 2
    # print statement and the Python 3 print function.
    if feat_choice not in ("ads", "words"):
        print("Illegal feat_choice %s" % (feat_choice,))
        return

    collection, names = converter.get_ads_from_log(log_file)
    n_blocks = len(collection)
    if n_blocks < nfolds:
        print("Too few blocks (%s). Analysis requires at least as many blocks as nfolds (%s)." % (n_blocks, nfolds))
        return

    # NOTE: an interest-vector histogram (converter.get_interest_vectors +
    # plot.treatment_feature_histogram) is currently disabled at this point.

    started = datetime.now()
    # Honour the validated feat_choice instead of always building 'ads'
    # features, which silently mismatched the value passed to the ML stage.
    X, y, feat = converter.get_feature_vectors(collection, feat_choice=feat_choice)
    elapsed = datetime.now() - started

    if verbose:
        print("Time for constructing feature vectors:  %s" % (elapsed,))
        stat.print_counts(X, y)

    ml.run_ml_analysis(X, y, feat, names, feat_choice, nfeat,
                       splitfrac=splitfrac, nfolds=nfolds, verbose=verbose)
def run_ml_analysis(log_file="log.txt", splitfrac=0.1, nfolds=10,
                    feat_choice="ads", nfeat=5, verbose=False):
    """Build feature vectors from an experiment log and run the ML analysis.

    The log is parsed with ``converter.read_log``, converted into feature
    vectors with ``converter.get_feature_vectors`` and handed to
    ``ml.run_ml_analysis``.  The function always returns ``None``; it bails
    out early (after printing a message) on an illegal ``feat_choice`` or
    when the log holds fewer blocks than ``nfolds``.

    Args:
        log_file: Path given to ``converter.read_log``.
        splitfrac: Forwarded unchanged to ``ml.run_ml_analysis``.
        nfolds: Forwarded to ``ml.run_ml_analysis``; also the minimum number
            of blocks the log must contain.
        feat_choice: ``"ads"`` or ``"words"``.  The same value is used both
            to build the feature vectors and by ``ml.run_ml_analysis`` so the
            two stages cannot disagree.
        nfeat: Forwarded unchanged to ``ml.run_ml_analysis``.
        verbose: When true, print the feature-construction time and call
            ``stat.print_counts``; also forwarded to ``ml.run_ml_analysis``.
    """
    # Single-argument print(...) emits the same text under the Python 2
    # print statement and the Python 3 print function.
    if feat_choice not in ("ads", "words"):
        print("Illegal feat_choice %s" % (feat_choice,))
        return

    collection, names = converter.read_log(log_file)
    n_blocks = len(collection)
    if n_blocks < nfolds:
        print("Too few blocks (%s). Analysis requires at least as many blocks as nfolds (%s)." % (n_blocks, nfolds))
        return

    # NOTE: an interest-vector histogram (converter.get_interest_vectors +
    # plot.treatment_feature_histogram) is currently disabled at this point.

    started = datetime.now()
    # Honour the validated feat_choice instead of always building 'ads'
    # features, which silently mismatched the value passed to the ML stage.
    X, y, feat = converter.get_feature_vectors(collection, feat_choice=feat_choice)
    # The shapes are reported unconditionally, before the timer is stopped.
    print(X.shape)
    print(y.shape)
    elapsed = datetime.now() - started

    if verbose:
        print("Time for constructing feature vectors:  %s" % (elapsed,))
        stat.print_counts(X, y)

    ml.run_ml_analysis(X, y, feat, names, feat_choice, nfeat,
                       splitfrac=splitfrac, nfolds=nfolds, verbose=verbose)
def run_ml_analysis(log_file="log.txt", splitfrac=0.1, nfolds=10,
                    feat_choice="ads", nfeat=5, verbose=False):
    """Run the ML analysis on the ads recorded in ``log_file``.

    Steps: read the log through ``converter.get_ads_from_log``, build the
    feature vectors through ``converter.get_feature_vectors``, then delegate
    to ``ml.run_ml_analysis``.  Returns ``None`` in every case; an illegal
    ``feat_choice`` or a log with fewer blocks than ``nfolds`` only prints a
    message and stops.

    Args:
        log_file: Path given to ``converter.get_ads_from_log``.
        splitfrac: Passed through to ``ml.run_ml_analysis``.
        nfolds: Passed through to ``ml.run_ml_analysis``; the log must hold
            at least this many blocks.
        feat_choice: ``"ads"`` or ``"words"``; applied consistently to the
            feature construction and to ``ml.run_ml_analysis``.
        nfeat: Passed through to ``ml.run_ml_analysis``.
        verbose: Enables the timing line and ``stat.print_counts``; also
            passed through to ``ml.run_ml_analysis``.
    """
    # print(...) with one argument behaves identically as a Python 2
    # statement and as a Python 3 function call.
    if feat_choice not in ("ads", "words"):
        print("Illegal feat_choice %s" % (feat_choice,))
        return

    collection, names = converter.get_ads_from_log(log_file)
    block_count = len(collection)
    if block_count < nfolds:
        print("Too few blocks (%s). Analysis requires at least as many blocks as nfolds (%s)." % (block_count, nfolds))
        return

    # NOTE: an interest-vector histogram (converter.get_interest_vectors +
    # plot.treatment_feature_histogram) is currently disabled at this point.

    t_start = datetime.now()
    # Use the caller's (validated) feat_choice; a hard-coded 'ads' here would
    # contradict the feat_choice given to the ML stage below.
    X, y, feat = converter.get_feature_vectors(collection, feat_choice=feat_choice)
    t_end = datetime.now()

    if verbose:
        print("Time for constructing feature vectors:  %s" % (t_end - t_start,))
        stat.print_counts(X, y)

    ml.run_ml_analysis(X, y, feat, names, feat_choice, nfeat,
                       splitfrac=splitfrac, nfolds=nfolds, verbose=verbose)
def run_ml_analysis(log_file="log.txt", splitfrac=0.1, nfolds=10,
                    feat_choice="ads", nfeat=5, verbose=False):
    """Run the ML analysis on the data recorded in ``log_file``.

    Steps: read the log through ``converter.read_log``, build the feature
    vectors through ``converter.get_feature_vectors``, then delegate to
    ``ml.run_ml_analysis``.  Returns ``None`` in every case; an illegal
    ``feat_choice`` or a log with fewer blocks than ``nfolds`` only prints a
    message and stops.

    Args:
        log_file: Path given to ``converter.read_log``.
        splitfrac: Passed through to ``ml.run_ml_analysis``.
        nfolds: Passed through to ``ml.run_ml_analysis``; the log must hold
            at least this many blocks.
        feat_choice: ``"ads"`` or ``"words"``; applied consistently to the
            feature construction and to ``ml.run_ml_analysis``.
        nfeat: Passed through to ``ml.run_ml_analysis``.
        verbose: Enables the timing line and ``stat.print_counts``; also
            passed through to ``ml.run_ml_analysis``.
    """
    # print(...) with one argument behaves identically as a Python 2
    # statement and as a Python 3 function call.
    if feat_choice not in ("ads", "words"):
        print("Illegal feat_choice %s" % (feat_choice,))
        return

    collection, names = converter.read_log(log_file)
    block_count = len(collection)
    if block_count < nfolds:
        print("Too few blocks (%s). Analysis requires at least as many blocks as nfolds (%s)." % (block_count, nfolds))
        return

    # NOTE: an interest-vector histogram (converter.get_interest_vectors +
    # plot.treatment_feature_histogram) is currently disabled at this point.

    t_start = datetime.now()
    # Use the caller's (validated) feat_choice; a hard-coded 'ads' here would
    # contradict the feat_choice given to the ML stage below.
    X, y, feat = converter.get_feature_vectors(collection, feat_choice=feat_choice)
    # Shapes are always reported, and the timer stops only afterwards.
    print(X.shape)
    print(y.shape)
    t_end = datetime.now()

    if verbose:
        print("Time for constructing feature vectors:  %s" % (t_end - t_start,))
        stat.print_counts(X, y)

    ml.run_ml_analysis(X, y, feat, names, feat_choice, nfeat,
                       splitfrac=splitfrac, nfolds=nfolds, verbose=verbose)
def compute_influence(log_file="log.txt"):
    """Print per-feature gender/age influence scores, then classify two groups.

    The log is read with ``converter.read_log`` and turned into 'ads' feature
    vectors.  ``X`` is indexed as (block, unit, feature) and ``y`` as
    (block, unit) treatment labels.  Feature values are summed per treatment
    label into ``out`` (one row per entry of ``names``).

    The influence formulas use exactly rows 0..5 of ``out``: rows 0-2 are
    summed as ``male`` and rows 3-5 as ``female``, so the log is expected to
    contain six treatments (presumably 2 genders x 3 age groups -- confirm
    against the experiment that produced the log).

    Finally the units whose label is a multiple of 4 (labels 0 and 4 for six
    treatments) are extracted, relabelled 0/1, named ``'m18'``/``'f35'`` and
    passed to ``ml.run_ml_analysis``.

    Args:
        log_file: Path given to ``converter.read_log``.

    Returns:
        None.  All results are printed.
    """
    # TODO: eventually move it to analysis
    # Single-argument print(...) emits the same text under the Python 2
    # print statement and the Python 3 print function.
    collection, names = converter.read_log(log_file)
    print(names)
    X, y, feat = converter.get_feature_vectors(collection, feat_choice='ads')
    print("%s %s" % (X.shape, y.shape))

    n_blocks = X.shape[0]
    n_feats = X.shape[2]

    # out[label] accumulates the feature vectors of every unit that received
    # that treatment.  Summing the selected rows keeps the accumulation valid
    # when a block has zero or several units for a label (plain addition of
    # the selection only works for exactly one unit).
    out = np.zeros((len(names), n_feats))
    print(out.shape)
    for block_feats, block_labels in zip(X, y):
        for label in range(len(names)):
            members = np.where(block_labels == label)
            out[label] += block_feats[members].sum(axis=0)

    total = out[0] + out[1] + out[2] + out[3] + out[4] + out[5]
    print(total)
    # Interactive pause so the totals can be inspected (Python 2 raw_input).
    raw_input("wait")

    # NOTE: features whose total is 0 yield nan/inf in the ratios below.
    print("Computing gender influence")
    diff = (abs(out[0] - out[3]) + abs(out[1] - out[4]) + abs(out[2] - out[5])) / total
    print(diff)

    print("Computing age influence")
    diff2 = (abs(out[0] - out[1]) + abs(out[1] - out[2]) + abs(out[2] - out[0])
             + abs(out[3] - out[4]) + abs(out[4] - out[5]) + abs(out[5] - out[3])) / total
    print(diff2)

    male = out[0] + out[1] + out[2]
    female = out[3] + out[4] + out[5]
    print("-------")
    print(male)
    print(female)
    print("-------")
    print("total ads: %s" % (out.sum(),))

    sortdiff = np.sort(diff)[::-1]
    print(sortdiff)

    # Walk the distinct influence values from highest to lowest.  Iterating
    # over distinct values (rather than every sorted entry) lists each
    # feature once even when several features share the same score.
    count = 0
    for value in np.unique(diff)[::-1]:
        print("out:----- %s" % (value,))
        for j in np.where(diff == value)[0]:
            count += 1
            print("index: %s infl: %s --- m: %s f: %s" % (j, value, male[j], female[j]))
            print("%s %s %s %s %s %s" % tuple(out[g][j] for g in range(6)))
            feat.choose_by_index(j).display()
        # Stop once more than 20 features have been listed.
        if count > 20:
            break

    # Two-group sub-problem: keep the units whose label is a multiple of 4.
    X2 = np.zeros((n_blocks, 2, n_feats))
    y2 = np.zeros((y.shape[0], 2), dtype=int)
    print("%s %s" % (X.shape, X2.shape))
    names2 = ['m18', 'f35']
    for i in range(n_blocks):
        k = np.where(y[i] % 4 == 0)
        X2[i] = X[i][k]
        # Floor division keeps the labels integral regardless of the
        # interpreter's division semantics (0 -> 0, 4 -> 1).
        y2[i] = y[i][k] // 4

    ml.run_ml_analysis(X2, y2, feat, names2, feat_choice="ads", nfeat=5,
                       splitfrac=0.1, nfolds=10, verbose=False)
def compute_influence(log_file="log.txt"):
    """Report how strongly each feature differs across gender and age groups.

    Reads the log via ``converter.read_log`` and builds 'ads' feature vectors
    ``X`` (indexed block, unit, feature) with labels ``y`` (indexed block,
    unit).  Feature values are summed per treatment label into ``out``, one
    row per entry of ``names``.

    The influence formulas read exactly rows 0..5 of ``out``; rows 0-2 are
    added up as ``male`` and rows 3-5 as ``female``, so six treatments are
    expected (presumably 2 genders x 3 age groups -- confirm against the
    experiment that produced the log).

    Afterwards the units whose label is a multiple of 4 (0 and 4 for six
    treatments) are extracted, relabelled 0/1, named ``'m18'``/``'f35'`` and
    given to ``ml.run_ml_analysis``.

    Args:
        log_file: Path given to ``converter.read_log``.

    Returns:
        None.  All results are printed.
    """
    # TODO: eventually move it to analysis
    # print(...) with one argument behaves identically as a Python 2
    # statement and as a Python 3 function call.
    collection, names = converter.read_log(log_file)
    print(names)
    X, y, feat = converter.get_feature_vectors(collection, feat_choice='ads')
    print("%s %s" % (X.shape, y.shape))

    block_total = X.shape[0]
    feat_total = X.shape[2]

    # Per-treatment feature sums.  Using .sum(axis=0) on the selected rows
    # tolerates blocks with no unit, or more than one unit, for a label;
    # adding the raw selection only works for exactly one unit.
    out = np.zeros((len(names), feat_total))
    print(out.shape)
    for feats_in_block, labels_in_block in zip(X, y):
        for label in range(len(names)):
            picked = np.where(labels_in_block == label)
            out[label] += feats_in_block[picked].sum(axis=0)

    total = out[0] + out[1] + out[2] + out[3] + out[4] + out[5]
    print(total)
    # Interactive pause so the totals can be inspected (Python 2 raw_input).
    raw_input("wait")

    # NOTE: features whose total is 0 yield nan/inf in the ratios below.
    print("Computing gender influence")
    diff = (abs(out[0] - out[3]) + abs(out[1] - out[4]) + abs(out[2] - out[5])) / total
    print(diff)

    print("Computing age influence")
    diff2 = (abs(out[0] - out[1]) + abs(out[1] - out[2]) + abs(out[2] - out[0])
             + abs(out[3] - out[4]) + abs(out[4] - out[5]) + abs(out[5] - out[3])) / total
    print(diff2)

    male = out[0] + out[1] + out[2]
    female = out[3] + out[4] + out[5]
    print("-------")
    print(male)
    print(female)
    print("-------")
    print("total ads: %s" % (out.sum(),))

    sortdiff = np.sort(diff)[::-1]
    print(sortdiff)

    # Visit each distinct score once, highest first, so that features tied on
    # the same score are not printed again for every duplicate of the score.
    count = 0
    for score in np.unique(diff)[::-1]:
        print("out:----- %s" % (score,))
        for j in np.where(diff == score)[0]:
            count += 1
            print("index: %s infl: %s --- m: %s f: %s" % (j, score, male[j], female[j]))
            print("%s %s %s %s %s %s" % tuple(out[g][j] for g in range(6)))
            feat.choose_by_index(j).display()
        # Stop once more than 20 features have been listed.
        if count > 20:
            break

    # Two-group sub-problem: keep the units whose label is a multiple of 4.
    X2 = np.zeros((block_total, 2, feat_total))
    y2 = np.zeros((y.shape[0], 2), dtype=int)
    print("%s %s" % (X.shape, X2.shape))
    names2 = ['m18', 'f35']
    for i in range(block_total):
        k = np.where(y[i] % 4 == 0)
        X2[i] = X[i][k]
        # Floor division keeps the labels integral regardless of the
        # interpreter's division semantics (0 -> 0, 4 -> 1).
        y2[i] = y[i][k] // 4

    ml.run_ml_analysis(X2, y2, feat, names2, feat_choice="ads", nfeat=5,
                       splitfrac=0.1, nfolds=10, verbose=False)