# Shared imports for the experiments below. The local modules
# (competition_utilities as cu, features, and OnlineLDA) are assumed to sit
# alongside this file, as in the competition starter-code layout.
import numpy as np
import nltk
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

import competition_utilities as cu
import features
from onlineldavb import OnlineLDA


def learning_curve():
    n = 50000
    nsteps = 10
    full = cu.get_sample_data_frame(n)
    # 60/20/20 split into train, cross-validation, and test frames
    data = full.ix[0:int(n * .6) - 1].reset_index()
    cval = full.ix[int(n * .6):int(n * .8) - 1].reset_index()
    test = full.ix[int(n * .8):n - 1].reset_index()
    step = len(data) / nsteps
    ndata = len(data)
    mvec = range(step, ndata + step, step)
    test_features = features.extract_features(test)
    data_error = []
    cval_error = []
    for m in mvec:
        print 'running for size', m
        train = data.ix[0:m - 1].reset_index()
        fea = features.extract_features(train)
        rf = RandomForestClassifier(n_estimators=50, verbose=0,
                                    compute_importances=False, n_jobs=5)
        rf.fit(fea, train['OpenStatus'])
        new_priors = cu.load_priors('train.csv')
        old_priors = cu.compute_priors(train.OpenStatus)
        # training error
        probs = rf.predict_proba(fea)
        #probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
        y_true = compute_y_true(train)
        score = multiclass_log_loss(y_true, probs)
        data_error.append(score)
        # held-out error -- note this scores against the test split, not
        # cval, even though the result is stored in cval_error
        probs = rf.predict_proba(test_features)
        #probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
        y_true = compute_y_true(test)
        score = multiclass_log_loss(y_true, probs)
        cval_error.append(score)
    return mvec, data_error, cval_error
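# The scoring helpers below are used by every measure_* function but are not
# defined in this file. If they are not provided elsewhere in the module,
# minimal versions consistent with how they are called might look like this
# (a sketch, assuming classes are scored in sorted(cu.labels) order and using
# the standard capped multiclass log-loss formula):

def compute_y_true(frame):
    """Map each row's OpenStatus to its index in the sorted label list."""
    labels = sorted(cu.labels)
    return np.array([labels.index(s) for s in frame.OpenStatus])


def multiclass_log_loss(y_true, probs, eps=1e-15):
    """Mean negative log-probability assigned to each row's true class."""
    probs = np.clip(np.asarray(probs, dtype=float), eps, 1 - eps)
    # renormalize rows so clipping keeps each row a valid distribution
    probs /= probs.sum(axis=1)[:, np.newaxis]
    n = len(y_true)
    return -np.mean(np.log(probs[np.arange(n), y_true]))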
def measure_bayes(datasize=1000, testsize=500):
    data = cu.get_sample_data_frame(datasize)
    test = cu.get_test_data_frame(testsize)
    nbfd = features.naive_features(data)
    nbft = features.naive_features(test)
    nb = nltk.NaiveBayesClassifier.train(nbfd)
    probs = []
    for i in range(len(nbft)):
        p = nb.prob_classify(nbft[i][0])
        # iterate labels in sorted order so the probability columns line up
        # with compute_y_true; p.samples() makes no ordering guarantee
        probs.append([p.prob(s) for s in sorted(cu.labels)])
    probs = np.array(probs)
    new_priors = cu.load_priors('train.csv')
    old_priors = cu.compute_priors(data.OpenStatus)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    y_true = compute_y_true(test)
    score = multiclass_log_loss(y_true, probs)
    return score, nb, y_true, probs
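# Example usage: train on a 5000-question sample and inspect which naive
# features the classifier leans on (show_most_informative_features is part
# of nltk's NaiveBayesClassifier API):
#
#   score, nb, y_true, probs = measure_bayes(datasize=5000, testsize=1000)
#   print 'naive bayes multiclass log loss:', score
#   nb.show_most_informative_features(20)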
def measure_model(datasize=1000, testsize=500):
    data = cu.get_sample_data_frame(datasize)
    test = cu.get_test_data_frame(testsize)
    #data = full.ix[len(full)/4:].reset_index()      # last n/4 * 3 records
    #test = full.ix[:(len(full)/4)-1].reset_index()  # first n/4 records
    #data = cu.get_dataframe('train-sample.csv')
    #test = cu.get_dataframe('public_leaderboard.csv')
    fea = features.extract_features(data)
    test_features = features.extract_features(test)
    rf = RandomForestClassifier(n_estimators=50, verbose=2,
                                compute_importances=True, n_jobs=5)
    rf.fit(fea, data['OpenStatus'])
    probs = rf.predict_proba(test_features)
    new_priors = cu.load_priors('train.csv')
    old_priors = cu.compute_priors(data.OpenStatus)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    y_true = compute_y_true(test)
    score = multiclass_log_loss(y_true, probs)
    return score, rf, fea
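# Since the forest is fit with compute_importances=True, the returned model
# exposes feature_importances_; a quick way to see which extracted feature
# columns matter most (assumes fea keeps its pandas column names):
#
#   score, rf, fea = measure_model(datasize=10000, testsize=2000)
#   for name, imp in sorted(zip(fea.columns, rf.feature_importances_),
#                           key=lambda t: -t[1]):
#       print '%-30s %.4f' % (name, imp)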
def measure_svm(datasize=1000, testsize=500):
    data = cu.get_sample_data_frame(datasize)
    test = cu.get_test_data_frame(testsize)
    # bag-of-words features over the first 1000 words of vocab4.txt
    vocab = [w.strip() for w in file('vocab4.txt')][0:1000]
    vidx = get_vocab_index_lookup(vocab)
    print 'extracting data features'
    xdata = extract_svm_features(vidx, data)
    print 'extracting test features'
    xtest = extract_svm_features(vidx, test)
    labels = sorted(cu.labels)
    ydata = data.OpenStatus.apply(labels.index).tolist()
    model = svm.sparse.SVC(probability=True)
    print 'fitting model'
    model.fit(xdata, ydata)
    print 'predicting test set'
    probs = model.predict_proba(xtest)
    new_priors = cu.load_priors('train.csv')
    old_priors = cu.compute_priors(data.OpenStatus)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    y_true = compute_y_true(test)
    score = multiclass_log_loss(y_true, probs)
    return score, model
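# get_vocab_index_lookup and extract_svm_features are called above but not
# defined in this file. If they are not provided elsewhere, minimal versions
# consistent with the calls might look like the sketch below (assuming the
# SVM features are sparse bag-of-words counts over the question title and
# body; the 'Title' and 'BodyMarkdown' column names are assumptions):
import scipy.sparse


def get_vocab_index_lookup(vocab):
    """Map each vocabulary word to its column index."""
    return dict((w, i) for i, w in enumerate(vocab))


def extract_svm_features(vidx, frame):
    """Build a sparse document/term count matrix for the data frame."""
    mat = scipy.sparse.lil_matrix((len(frame), len(vidx)))
    for row in range(len(frame)):
        text = '%s %s' % (frame['Title'][row], frame['BodyMarkdown'][row])
        for word in text.lower().split():
            if word in vidx:
                mat[row, vidx[word]] += 1
    return mat.tocsr()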
def measure_lda(datasize=1000, testsize=500):
    # The number of documents to analyze each iteration
    batchsize = 100
    # The total number of questions on Stack Overflow
    D = 3.3e6
    # The number of topics
    K = 100
    # How many documents to look at
    documentstoanalyze = datasize / batchsize
    # Our vocabulary
    vocab = [w.strip() for w in file('./vocab2.txt')]
    W = len(vocab)
    # the data
    data = cu.get_sample_data_frame(datasize)
    test = cu.get_test_data_frame(testsize)
    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    lda = OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)
    # NOTE: the original returned an undefined name 'datagamma'; here it is
    # assumed to be the per-document topic weights from make_topic_columns
    datagamma = make_topic_columns(lda, data, K, D, batchsize)
    make_topic_columns(lda, test, K, D, batchsize)
    #data = full.ix[len(full)/4:].reset_index()      # last n/4 * 3 records
    #test = full.ix[:(len(full)/4)-1].reset_index()  # first n/4 records
    #data = cu.get_dataframe('train-sample.csv')
    #test = cu.get_dataframe('public_leaderboard.csv')
    fea = features.extract_features(data)
    test_features = features.extract_features(test)
    rf = RandomForestClassifier(n_estimators=50, verbose=2,
                                compute_importances=True, n_jobs=5)
    rf.fit(fea, data['OpenStatus'])
    probs = rf.predict_proba(test_features)
    new_priors = cu.load_priors('train.csv')
    old_priors = cu.compute_priors(data.OpenStatus)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    y_true = compute_y_true(test)
    score = multiclass_log_loss(y_true, probs)
    return score, rf, fea, data, datagamma
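# A small driver tying the experiments together: run the learning-curve
# experiment and plot training vs. held-out log loss (matplotlib assumed
# available; everything else is defined above):
if __name__ == '__main__':
    import matplotlib.pyplot as plt

    mvec, data_error, cval_error = learning_curve()
    plt.plot(mvec, data_error, label='train')
    plt.plot(mvec, cval_error, label='held out')
    plt.xlabel('training set size')
    plt.ylabel('multiclass log loss')
    plt.legend()
    plt.show()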