Example #1
def measure_prior(datasize=1000, testsize=500):
    data = cu.get_sample_data_frame(datasize)
    test = cu.get_test_data_frame(testsize)
    priors = cu.load_priors('train.csv')
    num_samples = len(test)
    # repeat the training-set prior distribution once per test question
    probs = np.kron(np.ones((num_samples, 1)), priors)
    y_true = compute_y_true(test)
    score = multiclass_log_loss(y_true, probs)
    return score
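The helpers multiclass_log_loss and compute_y_true are not defined in these snippets; they live in the surrounding module. A minimal sketch of the metric, assuming y_true holds integer class indices and probs is an (n_samples, n_classes) float array (this is the usual multiclass log loss, not necessarily the exact implementation used here):

import numpy as np

def multiclass_log_loss(y_true, y_pred, eps=1e-15):
    # Clip predictions away from 0 and 1, renormalize each row to sum to 1,
    # then average the negative log-probability assigned to the true class.
    predictions = np.clip(y_pred, eps, 1 - eps)
    predictions /= predictions.sum(axis=1)[:, np.newaxis]
    rows = np.arange(len(y_true))
    return -np.mean(np.log(predictions[rows, np.asarray(y_true)]))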
Example #2
def measure_prior(datasize=1000, testsize=500):
    data = cu.get_sample_data_frame(datasize)
    test = cu.get_test_data_frame(testsize)
    priors = cu.load_priors("train.csv")
    num_samples = len(test)
    probs = np.kron(np.ones((num_samples, 1)), priors)
    y_true = compute_y_true(test)
    score = multiclass_log_loss(y_true, probs)
    return score
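compute_y_true is likewise external. Judging from how the SVM example later in this collection builds ydata from sorted(cu.labels) and labels.index, it presumably maps each test question's OpenStatus string to its index in the sorted label list; a hypothetical sketch under that assumption:

import numpy as np

def compute_y_true(test):
    # Hypothetical: encode OpenStatus as an integer index into the sorted
    # label list, mirroring how measure_svm builds ydata.
    labels = sorted(cu.labels)
    return np.array(test.OpenStatus.apply(labels.index))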
Example #3
def measure_bayes(datasize=1000, testsize=500):
    data = cu.get_sample_data_frame(datasize)
    test = cu.get_test_data_frame(testsize)
    nbfd = features.naive_features(data)
    nbft = features.naive_features(test)
    nb = nltk.NaiveBayesClassifier.train(nbfd)
    probs = []
    for i in range(len(nbft)):
        # per-class probability distribution for the i-th test question
        p = nb.prob_classify(nbft[i][0])
        probs.append([p.prob(s) for s in p.samples()])
    probs = np.array(probs)
    new_priors = cu.load_priors("train.csv")
    old_priors = cu.compute_priors(data.OpenStatus)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    y_true = compute_y_true(test)
    score = multiclass_log_loss(y_true, probs)
    return score, nb, y_true, probs
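cu.cap_and_update_priors is defined elsewhere in the cu module. The call pattern (priors of the training sample, priors from train.csv, a small epsilon) suggests a prior-correction step; a rough, purely illustrative sketch of that idea, not the module's actual code:

import numpy as np

def cap_and_update_priors(old_priors, probs, new_priors, epsilon):
    # Illustrative sketch: floor the predictions at epsilon, reweight each
    # class column by new_priors / old_priors, and renormalize the rows.
    probs = np.clip(probs, epsilon, 1.0)
    probs = probs * (np.asarray(new_priors) / np.asarray(old_priors))
    return probs / probs.sum(axis=1)[:, np.newaxis]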
Example #4
def measure_bayes(datasize=1000, testsize=500):
    data = cu.get_sample_data_frame(datasize)
    test = cu.get_test_data_frame(testsize)
    nbfd = features.naive_features(data)
    nbft = features.naive_features(test)
    nb = nltk.NaiveBayesClassifier.train(nbfd)
    probs = []
    for i in range(len(nbft)):
        p = nb.prob_classify(nbft[i][0])
        probs.append([p.prob(s) for s in p.samples()])
    probs = np.array(probs)
    new_priors = cu.load_priors('train.csv')
    old_priors = cu.compute_priors(data.OpenStatus)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    y_true = compute_y_true(test)
    score = multiclass_log_loss(y_true, probs)
    return score, nb, y_true, probs
Example #5
def measure_model(datasize=1000, testsize=500):
    data = cu.get_sample_data_frame(datasize)
    test = cu.get_test_data_frame(testsize)
    # data = full.ix[len(full)/4:].reset_index() # last n/4 * 3 records
    # test = full.ix[:(len(full)/4)-1].reset_index() # first n/4 records
    # data = cu.get_dataframe('train-sample.csv')
    # test = cu.get_dataframe('public_leaderboard.csv')
    fea = features.extract_features(data)
    test_features = features.extract_features(test)
    rf = RandomForestClassifier(n_estimators=50, verbose=2, compute_importances=True, n_jobs=5)
    rf.fit(fea, data["OpenStatus"])
    probs = rf.predict_proba(test_features)
    new_priors = cu.load_priors("train.csv")
    old_priors = cu.compute_priors(data.OpenStatus)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    y_true = compute_y_true(test)
    score = multiclass_log_loss(y_true, probs)
    return score, rf, fea
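compute_importances was an argument of older scikit-learn releases; current versions always compute importances and expose them as feature_importances_ after fit, so passing the flag now raises a TypeError. A modern equivalent of the model step, reusing fea, data and test_features from the example above:

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=50, n_jobs=5, verbose=2)
rf.fit(fea, data["OpenStatus"])
probs = rf.predict_proba(test_features)
importances = rf.feature_importances_  # available after fit, no flag needed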
Example #6
def measure_model(datasize=1000, testsize=500):
    data = cu.get_sample_data_frame(datasize)
    test = cu.get_test_data_frame(testsize)
    #data = full.ix[len(full)/4:].reset_index() # last n/4 * 3 records
    #test = full.ix[:(len(full)/4)-1].reset_index() # first n/4 records
    #data = cu.get_dataframe('train-sample.csv')
    #test = cu.get_dataframe('public_leaderboard.csv')
    fea = features.extract_features(data)
    test_features = features.extract_features(test)
    rf = RandomForestClassifier(n_estimators=50, verbose=2,
                                compute_importances=True, n_jobs=5)
    rf.fit(fea, data["OpenStatus"])
    probs = rf.predict_proba(test_features)
    new_priors = cu.load_priors('train.csv')
    old_priors = cu.compute_priors(data.OpenStatus)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    y_true = compute_y_true(test)
    score = multiclass_log_loss(y_true, probs)
    return score, rf, fea
Example #7
def measure_svm(datasize=1000, testsize=500):
    data = cu.get_sample_data_frame(datasize)
    test = cu.get_test_data_frame(testsize)
    vocab = [w.strip() for w in file("vocab4.txt")][0:1000]
    vidx = get_vocab_index_lookup(vocab)
    print "extracting data features"
    xdata = extract_svm_features(vidx, data)
    print "extracting test features"
    xtest = extract_svm_features(vidx, test)
    labels = sorted(cu.labels)
    # encode each OpenStatus string as its index in the sorted label list
    ydata = data.OpenStatus.apply(labels.index).tolist()
    model = svm.sparse.SVC(probability=True)
    print "fitting model"
    model.fit(xdata, ydata)
    print "rest"
    probs = model.predict_proba(xtest)
    new_priors = cu.load_priors("train.csv")
    old_priors = cu.compute_priors(data.OpenStatus)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    y_true = compute_y_true(test)
    score = multiclass_log_loss(y_true, probs)
    return score, model
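svm.sparse.SVC dates from old scikit-learn releases and has since been removed; sklearn.svm.SVC accepts SciPy sparse matrices directly. A modern version of the fit/predict step, reusing xdata, ydata and xtest from the example above:

from sklearn.svm import SVC

model = SVC(probability=True)  # handles scipy.sparse feature matrices directly
model.fit(xdata, ydata)
probs = model.predict_proba(xtest)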
Example #8
def measure_svm(datasize=1000, testsize=500):
    data = cu.get_sample_data_frame(datasize)
    test = cu.get_test_data_frame(testsize)
    vocab = [w.strip() for w in file('vocab4.txt')][0:1000]
    vidx = get_vocab_index_lookup(vocab)
    print 'extracting data features'
    xdata = extract_svm_features(vidx, data)
    print 'extracting test features'
    xtest = extract_svm_features(vidx, test)
    labels = sorted(cu.labels)
    ydata = data.OpenStatus.apply(labels.index).tolist()
    model = svm.sparse.SVC(probability=True)
    print 'fitting model'
    model.fit(xdata, ydata)
    print 'rest'
    probs = model.predict_proba(xtest)
    new_priors = cu.load_priors('train.csv')
    old_priors = cu.compute_priors(data.OpenStatus)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    y_true = compute_y_true(test)
    score = multiclass_log_loss(y_true, probs)
    return score, model
Example #9
def measure_lda(datasize=1000, testsize=500):
    # The number of documents to analyze each iteration
    batchsize = 100
    # The total number of questions on Stack Overflow
    D = 3.3e6
    # The number of topics
    K = 100
    # How many mini-batches of documents to analyze
    documentstoanalyze = datasize / batchsize
    # Our vocabulary
    vocab = [w.strip() for w in file("./vocab2.txt")]
    W = len(vocab)
    # the data
    data = cu.get_sample_data_frame(datasize)
    test = cu.get_test_data_frame(testsize)
    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    lda = OnlineLDA(vocab, K, D, 1.0 / K, 1.0 / K, 1024.0, 0.7)
    make_topic_columns(lda, data, K, D, batchsize)
    make_topic_columns(lda, test, K, D, batchsize)

    # data = full.ix[len(full)/4:].reset_index() # last n/4 * 3 records
    # test = full.ix[:(len(full)/4)-1].reset_index() # first n/4 records
    # data = cu.get_dataframe('train-sample.csv')
    # test = cu.get_dataframe('public_leaderboard.csv')

    fea = features.extract_features(data)
    test_features = features.extract_features(test)
    rf = RandomForestClassifier(n_estimators=50, verbose=2, compute_importances=True, n_jobs=5)
    rf.fit(fea, data["OpenStatus"])
    probs = rf.predict_proba(test_features)
    new_priors = cu.load_priors("train.csv")
    old_priors = cu.compute_priors(data.OpenStatus)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    y_true = compute_y_true(test)
    score = multiclass_log_loss(y_true, probs)
    return score, rf, fea, data
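OnlineLDA and make_topic_columns are defined outside these snippets; the hyperparameters (alpha = eta = 1/K, tau_0 = 1024, kappa = 0.7) match Hoffman-style online variational LDA. For reference, scikit-learn's LatentDirichletAllocation exposes the same knobs, so a comparable topic model could be built without that code. A sketch, assuming texts is a list of question bodies and vocab is the word list loaded above:

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

K, D, batchsize = 100, 3.3e6, 100
counts = CountVectorizer(vocabulary=vocab).fit_transform(texts)
lda = LatentDirichletAllocation(n_components=K,
                                doc_topic_prior=1.0 / K,   # alpha
                                topic_word_prior=1.0 / K,  # eta
                                learning_method="online",
                                learning_offset=1024.0,    # tau_0
                                learning_decay=0.7,        # kappa
                                total_samples=D,
                                batch_size=batchsize)
topic_mix = lda.fit_transform(counts)  # one topic distribution per document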
Example #10
def measure_lda(datasize=1000, testsize=500):
    # The number of documents to analyze each iteration
    batchsize = 100
    # The total number of questions on Stack Overflow
    D = 3.3e6
    # The number of topics
    K = 100
    # How many mini-batches of documents to analyze
    documentstoanalyze = datasize / batchsize
    # Our vocabulary
    vocab = [w.strip() for w in file('./vocab2.txt')]
    W = len(vocab)
    # the data
    data = cu.get_sample_data_frame(datasize)
    test = cu.get_test_data_frame(testsize)
    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    lda = OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)
    make_topic_columns(lda, data, K, D, batchsize)
    make_topic_columns(lda, test, K, D, batchsize)
    
    #data = full.ix[len(full)/4:].reset_index() # last n/4 * 3 records
    #test = full.ix[:(len(full)/4)-1].reset_index() # first n/4 records
    #data = cu.get_dataframe('train-sample.csv')
    #test = cu.get_dataframe('public_leaderboard.csv')

    fea = features.extract_features(data)
    test_features = features.extract_features(test)
    rf = RandomForestClassifier(n_estimators=50, verbose=2, compute_importances=True, n_jobs=5)
    rf.fit(fea, data["OpenStatus"])
    probs = rf.predict_proba(test_features)
    new_priors = cu.load_priors('train.csv')
    old_priors = cu.compute_priors(data.OpenStatus)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    y_true = compute_y_true(test)
    score = multiclass_log_loss(y_true, probs)
    return score, rf, fea, data