Example No. 1
def train(cutoffs):
    print("\n========== Start Training ==========")
    if len(__TRAIN_DATA) == 3:
        list_io_addr = get_io_addr(__TRAIN_DATA[0], __TRAIN_DATA[1], __TRAIN_DATA[2])
    else:
        list_io_addr = get_io_addr_random_sample(__TRAIN_DATA[0], __TRAIN_DATA[1])
    clf = BernoulliNB(fit_prior=True)

    for path_in in list_io_addr:
        print("\nGenerating training set from {}".format(path_in))
        with open(path_in, "rb") as file_in:  # binary mode: the file holds a serialized sparse matrix
            X = Sparse_Matrix_IO.load_sparse_csr(file_in)

        if len(cutoffs) > 0:
            print("Discarding selected features......")
            X = discard_vars(X, cutoffs)

        vector_len = X.shape[1]  # len(X[0]) does not work on a sparse matrix
        X_train = X[:, 0:vector_len-1]
        y_train = X[:, vector_len-1]
        print("Done")

        # sm = SMOTE(ratio=0.9)
        # X_train_sm, y_train_sm = sm.fit_sample(X_train, y_train)

        print("Fitting Model......")
        clf.partial_fit(X_train, y_train, classes=[0, 1])
        print("Done")

    with open(__ROOT_MODEL, "wb") as file_out:  # pickle needs a binary stream
        pickle.dump(clf, file_out)
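
A convention every example on this page repeats is worth making explicit: the first call to `partial_fit` must receive the complete label set via `classes=`, because no single batch is guaranteed to contain every class. Below is a minimal self-contained sketch of that pattern on synthetic data; all names in it are illustrative, not taken from the repository above.

import numpy as np
from sklearn.naive_bayes import BernoulliNB

rng = np.random.default_rng(0)
X = rng.integers(0, 2, size=(1000, 20))   # binary features
y = X[:, 0] | X[:, 1]                     # toy target derived from two features

clf = BernoulliNB()
for start in range(0, len(X), 100):       # stream the data 100 rows at a time
    # classes= is required on the first call; passing it again is harmless
    # as long as it matches
    clf.partial_fit(X[start:start + 100], y[start:start + 100], classes=[0, 1])

print(clf.score(X, y))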
Example No. 2
    def train_nb(self, X, y):
        def map5eval(actual, preds):
            # Take the five highest-probability classes, best first.
            # The original `preds.argsort(axis=1)[:, -np.arange(5)]` picked
            # columns [0, -1, -2, -3, -4], which is not the top five.
            predicted = preds.argsort(axis=1)[:, -5:][:, ::-1]
            metric = 0.
            for i in range(5):
                metric += np.sum(actual == predicted[:, i]) / (i + 1)
            metric /= actual.shape[0]
            return metric

        # sklearn.cross_validation was removed; train_test_split now lives in
        # sklearn.model_selection
        X_train, X_test, y_train, y_test = model_selection.train_test_split(
            X, y, stratify=y, test_size=0.2)
        map5 = make_scorer(map5eval, greater_is_better=True, needs_proba=True)

        clf = BernoulliNB(alpha=1.0)

        # Weight booking rows 5x relative to click-only rows
        sw = 1 + 4 * X_train.is_booking
        clf.partial_fit(X_train,
                        y_train,
                        classes=np.arange(100),
                        sample_weight=sw)
        score = cross_val_score(clf, X_train, y_train, cv=5, scoring=map5)
        print(score)
        return clf, X_test, y_test
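
A quick sanity check of the corrected top-5 extraction, runnable on its own; the probability values are made up.

import numpy as np

preds = np.array([[0.10, 0.50, 0.20, 0.05, 0.15]])
top5 = preds.argsort(axis=1)[:, -5:][:, ::-1]
print(top5)  # [[1 2 4 0 3]] -- class 1 (p=0.50) is ranked first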
Example No. 3
def step4():
    print('Training...')
    trainCsv = open('dataset-train.csv', 'r')
    X = np.zeros((chunk, lenDict))
    # MultinomialNB is also the fallback when neither 'ber' nor 'mul' is in steps
    if 'ber' in steps:
        model = BernoulliNB(alpha=smooth)
    else:
        model = MultinomialNB(alpha=smooth)
    isDone = False
    COUNT = 0
    while not isDone:
        ct = 0
        Y = []
        # The file iterator keeps its position, so each pass of the outer
        # loop reads the next `chunk` lines.
        for line in trainCsv:
            vector = [int(i) for i in line.split(',')]
            X[ct] = vector[:-1]
            Y.append(vector[-1])
            ct += 1
            if ct == chunk:
                break
        if ct == 0:
            break
        if ct != chunk:
            X = np.resize(X, (ct, lenDict))
            isDone = True
        model.partial_fit(X, Y, classes=[0, 1])
        COUNT += ct
        print(COUNT)
    trainCsv.close()

    print('Testing on training set...')
    test_on_csv(model, 'dataset-train.csv')
    print('Testing on test set...')
    test_on_csv(model, 'dataset-test.csv')
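
The `'ber'`/`'mul'` switch above matters because the two variants read the same matrix differently: BernoulliNB binarizes every feature at its `binarize` threshold (presence/absence), while MultinomialNB consumes the raw counts. A small hedged comparison on synthetic count data:

import numpy as np
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

rng = np.random.default_rng(0)
X = rng.poisson(1.0, size=(200, 10))   # small count features
y = rng.integers(0, 2, size=200)

for model in (BernoulliNB(), MultinomialNB()):
    model.partial_fit(X, y, classes=[0, 1])
    print(type(model).__name__, model.score(X, y))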
Example No. 5
def trainBernoulliNB(X, y, loadweights):
    print("Training BernoulliNB...")
    BN_classifier = BernoulliNB()
    if loadweights:
        with open('weights/BernoulliNB.pickle', 'rb') as handle:
            BN_classifier = pickle.load(handle)
    # Note: refitting the same (X, y) ten times only multiplies the internal
    # counts; apart from the smoothing term it leaves the model unchanged.
    for _ in range(10):
        BN_classifier.partial_fit(X, y, classes=[0, 1])
    with open('weights/BernoulliNB.pickle', 'wb') as handle:
        pickle.dump(BN_classifier, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print(BN_classifier.score(X, y))
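
A sketch making the note above concrete: repeating `partial_fit` on identical data barely moves the predicted probabilities, because Naive Bayes only accumulates counts. Synthetic data, illustrative names.

import numpy as np
from sklearn.naive_bayes import BernoulliNB

rng = np.random.default_rng(0)
X = rng.integers(0, 2, size=(500, 8))
y = rng.integers(0, 2, size=500)

once, ten_times = BernoulliNB(), BernoulliNB()
once.partial_fit(X, y, classes=[0, 1])
for _ in range(10):
    ten_times.partial_fit(X, y, classes=[0, 1])

# Differences come only from the (now relatively smaller) smoothing term
print(np.abs(once.predict_proba(X) - ten_times.predict_proba(X)).max())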
Example No. 6
def train(cutoffs):
    print("\n========== Start Training ==========")
    if __DATA_FROM == 2:
        list_io_addr = get_io_addr(__TRAIN_DATA[0], __TRAIN_DATA[1])
    else:
        list_io_addr = get_io_addr_random_sample(__TRAIN_DATA[0], __TRAIN_DATA[1])
    clf = BernoulliNB(class_prior=[0.05, 0.95])

    if __IF_TRAIN_WITHOUT_SAVE:
        print("Performing correlation explanation......")
        with open("/home/wlu/Desktop/day_samp_bin_1-2.npy", "rb") as file_in:
            X = Sparse_Matrix_IO.load_sparse_csr(file_in)
            if len(cutoffs) > 0:
                X = discard_vars(X, cutoffs)
            layer = correlation_ex(X)

    for path_in in list_io_addr:
        print("\nGenerating training set from {}".format(path_in))
        with open(path_in, "rb") as file_in:
            X = Sparse_Matrix_IO.load_sparse_csr(file_in)

        if len(cutoffs) > 0:
            X = discard_vars(X, cutoffs)

        vector_len = X.shape[1]  # len(X[0]) does not work on a sparse matrix
        X_train = X[:, 0:vector_len-1]
        y_train = X[:, vector_len-1]

        if __IF_TRAIN_WITHOUT_SAVE:
            print("Transforming training set according to CorEx......")
            X_train = corex_transform(layer, X_train)

        sm = SMOTE(ratio=0.95)
        X_train, y_train = sm.fit_sample(X_train, y_train)

        print("Fitting Model......")
        clf.partial_fit(X_train, y_train, classes=[0, 1])
        print("Done")

    if __IF_TRAIN_WITHOUT_SAVE:
        return [clf, layer]
    else:
        with open(__ROOT_MODEL, "wb") as file_out:
            pickle.dump(clf, file_out)
        return []
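
Several examples on this page (1, 6, 20) use the old imbalanced-learn API: `SMOTE(ratio=...)` and `fit_sample`. Current releases renamed these to `sampling_strategy=` and `fit_resample`. A hedged sketch of the same resampling step against the modern API, assuming imbalanced-learn is installed:

import numpy as np
from imblearn.over_sampling import SMOTE

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 5))
y = np.r_[np.zeros(900, dtype=int), np.ones(100, dtype=int)]

# Oversample the minority class up to 95% of the majority class size
X_res, y_res = SMOTE(sampling_strategy=0.95).fit_resample(X, y)
print(np.bincount(y_res))  # roughly [900, 855]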
Example No. 7
def main(pos_tweets_filename,
         neg_tweets_filename,
         tweets_count=float("inf"),
         pos_label="pos",
         neg_label="neg",
         chunk_size=10000,
         features_used=(1, 1)):
    machine = BernoulliNB()
    # HashingVectorizer is stateless, so transform() is equivalent to
    # fit_transform() and safe to call chunk by chunk.
    vectorizer = HashingVectorizer(ngram_range=features_used)
    classes = [pos_label, neg_label]

    def fit_chunk(chunk_features, label):
        features = vectorizer.transform(chunk_features)
        machine.partial_fit(features, [label] * len(chunk_features),
                            classes=classes)

    # Stream each file in chunks; flush the remainder after each file.
    for filename, label in ((pos_tweets_filename, pos_label),
                            (neg_tweets_filename, neg_label)):
        chunk_features = []
        for tweet_features in extract_features(filename, label, tweets_count):
            chunk_features.append(tweet_features)
            if len(chunk_features) >= chunk_size:
                fit_chunk(chunk_features, label)
                chunk_features = []
        if chunk_features:
            fit_chunk(chunk_features, label)

    return (machine, vectorizer)
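
The chunked loop above works because HashingVectorizer keeps no fitted vocabulary, so the same token always lands in the same column regardless of which batch it appears in. A minimal check:

from sklearn.feature_extraction.text import HashingVectorizer

vec = HashingVectorizer(ngram_range=(1, 1))
a = vec.transform(["good movie"])
b = vec.transform(["good movie", "bad movie"])
# The first row of both batches is identical: hashing is deterministic
print((a != b[0]).nnz == 0)   # True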
Example No. 8
def nb_onehot():
    X_tr, Y_tr, X_va, Y_va, dictionary, X_te, id_list = util.create_or_load_data(freq_threshold=50)

    Y_te_pred_list = []
    sum_auc_va = 0.0
    for i in range(Y_tr.shape[1]):
        nb = BernoulliNB()

        j = 0
        batch_size = 10000
        while j < len(X_tr):
            end = min(j + batch_size, len(X_tr))  # was len(X_tr) - 1, which silently dropped the last row
            batch = [data_process.seq2onehot(seq, dictionary) for seq in X_tr[j:end]]
            nb.partial_fit(batch, Y_tr[j:end, i], classes=[0, 1])
            j += batch_size

        logging.info("Finish training")

        Y_va_pred = []
        j = 0
        while j < len(X_va):
            end = min(j + batch_size, len(X_va))
            batch = [data_process.seq2onehot(seq, dictionary) for seq in X_va[j:end]]
            # keep the positive-class probability, which the AUC computation expects
            Y_va_pred.extend(nb.predict_proba(batch)[:, 1])
            j += batch_size

        auc_va = util.auc(Y_va[:, i], Y_va_pred)
        logging.info("tag{}, valid auc: ".format(i) + str(auc_va))
        sum_auc_va += auc_va

        Y_te_pred = []
        j = 0
        while j < len(X_te):
            end = min(j + batch_size, len(X_te))
            batch = [data_process.seq2onehot(seq, dictionary) for seq in X_te[j:end]]
            Y_te_pred.extend(nb.predict_proba(batch))
            j += batch_size
        Y_te_pred_list.append(Y_te_pred)

    logging.info("Avg auc: {}".format(sum_auc_va / Y_tr.shape[1]))

    util.submission(Y_te_pred_list, id_list)
Example No. 9
def train():
    print("\n========== Start Training ==========")
    list_io_addr = get_io_addr(__TRAIN_DATA)
    clf = BernoulliNB(class_prior=[0.1, 0.9], alpha=0.5)

    for addr_in in list_io_addr:
        print("\nGenerating training set from {}".format(addr_in))
        X_train, y_train = gd.get(addr_in, __RATIO)
        print("Done")

        print("Fitting Model......")
        clf.partial_fit(X_train, y_train, classes=[0, 1])
        print("Done")

    if __SAVE_MODEL:
        with open(__ROOT_MODEL, "wb") as file_out:
            pickle.dump(clf, file_out)
        return None
    else:
        return clf
Example No. 11
def train_bnb_model(msg):
    """
    Trains a model using the given parameters
    
    month: int or list with the number of the month we want
        the data to be taken of
    input_columns: a list with the name of the columns we are going to use
        in the task
    use_product: bool, if true adds the product columns of the month before
    use_change: bool, if true adds the change columns of the month before
    """
    msg_copy = msg.copy()
    msg_copy['train'] = True
    if 'month' not in msg_copy:
        msg_copy['month'] = msg_copy['train_month']
    # Get the data for training
    ret = dataset.get_data(msg_copy)
    input_data, output_data = ret[0:2]
    # Fit the model incrementally; classes= lists all 24 labels up front
    bnb = BernoulliNB(alpha=1e-2)
    bnb.partial_fit(input_data, output_data, classes=range(24))
    return bnb
Example No. 12
def bern_naive_bayes(df):
    # NOTE: these are full copies, not subsets -- both partial_fit calls below
    # therefore see the same data. Presumably they were meant to hold the
    # failing and passing subsets of the students.
    fail_df = df.copy(deep=True)
    pass_df = df.copy(deep=True)

    # Target values are G3
    Y = df.pop("G3")
    Y_fail = fail_df.pop("G3")
    Y_pass = pass_df.pop("G3")

    # Feature set is the remaining columns
    X = df
    X_fail = fail_df
    X_pass = pass_df

    bnb = BernoulliNB()
    bnb.partial_fit(X_fail, Y_fail, classes=[0, 1])
    bnb.partial_fit(X_pass, Y_pass, classes=[0, 1])

    print("\n\nBernoulli Naive Bayes Accuracy: ", bnb.score(X, Y))
    confuse(Y, bnb.predict(X))

    return bnb
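
For Naive Bayes, splitting the data across two `partial_fit` calls as above is equivalent to one `fit` on the concatenation, since the model only accumulates per-class counts. A self-contained check on synthetic data:

import numpy as np
from sklearn.naive_bayes import BernoulliNB

rng = np.random.default_rng(0)
X = rng.integers(0, 2, size=(400, 6))
y = rng.integers(0, 2, size=400)

incremental = BernoulliNB()
incremental.partial_fit(X[:200], y[:200], classes=[0, 1])
incremental.partial_fit(X[200:], y[200:])

batch = BernoulliNB().fit(X, y)
print(np.allclose(incremental.feature_log_prob_, batch.feature_log_prob_))  # True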
Example No. 13
class Agent_NaiveBayes(Agent):  # TODO: currently only works with 1 other user.
    def __init__(self, loc, actspace, index, name=''):
        super().__init__(loc, actspace, index, name)
        self.distribution = np.zeros((len(actspace), len(actspace)))          # joint action counts
        self.distribution_rewards = np.zeros((len(actspace), len(actspace)))  # joint accumulated rewards
        self.model = BernoulliNB()
        self.windowsize = 10  # configurable
        self.predictions = np.zeros((self.windowsize, 1))

    def updatereward(self, reward, Agents):
        super().updatereward(reward)
        # Update the joint statistics (who did what, and what it paid)
        myact = self.actions[-1]
        otheract = Agents[1 - self.index].actions[-1]
        self.distribution_rewards[myact, otheract] += reward
        self.distribution[myact, otheract] += 1

        # Update the model on one window of the other agent's past actions,
        # wrapped in a list because sklearn expects a 2-D sample matrix
        OtherActions = Agents[1 - self.index].actions
        if len(OtherActions) > self.windowsize + 1:
            self.model.partial_fit([OtherActions[-self.windowsize - 2:-2]],
                                   [OtherActions[-1]],
                                   classes=self.actionspace)

    def act(self, BSs, variables, Agents, t):
        p_exploit = 1 - variables['p_explore']
        if random.random() < p_exploit and t > self.windowsize + 2:  # exploit stage
            # Predict what the other user will do (again a single 2-D sample)
            window = Agents[1 - self.index].actions[-self.windowsize - 1:-1]
            others_predict = int(round(self.model.predict([window])[0]))
            # Record the prediction (np.insert's return value was previously discarded)
            self.predictions = np.append(self.predictions, others_predict)
            # Find the action that maximizes the average reward, assuming the
            # other agent plays the predicted action
            avgOfEach = np.zeros(len(BSs))
            for i in range(len(BSs)):
                if self.distribution[i, others_predict] > 0:
                    avgOfEach[i] = (self.distribution_rewards[i, others_predict]
                                    / self.distribution[i, others_predict])
            action = avgOfEach.argmax()
        else:  # explore stage
            action = random.randint(0, len(BSs) - 1)
        self.actions.append(action)
        return action
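
The list-wrapping in the class above reflects a general scikit-learn rule: estimators expect a 2-D matrix of shape (n_samples, n_features), so a single sample has to be wrapped or reshaped. A standalone illustration:

import numpy as np
from sklearn.naive_bayes import BernoulliNB

clf = BernoulliNB()
clf.partial_fit([[0, 1, 1], [1, 0, 1]], [0, 1], classes=[0, 1])

sample = np.array([0, 1, 0])
print(clf.predict(sample.reshape(1, -1)))   # OK: shape (1, 3)
# clf.predict(sample) would raise "Expected 2D array, got 1D array instead"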
Example No. 14
def this_is_for_fun():
    # clf = MultinomialNB(alpha=1.0)
    # clf = SGDClassifier(alpha=0.0001)
    # clf = PassiveAggressiveClassifier()
    clf = BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
    # clf = Perceptron(alpha=0.001)
    print('BernoulliNB, a = 1')

    label_list = read_label()
    choose = random.randint(10, 24)
    corpus = corpora.MmCorpus('../corpus_mm/corpus_{}.mm'.format(choose))
    test_X = matutils.corpus2csc(corpus).transpose()  # test set
    test_y = label_list[(choose * 20000):(choose + 1) * 20000]  # test-set labels

    for index in range(10, 25):
        corpus = corpora.MmCorpus('../corpus_mm/corpus_{}.mm'.format(index))
        csi_matrix = matutils.corpus2csc(corpus).transpose()
        clf.partial_fit(csi_matrix,
                        label_list[(index * 20000):(index + 1) * 20000],
                        classes=np.array([0, 1]))
        print("Pass {}".format(index))
        pre = clf.predict(test_X)
        totalScore(pre, test_y)
        joblib.dump(clf, "../model/BernoulliNB_model_{}.m".format(index))
Example No. 15
            # Fragment of a chunked training loop; the enclosing
            # `for chunk in reader:` / `try:` context is not part of the snippet.
            # The dangling first line is the tail of the same chunk.drop(...)
            # call shown in full in Example No. 18.
            chunk.drop(['cnt', 'hotel_cluster', 'is_booking'], axis=1,
                       inplace=True)

            XN = csr_matrix(chunk[num_col].values)
            X = csr_matrix((chunk.shape[0], n_features))
            rows = np.arange(chunk.shape[0])
            # Hash each categorical column into n_features buckets
            for col in cat_col_all:
                dat = np.ones(chunk.shape[0])
                cols = chunk[col] % n_features
                X += csr_matrix((dat, (rows, cols)),
                                shape=(chunk.shape[0], n_features))
            X = hstack((XN, X))
            # Booking rows (sample weight > 1) double as a held-out check
            book_indices = sw[sw > 1].index.tolist()
            X_test = csr_matrix(X)[book_indices]
            y_test = y[book_indices]

            clf.partial_fit(X, y, classes=np.arange(100), sample_weight=sw)

            count = count + chunksize
            map5 = map5eval(clf.predict_proba(X_test), y_test)
            print('%d rows completed. MAP@5: %f' % (count, map5))
            if count / chunksize == 200:
                break
        except Exception as e:
            print('Error: %s' % str(e))

# -------- Predict on the test set --------
with open('output/probs/bnb.pkl', 'wb') as f:
    pickle.dump(clf, f)
count = 0
chunksize = 10000
Example No. 16
val = model_preproc(val)

# BernoulliNB classifier for the categorical variables
b_clf = BernoulliNB()
# GaussianNB classifier for the continuous variables
g_clf = GaussianNB()

reader = pd.read_hdf(inputpath + 'train_proc_train.h5', key='df', mode='r',
                     iterator=True, chunksize=chunksize_)
for chunk in reader:
    # Shuffle the chunk
    chunk = chunk.sample(frac=1)
    # Pre-process the chunk
    chunk = model_preproc(chunk)
    # Fit the BernoulliNB classifier on the categorical columns (10 onwards)
    b_clf.partial_fit(chunk[:, 10:], chunk[:, 0], classes=np.array([0, 1]))
    # Fit the GaussianNB classifier on the continuous columns (1-9)
    g_clf.partial_fit(chunk[:, 1:10], chunk[:, 0], classes=np.array([0, 1]))

# Make predictions for the validation set
b_probs = b_clf.predict_proba(val[:, 10:])
g_probs = g_clf.predict_proba(val[:, 1:10])
# Combine the probabilities from the two NB classifiers: multiply the
# per-classifier class probabilities and divide out the class prior,
# which would otherwise be counted twice
probs = np.divide(np.multiply(b_probs, g_probs), g_clf.class_prior_)

# Calculate the ROC-AUC
val_auc = roc_auc_score(val[:, 0], probs[:, 1])
print('Validation set area under the ROC curve: %f' % val_auc)

# Validation set area under the ROC curve: 0.683644
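
The combination step follows from Bayes' rule: each classifier's predict_proba is proportional to P(y)·P(x_block|y), so multiplying the two outputs counts the prior P(y) twice and one copy has to be divided out. A hedged sketch on synthetic data, adding the row normalization that turns the product back into proper probabilities:

import numpy as np
from sklearn.naive_bayes import BernoulliNB, GaussianNB

rng = np.random.default_rng(0)
X_cat = rng.integers(0, 2, size=(500, 4))   # binary block
X_num = rng.normal(size=(500, 3))           # continuous block
y = rng.integers(0, 2, size=500)

b = BernoulliNB().fit(X_cat, y)
g = GaussianNB().fit(X_num, y)

probs = b.predict_proba(X_cat) * g.predict_proba(X_num) / g.class_prior_
probs /= probs.sum(axis=1, keepdims=True)   # renormalize the rows
print(probs[:3])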
Example No. 17
print(clf.predict_proba(Features))

import numpy as np
X = np.random.randint(5, size=(6, 3))
y = np.array([1, 2, 3, 4, 5, 6])
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X, y)
print(clf.predict_proba(X))
# predict_proba expects a 2-D array, hence the extra brackets around the sample
print(clf.predict_proba(np.array([[3, 3, 2]])))

XX = np.random.randint(5, size=(6, 3))
yy = np.array([1, 2, 3, 4, 5, 6])

clf.partial_fit(XX, yy)
print(clf.predict_proba(X))

measurements = [
    {
        'city': 'Dubai',
        'temperature': 33.
    },
    {
        'city': 'Dubai',
        'temperature': 12.
    },
    {
        'city': 'San Francisco',
        'temperature': 18.
    },
Example No. 18
     sw = 1 + 4 * chunk.is_booking
     chunk.drop(['cnt', 'hotel_cluster', 'is_booking'], axis=1, inplace=True)

     XN = csr_matrix(chunk[num_col].values)
     X = csr_matrix((chunk.shape[0], n_features))
     rows = np.arange(chunk.shape[0])
     # Hash each categorical column into n_features buckets
     for col in cat_col_all:
         dat = np.ones(chunk.shape[0])
         cols = chunk[col] % n_features
         X += csr_matrix((dat, (rows, cols)), shape=(chunk.shape[0], n_features))
     X = hstack((XN, X))
     # Booking rows (sample weight > 1) double as a held-out check
     book_indices = sw[sw > 1].index.tolist()
     X_test = csr_matrix(X)[book_indices]
     y_test = y[book_indices]

     clf.partial_fit(X, y, classes=np.arange(100), sample_weight=sw)

     count = count + chunksize
     map5 = map5eval(clf.predict_proba(X_test), y_test)
     print('%d rows completed. MAP@5: %f' % (count, map5))
     if count / chunksize == 200:
         break
 except Exception as e:
     print('Error: %s' % str(e))
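
Examples No. 15 and No. 18 hand-roll the hashing trick: each categorical value is mapped to `value % n_features` and the corresponding entry of a sparse matrix is set to 1. A compact, self-contained sketch of the same idea:

import numpy as np
from scipy.sparse import csr_matrix

n_features = 8
values = np.array([3, 11, 19, 5])   # one categorical column, four rows
rows = np.arange(len(values))
cols = values % n_features          # 3, 11 and 19 all collide into bucket 3
X = csr_matrix((np.ones(len(values)), (rows, cols)),
               shape=(len(values), n_features))
print(X.toarray())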
Example No. 19
print(kobe_shots[kobe_shots['shot_type_2PT Field Goal'] == 1].count())
score = []
shot_yn_2 = []
shot_yn_3 = []
bnb_2 = BernoulliNB()
bnb_3 = BernoulliNB()

for x, y, s in zip(kobe_x.iterrows(), kobe_y.iterrows(), kobe_shots.iterrows()):
    if s[1]['shot_type_2PT Field Goal'] == 1:
        if x[0] == 0:
            print(x, y, s)
            op = [x[0] + 1, 0.5]
            score.append(op)
        else:
            if pd.notnull(y[1]['shot_made_flag']):
                bnb_2.partial_fit([x[1].tolist()], [y[1]['shot_made_flag']], classes=[0, 1])
                shot_yn_2.append(y[1]['shot_made_flag'])
            else:
                # predict_proba needs a 2-D input, hence the wrapping list
                op = [x[0] + 1, bnb_2.predict_proba([x[1].tolist()])[0][1]]
                if x[0] % 1000 == 0:
                    print(op)
                score.append(op)
    else:
        if x[0] == 0:
            print(x, y, s)
            op = [x[0] + 1, 0.5]
            score.append(op)
        else:
            if pd.notnull(y[1]['shot_made_flag']):
                bnb_3.partial_fit([x[1].tolist()], [y[1]['shot_made_flag']], classes=[0, 1])
                shot_yn_3.append(y[1]['shot_made_flag'])
Example No. 20
    m = len(X[0])
    n = len(X)

    X_train = X[:, 0:m-1]
    y_train = X[:, m-1]

    print("Done")
    print()

    sm = SMOTE(ratio=0.9)
    X_train_sm, y_train_sm = sm.fit_sample(X_train, y_train)

    print()
    print("Fitting Model")
    clf.partial_fit(X_train_sm, y_train_sm, classes=[0, 1])
    print("Done")
    print()

with open("/home/ubuntu/Weiyi/model_05_01", "wb") as file_out:
    pickle.dump(clf, file_out)

# with open("/home/wlu/Desktop/model_bernoulli", "rb") as file_in:
#     clf = pickle.load(file_in)
#
# print("Generating the training set")
# with open("/home/wlu/Desktop/test_sparse.npy", "rb") as file_in:
#     X = load_sparse_csr(file_in).toarray()
#
# m = len(X[0])
# n = len(X)
Example No. 21
def main():
	#################################
	# Possible values for i: 		#
	# 1: getMail() : preprocessing	#
	# 2: generate dictionary.txt 	#
	# 3: getFeaturesMatrix()		#
	# 	 getFeaturesMatrixTest()	#
	#	 getLabels() 				#
	# 	 getLabelsTest() 			#
	#	: feature vectors & labels 	#
	# 4: Naive Bayes Models 		#
	# 5: removeStopWords()   		#
	# 6: stemDictionary() 			#
	# 7: stemEmails() 				#
	#################################
	i = 4
	if i == 1:
		fileList = []
		fileDir = "data/"
		for filename in os.listdir(fileDir):
			fileList.append(os.path.join(fileDir, filename))
		#print(fileList)
		for filePath in fileList:
			inputname = filePath
			getMail(inputname)
	elif i == 2:
		fileList = []
		fileDir = "preprocessed/"
		for filename in os.listdir(fileDir):
			fileList.append(os.path.join(fileDir, filename))
		with open("dictionary.txt", "w") as outFile:
			for filePath in fileList:
				with open(filePath, "r") as textFile:
					for line in textFile:
						data = line.split(" ")
						# write every word of every line (previously only the
						# last line's words were written, due to indentation)
						for i in data:
							outFile.write(i)
							outFile.write("\n")
		textList = []
		newList = []
		with open("dictionary.txt", "r") as textFile:
			for line in textFile:
				textList.append(line)
		uWords = set(textList)
		for i in range(len(textList)):
			if textList[i].isalpha():
				newList.append(textList[i])
		with open("dictionary.txt", "w") as outFile:
			for i in range(len(newList)):
				outFile.write(newList[i])
				outFile.write("\n")
		with open("dictionary.txt", "r") as textFile:
			lines = textFile.readLines()
			lines.sort()
		with open("dictionary.txt", "w") as outFile:
			for i in lines:
				outFile.write(i)
	elif i == 3:
		dictionary = []
		with open("dictionary.txt", "r") as dicionaryText:
			for line in dicionaryText:
				word = line.split()
				dictionary += word

		trainFeatures = getFeaturesMatrix(dictionary)
		testFeatures = getFeaturesMatrixTest(dictionary)
		trainLabels = getLabels()
		testLabels = getLabelsTest()
	elif i == 4:
		aClass = np.array([0,1])

		datasetTrain = np.genfromtxt("dataset-training.csv", dtype=int, delimiter=",")
		datasetTest = np.genfromtxt("dataset-test.csv", dtype=int, delimiter=",")
		datasetTrainLabels = np.genfromtxt("dataset-training-labels.csv", dtype=int, delimiter=",")
		datasetTestLabels = np.genfromtxt("dataset-test-labels.csv", dtype=int, delimiter=",")

		modelM = MultinomialNB(alpha=1)
		modelB = BernoulliNB(alpha=1)

		splitTrain = datasetTrain.shape[0]//2
		newTrainFeatures0, newTrainFeatures1 = datasetTrain[:splitTrain,:], datasetTrain[splitTrain:,:]

		newTrainLabels0 = datasetTrainLabels[0:splitTrain]
		newTrainLabels1 = datasetTrainLabels[splitTrain:]

		modelM.partial_fit(newTrainFeatures0, newTrainLabels0, classes=aClass)
		modelM.partial_fit(newTrainFeatures1, newTrainLabels1, classes=aClass)

		modelB.partial_fit(newTrainFeatures0, newTrainLabels0, classes=aClass)
		modelB.partial_fit(newTrainFeatures1, newTrainLabels1, classes=aClass)

		#modelM.fit(datasetTrain, datasetTrainLabels)
		#modelB.fit(datasetTrain, datasetTrainLabels)

		predictionsM = modelM.predict(datasetTest)
		predictionsM1 = modelM.predict(datasetTrain)

		predictionsB = modelB.predict(datasetTest)
		predictionsB1 = modelB.predict(datasetTrain)

		modelAccuracyM = accuracy_score(datasetTestLabels, predictionsM)
		modelAccuracyM1 = accuracy_score(datasetTrainLabels, predictionsM1)

		modelAccuracyB = accuracy_score(datasetTestLabels, predictionsB)
		modelAccuracyB1 = accuracy_score(datasetTrainLabels, predictionsB1)

		print("Multinomial Naive Bayes (Test):\t", modelAccuracyM)
		print("Multinomial Naive Bayes (Train):\t", modelAccuracyM1)

		print("Bernoulli Naive Bayes (Test):\t", modelAccuracyB)
		print("Bernoulli Naive Bayes (Train):\t", modelAccuracyB1)
	elif i == 5:
		removeStopWords()
	elif i == 6:
		stemDictionary()
	elif i == 7:
		stemEmails()
Example No. 22
     # for i in range(counts.size):
     #     ratios[i] = counts[i]/sum(counts)
     # print('Ratios of classes', ratios*100)
     # # m = max(counts)
     # # for i in range(counts.size):
     # #     ratios[i] = m/counts[i]
     # ratios = 1 - ratios
     # print('Weights to penalize loss function', ratios)
     trweights = np.zeros(len(X_train))
     for i in range(len(y_train)):
         trweights[i] = cl_weights[y_train[i]]
     teweights = np.zeros(len(X_test))
     for i in range(len(y_test)):
         teweights[i] = cl_weights[y_test[i]]
     clf1.partial_fit(X_train,
                      y_train,
                      sample_weight=trweights,
                      classes=[0, 1, 2, 3])
     pred = clf1.predict(X_test)
     accuracy = clf1.score(X_test, y_test)
 print('Training time: {:.2f}s'.format(time() - t1))
 print('Test accuracy = {:.2f}'.format(accuracy * 100))
 acc[s - 1] = accuracy * 100
 cfmat = confusion_matrix(y_test, pred)
 print(cfmat)
 predicted += cfmat.sum(axis=0)
 expected += cfmat.transpose().sum(axis=0)
 # porter = Porter(clf1)
 # op = porter.export(export_data=True)
 # with open('{}.java'.format(model), 'w') as f:
 #     f.write(op)
 # os.rename('data.json', '{}.json'.format(model))
Example No. 23
# kobe_id = pd.read_csv('data/kobe_x_id.csv')
kobe_x = pd.read_csv('data/kobe_x_transformed.csv')
kobe_y = pd.read_csv('data/kobe_y.csv')

score = []
shot_yn = []
bnb = BernoulliNB()
new_scores = []
for x, y in zip(kobe_x.iterrows(), kobe_y.iterrows()):
    if x[0] == 0:
        op = [x[0] + 1, 0.5]
        score.append(op)
    else:
        if pd.notnull(y[1]['shot_made_flag']):
            bnb.partial_fit([x[1].tolist()], [y[1]['shot_made_flag']],
                            classes=[0, 1])
            shot_yn.append(y[1]['shot_made_flag'])
        else:
            if x[0] < 0:  # NOTE: never true as written; presumably a warm-up fallback
                op = [x[0] + 1, float(sum(shot_yn)) / float(len(shot_yn))]
            else:
                # predict_proba needs a 2-D input, hence the wrapping list
                op = [x[0] + 1, bnb.predict_proba([x[1].tolist()])[0][1]]
            if x[0] % 1000 == 0:
                print(op)
            score.append(op)

# print(score[0:5])
with open('data/attempt_2_output.csv', 'w') as f:
    f.write('shot_id,shot_made_flag' + '\n')
    f.writelines(str(s[0]) + ',' + str(s[1]) + '\n' for s in score)
Example No. 24
 l = ['month', 'TrainDay', 'testDay', 'recall', 'filtered']
 wr.writerow(l)
 for diff in [1]:  # 1,7 -- as for now, only [1]: test on the next day
     for month in range(6, 7):  # 5,7 -- as for now, only range(6,7): June
         for day in range(4, 25):  # 1,32 -- as for now, only days 4-24
             print('------------------------------------------------')
             print('------------------------------------------------')
             print('month = ', month, ' and day = ', day)
             try:
                 # Input the training and testing sets
                 train_data, train_label = GetData(month, day)
                 test_data, test_label = GetData(month, day + diff)
                 print('Data Read')
                 # time.sleep(20)  # sleep
                 print('Training Data...')
                 clf.partial_fit(train_data, train_label, classes=[0, 1])
                 print('Data Trained...')
                 y_true = test_label
                 n = len(y_true)
                 y_pred = clf.predict(test_data)
                 cf = confusion_matrix(y_true, y_pred)
                 recall = recall_score(y_true, y_pred)
                 # share of true negatives among all test rows
                 filtered = cf[0, 0] / float(n)
                 print("Recall is: %s" % recall)
                 print('Filtering is = ', filtered)
                 print(cf)
Example No. 25
# test subset contains 30% of the original dataset, train contains the rest
# Dataset is not shuffled before splitting
train, test, train_labels, test_labels = train_test_split(features,
                                                          labels,
                                                          test_size=0.3,
                                                          random_state=1,
                                                          shuffle=False)

print("Train shape:", train.shape)
print("Train_labels shape:", train_labels.shape)
print("Test shape:", test.shape)
print("Test_labels shape:", test_labels.shape)
"""## Build classifier and evaluate performance"""

# Initialize our classifier
# Use Bernoulli distribution as only 2 outputs remain after stripping unlabeled data
# either malignant or benign
gnb = BernoulliNB()
# Train our classifier incrementally, a quarter of the data at a time.
# classes must list ALL labels up front: np.unique of a subset could miss one.
classes = np.unique(train_labels)
for i in range(0, len(train), len(train) // 4):
    train_subset = train[i:i + len(train) // 4]
    train_labels_subset = train_labels[i:i + len(train) // 4]
    gnb.partial_fit(train_subset, train_labels_subset, classes)

# Make predictions
preds = gnb.predict(test)

# Evaluate accuracy
print("Accuracy :", accuracy_score(test_labels, preds))
Example No. 26
y = data.target

# Vectorize the movie reviews using our 8 words
vect = CountVectorizer(vocabulary=["awful", "bad", "boring", "dull", "effective", "enjoyable", "great", "hilarious"])
X = vect.fit_transform(data.data)
X = X.toarray()

# Define our classifier and cross-validation
# binarize=0.0 treats any nonzero count as presence; the original
# binarize=True thresholded at 1, i.e. "appears at least twice"
clf = BernoulliNB(binarize=0.0)
kf = KFold(n_splits=10, shuffle=True)
kf.get_n_splits(X)

# Perform cross-validation; re-create the classifier each fold so that
# partial_fit does not accumulate training data across folds
score = 0
for k, (train, test) in enumerate(kf.split(X, y)):
    clf = BernoulliNB(binarize=0.0)
    clf.partial_fit(X[train], y[train], classes=[0, 1])
    score += clf.score(X[test], y[test])

# Calculate average prediction accuracy
score = score / 10
print("Bernoulli Average Score: {0:.5f}".format(score))

# Define our classifier and cross-validation
clf = MultinomialNB()
kf = KFold(n_splits=10, shuffle=True)
kf.get_n_splits(X)

# Perform cross-validation; again re-create the classifier each fold
score = 0
for k, (train, test) in enumerate(kf.split(X, y)):
    clf = MultinomialNB()
    clf.partial_fit(X[train], y[train], classes=[0, 1])
Example No. 27
def test_alpha():
    # Setting alpha=0 should not output nan results when p(x_i|y_j)=0 is a case
    X = np.array([[1, 0], [1, 1]])
    y = np.array([0, 1])
    nb = BernoulliNB(alpha=0.0)
    msg = "alpha too small will result in numeric errors, setting alpha = 1.0e-10"
    with pytest.warns(UserWarning, match=msg):
        nb.partial_fit(X, y, classes=[0, 1])
    with pytest.warns(UserWarning, match=msg):
        nb.fit(X, y)
    prob = np.array([[1, 0], [0, 1]])
    assert_array_almost_equal(nb.predict_proba(X), prob)

    nb = MultinomialNB(alpha=0.0)
    with pytest.warns(UserWarning, match=msg):
        nb.partial_fit(X, y, classes=[0, 1])
    with pytest.warns(UserWarning, match=msg):
        nb.fit(X, y)
    prob = np.array([[2.0 / 3, 1.0 / 3], [0, 1]])
    assert_array_almost_equal(nb.predict_proba(X), prob)

    nb = CategoricalNB(alpha=0.0)
    with pytest.warns(UserWarning, match=msg):
        nb.fit(X, y)
    prob = np.array([[1.0, 0.0], [0.0, 1.0]])
    assert_array_almost_equal(nb.predict_proba(X), prob)

    # Test sparse X
    X = scipy.sparse.csr_matrix(X)
    nb = BernoulliNB(alpha=0.0)
    with pytest.warns(UserWarning, match=msg):
        nb.fit(X, y)
    prob = np.array([[1, 0], [0, 1]])
    assert_array_almost_equal(nb.predict_proba(X), prob)

    nb = MultinomialNB(alpha=0.0)
    with pytest.warns(UserWarning, match=msg):
        nb.fit(X, y)
    prob = np.array([[2.0 / 3, 1.0 / 3], [0, 1]])
    assert_array_almost_equal(nb.predict_proba(X), prob)

    # Test for alpha < 0
    X = np.array([[1, 0], [1, 1]])
    y = np.array([0, 1])
    expected_msg = re.escape(
        "Smoothing parameter alpha = -1.0e-01. alpha should be > 0."
    )
    b_nb = BernoulliNB(alpha=-0.1)
    m_nb = MultinomialNB(alpha=-0.1)
    c_nb = CategoricalNB(alpha=-0.1)
    with pytest.raises(ValueError, match=expected_msg):
        b_nb.fit(X, y)
    with pytest.raises(ValueError, match=expected_msg):
        m_nb.fit(X, y)
    with pytest.raises(ValueError, match=expected_msg):
        c_nb.fit(X, y)

    b_nb = BernoulliNB(alpha=-0.1)
    m_nb = MultinomialNB(alpha=-0.1)
    with pytest.raises(ValueError, match=expected_msg):
        b_nb.partial_fit(X, y, classes=[0, 1])
    with pytest.raises(ValueError, match=expected_msg):
        m_nb.partial_fit(X, y, classes=[0, 1])
Example No. 28
# clf = MultinomialNB()
# clf.fit(SP_train.iloc[:, 1:23], SP_train.iloc[:, 0])


yTest_predNB = clf.predict(SP_test.iloc[:, 1:23])
print(yTest_predNB)

tn, fp, fn, tp = confusion_matrix(SP_test.iloc[:, 0], yTest_predNB).ravel()
AR = (tn + tp) / (tn + fp + fn + tp)
Sens = tp / (tp + fn)
Spec = tn / (tn + fp)
print("The accuracy rate is %.3f, the sensitivity is %.3f, the specificity is %.3f"
      % (AR, Sens, Spec))


#%%

clf1 = BernoulliNB()
clf1.partial_fit(SP_train.iloc[:, 1:23], SP_train.iloc[:, 0], np.array([0, 1]))

yTest_predNB1 = clf1.predict(SP_test.iloc[:, 1:23])  # was clf.predict, i.e. the wrong model
print(yTest_predNB1)

tn, fp, fn, tp = confusion_matrix(SP_test.iloc[:, 0], yTest_predNB1).ravel()
AR = (tn + tp) / (tn + fp + fn + tp)
Sens = tp / (tp + fn)
Spec = tn / (tn + fp)
print("The accuracy rate is %.3f, the sensitivity is %.3f, the specificity is %.3f"
      % (AR, Sens, Spec))