def Get_Features(X, label=1):
    """Build a combined feature vector for a pair of protein sequences.

    Concatenates, in order: PSSM-derived features for each sequence,
    GNB prediction features for each sequence, decision-tree features
    for each sequence, and finally the class label.

    Arguments:
        X {tuple} -- (seq1, seq2) base paths; ".pssm" / ".fasta" suffixes
                     are appended to locate the input files.
        label {int} -- class label appended at the end of the vector
                      (default 1, matching the original hard-coded value).

    Returns:
        list -- the flattened feature vector, ending with `label`.
    """
    clf = NBClassifier()
    seq1, seq2 = X
    gnb1_pssm, lr1 = Blast_Utils.Get_PSSM(seq1 + ".pssm")
    gnb2_pssm, lr2 = Blast_Utils.Get_PSSM(seq2 + ".pssm")
    # GNB model trained offline on the 800-sequence dataset.
    means_and_variances, priors = project3_functions.load_model(
        "800_sequence_model.mdl")
    gnb_predictions1 = clf.predict(gnb1_pssm, means_and_variances, priors)
    gnb_predictions2 = clf.predict(gnb2_pssm, means_and_variances, priors)
    gnb1_features = efe.Extract_GNB_Features(gnb_predictions1)
    gnb2_features = efe.Extract_GNB_Features(gnb_predictions2)
    dt1 = efe.Extract_Decision_Tree_Features(seq1 + ".fasta")
    dt2 = efe.Extract_Decision_Tree_Features(seq2 + ".fasta")
    # lr1 already holds seq1's PSSM features; flatten everything else
    # into it in the same order as the original implementation.
    for part in (lr2, gnb1_features, gnb2_features, dt1, dt2):
        lr1.extend(part)
    lr1.append(label)
    return lr1
def main():
    """Entry point: classify a test file with a previously trained model.

    Usage: python3 <script> [MODEL FILE] [TEST FILE]
    """
    if len(sys.argv) != 3:
        # Wrong argument count: show usage on stderr and bail out.
        print('python3 ' + sys.argv[0] + ' [MODEL FILE] [TEST FILE]',
              file=sys.stderr)
        return
    mod_path, test_path = sys.argv[1], sys.argv[2]
    util = Utility('', mod_path, test_path)
    nbc = NBClassifier(False, True, False, util)
    nbc.classify()
def Get_Features(X):
    """Compute and print GNB features for the first sequence of a pair.

    Arguments:
        X {tuple} -- (seq1, seq2) base paths; only seq1 is processed here
                     (the seq2 branch was commented out upstream).
    """
    clf = NBClassifier()
    seq1, seq2 = X
    gnb1_pssm, lr1 = Blast_Utils.Get_PSSM(seq1 + ".pssm")
    # GNB model trained offline on the 800-sequence dataset.
    means_and_variances, priors = project3_functions.load_model(
        "800_sequence_model.mdl")
    gnb_predictions1 = clf.predict(gnb1_pssm, means_and_variances, priors)
    gnb1_features = efe.Extract_GNB_Features(gnb_predictions1)
    # Parenthesized print is valid in both Python 2 and Python 3
    # (the original `print gnb1_features` is a Py3 SyntaxError).
    print(gnb1_features)
def scoreClassifier(postType, featureDimension):
    """Run k-fold validation (k = 2..10) for MLP, LR, SVM and NB
    classifiers on the 24-hour dataset and save one accuracy plot each.

    Arguments:
        postType {string} -- suffix selecting the 'CSV Files_<postType>'
                             input folder and 'Images_<postType>' output folder
        featureDimension {int} -- number of features in the dataset
    """
    data_file = 'CSV Files_' + postType + '/data_std_hr24.csv'
    label_file = 'CSV Files_' + postType + '/label_hr24.csv'
    k_range = range(2, 11)

    def _plot(acc_list, title, image_name):
        # Shared scatter-plot rendering used by every classifier section.
        t = np.arange(2, 11, 1)
        plt.plot(t, acc_list, 'ro')
        plt.ylabel('accuracy')
        plt.xlabel('K-fold')
        plt.title(title)
        plt.savefig('Images_' + postType + "/" + image_name)
        plt.close()

    # MLP
    acc_list = [MLPClassifier(data_file, label_file, k, featureDimension)
                .kfold_validator() for k in k_range]
    _plot(acc_list, 'MLP Classifier', 'MLP_Classifier.png')

    # Logistic Regression: also average the coefficients across folds.
    acc_list = []
    avgCoeff = np.zeros(shape=(1, featureDimension))
    for k in k_range:
        accuracy, lrCoeff = LRClassifier(data_file, label_file, k,
                                         featureDimension).kfold_validator()
        acc_list.append(accuracy)
        avgCoeff = avgCoeff + lrCoeff
    # len(k_range) == 9 folds; replaces the original magic constant.
    avgCoeff /= len(k_range)
    print(avgCoeff)
    _plot(acc_list, 'Logistic Regression', 'Logistic_Regression.png')

    # SVM
    acc_list = [SVMClassifier(data_file, label_file, k, featureDimension)
                .kfold_validator() for k in k_range]
    _plot(acc_list, 'Support Vector Machine', 'Support_Vector_Machine.png')

    # Gaussian Naive Bayes
    acc_list = [NBClassifier(data_file, label_file, k, featureDimension)
                .kfold_validator() for k in k_range]
    _plot(acc_list, 'Gaussian Naive Bayes', 'Gaussian_Naive_Bayes.png')
def __init__(self, V, delta, train_file, test_file, w1, w2):
    """Parameterized constructor for Build Your Own Model.

    Arguments:
        V {int} -- Vocabulary choice (1=[a,z], 2=[a-zA-Z], 3=isalpha())
        delta {int} -- smoothing factor
        train_file {string} -- path to train file
        test_file {string} -- path to test file
        w1 {float} -- Arbitrary weight for Trigram
        w2 {float} -- Arbitrary weight for Bigram (w1 + w2 < 1)
    """
    # One underlying classifier per n-gram order (unigram, bigram, trigram),
    # built in ascending order exactly as before.
    self.m1, self.m2, self.m3 = (
        NBClassifier(V=V, n=order, delta=delta,
                     train_file=train_file, test_file=test_file)
        for order in (1, 2, 3))
    self.w1 = w1
    self.w2 = w2
    # Remaining weight mass goes to the unigram term.
    self.w3 = 1 - (w1 + w2)
def vcClassifier(postType, featureDimension):
    """Plot accuracy versus snapshot hour for LR, SVM and NB classifiers.

    For each snapshot hour (1, 6, 12, 24) runs k-fold validation with
    k = 2..10 and hands the (hour, accuracies) series to GraphPlot.

    Arguments:
        postType {string} -- tag forwarded to GraphPlot when saving plots
        featureDimension {int} -- number of features in the dataset
    """
    gp = GraphPlotting.GraphPlot()
    hours = [1, 6, 12, 24]

    def _accuracy_series(score_one):
        # Build one (hour, [accuracy per k]) tuple per snapshot hour,
        # replacing three near-identical copy-pasted loops.
        series = []
        for hour in hours:
            print("For Hour: " + str(hour))
            data = 'CSV Files/data_std_hr' + str(hour) + '.csv'
            labels = 'CSV Files/label_std_hr' + str(hour) + '.csv'
            series.append((hour, [score_one(data, labels, k)
                                  for k in range(2, 11)]))
        return series

    # LR's kfold_validator returns (accuracy, coefficients); keep accuracy.
    gp.PlotGraph(
        _accuracy_series(lambda d, l, k: LRClassifier(
            d, l, k, featureDimension).kfold_validator()[0]),
        "Hours", "Accuracy", "Logistic Regression", postType)
    gp.PlotGraph(
        _accuracy_series(lambda d, l, k: SVMClassifier(
            d, l, k, featureDimension).kfold_validator()),
        "Hours", "Accuracy", "Support vector machines", postType)
    # Title typo fixed: the original read "Navie Bayes".
    gp.PlotGraph(
        _accuracy_series(lambda d, l, k: NBClassifier(
            d, l, k, featureDimension).kfold_validator()),
        "Hours", "Accuracy", "Naive Bayes", postType)
def main():
    """Entry point: train an NB classifier and save the model.

    Usage: python3 <script> [TRAIN FILE] -h(optional) [MODEL FILE]
    The optional -h flag additionally evaluates on held-out dev data.
    """
    usage = ('python3 ' + sys.argv[0] +
             ' [TRAIN FILE] -h(optional) [MODEL FILE]')
    if len(sys.argv) == 3:
        train_file, mod_file = sys.argv[1], sys.argv[2]
        dev_flag = False
    elif len(sys.argv) == 4 and sys.argv[2] == '-h':
        train_file, mod_file = sys.argv[1], sys.argv[3]
        dev_flag = True
    else:
        # Covers too few args, too many args, and an unrecognized middle
        # flag (the original silently accepted any 4th-arg form and ran
        # with empty paths for argc > 4).
        print(usage, file=sys.stderr)
        return
    util = Utility(train_file, mod_file, '')
    nbc = NBClassifier(True, False, dev_flag, util)
    nbc.train()
    # Persist the learned prior and parameters together.
    util.save_mod((nbc.prior, nbc.model_param))
    if dev_flag:
        nbc.model_eval()
from NBClassifier import NBClassifier
import project3_functions
import project3_datafunctions
import sys

# Train a Gaussian NB model on the dataset named on the command line
# (e.g. "Data/protein_dataset_800.csv"), save it next to the input as
# <dataset>.mdl, and report hold-out accuracy.
clf = NBClassifier()
X, y = project3_datafunctions.read_dataset(sys.argv[1])
X_train, X_test, y_train, y_test = project3_functions.split_data(X, y)
means_and_variances, priors = clf.fit(X_train, y_train)
# sys.argv[1][:-4] strips the 4-char ".csv" extension from the path.
project3_functions.save_model(means_and_variances, priors,
                              "{}.mdl".format(sys.argv[1][:-4]))
predictions = clf.predict(X_test, means_and_variances, priors)
accuracy = project3_functions.evaluate(predictions, y_test)
# Parenthesized print is valid in both Python 2 and Python 3
# (the original `print "..."` is a Py3 SyntaxError).
print("Accuracy: {}".format(accuracy))
# Train
dataset = loadDataset(TRAINPATHS)
classes = parseDataset(dataset)
# Optional preprocessing step (disabled): classes = cleanDataset(classes)
NBC = NBClassifier(classes, len(dataset))

# Test: classify every review and measure accuracy against its label.
testDataset = loadDataset(TESTPATHS)
correct = 0.0  # float so the division below is exact under Python 2 too
total = len(testDataset)
for review in testDataset:
    # review[0] -> raw review text, review[1] -> expected label.
    words = parseReview(review[0])
    result = NBC.classify(words)
    if result == review[1]:
        correct += 1
accuracy = correct / total * 100
# Parenthesized print is valid in both Python 2 and Python 3
# (the original `print accuracy` is a Py3 SyntaxError).
print(accuracy)
for fold in range(5):
    # 80/20 split of the index lists.  Slice boundaries must be integers:
    # the original used `0.8 * len(...)` directly, which raises TypeError
    # for list (and modern numpy) slicing.
    # NOTE(review): posInd/negInd are not reshuffled inside this loop, so
    # every fold sees the same split -- confirm that is intended.
    split_pos = int(0.8 * len(posTwt))
    split_neg = int(0.8 * len(negTwt))
    train_idx_pos, test_idx_pos = posInd[:split_pos], posInd[split_pos:]
    train_idx_neg, test_idx_neg = negInd[:split_neg], negInd[split_neg:]
    train_pos = [posTwt[i] for i in train_idx_pos]
    train_neg = [negTwt[i] for i in train_idx_neg]
    test = ([posTwt[i] for i in test_idx_pos] +
            [negTwt[i] for i in test_idx_neg])
    # Ground truth: 1 for positive tweets, 0 for negative, column vector.
    target = np.concatenate((np.ones((len(test_idx_pos), 1)),
                             np.zeros((len(test_idx_neg), 1))), axis=0)
    NBC = NBClassifier()
    NBC.train(train_pos, train_neg)
    SCC = SCClassifier(nbc=NBC)
    SCC.train(train_pos, train_neg)
    BGC = BGClassifier()
    BGC.train(train_pos, train_neg)
    score_1 = NBC.test(test)
    score_2 = SCC.test(test)
    score_3 = BGC.test(test)
    # Accumulate per-classifier scores and targets across folds.
    X_1 = np.append(X_1, score_1)
    X_2 = np.append(X_2, score_2)
    X_3 = np.append(X_3, score_3)
    Y = np.append(Y, target)
from NBClassifier import NBClassifier
import project3_functions
import project3_datafunctions
import sys

# Predict with a saved GNB model:
#   argv[1] = model file, argv[2] = raw input file,
#   argv[3], argv[4] = feature-extraction parameters,
#   argv[5] (optional) = ground-truth file for accuracy reporting.
clf = NBClassifier()
X_raw = project3_functions.read_file(sys.argv[2])
y_raw = None
if len(sys.argv) > 5:
    y_raw = project3_functions.read_file(sys.argv[5])
X = project3_datafunctions.extract_features(X_raw, sys.argv[3], sys.argv[4])
means_and_variances, priors = project3_functions.load_model(sys.argv[1])
predictions = clf.predict(X, means_and_variances, priors)
# Identity check replaces the unidiomatic `!= None`; the ground-truth
# printing stays inside the guard so a missing argv[5] cannot crash.
if y_raw is not None:
    accuracy = project3_functions.evaluate(predictions, y_raw)
    # Parenthesized print is valid in both Python 2 and Python 3.
    print("Accuracy: {}".format(accuracy))
    for seq in y_raw:
        print("Actual: {}".format("".join(seq)))
for seq in predictions:
    print("Predicted: {}".format("".join(seq)))
"C:\\Users\\Etienne\\workspace\\Sentiment-analysis-with-Naive-Bayes\\aclImdb\\test\\neg" ] #Train dataset = loadDataset(TRAINPATHS) # print "Done loading" classes = parseDataset(dataset) # print "Len before cleaning:", len(classes[0]) # # classes = cleanDataset(classes) # print "Len after cleaning:", len(classes[0]) # print "Done parsing" # exportClasses(classes) # print "Done exporting" NBC = NBClassifier(classes, len(dataset)) #Test testDataset = loadDataset(TESTPATHS) correct = 0.0 total = len(testDataset) for review in testDataset: words = parseReview(review[0]) result = NBC.classify(words) if result == review[1]: correct += 1 accuracy = correct / total * 100 print accuracy
from NBClassifier import NBClassifier
from BYOM import BYOM
import utils
import os
import shutil

if __name__ == '__main__':
    # Initialize classifier (alternative bigram configuration, kept for reference)
    # nbc = NBClassifier(V=2, n=2, delta=0.5, train_file="input/training-tweets.txt",
    # test_file="input/test-tweets-given.txt")

    # Initialize classifier: unigram model (n=1), no smoothing (delta=0),
    # vocabulary choice V=0.
    nbc = NBClassifier(V=0, n=1, delta=0,
                       train_file="input/training-tweets.txt",
                       test_file="input/test5.txt")

    # Import, train, test.
    train_df, test_df = nbc.import_data()
    nbc.train(train_df)
    out_df = nbc.predict(test_df)

    # Remove out/ folder if it exists in preparation for a new run.
    # (Cleanup is currently disabled; the path is still computed below.)
    desired_folder_path = os.path.join(os.getcwd(), "demo_results/")
    # if os.path.isdir(desired_folder_path):
    # shutil.rmtree(desired_folder_path, ignore_errors=True)
    # os.mkdir(desired_folder_path)

    # Output trace and evaluation file.
class BYOM():
    """New model built using linear interpolation of probabilities from
    each individual model (Unigram, Bigram, Trigram)
    """
    # NOTE(review): `global` in a class body makes `languages` a
    # module-level global rather than a plain class attribute.
    global languages
    languages = ['eu', 'ca', 'gl', 'es', 'en', 'pt']

    def __init__(self, V, delta, train_file, test_file, w1, w2):
        """Parameterized constructor for Build Your Own Model

        Arguments:
            V {int} -- Vocabulary choice (1=[a,z], 2=[a-zA-Z], 3=isalpha())
            delta {int} -- smoothing factor
            train_file {string} -- path to train file
            test_file {string} -- path to test file
            w1 {float} -- Arbitrary weight for Trigram
            w2 {float} -- Arbitrary weight for Bigram (w1 + w2 < 1)
        """
        # One underlying NBClassifier per n-gram order.
        self.m1 = NBClassifier(V=V, n=1, delta=delta, train_file=train_file,
                               test_file=test_file)
        self.m2 = NBClassifier(V=V, n=2, delta=delta, train_file=train_file,
                               test_file=test_file)
        self.m3 = NBClassifier(V=V, n=3, delta=delta, train_file=train_file,
                               test_file=test_file)
        self.w1 = w1
        self.w2 = w2
        # Remaining weight mass (1 - w1 - w2) goes to the unigram term.
        self.w3 = 1 - (w1 + w2)

    def train_all(self, train_df):
        """Trains all individual models with training data set
        """
        self.m1.train(train_df)
        self.m2.train(train_df)
        self.m3.train(train_df)

    def lin_interp_weighted_prob(self):
        """Creates new linearly interpolated probabilities for one model

        Overwrites the trigram table in place:
        P'(c3 | c1, c2) = w1*P3(c3|c1,c2) + w2*P2(c3|c2) + w3*P1(c3).
        """
        for language in languages:
            table1 = self.m1.selector[language].probs_table
            table2 = self.m2.selector[language].probs_table
            table3 = self.m3.selector[language].probs_table
            # NOTE(review): all three loops iterate table3's TOP-LEVEL keys;
            # this assumes every nesting level of the trigram table shares
            # the same key set (the character vocabulary) -- confirm,
            # otherwise the inner lookups may raise KeyError.
            for key1 in table3.keys():
                for key2 in table3.keys():
                    for key3 in table3.keys():
                        table3[key1][key2][key3] = self.w1 * table3[key1][
                            key2][key3] + self.w2 * table2[key2][
                                key3] + self.w3 * table1[key3]

    def predict_new(self, test_df):
        """Tests test dataset with new probabilities
        """
        return self.m3.predict(test_df)