Example #1
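# Concatenate PSSM-derived features, GNB predictions and decision-tree features
# for both sequences into one vector, then append a constant 1 (presumably the
# positive-pair label).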
def Get_Features(X):

    clf = NBClassifier()

    seq1, seq2 = X
    gnb1_pssm, lr1 = Blast_Utils.Get_PSSM(seq1 + ".pssm")
    gnb2_pssm, lr2 = Blast_Utils.Get_PSSM(seq2 + ".pssm")

    means_and_variances, priors = project3_functions.load_model(
        "800_sequence_model.mdl")
    gnb_predictions1 = clf.predict(gnb1_pssm, means_and_variances, priors)
    gnb_predictions2 = clf.predict(gnb2_pssm, means_and_variances, priors)

    gnb1_features = efe.Extract_GNB_Features(gnb_predictions1)
    gnb2_features = efe.Extract_GNB_Features(gnb_predictions2)

    dt1 = efe.Extract_Decision_Tree_Features(seq1 + ".fasta")
    dt2 = efe.Extract_Decision_Tree_Features(seq2 + ".fasta")

    lr1.extend(lr2)
    lr1.extend(gnb1_features)
    lr1.extend(gnb2_features)
    lr1.extend(dt1)
    lr1.extend(dt2)
    lr1.append(1)
    return lr1
Example #2
def main():
    if len(sys.argv) != 3:
        print('python3 ' + sys.argv[0] + ' [MODEL FILE] [TEST FILE]',
              file=sys.stderr)
        return
    mod_path = sys.argv[1]
    test_path = sys.argv[2]
    util = Utility('', mod_path, test_path)
    nbc = NBClassifier(False, True, False, util)
    nbc.classify()
Example #3
def Get_Features(X):

    clf = NBClassifier()

    seq1, seq2 = X
    gnb1_pssm, lr1 = Blast_Utils.Get_PSSM(seq1 + ".pssm")
    # gnb2_pssm, lr2 = Blast_Utils.Get_PSSM(seq2 + ".pssm")

    means_and_variances, priors = project3_functions.load_model(
        "800_sequence_model.mdl")
    gnb_predictions1 = clf.predict(gnb1_pssm, means_and_variances, priors)
    # gnb_predictions2 = clf.predict(gnb2_pssm, means_and_variances, priors)

    gnb1_features = efe.Extract_GNB_Features(gnb_predictions1)

    print(gnb1_features)
Example #4
def scoreClassifier(postType, featureDimension):
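    # For each classifier (MLP, LR, SVM, GNB), run k-fold cross-validation for
    # k = 2..10 and save an accuracy-vs-k scatter plot.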
    #MLP
    acc_list = []
    for k in range(2, 11):
        l = MLPClassifier('CSV Files_' + postType + '/data_std_hr24.csv',
                          'CSV Files_' + postType + '/label_hr24.csv',
                          k, featureDimension)
        acc_list.append(l.kfold_validator())
    t = np.arange(2, 11, 1)
    plt.plot(t, acc_list, 'ro')
    plt.ylabel('accuracy')
    plt.xlabel('K-fold')
    plt.title('MLP Classifier')
    plt.savefig('Images_' + postType + "/"+"MLP_Classifier.png")
    plt.close()
    #plt.show()
    #LR
    acc_list = []
    avgCoeff = np.zeros(shape=(1,featureDimension))
    for k in range(2, 11):
        l = LRClassifier('CSV Files_' + postType + '/data_std_hr24.csv',
                         'CSV Files_' + postType + '/label_hr24.csv',
                         k, featureDimension)
        accuracy, lrCoeff = l.kfold_validator()
        acc_list.append(accuracy)
        avgCoeff = avgCoeff + lrCoeff

    avgCoeff /= 9  # average the coefficients over the nine folds (k = 2..10)
    print(avgCoeff)
    t = np.arange(2, 11, 1)
    plt.plot(t, acc_list, 'ro')
    plt.ylabel('accuracy')
    plt.xlabel('K-fold')
    plt.title('Logistic Regression')
    plt.savefig('Images_' + postType + "/" + "Logistic_Regression.png")
    plt.close()
    #plt.show()
    #SVM
    acc_list = []
    for k in range(2, 11):
        l = SVMClassifier('CSV Files_' + postType + '/data_std_hr24.csv',
                          'CSV Files_' + postType + '/label_hr24.csv',
                          k, featureDimension)
        acc_list.append(l.kfold_validator())
    t = np.arange(2, 11, 1)
    plt.plot(t, acc_list, 'ro')
    plt.ylabel('accuracy')
    plt.xlabel('K-fold')
    plt.title('Support Vector Machine')
    plt.savefig('Images_' + postType + "/" + "Support_Vector_Machine.png")
    plt.close()
    #plt.show()
    #NB
    acc_list = []
    for k in range(2, 11):
        l = NBClassifier('CSV Files_' + postType + '/data_std_hr24.csv',
                         'CSV Files_' + postType + '/label_hr24.csv',
                         k, featureDimension)
        acc_list.append(l.kfold_validator())
    t = np.arange(2, 11, 1)
    plt.plot(t, acc_list, 'ro')
    plt.ylabel('accuracy')
    plt.xlabel('K-fold')
    plt.title('Gaussian Naive Bayes')
    plt.savefig('Images_' + postType + "/" + "Gaussian_Naive_Bayes.png")
    plt.close()
Example #5
    def __init__(self, V, delta, train_file, test_file, w1, w2):
        """Parameterized constructor for Build Your Own Model

        Arguments:
            V {int} -- Vocabulary choice (1=[a-z], 2=[a-zA-Z], 3=isalpha())
            delta {float} -- smoothing factor
            train_file {string} -- path to train file
            test_file {string} -- path to test file
            w1 {float} -- Arbitrary weight for Trigram
            w2 {float} -- Arbitrary weight for Bigram (w1 + w2 < 1)
        """

        self.m1 = NBClassifier(V=V,
                               n=1,
                               delta=delta,
                               train_file=train_file,
                               test_file=test_file)
        self.m2 = NBClassifier(V=V,
                               n=2,
                               delta=delta,
                               train_file=train_file,
                               test_file=test_file)
        self.m3 = NBClassifier(V=V,
                               n=3,
                               delta=delta,
                               train_file=train_file,
                               test_file=test_file)
        self.w1 = w1
        self.w2 = w2
        self.w3 = 1 - (w1 + w2)
Example #6
def vcClassifier(postType, featureDimension):
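    # For each prediction window (1, 6, 12 and 24 hours), collect k-fold CV
    # accuracies for k = 2..10 and plot them per classifier.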

    gp = GraphPlotting.GraphPlot()

    hours = [1, 6, 12, 24]
    graph_list = []
    #LR
    for hour in hours:
        acc_list = []
        print("For Hour: " + str(hour))
        for k in range(2, 11):
            l = LRClassifier('CSV Files/data_std_hr' + str(hour) + '.csv',
                             'CSV Files/label_std_hr' + str(hour) + '.csv', k,
                             featureDimension)
            acc_list.append(l.kfold_validator()[0])
        graph_list.append((hour, acc_list))
    gp.PlotGraph(graph_list, "Hours", "Accuracy", "Logistic Regression",
                 postType)

    #SVM
    graph_list = []
    for hour in hours:
        acc_list = []
        print("For Hour: " + str(hour))
        for k in range(2, 11):
            l = SVMClassifier('CSV Files/data_std_hr' + str(hour) + '.csv',
                              'CSV Files/label_std_hr' + str(hour) + '.csv', k,
                              featureDimension)
            acc_list.append(l.kfold_validator())
        graph_list.append((hour, acc_list))
    gp.PlotGraph(graph_list, "Hours", "Accuracy", "Support Vector Machines",
                 postType)

    #NB
    graph_list = []
    for hour in hours:
        acc_list = []
        print("For Hour: " + str(hour))
        for k in range(2, 11):
            l = NBClassifier('CSV Files/data_std_hr' + str(hour) + '.csv',
                             'CSV Files/label_std_hr' + str(hour) + '.csv', k,
                             featureDimension)
            acc_list.append(l.kfold_validator())
        graph_list.append((hour, acc_list))
    gp.PlotGraph(graph_list, "Hours", "Accuracy", "Naive Bayes", postType)
Example #7
def main():
    if len(sys.argv) < 3:
        print('python3 ' + sys.argv[0] +
              ' [TRAIN FILE] -h(optional) [MODEL FILE]',
              file=sys.stderr)
        return
    train_file = ''
    mod_file = ''
    dev_flag = False
    if len(sys.argv) == 3:
        train_file = sys.argv[1]
        mod_file = sys.argv[2]
    elif len(sys.argv) == 4:
        train_file = sys.argv[1]
        mod_file = sys.argv[3]
        if sys.argv[2] == '-h':
            dev_flag = True
    util = Utility(train_file, mod_file, '')
    nbc = NBClassifier(True, False, dev_flag, util)
    nbc.train()
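    # Persist the learned prior and per-class model parameters.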
    util.save_mod((nbc.prior, nbc.model_param))
    if dev_flag:
        nbc.model_eval()
Example #8
from NBClassifier import NBClassifier
import project3_functions
import project3_datafunctions
import sys

clf = NBClassifier()
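# The classifier is stateless here: fit returns (means_and_variances, priors),
# which predict takes back explicitly.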

X, y = project3_datafunctions.read_dataset(
    sys.argv[1])  # "Data/protein_dataset_800.csv"
X_train, X_test, y_train, y_test = project3_functions.split_data(X, y)

means_and_variances, priors = clf.fit(X_train, y_train)
project3_functions.save_model(means_and_variances, priors,
                              "{}.mdl".format(sys.argv[1][:-4]))
predictions = clf.predict(X_test, means_and_variances, priors)

accuracy = project3_functions.evaluate(predictions, y_test)
print "Accuracy: {}".format(accuracy)

Example #10
for fold in range(5):
    # This is ugly, but works for now: the same 80/20 split is reused on every
    # fold (slice bounds must be ints).
    split_pos = int(0.8 * len(posTwt))
    split_neg = int(0.8 * len(negTwt))
    train_idx_pos, test_idx_pos = posInd[:split_pos], posInd[split_pos:]
    train_idx_neg, test_idx_neg = negInd[:split_neg], negInd[split_neg:]
    train_pos = [posTwt[i] for i in train_idx_pos]
    train_neg = [negTwt[i] for i in train_idx_neg]
    test = [posTwt[i]
            for i in test_idx_pos] + [negTwt[i] for i in test_idx_neg]
    target = np.concatenate(
        (np.ones((len(test_idx_pos), 1)), np.zeros((len(test_idx_neg), 1))),
        axis=0)
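    # Train the three base classifiers on the same split, then score each on
    # the shared test set.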
    NBC = NBClassifier()
    NBC.train(train_pos, train_neg)
    SCC = SCClassifier(nbc=NBC)
    SCC.train(train_pos, train_neg)
    BGC = BGClassifier()
    BGC.train(train_pos, train_neg)

    score_1 = NBC.test(test)
    score_2 = SCC.test(test)
    score_3 = BGC.test(test)

    X_1 = np.append(X_1, score_1)
    X_2 = np.append(X_2, score_2)
    X_3 = np.append(X_3, score_3)
    Y = np.append(Y, target)
Example #11
from NBClassifier import NBClassifier
import project3_functions
import project3_datafunctions
import sys

clf = NBClassifier()
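# Featurize the input, predict with a previously saved model, and report
# accuracy when a labels file is supplied.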

X_raw = project3_functions.read_file(sys.argv[2])
y_raw = None
if len(sys.argv) > 5:
    y_raw = project3_functions.read_file(sys.argv[5])

X = project3_datafunctions.extract_features(X_raw, sys.argv[3], sys.argv[4])

means_and_variances, priors = project3_functions.load_model(sys.argv[1])
predictions = clf.predict(X, means_and_variances, priors)

if y_raw is not None:
    accuracy = project3_functions.evaluate(predictions, y_raw)
    print("Accuracy: {}".format(accuracy))
    for seq in y_raw:
        print("Actual: {}".format("".join(seq)))
for seq in predictions:
    print("Predicted: {}".format("".join(seq))) 


Example #12
    "C:\\Users\\Etienne\\workspace\\Sentiment-analysis-with-Naive-Bayes\\aclImdb\\test\\neg"
]

#Train
dataset = loadDataset(TRAINPATHS)
# print "Done loading"

classes = parseDataset(dataset)
# print "Len before cleaning:", len(classes[0])
# # classes = cleanDataset(classes)
# print "Len after cleaning:", len(classes[0])
# print "Done parsing"

# exportClasses(classes)
# print "Done exporting"

NBC = NBClassifier(classes, len(dataset))

#Test
testDataset = loadDataset(TESTPATHS)

correct = 0.0
total = len(testDataset)
for review in testDataset:
    words = parseReview(review[0])
    result = NBC.classify(words)
    if result == review[1]:
        correct += 1

accuracy = correct / total * 100
print(accuracy)
Example #13
from NBClassifier import NBClassifier
from BYOM import BYOM
import utils
import os
import shutil

if __name__ == '__main__':

    # Initialize classifier
    # nbc = NBClassifier(V=2, n=2, delta=0.5, train_file="input/training-tweets.txt",
    #                    test_file="input/test-tweets-given.txt")

    # Initialize classifier
    nbc = NBClassifier(V=0,
                       n=1,
                       delta=0,
                       train_file="input/training-tweets.txt",
                       test_file="input/test5.txt")

    # Import, train, test.
    train_df, test_df = nbc.import_data()
    nbc.train(train_df)
    out_df = nbc.predict(test_df)

    # Remove the demo_results/ folder if it exists, in preparation for a new run.
    desired_folder_path = os.path.join(os.getcwd(), "demo_results/")
    # if os.path.isdir(desired_folder_path):
    #     shutil.rmtree(desired_folder_path, ignore_errors=True)
    # os.mkdir(desired_folder_path)

    # Output trace and evaluation file.
Example #14
class BYOM():
    """New model built using linear interpolation of probabilities
    from each individual model (Unigram, Bigram, Trigram)
    """
    global languages
    languages = ['eu', 'ca', 'gl', 'es', 'en', 'pt']

    def __init__(self, V, delta, train_file, test_file, w1, w2):
        """Parameterized constructor for Build Your Own Model

        Arguments:
            V {int} -- Vocabulary choice (1=[a-z], 2=[a-zA-Z], 3=isalpha())
            delta {float} -- smoothing factor
            train_file {string} -- path to train file
            test_file {string} -- path to test file
            w1 {float} -- Arbitrary weight for Trigram
            w2 {float} -- Arbitrary weight for Bigram (w1 + w2 < 1)
        """

        self.m1 = NBClassifier(V=V,
                               n=1,
                               delta=delta,
                               train_file=train_file,
                               test_file=test_file)
        self.m2 = NBClassifier(V=V,
                               n=2,
                               delta=delta,
                               train_file=train_file,
                               test_file=test_file)
        self.m3 = NBClassifier(V=V,
                               n=3,
                               delta=delta,
                               train_file=train_file,
                               test_file=test_file)
        self.w1 = w1
        self.w2 = w2
        self.w3 = 1 - (w1 + w2)

    def train_all(self, train_df):
        """Trains all individual models with training data set
        """

        self.m1.train(train_df)
        self.m2.train(train_df)
        self.m3.train(train_df)

    def lin_interp_weighted_prob(self):
        """Creates new linearly interpolated probabilities for one model
        """

        for language in languages:
            table1 = self.m1.selector[language].probs_table
            table2 = self.m2.selector[language].probs_table
            table3 = self.m3.selector[language].probs_table

            # Assumes every level of the nested probability tables is keyed by
            # the same character vocabulary, so table3.keys() serves all three
            # loop levels.
            for key1 in table3.keys():
                for key2 in table3.keys():
                    for key3 in table3.keys():
                        table3[key1][key2][key3] = (
                            self.w1 * table3[key1][key2][key3]
                            + self.w2 * table2[key2][key3]
                            + self.w3 * table1[key3])

    def predict_new(self, test_df):
        """Tests test dataset with new probabilities
        """
        return self.m3.predict(test_df)