Example #1
def getSolution(self):
    # classify each character CSV and join the results into one string
    classifier = NaiveBayesClassifier()
    solution = ''.join([
        classifier.getClassification(jchar_CSV)
        for jchar_CSV in self.jchar_CSV_list
    ])
    return solution
Example #2
    def summarize_text(self, sites, articles):
        if not articles:
            return ["Not enough information about the player"]

        summary_methods = NaiveBayesClassifier()

        return summary_methods.get_summary(sites, articles)
Example #3
def NBCTest():
    nbc = NaiveBayesClassifier()

    sites = [
        "https://www.theplayerstribune.com/doublelift-league-of-legends-everyone-else-is-trash/"
    ]
    article_extractor = articles.ArticleExtractor('doublelift',
                                                  'league of legends', 5)
    #sites = article_extractor.get_websites()
    article = [article_extractor.parse_websites(site) for site in sites]

    string_text = list_to_string(article)
    string_text = string_text.replace("', '", ' ')
    string_text = string_text.replace('", "', ' ')

    summary = nbc.get_summary(string_text, 5)

    print("Summary:")
    for sentence in summary:
        print(u'\u2022 ' + sentence.lstrip("[]1234567890',.\" "))
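list_to_string is not defined in this snippet, but the quote-stripping replace() calls above suggest it simply stringifies the list. A minimal sketch under that assumption (name kept from the call site; behavior inferred, not confirmed):

def list_to_string(items):
    # hypothetical helper: str() of a list of strings yields "['a', 'b']",
    # which is exactly the quoting the replace() calls above clean up
    return str(items)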
Example #4
def main():
    data = []
    for verdict in ['spam', 'not_spam']:
        for files in glob.glob(PATH + verdict + "/*")[:500]:
            is_spam = (verdict == 'spam')
            with open(files, "r", encoding='utf-8', errors='ignore') as f:
                for line in f:
                    if line.startswith("Subject:"):
                        subject = re.sub("^Subject: ", "", line).strip()
                        data.append((subject, is_spam))

    random.seed(0)
    train_data, test_data = split_data(data, 0.75)
    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    print("Spam" if classifier.classify("Get free laptops now!") > 0.5 else
          "Not Spam")

    classified = [(subject, is_spam, classifier.classify(subject))
                  for subject, is_spam in test_data]

    count = Counter((is_spam, spam_probability > 0.5)
                    for _, is_spam, spam_probability in classified)

    spammiest_hams, hammiest_spams = most_misclassified(classified)

    print("Accuracy: ", accuracy(count))
    print("Precision: ", precision(count))
    print("Recall: ", recall(count))
    print("\nTop 5 falsely classified as spam:\n\n", spammiest_hams)
    print("\nTop 5 falsely classified as not spam:\n\n", hammiest_spams)
    print("\nMost spammiest words: ", spammiest_word(classifier))
Example #5
def spamFilterChecker():
    print('Received the JAVA Request!')
    # Get the text data from the JAVA Program.
    req_data = abhishek_request.get_json()
    text_to_be_classified = req_data['text_to_be_classified']
    print(text_to_be_classified)

    # ----------------------------------------------------------------------------
    # Train a Naive Bayes classifier on the local spam corpus.
    # ----------------------------------------------------------------------------
    data = []
    for verdict in ['spam', 'not_spam']:
        for files in glob.glob(PATH + verdict + "/*")[:500]:
            is_spam = (verdict == 'spam')
            with open(files, "r", encoding='utf-8', errors='ignore') as f:
                for line in f:
                    if line.startswith("Subject:"):
                        subject = re.sub("^Subject: ", "", line).strip()
                        data.append((subject, is_spam))

    random.seed(0)
    train_data, test_data = split_data(data, 0.80)
    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    json_response = ""
    value = classifier.classify(text_to_be_classified)
    if value < 0.9:
        json_response = "{'email_class' : 'spam'}"
    else:
        json_response = "{'email_class' : 'ham'}"
    print("====================================================")
    print("POSSIBILITY OF HAM : ", value)
    print(json_response)
    print("====================================================")
    return json_response
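Building the JSON payload by hand is brittle; the standard library guarantees valid output. A small alternative sketch for the block above:

import json

value = classifier.classify(text_to_be_classified)
json_response = json.dumps({'email_class': 'spam' if value < 0.9 else 'ham'})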
Example #6
import pandas

from naive_bayes_classifier import NaiveBayesClassifier

df = pandas.read_csv("weather.csv", sep=";")

naive_bayes = NaiveBayesClassifier(df, 0.6)
naive_bayes.train_algorithm()

accuracy = naive_bayes.test_algorithm()
print("Algorithm Accuracy : " + str(accuracy) + " %")
tuple_data = ('Overcast', 'Hot', 'Normal', True)
prediction = naive_bayes.predict(tuple_data)
print("Prediction for " + str(tuple_data) + " is " + prediction)
Example #7
# SVM configuration
svm_classifier = SvmClassifier()
svm_classifier.setSvm(svm)

# Random forest configuration
rf = RfModule()
rf_classifier = RfClassifier()
rf_classifier.setRf(rf)

# Decision tree configuration
dt = DecisionTreeModule()
dt_classifier = DecisionTreeClassifier()
dt_classifier.setDecisionTree(dt)

# Naive Bayes configuration
naive_bayes = NaiveBayesModule()
naive_bayes_classifier = NaiveBayesClassifier()
naive_bayes_classifier.setNaiveBayes(naive_bayes)

# LSTM configuration
lstm = LstmModule()
lstm.setInputLength(20)
lstm.setNumberExamples(1000)
lstm_classifier = LstmClassifier()
lstm_classifier.setLstm(lstm)

# Neural network configuration
rna = RnaModule()
rna.setNumberNeuronsImputLayer(20)
rna.setActivationFunctionImputLayer("tanh")
rna.setImputDimNeurons(20)
rna.setNumberNeuronsHiddenLayer(20)
Example #9
import unittest

import sure  # noqa: F401 -- importing `sure` enables the `.should` assertions used below

class TestClassifier(unittest.TestCase):

    def setUp(self):
        self.examples = {'university': ['''Abbottabad Public School , also commonly referred to as
        APS and Railway Public School , is a private , all boys , boarding
        school for , 7th to 12th grade students , located in Abbottabad ,
        Pakistan .''']}
        self.classifier = NaiveBayesClassifier(self.examples)

    def test_create_vocabulary(self):
        self.classifier.vocabulary.should.contain('private')

    def test_vocabulary_size(self):
        self.classifier.vocabulary_size.should.eql(28)

    def test_subset_of_documents_with_target_value(self):
        len(self.classifier.get_documents_with_target_value('university')).should.eql(1)

    def test_text_of_documents(self):
        documents = self.classifier.get_documents_with_target_value('university')
        self.classifier.get_text(documents).should.contain('private')

    def test_text_distinct_words(self):
        documents = self.classifier.get_documents_with_target_value('university')
        text = self.classifier.get_text(documents)
        self.classifier.get_text_diff_words_count(text).should.eql(28)

    def test_example_count(self):
        self.classifier.get_example_count().should.eql(1)

    def test_occurrences_of_word_count(self):
        documents = self.classifier.get_documents_with_target_value('university')
        text = self.classifier.get_text(documents)
        self.classifier.occurrences_count(',', text).should.eql(7)

    def test_learn(self):
        self.classifier.learn()

    def test_word_positions_in_doc(self):
        documents = self.classifier.get_documents_with_target_value('university')
        len(self.classifier.word_positions(documents[0])).should.eql(38)

    def test_classify(self):
        self.classifier.learn()
        self.classifier.classify(self.examples['university'][0]).should.eql('university')
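With the third-party sure library installed for the .should syntax, the suite runs under the standard unittest runner:

if __name__ == '__main__':
    unittest.main()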
Example #10
X_train = []  # missing in the original snippet, but appended to below
y_train = []
X_test = []
y_test = []

for i in range(len(train)):
    y_train.append(train[i][0])
    X_train.append(train[i][1:])
    
for i in range(len(test)):
    y_test.append(test[i][0])
    X_test.append(test[i][1:])


# Instantiate the Naive Bayes classifier model made from scratch

model = NaiveBayesClassifier()

# Fit the model

model.fit(X_train, y_train)

# Make predictions

y_pred = model.predict(X_test)

# Print the accuracy of the model

print("NaiveBayesClassifier accuracy: {0:.3f}".format(model.accuracy(y_test, y_pred)))


# Instantiate the Gaussian Naive Bayes classifier model from Scikit-learn
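The snippet is cut off at this final comment; a minimal sketch of the scikit-learn comparison it announces, assuming the same train/test splits as above:

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

sk_model = GaussianNB()
sk_model.fit(X_train, y_train)
y_pred_sk = sk_model.predict(X_test)
print("GaussianNB accuracy: {0:.3f}".format(accuracy_score(y_test, y_pred_sk)))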
Example #11
	time_period = "daily"
	keywords = "hillary-trump"
	training_required = 0
#	log('Failed to load configurations; error in function loadConfig; exiting...', 'error')
#	print 'FAILED to load configurations'


# check if the model dump file is missing
#if training_required == 0:
#	if not os.path.exists(classifier):
#		training_required = 1

# train a model if required
if training_required:
	tweets = []
	nb = NaiveBayesClassifier(tweets, keywords, time_period, training_data, classifier, training_required)

# create the HBase tweets table if missing
print "Checking database tables..."
try:
	connection = happybase.Connection('localhost')
	connection.create_table('tweets',
		{'keyword': dict(max_versions=10),
		'sentiment': dict(max_versions=10),
		'tweet': dict(max_versions=10)
		}
	)
	print "...OK"
except:
	print "Table already exists"
#	log('Table already exists; skipping creation.')
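Catching an exception to detect an existing table is fragile; happybase can list tables directly, so an alternative sketch of the same check:

connection = happybase.Connection('localhost')
if b'tweets' not in connection.tables():
	connection.create_table('tweets', {
		'keyword': dict(max_versions=10),
		'sentiment': dict(max_versions=10),
		'tweet': dict(max_versions=10),
	})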
Example #12
# glob.glob returns every filename that matches the wildcarded path
for fn in glob.glob(path):
    is_spam = "ham" not in fn

    with open(fn, "r") as file:
        for line in file:
            if line.startswith("Subject:"):
                # remove the leading "Subject: " and keep what's left
                subject = re.sub(r"^Subject: ", "", line).strip()
                data.append((subject, is_spam))

random.seed(0)
train_data, test_data = split_data(data, 0.75)

classifier = NaiveBayesClassifier()
classifier.train(train_data)

# triplets (subject, actual is_spam, predicted spam probability)
classified = [(subject, is_spam, classifier.classify(subject)) for subject, is_spam in test_data]

# assume that spam_probability > 0.5 corresponds to spam prediction
# and count the combinations of (actual is_spam, predicted is_spam)
counts = Counter((is_spam, spam_probability > 0.5) for _, is_spam, spam_probability in classified)

# sort by spam_probability from smallest to largest
classified.sort(key=lambda row: row[2])

# the highest predicted spam probabilities among the non-spams
spammiest_hams = [row for row in classified if not row[1]][-5:]
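The example stops after the hams; the symmetric query (actual spams the classifier was most confident are ham) follows the same pattern under the ascending sort:

# the lowest predicted spam probabilities among the actual spams
hammiest_spams = [row for row in classified if row[1]][:5]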
Example #13
def boolize_data(array):
    # reconstructed header: the original snippet starts mid-function; the
    # name matches the call site below, and unique_data is assumed to be
    # the sorted distinct values of the input
    unique_data = sorted(set(array))
    mid_point = len(unique_data) // 2
    high = unique_data[mid_point:]
    low = unique_data[:mid_point]
    fixed_array = []
    for element in array:
        if element in low:
            fixed_array.append(False)
        elif element in high:
            fixed_array.append(True)
    return fixed_array


naive_bayes_classifier_random_50_dataframe_1 = DataFrame.from_array(
    [boolize_data(row) for row in dataframe.to_array()], dataframe.columns)
naive_bayes_classifier = NaiveBayesClassifier(
    dataframe=naive_bayes_classifier_random_50_dataframe_1,
    dependent_variable='Survived')
naive_bayes_classifier_classifications = get_classifications(
    naive_bayes_classifier, testing_dataframe)
for row in naive_bayes_classifier_classifications:
    print(row)
print('\n')

max_depth_5_decision_tree = DecisionTree(
    dataframe=dataframe,
    class_name='Survived',
    features=[column for column in dataframe.columns if column != 'Survived'],
    max_depth=5)
max_depth_5_decision_tree.fit()
max_depth_5_decision_tree_classifications = get_classifications(
    max_depth_5_decision_tree, testing_dataframe)
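get_classifications is not defined in the snippet; a plausible sketch, assuming each fitted model exposes a classify method and the test dataframe iterates via to_array() as above:

def get_classifications(model, testing_dataframe):
    # hypothetical helper: classify every row of the test set
    return [model.classify(row) for row in testing_dataframe.to_array()]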
Example #14
import pandas as pd
from naive_bayes_classifier import NaiveBayesClassifier

if __name__ == '__main__':
    df = pd.read_csv('spam.csv', encoding='ISO-8859-1')
    df = df.sample(frac=1.0)

    spam_data, legit_data = [], []

    for _, row in df.iterrows():
        if row['v1'] == 'spam':
            spam_data.append(row['v2'])
        else:
            legit_data.append(row['v2'])

    NB_classifier = NaiveBayesClassifier()

    spam_train = spam_data[:int(len(spam_data) * 2 / 3)]
    spam_test = spam_data[int(len(spam_data) * 2 / 3):]

    legit_train = legit_data[:int(len(legit_data) * 2 / 3)]
    legit_test = legit_data[int(len(legit_data) * 2 / 3):]

    NB_classifier.train(spam_train, legit_train)

    spam_accuracy = 0
    legit_accuracy = 0

    for text in spam_test:
        prediction = NB_classifier.predict(text)
        spam_accuracy += prediction
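The snippet is cut off mid-evaluation; a plausible completion, assuming predict returns 1 for spam and 0 for legitimate mail:

    for text in legit_test:
        prediction = NB_classifier.predict(text)
        legit_accuracy += 1 - prediction

    print('spam accuracy:', spam_accuracy / len(spam_test))
    print('legit accuracy:', legit_accuracy / len(legit_test))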
Example #15
df = DataFrame.from_array(
    [
        [False, False, False],
        [True, True, True],
        [True, True, True],
        [False, False, False],
        [False, True, False],
        [True, True, True],
        [True, False, False],
        [False, True, False],
        [True, False, True],
        [False, True, False]
    ],
    columns = ['errors', 'links', 'scam']
)
naive_bayes = NaiveBayesClassifier(df, dependent_variable='scam')

print("Testing Probabilities")
assert naive_bayes.probability('scam', True) == 0.4

assert naive_bayes.probability('scam', False) == 0.6
print("passed")

print("Testing Conditional Probabilities")
assert naive_bayes.conditional_probability(('errors',True), given=('scam',True)) == 1.0

assert naive_bayes.conditional_probability(('links',False), given=('scam',True)) == 0.25

assert naive_bayes.conditional_probability(('errors',True), given=('scam',False)) == 0.16666666666666666

assert naive_bayes.conditional_probability(('links',False), given=('scam',False)) == 0.5
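Each asserted value is a simple ratio of row counts: all four scam rows have errors == True, so P(errors=True | scam=True) = 4/4 = 1.0, while only one of the six non-scam rows does, giving P(errors=True | scam=False) = 1/6 ≈ 0.1667.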