def classifyNonClusteredJira(self):
    """Classify Jira issues whose label is not one of the known initial
    clusters, using a classifier trained on the already-clustered issues.

    Side effects: extends ``self.issueSet``, writes each newly classified
    issue to ``self.nonClusteredJirasAfterClusteringFile`` and closes it.

    Returns:
        The accumulated issue set (dicts with 'class' and 'sentence' keys).
    """
    # Pass 1: collect training examples from rows already labelled with a
    # known initial cluster.
    for index, row in self.df.iterrows():
        clusterName = row['Labels']
        keyWords = row['KeyWords']
        if clusterName in constantsObj.INITIAL_CLUSTERS:
            self.issueSet.append({"class": clusterName, "sentence": keyWords})

    for issue in self.issueSet:
        self.jiraTrainer.train(issue['sentence'], issue['class'])
    jiraClassifier = Classifier(self.jiraTrainer.data, tokenizer)

    # Pass 2: classify every remaining row and record the top-ranked cluster.
    for index, row in self.df.iterrows():
        clusterName = row['Labels']
        keyWords = row['KeyWords']
        if clusterName not in constantsObj.INITIAL_CLUSTERS:
            # classify() returns (category, score) pairs sorted by score;
            # plain indexing replaces the old `.__getitem__(0)` calls.
            identifiedCluster = jiraClassifier.classify(keyWords)[0][0]
            self.issueSet.append({
                "class": identifiedCluster,
                "sentence": keyWords
            })
            self.nonClusteredJirasAfterClusteringFile.write(
                "%s --- %s\n" % (keyWords, identifiedCluster))
    self.nonClusteredJirasAfterClusteringFile.close()
    return self.issueSet
def __call__(self, text):
    """Classify *text* against the 'hasKey' training set stored in the
    bayesFilter registry record.

    Returns:
        The classifier's (category, score) result list.
    """
    # Each non-empty registry line is one 'hasKey' training sample.
    bayesFilter = api.portal.get_registry_record(
        'i8d.content.browser.coverSetting.ICoverSetting.bayesFilter')
    trainingSet = [{'category': 'hasKey', 'text': safe_unicode(line)}
                   for line in bayesFilter.split('\n')]

    trainer = Trainer(tokenizer)
    for record in trainingSet:
        trainer.train(record['text'], record['category'])
    classifier = Classifier(trainer.data, tokenizer)

    # FIX: a leftover `pdb.set_trace()` debugging breakpoint was removed and
    # the classification result is now returned instead of being discarded.
    # Unused locals (context/request/response/catalog) were dropped as well.
    return classifier.classify(safe_unicode(text))
def train_spam_texts():
    """Train a naive-Bayes classifier on the 'adverts' spam dataset.

    Reads data/assets/spam_texts.json (BOM-tolerant utf_8_sig) and trains
    one category ('adverts') from its Russian adverts section.

    Returns:
        The trained Classifier (previously it was built and discarded).
    """
    dataset_lang = "ru"
    dataset_path = os.path.join(os.path.abspath(os.curdir),
                                "data", "assets", "spam_texts.json")
    # FIX: close the dataset file deterministically instead of leaking it.
    with codecs.open(dataset_path, "r", "utf_8_sig") as dataset_file:
        dataset_data = json.load(dataset_file)

    # Preparing adverts spam dataset
    prepared_dataset = [{"text": item["text"], "category": "adverts"}
                        for item in dataset_data[dataset_lang]["adverts"]]

    # Training
    # (Will be replaced by another library soon)
    advertsTrainer = Trainer(tokenizer)
    for one_dataset_item in prepared_dataset:
        advertsTrainer.train(one_dataset_item["text"],
                             one_dataset_item["category"])
    adverts_classifier = Classifier(advertsTrainer.data, tokenizer)

    # Usage example:
    #   classification = adverts_classifier.classify("рассылка")
    #   category_chance = classification[0][1]
    return adverts_classifier
def classifyNewJiraToOneOfTheClusters(self, inputTrainingData, inputJira):
    """Train on the supplied labelled issues and classify one new Jira.

    Args:
        inputTrainingData: iterable of dicts with 'sentence' and 'class'.
        inputJira: text of the new issue to classify.

    Returns:
        The classifier's result for *inputJira*.
    """
    for trainingItem in inputTrainingData:
        self.jiraTrainer.train(trainingItem['sentence'], trainingItem['class'])
    classifier = Classifier(self.jiraTrainer.data, tokenizer)
    return classifier.classify(inputJira)
def getKeywords(self, html): text = self.getHtml2Text(html) # print text text = self.zhsJieba(text) #取得registry reg = api.portal.get_registry_record('mingjing.content.browser.mjnetSetting.IMJNetSetting.catDict') trainSet = [] for item in reg: key = item.split('|||')[0] for line in reg[item].split('\n'): zhsString = self.zhsJieba(line) trainSet.append({'category': key, 'text': zhsString}) #用簡單貝氏分類文章 newsTrainer = Trainer(tokenizer) for news in trainSet: newsTrainer.train(news['text'].encode('utf-8'), news['category']) newsClassifier = Classifier(newsTrainer.data, tokenizer) classification = newsClassifier.classify(text) print classification # import pdb; pdb.set_trace() if classification[0][1] == 0.0: classification.insert(0, (u'n99', 0.0)) result = [] for item in classification: result.append(item[0]) return result
def classificationNB(index): ''' Train the Naive Bayes classifier and classify data naiveBayesClassifier is used. https://github.com/muatik/naive-bayes-classifier ''' # Initial training set from file trainset = [] f = open('E:\\databases\\trainset.txt', 'r') for line in f: if len(line.strip()) == 0: continue line = line.strip().split() assert len(line) == 22 trainset.append({ 'text': '%08d' % int(line[(index + 1) * 2]), 'category': line[(index + 1) * 2 + 1] }) pass # for line in f f.close() # Train the classifier trainer = Trainer(tokenizer) for case in trainset: trainer.train(case['text'], case['category']) classifier = Classifier(trainer.data, tokenizer) # Classification for each of the rest sets for i in range(10): if index == i: continue print '%-2d ~ %-2d' % (index, i) # Read cases from the file and classify each case f = open('E:\\databases\\classification%02d.txt' % (i + 1), 'r') results = [] count = 0 for line in f: count += 1 line = line.strip() if len(line) == 0: continue if count == 1: # the first line -- title header = 'CAT%02d' % (index + 1) assert header not in line results.append('%s\t%s' % (line, header)) continue pass # if count == 1 case = line.split() assert len(case) >= 4 clf = classifier.classify(case[2]) results.append('%s\t%s' % (line, clf)) pass # for line in f f.close() # Save the results back to the file f = open('E:\\databases\\classification%02d.txt' % (i + 1), 'w') for re in results: f.write('%s\n' % re) f.close() pass # for i in range(10)
def train(self):
    """Train on base and FB data"""
    # Base data: header row skipped; the text column depends on the class
    # label due to a spreadsheet inconsistency.
    with open('res/data/base_data.csv', 'r') as csv_file:
        for row_num, fields in enumerate(csv.reader(csv_file), start=1):
            if row_num == 1 or len(fields) <= 2 or len(fields[0]) == 0:
                continue
            record = {'class': fields[2].strip()}
            # Accounting for our inconsistency in Spreadsheet
            if record["class"] == "Real":
                record['text'] = fields[6].strip()
            else:
                record['text'] = fields[5].strip()
            print(record)
            self.training_data.append(record)
    print('---->>>>>><<<<<<<-------')
    # FB data: fixed column layout.
    with open('res/data/fb_data.csv', 'r') as csv_file:
        for row_num, fields in enumerate(csv.reader(csv_file), start=1):
            if row_num == 1 or len(fields) <= 2:
                continue
            record = {'class': fields[2].strip(), 'text': fields[5].strip()}
            print(record)
            self.training_data.append(record)
    # Feed everything collected into the trainer and build the classifier.
    for data in self.training_data:
        self.newsTrainer.train(data['text'], data['class'])
    self.newsClassifier = Classifier(
        self.newsTrainer.data,
        tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))
def classify(filename, size): trainingSet, testingSet = make_chronological_sets.create_sets( filename, size) trainer = Trainer(tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""])) for sample in trainingSet: trainer.train(sample['url'], sample['result']) classifier = Classifier( trainer.data, tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""])) mal_mal = 0 mal_clean = 0 clean_clean = 0 clean_mal = 0 for sample in testingSet: predicted = classifier.classify(sample['url'])[0][0] actual = sample['result'] if predicted == 'malicious' and actual == 'malicious': mal_mal += 1 elif predicted == 'malicious' and actual == 'clean': mal_clean += 1 elif predicted == 'clean' and actual == 'clean': clean_clean += 1 elif predicted == 'clean' and actual == 'malicious': clean_mal += 1 prop_caught = float(mal_mal) / float(mal_mal + clean_mal) prop_missed = float(clean_mal) / float(mal_mal + clean_mal) ## Stuff to get proportions: # size = float(size) # mal_mal = float(mal_mal)/size # mal_clean = float(mal_clean)/size # clean_mal = float(clean_mal)/size # clean_clean = float(clean_clean)/size ## Confusion matrix stuff: # confusionMatrix = [['Actually malicious', mal_mal, clean_mal], ['Actually clean', mal_clean, clean_clean]] # print tabulate(confusionMatrix, headers=['', 'Predicted malicious', 'Predicted clean']) print "Total: " + str(mal_mal + mal_clean + clean_mal + clean_clean) print "Malware: " + str(mal_mal + clean_mal) print "Clean: " + str(mal_clean + clean_clean) print "Caught: " + str(mal_mal) + " (" + "{:.1%}".format(prop_caught) + ")" print "Missed: " + str(clean_mal) + " (" + "{:.1%}".format( prop_missed) + ")"
def train_classifier(newsData_train):
    """Build a Classifier from labelled news items.

    Args:
        newsData_train: iterable of dicts with 'text' and 'category'.

    Returns:
        A Classifier backed by the populated Data_process instance.
    """
    processor = Data_process()
    for item in newsData_train:
        processor.final_process(item['text'], item['category'])
    return Classifier(processor, processor.tokenizer)
def get_classer():
    """Train on the module-level `newsSet` and return a ready Classifier."""
    trainer = Trainer(tokenizer)
    for entry in newsSet:
        trainer.train(entry['text'], entry['category'])
    # With sufficient trained data the classifier is ready to use.
    return Classifier(trainer.data, tokenizer)
def init(cls, lang='tr', namesCollection=NamesCollection, classifier=None):
    """Initialise class-level state.

    When no *classifier* is supplied, one is built from the cached model
    for *lang*.
    """
    cls.lang = lang
    cls.namesCollection = namesCollection
    if classifier is None:
        classifier = Classifier(CachedModel.get(lang), tokenizer)
    cls.classifier = classifier
    cls.initialized = True
def train(self):
    """Train on base and FB data"""
    # Feed every training example from the data interface into the model.
    for record in self.data_interface.arr:
        label = record[2].strip()    # Class is "Credibility"
        content = record[4].strip()  # Text is "Content URL"
        self.newsTrainer.train(content, label)
    self.newsClassifier = Classifier(
        self.newsTrainer.data,
        tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))
def classify(filename, size, url, result): trainingSet = make_training_set.create_set(filename, size) trainer = Trainer(tokenizer.Tokenizer(stop_words = [], signs_to_remove = [""])) for sample in trainingSet: trainer.train(sample['url'], sample['result']) classifier = Classifier(trainer.data, tokenizer.Tokenizer(stop_words = [], signs_to_remove = [""])) print "Expected: " + result print classifier.classify(url)
def post_logfile(): if request.method == 'GET': log_file = request.args['symptom'] print(log_file) diseaseclassifier = Trainer(tokenizer) #STARTS CLASIFIERS with open("Dataset.csv", "r") as file: #OPENS DATASET for i in file: #FOR EACH LINE lines = file.next().split(",") #PARSE CSV <DISEASE> <SYMPTOM> diseaseclassifier.train(lines[1], lines[0]) #TRAINING diseaseclassifier = Classifier(diseaseclassifier.data, tokenizer) classification = diseaseclassifier.classify(log_file) #CLASIFY INPUT print classification return json.dumps(dict(classification))
def create_naive_bayes_classifier(training_examples, training_annotations):
    """Train a naive-Bayes classifier on parallel example/annotation lists.

    Raw annotations are mapped through the module-level `categories` table
    before training.
    """
    print("creating naive bayes classifier")
    mapped_labels = [categories[a] for a in training_annotations]
    nb_trainer = Trainer(
        tokenizer.Tokenizer(stop_words=[],
                            signs_to_remove=[tokenization_string]))
    for text, label in zip(training_examples, mapped_labels):
        nb_trainer.train(text, label)
    nb_classifier = Classifier(
        nb_trainer.data,
        tokenizer.Tokenizer(stop_words=[],
                            signs_to_remove=[tokenization_string]))
    print("\t->done")
    return nb_classifier
def train(self):
    """Load URL,class pairs from src/URL.csv and train the classifier."""
    with open('src/URL.csv', 'r') as csv_file:
        for raw_line in csv_file.readlines():
            parts = raw_line.split(',')
            # Ignore rows missing either field or with an empty URL.
            if len(parts) < 2 or len(parts[0]) == 0:
                continue
            self.training_data.append({
                'text': parts[0].strip(),
                'class': parts[1].strip(),
            })
    for data in self.training_data:
        self.newsTrainer.train(data['text'], data['class'])
    self.newsClassifier = Classifier(
        self.newsTrainer.data,
        tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))
def post_logfile(): if request.method == 'POST': log_file = request.args['symptom'] print(log_file) diseaseclassifier = Trainer(tokenizer) #STARTS CLASIFIERS with open("Dataset.csv", "r") as file: #OPENS DATASET for i in file: #FOR EACH LINE lines = file.next().split(",") #PARSE CSV <DISEASE> <SYMPTOM> diseaseclassifier.train(lines[1], lines[0]) #TRAINING diseaseclassifier = Classifier(diseaseclassifier.data, tokenizer) classification = diseaseclassifier.classify(log_file) #CLASIFY INPUT print classification result = [] for item in classification: obj = CustomType(item[0], item[1]) result.append(json.loads(obj.toJSON())) # return json.dumps(OrderedDict(classification)) return json.dumps(result, indent=4)
def classify(filename, size): trainingSet, testingSet = make_balanced_sets.create_sets(filename, size) trainer = Trainer(tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""])) for sample in trainingSet: trainer.train(sample['url'], sample['result']) classifier = Classifier( trainer.data, tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""])) mal_mal = 0 mal_clean = 0 clean_clean = 0 clean_mal = 0 for sample in testingSet: predicted = classifier.classify(sample['url'])[0][0] actual = sample['result'] if predicted == 'malicious' and actual == 'malicious': mal_mal += 1 elif predicted == 'malicious' and actual == 'clean': mal_clean += 1 elif predicted == 'clean' and actual == 'clean': clean_clean += 1 elif predicted == 'clean' and actual == 'malicious': clean_mal += 1 size = float(size) mal_mal = float(mal_mal) / size mal_clean = float(mal_clean) / size clean_mal = float(clean_mal) / size clean_clean = float(clean_clean) / size confusionMatrix = [[mal_mal, clean_mal], [mal_clean, clean_clean]] pprint(confusionMatrix) print "Accuracy: " + str(mal_mal + clean_clean) print "False positives (predicted clean when malicious): " + str(clean_mal) print "False negatives (predicted malicious when clean): " + str(mal_clean)
def create_nbc_nb_classifier(training_dataset):
    """Train a naiveBayesClassifier model on an (examples, annotations)
    dataset tuple.

    Returns:
        (trainer, classifier) — the populated Trainer and a Classifier
        built over its data.
    """
    training_examples, training_annotations = training_dataset
    # Each example is reduced to its unique token set before training.
    token_sets = [set(tokenize(text)) for text in training_examples]
    nb_trainer = Trainer(
        tokenizer.Tokenizer(stop_words=[],
                            signs_to_remove=[tokenization_string]))
    for tokens, label in zip(token_sets, training_annotations):
        nb_trainer.train(tokens, label)
    print("number of tokens seen: %s" % len(nb_trainer.data.frequencies.keys()))
    nb_classifier = Classifier(
        nb_trainer.data,
        tokenizer.Tokenizer(stop_words=[],
                            signs_to_remove=[tokenization_string]))
    return nb_trainer, nb_classifier
def neyronka(self, _str):
    """Train a classifier from the 'o' file and classify *_str*.

    The training file holds one sample per line in the form
    ``theme***text``; blank lines are skipped.

    Returns:
        The classification list sorted by descending probability.
    """
    newsTrainer = Trainer(tokenizer)
    # FIX: train directly from the parsed lines.  The old code assembled a
    # Python-literal string and eval()'d it, which broke on any quote
    # character inside the text and was an eval-on-data hazard.
    with open('o', 'rt', encoding='utf8') as csvfile:
        for raw in csvfile.readlines():
            if raw == '\n':
                continue
            theme, text = raw.split('***')
            newsTrainer.train(text.strip(), str(theme))
    newsClassifier = Classifier(newsTrainer.data, tokenizer)
    classification = newsClassifier.classify(_str)
    return sorted(classification, key=lambda pair: -pair[1])
def determine(sentence):
    """Classify *sentence* against the Fact/Decision data in data.csv.

    Returns:
        [true_score, false_score] — each score is the integer part of the
        classifier's probability, as a string.
    """
    trainer = Trainer(tokenizer)
    with open('data.csv') as csvfile:
        samples = [{'fact': row['Fact'], 'decision': row['Decision']}
                   for row in csv.DictReader(csvfile)]
    for sample in samples:
        trainer.train(sample['fact'], sample['decision'])
    classifier = Classifier(trainer.data, tokenizer)
    classification = classifier.classify(sentence)
    # NOTE(review): assumes classification[0] is the "False" category and
    # classification[1] the "True" one — depends on training-data ordering;
    # verify against the classifier's output ordering.
    false = str(classification[0][1]).split('.')[0]
    true = str(classification[1][1]).split('.')[0]
    return [true, false]
import json, os, sys, re from naiveBayesClassifier import tokenizer from naiveBayesClassifier.trainer import Trainer from naiveBayesClassifier.classifier import Classifier ##IMPORTS ''' Usage: python GuessDisease.py "symptomA symptomB symptomC" Example INPUT: python GuessDisease.py "agitation exhaustion vomit" Example OUTPUT: { "disease": "influenza" } ''' ##SETTING UP diseaseclassifier = Trainer(tokenizer) #STARTS CLASIFIERS with open("Dataset.csv", "r") as file: #OPENS DATASET for i in file: #FOR EACH LINE lines = file.next().split(",") #PARSE CSV <DISEASE> <SYMPTOM> diseaseclassifier.train(lines[1], lines[0]) #TRAINING diseaseclassifier = Classifier(diseaseclassifier.data, tokenizer) classification = diseaseclassifier.classify(sys.argv[1]) #CLASIFY INPUT print classification[0] #PRINT CLASIFICATION
# Train on the labelled training rows, then predict a category for every
# test row and write the predictions out as CSV.
documentTrainer = Trainer(tokenizer)
documentSet = []


def getTextBasedOnDocumentID(documentID):
    """Look up the raw text for a document id of the form '<prefix>_<N>'."""
    row_number = int(documentID.split('_')[1])
    # Data rows start two lines into the CSV (header offset).
    raw = linecache.getline('../2.document_set/document_set.csv',
                            row_number + 2)
    return raw.split(',"')[1]


for train_row in traincsv:
    documentSet.append({
        'text': getTextBasedOnDocumentID(train_row[0]),
        'category': train_row[1]
    })

for document in documentSet:
    documentTrainer.train(document['text'], document['category'])
newsClassifier = Classifier(documentTrainer.data, tokenizer)

# Overwrite each test row's category with the top predicted category.
for test_row in testcsv:
    classification = newsClassifier.classify(
        getTextBasedOnDocumentID(test_row[0]))
    test_row[1] = int(classification[0][0])

pd.DataFrame(testcsv).to_csv("../5.evaluation_file/predicted_cat.csv",
                             index=False)
#np.savetxt("./5.evaluation_file/predicted_cat.csv", testcsv,header="document_id,category" ,delimiter=",")
def classify(input):
    """Train grade/load/lecture classifiers from data.txt and classify the
    POS-filtered *input* text against each.

    Returns:
        (gradeClassification, loadClassification, lectureClassification).
    """
    twitter = Twitter()
    # FIX: the data file was opened and never closed; use `with`.
    with open("data.txt", "r") as f:
        data = json.loads(f.read())

    def _train_classifier(field):
        # One classifier per rating field: train on sentence fragments of
        # every subject whose value for *field* is known (not "?").
        # (Deduplicates three formerly copy-pasted training loops.)
        trainer = Trainer(tokenizer)
        print("Training %s ..." % field)
        for subject in data:
            if subject[field] != "?":
                review = subject["comment"].replace('.', '\n').split("\n")
                for sentence in review:
                    if len(sentence.strip()) != 0:
                        trainer.train(sentence, subject[field])
        return Classifier(trainer.data, tokenizer)

    gradeClassifier = _train_classifier("grade")
    loadClassifier = _train_classifier("load")
    lectureClassifier = _train_classifier("lecture")

    # Keep only the content-bearing POS tags from the Korean tagger output.
    KEEP_POS = ('Noun', 'Verb', 'Adjective', 'Adverb', 'Exclamation',
                'Alpha', 'KoreanParticle')
    input = u"" + input
    classify_input = [word for word, pos in twitter.pos(input)
                      if pos in KEEP_POS]
    text = " ".join(classify_input)
    print(text)

    gradeClassification = gradeClassifier.classify(text)
    loadClassification = loadClassifier.classify(text)
    lectureClassification = lectureClassifier.classify(text)
    print(
        "\n________________________________________GRADE________________________________________\n"
    )
    print(gradeClassification)
    print(
        "\n________________________________________LOAD_________________________________________\n"
    )
    print(loadClassification)
    print(
        "\n________________________________________LECTURE______________________________________\n"
    )
    print(lectureClassification)
    return gradeClassification, loadClassification, lectureClassification
def get(self):
    """Request handler: trains a naive-Bayes classifier on today's
    'Política' articles from uy_press and classifies one sample headline,
    printing the result.  All failures are caught and printed."""
    try:
        print " "
        print "TestClassifier start"
        print " "
        # Load the stop words from file into a list, decoding escapes.
        with open("stop_words.txt", "r") as ins:
            array = []
            for line in ins:
                array.append((line.rstrip('\n')).decode('unicode-escape'))
        #print array

        newsTrainer = Trainer(
            tokenizer.Tokenizer(stop_words=array,
                                signs_to_remove=["?!#%&_"]))
        hoy = date.today()
        # Training corpus: today's uy_press articles in category "Política".
        query = News3.query(News3.date == hoy,
                            News3.news_from.IN([
                                "uy_press",
                            ]),
                            News3.category == "Política")

        # You need to train the system passing each text one by one to the trainer module.
        #newsSet =[
        #    {'text': 'not to eat too much is not enough to lose weight', 'category': 'health'},
        #    {'text': 'Russia try to invade Ukraine', 'category': 'politics'},
        #    {'text': 'do not neglect exercise', 'category': 'health'},
        #    {'text': 'Syria is the main issue, Obama says', 'category': 'politics'},
        #    {'text': 'eat to lose weight', 'category': 'health'},
        #    {'text': 'you should not eat much', 'category': 'health'}
        #]

        # NOTE(review): query2/query4 are built but only consumed by the
        # commented-out training loops further down.
        query2 = News3.query(News3.date == hoy,
                             News3.news_from == "uy_press",
                             News3.category == "deportes")
        query4 = News3.query(News3.date == hoy,
                             News3.news_from == "uy_press",
                             News3.category == "salud")

        #for news in newsSet:
        #    newsTrainer.train(news['text'], news['category'])

        c = 0  # trained-article counter (only used by the commented break)
        #print query
        for i in query:
            print " "
            print i.category
            newsTrainer.train(i.html, 'politica')
            #if c == 10: break
            c += 1

        #for i in query2:
        #    newsTrainer.train(i.html, 'deportes')
        #raise Exception('I know Python!')
        #for i in query4:
        #    newsTrainer.train(i.html, 'salud')

        # When you have sufficient trained data, you are almost done and can start to use
        # a classifier.
        # Now you have a classifier which can give a try to classifiy text of news whose
        # category is unknown, yet.
        # NOTE(review): query3 is never consumed below.
        query3 = News3.query(
            News3.date == hoy,
            News3.news_from.IN([
                "el_pais",
            ]),
            News3.id.IN([0]),
        )
        ###
        newsClassifier = Classifier(
            newsTrainer.data,
            tokenizer.Tokenizer(stop_words=array,
                                signs_to_remove=["?!#%&"]))
        #print unknownInstance
        classification = newsClassifier.classify(
            "Vidalín: No quiero que me llamen para saber qué tramite hay que hacer para poner un prostíbulo"
        )
        # the classification variable holds the detected categories sorted
        print " classification "
        print(classification)
    except:
        # NOTE(review): bare except swallows every error; the traceback is
        # printed so the handler never propagates failures.
        print traceback.format_exc()
def article_keywords(article): keys = Keywords.objects.get(article=article) print keys l = [k.keyword for k in keys.keywords.all()] print " ".join(l) keyset = {'keyword': " ".join(l)} return keyset if __name__ == '__main__': print "Starting testing of Bayes Classifer" labeled_articles = [ (a, a.relevant) for a in Article.objects.all()[:(len(Article.objects.all()))] ] print labeled_articles featuresets = [] for (article, relevant) in labeled_articles: r = article_keywords(article) featuresets.append((r, relevant)) print featuresets train_set, test_set = featuresets[:(len(featuresets))], featuresets[( len(featuresets) - 2):] print train_set newsTrainer = Trainer(tokenizer) for f in train_set: newsTrainer.train(f[0]['keyword'], f[1]) newsClassifier = Classifier(newsTrainer.data, tokenizer) url = raw_input("Enter the url: ") testurl(url, newsClassifier)
# NOTE(review): this chunk appears to be the tail of a larger script — the
# loop binding `no` for the first block, and the definitions of `data`,
# `train_twi`, `trainer`, `str_pre_process`, `num_twi`, `test_twi` and
# `true_cls`, are outside the visible excerpt.

# Label selection: for rows marked 'negative' the class sits in column 3,
# otherwise column 1 already holds it — TODO confirm against the dataset.
if data[no][1] == 'negative':
    cls = data[no][3]
else:
    cls = data[no][1]
twi_cont = str_pre_process(data[no][10])  # column 10 presumably holds the tweet text
struct = {'text': twi_cont, 'category': cls}
#print twi_cont, cls
train_twi.append(struct)

# Train a naive-Bayes model on the collected tweets.
for twi in train_twi:
    trainer.train(twi['text'], twi['category'])
model = Classifier(trainer.data, tokenizer)

print "Testing..."
# Classify the held-out rows (12000..num_twi), collecting predictions and
# the corresponding true class labels in parallel lists.
for no in range(12000, num_twi):
    twi_cont = str_pre_process(data[no][10])
    classification = model.classify(twi_cont)
    #print classification,
    test_twi.append(classification)
    if data[no][1] == 'negative':
        cls = data[no][3]
    else:
        cls = data[no][1]
    true_cls.append(cls)
from naiveBayesClassifier.classifier import Classifier

# Train a sentiment classifier from training.csv, then classify one sample
# review (Python 2 script).
sentimentTrainer = Trainer(tokenizer)

# Get the training dataset.
with open('training.csv', 'r') as f:
    data = f.read()
trainset = data.splitlines()

# Each line is "<comment>,<sentiment>"; comments containing commas are
# double-quoted, hence the two parsing branches below.
for line in trainset:
    pos1 = line.find(',"')
    pos2 = line.find('",', pos1)
    if pos1 == -1:
        # Unquoted comment: the text lies between the first two commas.
        pos1 = line.find(',')
        pos2 = line.find(',', pos1 + 1)
        comment = line[pos1 + 1:pos2]
        sentiment = line[pos2 + 1:]
    else:
        # Quoted comment between ," and ",.
        # NOTE(review): `pos2 - 2` drops the last two characters of the
        # comment body, not just the closing quote — verify this against
        # the actual CSV format before "fixing" it.
        comment = line[pos1 + 2:pos2 - 2]
        sentiment = line[pos2 + 2:]
    sentimentTrainer.train(comment, sentiment)

# Use the classifier.
sentimentClassifier = Classifier(sentimentTrainer.data, tokenizer)

# Classify an unknown review.
unknownInstance = "I don't like the app. It crashes everytime."
classification = sentimentClassifier.classify(unknownInstance)
print classification
    # NOTE(review): the opening of this list (`newsSet = [` plus earlier
    # entries, and the Trainer construction) lies outside the visible excerpt.
    {'symptoms': 'unresponsiveness', 'disease': 'dementia'},
    {'symptoms': 'lethargy', 'disease': 'dementia'},
    {'symptoms': 'agitation', 'disease': 'dementia'},
    {'symptoms': 'ecchymosis', 'disease': 'dementia'},
    {'symptoms': 'syncope', 'disease': 'dementia'},
    {'symptoms': 'rale', 'disease': 'dementia'},
    {'symptoms': 'unconscious state', 'disease': 'dementia'},
    {'symptoms': 'cough', 'disease': 'dementia'},
    {'symptoms': 'bedridden', 'disease': 'dementia'},
    {'symptoms': 'unsteady gait', 'disease': 'dementia'},
]

# Train one category per disease from the symptom strings.
for news in newsSet:
    newsTrainer.train(news['symptoms'], news['disease'])

# When you have sufficient trained data, you are almost done and can start to use
# a classifier.
newsClassifier = Classifier(newsTrainer.data,
                            tokenizer.Tokenizer(stop_words=[],
                                                signs_to_remove=["?!#%&"]))

# Now you have a classifier which can give a try to classifiy text of news whose
# category is unknown, yet.
unknownInstance = "pain fever coughing"
classification = newsClassifier.classify(unknownInstance)

# the classification variable holds the possible categories sorted by
# their probablity value
print(classification)
def get_model(trump, cruz, kasich, clinton, sanders,
              base_dir="/Users/Helicopter/Desktop/Logan_s/Courses/Stat_AI_ML/election/election_twitters/"):
    """Train a naive-Bayes classifier over the five candidates' tweet files.

    Each of the five positional arguments is a list of file names under
    ``base_dir/<candidate>/``; every file becomes one training document
    labelled with the candidate's name.

    Args:
        base_dir: base directory of the tweet folders.  New optional
            parameter; defaults to the previously hard-coded path, so
            existing callers are unaffected.

    Returns:
        The trained Classifier.
    """
    trainer = Trainer(tokenizer)
    # Deduplicates five formerly copy-pasted per-candidate loops.
    candidates = [(trump, 'trump'), (cruz, 'cruz'), (kasich, 'kasich'),
                  (clinton, 'clinton'), (sanders, 'sanders')]
    twiSet = []
    for addresses, category in candidates:
        for address in addresses:
            # Flatten the whole file into one whitespace-normalised string
            # (trailing space preserved for parity with the original).
            with open(base_dir + category + "/" + address, "r") as text_file:
                words = []
                for line in text_file:
                    words.extend(line.split())
            content = (" ".join(words) + " ") if words else ""
            twiSet.append({'text': content, 'category': category})
    for twi in twiSet:
        trainer.train(twi['text'], twi['category'])
    return Classifier(trainer.data, tokenizer)