import json

from textblob.classifiers import NaiveBayesClassifier


class Classifier:
    def __init__(self):
        with open("./data/train.csv") as fp:
            self.cl = NaiveBayesClassifier(fp, format="csv")

    def test(self):
        return self.cl.classify("This is a test sentence")

    def classify(self, text):
        return self.cl.classify(text)

    def n_classify(self, text):
        dist = self.cl.prob_classify(text)
        probs = {"sentiments": []}
        for s in dist.samples():
            if dist.prob(s) >= .10:
                probs["sentiments"].append({s: dist.prob(s)})
        return json.dumps(probs)

    def accuracy(self):
        with open('./data/train.csv') as fp:
            train_accuracy = self.cl.accuracy(fp, format="csv")
        with open('./data/test.csv') as fp:
            test_accuracy = self.cl.accuracy(fp, format="csv")
        return json.dumps({"train_accuracy": train_accuracy,
                           "test_accuracy": test_accuracy})

    def labels(self):
        return json.dumps({"labels": self.cl.labels()})
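# A minimal usage sketch for the Classifier above. It assumes ./data/train.csv
# and ./data/test.csv exist in textblob's two-column CSV format (text,label);
# the example sentence is made up.
c = Classifier()
print(c.classify("This product exceeded my expectations"))
print(c.n_classify("This product exceeded my expectations"))  # JSON of labels with P >= 0.10
print(c.accuracy())
print(c.labels())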
import csv

from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier


def classifier(something):
    speech = something
    train = []
    test = []
    with open("training.csv") as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:  # each row is a [text, label] list
            train.append(row)
    with open("test.csv") as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:  # each row is a [text, label] list
            test.append(row)
    cl = NaiveBayesClassifier(train)
    cl.classify("This is an amazing library!")
    prob_dist = cl.prob_classify("This one's a doozy.")
    prob_dist.max()
    round(prob_dist.prob("machine"), 2)
    round(prob_dist.prob("no machine"), 2)
    blob = TextBlob(speech, classifier=cl)
    blob.classify()
    for s in blob.sentences:
        print("\n\n\n" + str(s))
        print("\n" + str(s.classify()))
        return s.classify()
import csv
import glob

from textblob.classifiers import NaiveBayesClassifier


class ExpenseClassifier:
    def __init__(self):
        training_data = self._load_data("data")
        self.category_classifier = NaiveBayesClassifier([(x[0], x[1]) for x in training_data])
        self.avoidability_classifier = NaiveBayesClassifier([(x[0], x[2]) for x in training_data])
        self.ordinary_classifier = NaiveBayesClassifier([(x[0], x[3]) for x in training_data])

    def classify(self, description):
        res = {}
        res['category'] = self.category_classifier.classify(description)
        res['avoidable'] = self.avoidability_classifier.classify(description)
        res['ordinary'] = self.ordinary_classifier.classify(description)
        return res

    def accuracy(self):
        test_data = self._load_data("test")
        res = {}
        res['category'] = self.category_classifier.accuracy([(x[0], x[1]) for x in test_data])
        res['avoidable'] = self.avoidability_classifier.accuracy([(x[0], x[2]) for x in test_data])
        res['ordinary'] = self.ordinary_classifier.accuracy([(x[0], x[3]) for x in test_data])
        return res

    def _load_data(self, folder):
        # DESCRIPTION, CATEGORY, AVOIDABLE and ORDINARY are column-index
        # constants, and norm() a text normalizer, defined elsewhere.
        data = []
        for f in glob.glob(folder + "/*.csv"):
            with open(f) as csvfile:
                spamreader = csv.reader(csvfile, delimiter=',')
                for row in spamreader:
                    if row[DESCRIPTION] and row[CATEGORY] and row[AVOIDABLE] and row[ORDINARY]:
                        data.append((norm(row[DESCRIPTION]), row[CATEGORY],
                                     row[AVOIDABLE], row[ORDINARY]))
        return data
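# A usage sketch for ExpenseClassifier, assuming data/ and test/ contain CSVs
# with the description/category/avoidable/ordinary columns the loader expects;
# the sample description is invented.
ec = ExpenseClassifier()
print(ec.classify("monthly metro pass"))  # {'category': ..., 'avoidable': ..., 'ordinary': ...}
print(ec.accuracy())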
import time

from textblob.classifiers import NaiveBayesClassifier


def main():
    json_path = raw_input("Where is the json training set?")
    print "Program start", time.ctime()  # debug
    with open(json_path, 'r') as fp:
        classifier = NaiveBayesClassifier(fp, format='json')
    print "Classifier done!", time.ctime()  # debug
    test = raw_input("Where is the test eml_folder?")
    print "Testing...", time.ctime()
    # dir_list() (defined elsewhere) yields the text of each email in the folder
    for emails in dir_list(test):
        print classifier.classify(emails)
    print "Testing done", time.ctime()
class TimeLogicAdapter(LogicAdapter):
    def __init__(self, **kwargs):
        super(TimeLogicAdapter, self).__init__(**kwargs)
        training_data = [
            ("what time is it", 1),
            ("do you know the time", 1),
            ("do you know what time it is", 1),
            ("what is the time", 1),
            ("it is time to go to sleep", 0),
            ("what is your favorite color", 0),
            ("i had a great time", 0),
            ("what is", 0)
        ]
        self.classifier = NaiveBayesClassifier(training_data)

    def process(self, statement):
        now = datetime.now()
        confidence = self.classifier.classify(statement.text.lower())
        response = Statement("The current time is " + now.strftime("%I:%M %p"))
        return confidence, response
import random

from textblob.classifiers import NaiveBayesClassifier

# POSITIVE_RESPONSE and NEGATIVE_RESPONSE are lists of canned replies defined elsewhere.


def reply_engine(sentence, train):
    cl = NaiveBayesClassifier(train)
    k = str(cl.classify(sentence))
    if k == 'pos':
        return random.choice(POSITIVE_RESPONSE)
    elif k == 'neg':
        return random.choice(NEGATIVE_RESPONSE)
def detecting_fake_news(var):
    train = [
        ('15 september is the day when we go back to school.', 'true'),
        ("Corona isn't deadly.", 'false'),
        ('Tunisian next elections is in 2024 .', 'true'),
        ('Says the Annies List political group supports third-trimester abortions on demand.', 'false'),
        ('Donald Trump is against marriage equality. He wants to go back.', 'true'),
        ('Says nearly half of Oregons children are poor.', 'true'),
        ('State revenue projections have missed the mark month after month.', 'true'),
        ("In the month of January, Canada created more new jobs than we did.", 'true'),
        ('If people work and make more money, they lose more in benefits than they would earn in salary.', 'false'),
        ('Originally, Democrats promised that if you liked your health care plan, you could keep it. One year later we know that you need a waiver to keep your plan.', 'false'),
        ("We spend more money on antacids than we do on politics.", 'false'),
        ('Barack Obama and Joe Biden oppose new drilling at home and oppose nuclear power.', 'false'),
        ('President Obama once said he wants everybody in America to go to college.', 'false')
    ]
    test = [
        ('Because of the steps we took, there are about 2 million Americans working right now who would otherwise be unemployed.', 'true'),
        ('Scientists project that the Arctic will be ice-free in the summer of 2018', 'false'),
        ("You cannot build a little guy up by tearing a big guy down -- Abraham Lincoln said it.", 'false'),
        ("One man opposed a flawed strategy in Iraq. One man had the courage to call for change. One man didn't play politics with the truth.", 'true'),
        ('When I was governor, not only did test scores improve we also narrowed the achievement gap.', 'true'),
        ("Ukraine was a nuclear-armed state. They gave away their nuclear arms with the understanding that we would protect them.", 'false')
    ]
    cl = NaiveBayesClassifier(train)
    result = cl.classify(var)
    return result
class TimeLogicAdapter(LogicAdapter):
    """
    The TimeLogicAdapter returns the current time.
    """

    def __init__(self, **kwargs):
        super(TimeLogicAdapter, self).__init__(**kwargs)
        training_data = [
            ('what time is it', 1),
            ('do you know the time', 1),
            ('do you know what time it is', 1),
            ('what is the time', 1),
            ('it is time to go to sleep', 0),
            ('what is your favorite color', 0),
            ('i had a great time', 0),
            ('what is', 0)
        ]
        self.classifier = NaiveBayesClassifier(training_data)

    def process(self, statement):
        now = datetime.now()
        confidence = self.classifier.classify(statement.text.lower())
        response = Statement('The current time is ' + now.strftime('%I:%M %p'))
        return confidence, response
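# Both TimeLogicAdapter variants above reuse the hard 0/1 label returned by
# classify() as the "confidence". A possible refinement (an assumption, not
# part of the original adapters) is to read a graded confidence off the
# classifier's probability distribution instead:
def graded_confidence(classifier, text):
    prob_dist = classifier.prob_classify(text.lower())
    return prob_dist.prob(1)  # probability that the input is a time question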
def train_data(ticker):
    df = pd.read_csv('../tmp/training_data/' + ticker + '2015-2016_data1.csv')
    train_df = df[['snippet', 'price change']]
    print "Training News Dataset"
    print train_df.head(5)
    cl = NaiveBayesClassifier(train_df.as_matrix(columns=None))

    df = pd.read_csv('../tmp/training_data/' + ticker + '2016-2017_data1.csv')
    dataset = df[['snippet', 'price change']]
    classified = []
    right = 0
    #print dataset.head(n=5)
    print "\nClassifying dataset\n"
    for index, row in dataset.iterrows():
        classified.append(cl.classify(row[0]))
        right += 1 if row[1] == classified[index] else 0
    dataset['News Sent.'] = classified
    path = '../tmp/results/News/' + ticker + '_results.csv'
    dataset.to_csv(path, encoding='utf-8', index=False)
    #dataset['Price Sent.'] = real_sent
    print dataset[['snippet', 'price change', 'News Sent.']].head(n=20)
    total = len(dataset['snippet'])
    print "\nCalculating "
    print "\nRight %d, Total %d" % (right, total)
    print "Correct percentage %.2f %%" % ((1.0 * right / total) * 100)
    #print cl.classify(dataset.as_matrix(columns=None))
    print cl.show_informative_features(10)
def qa1():
    trainData = [
        ('Augmentation mentoplasty using Mersilene mesh. Many different materials are available for augmentation mentoplasty. However, the optimal implant material for chin implantation has yet to be found. During the past several years, a number of experienced surgeons have turned to the use of Mersilene mesh. Mersilene mesh is a non-absorbable Dacron polyester fiber that can be conformed easily into layers to achieve tailored dimensions and shape. At the McCollough Plastic Surgery Clinic PA, Birmingham, Ala, 277 patients over a 10-year period underwent chin augmentation with Mersilene mesh implants. The material provides excellent tensile strength, durability, and surgical adaptability. The overall complication rate was 3.2% (nine patients); infection rate, 2.5% (seven patients); and removal secondary to infection, 1.7% (five patients). Based on this 10-year experience, Mersilene mesh remains our material of choice for chin augmentation.', 'C01'),
        ('Multiple intracranial mucoceles associated with phaeohyphomycosis of the paranasal sinuses. The purpose of this article is to alert clinicians to a new pathogenic fungus of the paranasal sinuses called Exserohilum rostratum. Exserohilum species are one of the etiologic agents of phaeohyphomycosis, a constellation of entities caused by dematiaceous fungi. This class of fungal sinus infection has emerged only in the past decade; it occurs primarily in immunocompetent individuals and produces a tenacious, progressive pansinusitis. To our knowledge, this study describes the first case of multiple intracranial mucoceles secondary to E rostratum. The diagnostic workup includes computed tomography and magnetic resonance imaging followed by direct microscopic examination of tissue biopsy specimens. A craniotomy followed by a bilateral external ethmoidectomy was necessary for complete extirpation of the infected mucoceles. Aggressive surgical management of this mycotic infection is described', 'C01'),
        ('Laser photodynamic therapy for papilloma viral lesions. Photodynamic therapy was tested for its therapeutic efficacy in eradicating rabbit papilloma warts. The wild-type viral warts suspension was used to induce treatable papilloma warts in the cutaneous tissue of Dutch Belted rabbits. The photosensitizing agents used intravenously were Photofrin II at 10 mg/kg of body weight and Chlorin e6 monoethylene diamine monohydrochloric acid (Chlorin e6 med HCl) at 1 mg/kg of body weight. The lasers used were an argon-dye laser at 628 and 655 nm and a gold vapor laser at 628 nm. The irradiances of 25 to 180 mW/cm2 were applied topically with an end-on lens optical fiber with total radiant doses of 7.5 to 54 J/cm2. Photofrin II and the argon-dye laser at the highest light dosage (54 J/cm2) and Chlorin e6 monoethylene diamine monohydrochloride administered 2 hours before argon-dye laser irradiation at 655 nm at the highest light dosage (54 J/cm2) produced wart regression. Total wart regression without recurrence was achieved with Photofrin II and the gold vapor laser at all light dosages. The difference observed between the argon-dye laser and the gold vapor laser might be explained by the pulsed nature of the gold vapor laser, with its high-peak powers, some 5000 x the average measured light dose. In this model, the smaller, less cornified lesions were more effectively treated with photodynamic therapy.', 'C02'),
        ('Role of the monocyte-macrophage in influenza virus infection of lymphocytes: implications for HIV infection. Knowledge of the pathogenesis of viruses which are less virulent than human immunodeficiency virus (HIV) may provide valuable insights into the pathogenesis of HIV infection. Influenza virus, an enveloped RNA virus, infects monocyte-macrophages, although the infection is brief and abortive. Isolated purified lymphocytes are completely resistant to infection. In contrast, mixtures of lymphocytes and macrophages can synthesize all virus proteins. Infection requires physical association of monocyte-macrophages and lymphocytes in "clusters." These studies with influenza virus suggest that the pathogenesis of virus infections in mixed cell cultures may be very different from that observed in purified cell populations, and they suggest that similar studies should be performed with HIV.', 'C01'),
        ('Use of polymerase chain reaction for successful identification of asymptomatic genital infection with herpes simplex virus in pregnant women at delivery. The polymerase chain reaction was adapted to the amplification of a herpes simplex virus (HSV) DNA sequence, common to HSV types 1 and 2 (HSV-1, HSV-2). The amplified product was detectable by ethidium-bromide staining or Southern hybridization of gels and by dot hybridization. The HSV polymerase chain reaction detected HSV DNA in samples obtained from eight patients with genital lesions from which HSV-2 was isolated in tissue culture and from four patients with labial lesions from which HSV-1 was isolated. The HSV polymerase chain reaction identified HSV in clinical specimens obtained from 11 women who had asymptomatic genital HSV infections at delivery. None of 11 samples obtained at delivery from women who had antibodies to HSV-2, but whose delivery cultures were negative, were positive by polymerase chain reaction and no false-positive reactions were obtained when the reaction mixture contained human cell DNA or varicella-zoster virus, cytomegalovirus, Epstein-Barr virus, or human papillomavirus DNA.', 'C02')
    ]
    classifier = NaiveBayesClassifier(trainData)
    # str1 = "A school blood drive before a measles outbreak permitted correlation of preexposure measles antibody titers with clinical protection using the plaque reduction neutralization (PRN) test and an EIA."
    # print(classifier.classify(str1))
    testdata = [
        'A school blood drive before a measles outbreak permitted correlation of preexposure measles antibody titers with clinical protection using the plaque reduction neutralization (PRN) test and an EIA.',
        'Of 9 donors with detectable preexposure PRN titer less than or equal to 120, 8 met the clinical criteria for measles (7 seroconfirmed) compared with none of 71 with preexposure PRN titers greater than 120 (P less than .0001).',
        'Seven of 11 donors with preexposure PRN titers of 216-874 had a greater than or equal to 4-fold rise in antibody titer (mean, 43-fold) compared with none of 7 with a preexposure PRN titer greater than or equal to 1052 (P less than .02).',
        'Of 37 noncases with preexposure PRN titer less than 1052, 26 (70%) reported one or more symptoms compared with 11 (31%) of 35 donors with preexposure PRN titers greater than or equal to 1052 (P less than .002).',
        'By EIA, no case had detectable preexposure antibody; the preexposure geometric mean titer of asymptomatic donors (220) was not significantly higher than that of symptomatic donors who did not meet the clinical criteria for measles (153) (P = .10).',
        'The study suggests that PRN titers less than or equal to 120 were not protective against measles disease and illness without rash due to measles may occur in persons with PRN titers above this level.',
        'Use of polymerase chain reaction for successful identification of asymptomatic genital infection with herpes simplex virus in pregnant women at delivery. The polymerase chain reaction was adapted to the amplification of a herpes simplex virus (HSV) DNA sequence, common to HSV types 1 and 2 (HSV-1, HSV-2). The amplified product was detectable by ethidium-bromide staining or Southern hybridization of gels and by dot hybridization. The HSV polymerase chain reaction detected HSV DNA in samples obtained from eight patients with genital lesions from which HSV-2 was isolated in tissue culture and from four patients with labial lesions from which HSV-1 was isolated. The HSV polymerase chain reaction identified HSV in clinical specimens obtained from 11 women who had asymptomatic genital HSV infections at delivery. None of 11 samples obtained at delivery from women who had antibodies to HSV-2, but whose delivery cultures were negative, were positive by polymerase chain reaction and no false-positive reactions were obtained when the reaction mixture contained human cell DNA or varicella-zoster virus, cytomegalovirus, Epstein-Barr virus, or human papillomavirus DNA.'
    ]
    for stmt in testdata:
        print(classifier.classify(stmt))
def decide_actionable_tweet(doc_standard):
    from textblob.classifiers import NaiveBayesClassifier as NBC
    from textblob import TextBlob

    actionable_tweet = []
    training_corpus = [
        ('naredra modi is good politician', 'not_actionable'),
        ('how congress become good oppositor', 'actionable'),
        ('python is popular language', 'not_actionable'),
        ('here is new version of python available see it', 'actionable'),
        ('retweet why india is poor country', 'actionable'),
        ('Pro cubbadi startion on 1 august 2017 ', 'not_actionable'),
        ('book ticket for goa at reasonable cost', 'actionable')
    ]
    test_corpus = [
        ('here is new version of motorola see it', 'actionable'),
        ('hellow friends how are you', 'not_actionable')
    ]
    model = NBC(training_corpus)
    print("model", model)
    try:
        # for testing, use another list instead of doc_standard
        for doc in doc_standard:
            result = model.classify(doc)
            if result == 'actionable':
                actionable_tweet.append(doc)
    except Exception:
        print("error in classify")
    print("actionable_tweet", actionable_tweet)
    return actionable_tweet
def classify_text(self):
    cl = NaiveBayesClassifier(self.train)
    result = cl.classify("love sandwich!")
    print(result)
class Model(object):
    """docstring for Model"""

    def __init__(self, name='Guess', config=None):
        self.name = name
        self.config = config or {}  # avoid a mutable default argument
        self.clf = NaiveBayesClassifier([])

    def train(self, training_data):
        safe_training = []
        for example in training_data:
            safe_training.append((example.get('text'), example.get('label')))
        self.clf.update(safe_training)

    def evaluate(self, text):
        label = self.clf.classify(text)
        prob_dist = self.clf.prob_classify(text)
        label_prob = prob_dist.prob(label)
        return label, label_prob

    def get_classes(self):
        return self.clf.labels()

    def save(self):
        pass

    def load(self):
        pass
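# A usage sketch for Model above; the 'text'/'label' dict keys follow
# Model.train's own expectations, and the sample data is invented.
m = Model(name='sentiment')
m.train([{'text': 'great service', 'label': 'pos'},
         {'text': 'awful wait times', 'label': 'neg'}])
print(m.evaluate('the service was great'))  # -> ('pos', <probability of that label>)
print(m.get_classes())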
class AL_Guy:
    "Secret inner class"

    def __init__(self):
        self.classifier = None
        self.action_map = dict()
        self.reveries = None
        self.jokes = None
        self.training_data = None
        self.converted_training_data = []

    def train(self, training_data):
        self.training_data = training_data
        for k in self.training_data.keys():
            tag = k
            tag_data = self.training_data[tag]
            self.action_map[tag] = tag_data['action']
            for d in tag_data['training_data']:
                self.converted_training_data.append((d, tag))
        self.classifier = NaiveBayesClassifier(
            self.converted_training_data
        )  # , feature_extractor=cmd_extractor)

    def respond(self, data):
        action_class = self.classifier.classify(data)
        # self.classifier.show_informative_features(5)
        if action_class in self.action_map:
            return self.action_map[action_class](data)
        return 'Don\'t look like anything to me.... (Let me call Bernard, head of programming WW)'
def view_video(request, id=0):
    person = last_login.objects.all().last()
    id1 = User.objects.get(username=person.username)
    videos = Video.objects.get(id=id)
    context = {'video': videos}

    df = pd.read_csv("data/feedback.csv", header=0, encoding='unicode_escape')
    df = df.dropna()
    data = []
    for index, rows in df.iterrows():
        a = (rows['statement'], rows['overview'])
        data.append(a)
    cl = NaiveBayesClassifier(data)

    if request.method == 'POST':
        form = feedback_Form(request.POST)
        if form.is_valid():
            feedback = form.cleaned_data['feedback']
            pred1 = cl.classify(feedback)
            g = feedback_model(feedback=pred1)
            g.save()
            h = user_feedback(username=id1,
                              feedback=feedback_model.objects.all().last(),
                              Date=datetime.datetime.today(),
                              video=videos)
            h.save()
            context = {'video': videos, 'msg': 'successfully stored'}
            return render(request, 'video.html', context)
    return render(request, 'video.html', context)
def bayes():
    train = [
        ('Extraordinaria reunion Al Gore y AMLO , me dio un gusto enorme conocer sus profundas coincidencias sobre Accion Climatica', 'pos'),
        ('AMLO fue el unico que estuvo a la altura.', 'pos'),
        ('AMLO a sabido ganarse nuestra confianza y por eso ahora somos muchos los que lo apoyamos. ', 'pos'),
        ('No hay otra mejor opcion en Mexico que AMLO. Los demas partidos han demostrado que trabajan para su bolsillo ', 'pos'),
        ('Al final queda la imagen de un AMLO prudente, inteligente ', 'pos'),
        ('VOTO MASIVO POR AMLO aun que les arda ', 'pos'),
        ('Arriba papa AMLO ', 'pos'),
        ('prometio trabajar junto con algore para hacer frente al cambio climatico', 'pos'),
        ('Hasta Mitofsky senala que AMLO continua creciendo, Anaya y Meade van a la baja', 'pos'),
        ('Segun El Financiero, AMLO continua siendo el preferido', 'pos'),
        ('convence a mas mexicanos y despega 4 puntos arriba en las encuestas. Inalcanzable', 'pos'),
        ('Lopitos,somos 70% de Mexicanos q no te creemos.Enganas a medio mundo.Tienes genes de dictador despistado.', 'neg'),
        ('Las contradicciones de AMLO', 'neg'),
        ('Refuta Penia a AMLO; defiende reforma energetica', 'neg'),
        ('hablan mucho de Suecia y Noriega... pero quieren votar por el que le mira hacia Venezuela', 'neg'),
        ('las propuestas de AMLO me dan pesadillas', 'pos'),
        ('El video viral que muestra la ignorancia de AMLO esta manipulado', 'neg')
    ]
    cl = NaiveBayesClassifier(train)
    print(cl.classify("Las propuestas de AMLO parecieron razonables y viables"))
    return "OK", 200
def classify_r1(self):
    lists = first_level  # main categories
    cfy = NaiveBayesClassifier(lists)
    tweet = self.get_tweet()
    one = cfy.classify(tweet)
    return one
def get_analysis(s):
    train = [
        ('I love this sandwich.', 'pos'),
        ('This is an amazing place!', 'pos'),
        ('I feel very good about these beers.', 'pos'),
        ('This is my best work.', 'pos'),
        ("What an awesome view", 'pos'),
        ('I do not like this restaurant', 'neg'),
        ('I am tired of this stuff.', 'neg'),
        ("I can't deal with this", 'neg'),
        ('He is my sworn enemy!', 'neg'),
        ('My boss is horrible.', 'neg')
    ]
    cl = NaiveBayesClassifier(train)
    tweets = Tweet.objects.filter(search_term=s)
    result = []
    for t in tweets:
        d = {}
        c = cl.classify(t.tw_text)
        d['text'] = t.tw_text
        d['res'] = c
        result.append(d)
    return result
def main():
    data = []
    train = []
    test = []
    with open('hellopeter_labelled.csv', 'rb') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',')
        spamreader = list(spamreader)
        for row in spamreader:
            if row[13] == 'strongly positive':
                data.append((row[8], 'pos'))
            if row[13] == 'positive':
                data.append((row[8], 'pos'))
            if row[13] == 'neutral':
                data.append((row[8], 'neu'))
            if row[13] == 'negative':
                data.append((row[8], 'neg'))
            if row[13] == 'strongly negative':
                data.append((row[8], 'neg'))

    train = data[:1000]
    test = data[1000:]
    for innf in test:
        print innf

    cl = NaiveBayesClassifier(train)
    for tnew in test:
        print '%%%%%%%'
        print ' '
        print tnew[0]
        print tnew[1]
        print '%%%%%%%'
        print '#######'
        cl.classify(tnew[0])
        prob_class = cl.prob_classify(tnew[0])
        print '----max prob---'
        print prob_class.max()
        print '-----+ve-----'
        print prob_class.prob("pos")
        print '-----neutral-----'
        print prob_class.prob("neu")
        print '------ve-----'
        print prob_class.prob("neg")
    cl.accuracy(test)
def qaTest():
    train = [('I love this sandwich.', 'pos'),
             ('this is an amazing place!', 'pos'),
             ('I feel very good about these beers.', 'pos'),
             ('this is my best work.', 'pos'),
             ("what an awesome view", 'pos'),
             ('I do not like this restaurant', 'neg'),
             ('I am tired of this stuff.', 'neg'),
             ("I can't deal with this", 'neg'),
             ('he is my sworn enemy!', 'neg'),
             ('my boss is horrible.', 'neg')]
    test = [('the beer was good.', 'pos'),
            ('I do not enjoy my job', 'neg'),
            ("I ain't feeling dandy today.", 'neg'),
            ("I feel amazing!", 'pos'),
            ('Gary is a friend of mine.', 'pos'),
            ("I can't believe I'm doing this.", 'neg')]
    cl = NaiveBayesClassifier(train)
    for stmt in test:
        print cl.classify(stmt[0])
import json
import math

from textblob.classifiers import NaiveBayesClassifier


def main():
    dataset = []
    with open(DATA_FILE) as f:
        dataset = json.load(f)['data']
    dataset = dataset[:850]

    # dividing the dataset into two pos and neg parts
    pos_all = [{'sentence': item['sentence'], 'label': 'pos'}
               for item in dataset if item['label'] == 1]
    neg_all = [{'sentence': item['sentence'], 'label': 'neg'}
               for item in dataset if item['label'] == 0]

    # building the train set from the first four fifths of each class
    pos_train = pos_all[:math.floor(len(pos_all) / 5) * 4]
    neg_train = neg_all[:math.floor(len(neg_all) / 5) * 4]
    train_set = pos_train + neg_train

    # preparing train_set to be fed to the classifier
    train_set = [(item['sentence'], item['label']) for item in train_set]

    # preparing the test set from the remaining fifth
    pos_test = pos_all[math.floor(len(pos_all) / 5) * 4:]
    neg_test = neg_all[math.floor(len(neg_all) / 5) * 4:]
    test_set = pos_test + neg_test

    print("Train set: {}, Pos train: {}, Neg train : {}".format(
        len(train_set), len(pos_train), len(neg_train)))
    print("Test set: {}, Pos test: {}, Neg test: {}".format(
        len(test_set), len(pos_test), len(neg_test)))
    # pdb.set_trace()

    # training the classifier
    model = NaiveBayesClassifier(train_set)

    TP = 0
    TN = 0
    FP = 0
    FN = 0
    for item in test_set:
        classification = model.classify(item['sentence'])
        if classification == item['label']:
            if classification == "pos":
                TP += 1
            else:
                TN += 1
        else:
            if classification == "pos":
                FP += 1
            else:
                FN += 1
    calculate_performace(TP, FP, TN, FN)
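# calculate_performace() is defined elsewhere in the original project; a
# minimal sketch of what such a function might derive from the four counts:
def calculate_performace(TP, FP, TN, FN):
    precision = TP / (TP + FP) if TP + FP else 0.0
    recall = TP / (TP + FN) if TP + FN else 0.0
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    print("precision={:.3f} recall={:.3f} accuracy={:.3f}".format(precision, recall, accuracy))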
class DodgyBot():
    __msg = None

    def __init__(self, ResponseHandler):
        self.recogniser = sr.Recognizer()
        self.microphone = sr.Microphone()
        self.handler = ResponseHandler
        self.chatbot = None
        self.loadIntent()

    def listen(self, listen_freq=0):
        while listen_freq == 0:
            with self.microphone as source:
                self.audio = self.recogniser.listen(source)
                return True

    def recogniseAudio(self):
        print 'Recognising...'
        response = self.recogniser.recognize_google(self.audio)
        return self.setMessage(response)

    def loadIntent(self):
        # self.chatbot = ChatBot(
        #     'Dodgy Bot',
        #     trainer='chatterbot.trainers.ChatterBotCorpusTrainer'
        # )
        # self.chatbot.train("chatterbot.corpus.english")
        with open("intent_data.json", "r") as fp:
            self.cl = NaiveBayesClassifier(fp, format="json")

    def getIntent(self):
        return self.cl.classify(self.getMessage())

    def loadResponse(self, intent):
        if intent == "greeting":
            return os.system("say '%s'" % (self.handler.greetingHandler(self.getMessage())))
        if intent == "question":
            return os.system("say '%s'" % (self.handler.questionHandler(self.getMessage())))
        return os.system("say '%s'" % (self.handler.unknownResponse(self.getMessage())))

    def setMessage(self, msg):
        DodgyBot.__msg = msg

    def getMessage(self):
        return DodgyBot.__msg

    def getChatBot(self):
        return self.chatbot
def get_tweet_sentiment(self, tweet):
    '''
    Utility function to classify sentiment of passed tweet
    using textblob's sentiment method
    '''
    # create TextBlob object of passed tweet text
    analysis = self.clean_tweet(tweet)
    # train the classifier from the JSON training file
    with open('train.json', 'r') as fp:
        cl = NaiveBayesClassifier(fp, format="json")
    # set sentiment
    result = cl.classify(analysis)
    if result == "Pos":
        return 'positive'
    elif result == "Neg":
        return 'negative'
    else:
        return 'neutral'
class TimeLogicAdapter(LogicAdapter):
    def __init__(self, **kwargs):
        super(TimeLogicAdapter, self).__init__(**kwargs)
        training_data = [
            ("what time is it", 1),
            ("do you know the time", 1),
            ("do you know what time it is", 1),
            ("what is the time", 1),
            ("it is time to go to sleep", 0),
            ("what is your favorite color", 0),
            ("i had a great time", 0),
            ("what is", 0)
        ]
        self.classifier = NaiveBayesClassifier(training_data)

    def process(self, statement, tag_processing=None):
        user_input = statement.text.lower()
        if "time" not in user_input:
            return 0, Statement("")
        try:
            # Find the user's time zone from latitude and longitude
            # so the correct local time is reported
            g = geocoders.GoogleV3()
            user = tag_processing.user
            lat, lon = user.get_latitude_longitude()
            timezone = g.timezone((lat, lon))
            now = datetime.now(timezone)
            confidence = self.classifier.classify(user_input)
            response = Statement("The current time is " + now.strftime("%I:%M %p"))
        except Exception:
            confidence = self.classifier.classify(user_input)
            response = Statement("Sorry. I cannot find the current time. "
                                 "Possible bad user location based on latitude and longitude. "
                                 "Please try again later")
        return confidence, response
def classify(desc):
    try:
        fp = open(os.path.join(os.path.abspath('.'), 'classifier/train.json'), 'r')
    except Exception:
        fp = open('train.json', 'r')
    cl = NaiveBayesClassifier(fp, format='json')
    fp.close()
    classification = cl.classify(desc)
    return classification
class NaiveBayesAnalyzer:
    cl = None

    def __init__(self):
        with open("training_data.json", "r") as f:
            self.cl = NaiveBayesClassifier(f, format="json")
        self.cl.show_informative_features(20)

    def analyze(self, text):
        return self.cl.classify(text)
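# Usage sketch for the analyzer above. textblob's JSON training format is a
# list of {"text": ..., "label": ...} objects, so "training_data.json" is
# assumed to look like: [{"text": "I love this sandwich.", "label": "pos"}, ...]
analyzer = NaiveBayesAnalyzer()
print(analyzer.analyze("I love this sandwich."))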
def classify_r2(self, one):  # second level
    if one == 'EdTech':
        lists = Edtech
        cfy = NaiveBayesClassifier(lists)
        tweet = self.get_tweet()
        two = cfy.classify(tweet)
        return two
    elif one == 'Leisure':
        lists = Leisure
        cfy = NaiveBayesClassifier(lists)
        tweet = self.get_tweet()
        two = cfy.classify(tweet)
        return two
    else:
        lists = Places
        cfy = NaiveBayesClassifier(lists)
        tweet = self.get_tweet()
        two = cfy.classify(tweet)
        return two
def classify_intent(self):
    with open(self.file, 'r') as fp:
        cl = NaiveBayesClassifier(fp, format='json')
    intent = cl.classify(self.utterance)
    test = [(self.utterance, intent)]
    accuracy = cl.accuracy(test)
    # print(accuracy, '<< Accuracy')
    # nb_clf_result = {
    #     'intent': intent,
    #     'accuracy': accuracy
    # }
    return intent
def classify_r1(self):
    lists = first_level
    cfy = NaiveBayesClassifier(lists)
    with open("tweets.txt", 'r') as tfile:
        for line in tfile:
            tweet = line.strip()
            one = cfy.classify(tweet)
    # returns the classification of the last tweet in the file
    return one
def respond():
    train = [
        ('Says the Annies List political group supports third-trimester abortions on demand.', 'false'),
        ('Donald Trump is against marriage equality. He wants to go back.', 'true'),
        ('Says nearly half of Oregons children are poor.', 'true'),
        ('State revenue projections have missed the mark month after month.', 'true'),
        ("In the month of January, Canada created more new jobs than we did.", 'true'),
        ('If people work and make more money, they lose more in benefits than they would earn in salary.', 'false'),
        ('Originally, Democrats promised that if you liked your health care plan, you could keep it. One year later we know that you need a waiver to keep your plan.', 'false'),
        ("We spend more money on antacids than we do on politics.", 'false'),
        ('Barack Obama and Joe Biden oppose new drilling at home and oppose nuclear power.', 'false'),
        ('President Obama once said he wants everybody in America to go to college.', 'false')
    ]
    test = [
        ('Because of the steps we took, there are about 2 million Americans working right now who would otherwise be unemployed.', 'true'),
        ('Scientists project that the Arctic will be ice-free in the summer of 2018', 'false'),
        ("You cannot build a little guy up by tearing a big guy down -- Abraham Lincoln said it.", 'false'),
        ("One man opposed a flawed strategy in Iraq. One man had the courage to call for change. One man didn't play politics with the truth.", 'true'),
        ('When I was governor, not only did test scores improve we also narrowed the achievement gap.', 'true'),
        ("Ukraine was a nuclear-armed state. They gave away their nuclear arms with the understanding that we would protect them.", 'false')
    ]
    cl = NaiveBayesClassifier(train)
    print("your test accuracy is ", cl.accuracy(test))

    # Retrieve the message from the url parameter
    message = request.args.get("message", None)
    response = {}
    # Check if the user sent a message at all
    if not message:
        response["ERROR"] = "no user input found, please send a message."
        return jsonify(response)
    # Otherwise the user sent a valid message
    classified_text = cl.classify(message)
    return jsonify({"Message": f"{message}", "result": f"{classified_text}"})
class TwitterTrendAdapter(LogicAdapter):
    def __init__(self, **kwargs):
        super(TwitterTrendAdapter, self).__init__(**kwargs)
        training_data = [
            ("what's trending in ", 1),
            ('what is trending in', 1),
            ('what is', 0),
            ('who is', 0),
            ('who was', 0),
            ('what can you tell me about', 0),
            ('what do you know about', 0),
            ('any clue about', 0),
            ('where is', 0),
            ('located', 0),
            ('what is happening', 1)
        ]
        self.classifier = NaiveBayesClassifier(training_data)

    def process(self, statement):
        confidence = self.classifier.classify(statement.text.lower())
        tokens = nltk.word_tokenize(str(statement))
        tagged = nltk.pos_tag(tokens)
        nouns = [word for word, pos in tagged
                 if pos in ('NN', 'NNP', 'JJ', 'NNS', 'NNPS')]

        auth = OAuthHandler(twitter_consumer_key, twitter_consumer_secret)
        auth.set_access_token(twitter_access_key, twitter_access_secret)
        api = tweepy.API(auth)

        trendsName = ""
        for noun in nouns:
            try:
                html = urllib.urlopen(
                    'http://where.yahooapis.com/v1/places.q(' + noun + ')?appid='
                    + yahoo_client_Id).read()
                soup = BeautifulSoup(html, 'html.parser')
                woeids = soup.find('woeid').contents
                for woeid in woeids:
                    id = ' '.join(woeid.string.split())
                    trends1 = api.trends_place(str(id))
                    data = trends1[0]
                    # grab the trends
                    trends = data['trends']
                    names1 = [trend['name'] for trend in trends]
                    trendsName += ' '.join(names1)
            except Exception:
                pass

        if len(nouns) != 0 and len(trendsName) != 0:
            response = Statement("Jarvis: " + trendsName)
        else:
            response = Statement("")
            confidence = 0
        return confidence, response
def classify_r2(self, one):
    if one == 'Technology':
        lists = Technology
        cfy = NaiveBayesClassifier(lists)
        with open("tweets.txt", 'r') as tfile:
            for line in tfile:
                tweet = line.strip()
                two = cfy.classify(tweet)
        return two
def text_classification_with_naive_bayes(text):
    import random

    from textblob.classifiers import NaiveBayesClassifier

    # key words
    dictionary = DB.GET_dictionary_from_DB()
    train = []
    for type in dictionary:
        for word in type.key_words:
            to_add = (word.lower(), Utils.get_data_id_lower(type.type_name))
            train.append(to_add)
    cl = NaiveBayesClassifier(train)
    result = cl.classify(text.lower())
    print('According to key words:', Utils.get_data_name(result))
    # prob_dist = cl.prob_classify(0)
    # prob_dist.max()
    # print(round(prob_dist.prob(0), 12))

    # secondary words
    dictionary = DB.GET_dictionary_from_DB()
    train = []
    to_add_list = []
    aux = []
    # for the secondary words, take a random sample to balance the classes
    # (otherwise the classifier always predicts soccer)
    for type in dictionary:
        for word in type.secondary_words:
            to_add_list.append((word.lower(), type.type_name))
    aux = random.sample(to_add_list, 50)  # take 50 at random
    to_add_list = []
    for add in aux:
        train.append(add)
    cl = NaiveBayesClassifier(train)
    result = cl.classify(text.lower())
    print('According to secondary words:', result)
def classify_v1(text):
    # a <str> is passed in; basic_cleanning() returns a <list> of tokens
    text = bc.basic_cleanning(text)
    # print(text)
    if text != []:
        with open('train_dataset.csv') as csv_file:
            cl = NaiveBayesClassifier(csv_file, format="csv")
        # cl = NaiveBayesClassifier()  # or pass the dataset as a list
        result = cl.classify(text)  # result is a <str> label
        prob_dist = cl.prob_classify(text)
        pos_result = round(prob_dist.prob("pos"), 2)
        neg_result = round(prob_dist.prob("neg"), 2)
        return result
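# For reference, the "csv" format textblob expects in classify_v1 above is two
# columns, text then label, with no header row. A hypothetical
# train_dataset.csv might look like:
#
#   I love this sandwich.,pos
#   I do not like this restaurant,neg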
def textblob_naivebayes(train_data, test_data, classify_filter):
    cl = NaiveBayesClassifier(train_data)
    print(
        "\nF1 score:",
        metrics.classification_report(
            [i[1] for i in test_data],
            [cl.classify(i[0]) for i in test_data]
            # , pos_label="account_api"
            # , average=None
        ))
    classify_data = classify_filter['SUBJECT'].unique()
    temp1 = []
    temp2 = []
    for i in classify_data:
        # print(i, cl.classify(i))
        temp1.append(i)
        temp2.append(cl.classify(i))
    classify_dataframe = pandas.DataFrame(
        numpy.column_stack([temp1, temp2]),
        columns=['subject', 'predicted_label'])
    with open('classifier.csv', 'w') as f:
        classify_dataframe.to_csv(f)
def klasify(text):
    hasil = 'mct'
    train = [
        ('lancar', 'lcr'),
        ('peningkatan volume kendaraan', 'mct'),
        ('ramai', 'mct'),
        ('normal', 'lcr'),
        ('padat merayap', 'mct'),
        ('meriah', 'mct'),
        ('antrian', 'mct'),
        ('penyempitan', 'mct'),
        ('sepi', 'lcr'),
        ('gabisa lewat', 'mct'),
        ('macet', 'mct'),
    ]
    cl = NaiveBayesClassifier(train)
    cl.classify("macet")  # sanity check: should come back 'mct'
    blob = TextBlob(text, classifier=cl)
    for s in blob.sentences:
        hasil = s.classify()
        # print(s.classify())
    return hasil
class TwitterTagAdapter(LogicAdapter):
    # handles e.g. "what's trending in <city>", movie reviews,
    # and what people are saying about some topic
    def __init__(self, **kwargs):
        super(TwitterTagAdapter, self).__init__(**kwargs)
        training_data = [
            ('what are people talking about', 1),
            ("what's trending in", 0),
            ('what is going on with', 1),
            ('what are reviews', 1),
            ('what is going on', 1),
            ('tweetind', 1),
            ('what can you tell me about', 0),
            ('what do you know about', 0),
            ('any clue about', 0),
            ('where is', 0),
            ('located', 0),
            ('what is happening', 0)
        ]
        self.classifier = NaiveBayesClassifier(training_data)

    def process(self, statement):
        confidence = self.classifier.classify(statement.text.lower())
        tokens = nltk.word_tokenize(str(statement))
        tagged = nltk.pos_tag(tokens)
        nouns = [word for word, pos in tagged
                 if pos in ('NN', 'NNP', 'JJ', 'NNS', 'NNPS')]
        downcased = [x.lower() for x in nouns]
        searchTerm = " ".join(downcased).encode('utf-8')
        st = ""
        if len(nouns) != 0:
            auth = OAuthHandler(twitter_consumer_key, twitter_consumer_secret)
            auth.set_access_token(twitter_access_key, twitter_access_secret)
            api = tweepy.API(auth)
            for status in tweepy.Cursor(api.search, q='#' + searchTerm).items(20):
                st = st + status.text
            response = Statement("Jarvis: " + st)
        else:
            response = Statement("Jarvis: " + "Sorry sir, Nothing Found")
        return confidence, response
class Scraper():
    def __init__(self, training_data):
        self.cl = NaiveBayesClassifier(training_data)

    def classifier(self, data):
        return self.cl.classify(data)

    def fetch_data(self):
        BASEURL = "https://news.ycombinator.com/news?p="
        for n in range(1):
            r = requests.get(BASEURL + str(n))
            soup = BeautifulSoup(r.content, "html.parser")
            for title in soup.findAll('tr', {'class': 'athing'}):
                # Fetch title and link
                for t in title.findAll('a', text=True):
                    art_title = t.text.encode("utf8")
                    art_link = t['href']
                    print(self.classifier(art_title), art_title)
class App(DictMixin):
    """
    Aggregation of intents.
    """

    def __init__(self, name, greeting):
        self.name = name
        self.greeting = greeting
        self.intents = {}
        self.classifier = None

    def __getitem__(self, key):
        return self.intents[key]

    def __setitem__(self, key, value):
        self.intents[key] = value
        # train the classifier on the intent's phrase file
        phrase_file = open(value.phrases, 'r')
        phrase_data = yaml.safe_load(phrase_file)
        phrases = [(phrase, value.name) for phrase in phrase_data['Phrases']]
        if self.classifier:
            self.classifier.update(phrases)
        else:
            self.classifier = Classifier(phrases)

    def __delitem__(self, key):
        del self.intents[key]

    def keys(self):
        return self.intents.keys()

    def intent_for(self, phrase):
        """
        Attempt to match an intent to the supplied phrase,
        using the onboard classifier.
        """
        if not self.classifier:
            # the classifier has not been properly initialized
            raise IntentNotFound('Classifier not initialized')
        try:
            return self.intents[self.classifier.classify(phrase)]
        except KeyError:
            raise IntentNotFound
def nayebayesreport(fileFullPath):
    print "nayebayesreport came"
    print (fileFullPath)
    sentimentDtls = []
    patternCountMap = {
        "Negative": 0,
        "Positive": 0,
        "Neutral": 0,
        "Total": 0,
    }
    cl = NaiveBayesClassifier(getTrainData())
    print "train data loaded"
    with open(fileFullPath, 'r') as f:
        for line in f:
            try:
                print line
                if line and len(line.strip()) > 0:
                    trainedResult = cl.classify(line)
                    patternResult = "Negative"
                    if "pos" == trainedResult:
                        patternResult = "Positive"
                    patternCountMap[patternResult] = patternCountMap[patternResult] + 1
                    patternCountMap["Total"] = patternCountMap["Total"] + 1
                    sentimentDtls.append({
                        "sentiment": patternResult,
                        "feedback": line
                    })
            except Exception:
                print(traceback.format_exc())
                print(line)
    addBayesClassifierResult(sentimentDtls)
    return
class Classifier:
    def __init__(self):
        self.cachedStopWords = stopwords.words("english")
        self.path = os.path.dirname(os.path.abspath(__file__))

    def train(self, train_set):
        train_data = []
        for t in train_set:
            train_data.append((self._cvobj_to_string(t[0]), t[1]))
        print "Training model..."
        # print train_data
        self.cl = NaiveBayesClassifier(train_data)
        # print self._cvobj_to_string(train_set[0][0])

    def _cvobj_to_string(self, cv):
        text = ""
        for exp in cv['experience']:
            text += (exp['description'] + " ")
        for proj in cv['project']:
            text += (proj['title'] + " ")
            text += (proj['description'] + " ")
        for skill in cv['skill']:
            text += (skill + " ")
        text = text.decode("utf-8", "replace")
        text = ' '.join([word for word in text.split()
                         if word not in self.cachedStopWords])
        return text

    def classify(self, cv):
        return self.cl.classify(self._cvobj_to_string(cv))

    def save(self):
        pickle.dump(self.cl, open(self.path + "/cv_model.cvm", "wb"))
        print "CV classifier saved."

    def load(self):
        self.cl = pickle.load(open(self.path + "/cv_model.cvm", "rb"))
        print "CV classifier loaded."
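# Usage sketch for the CV Classifier above (Python 2, matching the snippet).
# The cv dicts with 'experience', 'project' and 'skill' keys and the
# labelled_cvs list are assumptions of the example:
#
#   clf = Classifier()
#   clf.train(labelled_cvs)   # list of (cv_dict, label) tuples
#   clf.save()                # pickle the trained model to cv_model.cvm
#   clf.load()                # restore it later without retraining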
def NaiveBayesAnalyzerParser(text):
    train = [('creates jobs', 'pos'),
             ('create jobs', 'pos'),
             ('created jobs', 'pos'),
             ('new jobs', 'pos'),
             ('jobs wanted', 'pos'),
             ('jobs needed', 'pos'),
             ('jobs call by', 'pos'),
             ('unemployment falls', 'pos'),
             ('bring jobs', 'pos'),
             ('jobs comming', 'pos'),
             ('unemployment drops', 'pos'),
             ('cut jobs', 'neg'),
             ('cutting jobs', 'neg'),
             ('cuts jobs', 'neg'),
             ('lost jobs', 'neg'),
             ('job loss', 'neg'),
             ('losing jobs', 'neg'),
             ('lose jobs', 'neg'),
             ('jobs not kept', 'neg'),
             ('jobs trim', 'neg'),
             ('unemployment rises', 'neg'),
             ('drops', 'neg'),
             ('drop', 'neg'),
             ('dollar falls', 'neg'),
             ]
    cl = NaiveBayesClassifier(train)
    sentiment = TextBlob(text, analyzer=NaiveBayesAnalyzer()).sentiment
    # e.g. Sentiment(classification='pos', p_pos=0.6023632501327671, p_neg=0.3976367498672331)
    # print(sentiment)
    subjectivity = 1 - (max(sentiment.p_pos, sentiment.p_neg)
                        - min(sentiment.p_pos, sentiment.p_neg))
    if cl.classify(text) == 'pos':
        return (sentiment.p_pos, subjectivity)
    else:
        return (sentiment.p_neg * -1, subjectivity)
class TestNaiveBayesClassifier(unittest.TestCase):

    def setUp(self):
        self.classifier = NaiveBayesClassifier(train_set)

    def test_default_extractor(self):
        text = "I feel happy this morning."
        assert_equal(self.classifier.extract_features(text),
                     basic_extractor(text, train_set))

    def test_classify(self):
        res = self.classifier.classify("I feel happy this morning")
        assert_equal(res, 'positive')
        assert_equal(len(self.classifier.train_set), len(train_set))

    def test_classify_a_list_of_words(self):
        res = self.classifier.classify(["I", "feel", "happy", "this", "morning"])
        assert_equal(res, "positive")

    def test_train_from_lists_of_words(self):
        # classifier can be trained on lists of words instead of strings
        train = [(doc.split(), label) for doc, label in train_set]
        classifier = NaiveBayesClassifier(train)
        assert_equal(classifier.accuracy(test_set),
                     self.classifier.accuracy(test_set))

    def test_prob_classify(self):
        res = self.classifier.prob_classify("I feel happy this morning")
        assert_equal(res.max(), "positive")
        assert_true(res.prob("positive") > res.prob("negative"))

    def test_accuracy(self):
        acc = self.classifier.accuracy(test_set)
        assert_true(isinstance(acc, float))

    def test_update(self):
        res1 = self.classifier.prob_classify("lorem ipsum")
        original_length = len(self.classifier.train_set)
        self.classifier.update([("lorem ipsum", "positive")])
        new_length = len(self.classifier.train_set)
        res2 = self.classifier.prob_classify("lorem ipsum")
        assert_true(res2.prob("positive") > res1.prob("positive"))
        assert_equal(original_length + 1, new_length)

    def test_labels(self):
        labels = self.classifier.labels()
        assert_true("positive" in labels)
        assert_true("negative" in labels)

    def test_show_informative_features(self):
        feats = self.classifier.show_informative_features()

    def test_informative_features(self):
        feats = self.classifier.informative_features(3)
        assert_true(isinstance(feats, list))
        assert_true(isinstance(feats[0], tuple))

    def test_custom_feature_extractor(self):
        cl = NaiveBayesClassifier(train_set, custom_extractor)
        cl.classify("Yay! I'm so happy it works.")
        assert_equal(cl.train_features[0][1], 'positive')

    def test_init_with_csv_file(self):
        with open(CSV_FILE) as fp:
            cl = NaiveBayesClassifier(fp, format="csv")
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_csv_file_without_format_specifier(self):
        with open(CSV_FILE) as fp:
            cl = NaiveBayesClassifier(fp)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_json_file(self):
        with open(JSON_FILE) as fp:
            cl = NaiveBayesClassifier(fp, format="json")
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_json_file_without_format_specifier(self):
        with open(JSON_FILE) as fp:
            cl = NaiveBayesClassifier(fp)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_custom_format(self):
        redis_train = [('I like turtles', 'pos'), ('I hate turtles', 'neg')]

        class MockRedisFormat(formats.BaseFormat):
            def __init__(self, client, port):
                self.client = client
                self.port = port

            @classmethod
            def detect(cls, stream):
                return True

            def to_iterable(self):
                return redis_train

        formats.register('redis', MockRedisFormat)
        mock_redis = mock.Mock()
        cl = NaiveBayesClassifier(mock_redis, format='redis', port=1234)
        assert_equal(cl.train_set, redis_train)

    def test_data_with_no_available_format(self):
        mock_fp = mock.Mock()
        mock_fp.read.return_value = ''
        assert_raises(FormatError, lambda: NaiveBayesClassifier(mock_fp))

    def test_accuracy_on_a_csv_file(self):
        with open(CSV_FILE) as fp:
            a = self.classifier.accuracy(fp)
        assert_equal(type(a), float)

    def test_accuracy_on_json_file(self):
        with open(CSV_FILE) as fp:
            a = self.classifier.accuracy(fp)
        assert_equal(type(a), float)

    def test_init_with_tsv_file(self):
        with open(TSV_FILE) as fp:
            cl = NaiveBayesClassifier(fp)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_bad_format_specifier(self):
        assert_raises(ValueError,
                      lambda: NaiveBayesClassifier(CSV_FILE, format='unknown'))

    def test_repr(self):
        assert_equal(repr(self.classifier),
                     "<NaiveBayesClassifier trained on {0} instances>".format(len(train_set)))
# Train classifier with training data set
train = open("training_data.tsv", 'r')
cl = NaiveBayesClassifier(train, format="tsv")
train.close()

testing = open("testing_data.txt", 'r')
classify = open("classified_data.txt", 'w')
for line in testing:
    line = line.rstrip('\n')
    line = line.split('\t')
    pmid = line[0]
    sent = line[1]
    try:
        # classify the sentence into one of four groups
        classify_result = cl.classify(sent)
        newLine = pmid + '\t' + sent + '\t' + classify_result + '\n'
        classify.write(newLine)
    except Exception:
        pass
    # prob_dist = cl.prob_classify(line)  # Might use this later for a future network graph.
    # assoc = round(prob_dist.prob("assoc"), 2)  # For now will just use the highest probability match for each sentence.
    # found = round(prob_dist.prob("found"), 2)
    # isA = round(prob_dist.prob("is"), 2)
    # involve = round(prob_dist.prob("involve"), 2)
    # newLine = line+'\tAssociation\t'+str(assoc)+'\tFoundIn\t'+str(found)+'\tIsA\t'+str(isA)+'\tInvolve\t'+str(involve)+'\n'
testing.close()
classify.close()
from textblob.classifiers import NaiveBayesClassifier

train = [
    ('I love this sandwich.', 'pos'),
    ('This is an amazing place!', 'pos'),
    ('I feel very good about these beers.', 'pos'),
    ('This is my best work.', 'pos'),
    ("What an awesome view", 'pos'),
    ('I do not like this restaurant', 'neg'),
    ('I am tired of this stuff.', 'neg'),
    ("I can't deal with this", 'neg'),
    ('He is my sworn enemy!', 'neg'),
    ('My boss is horrible.', 'neg'),
    ("I like big butts and I cannot lie", "butts")
]
test = [
    ('The beer was good.', 'pos'),
    ('I do not enjoy my job', 'neg'),
    ("I ain't feeling dandy today.", 'neg'),
    ("I feel amazing!", 'pos'),
    ('Gary is a friend of mine.', 'pos'),
    ("I can't believe I'm doing this.", 'neg')
]
cl = NaiveBayesClassifier(train)
print cl.accuracy(test)
print cl.classify("Their burgers are amazing")  # "pos"
print cl.classify("I don't hate you.")  # "neg"
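# The same classifier also exposes a full probability distribution; a short
# follow-on sketch (Python 2, matching the snippet above):
prob_dist = cl.prob_classify("Their burgers are amazing")
print prob_dist.max()                  # "pos"
print round(prob_dist.prob("pos"), 2)
print round(prob_dist.prob("neg"), 2)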
        try:
            post_body = post_body.decode("utf-8")
        except UnicodeDecodeError:
            continue
        test.append([post_body, i_file])

Bayes = NaiveBayesClassifier(train)
print os.getcwd()
pos = []
neg = []
for body in test:
    judge = Bayes.classify(body[0])
    if judge == "positive":
        call(["mv", "./" + body[1], "john/"])
        os.getcwd()
    if judge == "negative":
        call(["mv", "./" + body[1], "non_john/"])

os.mkdir("hard_to_classify")
remaining = glob.glob("*.html")
for doc in remaining:
    call(["mv", "./" + doc, "hard_to_classify/"])

# print Bayes.accuracy(test)
print Bayes.show_informative_features(10)
files = glob.glob(Propath)
for name in files:
    with open(name) as f:
        text = f.read()
    text = text.replace("\n", " ")
    text = unicode(text, "utf-8", errors="ignore")
    data.append((text, "pro"))
    i += 1

files = glob.glob(NonPropath)
for name in files:
    with open(name) as f:
        text = f.read()
    text = text.replace("\n", " ")
    text = unicode(text, "utf-8", errors="ignore")
    data.append((text, "non-pro"))

random.shuffle(data)

number_of_elements = len(data)
split = (number_of_elements / 3) * 2
train = data[:split]
test = data[split:]
# print 'content of line 5 ', train[4]

cl = NaiveBayesClassifier(train)
cl.accuracy(test)
cl.classify(
    "Your symptoms may be caused due to a musculo-skeletal strain. I would advise you to take OTC pain-killers/NSAIDS and see if it helps. Rest and ice will also help to relieve the symptoms. If the pain does not get better, you may need to visit your doctor for a physical examination. X-rays will usually be normal in most cases."
)
    exit(error)

print("Importing...")
a = time.time()
data_tuples = get_training_tuples(sys.argv[1])
print("Data import time: {0}s".format(time.time() - a))

# Shuffle data: first 250 will be training set; last 250, the test set
random.seed(1)
random.shuffle(data_tuples)
training = data_tuples[:250]
test = data_tuples[250:500]

# Train classifier
print("Training...")
a = time.time()
cl = NaiveBayesClassifier(training)
print("Training time: {0}s".format(time.time() - a))

# Test classifier
print("Accuracy: {0}".format(str(cl.accuracy(test))))

# Classify stuff
while True:
    text = input("Enter text to classify or 'q' to quit: ")
    if text == 'q':
        print("Exiting")
        break
    else:
        print("Class: {0}".format(cl.classify(text)))
if os.path.exists('/home/lakeesh10/Documents/projectdemo/naivebayes_classifier.pickle'):
    print "file exist"
    naive = load_naiveclassifier()
else:
    naive = NaiveBayesClassifier(train)
    save_naiveclassifier(naive)
print "Naive Bayes Trained"

if os.path.exists('/home/lakeesh10/Documents/projectdemo/decisiontree_classifier.pickle'):
    decision = load_decisionclassifier()
else:
    decision = DecisionTreeClassifier(train)
    save_decisionclassifier(decision)
print "Decision Tree Trained"

print("Naive Bayes : ", naive.classify("fried chip good and crunchy dig thattaco tropical omg so eyeopening"))
# print(decision.classify("fried chip good and crunchy dig thattaco tropical omg so eyeopening"))

cl = NaiveBayesAnalyzer()
print (cl.analyze("fried chip good and crunchy dig thattaco tropical omg so eyeopening"))

blob = TextBlob("fried chip good and crunchy dig thattaco tropical omg so eyeopening")
polarity = 0
i = 0
for sentence in blob.sentences:
    polarity = polarity + sentence.sentiment.polarity
    i = i + 1
polarity = polarity / i
print(polarity)

negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')
# Training the classifier on the headline dataset
with open("dataset1.json", 'r', encoding="utf-8-sig") as fp1:
    cl1 = NaiveBayesClassifier(fp1, format="json")

# Training the classifier on the body dataset
with open("dataset2.json", 'r', encoding="utf-8-sig") as fp2:
    cl2 = NaiveBayesClassifier(fp2, format="json")

# Taking the string values
str1 = str(headline)
headline = TextBlob(str1)
body = str(body)
tb_body = TextBlob(body)

# Finding the subjectivity
subjectivity = tb_body.sentiment.subjectivity
subjectivity = float(subjectivity) * 100
body_classify = str(cl2.classify(body))
body = body.lower()

headline = headline.replace('Was', '')
headline = headline.replace('was', '')
headline = headline.replace('’', '')

# Finding the tags in the sentence
array = headline.tags
array1 = []

# Finding the hot words
for ii in array:
    name, tag = ii
    name = str(name)
#!/usr/bin/python
# -*- coding: utf-8 -*-
from textblob.classifiers import NaiveBayesClassifier
import codecs

train_data = [("bu ürün çok güzel".decode('utf8'), 'pos'),
              ('çok memnunum'.decode('utf8'), 'pos'),
              ('Çok uygun fiyata çok güzel ürün'.decode('utf8'), 'pos'),
              ('Tek kelimeyle harika', 'pos'),
              ('beğenmedim'.decode('utf-8'), 'neg'),
              ('hiç iyi bir ürün değil'.decode('utf8'), 'neg'),
              ('almayın bence'.decode('utf-8'), 'neg')]

reviews = []
with open('../scraping_reviews/pos.txt') as file:
    for line in file:
        reviews.append((line.decode('utf8'), "pos"))
with open('../scraping_reviews/neg.txt') as file:
    for line in file:
        reviews.append((line.decode('utf8'), 'neg'))

print reviews[len(reviews) - 2]

cl = NaiveBayesClassifier(reviews)
print cl.classify('ürünü gerçekten beğenmedim'.decode('utf8'))
"Korban diajak tersangka ke musala di dekat pondok. Saat kondisi sepi dan hanya berdua dengan korban, tersangka mencabuli korban," kata Wahyu kepada wartawan, Minggu (20/3/2016). Lantaran menganggap Nurul sebagai Gus, korban pun tak berani menolak permintaan tersangka. Terlebih lagi, tersangka membujuk korban bahwa perbuatan cabul itu untuk memasukkan ilmu kebatinan ke tubuh korban. "Tersangka berdalih untuk mengajari korban ilmu tasawuf. Nyatanya itu hanya untuk memuluskan niat tersangka agar bisa mencabuli korban," ungkapnya. Menurut Wahyu, perbuatan cabul itu dilakukan tersangka kepada korban berulang kali selama 2 tahun terakhir. Bahkan korban diminta membayar uang kepada tersangka setiap kali usai melakukan pencabulan. Nilainya antara Rp 200.000 hingga jutaan rupiah. "Tersangka juga meminta uang dari korban berulang kali. Total kerugian korban Rp 40 juta," sebutnya. Tak tahan dengan perbuatan Nurul, lanjut Wahyu, korban pun memutuskan buka mulut ke teman sesama santri. Mendapat dukungan dari teman-temannya, korban memberanikan diri melapor ke Polres Jombang, Kamis (17/3). Pada hari yang sama, polisi memutuskan menjebak tersangka. "Saat korban menyerahkan uang yang terakhir kepada tersangka, saat itu tersangka langsung kami tangkap," jelasnya. Akibat perbuatannya, kini Nurul harus mendekam di Rutan Polres Jombang. Tersangka dijerat dengan Pasal 80 ayat (1) juncto Pasal 82 ayat (1) UU RI No 35 Tahun 2014 tentang Perlindungan Anak dengan ancaman pidana maksimal 15 tahun penjara. "Kalau ada yang merasa menjadi korban perbuatan tersangka ini, jangan malu melapor, akan kami jaga identitasnya. Karena itu bisa memberatkan tersangka," pungkasnya. """ tic = timeit.default_timer() renum = ''.join([i for i in text if not i.isdigit()]) text = stem_words(renum) print("text diatas setelah diklasifikasi yaitu %s\n" % cl.classify(text)) toc = timeit.default_timer() print ("waktu klasifikasi : ") print(toc-tic) print(cl.show_informative_features(20)) # classifier = TextBlob(stemstop_output, classifier=cl) # print(classifier.classify())
def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features


featuresets = [(extract_features(d), c) for (d, c) in tweets]
print('featuresets: ', len(featuresets))
train_set, test_set = featuresets[:80], featuresets[80:]

blob = TextBlob("It's not the worst.", analyzer=NaiveBayesAnalyzer())
print(blob.sentiment)
blob = TextBlob("It's not the worst")
print(blob.sentiment)

cl = NaiveBayesClassifier(train)
print(cl.classify("It's not the worst"))
# tb = Blobber(analyzer=NaiveBayesAnalyzer())
# print(tb("DonaldTrump under fire for comments about women weigh in on").sentiment)
class BankClassify():

    def __init__(self, data="AllData.csv"):
        """Load in the previous data (by default from AllData.csv) and initialise the classifier"""
        if os.path.exists(data):
            self.prev_data = pd.read_csv(data)
        else:
            self.prev_data = pd.DataFrame(columns=['date', 'desc', 'amount', 'cat'])

        self.classifier = NaiveBayesClassifier(self._get_training(self.prev_data), self._extractor)

    def add_data(self, filename):
        """Add new data and interactively classify it.

        Arguments:
         - filename: filename of Santander-format file
        """
        self.new_data = self._read_santander_file(filename)

        self._ask_with_guess(self.new_data)

        self.prev_data = pd.concat([self.prev_data, self.new_data])
        self.prev_data.to_csv("AllData.csv", index=False)

    def _prep_for_analysis(self):
        """Prepare data for analysis in pandas, setting index types and subsetting"""
        self.prev_data = self._make_date_index(self.prev_data)

        self.prev_data['cat'] = self.prev_data['cat'].str.strip()

        self.inc = self.prev_data[self.prev_data.amount > 0]
        self.out = self.prev_data[self.prev_data.amount < 0]
        self.out.amount = self.out.amount.abs()

        self.inc_noignore = self.inc[self.inc.cat != 'Ignore']
        self.inc_noexpignore = self.inc[(self.inc.cat != 'Ignore') & (self.inc.cat != 'Expenses')]

        self.out_noignore = self.out[self.out.cat != 'Ignore']
        self.out_noexpignore = self.out[(self.out.cat != 'Ignore') & (self.out.cat != 'Expenses')]

    def _read_categories(self):
        """Read list of categories from categories.txt"""
        categories = {}

        with open('categories.txt') as f:
            for i, line in enumerate(f.readlines()):
                categories[i] = line.strip()

        return categories

    def _add_new_category(self, category):
        """Add a new category to categories.txt"""
        with open('categories.txt', 'a') as f:
            f.write('\n' + category)

    def _ask_with_guess(self, df):
        """Interactively guess categories for each transaction in df, asking each time if the guess
        is correct"""
        # Initialise colorama
        init()

        df['cat'] = ""

        categories = self._read_categories()

        for index, row in df.iterrows():
            # Generate the category numbers table from the list of categories
            cats_list = [[idnum, cat] for idnum, cat in categories.items()]
            cats_table = tabulate(cats_list)

            stripped_text = self._strip_numbers(row['desc'])

            # Guess a category using the classifier (only if there is data in the classifier)
            if len(self.classifier.train_set) > 1:
                guess = self.classifier.classify(stripped_text)
            else:
                guess = ""

            # Print list of categories
            print(chr(27) + "[2J")
            print(cats_table)
            print("\n\n")
            # Print transaction
            print("On: %s\t %.2f\n%s" % (row['date'], row['amount'], row['desc']))
            print(Fore.RED + Style.BRIGHT + "My guess is: " + str(guess) + Fore.RESET)

            input_value = input("> ")

            if input_value.lower() == 'q':
                # If the input was 'q' then quit
                return df
            if input_value == "":
                # If the input was blank then our guess was right!
                df.ix[index, 'cat'] = guess
                self.classifier.update([(stripped_text, guess)])
            else:
                # Otherwise, our guess was wrong
                try:
                    # Try converting the input to an integer category number
                    # If it works then we've entered a category
                    category_number = int(input_value)
                    category = categories[category_number]
                except ValueError:
                    # Otherwise, we've entered a new category, so add it to the
                    # list of categories
                    category = input_value
                    self._add_new_category(category)
                    categories = self._read_categories()

                # Write correct answer
                df.ix[index, 'cat'] = category
                # Update classifier
                self.classifier.update([(stripped_text, category)])

        return df

    def _make_date_index(self, df):
        """Make the index of df a Datetime index"""
        df.index = pd.DatetimeIndex(df.date.apply(dateutil.parser.parse, dayfirst=True))

        return df

    def _read_santander_file(self, filename):
        """Read a file in the plain text format that Santander provides downloads in.

        Returns a pd.DataFrame with columns of 'date', 'desc' and 'amount'."""
        with open(filename, errors='replace') as f:
            lines = f.readlines()

        dates = []
        descs = []
        amounts = []

        for line in lines[4:]:
            line = "".join(i for i in line if ord(i) < 128)
            if line.strip() == '':
                continue

            splitted = line.split(":")
            category = splitted[0]
            data = ":".join(splitted[1:])

            if category == 'Date':
                dates.append(data.strip())
            elif category == 'Description':
                descs.append(data.strip())
            elif category == 'Amount':
                just_numbers = re.sub("[^0-9\.-]", "", data)
                amounts.append(just_numbers.strip())

        df = pd.DataFrame({'date': dates, 'desc': descs, 'amount': amounts})

        df['amount'] = df.amount.astype(float)
        df['desc'] = df.desc.astype(str)
        df['date'] = df.date.astype(str)

        return df

    def _get_training(self, df):
        """Get training data for the classifier, consisting of tuples of
        (text, category)"""
        train = []
        subset = df[df['cat'] != '']

        for i in subset.index:
            row = subset.ix[i]
            new_desc = self._strip_numbers(row['desc'])
            train.append((new_desc, row['cat']))

        return train

    def _extractor(self, doc):
        """Extract tokens from a given string"""
        # TODO: Extend to extract words within words
        # For example, MUSICROOM should give MUSIC and ROOM
        tokens = self._split_by_multiple_delims(doc, [' ', '/'])

        features = {}

        for token in tokens:
            if token == "":
                continue
            features[token] = True

        return features

    def _strip_numbers(self, s):
        """Strip numbers from the given string"""
        return re.sub("[^A-Z ]", "", s)

    def _split_by_multiple_delims(self, string, delims):
        """Split the given string by the list of delimiters given"""
        regexp = "|".join(delims)

        return re.split(regexp, string)
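# Usage sketch for BankClassify above; "statement.txt" is a hypothetical
# Santander-format download, while AllData.csv is the class's own default.
bc = BankClassify("AllData.csv")
bc.add_data("statement.txt")  # interactively classify the new transactions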
            for word in words:
                if word not in stopwords.words() and not word.isdigit():
                    list_tuples.append((word.lower(), tabsep[0]))
            c += 1
            if c == 500:
                break
    return list_tuples

print 'importing data...'
a = time.time()
entire_data = get_list_tuples("dataset.txt")
print "It took " + str(time.time() - a) + " seconds to import data"
print 'data imported'

random.seed(1)
random.shuffle(entire_data)
train = entire_data[:750]
test = entire_data[750:1500]

print 'training data'
a = time.time()
cl = NaiveBayesClassifier(train)
print "It took " + str(time.time() - a) + " seconds to train data"
print 'data trained, now checking accuracy:'
accuracy = cl.accuracy(test)
print "accuracy: " + str(accuracy)
cl.show_informative_features(5)

x = ""
while (x != "exit"):
    x = raw_input("enter a email to check if it is a spam email or not , type exit to exit \n")
    print cl.classify(x)