def __init__(self, parent, guesser=None, itemClass=None):
    self.status = StatusBar(parent)
    self.status.pack(side=BOTTOM, fill=X)
    Frame.__init__(self, parent)
    self.pack(side=TOP, fill=BOTH)
    self.itemsPerPage = 20
    self.rows = []
    for i in range(self.itemsPerPage):
        self.rows.append(ItemRow())
    self.items = []
    self.files = []
    self.cursor = 0
    self.dirty = False
    if guesser is None:
        from reverend.thomas import Bayes
        self.guesser = Bayes()
    else:
        self.guesser = guesser
    if itemClass is None:
        self.itemClass = TextItem
    else:
        self.itemClass = itemClass
    for row in self.rows:
        row.summary.set('foo')
    self.initViews()
def test_untrainedGuess(self):
    """
    The C{guess} method of a L{Bayes} instance with no training data
    returns an empty list.
    """
    bayes = Bayes()
    self.assertEquals(bayes.guess("hello, world"), [])
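A companion case, sketched here as an assumption rather than taken from the original suite, would exercise the trained path of the same API: after train(), guess() should rank the trained category first.

def test_trainedGuess(self):
    """
    Hypothetical companion test (not from the original suite): once a
    L{Bayes} instance has been trained, C{guess} returns the trained
    category as its top-ranked result.
    """
    bayes = Bayes()
    bayes.train("greeting", "hello, world")
    guesses = bayes.guess("hello, world")
    self.assertEquals(guesses[0][0], "greeting")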
def main():
    """
    Build aggregator report pages with Bayes rating links.
    """
    # Create a new Bayes guesser
    guesser = Bayes()

    # Attempt to load Bayes data, ignoring IOError on first run.
    try:
        guesser.load(BAYES_DATA_FN)
    except IOError:
        pass

    # Open up the databases, load the subscriptions, get new entries.
    feed_db, entry_db = openDBs(FEED_DB_FN, ENTRY_DB_FN)
    feeds = [x.strip() for x in open(FEEDS_FN, "r").readlines()]
    entries = getNewFeedEntries(feeds, feed_db, entry_db)

    # Score the new entries using the Bayesian guesser
    entries = scoreEntries(guesser, entries)

    # Write out the current run's aggregator report.
    out_fn = time.strftime(HTML_FN)
    writeAggregatorPage(entries, out_fn, DATE_HDR_TMPL, FEED_HDR_TMPL,
                        ENTRY_TMPL, PAGE_TMPL)

    # Close the databases and save the current guesser's state to disk.
    closeDBs(feed_db, entry_db)
    guesser.save(BAYES_DATA_FN)
def __init__(self, non_spam_train_dir, spam_train_dir):
    self.non_spam_train_dir = non_spam_train_dir
    self.spam_train_dir = spam_train_dir
    self.naive_bayes_classifier = Bayes()
    self.total_num_train_files = 0
    self.total_num_test_files = 0
    self.num_misclass = 0
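The two directories stored above are presumably walked later and fed to the classifier. A minimal sketch of that training step, assuming plain-text mail files; the helper name and loop are illustrative and not part of the original class:

import os

def train_from_dir(guesser, directory, label):
    # Illustrative helper: train the classifier on every file in `directory`
    # under the given label (e.g. 'spam' or 'non_spam').
    for name in os.listdir(directory):
        path = os.path.join(directory, name)
        if os.path.isfile(path):
            with open(path) as handle:
                guesser.train(label, handle.read())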
def getCategoryGuesses(self, corpus1, corpus2, corpus3):
    from reverend.thomas import Bayes
    # instantiate guesser
    guesser = Bayes()
    # train the category guesser on the first two corpora
    guesser.train('first reference text', corpus1)
    guesser.train('second reference text', corpus2)
    # classify the third corpus against the trained categories
    guesses = guesser.guess(corpus3)
    return guesses
def get_db(private_path, username):
    path = os.path.join(os.path.join(private_path, username), 'spam.bayes')
    guesser = Bayes()
    # load the spam DB
    try:
        guesser.load(path)
    except IOError:
        print "Creating a new spam filter database"
        parent_directory = os.path.dirname(path)
        if not os.path.isdir(parent_directory):
            os.makedirs(parent_directory)
        guesser.save(path)
    return guesser, path
def trained(self, cr, uid, ids, context=None):
    for id in ids:
        record = self.read(cr, uid, id, ['category_id', 'description'])
        if not record['description']:
            raise osv.except_osv(_('Error!'), _("Description Not Defined!"))
        if not record['category_id']:
            raise osv.except_osv(_('Error!'), _("Statistics Category Not Defined!"))
        group_obj = self.pool.get('crm.bayes.group')
        cat_obj = self.pool.get('crm.bayes.categories')
        cat_rec = cat_obj.read(cr, uid, record['category_id'][0], [])
        guesser = Bayes()
        data = ""
        for rec in group_obj.browse(cr, uid, [cat_rec['group_id'][0]]):
            if rec['train_data']:
                data += rec['train_data']
        if data:
            myfile = file(file_path + "crm_bayes.bay", 'w')
            myfile.write(data)
            myfile.close()
            guesser.load(file_path + "crm_bayes.bay")
        guesser.train(cat_rec['name'], record['description'])
        guesser.save(file_path + "crm_bayes.bay")
        myfile = file(file_path + "crm_bayes.bay", 'r')
        data = ""
        for fi in myfile.readlines():
            data += fi
        cat_obj.write(cr, uid, record['category_id'][0],
                      {'train_messages': int(cat_rec['train_messages']) + 1})
        cr.execute(
            "select sum(train_messages) as tot_train, sum(guess_messages) as tot_guess "
            "from crm_bayes_categories where group_id=%d" % cat_rec['group_id'][0])
        rec = cr.dictfetchall()
        if not rec[0]['tot_guess']:
            rec[0]['tot_guess'] = 0
        percentage = float(rec[0]['tot_guess'] * 100) / float(rec[0]['tot_guess'] + rec[0]['tot_train'])
        group_obj.write(cr, uid, cat_rec['group_id'][0],
                        {'train_data': data, 'automate_test': percentage})
        self.write(cr, uid, id, {'state_bayes': 'trained'})
    return True
def getLanguageGuesses(self, stopWords, corpus, languages):
    from reverend.thomas import Bayes
    # charset
    charset = 'us-ascii'
    # instantiate guesser
    guesser = Bayes()
    # train the guesser on the stop words of each candidate language
    for selectLanguage in languages:
        if selectLanguage != 'automatic':
            stopWordString = stopWords.getStopWordString(selectLanguage)
            guesser.train(selectLanguage, stopWordString.encode(charset, 'replace'))
    # get list of possible languages
    languageGuesses = guesser.guess(corpus.encode(charset, 'replace'))
    return languageGuesses
def action_guess(self, cr, uid, ids, context=None):
    guesser = Bayes()
    group_obj = self.pool.get('crm.bayes.group')
    # 'result' is a list defined at module level, outside this snippet.
    if result:
        for res in range(0, len(result)):
            result.pop(0)
    data = ""
    for rec in group_obj.browse(cr, uid, context['active_ids']):
        if rec['train_data']:
            data += rec['train_data']
    result_lang = []
    if data:
        myfile = file("/tmp/crm_bayes.bay", 'w')
        myfile.write(data)
        myfile.close()
        guesser.load('/tmp/crm_bayes.bay')
        message = self.read(cr, uid, ids, ['name'])
        result_lang = guesser.guess(message[0]['name'])
    cat_obj = self.pool.get('crm.bayes.categories')
    cat_id = cat_obj.search(cr, uid, [])
    for re in cat_obj.read(cr, uid, cat_id, ['name']):
        flag = False
        for r in result_lang:
            if r[0] == re['name']:
                result.append(r)
                flag = True
                break
        if not flag:
            result.append((re['name'], 0))
    context_new = {}
    context_new.update({'from_wiz': True})
    context_new.update({'group_id': context.get('active_id', False)})
    return {
        'context': context_new,
        'view_type': 'form',
        "view_mode": 'form',
        'res_model': 'crm.bayes.test.train',
        'type': 'ir.actions.act_window',
        'target': 'new',
    }
def guess_message(self, cr, uid, ids, context={}):
    cases = self.browse(cr, uid, ids)
    result_lang = []
    if cases.description:
        guesser = Bayes()
        group_obj = self.pool.get('crm.bayes.group')
        data = ""
        for rec in group_obj.browse(cr, uid,
                                    group_obj.search(cr, uid, [('active', '=', True)])):
            if rec['train_data']:
                data += rec['train_data']
        if data:
            myfile = file("/tmp/crm_bayes.bay", 'w')
            myfile.write(data)
            myfile.close()
            guesser.load('/tmp/crm_bayes.bay')
            result_lang = guesser.guess(cases.description)
    guess_re = []
    for le in result_lang:
        guess_re.append((le[0], le[1] * 100))
    return guess_re
def main():
    """
    Perform a test run of the FeedFilter using defaults.
    """
    # Create a new Bayes guesser, attempt to load data
    guesser = Bayes()
    guesser.load(BAYES_DATA_FN)

    # Open up the databases, load the subscriptions, get new entries.
    feed_db, entry_db = openDBs(FEED_DB_FN, ENTRY_DB_FN)
    feeds = [x.strip() for x in open(FEEDS_FN, "r").readlines()]
    entries = getNewFeedEntries(feeds, feed_db, entry_db)

    # Build the feed filter.
    f = BayesFilter(guesser, entries)
    f.FEED_META['feed.title'] = FEED_TITLE
    f.FEED_META['feed.tagline'] = FEED_TAGLINE

    # Output the feed as both RSS and Atom.
    open(FEED_NAME_FN % 'rss', 'w').write(f.scrape_rss())
    open(FEED_NAME_FN % 'atom', 'w').write(f.scrape_atom())

    # Close the databases and save the current guesser's state to disk.
    closeDBs(feed_db, entry_db)
'Goal: Build a language recognizer using a naive bayesian classifier'

# Make a 50 language recognizer trained on 10 books per language at:
#   http://www.gutenberg.org/browse/languages/en
#   http://www.gutenberg.org/files/1342/1342-0.txt

from reverend.thomas import Bayes

# Train the classifier
language_sniffer = Bayes()
for lang in ['en', 'es', 'fr', 'de', 'it']:
    filename = 'notes/proverbs_%s.txt' % lang
    with open(filename) as f:
        data = f.read().decode('utf-8')
        language_sniffer.train(lang, data)

# Apply the classifier
phrases = u'''\
All the leaves are brown and the sky is gray.
I've been for a walk on a winter's day.
De colores, todos los colores.
De colores se visten los campos en la primavera.
Jingle bells, jingle all the way.
Oh what fun it is to ride in a one horse open sleigh.
Casca belles, hoy es navidad.
Es un dia, de allegria y felicidad.
'''.splitlines()

for phrase in phrases:
    best_guess = language_sniffer.guess(phrase)[0][0]
    print best_guess, '<--', phrase[:30]
""" pip install reverend pip install sets Source Code :https://laslabs.github.io/python-reverend/_modules/reverend/thomas.html Overview of Bayes Rule: https://towardsdatascience.com/bayes-rule-with-a-simple-and-practical-example-2bce3d0f4ad0 """ from reverend.thomas import Bayes g = Bayes() # guesser g.train('french','La souris est rentre dans son trou.') g.train('english','my tailor is rich.') g.train('french','Je ne sais pas si je viendrai demain.') g.train('english','I do not plan to update my website soon and I would really like some help from the rest of you idiots.') print(g.guess('Jumping out of cliffs it not a good idea.')) # print(g.guess('Demain il fera trs probablement chaud.'))
====== RESTART: /Users/raymond/Dropbox/Public/army2/decorator_school.py ======
>>>
>>> y = big_func(10)
Doing hard work
INFO:root:Called big_func() with (10,) giving 11 in 1.074376 seconds
>>> y = big_func(20)
Doing hard work
INFO:root:Called big_func() with (20,) giving 21 in 1.100503 seconds
>>> show_cache(big_func)
{10: 11, 20: 21}
SyntaxError: invalid syntax
>>>
>>>
>>> from reverend.thomas import Bayes
>>> gender = Bayes()
>>> gender.train('male', 'bill hank chris mark martin pat adam hank chris zack sean')
>>> gender.train('female', 'mindy shelly pat mary daisy amber chris pat becky sue')
>>> gender.guess('hank')
[('male', 0.9999)]
>>> gender.guess('mindy')
[('female', 0.9999)]
>>> gender.guess('pat')
[('female', 0.6451612903225806), ('male', 0.35483870967741926)]
>>> gender.guess('chris')
[('male', 0.6875000000000001), ('female', 0.3125)]
>>> gender.train('male', 'red red orange yellow red orange blue black brown blue red yellow')
>>> gender.train('female', 'pink red green green blue blue chartreuse green blue yellow orange blue green')
>>> gender.guess('red')
[('male', 0.8), ('female', 0.19999999999999996)]
>>> gender.guess('pink')
def __init__(self):
    self.guesser = Bayes()
from reverend.thomas import Bayes

guesser = Bayes()
guesser.train('french', 'le la les du un une je il elle de en')
guesser.train('german', 'der die das ein eine')
guesser.train('spanish', 'el uno una las de la en')
guesser.train('english', 'the it she he they them are were to')

guesser.guess('they went to el cantina')
guesser.guess('they were flying planes')

guesser.train('english', 'the rain in spain falls mainly on the plain')
guesser.save('my_guesser.bay')
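A saved guesser can be restored in a later session with the matching load() call used throughout these examples; a minimal sketch (the test phrase is arbitrary):

from reverend.thomas import Bayes

guesser = Bayes()
guesser.load('my_guesser.bay')               # restore the state saved above
print(guesser.guess('la pluie en espagne'))  # arbitrary test phrase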
def action_train(self, cr, uid, ids, context=None):
    cat_obj = self.pool.get('crm.bayes.categories')
    group_obj = self.pool.get('crm.bayes.group')
    message_obj = self.pool.get('crm.bayes.test.guess')
    for id in ids:
        cat_id = self.read(cr, uid, id, ['category_id', 'name'])
        cat_id = cat_id[0]['category_id']
        # 'result' is a list defined at module level, outside this snippet.
        if result:
            max_list = max(result, key=lambda k: k[1])
        if cat_id:
            cat_guess_msg = cat_obj.read(cr, uid, cat_id, ['train_messages'])
            cat_obj.write(cr, uid, cat_id,
                          {'train_messages': cat_guess_msg['train_messages'] + 1})
        if max_list[1] > 0 and not cat_id:
            cat_id = cat_obj.search(cr, uid, [('name', '=', max_list[0])])[0]
            cat_guess_msg = cat_obj.read(cr, uid, cat_id, ['guess_messages'])
            cat_obj.write(cr, uid, cat_id,
                          {'guess_messages': cat_guess_msg['guess_messages'] + 1})
            self.write(cr, uid, ids, {'category_id': cat_id})
        if cat_id:
            cat_rec = cat_obj.read(cr, uid, cat_id, [])
            guesser = Bayes()
            data = ""
            for rec in group_obj.browse(cr, uid, [cat_rec['group_id'][0]]):
                if rec['train_data']:
                    data += rec['train_data']
            if data:
                myfile = file(file_path + "crm_bayes.bay", 'w')
                myfile.write(data)
                myfile.close()
                guesser.load(file_path + "crm_bayes.bay")
            guesser.train(cat_rec['name'], message_obj.read(cr, uid, id)[0]['name'])
            guesser.save(file_path + "crm_bayes.bay")
            myfile = file(file_path + "crm_bayes.bay", 'r')
            data = ""
            for fi in myfile.readlines():
                data += fi
            cr.execute(
                "select sum(train_messages) as tot_train, sum(guess_messages) as tot_guess "
                "from crm_bayes_categories where group_id=%d" % cat_rec['group_id'][0])
            rec = cr.dictfetchall()
            if not rec[0]['tot_guess']:
                rec[0]['tot_guess'] = 0
            percentage = float(rec[0]['tot_guess'] * 100) / float(rec[0]['tot_guess'] + rec[0]['tot_train'])
            group_obj.write(cr, uid, cat_rec['group_id'][0],
                            {'train_data': data, 'automate_test': percentage})
        else:
            raise osv.except_osv(_('Error !'), _('Please Select Category!'))
    return {
        'view_type': 'form',
        "view_mode": 'form',
        'res_model': 'crm.bayes.train.message',
        'type': 'ir.actions.act_window',
        'target': 'new',
    }
def run(corpus, verbose=False, hkap_file=os.path.join(software, 'libs/PACManData.bay'),
        train=False, authors=False, exact_names=False, first_only=False, nyears=10,
        plotit=False, hst=False, clobber=False, rs_exceptions=''):
    f = open(os.path.join(software, 'category_synonyms.txt'), 'r')
    lines = f.readlines()
    f.close()
    acronyms = {}
    for line in lines:
        if line.startswith('#'):
            continue
        key, value = line.split('=')
        acronyms[key.strip()] = value.strip().split(',')
    uber_categories = acronyms
    stopwords = load_stopwords()

    dguesser = Bayes()
    dguesser.load(hkap_file)

    if not authors:
        if hst:
            ## Below, proposals are retrieved, then parsed.
            abs = parse_abstracts_proposals(corpus)
            text = parse_science_justification_proposals(corpus)
            justification = abs + text
            bayesString = " " + justification
        else:
            f = open(corpus)
            lines = f.readlines()
            f.close()
            text = ''
            for line in lines:
                if line.startswith('#'):
                    continue
                if not line.strip():
                    continue
                text += line.strip() + ' '
            bayesString = text
        bayesString = work_string(bayesString, stopwords)
        result = dguesser.guess(bayesString)
        result = normalize_result(result)
    else:
        ## assumes input is a person report
        ## if .pkl report not available, creates new one
        import util
        records = []
        results_dict = {}
        results_pkl = corpus.replace(corpus.split('.')[-1], 'pkl')
        if not os.path.isfile(results_pkl) or clobber:
            f = open(corpus)
            lines = f.readlines()
            f.close()
            for line in lines:
                if line.startswith('#'):
                    continue
                if not line.strip():
                    continue
                info = line.rstrip().split("\t")
                if info[0] == '':
                    continue
                # records.append(info[0].replace(' ','').replace('"','').replace("'",'').lower())
                records.append(info[0].replace('"', '').replace("'", '').lower())
            author_dict, cite_dict = util.adscrawl.run_authors(
                records, nyears=nyears, rs_exceptions=rs_exceptions)
            ## author_dict, cite_dict = util.adscrawl.run_exact_authors(records, nyears=nyears)
            pickle.dump(author_dict, open(results_pkl, 'wb'))
            pickle.dump(cite_dict, open('cites.pkl', 'wb'))
        else:
            author_dict = pickle.load(open(results_pkl, 'rb'))
            cite_dict = pickle.load(open('cites.pkl', 'rb'))
        for author in author_dict.keys():
            bayesString = ''
            for abstract in author_dict[author]:
                bayesString += ' ' + abstract  # accumulate all abstracts for this author
            bayesString = work_string(bayesString, stopwords)
            result = dguesser.guess(bayesString)
            ## result = normalize_result(result)
            results_dict[author] = {}
            results_dict[author]['hkap'] = rec.fromrecords(result)
            try:
                results_dict[author]['cites'] = sorted(cite_dict[author], reverse=True)
            except:
                results_dict[author]['cites'] = [0]
        result = results_dict
    return (result, uber_categories)
def Bayes__getstate__(self):
    state = self.__dict__.copy()
    # by default, self.combiner is set to self.robinson
    state['combiner'] = None
    return state

def Bayes__setstate__(self, state):
    self.__dict__.update(state)
    # support the default combiner (an instance method):
    if 'combiner' in state and state['combiner'] is None:
        self.combiner = self.robinson

Bayes.__getstate__ = Bayes__getstate__
Bayes.__setstate__ = Bayes__setstate__

bayes = Bayes()

# Traverses all files and directories starting from a root directory
# Adds normalized files to trainingData dict
def getCorpus(path, classification):
    for root, subFolders, fileNames in os.walk(path):
        for fileName in fileNames:
            # Learn type of file - only want text files
            fileType = mimetypes.guess_type(fileName)
            if (fileType[1] is None and fileType[0] is None) or re.match(combinedMimeRegex, fileType[0]):
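With __getstate__ and __setstate__ patched in as above, a Bayes instance should survive a pickle round trip even though its default combiner is a bound method; a minimal sketch under that assumption (training strings are arbitrary):

import pickle

trained = Bayes()
trained.train('spam', 'cheap meds click now')
trained.train('ham', 'meeting notes attached agenda')

blob = pickle.dumps(trained)   # __getstate__ drops the unpicklable combiner
restored = pickle.loads(blob)  # __setstate__ re-binds self.robinson
print(restored.guess('cheap meds'))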
)

neg_file = open(BASE_DIR + "/data/rt-polarity.neg").read()
pos_file = open(BASE_DIR + "/data/rt-polarity.pos").read()
neg_tweets_list = str(neg_file).split('\n')
pos_tweets_list = str(pos_file).split('\n')

neg_cutoff = int(len(neg_tweets_list) * 3 / 4)
pos_cutoff = int(len(pos_tweets_list) * 3 / 4)
neg_train = neg_tweets_list[:neg_cutoff]
pos_train = pos_tweets_list[:pos_cutoff]
neg_test = neg_tweets_list[neg_cutoff:]
pos_test = pos_tweets_list[pos_cutoff:]
tweet_data = {'neg_train': neg_train, 'pos_train': pos_train,
              'neg_test': neg_test, 'pos_test': pos_test}

bestwords = get_best_words(pos_train, neg_train)

single_classifier = Bayes()
single_classifier.load(fname=BASE_DIR + "/data/rt_polarity_classifiers/single_classifier.dat")
non_stop_classifier = Bayes(tokenizer=non_stop_tokenizer())
non_stop_classifier.load(fname=BASE_DIR + "/data/rt_polarity_classifiers/single_stop_classifier.dat")
best_classifier = Bayes(tokenizer=best_tokenizer(best_words=bestwords))
best_classifier.load(fname=BASE_DIR + "/data/rt_polarity_classifiers/single_best_classifier.dat")
bigram_best_classifier = Bayes(tokenizer=best_bigram_tokenizer(best_words=bestwords))
bigram_best_classifier.load(fname=BASE_DIR + "/data/rt_polarity_classifiers/single_bi_classifier.dat")
# nthcolumn
import mechanize
import cookielib
import time
import os, sys, requests, pattern, json, tweepy
import numpy as np
from random import randint
from pattern.en import sentiment
from bs4 import BeautifulSoup
from six.moves.html_parser import HTMLParser
h = HTMLParser()

from reverend.thomas import Bayes
ai = Bayes()

from hackernews import HackerNews
hn = HackerNews()

with open('./Documents/tattle/config.json') as data_file:
    settings = json.load(data_file)

consumer_key = settings['twitter']['consumer_key']
consumer_secret = settings['twitter']['consumer_secret']
access_key = settings['twitter']['access_token_key']
access_secret = settings['twitter']['access_token_secret']

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
twitter = tweepy.API(auth)