class NBClassifier:
    """Naive Bayes text classifier whose training set is kept in a JSON file.

    The file is expected to hold a JSON array of
    ``{"text": ..., "label": ...}`` objects. It is handed to
    ``NaiveBayesClassifier`` at construction time and rewritten after every
    call to :meth:`update_train_set`.
    """

    def __init__(self, train_data_file):
        """Train the classifier from ``train_data_file``.

        Args:
            train_data_file: Path to an existing JSON training file.
        """
        self._train_data_file = train_data_file
        # The context manager closes the handle even when training raises.
        with open(self._train_data_file, 'r') as f:
            self._cl = NaiveBayesClassifier(f, format="json")

    def update_train_set(self, sentence):
        """Add one labelled sentence to the classifier and persist the result.

        Args:
            sentence: Object exposing ``str_sentence`` (the text) and
                ``label`` attributes.
        """
        new_data = [(sentence.str_sentence, sentence.label)]
        self._cl.update(new_data)
        self._save_data_to_file()

    def _save_data_to_file(self):
        """Rewrite the training file from the classifier's current train set.

        Records are serialised with ``json.dumps`` so quotes, backslashes and
        newlines inside the text are escaped, and the file is opened in
        ``'w'`` mode so it is truncated and no stale bytes remain after the
        closing bracket.
        """
        import json

        records = [
            json.dumps({"text": str(el[0]), "label": str(el[1])})
            for el in self._cl.train_set
        ]
        # One record per line, matching the layout of the original writer.
        with open(self._train_data_file, 'w') as f:
            f.write("[" + ",\n".join(records) + "]")

    def prob_classify(self, sentence):
        """Return the most probable label for ``sentence``."""
        return self._cl.prob_classify(sentence).max()
class Model(object):
    """Thin wrapper around a ``NaiveBayesClassifier`` trained from dicts."""

    def __init__(self, name='Guess', config=None):
        """Create an untrained model.

        Args:
            name: Display name of the model.
            config: Optional configuration mapping. A fresh dict is created
                when omitted, so instances never share one default object.
        """
        self.name = name
        self.config = {} if config is None else config
        self.clf = NaiveBayesClassifier([])

    def train(self, training_data):
        """Update the classifier with labelled examples.

        Args:
            training_data: Iterable of mappings with ``'text'`` and
                ``'label'`` keys; a missing key yields ``None``.
        """
        safe_training = [
            (example.get('text'), example.get('label'))
            for example in training_data
        ]
        self.clf.update(safe_training)

    def evaluate(self, text):
        """Classify ``text``.

        Returns:
            tuple: ``(label, probability)`` for the most probable label.
        """
        # One pass: the winning label is read from the distribution itself
        # instead of running a separate classify() over the same text.
        prob_dist = self.clf.prob_classify(text)
        label = prob_dist.max()
        return label, prob_dist.prob(label)

    def get_classes(self):
        """Return the labels reported by the classifier."""
        return self.clf.labels()

    def save(self):
        """Persistence hook; not implemented."""
        pass

    def load(self):
        """Persistence hook; not implemented."""
        pass
def chunk(data, mode, classificationS, chunk_size=50, pause=2):
    '''
    Train or test a Naive Bayes classifier on ``data`` in fixed-size slices.

    Data is fed in slowly, slice by slice, to prevent memory errors.

    Parameters:
        data:
            Type: sliceable sequence (array / dataframe) of labelled items
            supporting ``len()``.
        mode:
            Type: String "train" or String "test"
            Determines whether we are training a classifier or testing the
            accuracy of one.
        classificationS:
            Type: classifier or None
            Classifier to evaluate in "test" mode. Ignored in "train" mode,
            which always builds a new classifier from the first slice.
        chunk_size:
            Type: int (default 50)
            Number of items per slice.
        pause:
            Type: number (default 2)
            Seconds to sleep after each slice; 0 disables the sleep.

    Returns:
        "train": the trained classifier.
        "test":  list with one accuracy value per non-empty slice.
        Any other mode: None.
    '''
    length = len(data)
    classifier = classificationS

    if mode == "train":
        # The first slice creates the classifier; the rest update it.
        classifier = NaiveBayesClassifier(data[0:chunk_size])
        # range() stops before `length`, so no empty trailing slice is fed
        # to update(), which would only retrain for nothing.
        for start in range(chunk_size, length, chunk_size):
            classifier.update(data[start:start + chunk_size])
            if pause:
                time.sleep(pause)
        return classifier
    elif mode == 'test':
        listOfAccs = []
        for start in range(0, length, chunk_size):
            # Slicing clamps at the end of the data, so the last slice may
            # be shorter than chunk_size but is never empty.
            listOfAccs.append(classifier.accuracy(data[start:start + chunk_size]))
            if pause:
                time.sleep(pause)
        return listOfAccs
class TestValidators(TestCase):
    """Tests for ClassifierValidator, run against a small in-memory classifier."""

    def setUp(self):
        """Build a two-example classifier and patch it into the validator."""
        self.data = StringIO('{}')
        self.classifier = NaiveBayesClassifier(self.data, format='json')
        self.classifier.update([
            ('spam spam spam', 'spam'),
            ('this is not spam', 'valid'),
        ])
        self.mock_classifier_get = mock.patch.object(
            ClassifierValidator, 'get_classifier',
            mock.Mock(return_value=self.classifier)
        )
        self.patch_classifier_get = self.mock_classifier_get.start()
        # Undo the patch after every test so it cannot leak into others.
        # stopall() only touches patches that are still active, so it stays
        # safe when a test has already stopped the patch itself.
        self.addCleanup(mock.patch.stopall)

    def test_validator_pass(self):
        validate = ClassifierValidator()
        validate('this is totally legit')

    def test_validator_invalid(self):
        validate = ClassifierValidator()
        with self.assertRaises(ValidationError):
            validate('spam spammy spam')

    def test_validator_invalid_different_exception(self):
        validate = ClassifierValidator(raises=ValueError)
        with self.assertRaises(ValueError):
            validate('spam spammy spam')

    @mock.patch('textclassifier.classifier.TEXTCLASSIFIER_DATA_FILE', '')
    def test_open_file_failure(self):
        """Open file, but still validate after errors"""
        # This test needs the real get_classifier, so drop the setUp patch.
        self.mock_classifier_get.stop()
        # The builtins module is named differently on Python 2 and 3.
        mod_name = ('builtins', '__builtin__')[(sys.version_info < (3,0))]
        with mock.patch('{0}.open'.format(mod_name)) as mocked_open:
            mocked_open.side_effect = IOError
            with self.assertRaises(IOError):
                DefaultClassifier()
            validate = ClassifierValidator()
            validate('spam spam spam')
class TBSentiment(Model):
    """Sentiment model backed by a TextBlob ``NaiveBayesClassifier``.

    Training and classification use ``(comment, polarity)`` tuples.
    """

    def __init__(self):
        # Start from an empty classifier; train() feeds it later.
        self.cl = NaiveBayesClassifier([])

    def classify(self, comment):
        """Return ``(label, confidence)`` for the most probable polarity.

        Args:
            comment: Text to classify.
        """
        distribution = self.cl.prob_classify(comment)
        best_label = distribution.max()
        return best_label, distribution.prob(best_label)

    def train(self, data, eval=None, d_print=False):
        """Update the classifier with custom data.

        Args:
            data (:obj:`list` of :obj:`tuple`): Tuples of the form
                (comment, polarity in ["pos", "neg"]).
            eval: Unused by this implementation.
            d_print: Unused by this implementation.
        """
        self.cl.update(data)

    def test(self, data):
        """Placeholder for evaluation on custom data.

        Args:
            data (:obj:`list` of :obj:`tuple`): Tuples of the form
                (comment, polarity in ["pos", "neg"]).

        Returns:
            None. Evaluation is not implemented.
        """
        return None
class TestNaiveBayesClassifier(unittest.TestCase):
    """Behavioural tests for NaiveBayesClassifier built from lists and file paths."""

    def setUp(self):
        self.classifier = NaiveBayesClassifier(train_set)

    def _assert_pos_classifier_with_unicode_text(self, cl):
        """Shared checks for classifiers loaded from the bundled data files."""
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        first_text = cl.train_set[0][0]
        assert_true(isinstance(first_text, unicode))

    def test_default_extractor(self):
        sentence = "I feel happy this morning."
        expected = basic_extractor(sentence, train_set)
        assert_equal(self.classifier.extract_features(sentence), expected)

    def test_classify(self):
        label = self.classifier.classify("I feel happy this morning")
        assert_equal(label, 'positive')
        assert_equal(len(self.classifier.train_set), len(train_set))

    def test_classify_a_list_of_words(self):
        words = ["I", "feel", "happy", "this", "morning"]
        assert_equal(self.classifier.classify(words), "positive")

    def test_train_from_lists_of_words(self):
        # Training accepts pre-tokenised documents as well as plain strings.
        tokenised = [(doc.split(), label) for doc, label in train_set]
        from_words = NaiveBayesClassifier(tokenised)
        assert_equal(from_words.accuracy(test_set),
                     self.classifier.accuracy(test_set))

    def test_prob_classify(self):
        dist = self.classifier.prob_classify("I feel happy this morning")
        assert_equal(dist.max(), "positive")
        assert_true(dist.prob("positive") > dist.prob("negative"))

    def test_accuracy(self):
        assert_true(isinstance(self.classifier.accuracy(test_set), float))

    def test_update(self):
        before = self.classifier.prob_classify("lorem ipsum")
        size_before = len(self.classifier.train_set)
        self.classifier.update([("lorem ipsum", "positive")])
        size_after = len(self.classifier.train_set)
        after = self.classifier.prob_classify("lorem ipsum")
        assert_true(after.prob("positive") > before.prob("positive"))
        assert_equal(size_before + 1, size_after)

    def test_labels(self):
        known = self.classifier.labels()
        assert_true("positive" in known)
        assert_true("negative" in known)

    def test_show_informative_features(self):
        # Smoke test: the call only has to complete without raising.
        self.classifier.show_informative_features()

    def test_informative_features(self):
        top = self.classifier.informative_features(3)
        assert_true(isinstance(top, list))
        assert_true(isinstance(top[0], tuple))

    def test_custom_feature_extractor(self):
        custom = NaiveBayesClassifier(train_set, custom_extractor)
        custom.classify("Yay! I'm so happy it works.")
        assert_equal(custom.train_features[0][1], 'positive')

    def test_init_with_csv_file(self):
        self._assert_pos_classifier_with_unicode_text(
            NaiveBayesClassifier(CSV_FILE, format="csv"))

    def test_init_with_csv_file_without_format_specifier(self):
        self._assert_pos_classifier_with_unicode_text(
            NaiveBayesClassifier(CSV_FILE))

    def test_init_with_json_file(self):
        self._assert_pos_classifier_with_unicode_text(
            NaiveBayesClassifier(JSON_FILE, format="json"))

    def test_init_with_json_file_without_format_specifier(self):
        self._assert_pos_classifier_with_unicode_text(
            NaiveBayesClassifier(JSON_FILE))

    def test_accuracy_on_a_csv_file(self):
        score = self.classifier.accuracy(CSV_FILE)
        assert_true(isinstance(score, float))

    def test_accuracy_on_json_file(self):
        score = self.classifier.accuracy(JSON_FILE)
        assert_true(isinstance(score, float))

    def test_init_with_tsv_file(self):
        self._assert_pos_classifier_with_unicode_text(
            NaiveBayesClassifier(TSV_FILE))

    def test_init_with_bad_format_specifier(self):
        assert_raises(ValueError, NaiveBayesClassifier, CSV_FILE,
                      format='unknown')

    def test_repr(self):
        expected = "<NaiveBayesClassifier trained on {0} instances>".format(
            len(train_set))
        assert_equal(repr(self.classifier), expected)
# Classify a whole TextBlob, then each of its sentences, with the classifier `cl`.
blob = TextBlob("The beer is good. But the hangover is horrible.", classifier=cl)
print(blob.classify())
for s in blob.sentences:
    print(s)
    print(s.classify())

# evaluating classifiers
print(cl.accuracy(test))
print(cl.show_informative_features( 5))  # displaying a listing of the most informative features

# updating classifiers with new data
new_data = [('She is my best friend.', 'pos'), ("I'm happy to have a new friend.", 'pos'), ("Stay thirsty, my friend.", 'pos'), ("He ain't from around here.", 'neg')]
print(cl.update(new_data))
print(cl.accuracy(test))

# feature extractors
# creating a feature extractor that just uses the first and last words of a document as its features
def end_word_extractor(document):
    """Return a feature dict keyed on the first and last word of ``document``.

    The document is split on whitespace. An empty or whitespace-only
    document raises IndexError because there is no first token.
    """
    tokens = document.split()
    first_word, last_word = tokens[0], tokens[-1]
    feats = {}
    feats["first({0})".format(first_word)] = True
    feats["last({0})".format(last_word)] = False
    return feats

features = end_word_extractor("I feel happy")
class LogicProc:
    """Filters queued Twitter messages with a Bayesian classifier.

    Messages classified as good are posted to a Slack channel; every
    message is stored in the database together with its classification.
    Slack reactions are polled periodically and fed back into the
    classifier as extra training data.
    """

    def __init__(self, preclassified_file, channel, slack_token):
        """
        Train the classifier, connect to Slack and the DB, start the timers

        @param preclassified_file Path of a CSV file with pre-classified training data
        @param channel Name of the Slack channel to post to and read reactions from
        @param slack_token Token handed to the Slack interface
        """
        if not os.path.isfile(preclassified_file):
            # Only a hint for the operator: open() below still raises.
            print('"' + preclassified_file + '" does not exist!')

        with open(preclassified_file, 'r') as train_set:
            print('training from ' + preclassified_file)
            self.spam_classifier = NaiveBayesClassifier(train_set, format="csv")

        self.slack_client = slack_interface.SlackInterface(slack_token)
        self.message_queue = []
        self.last_message_ts = None
        self.channel = self.slack_client.get_channel_id(channel)
        if self.channel is None:
            print('Could not find channel ' + channel)

        self.db_interface = database_interface.DB()
        training = self.db_interface.get_training_data()
        self.spam_classifier.update(training)
        self.update_classifer_from_slack(self.channel)
        self.spam_classifier.show_informative_features()

        # Process the queue every 5 seconds, poll Slack reactions every 60.
        self.check_twitter_msgs = infinite_timer.InfiniteTimer(5.0, self.proc_messages)
        self.check_slack_msgs = infinite_timer.InfiniteTimer(
            60.0, self.update_classifer_from_slack, self.channel)
        self.check_twitter_msgs.start()
        self.check_slack_msgs.start()

    def add_new_message(self, msg, source):
        """
        Callback from Twitter when there is a new message

        @param msg The Twitter message, with all its attributes
        @param source Where the message came from. Right now should only be 'twitter'
        """
        self.message_queue.append({'source': source, 'message': msg})

    def proc_messages(self):
        """
        Classify, post and store every message currently in the queue

        Iterates over a snapshot of the queue: removing entries from the
        list being iterated would skip every other message.
        """
        for msg in list(self.message_queue):
            if msg['source'] == 'twitter':
                message = msg['message']
                # %-formatting keeps the Python 2 output of the original
                # concatenation and is also valid on Python 3.
                if self.quality_filter(message.text):
                    print('GOOD: %s' % message.text.encode('utf-8'))
                    self.post_to_slack(message, self.channel)
                    self.store_message(message.text, True)
                else:
                    print('BAD: %s' % message.text.encode('utf-8'))
                    self.store_message(message.text, False)
            # Removed only after handling, so a message whose processing
            # raised stays queued for the next run.
            self.message_queue.remove(msg)

    def run_loop(self):
        """
        Block forever, sleeping one second per iteration

        The loop does no work itself; message processing is driven by the
        timers started in __init__.
        """
        while True:
            # sleep between polling queue
            time.sleep(1)

    def quality_filter(self, message_text):
        """
        Return False when the classifier labels the text 'neg', True otherwise

        @param message_text Text of the message to check
        """
        # Filters not implemented yet:
        # -filter useless hashtag announcements "Prayers for Irma! Use #IrmaSoS"
        # -filter outside the geobounds
        # -filter duplicates
        # -bayesian filter
        result = self.spam_classifier.classify(message_text)
        return result != 'neg'

    def post_to_slack(self, msg, channel):
        """Post the text of msg to the given Slack channel"""
        self.slack_client.post_message(msg.text, channel)

    def update_classifer_from_slack(self, channel):
        """
        Turn Slack reactions into training data for the DB and the classifier

        @param channel Slack channel id to read reactions from
        """
        slack_msgs = self.slack_client.get_slack_reactions(channel, self.last_message_ts)
        if len(slack_msgs) > 0:
            # Remember the timestamp of the last message seen.
            self.last_message_ts = slack_msgs[-1]['ts']

        bayesian_update_data = []
        for m in slack_msgs:
            user_feedback = self.is_slack_reaction_pos(m['reactions'])
            if user_feedback is None:
                # No thumbs up/down on this message: nothing to learn.
                continue
            label = 'pos' if user_feedback else 'neg'
            bayesian_update_data.append((m['text'], label))

        # update for better results if we can
        if bayesian_update_data:
            print('updating db...')
            # update classification in DB
            self.db_interface.update(bayesian_update_data)
            # update classifier
            print('updating classifier...')
            self.spam_classifier.update(bayesian_update_data)
            print('done...')
            self.spam_classifier.show_informative_features()

    def is_slack_reaction_pos(self, reactions):
        """
        Interpret a list of Slack reactions

        @param reactions List of dicts, each with a 'name' key
        @return False at the first '-1', True at the first '+1', None if neither occurs
        """
        for t in reactions:
            name = t['name']
            if name == '-1':
                return False
            if name == '+1':
                return True
        return None

    def store_message(self, message, filter_classification, source='twitter'):
        """Store a message and its classification in the database"""
        self.db_interface.add(message, filter_classification, source)

    def bayesian_search(self, query):
        """Search for query and keep the results whose text is not spam"""
        # NOTE(review): self.api and self.is_spam are not defined anywhere in
        # this class as shown -- confirm they are provided elsewhere before
        # relying on this method.
        results = self.api.search(query)
        filtered_results = [r for r in results if self.is_spam(r.text) == 0]
        return filtered_results
# Classifying Text ( Call the classify(text) method to use the classifier.) test_check = cl.classify("This is an amazing library!") print test_check # You can get the label probability distribution with the prob_classify(text) method. prob_dist = cl.prob_classify("This one's a doozy.") print prob_dist.max() print round(prob_dist.prob("pos"), 2) print round(prob_dist.prob("neg"), 2) print prob_dist.prob("pos") print prob_dist.prob("neg") blob = TextBlob("The beer is good. But the hangover is horrible.", classifier=cl) print blob.classify() # Evaluating Classifiers (To compute the accuracy on our test set, use the accuracy(test_data) method.) print cl.accuracy(test) # Updating Classifiers with New Data (Use the update(new_data) method to update a classifier with new training data.) new_data = [('She is my best friend.', 'pos'), ("I'm happy to have a new friend.", 'pos'), ('Stay thirsty, my friend.', 'pos'), ("He ain't from around here.", 'neg')] #print new_data print cl.update(new_data) print cl.accuracy(test)
class IntentClassifier:
    """
    This intent classifier is a Python interface that uses
    NaiveBayesClassifier from textblob.

    It trains on the JSON files in the local 'data' folder; each file has a
    name, training phrases (list) and desired responses (list). Test data is
    read from 'test/test.json', which holds phrases and their intents.

    A trained classifier can be saved or loaded; both use pickling.
    """

    def __init__(self):
        """Constructor of the intent classifier"""
        self._responses = {}

    def __load_data(self):
        """Load data from the local 'data' folder that contains json data files for training"""
        print("loading training data...")
        training_data = []
        for path in glob('data/*.json'):
            print("loading", path)
            with open(path) as data_file:
                training_data.append(json.load(data_file))
        return training_data

    def __load_test(self):
        """Load data from the local 'test' folder that contains json data files for testing"""
        print("loading testing data...")
        with open('test/test.json') as test_file:
            return json.load(test_file)

    def __build_responses(self, intents):
        """Create dictionary of intent mapping each to lists of responses"""
        return {intent['name']: intent['responses'] for intent in intents}

    def __arrange_data(self, intents):
        """Convert a list of intents to a list of (utterance, intent name) tuples"""
        return [(utterance, intent['name'])
                for intent in intents
                for utterance in intent['userSays']]

    def __arrange_test(self, tests):
        """Convert a json to a list of tuples each containing phrase and intent"""
        return [(test['utterance'], test['intent']) for test in tests['tests']]

    def train(self, utterances=None):
        """
        Train a new Naive Bayes Classifier.

        When no utterances are given, the training data is loaded from the
        local data folder and the intent -> responses dictionary is rebuilt
        from it.

        Keyword Arguments:
        utterances -- list of (phrase, intent) tuples to train on (optional)
        """
        if not utterances:
            json_data = self.__load_data()
            self._responses = self.__build_responses(json_data)
            utterances = self.__arrange_data(json_data)
        self._cl = NaiveBayesClassifier(utterances)

    def update(self, utterances=None):
        """
        Update the existing classifier with more training data.

        When no utterances are given, the training data is loaded from the
        local data folder and its responses are merged into the existing
        intent -> responses dictionary.

        Keyword Arguments:
        utterances -- list of (phrase, intent) tuples to train on (optional)
        """
        if not utterances:
            json_data = self.__load_data()
            # Responses come from the loaded intents, not from `utterances`,
            # which is empty on this branch.
            self._responses = {
                **self._responses,
                **self.__build_responses(json_data)
            }
            utterances = self.__arrange_data(json_data)
        self._cl.update(utterances)

    def test(self):
        """Test the accuracy of the classifier"""
        data_set = self.__arrange_test(self.__load_test())
        return self._cl.accuracy(data_set)

    def classify(self, target):
        """Classify a text"""
        return self._cl.classify(target)

    def getProbability(self, target, intent):
        """Get probability of a phrase to an intent, rounded to 2 decimals"""
        guess = self._cl.prob_classify(target)
        return round(guess.prob(intent), 2)

    def response(self, target):
        """Get a random response according to the intent of the text"""
        responses = self._responses[self.classify(target)]
        return random.choice(responses)

    def addResponse(self, text, intent):
        """Add a response to the dictionary, creating the intent entry if needed"""
        self._responses.setdefault(intent, []).append(text)

    def addResponses(self, utterances):
        """Add list of tuples each containing response and intent to responses"""
        for utterance in utterances:
            self._responses.setdefault(utterance[1], []).append(utterance[0])

    def save(self, path):
        """Save the current trained classifier"""
        with open(path, "wb") as classifier_f:
            pickle.dump(self, classifier_f)

    @staticmethod
    def load(path):
        """Load a previously saved classifier from ``path``.

        Takes no ``self``; the staticmethod decorator makes it callable on
        both the class and an instance.
        """
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # load classifier files from a trusted source.
        with open(path, "rb") as classifier_f:
            classifier = pickle.load(classifier_f)
        return classifier
class BankClassify():
    """Interactively categorise bank transactions with a Naive Bayes classifier.

    Previously classified transactions are kept in a CSV file with the
    columns 'date', 'desc', 'amount' and 'cat', and are used to train the
    classifier that guesses the category of new transactions.
    """

    def __init__(self, data="AllData.csv"):
        """Load in the previous data (by default from AllData.csv) and initialise the classifier"""
        # Remembered so add_data() saves back to the file that was loaded.
        self._data_file = data
        if os.path.exists(data):
            self.prev_data = pd.read_csv(data)
        else:
            self.prev_data = pd.DataFrame(columns=['date', 'desc', 'amount', 'cat'])

        self.classifier = NaiveBayesClassifier(self._get_training(self.prev_data), self._extractor)

    def add_data(self, filename):
        """Add new data and interactively classify it.

        The combined data is written back to the CSV file given to __init__.

        Arguments:
         - filename: filename of Santander-format file
        """
        self.new_data = self._read_santander_file(filename)

        self._ask_with_guess(self.new_data)

        self.prev_data = pd.concat([self.prev_data, self.new_data])
        self.prev_data.to_csv(self._data_file, index=False)

    def _prep_for_analysis(self):
        """Prepare data for analysis in pandas, setting index types and subsetting"""
        self.prev_data = self._make_date_index(self.prev_data)

        self.prev_data['cat'] = self.prev_data['cat'].str.strip()

        self.inc = self.prev_data[self.prev_data.amount > 0]
        # Copy before modifying so the assignment does not target a view of
        # prev_data.
        self.out = self.prev_data[self.prev_data.amount < 0].copy()
        self.out['amount'] = self.out['amount'].abs()

        self.inc_noignore = self.inc[self.inc.cat != 'Ignore']
        self.inc_noexpignore = self.inc[(self.inc.cat != 'Ignore') & (self.inc.cat != 'Expenses')]

        self.out_noignore = self.out[self.out.cat != 'Ignore']
        self.out_noexpignore = self.out[(self.out.cat != 'Ignore') & (self.out.cat != 'Expenses')]

    def _read_categories(self):
        """Read list of categories from categories.txt, keyed by line number"""
        with open('categories.txt') as f:
            return {i: line.strip() for i, line in enumerate(f)}

    def _add_new_category(self, category):
        """Add a new category to categories.txt"""
        with open('categories.txt', 'a') as f:
            f.write('\n' + category)

    def _ask_with_guess(self, df):
        """Interactively guess categories for each transaction in df, asking each time if the guess is correct.

        Input at the prompt: blank accepts the guess, a number picks an
        existing category, any other text creates a new category, and 'q'
        stops early. Returns df with its 'cat' column filled in.
        """
        # Initialise colorama
        init()

        df['cat'] = ""

        categories = self._read_categories()

        for index, row in df.iterrows():
            # Generate the category numbers table from the list of categories
            cats_list = [[idnum, cat] for idnum, cat in categories.items()]
            cats_table = tabulate(cats_list)

            stripped_text = self._strip_numbers(row['desc'])

            # Guess a category using the classifier (only if there is data in the classifier)
            if len(self.classifier.train_set) > 1:
                guess = self.classifier.classify(stripped_text)
            else:
                guess = ""

            # Print list of categories
            print(chr(27) + "[2J")
            print(cats_table)
            print("\n\n")
            # Print transaction
            print("On: %s\t %.2f\n%s" % (row['date'], row['amount'], row['desc']))
            print(Fore.RED + Style.BRIGHT + "My guess is: " + str(guess) + Fore.RESET)

            input_value = input("> ")

            if input_value.lower() == 'q':
                # If the input was 'q' then quit
                return df
            if input_value == "":
                # If the input was blank then our guess was right!
                category = guess
            else:
                # Otherwise, our guess was wrong
                try:
                    # Try converting the input to an integer category number
                    # If it works then we've entered a category
                    category_number = int(input_value)
                    category = categories[category_number]
                except ValueError:
                    # Otherwise, we've entered a new category, so add it to the list of
                    # categories
                    category = input_value
                    self._add_new_category(category)
                    categories = self._read_categories()

            # Write correct answer (.loc replaces .ix, removed from pandas)
            df.loc[index, 'cat'] = category
            # Update classifier; an empty category (blank answer with no
            # guess available) is not a label worth learning.
            if category:
                self.classifier.update([(stripped_text, category)])

        return df

    def _make_date_index(self, df):
        """Make the index of df a Datetime index"""
        df.index = pd.DatetimeIndex(df.date.apply(dateutil.parser.parse, dayfirst=True))

        return df

    def _read_santander_file(self, filename):
        """Read a file in the plain text format that Santander provides downloads in.

        Returns a pd.DataFrame with columns of 'date', 'desc' and 'amount'."""
        with open(filename, errors='replace') as f:
            lines = f.readlines()

        dates = []
        descs = []
        amounts = []

        # The first four lines are skipped as header.
        for line in lines[4:]:
            # Drop non-ASCII characters
            line = "".join(i for i in line if ord(i) < 128)
            if line.strip() == '':
                continue

            # Lines look like "Key: value"; the value may contain ':' too.
            splitted = line.split(":")
            category = splitted[0]
            data = ":".join(splitted[1:])

            if category == 'Date':
                dates.append(data.strip())
            elif category == 'Description':
                descs.append(data.strip())
            elif category == 'Amount':
                just_numbers = re.sub(r"[^0-9\.-]", "", data)
                amounts.append(just_numbers.strip())

        df = pd.DataFrame({'date': dates, 'desc': descs, 'amount': amounts})

        df['amount'] = df.amount.astype(float)
        df['desc'] = df.desc.astype(str)
        df['date'] = df.date.astype(str)

        return df

    def _get_training(self, df):
        """Get training data for the classifier, consisting of tuples of (text, category)"""
        subset = df[df['cat'] != '']
        # iterrows() replaces the removed .ix accessor and is unaffected by
        # duplicate index labels.
        return [(self._strip_numbers(row['desc']), row['cat'])
                for _, row in subset.iterrows()]

    def _extractor(self, doc):
        """Extract tokens from a given string"""
        # TODO: Extend to extract words within words
        # For example, MUSICROOM should give MUSIC and ROOM
        tokens = self._split_by_multiple_delims(doc, [' ', '/'])

        return {token: True for token in tokens if token != ""}

    def _strip_numbers(self, s):
        """Strip numbers from the given string.

        Every character other than 'A'-'Z' and space is removed, so
        lowercase letters and punctuation are stripped as well.
        """
        return re.sub("[^A-Z ]", "", s)

    def _split_by_multiple_delims(self, string, delims):
        """Split the given string by the list of delimiters given"""
        # Escape each delimiter so characters such as '.' or '|' split
        # literally instead of acting as regex syntax.
        regexp = "|".join(re.escape(delim) for delim in delims)

        return re.split(regexp, string)
('Gary is a friend of mine.', 'pos'), ("I can't believe I'm doing this.", 'neg')] from textblob.classifiers import NaiveBayesClassifier cl = NaiveBayesClassifier(train) # classify method print(cl.classify("This is amazing!")) # probablity of class prob_dist = cl.prob_classify("This one's a doozy.") prob_dist.max() round(prob_dist.prob("pos"), 2) round(prob_dist.prob("neg"), 2) # classify text blob blob = TextBlob("I have good spelling!", classifier=cl) blob.classify() cl.accuracy(test) cl.show_informative_features(5) new_data = [('She is my best friend.', 'pos'), ("I'm happy to have a new friend.", 'pos'), ("Stay thirsty, my friend.", 'pos'), ("He ain't from around here.", 'neg')] cl.update(new_data) cl.accuracy(test)
] test = [ ('The beer was good.', 'pos'), ('I do not enjoy my job', 'neg'), ("I ain't feeling dandy today.", 'neg'), ("I feel amazing!", 'pos'), ('Gary is a friend of mine.', 'pos'), ("I can't believe I'm doing this.", 'neg') ] cl = NaiveBayesClassifier(train) reviews = [(list(movie_reviews.words(fileid)), category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)] random.shuffle(reviews) new_train, new_test = reviews[0:100], reviews[101:200] cl.update(new_train) @app.route('/', methods=['POST']) def home(): data = request.data dataDict = json.loads(data) return cl.classify(dataDict['text']) @app.route("/HEALTH") def health(): return "HEALTH" app.run()
("I can't deal with this", 'neg'), ('He is my sworn enemy!', 'neg'), ('My boss is horrible.', 'neg') ] test = [ ('The beer was good.', 'pos'), ('I do not enjoy my job', 'neg'), ("I ain't feeling dandy today.", 'neg'), ("I feel amazing!", 'pos'), ('Gary is a friend of mine.', 'pos'), ("I can't believe I'm doing this.", 'neg') ] cl = NaiveBayesClassifier(train) # Grab some movie review data reviews = [(list(movie_reviews.words(fileid)), category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)] random.shuffle(reviews) new_train, new_test = reviews[0:100], reviews[101:200] # Update the classifier with the new training data cl.update(new_train) # Compute accuracy accuracy = cl.accuracy(test + new_test) print("Accuracy: {0}".format(accuracy)) # Show 5 most informative features cl.show_informative_features(5)
class BankClassify():
    """Categorise bank transaction descriptions with a Naive Bayes classifier."""

    def __init__(self):
        """Initialise an empty training set and an untrained classifier"""
        self.training_set = []
        self.accuracy = 0
        self.classifier = NaiveBayesClassifier(self.training_set, self.extractor)

    def get_accuracy(self):
        """Return the accuracy computed by the most recent update"""
        return self.accuracy

    def category_classify(self, item):
        """Guess the category of item and add the guess to the training data.

        Returns the guessed category, or "" while the classifier holds at
        most one training example.
        """
        # Guess a category using the classifier (only if there is data in the classifier)
        if len(self.classifier.train_set) > 1:
            guess = self.classifier.classify(item.lower())
        else:
            guess = ""

        new_entry = [(item.lower(), guess)]
        self.classifier.update(new_entry)
        self.training_set = self.training_set + new_entry
        self.accuracy = self.classifier.accuracy(self.training_set)
        return guess

    def read_bank_file(self, filename):
        """Read a csv file with columns of 'desc' and 'category' and train on it.

        The first row is skipped as a header and blank rows are ignored.
        Returns True.
        """
        with open(filename, 'r') as csvfile:
            reader = csv.reader(csvfile)
            next(reader, None)  # skip the header row
            # csv.reader yields [] for blank lines; an empty tuple is not a
            # usable (text, category) example, so those rows are dropped.
            train = [tuple(line) for line in reader if line]

        self.training_set = self.training_set + train
        self.classifier.update(train)
        self.accuracy = self.classifier.accuracy(train)
        return True

    def update_training_set(self, new_data):
        """Update training data for the classifier, consisting of tuples of (text, category).

        Descriptions are passed through strip_numbers() and lowercased
        before training. Returns the full training set.
        """
        if len(new_data) > 0:
            train = [(self.strip_numbers(row[0]).lower(), row[1])
                     for row in new_data]
            self.training_set = self.training_set + train
            self.classifier.update(train)
            self.accuracy = self.classifier.accuracy(train)
        else:
            self.accuracy = 0
        return self.training_set

    def extractor(self, doc):
        """Extract tokens from a given string"""
        # TODO: Extend to extract words within words
        # For example, MUSICROOM should give MUSIC and ROOM
        tokens = self.split_by_multiple_delims(doc, [' ', '/'])

        return {token: True for token in tokens if token != ""}

    def strip_numbers(self, s):
        """Strip numbers from the given string.

        Every character other than 'A'-'Z' and space is removed, so
        lowercase letters and punctuation are stripped as well.
        """
        return re.sub("[^A-Z ]", "", s)

    def split_by_multiple_delims(self, string, delims):
        """Split the given string by the list of delimiters given"""
        # Escape each delimiter so characters such as '.' or '|' split
        # literally instead of acting as regex syntax.
        regexp = "|".join(re.escape(delim) for delim in delims)

        return re.split(regexp, string)
class TestNaiveBayesClassifier(unittest.TestCase):
    """Behavioural tests for NaiveBayesClassifier built from lists and open files."""

    def setUp(self):
        self.classifier = NaiveBayesClassifier(train_set)

    def test_default_extractor(self):
        text = "I feel happy this morning."
        assert_equal(self.classifier.extract_features(text), basic_extractor(text, train_set))

    def test_classify(self):
        res = self.classifier.classify("I feel happy this morning")
        assert_equal(res, 'positive')
        assert_equal(len(self.classifier.train_set), len(train_set))

    def test_classify_a_list_of_words(self):
        res = self.classifier.classify(["I", "feel", "happy", "this", "morning"])
        assert_equal(res, "positive")

    def test_train_from_lists_of_words(self):
        # classifier can be trained on lists of words instead of strings
        train = [(doc.split(), label) for doc, label in train_set]
        classifier = NaiveBayesClassifier(train)
        assert_equal(classifier.accuracy(test_set),
                     self.classifier.accuracy(test_set))

    def test_prob_classify(self):
        res = self.classifier.prob_classify("I feel happy this morning")
        assert_equal(res.max(), "positive")
        assert_true(res.prob("positive") > res.prob("negative"))

    def test_accuracy(self):
        acc = self.classifier.accuracy(test_set)
        assert_true(isinstance(acc, float))

    def test_update(self):
        res1 = self.classifier.prob_classify("lorem ipsum")
        original_length = len(self.classifier.train_set)
        self.classifier.update([("lorem ipsum", "positive")])
        new_length = len(self.classifier.train_set)
        res2 = self.classifier.prob_classify("lorem ipsum")
        assert_true(res2.prob("positive") > res1.prob("positive"))
        assert_equal(original_length + 1, new_length)

    def test_labels(self):
        labels = self.classifier.labels()
        assert_true("positive" in labels)
        assert_true("negative" in labels)

    def test_show_informative_features(self):
        # Smoke test: the call only has to complete without raising.
        self.classifier.show_informative_features()

    def test_informative_features(self):
        feats = self.classifier.informative_features(3)
        assert_true(isinstance(feats, list))
        assert_true(isinstance(feats[0], tuple))

    def test_custom_feature_extractor(self):
        cl = NaiveBayesClassifier(train_set, custom_extractor)
        cl.classify("Yay! I'm so happy it works.")
        assert_equal(cl.train_features[0][1], 'positive')

    def test_init_with_csv_file(self):
        with open(CSV_FILE) as fp:
            cl = NaiveBayesClassifier(fp, format="csv")
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_csv_file_without_format_specifier(self):
        with open(CSV_FILE) as fp:
            cl = NaiveBayesClassifier(fp)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_json_file(self):
        with open(JSON_FILE) as fp:
            cl = NaiveBayesClassifier(fp, format="json")
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_json_file_without_format_specifier(self):
        with open(JSON_FILE) as fp:
            cl = NaiveBayesClassifier(fp)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_custom_format(self):
        redis_train = [('I like turtles', 'pos'), ('I hate turtles', 'neg')]

        class MockRedisFormat(formats.BaseFormat):
            """Format stub that ignores its source and yields redis_train."""

            def __init__(self, client, port):
                self.client = client
                self.port = port

            @classmethod
            def detect(cls, stream):
                return True

            def to_iterable(self):
                return redis_train

        formats.register('redis', MockRedisFormat)
        mock_redis = mock.Mock()
        # Extra keyword arguments (port) are passed along with the format.
        cl = NaiveBayesClassifier(mock_redis, format='redis', port=1234)
        assert_equal(cl.train_set, redis_train)

    def test_data_with_no_available_format(self):
        mock_fp = mock.Mock()
        mock_fp.read.return_value = ''
        assert_raises(FormatError, lambda: NaiveBayesClassifier(mock_fp))

    def test_accuracy_on_a_csv_file(self):
        with open(CSV_FILE) as fp:
            a = self.classifier.accuracy(fp)
        assert_equal(type(a), float)

    def test_accuracy_on_json_file(self):
        # Opens the JSON fixture; the test previously opened CSV_FILE and so
        # only repeated the CSV accuracy test.
        with open(JSON_FILE) as fp:
            a = self.classifier.accuracy(fp)
        assert_equal(type(a), float)

    def test_init_with_tsv_file(self):
        with open(TSV_FILE) as fp:
            cl = NaiveBayesClassifier(fp)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_bad_format_specifier(self):
        assert_raises(ValueError,
                      lambda: NaiveBayesClassifier(CSV_FILE, format='unknown'))

    def test_repr(self):
        assert_equal(repr(self.classifier),
                     "<NaiveBayesClassifier trained on {0} instances>".format(len(train_set)))
# Incrementally feed slices of `training_data` into an existing `classifier`,
# pickling the classifier to disk as it goes.
print("Made new classifier")
del full_data

feeding_size = 1000  # number of items per update
left_splice = 11000  # NOTE(review): presumably the first 11000 items were used to build the classifier -- confirm
right_splice = feeding_size + left_splice
count = 0  # completed update cycles
new_start_time = time.time()
past_times = 0  # seconds spent inside classifier.update() so far

while right_splice < 1500000:
    loop_time = time.time()
    # NOTE(review): islice counts from the iterable's current position; if
    # training_data is a one-shot iterator rather than a sequence, successive
    # windows will not line up with these absolute offsets -- confirm its type.
    data = itertools.islice(training_data, left_splice, right_splice)
    try:
        classifier.update(data)
    except Exception:
        # Save what has been learned so far, then stop with the failing window.
        print("Houston we got a problem")
        with open("sentimentclassifier.pickle", "wb") as sentiment:
            pickle.dump(classifier, sentiment)
        sys.exit("Yo it ended at {} and {}".format(left_splice, right_splice))
    past_times += time.time() - loop_time
    count += 1
    # Single-line progress report, redrawn in place via the carriage return.
    string = "Left: {} Right: {}. Took {} seconds. Total Time Elapsed: {}. Average Time for each: {}. Count: {}."\
        .format(left_splice, right_splice, time.time()-loop_time, time.time() - new_start_time, past_times/count, count)
    sys.stdout.write('\r' + string)
    left_splice += feeding_size
    right_splice += feeding_size
    # Checkpoint the classifier after every cycle.
    with open("sentimentclassifier.pickle", "wb") as sentiment:
        pickle.dump(classifier, sentiment)
    print("Done dumping cycle {}!".format(count))
def final_utterance_appreciation_analysis(final_utterance):
    """Report the share of final utterances that express appreciation.

    Input: A list of final utterances by the user.
    Output: A sentence stating the percentage of people expressing
            appreciation at the end of the conversation, e.g.
            "50.0% people express appreciation.". The percentage is 0.0 when
            nothing was classified (empty input, or the classifier failed
            the validation check described below).

    Algorithm:
    1. Create a training set and a validation set of utterances which are
       manually classified into "appreciation" and "nonappreciation".
       The differentiation criterion is the presence of words of gratitude.
    2. Train the Naive Bayes classifier using the training set.
    3. If the accuracy of the classifier on the validation set is above 90%,
       fold the validation set into the training data and classify every
       entry of final_utterance.
    4. Use a dictionary during the loop to count the people who express
       gratitude and those who do not.
    5. Calculate the percentage of people who express gratitude.

    How the Naive Bayes classifier from the TextBlob package works:
    With the default feature extractor, every word seen in the training set
    becomes a boolean feature "contains(word)". Training estimates, for each
    label, how likely each feature is to be present. A new sentence is
    converted to the same features and the label probabilities follow from
    the 'naive' assumption that all features are independent given the
    label:

    |                       P(label) * P(f1|label) * ... * P(fn|label)
    |  P(label|features) = --------------------------------------------
    |                                         P(features)
    """
    # Nothing to classify: answer straight away instead of training a model
    # and then dividing by zero.
    if not final_utterance:
        return "{}% people express appreciation.".format(0.0)

    classified_dict = {"appreciation": 0, "non-appreciation": 0}

    # NOTE(review): a few labels below look inconsistent with the gratitude
    # criterion (e.g. 'Thank you very much.' -> nonappreciation,
    # 'Ok then leave from Beijing' -> appreciation). They are kept as-is;
    # confirm with whoever annotated the data.
    train = [
        ('Very well. How about the price for the trip to Essen?', 'nonappreciation'),
        ("I'd like to book the Cairo package. Thank you!", 'appreciation'),
        ('oh heck yeah!! economy - I need the money', 'nonappreciation'),
        ('Then I will take it!', 'nonappreciation'),
        ('Awesome!!! Thanks!!!', 'appreciation'),
        ('What??? :disappointed:', 'nonappreciation'),
        ('Yes do that', 'nonappreciation'),
        ('Thank you kindly!', 'appreciation'),
        ('Ok, thank you for your time anyways', 'appreciation'),
        ('thank you very much for your patience you are an absolute gem', 'appreciation'),
        ('Thank you so much!', 'appreciation'),
        ('Lots of swanky hotels to choose from! Well, based on length of trip, that one to SL sounds like a great deal. I think I wanna go ahead with booking that', 'nonappreciation'),
        ('Uh huh', 'nonappreciation'),
        ('Jerusalem to Kingston. I swear if I have to repeat myself again then I will sue', 'nonappreciation'),
        ('Ok, thanks anyway', 'appreciation'),
        ('Looking to go from San Francisco to MArseille. ', 'nonappreciation'),
        ('Book me for September 18 to 22. Let me know if its more than 2800 because thats all I can afford', 'nonappreciation'),
        ('duuuude. ah\nwhat about Ciudad Juarez', 'nonappreciation'),
        ('Well what if I leave the 8th', 'nonappreciation'),
        ('Ok :+1: we out', 'nonappreciation'),
        ('Yes!!!!!', 'nonappreciation'),
        ('ok fine lets do it, business class please', 'nonappreciation'),
        ('WOE IS ME, FOR I HAVE NOT', 'nonappreciation'),
        ('ah damn', 'nonappreciation'),
        ('okay bye', 'nonappreciation'),
        ('Yikes. Ok Buenos Aires it is\nBook it please\nBusiness class', 'nonappreciation'),
        ('shit yassss we goin in. Book it for us, please.', 'nonappreciation'),
        ('well, this is rather disappointing we cannot spend our family vacation near the airport. i wont be booking anything today in this case, goodbye', 'nonappreciation'),
        ('Thanks! Very excited!', 'appreciation'),
        ('NOT GOOD', 'nonappreciation'),
        ("you're a lifesaver", "appreciation"),
        ('ah. if i could book, i would book this one. well thanks for your time, ill come back next year and save my vacation days for a trip to San Diego.', "appreciation"),
        ('Great, thanks a lot!', "appreciation"),
        ("WHAT!?!?! Ugh, kill me now. Okkay fine. I'll look somewhere else.", "nonappreciation"),
        ("I guess that sound okay, I'll take it", "nonappreciation"),
        ("Ok, that's fine\nBook it", "nonappreciation"),
        ('I like the sound of that one. Heart of the city would be better than near a mall.\nLets book business class in Buenos Aires.', "nonappreciation"),
        ('cool bye', "nonappreciation"),
        ("let's book :wink:", "nonappreciation"),
        ('Done, booked! Thanks!', 'appreciation'),
        ('Okay will consider it and get back to you, thanks!', 'appreciation'),
        ('DOPE. book it', 'nonappreciation'),
        ('Hmm. Okay well im just gonna take the information you gave me and discuss it with my wife before booking something she might not enjoy. Thanks for the help!', 'appreciation'),
        ('Thanks! You were a great help!', 'appreciation'),
        ('i said 2.5 wasnt good enough', 'nonappreciation'),
        ('No thats the last straw, we are taking our business elsewhere', 'nonappreciation'),
        ('Thanks :slightly_smiling_face:', 'appreciation'),
        ('Hi Do you fly from Ulsan to London??', 'nonappreciation'),
        ('Ok then leave from Beijing', 'appreciation'),
        ('i need to get away from a little longer than that one. so lets book vancouver please and thanks', "appreciation"),
        ("Let's book Valencia. Pleasure doing business with you.", "appreciation"),
        ('Thank you bot.', "appreciation"),
        ('No worries, thanks!', "appreciation"),
        ("That sucks. I'll look somewhere else", "nonappreciation"),
        ('I am giving you one last time to you your job. you better tread carefully here, my friend,\nCairo to Porto Alegre or I will raise hell', "nonappreciation"),
        ('Bye. And thanks for nothing.', "nonappreciation"),
        ("Yes, I'll take it. Thank you", "nonappreciation"),
        ('no there are 7 of us', "nonappreciation"),
        ('for 712.00 it sounds like a very nice deal I will book flight on August 26 for 6 days. Thank you for your help.', 'appreciation'),
        ('3.5 it is then. lets book it', 'nonappreciation'),
        ('but fine, book it', 'nonappreciation'),
        ('no can do', "nonappreciation"),
        ('Thank you very much.', "nonappreciation"),
        ('gracias!', "appreciation"),
        ("Perfect! I'll book it", "nonappreciation"),
        ('Do you do flights leaving from Tel Aviv?', "nonappreciation"),
        ('that seem good, i will book! Gracias!', "appreciation"),
        ("No it's alright! thanks though!", "appreciation"),
        ('okay well its crucial i get there from Fortaleza so I will call someone else', "nonappreciation"),
        ('how is that possible', "nonappreciation"),
        ('Well what about in Goiania.?', 'nonappreciation'),
        ('ok no thats not good enough im going elsewhere', "nonappreciation"),
        ('amazing! thanks!', "appreciation"),
        ('Lets do Business class', "nonappreciation"),
        ("Oh Okay well i'll look somewhere else. Thanks anyway.", "appreciation"),
        ('you dont have any flights to birmingham yeah i find that pretty freakin hard to believe', "nonappreciation"),
        ('This is HORRIBLE', "nonappreciation"),
        ("yes, you're right.. thank you", "appreciation"),
        ('ok thanks so much', "appreciation"),
        ('what if i changed the dates. sept 2 and 23', "nonappreciation"),
        ('Thank you, but I will go use another service that can better satisfy my escapist fantasies', "appreciation"),
        ("I really want a spa. If you have nothing to offer with a spa, I'll shop around then.", 'nonappreciation'),
        ('Oh dear, thats quite above our 3 thousand dollar budget.', 'nonappreciation'),
        ('dope! thanks', 'appreciation'),
        ('No worries! Bye!', 'nonappreciation'),
        ('Ok Lets lock in San Diego', "nonappreciation"),
        ("You're great", 'appreciation'),
        # Label fixed: it used to read 'nonappreciation)' (stray parenthesis),
        # which made the classifier learn a third, bogus class.
        ('ok. book it out of Milan please', 'nonappreciation'),
        ('ill go for Ciudad Juarez', "nonappreciation"),
        ('Thank you wozbot!', "appreciation"),
        ('yes please', "nonappreciation"),
        ("Usually I wouldn't want to be caught dead in a 3.5 star hotel, but I'm short on time here. Get us on that trip, business class", "nonappreciation"),
        ('GREAT Thanks!!!!!!!!', "appreciation"),
        ("I think I'll stick to the 11 day package in Belem at Las Flores, seems like the best deal and it had a good user rating. Let's book that one.", "nonappreciation"),
        ('thnx', "appreciation"),
        ('no it HAS to be baltimore and it HAS to be perfect. thanks anyways', "appreciation"),
        ("Perfect! I'll book it", "nonappreciation"),
        ("That's it?", "nonappreciation"),
        ('I shall take the 5 star package!', "nonappreciation"),
        ('thank you so much', "appreciation"),
        ('YOU ARE RUINING MY MARRIAGE', "nonappreciation"),
    ]

    validation = [
        ('Yes chief', "appreciation"),
        ("Thanks! I'm sure it will be amazinggg", "appreciation"),
        ("Weeeelllll this is a no brainer, I 'll just leave the next day and save a whole lotta money! Can you book this for me right away so I don't lose it?", "nonappreciation"),
        ("Ok I'll book the package with 8 days in Pittsburgh from August 17th to the 24th. Thank you.", "appreciation"),
        ('Thanks - will do', "appreciation"),
        ('Killing it! thank', "appreciation"),
        ('Thanks, you too', "appreciation"),
        ('thank you wozbot :slightly_smiling_face: toodles', "appreciation"),
        ('spectacular book please', "nonappreciation"),
        ("Well, I reckon I'll just book this one.", "nonappreciation"),
        ("yea so I've heard... send me to Paris then", 'nonappreciation'),
        ('Fortaleza\n5 stars', "nonappreciation"),
        ('I guess I can increase my budget by 1000', 'nonappreciation'),
        ('ok see ya', "nonappreciation"),
        ('leaving from anywhere??', "nonappreciation"),
        ("That's it! Thank you so so much :):):)", "appreciation"),
        ('Done. Book it.', "nonappreciation"),
        ('Great, sounds perfect. Thank you.', "appreciation"),
        ('Thats all i had my heart set on!!', "nonappreciation"),
        ("That sounds like the better hotel. Can't be too cautious travelling by myself for the first time! I will book that deal in an economy class ticket, I'm not ready for business class YET, need to pass that bar exam!", "nonappreciation"),
        ('Then I will take my search elsewhere', "nonappreciation"),
        ('Ya thanks', "appreciation"),
        ('Thank you, glad to be going back so soon', "appreciation"),
        ('well okay I can always take the tram in to the city. I will book that one.', "nonappreciation"),
        ('This is hopeless', "nonappreciation"),
        ('Great, thank you. I will most certainly book my next vacation with you.', "appreciation"),
        ('thank youuuu', "appreciation"),
        ('Lock it down', "nonappreciation"),
        ("Please help! My lovely parents have been married fof 20 years and they've never taken a trip together. I'm thinking of getting them out of town Sept 6 to 9\nyou got anything good for 2 adults leaving sao paulo, for under 2400?", "nonappreciation"),
        ('we can also go to Kochi', "nonappreciation"),
        ('no but we can stay for 9 days instead of 3', "nonappreciation"),
        ('thanks you!', "appreciation"),
        ('Just under budget. ok bye now', "nonappreciation"),
        ('thankyou', "appreciation"),
        ('can you tell me the price and nearby attractions?', "nonappreciation"),
        ('1 adult', "nonappreciation"),
        ('San Jose to Porto Alegre please. oh it needs to be between sept 18 to 22', "nonappreciation"),
        ('Ok sold! please enter a booking for us', "nonappreciation"),
        ('I can leave from Tel aviv and I want to go to San Jose with 7 adults for 2500', "nonappreciation"),
        ('Well what about in Goiania.?', "nonappreciation"),
        ('you are being unhelpful just answer yes or no, is it near a park or beach?', "nonappreciation"),
        ('thak you', "appreciation"),
        ('I shall take the 5 star package!', "nonappreciation"),
        ('Okay but what if I leave from Naples instead. Can you get me to Manas from Naples?', "nonappreciation"),
        ("I'm a woman! Try to find something 9000 or less if you can.", "nonappreciation"),
        ("That's perfect.", "nonappreciation"),
        ('ok. fine. I have a 4500 $ budjet and I will star as long as that money lasts. thx', "appreciation"),
        ('sure fine flexible actually no i dont wanna go any more', "nonappreciation"),
        ("No, unfortunately I can't. Guess I'll just take a staycation this time :disappointed: Thanks anyway", "appreciation"),
        (" I'll book this one. Thank you, friend!", "appreciation"),
        ('No we can only go to Porto... or Porto. Thanks.', "appreciation"),
    ]

    # Train the Naive Bayes classifier.
    cl = NaiveBayesClassifier(train)

    # Only trust the classifier when it labels more than 90% of the
    # validation set correctly.
    if cl.accuracy(validation) > 0.90:
        # The validation examples have served their purpose; use them as
        # additional training data before classifying the real input.
        cl.update(validation)
        for m in final_utterance:
            if cl.classify(m) == "appreciation":
                classified_dict["appreciation"] += 1
            else:
                classified_dict["non-appreciation"] += 1

    # Calculate the percentage of people expressing appreciation, guarding
    # against the case where nothing was classified.
    total = classified_dict["appreciation"] + classified_dict["non-appreciation"]
    if total == 0:
        percentage = 0.0
    else:
        percentage = float(classified_dict["appreciation"]) / total * 100
    return "{}% people express appreciation.".format(percentage)
class NLTKHashtagsClassifier(Classifier):
    """
    Classifies InstagramProfiles as blogger, brand or undecided.

    Currently is a PROTOTYPE: the 'undecided' category and the undecided
    margin are stored but not yet used by the classification itself.
    """

    # list of all available categories for categorization
    AVAILABLE_CATEGORIES = [
        'brand',
        'blogger',
        'undecided',
    ]

    classifier = None
    undecided_margin = None

    def __init__(self, blogger_hashtags=None, brand_hashtags=None, undecided_margin=None):
        """
        Explicitly inits lists of hashtags and creates the TextBlob
        NaiveBayesClassifier object.
        Lists are not intended to contain unique hashtags.

        :param blogger_hashtags: list of lists of hashtags suitable for bloggers
        :param brand_hashtags: list of lists of hashtags suitable for brands
        :param undecided_margin: probability margin when to consider classification result as undecided
        :return:
        """
        # Imported lazily so the dependency is only needed when the
        # classifier is actually instantiated.
        from textblob.classifiers import NaiveBayesClassifier

        brand_label, blogger_label = self.AVAILABLE_CATEGORIES[0], self.AVAILABLE_CATEGORIES[1]
        initial_train = [(v, blogger_label) for v in (blogger_hashtags or [])]
        initial_train.extend((v, brand_label) for v in (brand_hashtags or []))

        # Previously the argument was accepted but silently discarded.
        self.undecided_margin = undecided_margin
        self.classifier = NaiveBayesClassifier(initial_train)

    def classify_unit(self, source=None, **kwargs):
        """
        This method is the core of classification algorithm. It receives
        source data for classification (object, model, string, etc.) and
        returns a value of classification category for this object.

        For example, we use InstagramProfile as source data, and result could
        be either 'brand' or 'blogger' or 'undecided'.
        """
        # TODO: add probability_margin logic here (compare
        # self.classifier.prob_classify(source) against self.undecided_margin
        # and return 'undecided' when the result is too close to call).
        return self.classifier.classify(source)

    def classify_queryset(self, source_queryset=None, **kwargs):
        """
        Helper method. Same as above but performs the whole queryset.

        Not implemented for this classifier yet.

        :raises NotImplementedError: always.
        """
        # TODO: Think how to do it for this classifier.
        # NotImplemented is a comparison sentinel, not an exception; raising
        # it produces a confusing TypeError instead.
        raise NotImplementedError

    def update_classifier(self, extra_data=None):
        """
        Feed additional (hashtags, category) training pairs to the classifier.

        :param extra_data: iterable of training pairs, or None to do nothing
        """
        if extra_data is not None:
            self.classifier.update(extra_data)
#Print succes message print "> File opened successfully!" counter = 0 for row in reader: selectTweets(row) counter += 1 print "> Wait a sec for the results..." cl = NaiveBayesClassifier(trainTweets) print "> add another data set" cl.update(trainFeatures) print "> finish combination" cl.show_informative_features(10) outputPos=open('positiveTweet.txt','a') outputNeg=open('negativeTweet.txt','a') dataset = str(raw_input("> Please enter a filename contains tweets: ")) with open(dataset) as f: out = f.readlines() for lines in out: tweetWords = [] words = lines.split() for i in words: i = i.lower() i = i.strip('@#\'"?,.!')
# Test model with its two labels print cl.classify(u" احسن علاج هذا") # second cl model test prob_dist = cl.prob_classify(u"ك يوم يا ظالم,") print prob_dist.max() print prob_dist.prob("positive") print prob_dist.prob("negative") # compute the accuracy on our test set print "accuracy on the test set:{} ".format(cl.accuracy("testing.csv", format="csv")) # display a listing of the most informative features. cl.show_informative_features(5) # add new data new_data = [(u"كلام صحيح من شان هيك الدول اللي ما فيها بطالة والمجتمعات المفتوحة بتقل فيها المشاكل النفسية", 'positive'), (u"لا طبعا التقرب الى الله هو خير علاج للحالات النفسية", 'positive'), (u"تفائلوا بالخير تجدوه", 'positive'), (u"يا ترى الحكومه بدها تزيد دعم المواطن الي الله يكون في عونه", 'negative')] # updating classifiers with new data cl.update(new_data) # test accuracy after adding new data to the generated model print "accuracy on the test set:{} ".format(cl.accuracy("testing.csv", format="csv"))
class TestNaiveBayesClassifier(unittest.TestCase): def setUp(self): self.classifier = NaiveBayesClassifier(train_set) def test_default_extractor(self): text = "I feel happy this morning." assert_equal(self.classifier.extract_features(text), basic_extractor(text, train_set)) def test_classify(self): res = self.classifier.classify("I feel happy this morning") assert_equal(res, 'positive') assert_equal(len(self.classifier.train_set), len(train_set)) def test_classify_a_list_of_words(self): res = self.classifier.classify( ["I", "feel", "happy", "this", "morning"]) assert_equal(res, "positive") def test_train_from_lists_of_words(self): # classifier can be trained on lists of words instead of strings train = [(doc.split(), label) for doc, label in train_set] classifier = NaiveBayesClassifier(train) assert_equal(classifier.accuracy(test_set), self.classifier.accuracy(test_set)) def test_prob_classify(self): res = self.classifier.prob_classify("I feel happy this morning") assert_equal(res.max(), "positive") assert_true(res.prob("positive") > res.prob("negative")) def test_accuracy(self): acc = self.classifier.accuracy(test_set) assert_true(isinstance(acc, float)) def test_update(self): res1 = self.classifier.prob_classify("lorem ipsum") original_length = len(self.classifier.train_set) self.classifier.update([("lorem ipsum", "positive")]) new_length = len(self.classifier.train_set) res2 = self.classifier.prob_classify("lorem ipsum") assert_true(res2.prob("positive") > res1.prob("positive")) assert_equal(original_length + 1, new_length) def test_labels(self): labels = self.classifier.labels() assert_true("positive" in labels) assert_true("negative" in labels) def test_show_informative_features(self): feats = self.classifier.show_informative_features() def test_informative_features(self): feats = self.classifier.informative_features(3) assert_true(isinstance(feats, list)) assert_true(isinstance(feats[0], tuple)) def test_custom_feature_extractor(self): cl = 
NaiveBayesClassifier(train_set, custom_extractor) cl.classify("Yay! I'm so happy it works.") assert_equal(cl.train_features[0][1], 'positive') def test_init_with_csv_file(self): cl = NaiveBayesClassifier(CSV_FILE, format="csv") assert_equal(cl.classify("I feel happy this morning"), 'pos') training_sentence = cl.train_set[0][0] assert_true(isinstance(training_sentence, unicode)) def test_init_with_csv_file_without_format_specifier(self): cl = NaiveBayesClassifier(CSV_FILE) assert_equal(cl.classify("I feel happy this morning"), 'pos') training_sentence = cl.train_set[0][0] assert_true(isinstance(training_sentence, unicode)) def test_init_with_json_file(self): cl = NaiveBayesClassifier(JSON_FILE, format="json") assert_equal(cl.classify("I feel happy this morning"), 'pos') training_sentence = cl.train_set[0][0] assert_true(isinstance(training_sentence, unicode)) def test_init_with_json_file_without_format_specifier(self): cl = NaiveBayesClassifier(JSON_FILE) assert_equal(cl.classify("I feel happy this morning"), 'pos') training_sentence = cl.train_set[0][0] assert_true(isinstance(training_sentence, unicode)) def test_accuracy_on_a_csv_file(self): a = self.classifier.accuracy(CSV_FILE) assert_true(isinstance(a, float)) def test_accuracy_on_json_file(self): a = self.classifier.accuracy(JSON_FILE) assert_true(isinstance(a, float)) def test_init_with_tsv_file(self): cl = NaiveBayesClassifier(TSV_FILE) assert_equal(cl.classify("I feel happy this morning"), 'pos') training_sentence = cl.train_set[0][0] assert_true(isinstance(training_sentence, unicode)) def test_init_with_bad_format_specifier(self): assert_raises(ValueError, lambda: NaiveBayesClassifier(CSV_FILE, format='unknown')) def test_repr(self): assert_equal( repr(self.classifier), "<NaiveBayesClassifier trained on {0} instances>".format( len(train_set)))
def process_questions(self):
    """Find question sentences in the log, rate them, and review the ratings.

    Phase 1 walks every logged message in key order, picks out sentences
    that end with "?" and contain one of the five Ws, classifies each as a
    good ("g") or bad ("b") question and records whether it mentions one of
    the trigger phrases. Results are cached in self.processed_tuples (keyed
    by sentence text) and persisted via self._pdump_processed_tuples().

    Phase 2 prints every cached sentence. When the classifier's rating and
    the trigger-phrase check disagree, the user is asked to confirm or
    correct the rating; the final label is fed back into the classifier and
    appended to self.train_file. Typing "break" stops the review.
    """
    self._load_training_data()
    self._pload_processed_tuples()

    five_ws = ["who", "what", "where", "when", "why"]
    trigger_phrases = [
        "best practice", "best way", "simplest way",
        "preferred nomenclature", "preferred location",
        " have any recommendation", "exact command", "documentation",
        " doc for ", " doc about ", "tutorial", "release",
        "external inventory", "inventory file", "playbook", "play", "role",
        "task", "handler", "variable", "var", "connection", "async",
        "accelerate", "{{", "}}", "lookup", "plugin", "callback", "hang",
        "conditional",
        # These two used to be fused into "when:group" by a missing comma,
        # so neither phrase could ever match on its own.
        "when:", "group",
        "ec2 module", "route53", "fault tolerance", "public key",
    ]

    cl = NaiveBayesClassifier(self.train)

    # logdata is keyed by the string form of an integer; sort numerically.
    sorted_ks = sorted(int(x) for x in self.logdata.keys())
    total_ks = sorted_ks[-1] if sorted_ks else 0

    for k in sorted_ks:
        k_str = str(k)
        print("%s - %s" % (total_ks, k_str))
        this_msg = self.logdata[k_str]['message']
        text_obj = TextBlob(this_msg)
        if not hasattr(text_obj, "raw_sentences"):
            continue

        for sent in text_obj.sentences:
            try:
                sent_str = str(sent)
            except UnicodeDecodeError:
                # Sentence cannot be represented as a plain str; skip it.
                continue

            if sent_str in self.processed_tuples:
                continue

            is_question = sent.endswith("?") and any(
                x.lower() in five_ws for x in sent.words)
            if not is_question:
                continue

            curr_rating = cl.classify(sent)
            triggered = any(ph in sent_str for ph in trigger_phrases)
            self.processed_tuples[sent_str] = (k, sent, curr_rating, triggered)

    # save what we have
    self._pdump_processed_tuples()

    for pt in list(self.processed_tuples.keys()):
        print("##############################\n")
        entry = self.processed_tuples[pt]
        sent = entry[1]
        curr_rating = entry[2]
        triggered = entry[3]

        print(sent)
        print("\n")
        print("rating: %s" % curr_rating)
        print("triggered: %s" % triggered)

        # Ask the user only when the classifier and the trigger phrases
        # disagree; otherwise accept the classifier's rating as-is.
        disagreement = ((curr_rating == "b" and triggered)
                        or (curr_rating == "g" and not triggered))
        if disagreement:
            q_string = "\n$ g(ood) question or b(ad) question? (default: %s): " % curr_rating
            x = raw_input(q_string)
        else:
            x = str(curr_rating)

        print("\n")

        if x == "break":
            # Previously a no-op; stop the review as the answer implies.
            break
        if x == "":
            label = curr_rating  # empty answer accepts the default
        elif x == "b" or x == "g":
            label = x
        else:
            continue  # unrecognised answer: leave this sentence untouched

        cl.update([(str(sent), label)])
        # Append to the training file, closing the handle deterministically.
        with open(self.train_file, "a") as train_fh:
            train_fh.write("'%s';%s\n" % (sent, label))
class BankClassify():
    """Interactively categorise bank transactions with a Naive Bayes classifier.

    Previously categorised transactions are loaded from a CSV file and used
    as training data; newly added transactions are categorised one by one
    with the classifier's guess offered as the default answer.
    """

    def __init__(self, data="Transactions.csv"):
        """Load in the previous data (by default from Transactions.csv) and
        initialise the classifier.

        Arguments:
         - data: path of the CSV file holding already-categorised
           transactions; it is also where add_data() saves its results.
        """
        self._data_file = data
        if os.path.exists(data):
            self.prev_data = pd.read_csv(data)
        else:
            self.prev_data = pd.DataFrame(
                columns=['date', 'desc', 'amount', 'cat'])

        self.classifier = NaiveBayesClassifier(
            self._get_training(self.prev_data), self._extractor)

    def add_data(self, filename):
        """Add new data and interactively classify it.

        The combined data is written back to the CSV file this instance was
        created with.

        Arguments:
         - filename: path of a plain-text statement readable by
           _read_own_file
        """
        self.new_data = self._read_own_file(filename)

        self._ask_with_guess(self.new_data)

        self.prev_data = pd.concat([self.prev_data, self.new_data])
        self.prev_data.to_csv(self._data_file, index=False)

    def _prep_for_analysis(self):
        """Prepare data for analysis in pandas, setting index types and subsetting"""
        self.prev_data = self._make_date_index(self.prev_data)

        self.prev_data['cat'] = self.prev_data['cat'].str.strip()

        self.inc = self.prev_data[self.prev_data.amount > 0]
        # Copy before modifying so the change does not write into a view of
        # prev_data (and does not trigger SettingWithCopyWarning).
        self.out = self.prev_data[self.prev_data.amount < 0].copy()
        self.out['amount'] = self.out['amount'].abs()

        self.inc_noignore = self.inc[self.inc.cat != 'Ignore']
        self.inc_noexpignore = self.inc[(self.inc.cat != 'Ignore') &
                                        (self.inc.cat != 'Expenses')]

        self.out_noignore = self.out[self.out.cat != 'Ignore']
        self.out_noexpignore = self.out[(self.out.cat != 'Ignore') &
                                        (self.out.cat != 'Expenses')]

    def _read_categories(self):
        """Read list of categories from categories.txt

        Returns a dict mapping the zero-based line number to the category
        name on that line.
        """
        categories = {}

        with open('categories.txt') as f:
            for i, line in enumerate(f):
                categories[i] = line.strip()

        return categories

    def _add_new_category(self, category):
        """Add a new category to categories.txt"""
        with open('categories.txt', 'a') as f:
            f.write('\n' + category)

    def _ask_with_guess(self, df):
        """Interactively guess categories for each transaction in df, asking
        each time if the guess is correct.

        For every row the user may press Enter to accept the guess, type a
        category number from the table, type a new category name (which is
        added to categories.txt), or type 'q' to stop early. Returns df with
        its 'cat' column filled in for every row answered.
        """
        df['cat'] = ""

        categories = self._read_categories()

        for index, row in df.iterrows():
            # Generate the category numbers table from the list of categories
            cats_list = [[idnum, cat] for idnum, cat in categories.items()]
            cats_table = tabulate(cats_list)

            stripped_text = self._strip_numbers(row['desc'])

            # Guess a category using the classifier (only if there is data
            # in the classifier)
            if len(self.classifier.train_set) > 1:
                guess = self.classifier.classify(stripped_text)
            else:
                guess = ""

            # Clear the screen (ANSI escape) and print the list of categories
            print(chr(27) + "[2J")
            print(cats_table)
            print("\n\n")
            # Print transaction
            print("On: %s\t %.2f\n%s" %
                  (row['date'], row['amount'], row['desc']))
            print("My guess is: " + str(guess))

            # Keep asking until the answer can be turned into a category.
            while True:
                input_value = input("> ")

                if input_value.lower() == 'q':
                    # If the input was 'q' then quit
                    return df

                if input_value == "":
                    # If the input was blank then our guess was right!
                    category = guess
                    break

                try:
                    category_number = int(input_value)
                except ValueError:
                    # Not a number, so it is a new category name: add it to
                    # the list of categories
                    category = input_value
                    self._add_new_category(category)
                    categories = self._read_categories()
                    break

                if category_number in categories:
                    category = categories[category_number]
                    break

                # A number that is not in the table used to crash with
                # KeyError; ask again instead.
                print("Unknown category number %d, please try again." %
                      category_number)

            # Write correct answer
            df.at[index, 'cat'] = category
            # Update classifier
            self.classifier.update([(stripped_text, category)])

        return df

    def _make_date_index(self, df):
        """Make the index of df a Datetime index"""
        df.index = pd.DatetimeIndex(
            df.date.apply(dateutil.parser.parse, dayfirst=True))

        return df

    def _read_own_file(self, filename):
        """Read a whitespace-separated plain-text statement into a DataFrame.

        The first three lines are skipped. On every other non-blank line the
        first two fields form the date, the last field is the amount
        (thousands separators removed) and everything in between is the
        description.

        Returns a DataFrame with string 'date' and 'desc' columns and a
        float 'amount' column.
        """
        # NOTE: parsers for other bank layouts were previously kept here as
        # commented-out code; they were removed for readability.
        with open(filename, errors='replace') as f:
            lines = f.readlines()

        dates = []
        descs = []
        amounts = []

        for line in lines[3:]:
            s = line.split()
            if not s:
                # Blank line: nothing to parse (used to raise IndexError).
                continue
            dates.append(" ".join(s[0:2]))
            amounts.append(s[-1].replace(',', ''))
            descs.append(" ".join(s[2:-1]))

        df = pd.DataFrame({'date': dates, 'desc': descs, 'amount': amounts})

        df['amount'] = df.amount.astype(float)
        df['desc'] = df.desc.astype(str)
        df['date'] = df.date.astype(str)

        return df

    def _get_training(self, df):
        """Get training data for the classifier, consisting of tuples of
        (text, category).

        Rows without a category (empty string or missing value) are skipped.
        """
        # Empty categories come back from CSV as missing values, which would
        # otherwise be used as a training label.
        labelled = df[df['cat'].notna() & (df['cat'] != '')]

        return [(self._strip_numbers(desc), cat)
                for desc, cat in zip(labelled['desc'], labelled['cat'])]

    def _extractor(self, doc):
        """Extract tokens from a given string

        Returns a dict mapping every non-empty token to True.
        """
        # TODO: Extend to extract words within words
        # For example, MUSICROOM should give MUSIC and ROOM
        tokens = self._split_by_multiple_delims(doc, [' ', '/'])

        return {token: True for token in tokens if token != ""}

    def _strip_numbers(self, s):
        """Strip numbers from the given string

        Everything except upper-case ASCII letters and spaces is removed, so
        digits, punctuation and lower-case letters are all dropped.
        """
        return re.sub("[^A-Z ]", "", s)

    def _split_by_multiple_delims(self, string, delims):
        """Split the given string by the list of delimiters given

        Delimiters are matched literally, so characters such as '.' or '|'
        are safe to use.
        """
        regexp = "|".join(re.escape(delim) for delim in delims)

        return re.split(regexp, string)
('Prediction', 'Contextual Text Mining'), ('Contextual', 'Contextual Text Mining')]
# Instantiate the Naive Bayes classifier from the hand-written feature list.
classifier = NaiveBayesClassifier(featureListTrain)
# Shuffle the corpus data in place before splitting it.
random.shuffle(data)
#print(str(data[0][1]).split('::'))
# Split corpus data into train and test datasets.
# NOTE(review): data[10] ends up in neither slice -- confirm that the gap
# between the two ranges is intentional.
train, test = data[0:10], data[11:23]
# Update the classifier with the corpus training slice.
classifier.update(train)
# Compute accuracy.
# NOTE(review): the evaluation set includes all of `data`, which contains
# the `train` slice the classifier was just updated with.
accuracy = classifier.accuracy(featureListTest + test + data)
print("Accuracy: {0}".format(accuracy))
catList = []
# Loop through Corpus Data and Classify on entire dataset
# We do not have a large dataset and hence to get maximum categories classified
# the entire data is being considered
# If probability of classification is at least 0.5 then capture the category
i = 0
while i < len(data):
    # Probability distribution over categories for the i-th document's text.
    pdist = classifier.prob_classify(str(data[i][0]))
    #for category in reader.categories():
import pickle
from textblob.classifiers import NaiveBayesClassifier

# Template: train a classifier, extend it, save it with pickle and load it
# back. The '???' / '...' values are placeholders to be filled in.
classifier = NaiveBayesClassifier([('???', '???'), ...])
classifier.update([('???', '???'), ...])

# Persist the trained classifier to disk.
with open('...', 'wb') as f:
    pickle.dump(classifier, f)

# Restore it later. Only unpickle files you trust: loading a pickle can
# execute arbitrary code.
with open('...', 'rb') as f:
    classifier = pickle.load(f)

classifier.classify(...)
('I do not like this new restaurant', 'neg'), ('I am tired of waiting for my new book.', 'neg'), ("I can't deal with my toothache", 'neg'), ("The fun events in costa rica were amazing", 'pos'), ('He is my worst boss!', 'neg'), ('People do have bad writing skills on facebook', 'neg')] test = [('The beer was good.', 'pos'), ('I do not enjoy my job', 'neg'), ("I feel amazing!", 'pos'), ('Mark is a friend of mine.', 'pos'), ("I can't believe I was asked to do this.", 'neg')] cl = NaiveBayesClassifier(train) print(cl.classify("The new movie was amazing.")) # "pos" print(cl.classify("I don't like ther noodles.")) # "neg" print "Test Results" cl.update(test) # Classify a TextBlob blob = TextBlob( "The food was good. But the service was horrible. " "My father was not pleased.", classifier=cl) print(blob) print(blob.classify()) for sentence in blob.sentences: print(sentence) print(sentence.classify()) # Compute accuracy print("Accuracy: {0}".format(cl.accuracy(test)))
# Trains two classifiers on (text, z-score) pairs -- one on titles, one on
# article bodies -- and accumulates the test data across passes.
# The `if` that pairs with the `else:` below starts before this fragment:
# its body creates the master test collections and both classifiers, while
# the `else:` branch appends the new test data with np.append and feeds the
# new training pairs to the existing classifiers via update().
# Afterwards the accuracy of both classifiers on the accumulated test data
# is printed and the title classifier is pickled to 'titleClassifier.pkl'.
# NOTE(review): the file object passed to pickle.dump is never explicitly
# closed; a `with open(...)` block would release it deterministically.
# NOTE(review): the code line is left exactly as found because its original
# indentation cannot be reconstructed without the missing `if` header.
articleTestMaster = articleTest titleTestMaster = titleTest zScoreTestMaster = zScoreTest titleClassifier = NaiveBayesClassifier([ (title, score) for title, score in zip(titleTrain, zScoreTrain) ]) articleClassifier = NaiveBayesClassifier([ (article, score) for article, score in zip(articleTrain, zScoreTrain) ]) else: articleTestMaster = np.append(articleTestMaster, articleTest) titleTestMaster = np.append(titleTestMaster, titleTest) zScoreTestMaster = np.append(zScoreTestMaster, zScoreTest) titleClassifier.update( ([(title, score) for title, score in zip(titleTrain, zScoreTrain)])) articleClassifier.update( ([(article, score) for article, score in zip(articleTrain, zScoreTrain)])) print( titleClassifier.accuracy([ (title, zScore) for title, zScore in zip(titleTestMaster, zScoreTestMaster) ])) print( articleClassifier.accuracy([ (article, zScore) for article, zScore in zip(articleTestMaster, zScoreTestMaster) ])) pickle.dump(titleClassifier, open('titleClassifier.pkl', 'wb'))
# ### Evaluating Classifiers
class1.accuracy(test)

# ### Display a Listing of the Most Informative Features
class1.show_informative_features(5)

# ### Updating Classifiers with New Data
new_data = [('She is my best friend.', 'pos'),
            ("I'm happy to have a new friend.", 'pos'),
            ("Stay thirsty, my friend.", 'pos'),
            ("He ain't from around here.", 'neg')]
class1.update(new_data)
class1.accuracy(test)


# ### Feature Extractors
def end_word_extractor(document):
    """Build features from the first and last word of *document*.

    Arguments:
     - document: the text to extract features from (split on whitespace).

    Returns a dict with the keys "first(<word>)" and "last(<word>)", or an
    empty dict when the document contains no words.
    """
    tokens = document.split()
    if not tokens:
        # Empty or whitespace-only text has no first/last word; indexing
        # below would raise IndexError.
        return {}

    first_word, last_word = tokens[0], tokens[-1]
    feats = {}
    feats["first({0})".format(first_word)] = True
    # NOTE(review): the "last" feature is set to False while "first" is set
    # to True; kept as found -- confirm this asymmetry is intended.
    feats["last({0})".format(last_word)] = False
    return feats
class BankClassify():
    """Interactive categoriser for bank transactions.

    Previously categorised transactions are kept in ``self.prev_data`` (a
    DataFrame with 'date', 'desc', 'amount' and 'cat' columns). A
    NaiveBayesClassifier trained on their descriptions is used to suggest a
    category for every newly imported transaction.
    """

    def __init__(self, data="AllData.csv"):
        """Load in the previous data (by default from AllData.csv) and initialise the classifier

        Arguments:
         - data: path of the CSV file with previously categorised
           transactions. If it does not exist an empty table is used.
           add_data saves back to this same path.
        """
        # Remember the source file so add_data writes back to it rather than
        # to a hard-coded name
        self.trainingDataFile = data

        if os.path.exists(data):
            self.prev_data = pd.read_csv(data)
        else:
            self.prev_data = pd.DataFrame(
                columns=['date', 'desc', 'amount', 'cat'])

        self.classifier = NaiveBayesClassifier(
            self._get_training(self.prev_data), self._extractor)

    def add_data(self, filename):
        """Add new data and interactively classify it.

        The combined data is saved to the file given to the constructor.

        Arguments:
         - filename: filename of Santander-format file
        """
        self.new_data = self._read_santander_file(filename)

        self._ask_with_guess(self.new_data)

        self.prev_data = pd.concat([self.prev_data, self.new_data])
        self.prev_data.to_csv(self.trainingDataFile, index=False)

    def _prep_for_analysis(self):
        """Prepare data for analysis in pandas, setting index types and subsetting

        Sets self.inc / self.out (incoming / outgoing transactions, with
        outgoing amounts made positive) and the *_noignore / *_noexpignore
        variants that drop the 'Ignore' (and 'Expenses') categories.
        """
        self.prev_data = self._make_date_index(self.prev_data)

        self.prev_data['cat'] = self.prev_data['cat'].str.strip()

        self.inc = self.prev_data[self.prev_data.amount > 0]
        # Copy before modifying: assigning into a filtered slice triggers
        # pandas' SettingWithCopy behaviour
        self.out = self.prev_data[self.prev_data.amount < 0].copy()
        self.out['amount'] = self.out['amount'].abs()

        self.inc_noignore = self.inc[self.inc.cat != 'Ignore']
        self.inc_noexpignore = self.inc[(self.inc.cat != 'Ignore') &
                                        (self.inc.cat != 'Expenses')]

        self.out_noignore = self.out[self.out.cat != 'Ignore']
        self.out_noexpignore = self.out[(self.out.cat != 'Ignore') &
                                        (self.out.cat != 'Expenses')]

    def _read_categories(self):
        """Read list of categories from categories.txt

        Returns a dict mapping the zero-based line number to the stripped
        category name on that line.
        """
        with open('categories.txt') as f:
            return {i: line.strip() for i, line in enumerate(f)}

    def _add_new_category(self, category):
        """Add a new category to categories.txt"""
        with open('categories.txt', 'a') as f:
            f.write('\n' + category)

    def _prompt_for_choice(self, categories):
        """Prompt until the reply is not an out-of-range category number.

        Returns the raw reply. Any non-numeric reply (including '' and 'q')
        is returned unchanged; a numeric reply is only returned when it is a
        key of `categories`, so the caller can look it up safely.
        """
        while True:
            input_value = input("> ")
            try:
                number = int(input_value)
            except ValueError:
                return input_value
            if number in categories:
                return input_value
            print("No category numbered %d; please try again." % number)

    def _ask_with_guess(self, df):
        """Interactively guess categories for each transaction in df, asking each time if the guess is correct

        For every row the user may press Enter to accept the guess, type a
        category number, type a new category name (which is appended to
        categories.txt), or type 'q' to stop early. Fills the 'cat' column
        of df in place and returns df.
        """
        # Initialise colorama
        init()

        df['cat'] = ""

        categories = self._read_categories()

        for index, row in df.iterrows():
            # Generate the category numbers table from the list of categories
            # (rebuilt every time because new categories may have been added)
            cats_list = [[idnum, cat] for idnum, cat in categories.items()]
            cats_table = tabulate(cats_list)

            stripped_text = self._strip_numbers(row['desc'])

            # Guess a category using the classifier (only if there is data in the classifier)
            if len(self.classifier.train_set) > 1:
                guess = self.classifier.classify(stripped_text)
            else:
                guess = ""

            # Clear the screen (ANSI escape) and print list of categories
            print(chr(27) + "[2J")
            print(cats_table)
            print("\n\n")
            # Print transaction
            print("On: %s\t %.2f\n%s" %
                  (row['date'], row['amount'], row['desc']))
            print(Fore.RED + Style.BRIGHT + "My guess is: " + str(guess) +
                  Fore.RESET)

            # A number that is not in the table used to crash the whole
            # session with KeyError; the prompt now repeats instead
            input_value = self._prompt_for_choice(categories)

            if input_value.lower() == 'q':
                # If the input was 'q' then quit
                return df
            if input_value == "":
                # If the input was blank then our guess was right!
                df.at[index, 'cat'] = guess
                self.classifier.update([(stripped_text, guess)])
            else:
                # Otherwise, our guess was wrong
                try:
                    # Try converting the input to an integer category number
                    # If it works then we've entered a category
                    category_number = int(input_value)
                    category = categories[category_number]
                except ValueError:
                    # Otherwise, we've entered a new category, so add it to the list of
                    # categories
                    category = input_value
                    self._add_new_category(category)
                    categories = self._read_categories()

                # Write correct answer
                df.at[index, 'cat'] = category
                # Update classifier
                self.classifier.update([(stripped_text, category)])

        return df

    def _make_date_index(self, df):
        """Make the index of df a Datetime index (dates are parsed day-first)"""
        df.index = pd.DatetimeIndex(
            df.date.apply(dateutil.parser.parse, dayfirst=True))

        return df

    def _read_santander_file(self, filename):
        """Read a file in the plain text format that Santander provides downloads in.

        The first four lines are skipped; after that every non-blank line is
        treated as ``<field>: <value>`` and the 'Date', 'Description' and
        'Amount' fields are collected.

        Returns a pd.DataFrame with columns of 'date', 'desc' and 'amount'."""
        with open(filename, errors='replace') as f:
            lines = f.readlines()

        dates = []
        descs = []
        amounts = []

        for line in lines[4:]:
            # Drop every non-ASCII character
            line = "".join(i for i in line if ord(i) < 128)
            if line.strip() == '':
                continue

            # Split on the first colon only; the value may contain colons
            category, _, data = line.partition(":")

            if category == 'Date':
                dates.append(data.strip())
            elif category == 'Description':
                descs.append(data.strip())
            elif category == 'Amount':
                # Keep only digits, decimal point and minus sign
                just_numbers = re.sub(r"[^0-9\.-]", "", data)
                amounts.append(just_numbers.strip())

        df = pd.DataFrame({'date': dates, 'desc': descs, 'amount': amounts})

        df['amount'] = df.amount.astype(float)
        df['desc'] = df.desc.astype(str)
        df['date'] = df.date.astype(str)

        return df

    def _get_training(self, df):
        """Get training data for the classifier, consisting of tuples of
        (text, category)

        Rows whose category is empty or missing (NaN, e.g. blank cells
        reloaded from CSV) are skipped.
        """
        cats = df['cat']
        subset = df[cats.notna() & (cats != '')]

        # Iterate over the values directly: the filtered frame keeps its
        # original index labels, so they must not be used as positions
        return [(self._strip_numbers(desc), cat)
                for desc, cat in zip(subset['desc'], subset['cat'])]

    def _extractor(self, doc):
        """Extract tokens from a given string

        Returns a dict mapping every non-empty token to True.
        """
        # TODO: Extend to extract words within words
        # For example, MUSICROOM should give MUSIC and ROOM
        tokens = self._split_by_multiple_delims(doc, [' ', '/'])

        return {token: True for token in tokens if token != ""}

    def _strip_numbers(self, s):
        """Strip numbers from the given string

        Removes every character that is not an upper-case A-Z letter or a
        space.
        """
        return re.sub("[^A-Z ]", "", s)

    def _split_by_multiple_delims(self, string, delims):
        """Split the given string by the list of delimiters given

        Delimiters are matched literally, so regex metacharacters such as
        '.' or '+' are safe to pass.
        """
        regexp = "|".join(re.escape(delim) for delim in delims)

        return re.split(regexp, string)
class BankClassify():
    """Interactive categoriser for bank transactions from several banks.

    Previously categorised transactions are kept in ``self.prev_data`` (a
    DataFrame with 'date', 'desc', 'amount' and 'cat' columns). A
    NaiveBayesClassifier trained on their descriptions is used to suggest a
    category for every newly imported transaction.
    """

    def __init__(self, data="AllData.csv"):
        """Load in the previous data (by default from `data`) and initialise the classifier

        Arguments:
         - data: path of the CSV file with previously categorised
           transactions. If it does not exist an empty table is used.
           add_data saves back to this same path.
        """
        # allows dynamic training data to be used (i.e many accounts in a loop)
        self.trainingDataFile = data

        if os.path.exists(data):
            self.prev_data = pd.read_csv(self.trainingDataFile)
        else:
            self.prev_data = pd.DataFrame(
                columns=['date', 'desc', 'amount', 'cat'])

        self.classifier = NaiveBayesClassifier(
            self._get_training(self.prev_data), self._extractor)

    def add_data(self, filename, bank="santander"):
        """Add new data and interactively classify it.

        Arguments:
         - filename: filename of the bank's export file
         - bank: which reader to use; one of "santander", "nationwide",
           "lloyds" or "barclays"

        Raises ValueError for any other value of `bank`.
        """
        if bank == "santander":
            print("adding Santander data!")
            self.new_data = self._read_santander_file(filename)
        elif bank == "nationwide":
            print("adding Nationwide data!")
            self.new_data = self._read_nationwide_file(filename)
        elif bank == "lloyds":
            print("adding Lloyds Bank data!")
            self.new_data = self._read_lloyds_csv(filename)
        elif bank == "barclays":
            print("adding Barclays Bank data!")
            self.new_data = self._read_barclays_csv(filename)
        else:
            # Without this, an unknown bank silently re-used stale new_data
            # (or failed later with AttributeError)
            raise ValueError("Unsupported bank: %r" % (bank,))

        self._ask_with_guess(self.new_data)

        self.prev_data = pd.concat([self.prev_data, self.new_data])
        # save data to the same file we loaded earlier
        self.prev_data.to_csv(self.trainingDataFile, index=False)

    def _prep_for_analysis(self):
        """Prepare data for analysis in pandas, setting index types and subsetting

        Sets self.inc / self.out (incoming / outgoing transactions, with
        outgoing amounts made positive) and the *_noignore / *_noexpignore
        variants that drop the 'Ignore' (and 'Expenses') categories.
        """
        self.prev_data = self._make_date_index(self.prev_data)

        self.prev_data['cat'] = self.prev_data['cat'].str.strip()

        self.inc = self.prev_data[self.prev_data.amount > 0]
        # Copy before modifying: assigning into a filtered slice triggers
        # pandas' SettingWithCopy behaviour
        self.out = self.prev_data[self.prev_data.amount < 0].copy()
        self.out['amount'] = self.out['amount'].abs()

        self.inc_noignore = self.inc[self.inc.cat != 'Ignore']
        self.inc_noexpignore = self.inc[(self.inc.cat != 'Ignore') &
                                        (self.inc.cat != 'Expenses')]

        self.out_noignore = self.out[self.out.cat != 'Ignore']
        self.out_noexpignore = self.out[(self.out.cat != 'Ignore') &
                                        (self.out.cat != 'Expenses')]

    def _read_categories(self):
        """Read list of categories from categories.txt

        Returns a dict mapping the zero-based line number to the stripped
        category name on that line.
        """
        with open('categories.txt') as f:
            return {i: line.strip() for i, line in enumerate(f)}

    def _add_new_category(self, category):
        """Add a new category to categories.txt"""
        with open('categories.txt', 'a') as f:
            f.write('\n' + category)

    def _prompt_for_choice(self, categories):
        """Prompt until the reply is not an out-of-range category number.

        Returns the raw reply. Any non-numeric reply (including '' and 'q')
        is returned unchanged; a numeric reply is only returned when it is a
        key of `categories`, so the caller can look it up safely.
        """
        while True:
            input_value = input("> ")
            try:
                number = int(input_value)
            except ValueError:
                return input_value
            if number in categories:
                return input_value
            print("No category numbered %d; please try again." % number)

    def _ask_with_guess(self, df):
        """Interactively guess categories for each transaction in df, asking each time if the guess is correct

        For every row the user may press Enter to accept the guess, type a
        category number, type a new category name (which is appended to
        categories.txt), or type 'q' to stop early. Fills the 'cat' column
        of df in place and returns df.
        """
        # Initialise colorama
        init()

        df['cat'] = ""

        categories = self._read_categories()

        for index, row in df.iterrows():
            # Generate the category numbers table from the list of categories
            # (rebuilt every time because new categories may have been added)
            cats_list = [[idnum, cat] for idnum, cat in categories.items()]
            cats_table = tabulate(cats_list)

            stripped_text = self._strip_numbers(row['desc'])

            # Guess a category using the classifier (only if there is data in the classifier)
            if len(self.classifier.train_set) > 1:
                guess = self.classifier.classify(stripped_text)
            else:
                guess = ""

            # Clear the screen (ANSI escape) and print list of categories
            print(chr(27) + "[2J")
            print(cats_table)
            print("\n\n")
            # Print transaction
            print("On: %s\t %.2f\n%s" %
                  (row['date'], row['amount'], row['desc']))
            print(Fore.RED + Style.BRIGHT + "My guess is: " + str(guess) +
                  Fore.RESET)

            # A number that is not in the table used to crash the whole
            # session with KeyError; the prompt now repeats instead
            input_value = self._prompt_for_choice(categories)

            if input_value.lower() == 'q':
                # If the input was 'q' then quit
                return df
            if input_value == "":
                # If the input was blank then our guess was right!
                df.at[index, 'cat'] = guess
                self.classifier.update([(stripped_text, guess)])
            else:
                # Otherwise, our guess was wrong
                try:
                    # Try converting the input to an integer category number
                    # If it works then we've entered a category
                    category_number = int(input_value)
                    category = categories[category_number]
                except ValueError:
                    # Otherwise, we've entered a new category, so add it to the list of
                    # categories
                    category = input_value
                    self._add_new_category(category)
                    categories = self._read_categories()

                # Write correct answer
                df.at[index, 'cat'] = category
                # Update classifier
                self.classifier.update([(stripped_text, category)])

        return df

    def _make_date_index(self, df):
        """Make the index of df a Datetime index (dates are parsed day-first)"""
        df.index = pd.DatetimeIndex(
            df.date.apply(dateutil.parser.parse, dayfirst=True))

        return df

    def _read_nationwide_file(self, filename):
        """Read a file in the csv file that Nationwide provides downloads in.

        The first five lines are skipped. Each remaining line is split on
        the ``","`` sequence into these fields:
            0 = Date, 1 = Transaction type, 2 = Description,
            3 = Paid Out, 4 = Paid In, 5 = Balance

        Returns a pd.DataFrame with columns of 'date', 'desc' and 'amount'."""
        # errors='replace' matches the Santander reader; undecodable bytes
        # are dropped by the non-ASCII filter below anyway
        with open(filename, errors='replace') as f:
            lines = f.readlines()

        dates = []
        descs = []
        amounts = []

        for line in lines[5:]:
            # Drop every non-ASCII character
            line = "".join(i for i in line if ord(i) < 128)
            if line.strip() == '':
                continue

            splits = line.split("\",\"")

            # Convert e.g. "01 Feb 2020" to "01/02/2020"
            date = splits[0].replace("\"", "").strip()
            date = datetime.strptime(date, '%d %b %Y').strftime('%d/%m/%Y')
            dates.append(date)

            # get spend/pay in amount
            if splits[3] != "":  # paid out
                spend = float(re.sub(r"[^0-9\.-]", "", splits[3])) * -1
            else:  # paid in
                spend = float(re.sub(r"[^0-9\.-]", "", splits[4]))

            amounts.append(spend)

            # Description
            descs.append(splits[2])

        df = pd.DataFrame({'date': dates, 'desc': descs, 'amount': amounts})

        df['amount'] = df.amount.astype(float)
        df['desc'] = df.desc.astype(str)
        df['date'] = df.date.astype(str)

        return df

    def _read_santander_file(self, filename):
        """Read a file in the plain text format that Santander provides downloads in.

        The first four lines are skipped; after that every non-blank line is
        treated as ``<field>: <value>`` and the 'Date', 'Description' and
        'Amount' fields are collected.

        Returns a pd.DataFrame with columns of 'date', 'desc' and 'amount'."""
        with open(filename, errors='replace') as f:
            lines = f.readlines()

        dates = []
        descs = []
        amounts = []

        for line in lines[4:]:
            # Drop every non-ASCII character
            line = "".join(i for i in line if ord(i) < 128)
            if line.strip() == '':
                continue

            # Split on the first colon only; the value may contain colons
            category, _, data = line.partition(":")

            if category == 'Date':
                dates.append(data.strip())
            elif category == 'Description':
                descs.append(data.strip())
            elif category == 'Amount':
                # Keep only digits, decimal point and minus sign
                just_numbers = re.sub(r"[^0-9\.-]", "", data)
                amounts.append(just_numbers.strip())

        df = pd.DataFrame({'date': dates, 'desc': descs, 'amount': amounts})

        df['amount'] = df.amount.astype(float)
        df['desc'] = df.desc.astype(str)
        df['date'] = df.date.astype(str)

        return df

    def _read_lloyds_csv(self, filename):
        """Read a file in the CSV format that Lloyds Bank provides downloads in.

        Lloyds exports separate debit and credit columns; they are merged
        into one signed 'amount' column (spends negative, income positive).

        Returns a pd.DataFrame that includes the columns 'date', 'desc' and
        'amount' (the other columns of the file are kept as well)."""
        df = pd.read_csv(filename, skiprows=0)

        # Rename columns
        df.rename(columns={
            "Transaction Date": 'date',
            "Transaction Description": 'desc',
            "Debit Amount": 'amount',
            "Credit Amount": 'creditAmount'
        },
                  inplace=True)

        # if its income we still want it in the amount col!
        # lloyds outputs 2 cols, credit and debit, we want 1 col representing a +- figure
        debit = df['amount']
        credit = df['creditAmount']
        # positive debit -> negative spend; otherwise positive credit ->
        # income; otherwise the debit value is left as it was
        df['amount'] = (-debit).where(debit > 0,
                                      credit.where(credit > 0, debit))

        # cast types to columns for math
        df = df.astype({"desc": str, "date": str, "amount": float})

        return df

    def _read_barclays_csv(self, filename):
        """Read a file in the CSV format that Barclays Bank provides downloads in.

        Edge case: foreign txn's sometimes causes more cols than it should
        Returns a pd.DataFrame with columns of 'date' 1 , 'desc' (memo) 5 and 'amount' 3 ."""

        # Edge case: Barclays foreign transaction memo sometimes contains a comma, which is bad.
        # Use a work-around to read only fixed col count
        # https://stackoverflow.com/questions/20154303/pandas-read-csv-expects-wrong-number-of-columns-with-ragged-csv-file
        # Prevents an error where some rows have more cols than they should
        # Each line is read as one field (split on '^') and then split on
        # commas by hand. read_csv's `prefix` argument was removed in
        # pandas 2.0, so the 'X' prefix is applied with add_prefix instead.
        temp = pd.read_csv(filename, sep='^', header=None,
                           skiprows=1).add_prefix('X')
        temp2 = temp.X0.str.split(',', expand=True)
        del temp['X0']
        df = pd.concat([temp, temp2], axis=1)

        # Rename columns
        df.rename(columns={1: 'date', 5: 'desc', 3: 'amount'}, inplace=True)

        # cast types to columns for math
        df = df.astype({"desc": str, "date": str, "amount": float})

        return df

    def _get_training(self, df):
        """Get training data for the classifier, consisting of tuples of
        (text, category)

        Rows whose category is empty or missing (NaN, e.g. blank cells
        reloaded from CSV) are skipped.
        """
        cats = df['cat']
        subset = df[cats.notna() & (cats != '')]

        # Iterate over the values directly: the filtered frame keeps its
        # original index labels, so they must not be used as positions
        return [(self._strip_numbers(desc), cat)
                for desc, cat in zip(subset['desc'], subset['cat'])]

    def _extractor(self, doc):
        """Extract tokens from a given string

        Returns a dict mapping every non-empty token to True.
        """
        # TODO: Extend to extract words within words
        # For example, MUSICROOM should give MUSIC and ROOM
        tokens = self._split_by_multiple_delims(doc, [' ', '/'])

        return {token: True for token in tokens if token != ""}

    def _strip_numbers(self, s):
        """Strip numbers from the given string

        Removes every character that is not an upper-case A-Z letter or a
        space.
        """
        return re.sub("[^A-Z ]", "", s)

    def _split_by_multiple_delims(self, string, delims):
        """Split the given string by the list of delimiters given

        Delimiters are matched literally, so regex metacharacters such as
        '.' or '+' are safe to pass.
        """
        regexp = "|".join(re.escape(delim) for delim in delims)

        return re.split(regexp, string)
tweet = info[2] polar = info[len(info) - 1] c = tuple([tweet, polar]) trainingData.append(c) print(len(trainingData)) myData1 = trainingData[0:1000] trainingData = trainingData[1000:len(trainingData)] cl = NaiveBayesClassifier(myData1) for i in range(0, len(trainingData), 50): chunk = trainingData[i:i + 50] print(i) cl.update(chunk) print(cl.show_informative_features()) myList2 = [] test = [] with open("data05_02_2020_11-17.csv", "r") as f: reader = csv.reader(f, delimiter="\t") for i, line in enumerate(reader): if i % 2 == 0: myList2.append(line[0]) for line in myList2: info = line.split(',') tweet = info[2]
#inicia classificador treinamento cl = NaiveBayesClassifier(train2) #atualiza barra de progresso i = int(time.time() - start) bar.update(i) #registra acuracia treinamento x teste actest = cl.accuracy(test2) #atualiza barra de progresso i = int(time.time() - start) bar.update(i) #treina com a base de teste cl.update(test2) #atualiza barra de progresso i = int(time.time() - start) bar.update(i) #registra acuracia teste vs treinamento + este actest2 = cl.accuracy(test2) print('\n') print("Descrição dos Dados de Treinamento:") print(datamerge1.describe()) print('\n') print("Descrição dos Dados de Teste:") print(datamerge2.describe()) print('\n')