示例#1
0
class NBClassifier:
    """Naive Bayes classifier whose training set is persisted to a JSON file.

    The file is read once at construction time and rewritten with the
    classifier's complete training set after every update.
    """

    def __init__(self, train_data_file):
        """Build the classifier from the JSON training file.

        Args:
            train_data_file: Path of an existing JSON file; its open handle is
                passed to ``NaiveBayesClassifier`` with ``format="json"``.
        """
        self._train_data_file = train_data_file
        # Read-only access is sufficient here, and the context manager closes
        # the handle even when the classifier constructor raises.
        with open(self._train_data_file, 'r') as f:
            self._cl = NaiveBayesClassifier(f, format="json")

    def update_train_set(self, sentence):
        """Add one labelled sentence to the classifier and persist the result.

        Args:
            sentence: Object exposing ``str_sentence`` (the text) and
                ``label`` attributes.
        """
        new_data = [(sentence.str_sentence, sentence.label)]
        self._cl.update(new_data)
        self._save_data_to_file()

    def _save_data_to_file(self):
        """Rewrite the training file with the classifier's current train set.

        The file holds a JSON list of ``{"text": ..., "label": ...}`` objects.
        """
        import json

        # Let the json module do the quoting: hand-built strings produce
        # invalid JSON as soon as a text contains a quote or a backslash.
        records = [
            {"text": str(el[0]), "label": str(el[1])}
            for el in self._cl.train_set
        ]
        # 'w' truncates, so no stale bytes of a previous dump can survive.
        with open(self._train_data_file, 'w') as f:
            json.dump(records, f)

    def prob_classify(self, sentence):
        """Return the most probable label for ``sentence``."""
        return self._cl.prob_classify(sentence).max()
示例#2
0
class Model(object):
    """Named wrapper around a ``NaiveBayesClassifier`` that starts untrained."""

    def __init__(self, name='Guess', config=None):
        """Create an untrained model.

        Args:
            name: Display name of the model.
            config: Optional configuration mapping; a new empty dict is used
                when omitted.
        """
        self.name = name
        # A ``{}`` default would be one dict shared by every Model instance.
        self.config = {} if config is None else config
        self.clf = NaiveBayesClassifier([])

    def train(self, training_data):
        """Update the classifier with labelled examples.

        Args:
            training_data: Iterable of mappings read through ``.get('text')``
                and ``.get('label')``; a missing key yields ``None``.
        """
        safe_training = [
            (example.get('text'), example.get('label'))
            for example in training_data
        ]
        self.clf.update(safe_training)

    def evaluate(self, text):
        """Classify ``text``.

        Returns:
            Tuple ``(label, probability)`` where ``probability`` is the value
            the classifier's probability distribution assigns to ``label``.
        """
        label = self.clf.classify(text)
        prob_dist = self.clf.prob_classify(text)
        label_prob = prob_dist.prob(label)
        return label, label_prob

    def get_classes(self):
        """Return the labels known to the classifier."""
        return self.clf.labels()

    def save(self):
        """Placeholder: persistence is not implemented."""
        pass

    def load(self):
        """Placeholder: loading is not implemented."""
        pass
示例#3
0
def chunk(data, mode, classificationS, batch_size=50, pause=2):
    '''
    Train or test a Naive Bayes classifier on ``data`` in fixed-size batches.

    Data is fed in slowly, one batch at a time with a pause in between, to
    prevent memory errors.

    Parameters:
        data:
            Type: sliceable sequence supporting ``len()``
            Labelled examples (tweets and party information).
        mode:
            Type: String "train" or String "test"
            Determines whether we are training a classifier or testing the
            accuracy of an existing one.
        classificationS:
            Type: classifier or None
            Classifier to evaluate in "test" mode. Ignored in "train" mode,
            which always builds a new classifier from the first batch.
        batch_size:
            Type: int
            Number of examples handed over per step (default 50).
        pause:
            Type: int or float
            Seconds to sleep after each update/accuracy step (default 2).
    Returns:
        "train": the trained classifier.
        "test": list with one accuracy value per batch.
        any other mode: None.
    '''
    length = len(data)

    if mode == "train":
        classifier = NaiveBayesClassifier(data[0:batch_size])
        # range() stops before ``length``, so no trailing empty batch is sent
        # when the data size is an exact multiple of the batch size.
        for start in range(batch_size, length, batch_size):
            classifier.update(data[start:start + batch_size])
            time.sleep(pause)
        return classifier

    elif mode == 'test':
        classifier = classificationS
        listOfAccs = []
        for start in range(0, length, batch_size):
            # Slicing clips at the end of the data, so the last batch may be
            # shorter than batch_size but is never empty.
            listOfAccs.append(classifier.accuracy(data[start:start + batch_size]))
            time.sleep(pause)
        return listOfAccs

    return None
class TestValidators(TestCase):
    """Tests for ClassifierValidator against a small in-memory classifier."""

    def setUp(self):
        # Classifier seeded from an empty JSON document, then taught one
        # 'spam' and one 'valid' example.
        self.data = StringIO('{}')
        self.classifier = NaiveBayesClassifier(self.data, format='json')
        self.classifier.update([
            ('spam spam spam', 'spam'),
            ('this is not spam', 'valid'),
        ])

        # Make ClassifierValidator.get_classifier hand back the classifier
        # built above.
        self.mock_classifier_get = mock.patch.object(
            ClassifierValidator,
            'get_classifier',
            mock.Mock(return_value=self.classifier)
        )
        # Without a cleanup the started patcher stays active after the test
        # and leaks into later tests. stopall() is used rather than the
        # patcher's own stop() because test_open_file_failure already stops
        # this patcher itself.
        self.addCleanup(mock.patch.stopall)
        self.patch_classifier_get = self.mock_classifier_get.start()

    def test_validator_pass(self):
        validate = ClassifierValidator()
        validate('this is totally legit')

    def test_validator_invalid(self):
        validate = ClassifierValidator()
        with self.assertRaises(ValidationError):
            validate('spam spammy spam')

    def test_validator_invalid_different_exception(self):
        validate = ClassifierValidator(raises=ValueError)
        with self.assertRaises(ValueError):
            validate('spam spammy spam')

    @mock.patch('textclassifier.classifier.TEXTCLASSIFIER_DATA_FILE', '')
    def test_open_file_failure(self):
        """Open file, but still validate after errors"""
        # This test needs the real get_classifier, so drop the setUp patch.
        self.mock_classifier_get.stop()
        # The built-in module is named differently on Python 2 and Python 3;
        # the boolean indexes the tuple (False -> 0, True -> 1).
        mod_name = ('builtins', '__builtin__')[(sys.version_info < (3,0))]
        with mock.patch('{0}.open'.format(mod_name)) as mocked_open:
            mocked_open.side_effect = IOError
            with self.assertRaises(IOError):
                DefaultClassifier()
            validate = ClassifierValidator()
            validate('spam spam spam')
示例#5
0
class TBSentiment(Model):
    """Sentiment model wrapping the TextBlob Naive Bayes classifier.

    Trains and classifies using the standardized data format of
    ``(comment, polarity)`` tuples. Note that ``Model.__init__`` is not
    invoked by this class.
    """

    def __init__(self):
        # Begin with a classifier that has seen no training data.
        self.cl = NaiveBayesClassifier([])

    def classify(self, comment):
        """Classify a single comment.

        Args:
            comment: Text to classify.

        Returns:
            :obj:`tuple`: ``(label, probability)`` for the most probable label.
        """
        distribution = self.cl.prob_classify(comment)
        best_label = distribution.max()
        return best_label, distribution.prob(best_label)

    def train(self, data, eval=None, d_print=False):
        """Update the classifier with custom data.

        Args:
            data (:obj:`list` of :obj:`tuple`): Tuples of
                (comment, polarity in ["pos", "neg"]).
            eval: Accepted but not used by this implementation.
            d_print: Accepted but not used by this implementation.
        """
        self.cl.update(data)

    def test(self, data):
        """Placeholder for evaluation on custom data.

        Args:
            data (:obj:`list` of :obj:`tuple`): Tuples of
                (comment, polarity in ["pos", "neg"]).

        Returns:
            ``None``: evaluation is not implemented yet.
        """
        return None
示例#6
0
class TestNaiveBayesClassifier(unittest.TestCase):
    """Checks classification, updating, feature extraction and file loading
    of NaiveBayesClassifier."""

    def setUp(self):
        # Each test starts from a classifier trained on the shared train_set.
        self.classifier = NaiveBayesClassifier(train_set)

    def _assert_trained_from_file(self, cl):
        # Shared checks for classifiers built from a data file: the sample
        # sentence is labelled 'pos' and training text is stored as unicode.
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        first_sentence = cl.train_set[0][0]
        assert_true(isinstance(first_sentence, unicode))

    def test_default_extractor(self):
        sample = "I feel happy this morning."
        expected = basic_extractor(sample, train_set)
        assert_equal(self.classifier.extract_features(sample), expected)

    def test_classify(self):
        label = self.classifier.classify("I feel happy this morning")
        assert_equal(label, 'positive')
        assert_equal(len(self.classifier.train_set), len(train_set))

    def test_classify_a_list_of_words(self):
        words = ["I", "feel", "happy", "this", "morning"]
        assert_equal(self.classifier.classify(words), "positive")

    def test_train_from_lists_of_words(self):
        # A classifier can be trained on lists of words instead of strings.
        tokenized = [(doc.split(), label) for doc, label in train_set]
        from_tokens = NaiveBayesClassifier(tokenized)
        assert_equal(from_tokens.accuracy(test_set),
                     self.classifier.accuracy(test_set))

    def test_prob_classify(self):
        dist = self.classifier.prob_classify("I feel happy this morning")
        assert_equal(dist.max(), "positive")
        assert_true(dist.prob("positive") > dist.prob("negative"))

    def test_accuracy(self):
        score = self.classifier.accuracy(test_set)
        assert_true(isinstance(score, float))

    def test_update(self):
        # Snapshot state before the update ...
        before = self.classifier.prob_classify("lorem ipsum")
        size_before = len(self.classifier.train_set)
        self.classifier.update([("lorem ipsum", "positive")])
        # ... and compare with the state afterwards.
        size_after = len(self.classifier.train_set)
        after = self.classifier.prob_classify("lorem ipsum")
        assert_true(after.prob("positive") > before.prob("positive"))
        assert_equal(size_before + 1, size_after)

    def test_labels(self):
        known = self.classifier.labels()
        assert_true("positive" in known)
        assert_true("negative" in known)

    def test_show_informative_features(self):
        # Only checks that the call completes without raising.
        self.classifier.show_informative_features()

    def test_informative_features(self):
        top = self.classifier.informative_features(3)
        assert_true(isinstance(top, list))
        assert_true(isinstance(top[0], tuple))

    def test_custom_feature_extractor(self):
        custom = NaiveBayesClassifier(train_set, custom_extractor)
        custom.classify("Yay! I'm so happy it works.")
        assert_equal(custom.train_features[0][1], 'positive')

    def test_init_with_csv_file(self):
        self._assert_trained_from_file(
            NaiveBayesClassifier(CSV_FILE, format="csv"))

    def test_init_with_csv_file_without_format_specifier(self):
        self._assert_trained_from_file(NaiveBayesClassifier(CSV_FILE))

    def test_init_with_json_file(self):
        self._assert_trained_from_file(
            NaiveBayesClassifier(JSON_FILE, format="json"))

    def test_init_with_json_file_without_format_specifier(self):
        self._assert_trained_from_file(NaiveBayesClassifier(JSON_FILE))

    def test_accuracy_on_a_csv_file(self):
        score = self.classifier.accuracy(CSV_FILE)
        assert_true(isinstance(score, float))

    def test_accuracy_on_json_file(self):
        score = self.classifier.accuracy(JSON_FILE)
        assert_true(isinstance(score, float))

    def test_init_with_tsv_file(self):
        self._assert_trained_from_file(NaiveBayesClassifier(TSV_FILE))

    def test_init_with_bad_format_specifier(self):
        assert_raises(ValueError,
            lambda: NaiveBayesClassifier(CSV_FILE, format='unknown'))

    def test_repr(self):
        expected = "<NaiveBayesClassifier trained on {0} instances>".format(
            len(train_set))
        assert_equal(repr(self.classifier), expected)
# Classify a whole TextBlob with the classifier `cl` (defined earlier in the
# script), then classify each of its sentences separately.
blob = TextBlob("The beer is good. But the hangover is horrible.",
                classifier=cl)
print(blob.classify())
for s in blob.sentences:
    print(s)
    print(s.classify())
# Evaluating classifiers: accuracy on the `test` data defined earlier.
print(cl.accuracy(test))
print(cl.show_informative_features(
    5))  # display a listing of the 5 most informative features
# Updating the classifier with new data, then re-checking its accuracy.
new_data = [('She is my best friend.', 'pos'),
            ("I'm happy to have a new friend.", 'pos'),
            ("Stay thirsty, my friend.", 'pos'),
            ("He ain't from around here.", 'neg')]
print(cl.update(new_data))
print(cl.accuracy(test))


# feature extractors
# creating a feature extractor that just uses the first and last words of a document as its features
def end_word_extractor(document):
    """Build a feature dict from the first and last words of ``document``.

    Args:
        document: Text that is split on whitespace.

    Returns:
        ``{"first(<word>)": True, "last(<word>)": False}`` for the first and
        last token, or an empty dict when the document contains no tokens.
    """
    tokens = document.split()
    if not tokens:
        # Empty or whitespace-only text has no first/last word to index.
        return {}
    first_word, last_word = tokens[0], tokens[-1]
    return {
        "first({0})".format(first_word): True,
        "last({0})".format(last_word): False,
    }


# Example call: extract the first/last-word features of a short sentence.
features = end_word_extractor("I feel happy")
示例#8
0
class LogicProc:
    """Filters incoming Twitter messages with a Bayesian classifier.

    Messages classified as good are posted to a Slack channel; every message
    is stored in the database together with its classification. Reactions
    read from Slack are fed back into the classifier as training data.

    Single-argument ``print(...)`` calls are used throughout because they
    produce the same output under the Python 2 print statement and the
    Python 3 print function.
    """

    def __init__(self, preclassified_file, channel, slack_token):
        """Train the classifier, connect to Slack and the DB, start the timers.

        @param preclassified_file  Path of a CSV file with pre-classified training data
        @param channel             Slack channel name, resolved to an id via the Slack client
        @param slack_token         Token handed to the Slack interface
        """
        if not os.path.isfile(preclassified_file):
            # Only a warning: the open() below still raises for a missing file.
            print('"' + preclassified_file + '" does not exist!')

        with open(preclassified_file, 'r') as train_set:
            print('training from ' + preclassified_file)
            self.spam_classifier = NaiveBayesClassifier(train_set, format="csv")

        self.slack_client = slack_interface.SlackInterface(slack_token)
        self.message_queue = []
        self.last_message_ts = None
        self.channel = self.slack_client.get_channel_id(channel)
        if self.channel is None:
            print('Could not find channel ' + channel)
        self.db_interface = database_interface.DB()

        # Refine the file-based classifier with stored and Slack feedback.
        training = self.db_interface.get_training_data()
        self.spam_classifier.update(training)
        self.update_classifer_from_slack(self.channel)

        self.spam_classifier.show_informative_features()

        # NOTE(review): presumably InfiniteTimer invokes the callback
        # repeatedly at the given interval in seconds -- confirm.
        self.check_twitter_msgs = infinite_timer.InfiniteTimer(5.0, self.proc_messages)
        self.check_slack_msgs = infinite_timer.InfiniteTimer(60.0, self.update_classifer_from_slack, self.channel)
        self.check_twitter_msgs.start()
        self.check_slack_msgs.start()

    def add_new_message(self, msg, source):
        """
        Callback from Twitter when there is a new message
        @param msg     The Twitter message, with all its attributes
        @param source  Where the message came from.  Right now should only be 'twitter'
        """
        self.message_queue.append({'source': source, 'message': msg})

    def proc_messages(self):
        """
        Drain the message queue: classify each Twitter message, post the good
        ones to Slack and store every one with its classification.
        Messages from other sources are dropped without processing.
        """
        # Work from the head until the queue is empty. Removing items from a
        # list while iterating over that same list skips every other entry.
        while self.message_queue:
            msg = self.message_queue[0]
            if msg['source'] == 'twitter':
                message = msg['message']
                if self.quality_filter(message.text):
                    print('GOOD: ' + message.text.encode('utf-8'))
                    self.post_to_slack(message, self.channel)
                    self.store_message(message.text, True)
                else:
                    print('BAD: ' + message.text.encode('utf-8'))
                    self.store_message(message.text, False)
            # Dequeue only after handling, so a message whose processing
            # raised stays queued, exactly as before.
            self.message_queue.pop(0)

    def run_loop(self):
        """
        Keeps the calling thread alive by sleeping forever in one-second
        steps. It does not process messages itself; proc_messages is
        registered with a timer in __init__.
        """
        while True:
            # sleep between polling queue
            time.sleep(1)

    def quality_filter(self, message_text):
        """
        Return False when the classifier labels the text 'neg', True otherwise.
        """
        # Planned but not implemented filters:
        # -filter useless hashtag announcements "Prayers for Irma! Use #IrmaSoS"
        # -filter outside the geobounds
        # -filter duplicates
        # -bayesian filter (the only one implemented)
        return self.spam_classifier.classify(message_text) != 'neg'

    def post_to_slack(self, msg, channel):
        """Post the text of msg to the given Slack channel."""
        self.slack_client.post_message(msg.text, channel)

    def update_classifer_from_slack(self, channel):
        """
        Read Slack reactions newer than the last seen timestamp and use them
        as 'pos'/'neg' training data for both the database and the classifier.
        @param channel  Slack channel id to read reactions from
        """
        slack_msgs = self.slack_client.get_slack_reactions(channel, self.last_message_ts)
        if len(slack_msgs) > 0:
            self.last_message_ts = slack_msgs[-1]['ts']
        bayesian_update_data = []
        for m in slack_msgs:
            user_feedback = self.is_slack_reaction_pos(m['reactions'])
            text = m['text']
            if user_feedback is None:
                # No thumbs-up/down reaction: nothing to learn from.
                continue
            bayesian_update_data.append((text, 'pos' if user_feedback else 'neg'))
        # update for better results if we can
        if len(bayesian_update_data) > 0:
            print('updating db...')
            # update classification in DB
            self.db_interface.update(bayesian_update_data)
            # update classifier
            print('updating classifier...')
            self.spam_classifier.update(bayesian_update_data)
            print('done...')
            self.spam_classifier.show_informative_features()

    def is_slack_reaction_pos(self, reactions):
        """
        Return True/False for the first '+1'/'-1' reaction found, or None
        when the reactions contain neither.
        """
        for t in reactions:
            name = t['name']
            if name == '-1':
                return False
            if name == '+1':
                return True
        return None

    def store_message(self, message, filter_classification, source='twitter'):
        """Store a message text with its classification and source in the DB."""
        self.db_interface.add(message, filter_classification, source)

    def bayesian_search(self, query):
        """Search for query and keep the results not flagged as spam."""
        # NOTE(review): neither self.api nor self.is_spam is defined anywhere
        # in this class, so this raises AttributeError unless they are
        # attached elsewhere -- confirm before use.
        results = self.api.search(query)
        filtered_results = [r for r in results if self.is_spam(r.text) == 0]
        return filtered_results
# Classifying text: call the classify(text) method to use the classifier
# `cl` (defined earlier in the script).
test_check = cl.classify("This is an amazing library!")
print test_check

# The label probability distribution is available through the
# prob_classify(text) method.

prob_dist = cl.prob_classify("This one's a doozy.")
print prob_dist.max()
print round(prob_dist.prob("pos"), 2)
print round(prob_dist.prob("neg"), 2)
print prob_dist.prob("pos")
print prob_dist.prob("neg")

# A TextBlob can be classified as a whole when it is given a classifier.
blob = TextBlob("The beer is good. But the hangover is horrible.", classifier=cl)
print blob.classify()


# Evaluating classifiers: use the accuracy(test_data) method to compute the
# accuracy on the test set.
print cl.accuracy(test)

# Updating classifiers with new data: use the update(new_data) method to add
# training data to an existing classifier.

new_data = [('She is my best friend.', 'pos'),
 			("I'm happy to have a new friend.", 'pos'),
 			('Stay thirsty, my friend.', 'pos'),
 			("He ain't from around here.", 'neg')]

# print new_data  (debugging aid, disabled)
print cl.update(new_data)
print cl.accuracy(test)
示例#10
0
class IntentClassifier:
    """
    This intent classifier is a Python interface that uses NaiveBayesClassifier from textblob.
    It trains on the json files in the local 'data' folder; each file holds an
    intent with a name, training phrases ('userSays') and desired responses.
    Test data is read from 'test/test.json', which pairs phrases with their
    intent. A trained classifier can be saved and loaded; both are implemented
    with pickle.
    """
    def __init__(self):
        """Constructor of the intent classifier; starts without any responses."""
        self._responses = {}

    def __load_data(self):
        """Load data from the local 'data' folder that contains json data files for training"""
        print("loading training data...")
        training_data = []
        for path in glob('data/*.json'):
            print("loading", path)
            with open(path) as data_file:
                training_data.append(json.load(data_file))
        return training_data

    def __load_test(self):
        """Load the test data from 'test/test.json' in the local 'test' folder"""
        print("loading testing data...")
        with open('test/test.json') as test_file:
            return json.load(test_file)

    def __build_responses(self, intents):
        """Create a dictionary mapping each intent name to its list of responses"""
        return {intent['name']: intent['responses'] for intent in intents}

    def __arrange_data(self, intents):
        """Convert a list of intents to a list of (utterance, intent name) tuples"""
        return [(utterance, intent['name']) for intent in intents
                for utterance in intent['userSays']]

    def __arrange_test(self, tests):
        """Convert the test json to a list of (phrase, intent) tuples"""
        return [(test['utterance'], test['intent']) for test in tests['tests']]

    def train(self, utterances=None):
        """
        Train a new Naive Bayes classifier.

        When no utterances are given, the json files of the local data folder
        are loaded and the dictionary of responses is replaced by the one
        built from them.

        Keyword Arguments:
        utterances -- list of (phrase, intent) tuples to train on (optional)
        """
        if not utterances:
            json_data = self.__load_data()
            self._responses = self.__build_responses(json_data)
            utterances = self.__arrange_data(json_data)
        self._cl = NaiveBayesClassifier(utterances)

    def update(self, utterances=None):
        """
        Update the existing classifier with additional training data.

        When no utterances are given, the json files of the local data folder
        are loaded and their responses are merged into the current dictionary
        of responses. Requires that train() has created the classifier.

        Keyword Arguments:
        utterances -- list of (phrase, intent) tuples to train on (optional)
        """
        if not utterances:
            json_data = self.__load_data()
            # Build the responses from the loaded data; `utterances` is empty
            # in this branch and would contribute nothing.
            self._responses = {
                **self._responses,
                **self.__build_responses(json_data)
            }
            utterances = self.__arrange_data(json_data)
        self._cl.update(utterances)

    def test(self):
        """Test the accuracy of the classifier"""
        data_set = self.__arrange_test(self.__load_test())
        return self._cl.accuracy(data_set)

    def classify(self, target):
        """Classify a text and return its intent label"""
        return self._cl.classify(target)

    def getProbability(self, target, intent):
        """Get the probability of a phrase belonging to an intent, rounded to 2 decimals"""
        guess = self._cl.prob_classify(target)
        return round(guess.prob(intent), 2)

    def response(self, target):
        """Get a random response according to the intent of the text"""
        responses = self._responses[self.classify(target)]
        return random.choice(responses)

    def addResponse(self, text, intent):
        """Add a response to an intent that already exists in the dictionary"""
        self._responses[intent].append(text)

    def addResponses(self, utterances):
        """Add a list of (response, intent) tuples to the responses"""
        for utterance in utterances:
            self._responses[utterance[1]].append(utterance[0])

    def save(self, path):
        """Save the current trained classifier by pickling it to path"""
        with open(path, "wb") as classifier_f:
            pickle.dump(self, classifier_f)

    @staticmethod
    def load(path):
        """Load a classifier previously written by save().

        Declared static so that it works both as IntentClassifier.load(path)
        and when called on an instance.
        """
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # load files that come from a trusted source.
        with open(path, "rb") as classifier_f:
            classifier = pickle.load(classifier_f)
        return classifier
示例#11
0
class BankClassify():
    """Interactive categoriser for bank transactions backed by a Naive Bayes classifier."""

    def __init__(self, data="AllData.csv"):
        """Load in the previous data (by default from AllData.csv) and initialise the classifier"""
        if os.path.exists(data):
            self.prev_data = pd.read_csv(data)
        else:
            self.prev_data = pd.DataFrame(columns=['date', 'desc', 'amount', 'cat'])

        self.classifier = NaiveBayesClassifier(self._get_training(self.prev_data), self._extractor)

    def add_data(self, filename):
        """Add new data and interactively classify it.

        Arguments:
         - filename: filename of Santander-format file
        """
        self.new_data = self._read_santander_file(filename)

        self._ask_with_guess(self.new_data)

        self.prev_data = pd.concat([self.prev_data, self.new_data])
        # NOTE(review): always writes to "AllData.csv", even when a different
        # `data` path was given to __init__ -- confirm this is intended.
        self.prev_data.to_csv("AllData.csv", index=False)

    def _prep_for_analysis(self):
        """Prepare data for analysis in pandas, setting index types and subsetting"""
        self.prev_data = self._make_date_index(self.prev_data)

        self.prev_data['cat'] = self.prev_data['cat'].str.strip()

        self.inc = self.prev_data[self.prev_data.amount > 0]
        # Copy before modifying so that abs() is not assigned onto a slice of
        # prev_data (pandas chained-assignment problem).
        self.out = self.prev_data[self.prev_data.amount < 0].copy()
        self.out['amount'] = self.out['amount'].abs()

        self.inc_noignore = self.inc[self.inc.cat != 'Ignore']
        self.inc_noexpignore = self.inc[(self.inc.cat != 'Ignore') & (self.inc.cat != 'Expenses')]

        self.out_noignore = self.out[self.out.cat != 'Ignore']
        self.out_noexpignore = self.out[(self.out.cat != 'Ignore') & (self.out.cat != 'Expenses')]

    def _read_categories(self):
        """Read list of categories from categories.txt

        Returns a dict mapping the line number (starting at 0) to the
        stripped category name on that line."""
        with open('categories.txt') as f:
            return {i: line.strip() for i, line in enumerate(f)}

    def _add_new_category(self, category):
        """Add a new category to categories.txt"""
        with open('categories.txt', 'a') as f:
            f.write('\n' + category)

    def _ask_with_guess(self, df):
        """Interactively guess categories for each transaction in df, asking each time if the guess
        is correct.

        Blank input accepts the guess, a number selects an existing category,
        any other text creates a new category and 'q' stops early.
        Returns df with its 'cat' column filled in for the answered rows."""
        # Initialise colorama
        init()

        df['cat'] = ""

        categories = self._read_categories()

        for index, row in df.iterrows():

            # Generate the category numbers table from the list of categories
            # (rebuilt per row because new categories may have been added)
            cats_list = [[idnum, cat] for idnum, cat in categories.items()]
            cats_table = tabulate(cats_list)

            stripped_text = self._strip_numbers(row['desc'])

            # Guess a category using the classifier (only if there is data in the classifier)
            if len(self.classifier.train_set) > 1:
                guess = self.classifier.classify(stripped_text)
            else:
                guess = ""

            # Clear the screen (ANSI escape) and print the list of categories
            print(chr(27) + "[2J")
            print(cats_table)
            print("\n\n")
            # Print transaction
            print("On: %s\t %.2f\n%s" % (row['date'], row['amount'], row['desc']))
            print(Fore.RED  + Style.BRIGHT + "My guess is: " + str(guess) + Fore.RESET)

            input_value = input("> ")

            if input_value.lower() == 'q':
                # If the input was 'q' then quit
                return df
            if input_value == "":
                # If the input was blank then our guess was right!
                category = guess
            else:
                # Otherwise, our guess was wrong
                try:
                    # Try converting the input to an integer category number
                    # If it works then we've entered a category
                    # NOTE(review): a number that is not a key of `categories`
                    # raises KeyError, which is not handled here.
                    category_number = int(input_value)
                    category = categories[category_number]
                except ValueError:
                    # Otherwise, we've entered a new category, so add it to the list of
                    # categories
                    category = input_value
                    self._add_new_category(category)
                    categories = self._read_categories()

            # Write the answer; .loc replaces .ix, which was removed from pandas
            df.loc[index, 'cat'] = category
            # Update classifier
            self.classifier.update([(stripped_text, category)])

        return df

    def _make_date_index(self, df):
        """Make the index of df a Datetime index (dates are parsed day-first)"""
        df.index = pd.DatetimeIndex(df.date.apply(dateutil.parser.parse, dayfirst=True))

        return df

    def _read_santander_file(self, filename):
        """Read a file in the plain text format that Santander provides downloads in.

        Returns a pd.DataFrame with columns of 'date', 'desc' and 'amount'."""
        with open(filename, errors='replace') as f:
            lines = f.readlines()

        dates = []
        descs = []
        amounts = []

        # The first four lines are skipped
        for line in lines[4:]:

            # Drop every non-ASCII character
            line = "".join(i for i in line if ord(i) < 128)
            if line.strip() == '':
                continue

            # Lines look like "<field>:<value>"; the value may itself contain ':'
            splitted = line.split(":")

            category = splitted[0]
            data = ":".join(splitted[1:])

            if category == 'Date':
                dates.append(data.strip())
            elif category == 'Description':
                descs.append(data.strip())
            elif category == 'Amount':
                # Keep digits, '.' and '-' only (raw string: same pattern,
                # without the invalid-escape warning)
                just_numbers = re.sub(r"[^0-9\.-]", "", data)
                amounts.append(just_numbers.strip())

        df = pd.DataFrame({'date': dates, 'desc': descs, 'amount': amounts})

        df['amount'] = df.amount.astype(float)
        df['desc'] = df.desc.astype(str)
        df['date'] = df.date.astype(str)

        return df

    def _get_training(self, df):
        """Get training data for the classifier, consisting of tuples of
        (text, category), for every row of df whose 'cat' is not empty"""
        subset = df[df['cat'] != '']
        # iterrows() replaces the removed .ix lookup and also copes with
        # duplicate index labels.
        return [
            (self._strip_numbers(row['desc']), row['cat'])
            for _, row in subset.iterrows()
        ]

    def _extractor(self, doc):
        """Extract tokens from a given string

        Returns a dict mapping every non-empty token to True."""
        # TODO: Extend to extract words within words
        # For example, MUSICROOM should give MUSIC and ROOM
        tokens = self._split_by_multiple_delims(doc, [' ', '/'])

        return {token: True for token in tokens if token != ""}

    def _strip_numbers(self, s):
        """Strip everything except the capital letters A-Z and spaces from the given string"""
        return re.sub("[^A-Z ]", "", s)

    def _split_by_multiple_delims(self, string, delims):
        """Split the given string by the list of delimiters given"""
        # Escape each delimiter so characters such as '.' or '|' are matched
        # literally instead of being read as regex syntax.
        regexp = "|".join(re.escape(delim) for delim in delims)

        return re.split(regexp, string)
        ('Gary is a friend of mine.', 'pos'),
        ("I can't believe I'm doing this.", 'neg')]

from textblob.classifiers import NaiveBayesClassifier
# Train a classifier on the `train` data defined earlier in the script.
cl = NaiveBayesClassifier(train)

# Classify a single piece of text.
print(cl.classify("This is amazing!"))

# Probability distribution over the labels for a piece of text.
prob_dist = cl.prob_classify("This one's a doozy.")
prob_dist.max()

round(prob_dist.prob("pos"), 2)
round(prob_dist.prob("neg"), 2)

# Classify a TextBlob that was given the classifier.
blob = TextBlob("I have good spelling!", classifier=cl)
blob.classify()

# Evaluate on the `test` data and show the 5 most informative features.
cl.accuracy(test)
cl.show_informative_features(5)

# Additional labelled examples used to update the classifier.
new_data = [('She is my best friend.', 'pos'),
            ("I'm happy to have a new friend.", 'pos'),
            ("Stay thirsty, my friend.", 'pos'),
            ("He ain't from around here.", 'neg')]

# Update the classifier, then re-evaluate it.
cl.update(new_data)
cl.accuracy(test)
示例#13
0
]
# Held-out (text, label) examples.
test = [
    ('The beer was good.', 'pos'),
    ('I do not enjoy my job', 'neg'),
    ("I ain't feeling dandy today.", 'neg'),
    ("I feel amazing!", 'pos'),
    ('Gary is a friend of mine.', 'pos'),
    ("I can't believe I'm doing this.", 'neg')
]
# Train on the `train` list defined earlier in the script.
cl = NaiveBayesClassifier(train)
# Build (word list, category) pairs for every file of the movie_reviews corpus.
reviews = [(list(movie_reviews.words(fileid)), category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]
random.shuffle(reviews)
# NOTE(review): index 100 falls in neither slice ([0:100] ends at 99 and
# [101:200] starts at 101).
new_train, new_test = reviews[0:100], reviews[101:200]
# Update the classifier with the additional training data.
cl.update(new_train)
@app.route('/', methods=['POST'])
def home():
    """Classify the ``text`` field of the JSON request body.

    Returns:
        The label produced by the classifier, or a 400 response when the body
        is not valid JSON or has no ``text`` field.
    """
    try:
        text = json.loads(request.data)['text']
    except (ValueError, KeyError, TypeError):
        # ValueError: body is not JSON; KeyError: no 'text' field;
        # TypeError: the JSON payload is not an object.
        return "Request body must be a JSON object with a 'text' field.", 400
    return cl.classify(text)

@app.route("/HEALTH")
def health():
    """Health-check route: always answers with the literal body "HEALTH"."""
    status_body = "HEALTH"
    return status_body

# Start the web application with its default settings when this script runs.
app.run()


示例#14
0
    ("I can't deal with this", 'neg'),
    ('He is my sworn enemy!', 'neg'),
    ('My boss is horrible.', 'neg')
]
# Held-out (text, label) examples used for the accuracy check below.
test = [
    ('The beer was good.', 'pos'),
    ('I do not enjoy my job', 'neg'),
    ("I ain't feeling dandy today.", 'neg'),
    ("I feel amazing!", 'pos'),
    ('Gary is a friend of mine.', 'pos'),
    ("I can't believe I'm doing this.", 'neg')
]

# Train on the `train` list defined earlier in the script.
cl = NaiveBayesClassifier(train)

# Grab some movie review data as (word list, category) pairs
reviews = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]
random.shuffle(reviews)
# NOTE(review): index 100 falls in neither slice ([0:100] ends at 99 and
# [101:200] starts at 101).
new_train, new_test = reviews[0:100], reviews[101:200]

# Update the classifier with the new training data
cl.update(new_train)

# Compute accuracy on the hand-written examples plus the held-out reviews
accuracy = cl.accuracy(test + new_test)
print("Accuracy: {0}".format(accuracy))

# Show 5 most informative features
cl.show_informative_features(5)
示例#15
0
class BankClassify():
    """Incrementally trained Naive Bayes categoriser for transaction descriptions.

    The instance keeps three pieces of state:

    * ``training_set`` -- every (description, category) pair seen so far,
    * ``classifier``   -- a ``NaiveBayesClassifier`` using :meth:`extractor`
      as its feature extractor,
    * ``accuracy``     -- the accuracy computed by the most recent update.
    """

    def __init__(self):
        """Start with an empty training set and an untrained classifier.

        No data is loaded here; use :meth:`read_bank_file` or
        :meth:`update_training_set` to feed examples.
        """
        self.training_set = []
        self.accuracy = 0
        self.classifier = NaiveBayesClassifier(self.training_set, self.extractor)

    def get_accuracy(self):
        """Return the accuracy stored by the most recent update."""
        return self.accuracy

    def category_classify(self, item):
        """Guess a category for ``item`` and feed that guess back as training data.

        The description is lower-cased before use. When the classifier holds
        at most one training example the guess is the empty string. In every
        case the (description, guess) pair is appended to the training set --
        including an empty guess -- and ``accuracy`` is recomputed over the
        whole training set.

        Returns the guessed category.
        """
        description = item.lower()
        # Guess a category using the classifier (only if there is data in the classifier)
        if len(self.classifier.train_set) > 1:
            guess = self.classifier.classify(description)
        else:
            guess = ""
        new_entry = [(description, guess)]
        self.classifier.update(new_entry)
        self.training_set = self.training_set + new_entry
        self.accuracy = self.classifier.accuracy(self.training_set)
        return guess

    def read_bank_file(self, filename):
        """Train on the rows of a CSV file, skipping its first (header) line.

        Each remaining row is converted to a tuple and used as one training
        example; presumably rows are (description, category) -- verify
        against the files actually supplied. ``accuracy`` is recomputed over
        the rows just read.

        Always returns ``True``.
        """
        with open(filename, 'r') as csvfile:
            reader = csv.reader(csvfile)
            next(reader, None)  # skip the header row, tolerating an empty file
            train = [tuple(line) for line in reader]
            self.training_set = self.training_set + train
            self.classifier.update(train)
            self.accuracy = self.classifier.accuracy(train)

        return True

    def update_training_set(self, new_data):
        """Add (text, category) pairs to the training data.

        Each text is passed through :meth:`strip_numbers` and lower-cased
        before being stored. ``accuracy`` is recomputed over the newly added
        pairs, or reset to 0 when ``new_data`` is empty.

        Returns the complete training set.
        """
        if len(new_data) > 0:
            train = [(self.strip_numbers(row[0]).lower(), row[1])
                     for row in new_data]
            self.training_set = self.training_set + train
            self.classifier.update(train)
            self.accuracy = self.classifier.accuracy(train)
        else:
            self.accuracy = 0
        return self.training_set

    def extractor(self, doc):
        """Return ``{token: True}`` for every non-empty token of ``doc``.

        Tokens are obtained by splitting on spaces and forward slashes.
        """
        # TODO: Extend to extract words within words
        # For example, MUSICROOM should give MUSIC and ROOM
        tokens = self.split_by_multiple_delims(doc, [' ', '/'])
        return {token: True for token in tokens if token != ""}

    def strip_numbers(self, s):
        """Remove every character that is not an upper-case A-Z letter or a space.

        NOTE(review): besides digits this also drops punctuation and
        lower-case letters, so a lower-case description comes back almost
        empty. Kept as-is because stored training data depends on it --
        confirm whether inputs are always upper-case.
        """
        return re.sub("[^A-Z ]", "", s)

    def split_by_multiple_delims(self, string, delims):
        """Split ``string`` on any of the literal delimiters in ``delims``.

        Delimiters are escaped before being joined into a pattern, so
        characters such as ``.``, ``|`` or ``*`` are matched literally
        rather than interpreted as regular-expression syntax. With no
        delimiters the string is returned unsplit as a one-element list.
        """
        if not delims:
            return [string]
        regexp = "|".join(re.escape(delim) for delim in delims)
        return re.split(regexp, string)
示例#16
0
class TestNaiveBayesClassifier(unittest.TestCase):
    """Tests for NaiveBayesClassifier: training inputs, classification,
    incremental updates, and loading data from CSV/JSON/TSV/custom formats."""

    def setUp(self):
        # A new classifier is built for every test method.
        self.classifier = NaiveBayesClassifier(train_set)

    def test_default_extractor(self):
        text = "I feel happy this morning."
        assert_equal(self.classifier.extract_features(text), basic_extractor(text, train_set))

    def test_classify(self):
        res = self.classifier.classify("I feel happy this morning")
        assert_equal(res, 'positive')
        # Classifying must not change the size of the training set.
        assert_equal(len(self.classifier.train_set), len(train_set))

    def test_classify_a_list_of_words(self):
        res = self.classifier.classify(["I", "feel", "happy", "this", "morning"])
        assert_equal(res, "positive")

    def test_train_from_lists_of_words(self):
        # classifier can be trained on lists of words instead of strings
        train = [(doc.split(), label) for doc, label in train_set]
        classifier = NaiveBayesClassifier(train)
        assert_equal(classifier.accuracy(test_set),
                        self.classifier.accuracy(test_set))

    def test_prob_classify(self):
        res = self.classifier.prob_classify("I feel happy this morning")
        assert_equal(res.max(), "positive")
        assert_true(res.prob("positive") > res.prob("negative"))

    def test_accuracy(self):
        acc = self.classifier.accuracy(test_set)
        assert_true(isinstance(acc, float))

    def test_update(self):
        # Adding one "positive" example must grow the training set by one and
        # raise the positive probability of that same text.
        res1 = self.classifier.prob_classify("lorem ipsum")
        original_length = len(self.classifier.train_set)
        self.classifier.update([("lorem ipsum", "positive")])
        new_length = len(self.classifier.train_set)
        res2 = self.classifier.prob_classify("lorem ipsum")
        assert_true(res2.prob("positive") > res1.prob("positive"))
        assert_equal(original_length + 1, new_length)

    def test_labels(self):
        labels = self.classifier.labels()
        assert_true("positive" in labels)
        assert_true("negative" in labels)

    def test_show_informative_features(self):
        # Smoke test: only checks that the call does not raise.
        self.classifier.show_informative_features()

    def test_informative_features(self):
        feats = self.classifier.informative_features(3)
        assert_true(isinstance(feats, list))
        assert_true(isinstance(feats[0], tuple))

    def test_custom_feature_extractor(self):
        cl = NaiveBayesClassifier(train_set, custom_extractor)
        cl.classify("Yay! I'm so happy it works.")
        assert_equal(cl.train_features[0][1], 'positive')

    def test_init_with_csv_file(self):
        with open(CSV_FILE) as fp:
            cl = NaiveBayesClassifier(fp, format="csv")
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_csv_file_without_format_specifier(self):
        with open(CSV_FILE) as fp:
            cl = NaiveBayesClassifier(fp)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_json_file(self):
        with open(JSON_FILE) as fp:
            cl = NaiveBayesClassifier(fp, format="json")
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_json_file_without_format_specifier(self):
        with open(JSON_FILE) as fp:
            cl = NaiveBayesClassifier(fp)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_custom_format(self):
        redis_train = [('I like turtles', 'pos'), ('I hate turtles', 'neg')]

        # Minimal format class: accepts any stream and yields the fixed
        # training pairs above.
        class MockRedisFormat(formats.BaseFormat):
            def __init__(self, client, port):
                self.client = client
                self.port = port

            @classmethod
            def detect(cls, stream):
                return True

            def to_iterable(self):
                return redis_train

        formats.register('redis', MockRedisFormat)
        mock_redis = mock.Mock()
        cl = NaiveBayesClassifier(mock_redis, format='redis', port=1234)
        assert_equal(cl.train_set, redis_train)

    def test_data_with_no_available_format(self):
        mock_fp = mock.Mock()
        mock_fp.read.return_value = ''

        assert_raises(FormatError, lambda: NaiveBayesClassifier(mock_fp))

    def test_accuracy_on_a_csv_file(self):
        with open(CSV_FILE) as fp:
            a = self.classifier.accuracy(fp)
        assert_equal(type(a), float)

    def test_accuracy_on_json_file(self):
        # Must open the JSON fixture; opening CSV_FILE here would merely
        # repeat test_accuracy_on_a_csv_file.
        with open(JSON_FILE) as fp:
            a = self.classifier.accuracy(fp)
        assert_equal(type(a), float)

    def test_init_with_tsv_file(self):
        with open(TSV_FILE) as fp:
            cl = NaiveBayesClassifier(fp)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_bad_format_specifier(self):
        assert_raises(ValueError,
            lambda: NaiveBayesClassifier(CSV_FILE, format='unknown'))

    def test_repr(self):
        assert_equal(repr(self.classifier),
            "<NaiveBayesClassifier trained on {0} instances>".format(len(train_set)))
    print("Made new classifier")
# Drop the reference to the full data set; it is not used again below.
del full_data

# Feed the classifier in windows of `feeding_size` items, starting at item
# 11000 of `training_data`.
feeding_size = 1000
left_splice = 11000
right_splice = feeding_size + left_splice

# Bookkeeping for the progress line: number of finished windows, wall-clock
# start of this run, and the summed duration of all update calls.
count = 0
new_start_time = time.time()
past_times = 0

# Keep advancing the window until its right edge reaches 1,500,000.
while right_splice < 1500000:
    loop_time = time.time()
    # Lazy view of items [left_splice, right_splice) of the training data.
    data = itertools.islice(training_data, left_splice, right_splice)
    try:
        classifier.update(data)
    except Exception:
        # On any failure, save the classifier as it stands and exit,
        # reporting the window that was being processed.
        print("Houston we got a problem")
        with open("sentimentclassifier.pickle", "wb") as sentiment:
            pickle.dump(classifier, sentiment)
        sys.exit("Yo it ended at {} and {}".format(left_splice, right_splice))
    past_times += time.time() - loop_time
    count += 1
    # Progress line; the leading '\r' written below returns the cursor to the
    # start of the line before printing it.
    string = "Left: {} Right: {}. Took {} seconds. Total Time Elapsed: {}. Average Time for each: {}. Count: {}."\
        .format(left_splice, right_splice, time.time()-loop_time, time.time() - new_start_time, past_times/count, count)
    sys.stdout.write('\r' + string)
    left_splice += feeding_size
    right_splice += feeding_size
    # Checkpoint after every window by overwriting the same pickle file.
    with open("sentimentclassifier.pickle", "wb") as sentiment:
        pickle.dump(classifier, sentiment)
        print("Done dumping cycle {}!".format(count))
示例#18
0
def final_utterance_appreciation_analysis(final_utterance):
	"""
	Report the percentage of final user utterances that express appreciation.

	Input: A list of final utterances by the user.
	Output: A string of the form "<percentage>% people express appreciation.".

	Algorithm:
	1. Build a training set and a validation set of utterances that were manually labelled
	   "appreciation" or "nonappreciation". The stated labelling criterion is the presence of
	   words of gratitude.
	2. Train a Naive Bayes classifier on the training set.
	3. If the classifier's accuracy on the validation set is greater than 90%, also train it on
	   the validation set and classify every utterance in final_utterance.
	4. Count the utterances classified as "appreciation" and those classified as anything else.
	5. Return the share of "appreciation" utterances as a percentage.

	If nothing was classified (final_utterance is empty, or the classifier did not pass the
	accuracy check in step 3) the reported percentage is 0.0 rather than a ZeroDivisionError.

	Background on Naive Bayes: the probability of a label given the features is computed under
	the 'naive' assumption that all features are independent, given the label:
	|                       P(label) * P(f1|label) * ... * P(fn|label)
	|  P(label|features) = --------------------------------------------
	|                                         P(features)
	"""

	classified_dict = {"appreciation": 0, "non-appreciation": 0}

	# NOTE(review): a few labels below look inconsistent with the stated gratitude criterion
	# (e.g. 'Thank you very much.' -> nonappreciation, 'Ok then leave from Beijing' -> appreciation).
	# They are left unchanged here -- confirm with whoever labelled the data.
	train = [
		('Very well. How about the price for the trip to Essen?', 'nonappreciation'),
		("I'd like to book the Cairo package. Thank you!", 'appreciation'),
		('oh heck yeah!! economy - I need the money', 'nonappreciation'),
		('Then I will take it!', 'nonappreciation'),
		('Awesome!!! Thanks!!!', 'appreciation'),
		('What??? :disappointed:', 'nonappreciation'),
		('Yes do that', 'nonappreciation'),
		('Thank you kindly!', 'appreciation'),
		('Ok, thank you for your time anyways', 'appreciation'),
		('thank you very much for your patience you are an absolute gem', 'appreciation'),
		('Thank you so much!', 'appreciation'),
		('Lots of swanky hotels to choose from! Well, based on length of trip, that one to SL sounds like a great deal. I think I wanna go ahead with booking that', 'nonappreciation'),
		('Uh huh', 'nonappreciation'),
		('Jerusalem to Kingston. I swear if I have to repeat myself again then I will sue', 'nonappreciation'),
		('Ok, thanks anyway', 'appreciation'),
		('Looking to go from San Francisco to MArseille. ', 'nonappreciation'),
		('Book me for September 18 to 22. Let me know if its more than 2800 because thats all I can afford', 'nonappreciation'),
		('duuuude. ah\nwhat about Ciudad Juarez', 'nonappreciation'),
		('Well what if I leave the 8th', 'nonappreciation'),
		('Ok :+1: we out', 'nonappreciation'),
		('Yes!!!!!', 'nonappreciation'),
		('ok fine lets do it, business class please', 'nonappreciation'),
		('WOE IS ME, FOR I HAVE NOT', 'nonappreciation'),
		('ah damn', 'nonappreciation'),
		('okay bye', 'nonappreciation'),
		('Yikes. Ok Buenos Aires it is\nBook it please\nBusiness class', 'nonappreciation'),
		('shit yassss we goin in. Book it for us, please.', 'nonappreciation'),
		('well, this is rather disappointing we cannot spend our family vacation near the airport. i wont be booking anything today in this case, goodbye', 'nonappreciation'),
		('Thanks! Very excited!', 'appreciation'),
		('NOT GOOD', 'nonappreciation'),
		("you're a lifesaver", "appreciation"),
		('ah. if i could book, i would book this one. well thanks for your time, ill come back next year and save my vacation days for a trip to San Diego.', "appreciation"),
		('Great, thanks a lot!', "appreciation"),
		("WHAT!?!?! Ugh, kill me now. Okkay fine. I'll look somewhere else.", "nonappreciation"),
		("I guess that sound okay, I'll take it", "nonappreciation"),
		("Ok, that's fine\nBook it", "nonappreciation"),
		('I like the sound of that one. Heart of the city would be better than near a mall.\nLets book business class in Buenos Aires.', "nonappreciation"),
		('cool bye', "nonappreciation"),
		("let's book :wink:", "nonappreciation"),
		('Done, booked! Thanks!', 'appreciation'),
		('Okay will consider it and get back to you, thanks!', 'appreciation'),
		('DOPE. book it', 'nonappreciation'),
		('Hmm. Okay well im just gonna take the information you gave me and discuss it with my wife before booking something she might not enjoy. Thanks for the help!', 'appreciation'),
		('Thanks! You were a great help!', 'appreciation'),
		('i said 2.5 wasnt good enough', 'nonappreciation'),
		('No thats the last straw, we are taking our business elsewhere', 'nonappreciation'),
		('Thanks :slightly_smiling_face:', 'appreciation'),
		('Hi Do you fly from Ulsan to London??', 'nonappreciation'),
		('Ok then leave from Beijing', 'appreciation'),
		('i need to get away from a little longer than that one. so lets book vancouver please and thanks', "appreciation"),
		("Let's book Valencia. Pleasure doing business with you.", "appreciation"),
		('Thank you bot.', "appreciation"),
		('No worries, thanks!', "appreciation"),
		("That sucks. I'll look somewhere else", "nonappreciation"),
		('I am giving you one last time to you your job. you better tread carefully here, my friend,\nCairo to Porto Alegre or I will raise hell', "nonappreciation"),
		('Bye. And thanks for nothing.', "nonappreciation"),
		("Yes, I'll take it. Thank you", "nonappreciation"),
		('no there are 7 of us', "nonappreciation"),
		('for 712.00 it sounds like a very nice deal I will book flight on August 26 for 6 days. Thank you for your help.', 'appreciation'),
		('3.5 it is then. lets book it', 'nonappreciation'),
		('but fine, book it', 'nonappreciation'),
		('no can do', "nonappreciation"),
		('Thank you very much.', "nonappreciation"),
		('gracias!', "appreciation"),
		("Perfect! I'll book it", "nonappreciation"),
		('Do you do flights leaving from Tel Aviv?', "nonappreciation"),
		('that seem good, i will book! Gracias!', "appreciation"),
		("No it's alright! thanks though!", "appreciation"),
		('okay well its crucial i get there from Fortaleza so I will call someone else', "nonappreciation"),
		('how is that possible', "nonappreciation"),
		('Well what about in Goiania.?', 'nonappreciation'),
		('ok no thats not good enough im going elsewhere', "nonappreciation"),
		('amazing! thanks!', "appreciation"),
		('Lets do Business class', "nonappreciation"),
		("Oh Okay well i'll look somewhere else. Thanks anyway.", "appreciation"),
		('you dont have any flights to birmingham yeah i find that pretty freakin hard to believe', "nonappreciation"),
		('This is HORRIBLE', "nonappreciation"),
		("yes, you're right.. thank you", "appreciation"),
		('ok thanks so much', "appreciation"),
		('what if i changed the dates. sept 2 and 23', "nonappreciation"),
		('Thank you, but I will go use another service that can better satisfy my escapist fantasies', "appreciation"),
		("I really want a spa. If you have nothing to offer with a spa, I'll shop around then.", 'nonappreciation'),
		('Oh dear, thats quite above our 3 thousand dollar budget.', 'nonappreciation'),
		('dope! thanks', 'appreciation'),
		('No worries! Bye!', 'nonappreciation'),
		('Ok Lets lock in San Diego', "nonappreciation"),
		("You're great", 'appreciation'),
		# Label typo fixed: it used to read 'nonappreciation)', which made the
		# classifier learn a spurious third label.
		('ok. book it out of Milan please', 'nonappreciation'),
		('ill go for Ciudad Juarez', "nonappreciation"),
		('Thank you wozbot!', "appreciation"),
		('yes please', "nonappreciation"),
		("Usually I wouldn't want to be caught dead in a 3.5 star hotel, but I'm short on time here. Get us on that trip, business class", "nonappreciation"),
		('GREAT Thanks!!!!!!!!', "appreciation"),
		("I think I'll stick to the 11 day package in Belem at Las Flores, seems like the best deal and it had a good user rating. Let's book that one.", "nonappreciation"),
		('thnx', "appreciation"),
		('no it HAS to be baltimore and it HAS to be perfect. thanks anyways', "appreciation"),
		("Perfect! I'll book it", "nonappreciation"),
		("That's it?", "nonappreciation"),
		('I shall take the 5 star package!', "nonappreciation"),
		('thank you so much', "appreciation"),
		('YOU ARE RUINING MY MARRIAGE', "nonappreciation"),
	]

	validation = [
		('Yes chief', "appreciation"),
		("Thanks! I'm sure it will be amazinggg", "appreciation"),
		("Weeeelllll this is a no brainer, I 'll just leave the next day and save a whole lotta money! Can you book this for me right away so I don't lose it?", "nonappreciation"),
		("Ok I'll book the package with 8 days in Pittsburgh from August 17th to the 24th. Thank you.", "appreciation"),
		('Thanks - will do', "appreciation"),
		('Killing it! thank', "appreciation"),
		('Thanks, you too', "appreciation"),
		('thank you wozbot :slightly_smiling_face: toodles', "appreciation"),
		('spectacular book please', "nonappreciation"),
		("Well, I reckon I'll just book this one.", "nonappreciation"),
		("yea so I've heard... send me to Paris then", 'nonappreciation'),
		('Fortaleza\n5 stars', "nonappreciation"),
		('I guess I can increase my budget by 1000', 'nonappreciation'),
		('ok see ya', "nonappreciation"),
		('leaving from anywhere??', "nonappreciation"),
		("That's it! Thank you so so much :):):)", "appreciation"),
		('Done. Book it.', "nonappreciation"),
		('Great, sounds perfect. Thank you.', "appreciation"),
		('Thats all i had my heart set on!!', "nonappreciation"),
		("That sounds like the better hotel. Can't be too cautious travelling by myself for the first time! I will book that deal in an economy class ticket, I'm not ready for business class YET, need to pass that bar exam!", "nonappreciation"),
		('Then I will take my search elsewhere', "nonappreciation"),
		('Ya thanks', "appreciation"),
		('Thank you, glad to be going back so soon', "appreciation"),
		('well okay I can always take the tram in to the city. I will book that one.', "nonappreciation"),
		('This is hopeless', "nonappreciation"),
		('Great, thank you. I will most certainly book my next vacation with you.', "appreciation"),
		('thank youuuu', "appreciation"),
		('Lock it down', "nonappreciation"),
		("Please help! My lovely parents have been married fof 20 years and they've never taken a trip together. I'm thinking of getting them out of town Sept 6 to 9\nyou got anything good for 2 adults leaving sao paulo, for under 2400?", "nonappreciation"),
		('we can also go to Kochi', "nonappreciation"),
		('no but we can stay for 9 days instead of 3', "nonappreciation"),
		('thanks you!', "appreciation"),
		('Just under budget. ok bye now', "nonappreciation"),
		('thankyou', "appreciation"),
		('can you tell me the price and nearby attractions?', "nonappreciation"),
		('1 adult', "nonappreciation"),
		('San Jose to Porto Alegre please. oh it needs to be between sept 18 to 22', "nonappreciation"),
		('Ok sold! please enter a booking for us', "nonappreciation"),
		('I can leave from Tel aviv and I want to go to San Jose with 7 adults for 2500', "nonappreciation"),
		('Well what about in Goiania.?', "nonappreciation"),
		('you are being unhelpful just answer yes or no, is it near a park or beach?', "nonappreciation"),
		('thak you', "appreciation"),
		('I shall take the 5 star package!', "nonappreciation"),
		('Okay but what if I leave from Naples instead. Can you get me to Manas from Naples?', "nonappreciation"),
		("I'm a woman! Try to find something 9000 or less if you can.", "nonappreciation"),
		("That's perfect.", "nonappreciation"),
		('ok. fine. I have a 4500 $ budjet and I will star as long as that money lasts. thx', "appreciation"),
		('sure fine flexible actually no i dont wanna go any more', "nonappreciation"),
		("No, unfortunately I can't. Guess I'll just take a staycation this time :disappointed: Thanks anyway", "appreciation"),
		(" I'll book this one. Thank you, friend!", "appreciation"),
		('No we can only go to Porto... or Porto. Thanks.', "appreciation"),
	]

	# Train the Naive Bayes classifier on the hand-labelled training set.
	cl = NaiveBayesClassifier(train)
	# Only use the classifier when it agrees with the hand labels on more than
	# 90% of the validation set.
	if cl.accuracy(validation) > 0.90:
		# Fold the validation examples into the model before classifying.
		cl.update(validation)

		for m in final_utterance:
			if cl.classify(m) == "appreciation":
				classified_dict["appreciation"] += 1
			else:
				classified_dict["non-appreciation"] += 1

	# Calculate the percentage of people expressing appreciation. With nothing
	# classified the total is zero, so report 0.0 instead of dividing by it.
	total = classified_dict["appreciation"] + classified_dict["non-appreciation"]
	if total:
		percentage = classified_dict["appreciation"] / float(total) * 100
	else:
		percentage = 0.0
	return "{}% people express appreciation.".format(percentage)
示例#19
0
class NLTKHashtagsClassifier(Classifier):
    """
    Classifies InstagramProfiles as blogger, brand or undecided.

    Currently is a PROTOTYPE.
    """

    # list of all available categories for categorization
    AVAILABLE_CATEGORIES = [
        'brand',
        'blogger',
        'undecided',
    ]

    # Class-level defaults; both are replaced per instance in __init__.
    classifier = None
    undecided_margin = None

    def __init__(self,
                 blogger_hashtags=None,
                 brand_hashtags=None,
                 undecided_margin=None):
        """
        Build the initial training set from the given hashtags and create the
        TextBlob NaiveBayesClassifier.
        Lists are not intended to contain unique hashtags.
        :param blogger_hashtags: list of lists of hashtags suitable for bloggers
            (None means no blogger examples)
        :param brand_hashtags: list of lists of hashtags suitable for brands
            (None means no brand examples)
        :param undecided_margin: probability margin when to consider classification result as undecided;
            stored on the instance but not yet used by classify_unit
        :return:
        """
        from textblob.classifiers import NaiveBayesClassifier

        # None instead of a mutable [] default, so no list object is shared
        # between calls.
        if blogger_hashtags is None:
            blogger_hashtags = []
        if brand_hashtags is None:
            brand_hashtags = []

        brand_label, blogger_label = self.AVAILABLE_CATEGORIES[0], self.AVAILABLE_CATEGORIES[1]
        initial_train = [(tags, blogger_label) for tags in blogger_hashtags]
        initial_train.extend((tags, brand_label) for tags in brand_hashtags)

        self.classifier = NaiveBayesClassifier(initial_train)
        # Keep the margin so the pending "undecided" logic has it available.
        self.undecided_margin = undecided_margin

    def classify_unit(self, source=None, **kwargs):
        """
        This method is the core of classification algorithm. It receives source data for classification (object, model,
        string, etc.) and returns a value of classification category for this object.
        For example, we use InstagramProfile as source data, and result could be either 'brand' or 'blogger'
        or 'undecided'.

        At present the result is whatever label the underlying classifier
        returns; the 'undecided' margin is not applied yet.
        """
        # TODO: add probability_margin logic here (compare the result of
        # self.classifier.prob_classify(source) against self.undecided_margin)
        return self.classifier.classify(source)

    def classify_queryset(self, source_queryset=None, **kwargs):
        """
        Helper method. Same as above but performs the whole queryset.
        Return queryset

        Not implemented for this classifier: always raises NotImplementedError.
        """
        # TODO: Think how to do it for this classifier.

        # NotImplementedError is the exception class; raising the
        # NotImplemented constant is itself a TypeError on Python 3.
        raise NotImplementedError

    def update_classifier(self, extra_data=None):
        """
        Feed additional training examples to the underlying classifier.

        :param extra_data: data passed unchanged to the classifier's update();
            nothing happens when it is None
        """
        if extra_data is not None:
            self.classifier.update(extra_data)
                
                #Print succes message
    print "> File opened successfully!"
                
    counter = 0
    for row in reader:
        selectTweets(row)
        counter += 1
                    
    print "> Wait a sec for the results..."
                    
    cl = NaiveBayesClassifier(trainTweets)
                
             
    print "> add another data set"
    cl.update(trainFeatures)  
    print "> finish combination"
    cl.show_informative_features(10)


    outputPos=open('positiveTweet.txt','a')
    outputNeg=open('negativeTweet.txt','a')
    dataset = str(raw_input("> Please enter a filename contains tweets: ")) 
    with open(dataset) as f:
         out = f.readlines()   
         for lines in out:
            tweetWords = []
            words = lines.split()
            for i in words:
                i = i.lower()
                i = i.strip('@#\'"?,.!')
示例#21
0
# Classify one sample sentence and print the predicted label
print cl.classify(u" احسن علاج هذا")

# Second check: print the most likely label for another sentence, followed by
# the probability assigned to each of the two labels
prob_dist = cl.prob_classify(u"ك يوم يا ظالم,")
print prob_dist.max()
print prob_dist.prob("positive")
print prob_dist.prob("negative")

# compute the accuracy on our test set (read from testing.csv)
print "accuracy on the test set:{} ".format(cl.accuracy("testing.csv", format="csv"))

# display a listing of the 5 most informative features.
cl.show_informative_features(5)

# additional labelled (text, label) examples to add to the model
new_data = [(u"كلام صحيح من شان هيك الدول اللي ما فيها بطالة والمجتمعات المفتوحة بتقل فيها المشاكل النفسية", 'positive'),
           (u"لا طبعا التقرب الى الله هو خير علاج للحالات النفسية", 'positive'),
           (u"تفائلوا بالخير تجدوه", 'positive'),
           (u"يا ترى الحكومه بدها تزيد دعم المواطن الي الله يكون في عونه", 'negative')]

# update the classifier with the new data
cl.update(new_data)

# re-measure accuracy on the same test file after the update
print "accuracy on the test set:{} ".format(cl.accuracy("testing.csv", format="csv"))



示例#22
0
class TestNaiveBayesClassifier(unittest.TestCase):
    """Unit tests for NaiveBayesClassifier.

    Relies on fixtures defined outside this class: ``train_set``,
    ``test_set``, ``custom_extractor`` and the ``CSV_FILE`` / ``JSON_FILE`` /
    ``TSV_FILE`` training-data paths.
    """

    def setUp(self):
        # A fresh classifier trained on the shared training set for every test.
        self.classifier = NaiveBayesClassifier(train_set)

    # With no extractor supplied, extract_features must give the same result
    # as basic_extractor applied with the training set.
    def test_default_extractor(self):
        text = "I feel happy this morning."
        assert_equal(self.classifier.extract_features(text),
                     basic_extractor(text, train_set))

    # Classifying must return the expected label and must not change the
    # size of the stored training set.
    def test_classify(self):
        res = self.classifier.classify("I feel happy this morning")
        assert_equal(res, 'positive')
        assert_equal(len(self.classifier.train_set), len(train_set))

    # A pre-tokenised document (list of words) is accepted by classify().
    def test_classify_a_list_of_words(self):
        res = self.classifier.classify(
            ["I", "feel", "happy", "this", "morning"])
        assert_equal(res, "positive")

    def test_train_from_lists_of_words(self):
        # classifier can be trained on lists of words instead of strings
        train = [(doc.split(), label) for doc, label in train_set]
        classifier = NaiveBayesClassifier(train)
        assert_equal(classifier.accuracy(test_set),
                     self.classifier.accuracy(test_set))

    # prob_classify() returns a distribution exposing max() and prob(label).
    def test_prob_classify(self):
        res = self.classifier.prob_classify("I feel happy this morning")
        assert_equal(res.max(), "positive")
        assert_true(res.prob("positive") > res.prob("negative"))

    # accuracy() on an in-memory test set returns a float.
    def test_accuracy(self):
        acc = self.classifier.accuracy(test_set)
        assert_true(isinstance(acc, float))

    # update() must grow the training set by one and raise the probability
    # of the label that was just taught for that text.
    def test_update(self):
        res1 = self.classifier.prob_classify("lorem ipsum")
        original_length = len(self.classifier.train_set)
        self.classifier.update([("lorem ipsum", "positive")])
        new_length = len(self.classifier.train_set)
        res2 = self.classifier.prob_classify("lorem ipsum")
        assert_true(res2.prob("positive") > res1.prob("positive"))
        assert_equal(original_length + 1, new_length)

    # Both labels present in the training data are reported by labels().
    def test_labels(self):
        labels = self.classifier.labels()
        assert_true("positive" in labels)
        assert_true("negative" in labels)

    # Smoke test: only checks that the call does not raise; the return value
    # is not inspected.
    def test_show_informative_features(self):
        feats = self.classifier.show_informative_features()

    # informative_features(n) returns a list of tuples.
    def test_informative_features(self):
        feats = self.classifier.informative_features(3)
        assert_true(isinstance(feats, list))
        assert_true(isinstance(feats[0], tuple))

    # A custom feature extractor can be passed as the second argument.
    def test_custom_feature_extractor(self):
        cl = NaiveBayesClassifier(train_set, custom_extractor)
        cl.classify("Yay! I'm so happy it works.")
        assert_equal(cl.train_features[0][1], 'positive')

    # The file-based tests below check that training text loaded from a file
    # is exposed as ``unicode`` (a Python 2 type name).
    def test_init_with_csv_file(self):
        cl = NaiveBayesClassifier(CSV_FILE, format="csv")
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    # Same as above, but without passing an explicit ``format``.
    def test_init_with_csv_file_without_format_specifier(self):
        cl = NaiveBayesClassifier(CSV_FILE)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_json_file(self):
        cl = NaiveBayesClassifier(JSON_FILE, format="json")
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    # Same as above, but without passing an explicit ``format``.
    def test_init_with_json_file_without_format_specifier(self):
        cl = NaiveBayesClassifier(JSON_FILE)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    # accuracy() also accepts a path to a test-data file.
    def test_accuracy_on_a_csv_file(self):
        a = self.classifier.accuracy(CSV_FILE)
        assert_true(isinstance(a, float))

    def test_accuracy_on_json_file(self):
        a = self.classifier.accuracy(JSON_FILE)
        assert_true(isinstance(a, float))

    def test_init_with_tsv_file(self):
        cl = NaiveBayesClassifier(TSV_FILE)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    # An unrecognised ``format`` value must raise ValueError.
    def test_init_with_bad_format_specifier(self):
        assert_raises(ValueError,
                      lambda: NaiveBayesClassifier(CSV_FILE, format='unknown'))

    # repr() reports the number of training instances.
    def test_repr(self):
        assert_equal(
            repr(self.classifier),
            "<NaiveBayesClassifier trained on {0} instances>".format(
                len(train_set)))
示例#23
0
    def process_questions(self):
        """Rate logged "five W" questions and feed the ratings back into training.

        Works in two passes:

        1. Every message in ``self.logdata`` (keys are stringified integers,
           values are dicts with a ``'message'`` entry) is split into
           sentences.  Each sentence that ends with "?" and contains one of
           who/what/where/when/why is classified, checked against the
           trigger phrases and cached in ``self.processed_tuples`` keyed by
           the sentence text.  The cache is then saved with
           ``_pdump_processed_tuples``.
        2. Every cached sentence is printed.  When the classifier rating and
           the trigger-phrase check disagree, the user is asked for a rating;
           otherwise the classifier rating is accepted.  The chosen rating
           is used to update the classifier and is appended to
           ``self.train_file``.

        At the prompt an empty answer accepts the default rating, "b" or "g"
        overrides it, "break" stops the review, and any other answer skips
        the sentence without recording it.
        """

        self._load_training_data()
        self._pload_processed_tuples()

        five_ws = [ "who", "what", "where", "when", "why" ]

        trigger_phrases = [
            "best practice",
            "best way",
            "simplest way",
            "preferred nomenclature",
            "preferred location",
            " have any recommendation",
            "exact command",
            "documentation",
            " doc for ",
            " doc about ",
            "tutorial",
            "release",
            "external inventory", "inventory file",
            "playbook", "play", "role", "task", "handler",
            "variable", "var",
            "connection", "async", "accelerate",
            "{{", "}}",
            "lookup", "plugin", "callback",
            "hang",
            # The comma after "when:" was missing, which silently merged the
            # two entries into the single phrase "when:group".
            "conditional", "when:",
            "group",
            "ec2 module", "route53",
            "fault tolerance",
            "public key"
        ]

        cl = NaiveBayesClassifier(self.train)

        sorted_ks = sorted(int(x) for x in self.logdata.keys())
        # Highest key, shown as a progress indicator; 0 when there is no data
        # (the loop below then does nothing instead of raising IndexError).
        total_ks = sorted_ks[-1] if sorted_ks else 0

        # ---- Pass 1: find and classify candidate questions -----------------
        for k in sorted_ks:
            k_str = str(k)
            # Single-argument print(...) produces the same output whether it
            # is parsed as a Python 2 statement or a Python 3 function call.
            print("%s - %s" % (total_ks, k_str))

            this_msg = self.logdata[k_str]['message']
            text_obj = TextBlob(this_msg)

            if not hasattr(text_obj, "raw_sentences"):
                continue

            for sent in text_obj.sentences:
                try:
                    sent_text = str(sent)
                except UnicodeDecodeError:
                    # Sentences that cannot be converted to str are skipped.
                    continue

                if sent_text in self.processed_tuples:
                    continue

                is_question = sent.endswith("?") and any(
                    word.lower() in five_ws for word in sent.words)
                if not is_question:
                    continue

                curr_rating = cl.classify(sent)
                triggered = any(ph in sent_text for ph in trigger_phrases)

                self.processed_tuples[sent_text] = (
                    k, sent, curr_rating, triggered)

        # save what we have
        self._pdump_processed_tuples()

        # ---- Pass 2: review the ratings ------------------------------------
        for pt in self.processed_tuples.keys():
            print("##############################\n")

            record = self.processed_tuples[pt]
            sent = record[1]
            curr_rating = record[2]
            triggered = record[3]

            print(sent)
            print("\n")
            print("rating: %s" % curr_rating)
            print("triggered: %s" % triggered)

            # Ask only when the classifier and the trigger phrases disagree.
            if ( curr_rating == "b" and triggered ) or ( curr_rating == "g" and not triggered ):
                q_string = "\n$ g(ood) question or b(ad) question? (default: %s): " % curr_rating
                x = raw_input(q_string)
            else:
                x = str(curr_rating)

            print("\n")

            if x == "break":
                # Previously a no-op (``pass``); honour the request to stop.
                break

            if x == "":
                label = curr_rating
            elif x == "b" or x == "g":
                label = x
            else:
                # Unrecognised answer: leave this sentence unrecorded.
                continue

            cl.update([ (str(sent), label) ])
            # Use a context manager so the handle is closed (and the line
            # flushed) straight away instead of being leaked.
            with open(self.train_file, "a") as train_fh:
                train_fh.write("'%s';%s\n" % (sent, label))
class BankClassify():
    """Interactively categorise bank transactions with a naive Bayes classifier.

    Previously categorised transactions live in a CSV file with the columns
    'date', 'desc', 'amount' and 'cat'.  They seed the classifier, which then
    suggests a category for every new transaction added via ``add_data``.
    """

    def __init__(self, data="Transactions.csv"):
        """Load the previous data and initialise the classifier.

        Arguments:
         - data: CSV file holding the already-categorised transactions.  If
           it does not exist, an empty history is used.  ``add_data`` saves
           the combined data back to this same file.
        """
        # Remember where the history lives so add_data() writes to the file
        # that was loaded rather than to a hard-coded name.
        self.data_file = data

        if os.path.exists(data):
            self.prev_data = pd.read_csv(data)
        else:
            self.prev_data = pd.DataFrame(
                columns=['date', 'desc', 'amount', 'cat'])

        self.classifier = NaiveBayesClassifier(
            self._get_training(self.prev_data), self._extractor)

    def add_data(self, filename):
        """Add new data and interactively classify it.

        Arguments:
         - filename: statement file in the layout parsed by
           ``_read_own_file``
        """
        self.new_data = self._read_own_file(filename)

        self._ask_with_guess(self.new_data)

        self.prev_data = pd.concat([self.prev_data, self.new_data])
        self.prev_data.to_csv(self.data_file, index=False)

    def _prep_for_analysis(self):
        """Prepare data for analysis in pandas, setting index types and subsetting"""
        self.prev_data = self._make_date_index(self.prev_data)

        self.prev_data['cat'] = self.prev_data['cat'].str.strip()

        self.inc = self.prev_data[self.prev_data.amount > 0]
        # Take an explicit copy so the sign flip below modifies an
        # independent frame and does not trigger chained-assignment warnings.
        self.out = self.prev_data[self.prev_data.amount < 0].copy()
        self.out['amount'] = self.out['amount'].abs()

        self.inc_noignore = self.inc[self.inc.cat != 'Ignore']
        self.inc_noexpignore = self.inc[(self.inc.cat != 'Ignore')
                                        & (self.inc.cat != 'Expenses')]

        self.out_noignore = self.out[self.out.cat != 'Ignore']
        self.out_noexpignore = self.out[(self.out.cat != 'Ignore')
                                        & (self.out.cat != 'Expenses')]

    def _read_categories(self):
        """Read categories from categories.txt.

        Returns a dict mapping the zero-based line number to the stripped
        text of that line.
        """
        with open('categories.txt') as f:
            return {i: line.strip() for i, line in enumerate(f)}

    def _add_new_category(self, category):
        """Add a new category to categories.txt"""
        with open('categories.txt', 'a') as f:
            f.write('\n' + category)

    def _ask_with_guess(self, df):
        """Interactively assign a category to each transaction in df.

        For every row the classifier's guess is shown and the user answers:

         - blank: accept the guess
         - a category number: use that existing category
         - any other text: create a new category with that name
         - 'q': stop and return immediately

        The 'cat' column of df is filled in place and the classifier is
        updated after every answer.  Returns df.
        """
        df['cat'] = ""

        categories = self._read_categories()

        for index, row in df.iterrows():

            # Generate the category numbers table from the list of categories
            # (rebuilt every time because new categories may have been added)
            cats_list = [[idnum, cat] for idnum, cat in categories.items()]
            cats_table = tabulate(cats_list)

            stripped_text = self._strip_numbers(row['desc'])

            # Guess a category using the classifier (only if there is data in the classifier)
            if len(self.classifier.train_set) > 1:
                guess = self.classifier.classify(stripped_text)
            else:
                guess = ""

            # Print list of categories (after an ANSI clear-screen sequence)
            print(chr(27) + "[2J")
            print(cats_table)
            print("\n\n")
            # Print transaction
            print("On: %s\t %.2f\n%s" %
                  (row['date'], row['amount'], row['desc']))
            print("My guess is: " + str(guess))

            # Keep asking until the answer can be resolved to a category; an
            # unknown category number used to abort the session with KeyError.
            while True:
                input_value = input("> ")

                if input_value.lower() == 'q':
                    # If the input was 'q' then quit
                    return df

                if input_value == "":
                    # If the input was blank then our guess was right!
                    category = guess
                    break

                try:
                    # Try converting the input to an integer category number
                    # If it works then we've entered a category
                    category = categories[int(input_value)]
                    break
                except ValueError:
                    # Otherwise, we've entered a new category, so add it to
                    # the list of categories
                    category = input_value
                    self._add_new_category(category)
                    categories = self._read_categories()
                    break
                except KeyError:
                    print("No category numbered %s, please try again." %
                          input_value)

            # Write the answer (DataFrame.ix no longer exists in pandas)
            df.at[index, 'cat'] = category
            # Update classifier
            self.classifier.update([(stripped_text, category)])

        return df

    def _make_date_index(self, df):
        """Make the index of df a Datetime index"""
        df.index = pd.DatetimeIndex(
            df.date.apply(dateutil.parser.parse, dayfirst=True))

        return df

    def _read_own_file(self, filename):
        """Read a whitespace-separated plain-text statement.

        The first three lines are skipped.  On every remaining non-blank
        line the first two tokens form the date, the last token is the
        amount (thousands separators removed) and the tokens in between
        form the description.

        Returns a pd.DataFrame with columns of 'date', 'desc' and 'amount'.
        """
        with open(filename, errors='replace') as f:
            lines = f.readlines()

        dates = []
        descs = []
        amounts = []

        # NOTE: statements from other banks (Federal, Axis, Citi, HDFC, Amex,
        # Kotak) were handled by variants of this loop that differ in the
        # number of header/footer lines skipped and in the token positions
        # used for each field; adapt the slices below to support them.
        for line in lines[3:]:
            tokens = line.split()
            if not tokens:
                # Blank lines carry no transaction and would otherwise raise
                # IndexError on tokens[-1].
                continue
            dates.append(" ".join(tokens[0:2]))
            amounts.append(tokens[-1].replace(',', ''))
            descs.append(" ".join(tokens[2:-1]))

        df = pd.DataFrame({'date': dates, 'desc': descs, 'amount': amounts})

        df['amount'] = df.amount.astype(float)
        df['desc'] = df.desc.astype(str)
        df['date'] = df.date.astype(str)

        return df

    def _get_training(self, df):
        """Get training data for the classifier, consisting of tuples of
        (text, category).

        Rows whose 'cat' is the empty string are left out.
        """
        subset = df[df['cat'] != '']
        # Iterate the columns directly; this is independent of the index
        # labels and avoids the removed DataFrame.ix accessor.
        return [(self._strip_numbers(desc), cat)
                for desc, cat in zip(subset['desc'], subset['cat'])]

    def _extractor(self, doc):
        """Extract tokens from a given string.

        Returns a dict mapping each non-empty token (split on spaces and
        '/') to True.
        """
        # TODO: Extend to extract words within words
        # For example, MUSICROOM should give MUSIC and ROOM
        tokens = self._split_by_multiple_delims(doc, [' ', '/'])

        return {token: True for token in tokens if token != ""}

    def _strip_numbers(self, s):
        """Remove every character except upper-case A-Z and spaces.

        This drops digits, but also lower-case letters and punctuation.
        """
        return re.sub(r"[^A-Z ]", "", s)

    def _split_by_multiple_delims(self, string, delims):
        """Split the given string by the list of delimiters given.

        Delimiters are matched literally, so characters that are special in
        regular expressions (such as '.' or '|') are safe to use.
        """
        regexp = "|".join(re.escape(delim) for delim in delims)

        return re.split(regexp, string)
示例#25
0
                   ('Prediction', 'Contextual Text Mining'),
                   ('Contextual', 'Contextual Text Mining')]

# Instantiate the naive Bayes classifier from the hand-written feature list
classifier = NaiveBayesClassifier(featureListTrain)

# Shuffle the corpus so the train/test split below is random
random.shuffle(data)

#print(str(data[0][1]).split('::'))

# Split corpus data into train and test datasets.
# The test slice starts at index 10 so that no item is dropped between the
# two slices (the previous data[11:23] silently skipped data[10]).
train, test = data[:10], data[10:23]

# Update classifier with new corpus data
classifier.update(train)

# Compute accuracy
# NOTE(review): `data` includes the rows just used for training, so this
# figure is not a pure held-out accuracy - confirm that is intended.
accuracy = classifier.accuracy(featureListTest + test + data)
print("Accuracy: {0}".format(accuracy))

catList = []

# Loop through the corpus data and classify the entire dataset.
# We do not have a large dataset, so to get the maximum number of categories
# classified the entire data is considered.
# If the probability of a classification is at least 0.5, capture the category.
i = 0
while i < len(data):
    pdist = classifier.prob_classify(str(data[i][0]))
    #for category in reader.categories():
示例#26
0
import pickle

from textblob.classifiers import NaiveBayesClassifier

# Train an initial classifier; replace the placeholders with real
# (text, label) pairs.
classifier = NaiveBayesClassifier([('???', '???'), ...])

# Extend the training set with further (text, label) pairs.
classifier.update([('???', '???'), ...])

# Persist the trained classifier to disk.
with open('...', 'wb') as f:
    pickle.dump(classifier, f)

# Restore the classifier later.
# NOTE(review): pickle.load can execute arbitrary code while loading, so
# only load files that come from a trusted source.
with open('...', 'rb') as f:
    classifier = pickle.load(f)

# Use the restored classifier.
classifier.classify(...)
示例#27
0
         ('I do not like this new restaurant', 'neg'),
         ('I am tired of waiting for my new book.', 'neg'),
         ("I can't deal with my toothache", 'neg'),
         ("The fun events in costa rica were amazing", 'pos'),
         ('He is my worst boss!', 'neg'),
         ('People do have bad writing skills on facebook', 'neg')]
# Labelled examples used further down for the update and accuracy steps.
test = [('The beer was good.', 'pos'), ('I do not enjoy my job', 'neg'),
        ("I feel amazing!", 'pos'), ('Mark is a friend of mine.', 'pos'),
        ("I can't believe I was asked to do this.", 'neg')]

cl = NaiveBayesClassifier(train)
print(cl.classify("The new movie was amazing."))  # "pos"
print(cl.classify("I don't like ther noodles."))  # "neg"

# Written as a call like every other print in this script; the bare
# Python 2 statement form is a syntax error under Python 3.
print("Test Results")
cl.update(test)

# Classify a TextBlob
blob = TextBlob(
    "The food was good. But the service was horrible. "
    "My father was not pleased.",
    classifier=cl)
print(blob)
print(blob.classify())

# Classify each sentence of the blob individually
for sentence in blob.sentences:
    print(sentence)
    print(sentence.classify())

# Compute accuracy
# NOTE(review): `test` was added to the training data by cl.update(test)
# above, so this measures accuracy on data the classifier has already seen.
print("Accuracy: {0}".format(cl.accuracy(test)))
        articleTestMaster = articleTest
        titleTestMaster = titleTest
        zScoreTestMaster = zScoreTest
        titleClassifier = NaiveBayesClassifier([
            (title, score) for title, score in zip(titleTrain, zScoreTrain)
        ])
        articleClassifier = NaiveBayesClassifier([
            (article, score)
            for article, score in zip(articleTrain, zScoreTrain)
        ])
    else:
        articleTestMaster = np.append(articleTestMaster, articleTest)
        titleTestMaster = np.append(titleTestMaster, titleTest)
        zScoreTestMaster = np.append(zScoreTestMaster, zScoreTest)
        titleClassifier.update(
            ([(title, score)
              for title, score in zip(titleTrain, zScoreTrain)]))
        articleClassifier.update(
            ([(article, score)
              for article, score in zip(articleTrain, zScoreTrain)]))
# Report the accuracy of both classifiers on the accumulated test data.
print(
    titleClassifier.accuracy(
        list(zip(titleTestMaster, zScoreTestMaster))))
print(
    articleClassifier.accuracy(
        list(zip(articleTestMaster, zScoreTestMaster))))

# Persist the title classifier.  The context manager closes (and flushes)
# the file deterministically instead of leaking the handle.
with open('titleClassifier.pkl', 'wb') as title_pickle:
    pickle.dump(titleClassifier, title_pickle)
示例#29
0
# ### Evaluating Classifiers

class1.accuracy(test)

# ### Display a Listing of the Most Informative Features

class1.show_informative_features(5)

# ### Updating Classifiers with New Data

new_data = [('She is my best friend.', 'pos'),
            ("I'm happy to have a new friend.", 'pos'),
            ("Stay thirsty, my friend.", 'pos'),
            ("He ain't from around here.", 'neg')]
class1.update(new_data)

# Re-evaluate on the same test data after the update.
class1.accuracy(test)

# ### Feature Extractors


def end_word_extractor(document):
    """Build a feature dict from the first and last word of *document*.

    The document is split on whitespace.  The result maps
    ``"first(<word>)"`` to True and ``"last(<word>)"`` to False.  A document
    with no words (empty or whitespace only) yields an empty dict instead of
    raising IndexError.
    """
    tokens = document.split()
    if not tokens:
        return {}
    first_word, last_word = tokens[0], tokens[-1]
    return {
        "first({0})".format(first_word): True,
        "last({0})".format(last_word): False,
    }

示例#30
0
class BankClassify():
    """Interactively categorise bank transactions with a naive Bayes classifier.

    Previously categorised transactions live in a CSV file with the columns
    'date', 'desc', 'amount' and 'cat'.  They seed the classifier, which then
    suggests a category for every new transaction added via ``add_data``.
    """

    def __init__(self, data="AllData.csv"):
        """Load the previous data and initialise the classifier.

        Arguments:
         - data: CSV file holding the already-categorised transactions.  If
           it does not exist, an empty history is used.  ``add_data`` saves
           the combined data back to this same file.
        """
        # Remember where the history lives so add_data() writes to the file
        # that was loaded rather than to a hard-coded name.
        self.data_file = data

        if os.path.exists(data):
            self.prev_data = pd.read_csv(data)
        else:
            self.prev_data = pd.DataFrame(
                columns=['date', 'desc', 'amount', 'cat'])

        self.classifier = NaiveBayesClassifier(
            self._get_training(self.prev_data), self._extractor)

    def add_data(self, filename):
        """Add new data and interactively classify it.

        Arguments:
         - filename: filename of Santander-format file
        """
        self.new_data = self._read_santander_file(filename)

        self._ask_with_guess(self.new_data)

        self.prev_data = pd.concat([self.prev_data, self.new_data])
        self.prev_data.to_csv(self.data_file, index=False)

    def _prep_for_analysis(self):
        """Prepare data for analysis in pandas, setting index types and subsetting"""
        self.prev_data = self._make_date_index(self.prev_data)

        self.prev_data['cat'] = self.prev_data['cat'].str.strip()

        self.inc = self.prev_data[self.prev_data.amount > 0]
        # Take an explicit copy so the sign flip below modifies an
        # independent frame and does not trigger chained-assignment warnings.
        self.out = self.prev_data[self.prev_data.amount < 0].copy()
        self.out['amount'] = self.out['amount'].abs()

        self.inc_noignore = self.inc[self.inc.cat != 'Ignore']
        self.inc_noexpignore = self.inc[(self.inc.cat != 'Ignore')
                                        & (self.inc.cat != 'Expenses')]

        self.out_noignore = self.out[self.out.cat != 'Ignore']
        self.out_noexpignore = self.out[(self.out.cat != 'Ignore')
                                        & (self.out.cat != 'Expenses')]

    def _read_categories(self):
        """Read categories from categories.txt.

        Returns a dict mapping the zero-based line number to the stripped
        text of that line.
        """
        with open('categories.txt') as f:
            return {i: line.strip() for i, line in enumerate(f)}

    def _add_new_category(self, category):
        """Add a new category to categories.txt"""
        with open('categories.txt', 'a') as f:
            f.write('\n' + category)

    def _ask_with_guess(self, df):
        """Interactively assign a category to each transaction in df.

        For every row the classifier's guess is shown and the user answers:

         - blank: accept the guess
         - a category number: use that existing category
         - any other text: create a new category with that name
         - 'q': stop and return immediately

        The 'cat' column of df is filled in place and the classifier is
        updated after every answer.  Returns df.
        """
        # Initialise colorama
        init()

        df['cat'] = ""

        categories = self._read_categories()

        for index, row in df.iterrows():

            # Generate the category numbers table from the list of categories
            # (rebuilt every time because new categories may have been added)
            cats_list = [[idnum, cat] for idnum, cat in categories.items()]
            cats_table = tabulate(cats_list)

            stripped_text = self._strip_numbers(row['desc'])

            # Guess a category using the classifier (only if there is data in the classifier)
            if len(self.classifier.train_set) > 1:
                guess = self.classifier.classify(stripped_text)
            else:
                guess = ""

            # Print list of categories (after an ANSI clear-screen sequence)
            print(chr(27) + "[2J")
            print(cats_table)
            print("\n\n")
            # Print transaction
            print("On: %s\t %.2f\n%s" %
                  (row['date'], row['amount'], row['desc']))
            print(Fore.RED + Style.BRIGHT + "My guess is: " + str(guess) +
                  Fore.RESET)

            # Keep asking until the answer can be resolved to a category; an
            # unknown category number used to abort the session with KeyError.
            while True:
                input_value = input("> ")

                if input_value.lower() == 'q':
                    # If the input was 'q' then quit
                    return df

                if input_value == "":
                    # If the input was blank then our guess was right!
                    category = guess
                    break

                try:
                    # Try converting the input to an integer category number
                    # If it works then we've entered a category
                    category = categories[int(input_value)]
                    break
                except ValueError:
                    # Otherwise, we've entered a new category, so add it to
                    # the list of categories
                    category = input_value
                    self._add_new_category(category)
                    categories = self._read_categories()
                    break
                except KeyError:
                    print("No category numbered %s, please try again." %
                          input_value)

            # Write the answer
            df.at[index, 'cat'] = category
            # Update classifier
            self.classifier.update([(stripped_text, category)])

        return df

    def _make_date_index(self, df):
        """Make the index of df a Datetime index"""
        df.index = pd.DatetimeIndex(
            df.date.apply(dateutil.parser.parse, dayfirst=True))

        return df

    def _read_santander_file(self, filename):
        """Read a file in the plain text format that Santander provides downloads in.

        The first four lines are skipped.  Each remaining non-blank line is
        read as "<field>:<value>"; the 'Date', 'Description' and 'Amount'
        fields are collected and all other fields are ignored.

        Returns a pd.DataFrame with columns of 'date', 'desc' and 'amount'."""
        with open(filename, errors='replace') as f:
            lines = f.readlines()

        dates = []
        descs = []
        amounts = []

        for line in lines[4:]:

            # Drop every non-ASCII character
            line = "".join(i for i in line if ord(i) < 128)
            if line.strip() == '':
                continue

            # Split on the first colon only, so values that themselves
            # contain ':' stay intact
            category, _, data = line.partition(":")

            if category == 'Date':
                dates.append(data.strip())
            elif category == 'Description':
                descs.append(data.strip())
            elif category == 'Amount':
                # Keep digits, the decimal point and the minus sign.  A raw
                # string avoids the invalid "\." escape of the old pattern.
                just_numbers = re.sub(r"[^0-9.-]", "", data)
                amounts.append(just_numbers.strip())

        df = pd.DataFrame({'date': dates, 'desc': descs, 'amount': amounts})

        df['amount'] = df.amount.astype(float)
        df['desc'] = df.desc.astype(str)
        df['date'] = df.date.astype(str)

        return df

    def _get_training(self, df):
        """Get training data for the classifier, consisting of tuples of
        (text, category).

        Rows whose 'cat' is the empty string are left out.
        """
        subset = df[df['cat'] != '']
        # Iterate the columns directly.  The previous code passed index
        # *labels* to the positional .iloc accessor, which picks the wrong
        # row (or raises IndexError) as soon as any row has been filtered out.
        return [(self._strip_numbers(desc), cat)
                for desc, cat in zip(subset['desc'], subset['cat'])]

    def _extractor(self, doc):
        """Extract tokens from a given string.

        Returns a dict mapping each non-empty token (split on spaces and
        '/') to True.
        """
        # TODO: Extend to extract words within words
        # For example, MUSICROOM should give MUSIC and ROOM
        tokens = self._split_by_multiple_delims(doc, [' ', '/'])

        return {token: True for token in tokens if token != ""}

    def _strip_numbers(self, s):
        """Remove every character except upper-case A-Z and spaces.

        This drops digits, but also lower-case letters and punctuation.
        """
        return re.sub(r"[^A-Z ]", "", s)

    def _split_by_multiple_delims(self, string, delims):
        """Split the given string by the list of delimiters given.

        Delimiters are matched literally, so characters that are special in
        regular expressions (such as '.' or '|') are safe to use.
        """
        regexp = "|".join(re.escape(delim) for delim in delims)

        return re.split(regexp, string)
示例#31
0
class BankClassify():
    def __init__(self, data="AllData.csv"):
        """Load previously classified data and initialise the classifier.

        Arguments:
         - data: path of the CSV file holding earlier classified
           transactions; if it does not exist an empty table with the
           columns 'date', 'desc', 'amount' and 'cat' is used instead
        """
        # Remember the path so new data can later be saved back to the same
        # file (allows many accounts to be handled in a loop)
        self.trainingDataFile = data

        if not os.path.exists(data):
            previous = pd.DataFrame(columns=['date', 'desc', 'amount', 'cat'])
        else:
            previous = pd.read_csv(self.trainingDataFile)
        self.prev_data = previous

        # Train on whatever has been classified so far, using our own
        # token extractor to produce the features
        training = self._get_training(self.prev_data)
        self.classifier = NaiveBayesClassifier(training, self._extractor)

    def add_data(self, filename, bank="santander"):
        """Add new data and interactively classify it.

        Arguments:
         - filename: filename of Santander-format file
        """
        if bank == "santander":
            print("adding Santander data!")
            self.new_data = self._read_santander_file(filename)
        elif bank == "nationwide":
            print("adding Nationwide data!")
            self.new_data = self._read_nationwide_file(filename)
        elif bank == "lloyds":
            print("adding Lloyds Bank data!")
            self.new_data = self._read_lloyds_csv(filename)
        elif bank == "barclays":
            print("adding Barclays Bank data!")
            self.new_data = self._read_barclays_csv(filename)

        self._ask_with_guess(self.new_data)

        self.prev_data = pd.concat([self.prev_data, self.new_data])
        # save data to the same file we loaded earlier
        self.prev_data.to_csv(self.trainingDataFile, index=False)

    def _prep_for_analysis(self):
        """Prepare data for analysis in pandas, setting index types and subsetting"""
        self.prev_data = self._make_date_index(self.prev_data)

        self.prev_data['cat'] = self.prev_data['cat'].str.strip()

        self.inc = self.prev_data[self.prev_data.amount > 0]
        self.out = self.prev_data[self.prev_data.amount < 0]
        self.out.amount = self.out.amount.abs()

        self.inc_noignore = self.inc[self.inc.cat != 'Ignore']
        self.inc_noexpignore = self.inc[(self.inc.cat != 'Ignore')
                                        & (self.inc.cat != 'Expenses')]

        self.out_noignore = self.out[self.out.cat != 'Ignore']
        self.out_noexpignore = self.out[(self.out.cat != 'Ignore')
                                        & (self.out.cat != 'Expenses')]

    def _read_categories(self):
        """Read list of categories from categories.txt"""
        categories = {}

        with open('categories.txt') as f:
            for i, line in enumerate(f.readlines()):
                categories[i] = line.strip()

        return categories

    def _add_new_category(self, category):
        """Add a new category to categories.txt"""
        with open('categories.txt', 'a') as f:
            f.write('\n' + category)

    def _ask_with_guess(self, df):
        """Interactively guess categories for each transaction in df, asking each time if the guess
        is correct.

        For every row the category table, the transaction and the
        classifier's guess are printed, then input is read:
         - 'q' (any case): stop and return df as classified so far
         - blank: accept the guess
         - a number from the table: use that existing category
         - anything else: treat the text as a new category name, which is
           added to categories.txt

        A number that is not in the table is rejected and the prompt is
        repeated. Returns df with its 'cat' column filled in; the column is
        reset to "" for every row before classification starts.
        """
        # Initialise colorama
        init()

        df['cat'] = ""

        categories = self._read_categories()

        for index, row in df.iterrows():

            # Generate the category numbers table from the list of categories
            # (rebuilt every time because new categories may have been added)
            cats_list = [[idnum, cat] for idnum, cat in categories.items()]
            cats_table = tabulate(cats_list)

            stripped_text = self._strip_numbers(row['desc'])

            # Guess a category using the classifier (only if there is data in the classifier)
            if len(self.classifier.train_set) > 1:
                guess = self.classifier.classify(stripped_text)
            else:
                guess = ""

            # Print list of categories
            print(chr(27) + "[2J")
            print(cats_table)
            print("\n\n")
            # Print transaction
            print("On: %s\t %.2f\n%s" %
                  (row['date'], row['amount'], row['desc']))
            print(Fore.RED + Style.BRIGHT + "My guess is: " + str(guess) +
                  Fore.RESET)

            # Keep asking until the answer can be turned into a category
            while True:
                input_value = input("> ")

                if input_value.lower() == 'q':
                    # If the input was 'q' then quit
                    return df

                if input_value == "":
                    # If the input was blank then our guess was right!
                    category = guess
                    break

                try:
                    # Try converting the input to an integer category number
                    category_number = int(input_value)
                except ValueError:
                    # Not a number, so this is a new category: add it to
                    # the list of categories
                    category = input_value
                    self._add_new_category(category)
                    categories = self._read_categories()
                    break

                if category_number in categories:
                    category = categories[category_number]
                    break

                # A number outside the table used to raise KeyError and
                # abort the whole session; ask again instead.
                print("Unknown category number: %s" % input_value)

            # Write the answer
            df.at[index, 'cat'] = category

            # Update classifier, but never with the empty placeholder that
            # stands for "no guess available": "" is not a real category.
            if category != "":
                self.classifier.update([(stripped_text, category)])

        return df

    def _make_date_index(self, df):
        """Replace the index of df with a DatetimeIndex built from its
        'date' column, parsing each value day-first.

        df is modified in place and also returned."""
        parsed_dates = df.date.apply(dateutil.parser.parse, dayfirst=True)
        df.index = pd.DatetimeIndex(parsed_dates)

        return df

    def _read_nationwide_file(self, filename):
        """Read a file in the csv file that Nationwide provides downloads in.

        Returns a pd.DataFrame with columns of 'date', 'desc' and 'amount'."""

        with open(filename) as f:
            lines = f.readlines()

        dates = []
        descs = []
        amounts = []

        for line in lines[5:]:

            line = "".join(i for i in line if ord(i) < 128)
            if line.strip() == '':
                continue

            splits = line.split("\",\"")
            """
            0 = Date
            1 = Transaction type
            2 = Description
            3 = Paid Out
            4 = Paid In
            5 = Balance
            """
            date = splits[0].replace("\"", "").strip()
            date = datetime.strptime(date, '%d %b %Y').strftime('%d/%m/%Y')
            dates.append(date)

            # get spend/pay in amount
            if splits[3] != "":  # paid out
                spend = float(re.sub("[^0-9\.-]", "", splits[3])) * -1
            else:  # paid in
                spend = float(re.sub("[^0-9\.-]", "", splits[4]))

            amounts.append(spend)

            #Description
            descs.append(splits[2])

        df = pd.DataFrame({'date': dates, 'desc': descs, 'amount': amounts})

        df['amount'] = df.amount.astype(float)
        df['desc'] = df.desc.astype(str)
        df['date'] = df.date.astype(str)

        return df

    def _read_santander_file(self, filename):
        """Read a file in the plain text format that Santander provides downloads in.

        Returns a pd.DataFrame with columns of 'date', 'desc' and 'amount'."""
        with open(filename, errors='replace') as f:
            lines = f.readlines()

        dates = []
        descs = []
        amounts = []

        for line in lines[4:]:

            line = "".join(i for i in line if ord(i) < 128)
            if line.strip() == '':
                continue

            splitted = line.split(":")

            category = splitted[0]
            data = ":".join(splitted[1:])

            if category == 'Date':
                dates.append(data.strip())
            elif category == 'Description':
                descs.append(data.strip())
            elif category == 'Amount':
                just_numbers = re.sub("[^0-9\.-]", "", data)
                amounts.append(just_numbers.strip())

        df = pd.DataFrame({'date': dates, 'desc': descs, 'amount': amounts})

        df['amount'] = df.amount.astype(float)
        df['desc'] = df.desc.astype(str)
        df['date'] = df.date.astype(str)

        return df

    def _read_lloyds_csv(self, filename):
        """Read a file in the CSV format that Lloyds Bank provides downloads in.

        Returns a pd.DataFrame with columns of 'date' 0 , 'desc'  4 and 'amount' 5 ."""

        df = pd.read_csv(filename, skiprows=0)
        """Rename columns """
        #df.columns = ['date', 'desc', 'amount']
        df.rename(columns={
            "Transaction Date": 'date',
            "Transaction Description": 'desc',
            "Debit Amount": 'amount',
            "Credit Amount": 'creditAmount'
        },
                  inplace=True)

        # if its income we still want it in the amount col!
        # manually correct each using 2 cols to create 1 col with either + or - figure
        # lloyds outputs 2 cols, credit and debit, we want 1 col representing a +- figure
        for index, row in df.iterrows():
            if (row['amount'] > 0):
                # it's a negative amount because this is a spend
                df.at[index, 'amount'] = -row['amount']
            elif (row['creditAmount'] > 0):
                df.at[index, 'amount'] = row['creditAmount']

        # cast types to columns for math
        df = df.astype({"desc": str, "date": str, "amount": float})

        return df

    def _read_barclays_csv(self, filename):
        """Read a file in the CSV format that Barclays Bank provides downloads in.
            Edge case: foreign txn's sometimes causes more cols than it should 
            Returns a pd.DataFrame with columns of 'date' 1 , 'desc' (memo)  5 and 'amount' 3 ."""

        # Edge case: Barclays foreign transaction memo sometimes contains a comma, which is bad.
        # Use a work-around to read only fixed col count
        # https://stackoverflow.com/questions/20154303/pandas-read-csv-expects-wrong-number-of-columns-with-ragged-csv-file
        # Prevents an error where some rows have more cols than they should
        temp = pd.read_csv(filename,
                           sep='^',
                           header=None,
                           prefix='X',
                           skiprows=1)
        temp2 = temp.X0.str.split(',', expand=True)
        del temp['X0']
        df = pd.concat([temp, temp2], axis=1)
        """Rename columns """
        df.rename(columns={1: 'date', 5: 'desc', 3: 'amount'}, inplace=True)

        # cast types to columns for math
        df = df.astype({"desc": str, "date": str, "amount": float})

        return df

    def _get_training(self, df):
        """Get training data for the classifier, consisting of tuples of
        (text, category)"""
        train = []
        subset = df[df['cat'] != '']
        for i in subset.index:
            row = subset.iloc[i]
            new_desc = self._strip_numbers(row['desc'])
            train.append((new_desc, row['cat']))

        return train

    def _extractor(self, doc):
        """Extract tokens from a given string"""
        # TODO: Extend to extract words within words
        # For example, MUSICROOM should give MUSIC and ROOM
        tokens = self._split_by_multiple_delims(doc, [' ', '/'])

        features = {}

        for token in tokens:
            if token == "":
                continue
            features[token] = True

        return features

    def _strip_numbers(self, s):
        """Strip numbers from the given string"""
        return re.sub("[^A-Z ]", "", s)

    def _split_by_multiple_delims(self, string, delims):
        """Split the given string by the list of delimiters given"""
        regexp = "|".join(delims)

        return re.split(regexp, string)
示例#32
0
    tweet = info[2]
    polar = info[len(info) - 1]
    c = tuple([tweet, polar])
    trainingData.append(c)

# Report how many (tweet, polarity) pairs were collected above
print(len(trainingData))

# The first 1000 pairs build the initial classifier; the remaining pairs are
# fed to it afterwards
myData1 = trainingData[0:1000]
trainingData = trainingData[1000:len(trainingData)]

cl = NaiveBayesClassifier(myData1)

# Add the remaining pairs 50 at a time, printing the start offset of each
# batch as a progress indicator
for i in range(0, len(trainingData), 50):
    chunk = trainingData[i:i + 50]
    print(i)
    cl.update(chunk)

# NOTE(review): if show_informative_features() prints its own output and
# returns None, this also prints "None" - confirm against the library
print(cl.show_informative_features())

myList2 = []
test = []

# Collect the first field of every other row (rows 0, 2, 4, ...) of the
# tab-delimited file
with open("data05_02_2020_11-17.csv", "r") as f:
    reader = csv.reader(f, delimiter="\t")
    for i, line in enumerate(reader):
        if i % 2 == 0:
            myList2.append(line[0])

# Each collected entry is itself comma-separated; its field 2 is stored as
# `tweet`
for line in myList2:
    info = line.split(',')
    tweet = info[2]
示例#33
0
    #inicia classificador treinamento
    cl = NaiveBayesClassifier(train2)

    #atualiza barra de progresso
    i = int(time.time() - start)
    bar.update(i)

    #registra acuracia treinamento x teste
    actest = cl.accuracy(test2)

    #atualiza barra de progresso
    i = int(time.time() - start)
    bar.update(i)

    #treina com a base de teste
    cl.update(test2)

    #atualiza barra de progresso
    i = int(time.time() - start)
    bar.update(i)

    #registra acuracia teste vs treinamento + este
    actest2 = cl.accuracy(test2)

    print('\n')
    print("Descrição dos Dados de Treinamento:")
    print(datamerge1.describe())
    print('\n')
    print("Descrição dos Dados de Teste:")
    print(datamerge2.describe())
    print('\n')