Example #1
def resolve_certainty(certainty_info):
    '''Resolve certainty with Naive Bayes'''
    if certainty_info == '':
        return 'No certainty info.'
    else:
        nb = NB()
        for observation, certainty in csv(
                'library/templatetags/c_training_data.csv'):
            v = Document(observation, type=int(certainty), stopwords=True)
            nb.train(v)
        return nb.classify(Document(certainty_info))
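
# A hypothetical call (assumes the imports this snippet relies on: NB and
# Document from pattern.vector, csv from pattern.db, plus the training CSV):
print(resolve_certainty('These findings suggest a possible link.'))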
Example #2
def extractSentiment(characterSentences):
    """
    Trains a Naive Bayes classifier object with the reviews.csv file, analyzes
    the sentence, and returns the tone.
    """
    nb = NB()
    characterTones = defaultdict(list)
    for review, rating in csv("reviews.csv"):
        nb.train(Document(review, type=int(rating), stopwords=True))
    for key, value in characterSentences.items():
        for x in value:
            characterTones[key].append(nb.classify(str(x)))
    return characterTones
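Example #3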
def extractSentiment(characterSentences):
    """
    Trains a Naive Bayes classifier object with the reviews.csv file, analyzes
    the sentence, and returns the tone.
    """
    nb = NB()
    characterTones = defaultdict(list)
    for review, rating in csv("reviews.csv"):
        nb.train(Document(review, type=int(rating), stopwords=True))
    for key, value in characterSentences.iteritems():
        for x in value:
            characterTones[key].append(nb.classify(str(x)))
    return characterTones
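
# A minimal hypothetical call for either version of extractSentiment above,
# mapping a character name to a list of that character's sentences:
tones = extractSentiment({'Alice': ['I loved every minute.', 'What a waste.']})
print(tones['Alice'])  # e.g. a list of predicted ratings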
Example #4
def classifyTweets(filename, trainingSet):
    print('Classifying {}...\n'.format(filename))
    with open('{}/processed{}'.format(INPUT_PATH, filename.capitalize()), 'rb') as data:
        info = list(csv.reader(data))

    classifier = NB(train=trainingSet, alpha=0.0001)

    tweets = []
    for row in info:
        tweet = row[0]
        result = classifier.classify(Document(tweet))
        tweets.append([tweet, result])

    # Write all tweets to file
    with open('{}/results.csv'.format(OUTPUT_PATH), 'wb+') as f:
        writer = csv.writer(f)
        writer.writerows(tweets)
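Example #5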
class NBModel:
    def __init__(self):
        self.nb = NB()
        self.stats = Statistics()
        try:
            self.nb = self.nb.load("./nb_training.p")
            self.new_nb_model = True
        except IOError:
            self.new_nb_model = False
            print("Creating new NB model")

    def naive_bayes_train(self, reviews):
        for review in reviews:
            if review.rating is not None and 1 < review.rating < 10:
                v = Document(review.text, type=int(review.rating), stopwords=True)
                self.nb.train(v)
        #self.nb.save("./nb_training.p")

    def nb_test_imdb(self, reviews):
        arr = []
        for review in reviews:
            if review.rating is not None:
                v = Document(review.text, type=int(review.rating), stopwords=True)
                arr.append(v)
        print(self.nb.test(arr, target=None))

    def nb_classify_tweets(self, tvshow, tweets):
        ratingSum = 0
        tweet_docs = [(self.nb.classify(Document(tweet)), tweet) for tweet in tweets]
        for tweet in tweet_docs:
            ratingSum += tweet[0]
        self.nb_stats()
        Statistics().printStats(tvshow, ratingSum, len(tweet_docs))
        print(self.nb.distribution)

    def nb_stats(self):
        print('----------- Classifier stats -----------')
      #  print("Features: ", self.nb.features)
        print("Classes: ", self.nb.classes)
        print("Skewness: ", self.nb.skewness)
        print("Distribution: ", self.nb.distribution)
        print("Majority: ", self.nb.majority)
        print("Minority: ", self.nb.minority)
d1 = Document('Cats are independent pets.', name='cat')
d2 = Document('Dogs are trustworthy pets.', name='dog')
d3 = Document('Boxes are made of cardboard.', name='box')
m = Model((d1, d2, d3))
print m.cluster(method=HIERARCHICAL, k=2)
# hierarchical clustering
cluster = Cluster((1, Cluster((2, Cluster((3, 4))))))
print cluster.depth
print cluster.flatten(1)
# training a classifier
nb = NB()
for review, rating in csv('data/input/reviews.csv'):
    v = Document(review, type=int(rating), stopwords=True)
    nb.train(v)
print nb.classes
print nb.classify(Document('A good movie!'))
# testing a classifier
data = csv('data/input/reviews.csv')
data = [(review, int(rating)) for review, rating in data]
data = [
    Document(review, type=rating, stopwords=True) for review, rating in data
]
nb = NB(train=data[:500])
accuracy, precision, recall, f1 = nb.test(data[500:])
print accuracy
# binary classification
data = csv('data/input/reviews.csv')
data = [(review, int(rating) >= 3) for review, rating in data]
data = [
    Document(review, type=rating, stopwords=True) for review, rating in data
]

# Completing the snippet in the spirit of the test above (assumption: the
# same 500-document split as before):
nb = NB(train=data[:500])
print nb.test(data[500:])  # (accuracy, precision, recall, f1)
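Example #7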
class Emoticon_Sentiment:

    """
    Use Emoticon sentiment to classify and predict comments.

    This class provides a binary sentiment analyzer (1 = positive, -1 =
    negative). It is automatically created using emoticons from thousands of
    comments.
    """

    def __init__(self, basefile):
        """
        Initialize the emoticon sentiment. Setup variables.

        Args:
          basefile (str): The location of the comments file that should serve
          as the basis for the classifier.
        """
        self.base = basefile
        self.comments = []
        self.nb = NB()
        self.stop = stopwords.words('english')
        self.positive = [':-)', ':)', ':D', ':o)', ':]', ':3', ':c)', ':>',
                         '=]', '8)', '=)', ':}', ':^)', ':っ)', ':-D', '8-D',
                         '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D', '=-3',
                         '=3', 'B^D', ':-))', ':*', ':^*', ';-)', ';)', '*-)',
                         '*)', ';-]', ';]', ';D', ';^)']
        self.negative = ['>:[', ':-(', ':(', ':-c', ':c', ':-<', ':っC',
                         ':<', ':-[', ':[', ':{', ':-||', ':@', '>:(', 'D:<',
                         'D:', 'D8', 'D;', 'D=', 'DX', 'v.v', '>:)', '>;)',
                         '>:-)', '}:-)', '}:)', '3:-)', '3:)', ':-###..',
                         ':###..', '>:/', ':-/', ':-.', ':/']

    def read_base_comments(self):
        """
        Read the base comments.

        Read the file that loads the comments that serve as the base for the
        classifier.

        Returns:
          list: list of comments from the text file
        """
        with open(self.base, 'r') as f:
            self.comments = json.load(f)
        return self.comments
        return self.comments

    def preprocess(self, comment):
        """
        Preprocess comment.

        Take a comment and pre-processes it (all to lowercase, remove links,
        delete punctuation, remove stopwords and numbers)

        Args:
          comment (str): The text string to be processed

        Returns:
          str: the processed text string
        """
        h = HTMLParser.HTMLParser()
        text = h.unescape(comment.lower())
        p = re.compile(r"(\b(?:(?:https?|ftp|file)://|www\.|ftp\.)"
                       "[-A-Za-z0-9+&@#/%?=~_()|!:,.;]*"
                       "[-A-Za-z0-9+&@#/%=~_()|])")
        text = re.sub(p, '', text)
        exclude = set(string.punctuation)
        tokens = word_tokenize(text)
        cleaned = [token for token in tokens if token not in exclude]
        text = ' '.join([w.encode('ascii', errors='ignore') for w in cleaned
                        if w.lower() not in self.stop and not w.isdigit()])
        return text

    def parse_comments(self):
        """
        Parse comment, check for emoticon, remove neutrals.

        Parse a list of comments, by searching for emoticons. Once an emoticon
        is found, it flags it positive or negative.
        With multiple finds, it sums their value; neutrals are thrown out.

        Returns:
          list: 2-dimensional list of parsed comments and their sentiment flags
        """
        extended_comments = [[self.comments[i], 0, False]
                             for i in range(len(self.comments))]

        # loop through comments and search for emoticons: +1 if positive, -1
        # if negative, and remove emoticons from text
        for comment in extended_comments:

            for n in self.negative:
                if comment[0].encode('utf-8').find(n) > -1:
                    comment[1] = comment[1]-1
                    comment[0] = comment[0].replace(n, '')
                    comment[2] = True

            for p in self.positive:
                if comment[0].encode('utf-8').find(p) > -1:

                    comment[1] = comment[1]+1
                    comment[0] = comment[0].replace(p, '')
                    comment[2] = True

        # throw out comments that have no emoticons
        parsed_comments = [[comment[0], comment[1]] for comment
                           in extended_comments if comment[2]]

        # normalize the values by converting all positives (1,2,3,etc) to 1,
        # and all negatives (-1,-2,-3 etc) to -1
        for item in parsed_comments:
            if item[1] > 1:
                item[1] = 1
            elif item[1] < -1:
                item[1] = -1

        # throw out neutrals
        parsed_comments = [[self.preprocess(i[0]), i[1]]
                           for i in parsed_comments if i[1] != 0]

        return parsed_comments

    def train_classifier(self, comments):
        """
        Take the flagged comments and train a Naive Bayes classifier.

        Args:
          comments (list): The list of flagged comments to be trained
        """
        shuffle(comments)

        for comment in comments:
            v = Document(comment[0], type=int(comment[1]))
            self.nb.train(v)

    def predict(self, text):
        """
        Take a comment and makes a sentiment prediction.

        Args:
          text (str): The text string to be analyzed for sentiment

        Returns:
          int: 1 if positive sentiment, -1 if negative
        """
        return self.nb.classify(text)
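
# A hypothetical end-to-end run of the class above (assumption: 'comments.json'
# holds a JSON list of comment strings, as read_base_comments() expects):
es = Emoticon_Sentiment('comments.json')
es.read_base_comments()
es.train_classifier(es.parse_comments())
print es.predict('what a great video :)')  # -> 1 (positive)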
Example #8
from pattern.db import csv
from pattern.vector import NB, Document
from pattern.web import plaintext
from collections import Counter
import xml.etree.ElementTree as xmlTree

nb = NB()
wordStats = Counter()
opinionStats = Counter({'positive': 0, 'negative': 0, 'overall': 0})

for grade, opinion in csv('trainData.csv', separator='\t'):
    comment = Document(opinion, type=int(grade), stopwords=True)
    nb.train(comment)

tree = xmlTree.parse("Posts.xml")
root = tree.getroot()

for row in root:
    doc = Document(plaintext(row.attrib['Body']),
                   filter=lambda w: w.strip("'").isalpha() and len(w) > 1,
                   stopwords=False)
    opinion = nb.classify(doc)
    opinionStats['overall'] += 1
    if opinion > 0:
        opinionStats['positive'] += 1
    else:
        opinionStats['negative'] += 1
    wordStats += Counter(doc.words)

print wordStats.most_common(10)
print opinionStats


Example #9
# of known things. For these known things, we already know the type,
# and we have a description of each thing, called a "vector".
# A vector is just a Python dictionary of features and feature weights.

# ------------------------------------------------------------------------------------

# The simplest classification algorithm is Naive Bayes,
# but it works quite well with text.

# A trivial example with animal features:
nb = NB()
nb.train({"swim": 1, "fin": 2, "legs": 0, "wings": 0}, type="fish")
nb.train({"swim": 0, "fin": 0, "legs": 1, "wings": 2}, type="bird")
nb.train({"swim": 1, "fin": 0, "legs": 4, "wings": 0}, type="mammal")

print nb.classify({"legs": 4})
print

# ------------------------------------------------------------------------------------

# For text, usually the word order is discarded in favor of word count.
# This is called a "bag-of-words" model, i.e., we count each word in a
# document. We can then compare it to other documents to see if they
# have frequent words in common. If so, they probably belong to the
# same class or type.

# For example, if we have 10,000 product reviews + star rating,
# we can transform each review to a vector of adjective => count,
# use the star rating as vector type (e.g., * = -1, ***** = +1),
# and use this training set to predict the star rating of other reviews.
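
# A rough sketch of that idea (assumptions: pattern.en's part-of-speech tagger
# for the adjective filter, and a tiny inline list standing in for the 10,000
# real reviews):
from pattern.en import tag
from pattern.vector import NB, Document

nb = NB()
reviews = [("A great, funny movie!", 5), ("A dull, predictable plot.", 1)]
for text, stars in reviews:
    adjectives = " ".join(w for w, pos in tag(text) if pos.startswith("JJ"))
    nb.train(Document(adjectives, type=+1 if stars >= 3 else -1))
print nb.classify(Document("funny"))  # most likely +1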
Example #10
print "number of words:", len(m.vector)
print "number of words (average):", sum(len(d.features)
                                        for d in m.documents) / float(len(m))
print

# Train Naive Bayes on all documents.
# Each document has a type: True for actual e-mail, False for spam.
# This results in a "binary" classifier that either answers True or False
# for unknown documents.
classifier = NB()
for document in m:
    classifier.train(document)

# We can now ask it questions about unknown e-mails:

print classifier.classify("win money")  # False: most likely spam.
print classifier.classify("fix bug")  # True: most likely a real message.
print

print classifier.classify(
    "customer")  # False: people don't talk like this on developer lists...
print classifier.classify(
    "guys")  # True: because most likely everyone knows everyone.
print

# To test the accuracy of a classifier,
# we typically use 10-fold cross validation.
# This means that 10 individual tests are performed,
# each with 90% of the corpus as training data and 10% as testing data.
from pattern.vector import k_fold_cv
print k_fold_cv(NB, documents=m, folds=10)
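Example #11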
class NBClassifier:

    """
        This class interfaces a pattern corpus with a pattern.vector nb classifier

    """

    def __init__(self, corpus, **kargs):
        """
            Initializes the NBClassifier class with a corpus and an NB instance
            (input): corpus = a corpus of pattern Documents constructed from Grams
        """
        self.corpus = corpus
        self.documents = self.corpus.documents
        self.model = Model(documents=self.documents, weight='TF-IDF')
        #self.documents.words = self.documents.keywords
        self.split_idx = len(self) / 4
        self.nb = NB()

    def __len__(self):
        return len(self.documents)

    def classify_document(self, document):
        """
            classify document with nb instance
            (input):
                 document = Document instance with same format as classifier train set
            (output): classification result
        """
        return self.nb.classify(Document(document, stemmer=PORTER), discrete=True)

    def nb_train(self):
        """
            This function trains the classifier with (3/4) of the documents
            (input): None
            (output): trains self.nb
        """
        train_start = datetime.datetime.now()
        print "training with {0} docs".format(len(self) - self.split_idx)
        documents = self.documents[:-self.split_idx]
        random.shuffle(documents)
        for doc in documents:
            self.nb.train(doc)
        train_end = datetime.datetime.now()
        print "training {0} docs took {1} seconds".format(len(documents), (train_end - train_start).seconds)

    def nb_test(self, gold_standard=None):
        """
            Evaluates the classifier on (1/4) of the documents
            (input): None
            (output): accuracy, precision, recall, f1
        """
        if not gold_standard:
            test_docs = self.documents[-self.split_idx:]
            conf_matrix = self.nb.confusion_matrix(test_docs)
            print conf_matrix.table

            return self.nb.test(test_docs)
        else:
            return self.nb.test(gold_standard)

    def get_documentlabel_distribution(self):
        """
            Gets the label distribution of the document set passed into
            self.nb classifier
            (input): None
            (output):
                 distribution = dictionary representation of the label:frequency distribution
                 rankdistribution = a ranked list of label keys by frequency
        """

        distribution = self.nb.distribution
        rankdistribution = sorted(
            distribution,
            key=lambda x: distribution[x],
            reverse=True)
        for each, key in enumerate(rankdistribution[:10]):
            print "{0}:\t{1} ({2})".format(each, key, distribution[key])

        return distribution, rankdistribution

    def run(self):
        """
            executes training / testing of classifier
        """
        print "training and testing {0} total documents...".format(len(self))
        self.nb_train()
        print "testing"
        result = self.nb_test()
        print "finalizing"
        self.nb.finalize()
        return result
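
# A hypothetical driver for the class above: build a tiny labeled corpus and
# run it (assumes the remaining imports the class relies on, e.g. PORTER,
# datetime and random from the original module):
from pattern.vector import NB, Model, Document

corpus = Model([
    Document('the plot was dull and slow', type='neg'),
    Document('a funny, charming film', type='pos'),
    Document('boring characters, weak script', type='neg'),
    Document('great acting and a clever story', type='pos'),
])
clf = NBClassifier(corpus)
accuracy, precision, recall, f1 = clf.run()
print accuracy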
Example #12
#test SVM
testsvm = SVM(train=datadocs[:500])
print 'svm features = ', testsvm.features
saccuracy, sprecision, srecall, sf1 = testsvm.test(datadocs[500:])
print 'svm accuracy =', saccuracy


# Classifier training example with a test classification
nb2 = NB()
for review, rating in data:
    v = Document(review, type=int(rating))
    #print v.vector
    nb2.train(v)

print 'nb2 classes', nb2.classes
print 'test classification', nb2.classify(Document('A poor movie!'))

#cosine similarity example
from pattern.vector import Vector, distance
v1 = Vector({"curiosity": 1, "kill": 1, "cat": 1})
v2 = Vector({"curiosity": 1, "explore": 1, "mars": 1})
print 'cosine similarity between two vectors', 1 - distance(v1, v2)

# a model is a collection of Document objects

# TODO: now that we've built the SVM, create Documents out of dummy tweets
# TODO: then add them to the SVM classifier as training and test sets
# TODO: build a db schema to save tweets and relationships to the db
# TODO: get some sample Twitter data into the db
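Example #13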
class NBModel:
    def __init__(self):
        self.nb = NB()
        self.stats = Statistics()
        try:
            print("dir: " + os.getcwd())
            if os.getcwd().endswith("tv_ratings_frontend"):
                print("Working in django")
                self.nb = self.nb.load("ratings_frontend/backend/pattern_ml/nb_training.p")
            else:
                print("Not working in django")
                self.nb = self.nb.load("./nb_training.p")
            self.new_nb_model = True
            print("Using existing pickled model")
        except IOError:
            self.new_nb_model = False
            print("Creating new NB model")

    def nb_train_text(self, reviews):
        for review in reviews:
            if review.rating is not None:  # optionally: and 1 < review.rating < 10
                v = Document(review.text, type=int(review.rating), stopwords=True)
                self.nb.train(v)
        self.nb.save("./nb_training.p")  # persist once, after training completes

    def nb_train_summary(self, reviews):
        for review in reviews:
            if review.rating is not None:# and review.rating < 10 and review.rating > 1:
                v = Document(review.summary, type=int(review.rating), stopwords=True)
                self.nb.train(v)

    def nb_train_all_text(self, review_set):
        for review_list in review_set:
            self.nb_train_text(review_list)
        self.save_model()

    def save_model(self):
        self.nb.save('./nb_training.p')

    def nb_test_imdb(self, reviews):
        arr = []
        for review in reviews:
            if review.rating is not None:
                v = Document(self.review_to_words(review.text), type=int(review.rating), stopwords=True)
                arr.append(v)
        print(self.nb.test(arr, target=None))

    def nb_classify_tweets(self, tvshow, tweets):
        ratingSum = 0
        tweet_docs = [(self.nb.classify(Document(self.review_to_words(tweet))), self.review_to_words(tweet)) for tweet in tweets]
        for tweet in tweet_docs:
            ratingSum += tweet[0]
        self.nb_stats()
        Statistics().printStats(tvshow, ratingSum, len(tweet_docs))
        print(self.nb.distribution)

        return Statistics().get_stats(tvshow, ratingSum, len(tweet_docs))

    def nb_stats(self):
        print('----------- Classifier stats -----------')
        #  print("Features: ", self.nb.features)
        print("Classes: ", self.nb.classes)
        print("Skewness: ", self.nb.skewness)
        print("Distribution: ", self.nb.distribution)
        print("Majority: ", self.nb.majority)
        print("Minority: ", self.nb.minority)

    def review_to_words(self, raw_review):
        no_url = re.sub("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", "", raw_review)

        # Remove numerics
        letters_only = re.sub("[^a-zA-Z]", " ", no_url)

        # to lowercase
        words = letters_only.lower().split()

        # remove stop words - the, of , a ....
        stops = set(stopwords.words("english"))

        meaningful_words = [w for w in words if w not in stops]

        return " ".join(meaningful_words)
Example #14
print("number of words:", len(m.vector))
print("number of words (average):", sum(len(d.features)
                                        for d in m.documents) / float(len(m)))
print()

# Train Naive Bayes on all documents.
# Each document has a type: True for actual e-mail, False for spam.
# This results in a "binary" classifier that either answers True or False
# for unknown documents.
classifier = NB()
for document in m:
    classifier.train(document)

# We can now ask it questions about unknown e-mails:

print(classifier.classify("win money"))  # False: most likely spam.
print(classifier.classify("fix bug"))   # True: most likely a real message.
print()

# False: people don't talk like this on developer lists...
print(classifier.classify("customer"))
# True: because most likely everyone knows everyone.
print(classifier.classify("guys"))
print()

# To test the accuracy of a classifier,
# we typically use 10-fold cross validation.
# This means that 10 individual tests are performed,
# each with 90% of the corpus as training data and 10% as testing data.
from pattern.vector import k_fold_cv
print(k_fold_cv(NB, documents=m, folds=10))
Example #16
print "number of documents:", len(m)
print "number of words:", len(m.vector)
print "number of words (average):", sum(len(d.features) for d in m.documents) / float(len(m))
print

# Train Naive Bayes on all documents.
# Each document has a type: True for actual e-mail, False for spam.
# This results in a "binary" classifier that either answers True or False
# for unknown documents.
classifier = NB()
for document in m:
    classifier.train(document)

# We can now ask it questions about unknown e-mails:

print classifier.classify("win money") # False: most likely spam.
print classifier.classify("fix bug")   # True: most likely a real message.
print

print classifier.classify("customer")  # False: people don't talk like this on developer lists...
print classifier.classify("guys")      # True: because most likely everyone knows everyone.
print

# To test the accuracy of a classifier,
# we typically use 10-fold cross validation.
# This means that 10 individual tests are performed, 
# each with 90% of the corpus as training data and 10% as testing data.
from pattern.vector import k_fold_cv
print k_fold_cv(NB, documents=m, folds=10)

# This yields 4 scores: (Accuracy, Precision, Recall, F-score).
Example #17
print("number of words:", len(m.vector))
print("number of words (average):",
      sum(len(d.features) for d in m.documents) / float(len(m)))
print()

# Train Naive Bayes on all documents.
# Each document has a type: True for actual e-mail, False for spam.
# This results in a "binary" classifier that either answers True or False
# for unknown documents.
classifier = NB()
for document in m:
    classifier.train(document)

# We can now ask it questions about unknown e-mails:

print(classifier.classify("win money"))  # False: most likely spam.
print(classifier.classify("fix bug"))  # True: most likely a real message.
print()

# False: people don't talk like this on developer lists...
print(classifier.classify("customer"))
# True: because most likely everyone knows everyone.
print(classifier.classify("guys"))
print()

# To test the accuracy of a classifier,
# we typically use 10-fold cross validation.
# This means that 10 individual tests are performed,
# each with 90% of the corpus as training data and 10% as testing data.
from pattern.vector import k_fold_cv
print(k_fold_cv(NB, documents=m, folds=10))
Example #18
documents = []

for linea in result.readlines():
    document = Document(linea.split("/")[1], type=linea.split("/")[0])
    documents.append(document)

m = Model(documents)

nb = NB()

for document in m:
    nb.train(document)

print nb.classify("opinion")
print nb.classify("noticia")

from pattern.vector import k_fold_cv
print k_fold_cv(NB, documents=m, folds=10)

#for palabra in lines:
#	print d.get(palabra)
"""
1. Sacar bien los resultados 
	#HuracanPatricia
	#AguaEnMarte
2. Modificar poster de acuerdo a los resultados y enfocarse en el desarrollo 
"""