Example #1
def resolve_certainty(certainty_info):
    '''Resolve certainty with Naive Bayes'''
    if certainty_info == '':
        return 'No certainty info.'
    else:
        nb = NB()
        for observation, certainty in csv(
                'library/templatetags/c_training_data.csv'):
            v = Document(observation, type=int(certainty), stopwords=True)
            nb.train(v)
        return nb.classify(Document(certainty_info))
Example #2
def extractSentiment(characterSentences):
    """
    Trains a Naive Bayes classifier object with the reviews.csv file, analyzes
    the sentence, and returns the tone.
    """
    nb = NB()
    characterTones = defaultdict(list)
    for review, rating in csv("reviews.csv"):
        nb.train(Document(review, type=int(rating), stopwords=True))
    for key, value in characterSentences.items():
        for x in value:
            characterTones[key].append(nb.classify(str(x)))
    return characterTones
Example #3
def extractSentiment(characterSentences):
    """
    Trains a Naive Bayes classifier object with the reviews.csv file, analyzes
    the sentence, and returns the tone.
    """
    nb = NB()
    characterTones = defaultdict(list)
    for review, rating in csv("reviews.csv"):
        nb.train(Document(review, type=int(rating), stopwords=True))
    for key, value in characterSentences.iteritems():
        for x in value:
            characterTones[key].append(nb.classify(str(x)))
    return characterTones
Example #4
    def set_classifier(self):
        if self.name == 'SLP':
            return SLP(train=self.train_data, iterations=self.iterations)
        elif self.name == 'NB':
            return NB(train=self.train_data)
        else:
            print "Unknown classifier name"
Example #5
def classifyTweets(filename, trainingSet):
    print('Classifying {}...\n'.format(filename))
    data = open('{}/processed{}'.format(INPUT_PATH, filename.capitalize()), 'rb')
    reader = csv.reader(data)
    info = list(reader)

    classifier = NB(train=trainingSet, alpha=0.0001)

    tweets = []
    for row in info:
        tweet = row[0]
        result = classifier.classify(Document(tweet))
        tweets.append([tweet, result])

    # Write all tweets to file
    with open('{}/results.csv'.format(OUTPUT_PATH), 'wb+') as f:
        writer = csv.writer(f)
        writer.writerows(tweets)
Example #6
    def __init__(self):
        self.nb = NB()
        self.stats = Statistics()
        try:
            # self.nb = self.nb.load("./nb_training.p")
            self.new_nb_model = True
        except IOError:
            self.new_nb_model = False
            print("Creating new NB model")
class NBModel:
    def __init__(self):
        self.nb = NB()
        self.stats = Statistics()
        try:
            # self.nb = self.nb.load("./nb_training.p")
            self.new_nb_model = True
        except IOError:
            self.new_nb_model = False
            print("Creating new NB model")

    def naive_bayes_train(self, reviews):
        for review in reviews:
            if review.rating is not None and review.rating < 10 and review.rating > 1:
                v = Document(review.text, type=int(review.rating), stopwords=True)
                self.nb.train(v)
        # self.nb.save("./nb_training.p")
        # print self.nb.classes

    def nb_test_imdb(self, reviews):
        arr = []
        for review in reviews:
            if review.rating is not None:
                v = Document(review.text, type=int(review.rating), stopwords=True)
                arr.append(v)
        print self.nb.test(arr, target=None)

    def nb_classify_tweets(self, tvshow, tweets):
        ratingSum = 0
        tweet_docs = [(self.nb.classify(Document(tweet)), tweet) for tweet in tweets]
        for tweet in tweet_docs:
            ratingSum += tweet[0]
        self.nb_stats()
        Statistics().printStats(tvshow, ratingSum, len(tweet_docs))
        print self.nb.distribution

    def nb_stats(self):
        print('----------- Classifier stats -----------')
      #  print("Features: ", self.nb.features)
        print("Classes: ", self.nb.classes)
        print("Skewness: ", self.nb.skewness)
        print("Distribution: ", self.nb.distribution)
        print("Majority: ", self.nb.majority)
        print("Minority: ", self.nb.minority)
    def __init__(self, corpus, **kargs):
        """
        Initializes the NBClassifier class with a corpus and a NB instance
        (input): corpus = a corpus of pattern Documents constructed from Grams
        """
        self.corpus = corpus
        self.documents = self.corpus.documents
        self.model = Model(documents=self.documents, weight='TF-IDF')
        # self.documents.words = self.documents.keywords
        self.split_idx = len(self) / 4
        self.nb = NB()
Example #9
class Classifications():

    #static variables
    _category_path = os.path.join(os.path.dirname(__file__),
                                  "classifiers/category.slp")
    _rating_path = os.path.join(os.path.dirname(__file__),
                                "classifiers/rating.slp")
    _rating_nlp_path = os.path.join(os.path.dirname(__file__),
                                    "classifiers/rating_nlp.svm")
    _sentiment_path = os.path.join(os.path.dirname(__file__),
                                   "classifiers/sentiment.nb")

    _category = SLP.load(_category_path)
    _rating = SLP.load(_rating_path)
    _rating_nlp = SVM.load(_rating_nlp_path)
    _sentiment = NB.load(_sentiment_path)

    @staticmethod
    def selectWords(review):
        '''
        a function that gets a review and selects the nouns, adjectives, verbs and exclamation marks
        '''
        review = parsetree(review, lemmata=True)[0]  #lemmatize the review
        #select adjectives (JJ), nouns (NN), verbs (VB) and exclamation marks
        review = [
            w.lemma for w in review
            if w.tag.startswith(('JJ', 'NN', 'VB', '!'))
        ]
        review = count(review)  #a dictionary of (word, count)
        return review

    @staticmethod
    def classify(text):
        predicted_category = Classifications._category.classify(Document(text),
                                                                discrete=True)
        predicted_rate = Classifications._rating.classify(Document(text),
                                                          discrete=True)
        predicted_rate_nlp = Classifications._rating_nlp.classify(
            Classifications.selectWords(text), discrete=True)
        predicted_sentiment_dict = Classifications._sentiment.classify(
            Classifications.selectWords(text), discrete=False)
        predicted_sentiment = str(
            sorted(predicted_sentiment_dict.items(),
                   key=operator.itemgetter(1),
                   reverse=True)[1][0]) in ['True', '3.0', '4.0', '5.0']

        return {
            'text': text,
            'rate': predicted_rate,
            'category': predicted_category,
            'rate_nlp': predicted_rate_nlp,
            'positivity': predicted_sentiment
        }
Example #10
    def __init__(self):
        self.nb = NB()
        self.stats = Statistics()
        try:
            print("dir: " + os.getcwd())
            if os.getcwd().endswith("tv_ratings_frontend"):
                print("Working in django")
                self.nb = self.nb.load("ratings_frontend/backend/pattern_ml/nb_training.p")
            else:
                print("Not working in django")
                self.nb = self.nb.load("./nb_training.p")
            self.new_nb_model = True
            print("Using existing pickled model")
        except IOError:
            self.new_nb_model = False
            print("Creating new NB model")
Example #11
def normal_test(data, type):
    print '----------------------------------------------------'
    print 'TEST FUNCTION STARTED FOR ' + type + '!'
    total_data_size = len(data)
    training_size = int(round(total_data_size/2))
    test_size = training_size
    print 'Total Size: ' + str(total_data_size)
    print 'Training Size: ' + str(training_size)
    print 'Test Size: ' + str(test_size)

    print 'Training Started for ' + type + '!'
    classification_methods = {
      #uncomment based on what classification algorithm you would like to test
      'NB' :  NB(train=data[:training_size], baseline=MAJORITY, method=MULTINOMIAL),
      'KNN2' : KNN(train=data[:training_size], baseline=MAJORITY, k=2, distance=COSINE),
      'KNN3' : KNN(train=data[:training_size], baseline=MAJORITY, k=3, distance=COSINE),
      'KNN4' : KNN(train=data[:training_size], baseline=MAJORITY, k=4, distance=COSINE),
      'KNN5' : KNN(train=data[:training_size], baseline=MAJORITY, k=5, distance=COSINE),
      'KNN6' : KNN(train=data[:training_size], baseline=MAJORITY, k=6, distance=COSINE),
      'KNN7' : KNN(train=data[:training_size], baseline=MAJORITY, k=7, distance=COSINE),
      'KNN8' : KNN(train=data[:training_size], baseline=MAJORITY, k=8, distance=COSINE),
      'KNN9' : KNN(train=data[:training_size], baseline=MAJORITY, k=9, distance=COSINE),
      'KNN10' : KNN(train=data[:training_size], baseline=MAJORITY, k=10, distance=COSINE),
      'SLP1' : SLP(train=data[:training_size], baseline=MAJORITY, iterations=1),
      'SLP2' : SLP(train=data[:training_size], baseline=MAJORITY, iterations=2),
      'SLP3' : SLP(train=data[:training_size], baseline=MAJORITY, iterations=3),
      'SVM' : SVM(train=data[:training_size], type=CLASSIFICATION, kernel=POLYNOMIAL),
    }

    print 'Normal Testing Started!'
    # uncomment to start the normal test
    for classification in classification_methods.keys():
      #measure the time it takes to classify!
      start = timeit.default_timer()
      #normal test
      accuracy, precision, recall, f1 = classification_methods[classification].test(data[training_size:training_size+test_size])
      stop = timeit.default_timer()
      print '*' + classification + '*'
      print 'Accuracy: ' + str(accuracy)
      print 'Precision: ' + str(precision)
      print 'Recall: ' + str(recall)
      print 'F1-score: ' + str(f1)
      print 'Time: ' + str(stop - start)
      print
Example #12
    def __init__(self, basefile):
        """
        Initialize the emoticon sentiment. Setup variables.

        Args:
          basefile (str): The location of the comments file that should serve
          as the basis for the classifier.
        """
        self.base = basefile
        self.comments = []
        self.nb = NB()
        self.stop = stopwords.words('english')
        self.positive = [':-)', ':)', ':D', ':o)', ':]', ':3', ':c)', ':>',
                         '=]', '8)', '=)', ':}', ':^)', ':っ)', ':-D', '8-D',
                         '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D', '=-3',
                         '=3', 'B^D', ':-))', ':*', ':^*', ';-)', ';)', '*-)',
                         '*)', ';-]', ';]', ';D', ';^)']
        self.negative = ['>:[', ':-(', ':(', ':-c', ':c', ':-<', ':っC',
                         ':<', ':-[', ':[', ':{', ':-||', ':@', '>:(', 'D:<',
                         'D:', 'D8', 'D;', 'D=', 'DX', 'v.v', '>:)', '>;)',
                         '>:-)', '}:-)', '}:)', '3:-)', '3:)', ':-###..',
                         ':###..', '>:/', ':-/', ':-.', ':/']
Example #13
def learnCategories(tn):
    nb = NB()
    for (cat, content) in csv('resource/%s.catdata' % tn,
                              separator=';',
                              headers=True):
        if not cat or not content: continue
        t = cat  # toASCII(cat)
        v = Document(content,
                     type=t,
                     stemmer=None,
                     stopwords=False,
                     language='fr')
        nb.train(v)
    # cr = csv('resource/%s.catdata' % tn, separator = ';', headers = True)
    # for (i, r) in enumerate(cr):
    # 	v = Document(str(i), type = r[0], stemmer = None, stopwords = False, language = 'fr')
    # 	nb.train(v)
    logging.info('TRAINED %s on %d categories', tn, len(nb.classes))
    nb.save('resource/%s.classifier' % tn)
Example #15
documents = []
for score, message in data:
    document = Document(message, type=int(score) > 0)
    documents.append(document)
m = Model(documents)

print("number of documents:", len(m))
print("number of words:", len(m.vector))
print("number of words (average):", sum(len(d.features)
                                        for d in m.documents) / float(len(m)))
print()

# Train Naive Bayes on all documents.
# Each document has a type: True for actual e-mail, False for spam.
# This results in a "binary" classifier that either answers True or False
# for unknown documents.
classifier = NB()
for document in m:
    classifier.train(document)

# We can now ask it questions about unknown e-mails:

print(classifier.classify("win money"))  # False: most likely spam.
print(classifier.classify("fix bug"))   # True: most likely a real message.
print()

# False: people don't talk like this on developer lists...
print(classifier.classify("customer"))
# True: because most likely everyone knows everyone.
print(classifier.classify("guys"))
print()
Example #16
    for concept, w1 in m.lsa.vectors[d.id].items():
        for feature, w2 in m.lsa.concepts[concept].items():
            if w1 != 0 and w2 != 0:
                print(feature, w1 * w2)
# clustering
d1 = Document('Cats are independent pets.', name='cat')
d2 = Document('Dogs are trustworthy pets.', name='dog')
d3 = Document('Boxes are made of cardboard.', name='box')
m = Model((d1, d2, d3))
print m.cluster(method=HIERARCHICAL, k=2)
# hierarchical clustering
cluster = Cluster((1, Cluster((2, Cluster((3, 4))))))
print cluster.depth
print cluster.flatten(1)
# training a classifier
nb = NB()
for review, rating in csv('data/input/reviews.csv'):
    v = Document(review, type=int(rating), stopwords=True)
    nb.train(v)
print nb.classes
print nb.classify(Document('A good movie!'))
# testing a classifier
data = csv('data/input/reviews.csv')
data = [(review, int(rating)) for review, rating in data]
data = [
    Document(review, type=rating, stopwords=True) for review, rating in data
]
nb = NB(train=data[:500])
accuracy, precision, recall, f1 = nb.test(data[500:])
print accuracy
# binary classification
Example #17
class NBClassifier:

    """
        This class interfaces a pattern corpus with a pattern.vector nb classifier

    """

    def __init__(self, corpus, **kargs):
        """
            Initializes the NBClassifier class with a corpus and a NB instance
            (input): corpus = a corpus of pattern Documents constructed from Grams
        """
        self.corpus = corpus
        self.documents = self.corpus.documents
        self.model = Model(documents=self.documents, weight='TF-IDF')
        #self.documents.words = self.documents.keywords
        self.split_idx = len(self) / 4
        self.nb = NB()

    def __len__(self):
        return len(self.documents)

    def classify_document(self, document):
        """
            classify document with nb instance
            (input):
                 document = Document instance with same format as classifier train set
            (output): classification result
        """
        return (
            self.nb.classify(Document(document, stemmer=PORTER), discrete=True)
        )

    def nb_train(self):
        """
            This function trains the classifier with (3/4) of the documents
            (input): None
            (output): trains self.nb
        """
        train_start = datetime.datetime.now()
        print "training with {0} docs".format(len(self) - self.split_idx)
        documents = self.documents[:-self.split_idx]
        random.shuffle(documents)
        for doc in documents:
            self.nb.train(doc)
        train_end = datetime.datetime.now()
        print "training {0} docs took {1} seconds".format(len(documents), (train_end - train_start).seconds)

    def nb_test(self, gold_standard=None):
        """
            Evaluates the classifier on (1/4) of the documents
            (input): None
            (output): accuracy, precision, recall, f1
        """
        if not gold_standard:
            test_docs = self.documents[-self.split_idx:]
            conf_matrix = self.nb.confusion_matrix(test_docs)
            print conf_matrix.table

            return self.nb.test(test_docs)
        else:
            return self.nb.test(gold_standard)

    def get_documentlabel_distribution(self):
        """
            Gets the label distribution of the document set passed into
            self.nb classifier
            (input): None
            (output):
                 distribution = dictionary representation of the label:frequency distribution
                 rankdistribution = a ranked list of label keys by frequency
        """

        distribution = self.nb.distribution
        rankdistribution = sorted(
            distribution,
            key=lambda x: distribution[x],
            reverse=True)
        for each, key in enumerate(rankdistribution[:10]):
            print "{0}:\t{1} ({2})".format(each, key, distribution[key])

        return distribution, rankdistribution

    def run(self):
        """
            executes training / testing of classifier
        """
        print "training and testing {0} total documents...".format(len(self))
        self.nb_train()
        print "testing"
        result = self.nb_test()
        print "finalizing"
        self.nb.finalize()
        return result
Example #18
class Emoticon_Sentiment:

    """
    Use Emoticon sentiment to classify and predict comments.

    This class provides a binary sentiment analyzer (1 = positive, -1 =
    negative). It is automatically created using emoticons from thousands of
    comments.
    """

    def __init__(self, basefile):
        """
        Initialize the emoticon sentiment. Setup variables.

        Args:
          basefile (str): The location of the comments file that should serve
          as the basis for the classifier.
        """
        self.base = basefile
        self.comments = []
        self.nb = NB()
        self.stop = stopwords.words('english')
        self.positive = [':-)', ':)', ':D', ':o)', ':]', ':3', ':c)', ':>',
                         '=]', '8)', '=)', ':}', ':^)', ':っ)', ':-D', '8-D',
                         '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D', '=-3',
                         '=3', 'B^D', ':-))', ':*', ':^*', ';-)', ';)', '*-)',
                         '*)', ';-]', ';]', ';D', ';^)']
        self.negative = ['>:[', ':-(', ':(', ':-c', ':c', ':-<', ':っC',
                         ':<', ':-[', ':[', ':{', ':-||', ':@', '>:(', 'D:<',
                         'D:', 'D8', 'D;', 'D=', 'DX', 'v.v', '>:)', '>;)',
                         '>:-)', '}:-)', '}:)', '3:-)', '3:)', ':-###..',
                         ':###..', '>:/', ':-/', ':-.', ':/']

    def read_base_comments(self):
        """
        Read the base comments.

        Read the file that loads the comments that serve as the base for the
        classifier.

        Returns:
          list: list of comments from the text file
        """
        f = open(self.base, 'r')
        self.comments = json.load(f)
        f.close()
        return self.comments

    def preprocess(self, comment):
        """
        Preprocess comment.

        Take a comment and pre-processes it (all to lowercase, remove links,
        delete punctuation, remove stopwords and numbers)

        Args:
          comment (str): The text string to be processed

        Returns:
          str: the processed text string
        """
        h = HTMLParser.HTMLParser()
        text = h.unescape(comment.lower())
        p = re.compile(r"(\b(?:(?:https?|ftp|file)://|www\.|ftp\.)"
                       "[-A-Za-z0-9+&@#/%?=~_()|!:,.;]*"
                       "[-A-Za-z0-9+&@#/%=~_()|])")
        text = re.sub(p, '', text)
        exclude = set(string.punctuation)
        tokens = word_tokenize(text)
        cleaned = [token for token in tokens if token not in exclude]
        text = ' '.join([w.encode('ascii', errors='ignore') for w in cleaned
                        if w.lower() not in self.stop and not w.isdigit()])
        return text

    def parse_comments(self):
        """
        Parse comment, check for emoticon, remove neutrals.

        Parse a list of comments, by searching for emoticons. Once an emoticon
        is found, it flags it positive or negative.
        With multiple finds, it sums their value; neutrals are thrown out.

        Returns:
          list: 2-dimensional list of parsed comments and their sentiment flags
        """
        extended_comments = [[self.comments[i], 0, False]
                             for i in range(len(self.comments))]

        # loop through comments and search for emoticons: +1 if positive, -1
        # if negative, and remove emoticons from text
        for comment in extended_comments:

            for n in self.negative:
                if comment[0].encode('utf-8').find(n) > -1:
                    comment[1] = comment[1]-1
                    comment[0] = comment[0].replace(n, '')
                    comment[2] = True

            for p in self.positive:
                if comment[0].encode('utf-8').find(p) > -1:

                    comment[1] = comment[1]+1
                    comment[0] = comment[0].replace(p, '')
                    comment[2] = True

        # throw out comments that have no emoticons
        parsed_comments = [[comment[0], comment[1]] for comment
                           in extended_comments if comment[2]]

        # normalize the values by converting all positives (1,2,3,etc) to 1,
        # and all negatives (-1,-2,-3 etc) to -1
        for item in parsed_comments:
            if item[1] > 1:
                item[1] = 1
            elif item[1] < -1:
                item[1] = -1

        # throw out neutrals
        parsed_comments = [[self.preprocess(i[0]), i[1]]
                           for i in parsed_comments if i[1] != 0]

        return parsed_comments

    def train_classifier(self, comments):
        """
        Take the flagged comments and train a Naive Bayes classifier.

        Args:
          comments (list): The list of flagged comments to be trained
        """
        shuffle(comments)

        for comment in comments:
            v = Document(comment[0], type=int(comment[1]))
            self.nb.train(v)

    def predict(self, text):
        """
        Take a comment and makes a sentiment prediction.

        Args:
          text (str): The text string to be analyzed for sentiment

        Returns:
          int: 1 if positive sentiment, -1 if negative
        """
        return self.nb.classify(text)
Example #19
class NBModel:
    def __init__(self):
        self.nb = NB()
        self.stats = Statistics()
        try:
            print("dir: " + os.getcwd())
            if os.getcwd().endswith("tv_ratings_frontend"):
                print("Working in django")
                self.nb = self.nb.load("ratings_frontend/backend/pattern_ml/nb_training.p")
            else:
                print("Not working in django")
                self.nb = self.nb.load("./nb_training.p")
            self.new_nb_model = True
            print("Using existing pickled model")
        except IOError:
            self.new_nb_model = False
            print("Creating new NB model")

    def nb_train_text(self, reviews):
        for review in reviews:
            if review.rating is not None:  # and review.rating < 10 and review.rating > 1:
                v = Document(review.text, type=int(review.rating), stopwords=True)
                self.nb.train(v)
        # persist the trained model once, after the loop
        self.nb.save("./nb_training.p")
        # print self.nb.classes

    def nb_train_summary(self, reviews):
        for review in reviews:
            if review.rating is not None:  # and review.rating < 10 and review.rating > 1:
                v = Document(review.summary, type=int(review.rating), stopwords=True)
                self.nb.train(v)

    def nb_train_all_text(self, review_set):
        for review_list in review_set:
            self.nb_train_text(review_list)
        self.save_model()  # persist via this class's save helper

    def save_model(self):
        self.nb.save('./nb_training.p')

    def nb_test_imdb(self, reviews):
        arr = []
        for review in reviews:
            if review.rating is not None:
                v = Document(self.review_to_words(review.text), type=int(review.rating), stopwords=True)
                arr.append(v)
        print self.nb.test(arr, target=None)

    def nb_classify_tweets(self, tvshow, tweets):
        ratingSum = 0
        tweet_docs = [(self.nb.classify(Document(self.review_to_words(tweet))), self.review_to_words(tweet)) for tweet in tweets]
        for tweet in tweet_docs:
            ratingSum += tweet[0]
            # print tweet
        self.nb_stats()
        Statistics().printStats(tvshow, ratingSum, len(tweet_docs))
        print self.nb.distribution

        return Statistics().get_stats(tvshow, ratingSum, len(tweet_docs))

    def nb_stats(self):
        print('----------- Classifier stats -----------')
        #  print("Features: ", self.nb.features)
        print("Classes: ", self.nb.classes)
        print("Skewness: ", self.nb.skewness)
        print("Distribution: ", self.nb.distribution)
        print("Majority: ", self.nb.majority)
        print("Minority: ", self.nb.minority)

    def review_to_words(self, raw_review):
        no_url = re.sub("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", "", raw_review)

        # Remove numerics
        letters_only = re.sub("[^a-zA-Z]", " ", no_url)

        # to lowercase
        words = letters_only.lower().split()

        # remove stop words - the, of , a ....
        stops = set(stopwords.words("english"))

        meaningful_words = [w for w in words if not w in stops]

        return (" ".join(meaningful_words))
Example #20
"""
views imports app, auth, and models, but none of these import views
"""
from flask import render_template, redirect, request, url_for, jsonify
from flask.ext.classy import FlaskView

from app import app
from auth import auth
from models import User

# Classifier, CSV Loading
from pattern.vector import Document, NB
from pattern.db import Datasheet

# Load classifier
nb = NB.load("project/data/amazonClassifier")

@app.route('/classify', methods=['POST'])
def classify_review():
    text = request.form.get('text')
    return jsonify(result=nb.classify(text.strip()))


class BaseView(FlaskView):
    '''Basic views, such as the home and about page.'''
    route_base = '/'

    def index(self):
        return render_template('home.html')


BaseView.register(app)
Example #21
import xml.etree.ElementTree as xmlTree
from pattern.vector import Document, NB, count, words
from pattern.web import plaintext
from pattern.db import csv
from collections import Counter

nb = NB()
wordStats = Counter()
opinionStats = Counter({'positive': 0, 'negative': 0, 'overall': 0})

for grade, opinion in csv('trainData.csv', separator='\t'):
    comment = Document(opinion, type=int(grade), stopwords=True)
    nb.train(comment)

tree = xmlTree.parse("Posts.xml")
root = tree.getroot()

for row in root:
    doc = Document(plaintext(row.attrib['Body']),
                   filter=lambda w: w.strip("'").isalpha() and len(w) > 1,
                   stopwords=False)
    opinion = nb.classify(doc)
    opinionStats['overall'] += 1
    if opinion > 0:
        opinionStats['positive'] += 1
    else:
        opinionStats['negative'] += 1
    wordStats += Counter(doc.words)

print wordStats.most_common(10)
print opinionStats
Example #22
from pattern.vector import NB, count, words, chngrams
from pattern.vector import kfoldcv, fsel

# Statistical machine learning is a branch of AI that can be used
# to learn the "type" of unknown things, based on a "training set"
# of known things. For these known things, we already know the type,
# and we have a description of each thing, called a "vector".
# A vector is just a Python dictionary of features and feature weights.

# ------------------------------------------------------------------------------------

# The simplest classification algorithm is Naive Bayes,
# but it works quite well with text.

# A trivial example with animal features:
nb = NB()
nb.train({"swim": 1, "fin": 2, "legs": 0, "wings": 0}, type="fish")
nb.train({"swim": 0, "fin": 0, "legs": 1, "wings": 2}, type="bird")
nb.train({"swim": 1, "fin": 0, "legs": 4, "wings": 0}, type="mammal")

print nb.classify({"legs": 4})
print

# ------------------------------------------------------------------------------------

# For text, usually the word order is discarded in favor of word count.
# This is called a "bag-of-words" model, i.e., we count each word in a
# document. We can then compare it to other documents to see if they
# have frequent words in common. If so, they probably belong to the
# same class or type.
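
# ------------------------------------------------------------------------------------

# A minimal bag-of-words sketch of that idea (the two sentences below are
# made-up illustrations): count() and words(), imported above, turn raw
# text into the {word: count} dictionaries that NB consumes, and documents
# that share frequent words are probably of the same type.
v1 = count(words("the cat sat on the mat"))
v2 = count(words("the dog sat on the log"))
print v1                                # bag-of-words dict, e.g. {'cat': 1, 'sat': 1, 'mat': 1}
print sorted(w for w in v1 if w in v2)  # words the two documents share
print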
Example #25
from pattern.vector import Document, NB, SVM
from pattern.db import Datasheet

data = Datasheet.load('reviews.csv', headers=True)
print data
datalist = [(review, int(rating)) for review, rating in data]
print datalist
datadocs = [Document(review, type=rating, stopwords=True) for review, rating
            in datalist]
print datadocs

#naive Bayes
#training set
nb = NB(train=datadocs[:500])
print 'nb distribution = ', nb.distribution
print 'nb confusion matrix = ', nb.confusion_matrix(datadocs[500:])
print 'nb confusion matrix for each class = ', nb.confusion_matrix(datadocs[500:])(True) # (TP, TN, FP, FN)
print 'nb features = ', nb.features
#test set
accuracy, precision, recall, f1 = nb.test(datadocs[500:])
print 'nb accuracy = ', accuracy, 'nb precision = ', precision, \
    'nb recall = ', recall

#test SVM
testsvm = SVM(train=datadocs[:500])
print 'svm features = ', testsvm.features
saccuracy, sprecision, srecall, sf1 = testsvm.test(datadocs[500:])
print 'svm accuracy =', saccuracy


#classifier training example with test classification
Example #26
documents = []
for score, message in data:
    document = Document(message, type=int(score) > 0)
    documents.append(document)
m = Model(documents)

print "number of documents:", len(m)
print "number of words:", len(m.vector)
print "number of words (average):", sum(len(d.features) for d in m.documents) / float(len(m))
print

# Train Naive Bayes on all documents.
# Each document has a type: True for actual e-mail, False for spam.
# This results in a "binary" classifier that either answers True or False
# for unknown documents.
classifier = NB()
for document in m:
    classifier.train(document)

# We can now ask it questions about unknown e-mails:

print classifier.classify("win money") # False: most likely spam.
print classifier.classify("fix bug")   # True: most likely a real message.
print

print classifier.classify("customer")  # False: people don't talk like this on developer lists...
print classifier.classify("guys")      # True: because most likely everyone knows everyone.
print

# To test the accuracy of a classifier,
# we typically use 10-fold cross validation.
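
# A minimal sketch of that test, reusing the documents list built above:
# kfoldcv() from pattern.vector retrains the classifier on 10 different
# train/test splits and returns the averaged scores.
from pattern.vector import kfoldcv
print kfoldcv(NB, documents=documents, folds=10)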
Example #28
result.close()

result = open('result.txt', "r")

documents = []

for linea in result.readlines():
    document = Document(linea.split("/")[1], type=linea.split("/")[0])
    documents.append(document)

m = Model(documents)

nb = NB()

for document in m:
    nb.train(document)

print nb.classify("opinion")
print nb.classify("noticia")

from pattern.vector import kfoldcv
print kfoldcv(NB, documents=m, folds=10)

# for palabra in lines:
#     print d.get(palabra)

"""
1. Get the results out properly