Пример #1
0
    def upload_csv_button(self):
        """Ask the user for a file, read it, and train a new classifier.

        Opens a file-selection dialog and stores the chosen path on
        ``self.file_path``. The file is read, ``docclass.getwords`` is called
        on the ``repr`` of its contents, and a ``docclass.classifier`` is
        built and passed to ``docclass.sampletrain``.

        If the dialog is dismissed without choosing a file, the method
        returns without reading or training anything.
        """
        self.file_path = tkFileDialog.askopenfilename(
            initialdir='/',
            title='Select file',
            filetypes=(('csv files', '*.csv'), ('all files', '*.*')))

        # A cancelled dialog yields an empty value; opening it would raise.
        if not self.file_path:
            return

        with open(self.file_path) as f:
            contents = f.read()

        # NOTE(review): the result of this call is discarded and ``contents``
        # is never handed to the classifier below -- confirm whether the file
        # was meant to be used as training data.
        docclass.getwords(repr(contents))

        trained = docclass.classifier(docclass.getwords)
        docclass.sampletrain(trained)
Пример #2
0
def get_article_words_count(feedlist):
    """Count word occurrences per article and across all articles.

    Each feed in ``feedlist`` is parsed with ``feedparser.parse``. Entries
    whose title has already been seen are skipped.

    Args:
        feedlist: iterable of feed locations accepted by ``feedparser.parse``.

    Returns:
        A tuple ``(allwords, articlewords, articletitles)``:
          * ``allwords`` -- dict mapping word -> total count over all articles
          * ``articlewords`` -- list of dicts, one per article, word -> count
          * ``articletitles`` -- list of titles, where ``articletitles[i]``
            is the title of the article counted in ``articlewords[i]``
    """
    allwords = {}
    articlewords = []
    # Titles are kept in an ordered list so that they stay aligned with
    # ``articlewords``; the set is used only for duplicate detection.
    articletitles = []
    seen_titles = set()

    for feed in feedlist:
        parsed = feedparser.parse(feed)
        for entry in parsed.entries:
            # ignore identical articles
            if entry.title in seen_titles:
                continue

            # Separate title and description with a space so the last title
            # word and the first description word are not fused together.
            txt = (entry.title.encode('utf-8') + ' ' +
                   stripHTML(entry.description.encode('utf-8')))

            counts = {}
            for word in getwords(txt):
                allwords[word] = allwords.get(word, 0) + 1
                counts[word] = counts.get(word, 0) + 1

            seen_titles.add(entry.title)
            articletitles.append(entry.title)
            articlewords.append(counts)

    return allwords, articlewords, articletitles
Пример #3
0
def parse_netflix_data(feed_file, con, predicted):
    """Turn feed entries into labelled sparse ``index:count`` feature rows.

    Args:
        feed_file: feed source passed to ``feedparser.parse``.
        con: database connection exposing ``execute``; the ``fc`` table
            supplies the feature terms (first column of each row) and the
            ``entry`` table supplies ``actual_category`` by title.
        predicted: category compared against each entry's actual category.

    Returns:
        A tuple ``(titles, features, data)``. ``titles`` holds the title of
        each entry, ``features`` is the list of feature terms read from
        ``fc``, and ``data`` holds one space-separated string per entry:
        a leading ``+1`` (actual category equals ``predicted``) or ``-1``
        (otherwise, including when no category is stored), followed by
        ``<feature index>:<word count>`` pairs for words that are features.
    """
    # Get all features from the database
    cur = con.execute('select * from fc')
    features = [row[0] for row in cur]

    # Position lookup built once; setdefault keeps the first position of a
    # repeated term, which is what list.index() would have returned.
    feature_index = {}
    for position, feature in enumerate(features):
        feature_index.setdefault(feature, position)

    titles = []
    data = []

    for entry in feedparser.parse(feed_file)['entries']:
        title = entry['title'] if 'title' in entry else ''
        publisher = entry['publisher'] if 'publisher' in entry else ''
        summary = entry['summary'] if 'summary' in entry else ''

        fulltext = '{} {} {}'.format(title, publisher, summary)

        # Calculate word count
        wc = {}
        for w in docclass.getwords(fulltext):
            wc[w] = wc.get(w, 0) + 1

        # Convert wc to colon-separated index:value, features only
        arr_wc = ['{}:{}'.format(feature_index[term], value)
                  for term, value in wc.items()
                  if term in feature_index]

        # fetchone() returns None when no row matches the title; treat that
        # the same as an empty category instead of failing on unpacking.
        row = con.execute(
            'select actual_category from entry where title =?',
            (title,)).fetchone()
        actual = row[0] if row else None

        # Label (+1 or -1) goes in the first column
        label = '+1' if actual and actual == predicted else '-1'
        arr_wc.insert(0, label)

        titles.append(title)
        # Join array into space separated value
        data.append(' '.join(arr_wc))

    return (titles, features, data)
Пример #4
0
    def analysis(self, event):
        """Report prediction accuracy for the genre selected in ``listbox2``.

        Clears ``self.text``, then for every book stored under the selected
        genre in ``self.BooksDATA`` sums ``self.clasifier.fprob(word, cat,
        default_prob=0)`` over the book's words for each category and takes
        the category with the highest non-zero sum as the prediction. The
        genre name, book count, correct/wrong counts and accuracy percentage
        are written to ``self.text``; the list of predictions is printed.

        A book for which no category scores above zero is counted as a wrong
        prediction. Nothing further happens if no genre is selected.

        Args:
            event: the Tk event that triggered the callback (unused).
        """
        self.text.delete(1.0, END)
        index = self.listbox2.curselection()
        # curselection() is empty when nothing is selected; get() would fail.
        if not index:
            return
        choice = self.listbox2.get(index)
        books = self.BooksDATA[choice]
        total = len(books)
        self.text.insert(END, "Genre: " + choice + '\n')
        self.text.insert(END, "Total Number Of Books: " + str(total))

        predictedGenre = []
        for book in books:
            self.selectedBook = book[1]
            # Materialised once per book so it can be re-iterated for every
            # category regardless of what kind of iterable getwords returns.
            words = list(docclass.getwords(self.selectedBook))
            estimates = []
            for cat in self.clasifier.categories():
                prob = 0
                for word in words:
                    prob += self.clasifier.fprob(word, cat, default_prob=0)
                if prob != 0:
                    estimates.append([cat, prob])
            if estimates:
                # max() returns the first of equal-scoring categories, the
                # same element a stable descending sort would put first.
                best = max(estimates, key=lambda pair: pair[1])
            else:
                # No category matched any word: record an explicit miss
                # instead of indexing into an empty list.
                best = [None, 0]
            predictedGenre.append(best)
        print(predictedGenre)

        wrong = sum(1 for guess in predictedGenre if guess[0] != choice)
        right = total - wrong
        # Guard against a genre with no books (division by zero).
        percent = (float(right) / total) * 100 if total else 0.0

        self.text.insert(
            END, '\nCorrectly predicted Books: ' + str(right) + '\n')
        self.text.insert(
            END, 'Wrongly Predicted Books: ' + str(wrong) + '\n')
        self.text.insert(END, 'Accuracy: ' + str(percent))
Пример #5
0
 def testStripDuplicates(self):
   """getwords should yield each distinct word once for repeated input."""
   # assertEqual replaces the deprecated assertEquals alias.
   self.assertEqual(['mail', 'spam'],
       sorted(docclass.getwords('spam mail spam')))
Пример #6
0
    def listClicked(self, event):
        """Show the top category estimates for the book picked in listBox1.

        The three result labels are reset first. The selected list entry is
        split on ``'-'``: the second field (minus its first character)
        becomes ``self.selectedBook`` and the third field is the expected
        category. For each classifier category the values of
        ``self.clasifier.fprob(word, cat, default_prob=0)`` are summed over
        the book's words; categories with a non-zero sum are ranked in
        descending order and up to three are shown on ``labelp1``..``labelp3``
        (green when the category equals the expected one, red otherwise).
        The full ranking is printed when the top estimate is not the
        expected category; ``'wait what'`` is printed when nothing scored.

        Nothing further happens if no list entry is selected.

        Args:
            event: the Tk event that triggered the callback (unused).
        """
        labels = (self.labelp1, self.labelp2, self.labelp3)
        for label in labels:
            label.config(text=' ', bg='wheat2')

        index = self.listBox1.curselection()
        # curselection() is empty when nothing is selected; index[0] would
        # raise IndexError.
        if not index:
            return

        fields = self.listBox1.get(index[0]).split('-')
        catt = fields[2]
        # Drop the first character of the title field, as before.
        self.selectedBook = fields[1][1:]

        # Materialised once so it can be re-iterated for every category.
        words = list(docclass.getwords(self.selectedBook))
        estimates = []
        for cat in self.clasifier.categories():
            prob = 0
            for word in words:
                prob += self.clasifier.fprob(word, cat, default_prob=0)
            if prob != 0:
                estimates.append([cat, prob])

        estimates = sorted(estimates, key=lambda x: x[1], reverse=True)
        if not estimates:
            print('wait what')
            return

        # zip() stops at the shorter input, so at most three labels are
        # filled and unused labels keep their reset state.
        for rank, (label, (cat, prob)) in enumerate(zip(labels, estimates)):
            matched = cat == catt
            if rank == 0 and not matched:
                print(estimates)
            label.config(text=str(cat) + " -->" + str(prob),
                         bg='green' if matched else 'red')
Пример #7
0
 def test_incf(self):
     """After training on one sentence, ``fc`` must equal its getwords output."""
     sentence = 'the quick brown for jumps over the lazy dog'
     trained = docclass.classifier(docclass.getwords)
     trained.train(sentence, 'good')
     expected = docclass.getwords(sentence)
     self.assertDictEqual(trained.fc, expected)
Пример #8
0
 def test_getwords(self):
     """getwords should return each expected word of the sample mapped to 1."""
     raw_text = 'disk*jdhs&342nhek[]989c12cnjshdsjkv774*j'
     expected = {
         'disk': 1,
         'jdhs': 1,
         'nhek': 1,
         'cnjshdsjkv': 1,
     }
     self.assertDictEqual(expected, docclass.getwords(raw_text))
Пример #9
0
 def test_incf(self):
     """Train once with category 'good' and compare ``fc`` to getwords."""
     text = 'the quick brown for jumps over the lazy dog'
     cl = docclass.classifier(docclass.getwords)
     cl.train(text, 'good')
     words_of_text = docclass.getwords(text)
     self.assertDictEqual(cl.fc, words_of_text)
Пример #10
0
 def test_getwords(self):
     """Check the word -> 1 mapping getwords produces for a noisy string."""
     sample = 'disk*jdhs&342nhek[]989c12cnjshdsjkv774*j'
     wanted = {'disk': 1, 'jdhs': 1, 'nhek': 1, 'cnjshdsjkv': 1}
     actual = docclass.getwords(sample)
     self.assertDictEqual(wanted, actual)
Пример #11
0
import docclass

# Demo run: tokenise a sentence, train a naive Bayes classifier backed by
# test.db on three samples, classify two phrases, then exit.
docclass.getwords('python is a dynamic language')

cl = docclass.naivebayes(docclass.getwords)
cl.setdb('test.db')

# (document, category) pairs, trained in this order.
training_samples = (
    ('pythons are constrictors', 'snake'),
    ('python has dynamic types', 'language'),
    ('python was developed as scripting language', 'language'),
)
for document, category in training_samples:
    cl.train(document, category)

# The classification results are not stored or printed.
for query in ('dynamic programming', 'boa constrictors'):
    cl.classify(query)

exit()