示例#1
0
    def train(self):
        """Build ``self.tree`` from the labelled texts in ``self.documents``.

        Every text is split on newlines and each resulting paragraph becomes
        one training example ``(label, features)``.  ``features`` is a
        ``defaultdict(bool)`` in which every distinct word returned by
        ``utility.wordParse`` for that paragraph maps to ``True``; any other
        word therefore reads as ``False``.  The union of the words of all
        paragraphs is passed to ``BoolDecisionTree`` as the feature set.

        Side effects: prints the number of distinct features and assigns the
        constructed tree to ``self.tree``.  Returns ``None``.
        """
        trainingData = []
        allFeatures = set()

        for label, text in self.documents.items():
            # NOTE(review): blank lines are not skipped, so they produce an
            # example too (presumably with no features) -- confirm intended.
            for paragraph in text.split("\n"):
                words = set(utility.wordParse(paragraph))
                allFeatures.update(words)

                features = defaultdict(bool)
                features.update(dict.fromkeys(words, True))
                trainingData.append((label, features))

        # Parenthesised single-argument form prints the same under Python 2
        # and also parses under Python 3.
        print(len(allFeatures))
        self.tree = BoolDecisionTree(trainingData, allFeatures)
示例#2
0
    def train(self):
        """Train one bigram model per document label, each with a backoff.

        Steps, in order:

        1. For n-gram orders 1 and 2, count in how many texts of
           ``self.documents`` each distinct n-gram (as produced by
           ``utility.ngramFinder``) occurs.
        2. Take every 100th object of ``Question.objects.all()``, parse its
           ``body`` with ``utility.wordParse`` and collect the words per
           ``category``.  The number of distinct words over the whole sample
           becomes ``categoryBins``.
        3. Count, per word (keyed as the 1-tuple ``(word,)``), in how many
           categories it appears, then turn each category's word list into
           a ``WordDist`` and pass it through ``utility.wordFilter``.
        4. For each label, store ``NGramModel(2, ...)`` in
           ``self.features[label]`` and attach, via ``addBackoff``, the
           entry for the category of the label's first question.

        Progress messages are printed after steps 1, 3 and 4.

        Returns the per-word category counter built in step 3 (in whatever
        state ``utility.wordFilter`` leaves it; that helper is not visible
        here).
        """
        # Maybe some stemming later?
        ngramOrders = (1, 2)
        docCount = dict((order, utility.Counter()) for order in ngramOrders)

        for text in self.documents.itervalues():
            for order in ngramOrders:
                # set() so that each n-gram counts once per document.
                docCount[order].update(set(utility.ngramFinder(text, order)))
        print("Got doc count")

        categoryWords = defaultdict(list)
        for question in Question.objects.all()[::100]:
            tokens = utility.wordParse(question.body)

            categoryWords[question.category].extend(tokens)
            # The "" key accumulates every sampled word; it is only used to
            # size the vocabulary and is removed again right below.
            categoryWords[""].extend(tokens)

        categoryBins = len(set(categoryWords.pop("", [])))

        categoryCount = utility.Counter()
        for tokens in categoryWords.values():
            categoryCount.update((token,) for token in set(tokens))

        # Snapshot the items because the values are replaced while looping.
        for category, tokens in list(categoryWords.items()):
            distribution = WordDist(tokens, categoryBins)
            categoryWords[category] = distribution
            utility.wordFilter(categoryCount, len(categoryWords),
                               distribution)
        print("Trained Category")

        for label in self.documents:
            model = NGramModel(2, self.documents[label],
                               docCount, len(self.documents))
            self.features[label] = model

            firstQuestion = Label.objects.get(body=label).questions.all()[0]
            # NOTE(review): categoryWords is a defaultdict(list), so a
            # category that never showed up in the 1-in-100 sample yields an
            # empty list here instead of a WordDist -- confirm addBackoff
            # copes with that.
            self.features[label].addBackoff(
                categoryWords[firstQuestion.category], categoryBins)
        print("Trained Wikipedia")

        return categoryCount