def testClassifier(self): print 'Getting documents for testing' bingDocsFetcher = DocumentsGenerator('test.json') testQueries = ['apple', 'facebook', 'westeros', 'gonzaga', 'banana'] categories = ['Entertainment', 'Business', 'Politics'] jsonDocData = bingDocsFetcher.getDocuments(testQueries, categories) numDocs = len(jsonDocData) print numDocs, ' is the number of documents' confusionDict = {} for category in self._categories: confusionDict[category] = {} confusionDict[category]['TN'] = 0 confusionDict[category]['TP'] = 0 confusionDict[category]['FN'] = 0 confusionDict[category]['FP'] = 0 for document in jsonDocData: classPredicted = self.classifyDocument(document) actualClass = document['category'] if actualClass == classPredicted: confusionDict[actualClass]['TP'] += 1 else: confusionDict[actualClass]['FN'] += 1 confusionDict[classPredicted]['FP'] += 1 for category in self._categories: confusionDict[category]['TN'] = numDocs - confusionDict[category]['TP'] - confusionDict[category]['FP'] - confusionDict[category]['FN'] (tpSum, fpSum, fnSum) = (0, 0, 0) for category in confusionDict: tpSum += confusionDict[category]['TP'] fpSum += confusionDict[category]['FP'] fnSum += confusionDict[category]['FN'] precision = tpSum / float(tpSum + fpSum) recall = tpSum / float(tpSum + fnSum) f1 = 2 * precision * recall / (precision + recall) print 'Mico averaged F1 ', f1 print confusionDict
class NaiveBayes: _bingDocsGenerator = None _queryList = ['bing', 'amazon', 'twitter', 'yahoo', 'google', 'beyonce', 'bieber', 'television', 'movies', 'music', 'obama', 'america', 'congress', 'senate', 'lawmakers'] _categories = ['Entertainment', 'Business', 'Politics'] _priors = {'Sports': 0.7} _documentCollection = {'ab4b5bbcaef44446fc': {}} _termsMIDict = {} ''' {'d_u_m_m_y$t_e_r_m': 'prior': {'Entertainment': .45, 'Business': .35,... }, } ''' _termsDict = {} _noOfTermsToBeSelected = 9000 def __init__(self, dataCacheFile): self._documentCollection.clear() self._priors.clear() self._bingDocsGenerator = DocumentsGenerator(dataCacheFile) self._populateDocuments() self._calculateProbabilities() self._calculateMutualInformation() self._pruneTermDict(self._noOfTermsToBeSelected) ''' Here we do the feature selection ''' def _pruneTermDict(self, numberOfTermsSelected): print 'Pruning terms' total = len(self._termsDict) print 'Number of features before pruning', total categoryTermMIDict = {} for category in self._categories: lstTermMI = [] for term in self._termsMIDict: termMIInfo = self._termsMIDict[term] MI = termMIInfo[category]['MI'] lstTermMI.append((term, MI)) categoryTermMIDict[category] = sorted(lstTermMI, key = itemgetter(1)) termsToRemovePerCategory = (total - numberOfTermsSelected) / len(self._categories) for category in self._categories: removedTerms = 0 for termInfo in categoryTermMIDict[category]: term = termInfo[0] if removedTerms == termsToRemovePerCategory: break if term in self._termsDict: try: print 'Removed ', term except: print 'error in printing!!' self._termsDict.pop(term) removedTerms += 1 print 'Number of features after pruning', len(self._termsDict) def _removeStopWords(self, terms): for term in terms: if term in stopwords: terms.remove(term) def _populateDocuments(self): print 'Getting Documents for CLassification' jsonData = self._bingDocsGenerator.getDocuments(self._queryList, self._categories) print 'Got data !!' 
for documentJson in jsonData: docInfoDict = {} docId = documentJson['id'] description = documentJson['description'] title = documentJson['title'] terms = findall('\w+', title + ' ' + description , UNICODE) docInfoDict = {} self._removeStopWords(terms) docInfoDict['terms'] = terms docInfoDict['termsSet'] = set(terms) docInfoDict['category'] = documentJson['category'] self._documentCollection[docId] = docInfoDict print len(self._documentCollection), ' is the total number of documents' def _updatePriorData(self, category): if category in self._priors: self._priors[category] += 1 else: self._priors[category] = 0 def _updateDataForTerm(self, term, category, termFrequency): termDataDict = self._termsDict[term] priorDataDict = termDataDict['prior'] # updating term frequency priorDataDict[category] += termFrequency priorDataDict['totalCount'] += termFrequency def _initializeTermData(self, category, termFrequency): # initializing prior data for the term termDataDict = {} termDataDict['prior'] = {} for categoryName in self._categories: termDataDict['prior'][categoryName] = 0 termDataDict['prior']['totalCount'] = termFrequency termDataDict['prior'][category] = termFrequency return termDataDict def _updateMutualInfoData(self): for term in self._termsDict: if term not in self._termsMIDict: self._termsMIDict[term] = {} for category in self._categories: self._termsMIDict[term][category] = {'N11':0,'N10':0,'N00':0, 'N01':0} for docId in self._documentCollection: document = self._documentCollection[docId] termsSet = document['termsSet'] docCategory = document['category'] if term in termsSet: self._termsMIDict[term][docCategory]['N11'] += 1 for category in self._categories: if category == docCategory: continue self._termsMIDict[term][category]['N10'] += 1 else: self._termsMIDict[term][docCategory]['N01'] += 1 for category in self._categories: if category == docCategory: continue self._termsMIDict[term][category]['N00'] += 1 def _updateTermsData(self, terms, category, docId): 
termsSet = set(terms) termFrequencyDistribution = Counter(terms) for term in termsSet: termDataDict = {} termFrequency = termFrequencyDistribution[term] if term in self._termsDict: termDataDict = self._termsDict[term] self._updateDataForTerm(term, category, termFrequency) else: termDataDict = self._initializeTermData(category, termFrequency) # updating the information for the term in the terms Dictionary self._termsDict[term] = termDataDict def _calculateMutualInformation(self): self._updateMutualInfoData() for term in self._termsMIDict: termDataDict = self._termsMIDict[term] for category in self._categories: N11 = float(termDataDict[category]['N11']) + 0.00001 N10 = float(termDataDict[category]['N10']) + 0.00001 N01 = float(termDataDict[category]['N01']) + 0.00001 N00 = float(termDataDict[category]['N00']) + 0.00001 N = N10 + N11 + N01 + N00 N1 = N10 + N11 N0 = N01 + N00 MI = 0 MI += (N11 / N) * log( (N * N11) / (N1 * N1), 2) MI += (N01 / N) * log( (N * N01) / (N0 * N1), 2) MI += (N10 / N) * log( (N * N10) / (N1 * N0), 2) MI += (N00 / N) * log( (N * N00) / (N0 * N0), 2) termDataDict[category]['MI'] = MI def _calculateProbabilities(self): for docId in self._documentCollection: document = self._documentCollection[docId] docCategory = document['category'] terms = document['terms'] self._updatePriorData(docCategory) self._updateTermsData(terms, docCategory, docId) print 'Calculating probabilities' for priorCategory in self._priors: prior = self._priors[priorCategory] / float(len(self._documentCollection)) self._priors[priorCategory] = prior for term in self._termsDict: termDataDict = self._termsDict[term] priorDict = termDataDict['prior'] for priorCategory in priorDict: priorDict[priorCategory] = (priorDict[priorCategory] + 1) / (float(priorDict['totalCount']) + len(self._termsDict)) def classifyDocument(self, document): title = document['title'] description = document['description'] terms = findall('\w+', title + ' ' + description , UNICODE) 
self._removeStopWords(terms) categoryProbabilities = [] termProbabilityLogSum = 0 for category in self._categories: categoryProbability = 0 termProbabilityLogSum = 0 for term in terms: if term in self._termsDict: termProbabilityLogSum += log(self._termsDict[term]['prior'][category], 2) categoryProbability = log(self._priors[category], 2) + termProbabilityLogSum categoryProbabilities.append((category, categoryProbability)) documentCategory = max(categoryProbabilities, key=itemgetter(1))[0] return documentCategory def displayData(self): print '**********************' for term in self._termsMIDict: termData = self._termsMIDict[term] try: print term,': ----> ', termData except: print 'Error !!' print '-------------------------' def testClassifier(self): bingDocsFetcher = DocumentsGenerator('test.json') testQueries = ['apple', 'facebook', 'westeros', 'gonzaga', 'banana'] categories = ['Entertainment', 'Business', 'Politics'] jsonDocData = bingDocsFetcher.getDocuments(testQueries, categories) numDocs = len(jsonDocData) print numDocs, ' is the number of documents' confusionDict = {} for category in self._categories: confusionDict[category] = {} confusionDict[category]['TN'] = 0 confusionDict[category]['TP'] = 0 confusionDict[category]['FN'] = 0 confusionDict[category]['FP'] = 0 for document in jsonDocData: classPredicted = self.classifyDocument(document) actualClass = document['category'] if actualClass == classPredicted: confusionDict[actualClass]['TP'] += 1 else: confusionDict[actualClass]['FN'] += 1 confusionDict[classPredicted]['FP'] += 1 for category in self._categories: confusionDict[category]['TN'] = numDocs - confusionDict[category]['TP'] - confusionDict[category]['FP'] - confusionDict[category]['FN'] (tpSum, fpSum, fnSum) = (0, 0, 0) for category in confusionDict: tpSum += confusionDict[category]['TP'] fpSum += confusionDict[category]['FP'] fnSum += confusionDict[category]['FN'] precision = tpSum / float(tpSum + fpSum) recall = tpSum / float(tpSum + fnSum) f1 = 2 
* precision * recall / (precision + recall) print 'Mico averaged F1 ', f1 print confusionDict
class NaiveBayes: _bingDocsGenerator = None _queryList = ['bing', 'amazon', 'twitter', 'yahoo', 'google', 'beyonce', 'bieber', 'television', 'movies', 'music', 'obama', 'america', 'congress', 'senate', 'lawmakers'] _categories = ['Entertainment', 'Business', 'Politics'] _priors = {'Sports': 0.7} _documentCollection = {'ab4b5bbcaef44446fc': {}} ''' {'d_u_m_m_y$t_e_r_m': 'prior': {'Entertainment': .45, 'Business': .35,... }, } ''' _termsDict = {} def __init__(self, dataCacheFile): self._documentCollection.clear() self._priors.clear() self._bingDocsGenerator = DocumentsGenerator(dataCacheFile) self._populateDocuments() self._calculateProbabilities() def _populateDocuments(self): print 'Getting Documents for learning Classification..' jsonData = self._bingDocsGenerator.getDocuments(self._queryList, self._categories) for documentJson in jsonData: docInfoDict = {} docId = documentJson['id'] description = documentJson['description'] title = documentJson['title'] terms = findall('\w+', title + ' ' + description , UNICODE) docInfoDict = {} docInfoDict['terms'] = terms docInfoDict['category'] = documentJson['category'] self._documentCollection[docId] = docInfoDict def _updatePriorData(self, category): if category in self._priors: self._priors[category] += 1 else: self._priors[category] = 0 def _updateDataForTerm(self, term, category, termFrequency): termDataDict = self._termsDict[term] priorDataDict = termDataDict['prior'] # updating term frequency priorDataDict[category] += termFrequency priorDataDict['totalCount'] += termFrequency def _initializeTermData(self, category, termFrequency): # initializing prior data for the term termDataDict = {} termDataDict['prior'] = {} for categoryName in self._categories: termDataDict['prior'][categoryName] = 0 termDataDict['prior']['totalCount'] = termFrequency termDataDict['prior'][category] = termFrequency return termDataDict def _updateTermsData(self, terms, category): termsSet = set(terms) termFrequencyDistribution = Counter(terms) 
for term in termsSet: termDataDict = {} termFrequency = termFrequencyDistribution[term] if term in self._termsDict: termDataDict = self._termsDict[term] self._updateDataForTerm(term, category, termFrequency) else: termDataDict = self._initializeTermData(category, termFrequency) self._termsDict[term] = termDataDict # updating the information the term in the terms Dictionary def _calculateProbabilities(self): for docId in self._documentCollection: document = self._documentCollection[docId] docCategory = document['category'] terms = document['terms'] self._updatePriorData(docCategory) self._updateTermsData(terms, docCategory) print 'Calculating probabilities' for priorCategory in self._priors: prior = self._priors[priorCategory] / float(len(self._documentCollection)) self._priors[priorCategory] = prior for term in self._termsDict: termDataDict = self._termsDict[term] priorDict = termDataDict['prior'] for priorCategory in priorDict: priorDict[priorCategory] = (priorDict[priorCategory] + 1 ) / (float(priorDict['totalCount']) + len(self._termsDict)) def classifyDocument(self, document): title = document['title'] description = document['description'] terms = findall('\w+', title + ' ' + description , UNICODE) categoryProbabilities = [] termProbabilityLogSum = 0 for category in self._categories: categoryProbability = 0 termProbabilityLogSum = 0 for term in terms: if term in self._termsDict: termProbabilityLogSum += log(self._termsDict[term]['prior'][category], 2) categoryProbability = log(self._priors[category], 2) + termProbabilityLogSum categoryProbabilities.append((category, categoryProbability)) documentCategory = max(categoryProbabilities, key = itemgetter(1))[0] return documentCategory def displayData(self): for term in self._termsDict: termDataDict = self._termsDict[term] priorDict = termDataDict['prior'] print term,' --> ', priorDict print '-------------------------' print self._priors def testClassifier(self): print 'Getting documents for testing' bingDocsFetcher 
= DocumentsGenerator('test.json') testQueries = ['apple', 'facebook', 'westeros', 'gonzaga', 'banana'] categories = ['Entertainment', 'Business', 'Politics'] jsonDocData = bingDocsFetcher.getDocuments(testQueries, categories) numDocs = len(jsonDocData) print numDocs, ' is the number of documents' confusionDict = {} for category in self._categories: confusionDict[category] = {} confusionDict[category]['TN'] = 0 confusionDict[category]['TP'] = 0 confusionDict[category]['FN'] = 0 confusionDict[category]['FP'] = 0 for document in jsonDocData: classPredicted = self.classifyDocument(document) actualClass = document['category'] if actualClass == classPredicted: confusionDict[actualClass]['TP'] += 1 else: confusionDict[actualClass]['FN'] += 1 confusionDict[classPredicted]['FP'] += 1 for category in self._categories: confusionDict[category]['TN'] = numDocs - confusionDict[category]['TP'] - confusionDict[category]['FP'] - confusionDict[category]['FN'] (tpSum, fpSum, fnSum) = (0, 0, 0) for category in confusionDict: tpSum += confusionDict[category]['TP'] fpSum += confusionDict[category]['FP'] fnSum += confusionDict[category]['FN'] precision = tpSum / float(tpSum + fpSum) recall = tpSum / float(tpSum + fnSum) f1 = 2 * precision * recall / (precision + recall) print 'Mico averaged F1 ', f1 print confusionDict
class KMeans:
    """K-means clustering of Bing news documents in a term vector space.

    Cluster state lives in self._clusters as
    {clusterName: {'center': {'vector': [...]},
                   'vectorIndices': set of indices into self._vectorsInfo,
                   'assignedclasses': {className: member count}}}.
    """
    # Class-level placeholders; __init__ rebinds the real values.
    _bingDocsGenerator = None
    _vectorSpace = None
    _vectorsInfo = [{'vector':[], 'class':'texas aggies'}]
    _classList = ['texas aggies', 'texas longhorns', 'duke blue devils','dallas cowboys', 'dallas mavericks']
    _expectedNumberOfClusters = 6
    _clusters = {'cluster1' : {'vectorIndices':set(), 'center' : [], 'assignedclasses':{'texas aggies': 0} } }

    def __init__(self, dataCacheFilePath):
        # Fetch documents, build the vector space over them, and cache one
        # vector-info dict ('vector' + 'class') per document.
        self._bingDocsGenerator = DocumentsGenerator(dataCacheFilePath)
        self._vectorSpace = VectorSpace(self._populateDocuments())
        self._vectorsInfo = self._vectorSpace.getAllDocumentVectors()

    def _populateDocuments(self):
        """Fetch the raw documents (one query per class name) used to build
        the vector space."""
        print 'Getting documents for clustering'
        jsonData = self._bingDocsGenerator.getDocuments(self._classList)
        return jsonData

    def _generateRandomPoint(self):
        """Return a uniformly random vector-info dict (randint bounds are
        inclusive, hence numberOfPoints - 1)."""
        numberOfPoints = len(self._vectorsInfo)
        randomIndex = randint(0, numberOfPoints - 1)
        randomPoint = self._vectorsInfo[randomIndex]
        return randomPoint

    def _getDistance(self, docVector1, docVector2):
        """Distance between two raw vectors; Euclidean, with the cosine
        alternative kept commented out below."""
        return self._vectorSpace.getEuclidianDistance(docVector1, docVector2)
        #return 1 - self._vectorSpace.getCosineSimilarity(docVector1, docVector2)

    def _isContained(self, vectorList, vectorElement):
        """True if vectorElement's vector equals any vector already in
        vectorList (equality delegated to the vector space)."""
        for vector in vectorList:
            if self._vectorSpace._areEqual(vector['vector'], vectorElement['vector']):
                return True
        return False

    def _initializeClusters(self):
        """Seed the clusters with _expectedNumberOfClusters distinct random
        document vectors.

        NOTE(review): loops forever if there are fewer distinct vectors than
        _expectedNumberOfClusters -- confirm the corpus is always big enough.
        """
        print 'Initializing clusters'
        randomCenters = []
        while True:
            randomCenter = self._generateRandomPoint()
            if not self._isContained(randomCenters, randomCenter):
                randomCenters.append(randomCenter)
            if len(randomCenters) == self._expectedNumberOfClusters:
                break
        tempClusters = {}
        clusterIndex = 0
        for randomCenter in randomCenters:
            # Fresh clusters carry no 'assignedclasses' key; it is created
            # lazily by _assignDocVectorToCluster.
            tempCluster = {'center' : randomCenter, 'vectorIndices' : set()}
            tempClusterName = 'cluster' + str(clusterIndex)
            clusterIndex += 1
            tempClusters[tempClusterName] = tempCluster
        self._clusters = tempClusters

    def _calculateRSS(self):
        """Sum of member-to-center distances divided by the cluster count.

        NOTE(review): this is an average of the distance sum per cluster, not
        the textbook RSS (plain sum of squared distances) -- confirm intent.
        """
        distanceSum = 0
        for clusterName in self._clusters:
            cluster = self._clusters[clusterName]
            clusterCenter = cluster['center']['vector']
            for vectorIndex in cluster['vectorIndices']:
                distance = self._getDistance(clusterCenter, self._vectorsInfo[vectorIndex]['vector'])
                distanceSum += distance
        RSS = distanceSum / len(self._clusters)
        return RSS

    def _findClosestCluster(self, docVector):
        """Return the name of the cluster whose center is nearest docVector."""
        distanceList = []
        for clusterName in self._clusters:
            cluster = self._clusters[clusterName]
            clusterCenter = cluster['center']['vector']
            distance = self._getDistance(clusterCenter, docVector)
            distanceList.append((clusterName, distance))
        closestClusterName = min(distanceList, key=itemgetter(1))[0]
        return closestClusterName

    def _assignDocVectorToCluster(self, docVectorIndex, clusterName):
        """Add a document (by index) to a cluster and bump the tally of the
        document's class within that cluster."""
        cluster = self._clusters[clusterName]
        cluster['vectorIndices'].add(docVectorIndex)
        docVectorClass = self._vectorsInfo[docVectorIndex]['class']
        # 'assignedclasses' is created lazily on first assignment.
        if not 'assignedclasses' in cluster:
            cluster['assignedclasses'] = {}
        assignedClasses = cluster['assignedclasses']
        if docVectorClass in assignedClasses:
            assignedClasses[docVectorClass] += 1
        else:
            assignedClasses[docVectorClass] = 1

    def _calculateCentroids(self):
        """Recompute each cluster center as the centroid of its members;
        empty clusters keep their previous center."""
        for clusterName in self._clusters:
            cluster = self._clusters[clusterName]
            vectors = []
            for vectorIndex in cluster['vectorIndices']:
                vectors.append(self._vectorsInfo[vectorIndex]['vector'])
            if len(vectors) > 0:
                centroid = self._vectorSpace.getCentroid(vectors)
                cluster['center'] = {'vector': centroid}

    def _removeEmptyClusters(self):
        """Drop clusters that received no members (names collected first so
        the dict is not mutated while being iterated)."""
        emptyClusterNames = []
        for clusterName in self._clusters:
            if len(self._clusters[clusterName]['vectorIndices']) == 0:
                emptyClusterNames.append(clusterName)
        for emptyClusterName in emptyClusterNames:
            self._clusters.pop(emptyClusterName)

    def _clearClusterMembers(self):
        """Reset memberships and class tallies before the next assignment pass.

        NOTE(review): raises KeyError for a cluster that never received any
        vector, because 'assignedclasses' is only created lazily in
        _assignDocVectorToCluster -- confirm whether empty clusters can reach
        this point.
        """
        for clusterName in self._clusters:
            cluster = self._clusters[clusterName]
            cluster['vectorIndices'].clear()
            cluster['assignedclasses'].clear()

    def clusterPoints(self):
        """Run Lloyd-style assignment/recenter iterations, printing RI, RSS
        and purity each pass, and re-seeding the clusters after every 11
        iterations.

        NOTE(review): there is no convergence test or return statement -- this
        loop never terminates; the caller apparently watches the printed
        metrics and interrupts manually. Confirm this is intentional.
        """
        print 'Clustering Points'
        self._initializeClusters()
        iterCount = 0
        while True:
            vectorIndex = 0
            # Assign all vectors to clusters
            for vectorInfo in self._vectorsInfo:
                closestClusterName = self._findClosestCluster(vectorInfo['vector'])
                self._assignDocVectorToCluster(vectorIndex, closestClusterName)
                vectorIndex += 1
            print 'Iteration---'
            for clusterName in self._clusters:
                print clusterName ,'--->' ,self._clusters[clusterName]['assignedclasses']
            RI = self.getRandIndex()
            print 'RI : ', RI
            RSS = self._calculateRSS()
            print 'RSS:', RSS
            purity = self.getPurity()
            print 'Purity is', purity
            # Periodic random restart instead of a convergence check.
            if iterCount > 10:
                print 'Restarting !!----------------------------------------------------------------------'
                self._initializeClusters()
                iterCount = 0
                continue
            self._calculateCentroids()
            self._clearClusterMembers()
            iterCount += 1

    def _getVectorClassCounts(self):
        """Build {clusterName: {className: member count}} from the current
        cluster memberships."""
        vectorCountDict = {}
        for clusterName in self._clusters:
            cluster = self._clusters[clusterName]
            vectorCountDict[clusterName] = {}
            for className in self._classList:
                vectorCountDict[clusterName][className] = 0
                for vectorIndex in cluster['vectorIndices']:
                    if self._vectorsInfo[vectorIndex]['class'] == className:
                        vectorCountDict[clusterName][className] += 1
        return vectorCountDict

    def getMaxCountClass(self, vectorCountDict, clusterName):
        """Return the size of the majority class within a cluster.

        NOTE(review): despite the name, this returns the max *count*, not the
        class name.
        """
        maxCount = 0
        classCountDict = vectorCountDict[clusterName]
        for className in classCountDict:
            classCount = classCountDict[className]
            if classCount > maxCount:
                maxCount = classCount
        return maxCount

    def _belongToSameCluster(self, vectorIndex1, vectorIndex2):
        """True if both document indices appear in one cluster's member set."""
        for clusterName in self._clusters:
            cluster = self._clusters[clusterName]
            vectorIndices = cluster['vectorIndices']
            if vectorIndex1 in vectorIndices and vectorIndex2 in vectorIndices:
                return True
        return False

    def getPurity(self):
        """Cluster purity: majority-class member counts summed over clusters,
        divided by the total number of documents."""
        print 'Calculating purity'
        vectorCountDict = self._getVectorClassCounts()
        maxCountSum = 0
        for clusterName in vectorCountDict:
            maxCount = self.getMaxCountClass(vectorCountDict, clusterName)
            maxCountSum += maxCount
        purity = maxCountSum / float(len(self._vectorsInfo))
        return purity

    def getRandIndex(self):
        """Rand index computed over all ordered pairs of distinct documents.

        Each unordered pair is visited twice -- as (i, j) and (j, i) -- which
        is why the printed TN/TP/FP/FN counts are halved; the final ratio is
        unaffected by the double counting.
        """
        print 'Calculating Rand Index'
        falsePositivesCount = 0
        falseNegativesCount = 0
        truePositivesCount = 0
        trueNegativesCount = 0
        iterCount = 0
        for vectorIndex1 in range(len(self._vectorsInfo)):
            for vectorIndex2 in range(len(self._vectorsInfo)):
                if vectorIndex1 == vectorIndex2:
                    continue
                else:
                    iterCount += 1
                vectorInfo1 = self._vectorsInfo[vectorIndex1]
                vectorInfo2 = self._vectorsInfo[vectorIndex2]
                haveSameClass = vectorInfo1['class'] == vectorInfo2['class']
                haveSameCluster = self._belongToSameCluster(vectorIndex1, vectorIndex2)
                if haveSameClass:
                    if haveSameCluster:
                        truePositivesCount += 1
                    else :
                        falseNegativesCount += 1
                else:
                    if haveSameCluster:
                        falsePositivesCount += 1
                    else :
                        trueNegativesCount += 1
        print 'Number of iterations :', iterCount
        print 'TN:', trueNegativesCount/ 2, ' TP: ', truePositivesCount/2
        print 'FP: ', falsePositivesCount/2, 'FN: ', falseNegativesCount/2
        total = trueNegativesCount + truePositivesCount + falseNegativesCount + falsePositivesCount
        RI = float(truePositivesCount + trueNegativesCount) / total
        return RI