def getClusterForDocument(self, document):
    """Return the id of the best-matching candidate cluster, or None.

    The document and ``self.phraseTextAndDimensionMap`` are first passed to
    ``UtilityMethods.updatePhraseTextAndDimensionsMap`` (with
    ``self.clustering_settings`` as keyword arguments), then the document's
    signature is set via ``setSignatureUsingVectorPermutations``.

    Candidate ids are the union of ``getNearestDocuments(document)`` over
    every entry in ``self.signaturePermutations``. Each candidate is scored
    with ``self.clusters[clusterId].cosineSimilarity(document)``.

    Returns:
        The id with the highest score when that score is at least
        ``self.thresholdForDocumentToBeInACluster``; otherwise None
        (including when there are no candidates).
    """
    UtilityMethods.updatePhraseTextAndDimensionsMap(
        document, self.phraseTextAndDimensionMap, **self.clustering_settings)
    document.setSignatureUsingVectorPermutations(
        self.unitVector, self.vectorPermutations,
        self.phraseTextAndDimensionMap)

    # Merge the candidates proposed by every signature permutation.
    candidateIds = set()
    for permutation in self.signaturePermutations:
        candidateIds = candidateIds.union(
            permutation.getNearestDocuments(document))
    if not candidateIds:
        return None

    scoredCandidates = (
        (clusterId, self.clusters[clusterId].cosineSimilarity(document))
        for clusterId in candidateIds)
    bestCandidate = max(scoredCandidates, key=itemgetter(1))
    if bestCandidate[1] >= self.thresholdForDocumentToBeInACluster:
        return bestCandidate[0]
    return None
def getNearestDocument(self, document):
    """Return the id of the most similar stored document, or None.

    The document and ``self.phraseTextAndDimensionMap`` are first passed to
    ``UtilityMethods.updatePhraseTextAndDimensionsMap`` (with
    ``self.settings`` as keyword arguments), then the document's signature
    is set via ``setSignatureUsingVectorPermutations``.

    Candidate ids are the union of ``getNearestDocuments(document)`` over
    every entry in ``self.signaturePermutations``. Each candidate is scored
    with ``self.documentIdToDocumentMap[docId].cosineSimilarity(document)``.

    Side effect: the best ``(docId, similarity)`` pair (or None when there
    are no candidates) is printed to stdout on every call.

    Returns:
        The best-scoring docId when its score is at least
        ``self.nearestNeighborThreshold``; otherwise None.
    """
    UtilityMethods.updatePhraseTextAndDimensionsMap(
        document, self.phraseTextAndDimensionMap, **self.settings)
    document.setSignatureUsingVectorPermutations(
        self.unitVector, self.vectorPermutations,
        self.phraseTextAndDimensionMap)

    # set().union(*iterables) gives the same result as folding with
    # reduce(), without relying on the Python-2-only ``reduce`` builtin.
    possibleNearestNeighbors = set().union(
        *[permutation.getNearestDocuments(document)
          for permutation in self.signaturePermutations])

    predictedNeighbor = None
    if possibleNearestNeighbors:
        predictedNeighbor = max(
            ((docId,
              self.documentIdToDocumentMap[docId].cosineSimilarity(document))
             for docId in possibleNearestNeighbors),
            key=itemgetter(1))

    # Diagnostic output retained from the original. The parenthesised
    # single-argument form prints the same text on Python 2 and is valid
    # syntax on Python 3.
    print(predictedNeighbor)

    if (predictedNeighbor is not None
            and predictedNeighbor[1] >= self.nearestNeighborThreshold):
        return predictedNeighbor[0]
    return None
def update(self, newDocument):
    """Store newDocument, replacing any document held under the same docId.

    Steps, in order:
      1. Pass newDocument and ``self.phraseTextAndDimensionMap`` to
         ``UtilityMethods.updatePhraseTextAndDimensionsMap`` (with
         ``self.settings`` as keyword arguments).
      2. Look up the document currently stored under ``newDocument.docId``
         and overwrite that entry with newDocument.
      3. Set newDocument's signature via
         ``setSignatureUsingVectorPermutations``.
      4. For every entry in ``self.signaturePermutations``, remove the
         previously stored document (if there was one) and add newDocument.
    """
    UtilityMethods.updatePhraseTextAndDimensionsMap(
        newDocument, self.phraseTextAndDimensionMap, **self.settings)
    currentDocument = self.documentIdToDocumentMap.get(newDocument.docId, None)
    self.documentIdToDocumentMap[newDocument.docId] = newDocument
    newDocument.setSignatureUsingVectorPermutations(
        self.unitVector, self.vectorPermutations,
        self.phraseTextAndDimensionMap)
    # Identity test: "no previous document" is exactly None, and ``is not``
    # does not go through any __ne__/__eq__ the document type may define.
    hasPreviousDocument = currentDocument is not None
    for permutation in self.signaturePermutations:
        if hasPreviousDocument:
            permutation.removeDocument(currentDocument)
        permutation.addDocument(newDocument)
def test_updatePhraseTextAndDimensionsMap_PhraseMapHasLesserDimensions(self):
    # Runs updatePhraseTextAndDimensionsMap with 'dimensions' set to 4 and
    # expects the forward map to equal self.finalPhraseToIdMap unchanged.
    #
    # The override is applied to a copy: ``settings`` is defined outside
    # this test, and assigning into it directly would leak dimensions=4
    # into every test that runs afterwards.
    localSettings = dict(settings, dimensions=4)
    UtilityMethods.updatePhraseTextAndDimensionsMap(
        self.phraseVector, self.phraseTextAndDimensionMap, **localSettings)
    self.assertEqual(
        self.finalPhraseToIdMap,
        self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD))
def test_updatePhraseTextAndDimensionsMap_PhraseMapHasMaximumDimensions(self):
    # Runs updatePhraseTextAndDimensionsMap with the unmodified ``settings``
    # and expects the forward map to equal self.finalPhraseToIdMap minus the
    # 'streams' and 'highdimensional' entries.
    # NOTE(review): presumably those phrases are left out because the map is
    # already at its dimension limit (per the test name) - confirm.
    UtilityMethods.updatePhraseTextAndDimensionsMap(
        self.phraseVector, self.phraseTextAndDimensionMap, **settings)

    expectedMap = self.finalPhraseToIdMap
    for phrase in ('streams', 'highdimensional'):
        del expectedMap[phrase]

    forwardMap = self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD)
    self.assertEqual(expectedMap, forwardMap)
def update(self, newDocument):
    """Add newDocument, or replace the stored document that shares its docId.

    The document and ``self.phraseTextAndDimensionMap`` are passed to
    ``UtilityMethods.updatePhraseTextAndDimensionsMap`` (with
    ``self.settings`` as keyword arguments). The entry for
    ``newDocument.docId`` in ``self.documentIdToDocumentMap`` is then
    overwritten, the document's signature is set via
    ``setSignatureUsingVectorPermutations``, and each entry of
    ``self.signaturePermutations`` has the replaced document (if any)
    removed and newDocument added.
    """
    UtilityMethods.updatePhraseTextAndDimensionsMap(
        newDocument, self.phraseTextAndDimensionMap, **self.settings)

    # Capture the document being replaced before the map entry is
    # overwritten; it still has to be removed from each permutation below.
    currentDocument = self.documentIdToDocumentMap.get(
        newDocument.docId, None)
    self.documentIdToDocumentMap[newDocument.docId] = newDocument

    newDocument.setSignatureUsingVectorPermutations(
        self.unitVector, self.vectorPermutations,
        self.phraseTextAndDimensionMap)

    for permutation in self.signaturePermutations:
        # ``is not None`` is an identity check, so it cannot be affected by
        # comparison operators defined on the document type.
        if currentDocument is not None:
            permutation.removeDocument(currentDocument)
        permutation.addDocument(newDocument)
def getClusterForDocument(self, document):
    """Pick the candidate cluster most similar to document, if similar enough.

    Before searching, the document and ``self.phraseTextAndDimensionMap``
    are passed to ``UtilityMethods.updatePhraseTextAndDimensionsMap`` (with
    ``self.clustering_settings`` as keyword arguments) and the document's
    signature is set via ``setSignatureUsingVectorPermutations``.

    Candidates are every id returned by ``getNearestDocuments(document)``
    on any entry of ``self.signaturePermutations``. They are ranked by
    ``self.clusters[clusterId].cosineSimilarity(document)``.

    Returns:
        The top-ranked id if its similarity is at least
        ``self.thresholdForDocumentToBeInACluster``, else None. None is
        also returned when no candidates are found.
    """
    UtilityMethods.updatePhraseTextAndDimensionsMap(
        document, self.phraseTextAndDimensionMap, **self.clustering_settings)
    document.setSignatureUsingVectorPermutations(
        self.unitVector, self.vectorPermutations,
        self.phraseTextAndDimensionMap)

    # Union of the neighbours reported by each permutation.
    neighbourGroups = [permutation.getNearestDocuments(document)
                       for permutation in self.signaturePermutations]
    candidateIds = set().union(*neighbourGroups)
    if not candidateIds:
        return None

    bestId, bestSimilarity = max(
        ((clusterId, self.clusters[clusterId].cosineSimilarity(document))
         for clusterId in candidateIds),
        key=itemgetter(1))
    if bestSimilarity < self.thresholdForDocumentToBeInACluster:
        return None
    return bestId
def getNearestDocument(self, document):
    """Find the stored document most similar to document, if similar enough.

    Before searching, the document and ``self.phraseTextAndDimensionMap``
    are passed to ``UtilityMethods.updatePhraseTextAndDimensionsMap`` (with
    ``self.settings`` as keyword arguments) and the document's signature is
    set via ``setSignatureUsingVectorPermutations``.

    Candidates are every id returned by ``getNearestDocuments(document)``
    on any entry of ``self.signaturePermutations``. They are ranked by
    ``self.documentIdToDocumentMap[docId].cosineSimilarity(document)``.

    Side effect: prints the winning ``(docId, similarity)`` pair, or None
    when there were no candidates, to stdout.

    Returns:
        The top-ranked docId if its similarity is at least
        ``self.nearestNeighborThreshold``, else None.
    """
    UtilityMethods.updatePhraseTextAndDimensionsMap(
        document, self.phraseTextAndDimensionMap, **self.settings)
    document.setSignatureUsingVectorPermutations(
        self.unitVector, self.vectorPermutations,
        self.phraseTextAndDimensionMap)

    # Accumulate with set.union instead of the bare ``reduce`` builtin,
    # which only exists on Python 2; the resulting set is the same.
    possibleNearestNeighbors = set()
    for permutation in self.signaturePermutations:
        possibleNearestNeighbors = possibleNearestNeighbors.union(
            permutation.getNearestDocuments(document))

    predictedNeighbor = None
    if possibleNearestNeighbors:
        predictedNeighbor = max(
            ((docId,
              self.documentIdToDocumentMap[docId].cosineSimilarity(document))
             for docId in possibleNearestNeighbors),
            key=itemgetter(1))

    # Kept from the original as diagnostic output. print(x) with a single
    # argument emits the same text under Python 2's print statement and
    # Python 3's print function.
    print(predictedNeighbor)

    if predictedNeighbor is None:
        return None
    if predictedNeighbor[1] >= self.nearestNeighborThreshold:
        return predictedNeighbor[0]
    return None
def test_updatePhraseTextAndDimensionsMap_PhraseMapHasMaximumDimensions(self):
    # With the unmodified ``settings``, the forward map after the update is
    # expected to equal self.finalPhraseToIdMap without the two phrases
    # removed below.
    # NOTE(review): the test name suggests the map is already full so these
    # phrases are not added - verify against updatePhraseTextAndDimensionsMap.
    phrasesExpectedMissing = ['streams', 'highdimensional']

    UtilityMethods.updatePhraseTextAndDimensionsMap(
        self.phraseVector,
        self.phraseTextAndDimensionMap,
        **settings)

    for missingPhrase in phrasesExpectedMissing:
        del self.finalPhraseToIdMap[missingPhrase]

    self.assertEqual(
        self.finalPhraseToIdMap,
        self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD))
def test_updatePhraseTextAndDimensionsMap_PhraseMapHasLesserDimensions(self):
    # With 'dimensions' set to 4, the forward map after the update is
    # expected to equal self.finalPhraseToIdMap as-is.
    #
    # ``settings`` lives outside this test, so writing
    # settings['dimensions'] = 4 would change it for every later test.
    # Build a per-test copy carrying the override instead.
    testSettings = dict(settings)
    testSettings['dimensions'] = 4

    UtilityMethods.updatePhraseTextAndDimensionsMap(
        self.phraseVector, self.phraseTextAndDimensionMap, **testSettings)

    forwardMap = self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD)
    self.assertEqual(self.finalPhraseToIdMap, forwardMap)