def run(self, dataIterator, estimationMethod, parameterSpecificDataCollectionMethod=None):
    """Drive an estimation method over a stream of data items.

    Each item of dataIterator is converted to a message with
    self.convertDataToMessageMethod. Messages for which
    CDA.messageInOrder(message.timeStamp) is false are skipped. For every
    accepted message the optional collection hook is called, the phrase map
    is updated, and the wrapped estimation method is offered the message
    time stamp.

    dataIterator -- iterable of raw items accepted by
        self.convertDataToMessageMethod.
    estimationMethod -- object wrapped in a FixedIntervalMethod together
        with self.timeUnitInSeconds; its .call() receives the message time
        stamp plus the keywords estimationObject and currentMessageTime.
    parameterSpecificDataCollectionMethod -- optional callable invoked as
        f(estimationObject=self, message=message) for each accepted
        message; skipped when None.
    """
    estimationMethod = FixedIntervalMethod(estimationMethod, self.timeUnitInSeconds)
    for data in dataIterator:
        message = self.convertDataToMessageMethod(data, **self.stream_settings)
        if not CDA.messageInOrder(message.timeStamp):
            continue
        # Identity test: None is a singleton, so "is not" is the reliable check.
        if parameterSpecificDataCollectionMethod is not None:
            parameterSpecificDataCollectionMethod(estimationObject=self, message=message)
        UtilityMethods.updatePhraseTextToPhraseObject(
            message.vector, message.timeStamp,
            self.phraseTextToPhraseObjectMap, **self.stream_settings)
        estimationMethod.call(
            message.timeStamp, estimationObject=self,
            currentMessageTime=message.timeStamp)
def run(self, dataIterator, estimationMethod, parameterSpecificDataCollectionMethod=None):
    """Drive an estimation method over a stream of data items.

    Each item of dataIterator is converted to a message with
    self.convertDataToMessageMethod. Messages for which
    CDA.messageInOrder(message.timeStamp) is false are skipped. For every
    accepted message the optional collection hook is called, the phrase map
    is updated, and the wrapped estimation method is offered the message
    time stamp.

    dataIterator -- iterable of raw items accepted by
        self.convertDataToMessageMethod.
    estimationMethod -- object wrapped in a FixedIntervalMethod together
        with self.timeUnitInSeconds; its .call() receives the message time
        stamp plus the keywords estimationObject and currentMessageTime.
    parameterSpecificDataCollectionMethod -- optional callable invoked as
        f(estimationObject=self, message=message) for each accepted
        message; skipped when None.
    """
    estimationMethod = FixedIntervalMethod(estimationMethod, self.timeUnitInSeconds)
    for data in dataIterator:
        message = self.convertDataToMessageMethod(data, **self.stream_settings)
        if not CDA.messageInOrder(message.timeStamp):
            continue
        # Identity test: None is a singleton, so "is not" is the reliable check.
        if parameterSpecificDataCollectionMethod is not None:
            parameterSpecificDataCollectionMethod(estimationObject=self, message=message)
        UtilityMethods.updatePhraseTextToPhraseObject(
            message.vector, message.timeStamp,
            self.phraseTextToPhraseObjectMap, **self.stream_settings)
        estimationMethod.call(
            message.timeStamp, estimationObject=self,
            currentMessageTime=message.timeStamp)
def __init__(self, **stream_settings):
    """Configure the clusterer from keyword stream settings.

    Required keys: 'dimension_update_frequency_in_seconds',
    'cluster_analysis_frequency_in_seconds',
    'cluster_filtering_frequency_in_seconds' and
    'convert_data_to_message_method'.
    Optional keys: 'update_dimensions_method', 'cluster_analysis_method'
    and 'cluster_filtering_method' (each falling back to the matching
    DataStreamMethods attribute) and 'combine_clusters_method' (None when
    absent).
    """
    super(HDStreaminClustering, self).__init__(**stream_settings)
    settings = stream_settings
    self.stream_settings = settings
    self.phraseTextToPhraseObjectMap = {}
    self.streamIdToStreamObjectMap = {}

    # Intervals, in seconds, handed to the FixedIntervalMethod wrappers below.
    self.dimensionsUpdatingFrequency = settings['dimension_update_frequency_in_seconds']
    self.clustersAnalysisFrequency = settings['cluster_analysis_frequency_in_seconds']
    self.clustersFilteringFrequency = settings['cluster_filtering_frequency_in_seconds']

    def periodic(settingName, fallback, interval):
        # Wrap the configured method (or its fallback) with its interval.
        return FixedIntervalMethod(settings.get(settingName, fallback), interval)

    self.updateDimensionsMethod = periodic(
        'update_dimensions_method',
        DataStreamMethods.updateDimensions,
        self.dimensionsUpdatingFrequency)
    self.clusterAnalysisMethod = periodic(
        'cluster_analysis_method',
        DataStreamMethods.clusterAnalysisMethod,
        self.clustersAnalysisFrequency)
    self.clusterFilteringMethod = periodic(
        'cluster_filtering_method',
        DataStreamMethods.clusterFilteringMethod,
        self.clustersFilteringFrequency)

    self.combineClustersMethod = settings.get('combine_clusters_method')
    self.convertDataToMessageMethod = settings['convert_data_to_message_method']
    # Class-level attribute on DataStreamMethods: cleared on every construction.
    DataStreamMethods.messageInOrderVariable = None
def __init__(self, **stream_settings):
    """Configure the clusterer from keyword stream settings.

    Required keys: 'dimension_update_frequency_in_seconds',
    'cluster_analysis_frequency_in_seconds',
    'cluster_filtering_frequency_in_seconds' and
    'convert_data_to_message_method'.
    Optional keys: 'update_dimensions_method', 'cluster_analysis_method'
    and 'cluster_filtering_method' (each falling back to the matching
    DataStreamMethods attribute) and 'combine_clusters_method' (None when
    absent).
    """
    super(HDStreaminClustering, self).__init__(**stream_settings)
    settings = stream_settings
    self.stream_settings = settings
    self.phraseTextToPhraseObjectMap = {}
    self.streamIdToStreamObjectMap = {}

    # Intervals, in seconds, handed to the FixedIntervalMethod wrappers below.
    self.dimensionsUpdatingFrequency = settings['dimension_update_frequency_in_seconds']
    self.clustersAnalysisFrequency = settings['cluster_analysis_frequency_in_seconds']
    self.clustersFilteringFrequency = settings['cluster_filtering_frequency_in_seconds']

    def periodic(settingName, fallback, interval):
        # Wrap the configured method (or its fallback) with its interval.
        return FixedIntervalMethod(settings.get(settingName, fallback), interval)

    self.updateDimensionsMethod = periodic(
        'update_dimensions_method',
        DataStreamMethods.updateDimensions,
        self.dimensionsUpdatingFrequency)
    self.clusterAnalysisMethod = periodic(
        'cluster_analysis_method',
        DataStreamMethods.clusterAnalysisMethod,
        self.clustersAnalysisFrequency)
    self.clusterFilteringMethod = periodic(
        'cluster_filtering_method',
        DataStreamMethods.clusterFilteringMethod,
        self.clustersFilteringFrequency)

    self.combineClustersMethod = settings.get('combine_clusters_method')
    self.convertDataToMessageMethod = settings['convert_data_to_message_method']
    # Class-level attribute on DataStreamMethods: cleared on every construction.
    DataStreamMethods.messageInOrderVariable = None
class HDSkipStreamClustering(StreamingLSHClustering):
    """Stream clusterer that skips re-clustering for streams that barely moved.

    A stream is assigned to a cluster when it is first seen, and again only
    when the Euclidean distance between its vector before and after a
    message update exceeds 10.
    """

    def __init__(self, **stream_settings):
        """Configure the clusterer from keyword stream settings.

        Required keys: 'dimension_update_frequency_in_seconds',
        'cluster_analysis_frequency_in_seconds',
        'cluster_filtering_frequency_in_seconds' and
        'convert_data_to_message_method'.
        Optional keys: 'update_dimensions_method', 'cluster_analysis_method'
        and 'cluster_filtering_method' (each falling back to the matching
        DataStreamMethods attribute) and 'combine_clusters_method' (None
        when absent).
        """
        super(HDSkipStreamClustering, self).__init__(**stream_settings)
        self.stream_settings = stream_settings
        self.phraseTextToPhraseObjectMap, self.streamIdToStreamObjectMap = {}, {}
        self.dimensionsUpdatingFrequency = stream_settings[
            'dimension_update_frequency_in_seconds']
        self.clustersAnalysisFrequency = stream_settings[
            'cluster_analysis_frequency_in_seconds']
        self.clustersFilteringFrequency = stream_settings[
            'cluster_filtering_frequency_in_seconds']
        self.updateDimensionsMethod = FixedIntervalMethod(
            stream_settings.get('update_dimensions_method',
                                DataStreamMethods.updateDimensions),
            self.dimensionsUpdatingFrequency)
        self.clusterAnalysisMethod = FixedIntervalMethod(
            stream_settings.get('cluster_analysis_method',
                                DataStreamMethods.clusterAnalysisMethod),
            self.clustersAnalysisFrequency)
        self.clusterFilteringMethod = FixedIntervalMethod(
            stream_settings.get('cluster_filtering_method',
                                DataStreamMethods.clusterFilteringMethod),
            self.clustersFilteringFrequency)
        self.combineClustersMethod = stream_settings.get('combine_clusters_method')
        self.convertDataToMessageMethod = stream_settings[
            'convert_data_to_message_method']
        # Class-level attribute on DataStreamMethods: cleared on every construction.
        DataStreamMethods.messageInOrderVariable = None

    def cluster(self, dataIterator):
        """Consume dataIterator and cluster the streams it describes.

        Items are converted to messages with self.convertDataToMessageMethod;
        messages rejected by DataStreamMethods.messageInOrder are skipped.

        NOTE(review): the source this was restored from had lost its
        indentation. The periodic method calls and the message counter are
        placed at the per-message level here -- confirm against history.
        """
        i = 1  # 1-based message counter passed on as numberOfMessages
        for data in dataIterator:
            message = self.convertDataToMessageMethod(data, **self.stream_settings)
            if not DataStreamMethods.messageInOrder(message.timeStamp):
                continue
            UtilityMethods.updatePhraseTextToPhraseObject(
                message.vector, message.timeStamp,
                self.phraseTextToPhraseObjectMap, **self.stream_settings)
            if message.streamId not in self.streamIdToStreamObjectMap:
                # First message of this stream: register it and cluster it.
                streamObject = Stream(message.streamId, message)
                self.streamIdToStreamObjectMap[message.streamId] = streamObject
                self.getClusterAndUpdateExistingClusters(streamObject)
            else:
                streamObject = self.streamIdToStreamObjectMap[message.streamId]
                # Taken before the update so the movement can be measured;
                # presumably Vector(...) copies the values -- TODO confirm.
                previousStreamObject = Vector(vectorInitialValues=streamObject)
                streamObject.updateForMessage(
                    message, VectorUpdateMethods.exponentialDecay,
                    **self.stream_settings)
                distance = Vector.euclideanDistance(streamObject, previousStreamObject)
                # Re-cluster only when the stream vector moved far enough.
                if distance > 10:
                    self.getClusterAndUpdateExistingClusters(streamObject)
            self.updateDimensionsMethod.call(
                message.timeStamp, hdStreamClusteringObject=self,
                currentMessageTime=message.timeStamp)
            self.clusterFilteringMethod.call(
                message.timeStamp, hdStreamClusteringObject=self,
                currentMessageTime=message.timeStamp)
            # Unlike the two calls above, this one is offered the wall-clock
            # time (time.time()) rather than the message time stamp.
            self.clusterAnalysisMethod.call(
                time.time(), hdStreamClusteringObject=self,
                currentMessageTime=message.timeStamp, numberOfMessages=i)
            i += 1

    def getClusterAndUpdateExistingClusters(self, stream):
        """Add stream to its predicted cluster, or start a new cluster for it.

        self.getClusterForDocument(stream) supplies the key of an existing
        cluster in self.clusters, or None. On None a StreamCluster is built
        from the stream, given a signature, added to every signature
        permutation and stored in self.clusters under its clusterId.
        """
        predictedCluster = self.getClusterForDocument(stream)
        # Identity test: a falsy but valid cluster key must not be mistaken for None.
        if predictedCluster is not None:
            self.clusters[predictedCluster].addDocument(stream, **self.stream_settings)
        else:
            newCluster = StreamCluster(stream)
            newCluster.setSignatureUsingVectorPermutations(
                self.unitVector, self.vectorPermutations,
                self.phraseTextAndDimensionMap)
            for permutation in self.signaturePermutations:
                permutation.addDocument(newCluster)
            self.clusters[newCluster.clusterId] = newCluster
class HDStreaminClustering(StreamingLSHClustering):
    """Stream clusterer that re-clusters a stream on every in-order message."""

    def __init__(self, **stream_settings):
        """Configure the clusterer from keyword stream settings.

        Required keys: 'dimension_update_frequency_in_seconds',
        'cluster_analysis_frequency_in_seconds',
        'cluster_filtering_frequency_in_seconds' and
        'convert_data_to_message_method'.
        Optional keys: 'update_dimensions_method', 'cluster_analysis_method'
        and 'cluster_filtering_method' (each falling back to the matching
        DataStreamMethods attribute) and 'combine_clusters_method' (None
        when absent).
        """
        super(HDStreaminClustering, self).__init__(**stream_settings)
        self.stream_settings = stream_settings
        self.phraseTextToPhraseObjectMap, self.streamIdToStreamObjectMap = {}, {}
        self.dimensionsUpdatingFrequency = stream_settings[
            'dimension_update_frequency_in_seconds']
        self.clustersAnalysisFrequency = stream_settings[
            'cluster_analysis_frequency_in_seconds']
        self.clustersFilteringFrequency = stream_settings[
            'cluster_filtering_frequency_in_seconds']
        self.updateDimensionsMethod = FixedIntervalMethod(
            stream_settings.get('update_dimensions_method',
                                DataStreamMethods.updateDimensions),
            self.dimensionsUpdatingFrequency)
        self.clusterAnalysisMethod = FixedIntervalMethod(
            stream_settings.get('cluster_analysis_method',
                                DataStreamMethods.clusterAnalysisMethod),
            self.clustersAnalysisFrequency)
        self.clusterFilteringMethod = FixedIntervalMethod(
            stream_settings.get('cluster_filtering_method',
                                DataStreamMethods.clusterFilteringMethod),
            self.clustersFilteringFrequency)
        self.combineClustersMethod = stream_settings.get('combine_clusters_method')
        self.convertDataToMessageMethod = stream_settings[
            'convert_data_to_message_method']
        # Class-level attribute on DataStreamMethods: cleared on every construction.
        DataStreamMethods.messageInOrderVariable = None

    def cluster(self, dataIterator):
        """Consume dataIterator and cluster the streams it describes.

        Items are converted to messages with self.convertDataToMessageMethod;
        messages rejected by DataStreamMethods.messageInOrder are skipped.
        For each accepted message the phrase map and the stream object are
        updated, the three periodic methods are offered the message time
        stamp, and the stream is then (re-)clustered.
        """
        for data in dataIterator:
            message = self.convertDataToMessageMethod(data, **self.stream_settings)
            if not DataStreamMethods.messageInOrder(message.timeStamp):
                continue
            UtilityMethods.updatePhraseTextToPhraseObject(
                message.vector, message.timeStamp,
                self.phraseTextToPhraseObjectMap, **self.stream_settings)
            if message.streamId not in self.streamIdToStreamObjectMap:
                self.streamIdToStreamObjectMap[message.streamId] = Stream(
                    message.streamId, message)
            else:
                self.streamIdToStreamObjectMap[message.streamId].updateForMessage(
                    message, VectorUpdateMethods.exponentialDecay,
                    **self.stream_settings)
            streamObject = self.streamIdToStreamObjectMap[message.streamId]
            self.updateDimensionsMethod.call(
                message.timeStamp, hdStreamClusteringObject=self,
                currentMessageTime=message.timeStamp)
            self.clusterFilteringMethod.call(
                message.timeStamp, hdStreamClusteringObject=self,
                currentMessageTime=message.timeStamp)
            self.clusterAnalysisMethod.call(
                message.timeStamp, hdStreamClusteringObject=self,
                currentMessageTime=message.timeStamp)
            self.getClusterAndUpdateExistingClusters(streamObject)

    def getClusterAndUpdateExistingClusters(self, stream):
        """Add stream to its predicted cluster, or start a new cluster for it.

        self.getClusterForDocument(stream) supplies the key of an existing
        cluster in self.clusters, or None. On None a StreamCluster is built
        from the stream, given a signature, added to every signature
        permutation and stored in self.clusters under its clusterId.
        """
        predictedCluster = self.getClusterForDocument(stream)
        # Do not remove this comment. Might need this if StreamCluster is
        # used again in future.
        #     if predictedCluster!=None:
        #         self.clusters[predictedCluster].addStream(stream, **self.stream_settings)
        # Identity test: a falsy but valid cluster key must not be mistaken for None.
        if predictedCluster is not None:
            self.clusters[predictedCluster].addDocument(stream, **self.stream_settings)
        else:
            newCluster = StreamCluster(stream)
            newCluster.setSignatureUsingVectorPermutations(
                self.unitVector, self.vectorPermutations,
                self.phraseTextAndDimensionMap)
            for permutation in self.signaturePermutations:
                permutation.addDocument(newCluster)
            self.clusters[newCluster.clusterId] = newCluster
class HDStreaminClustering(StreamingLSHClustering):
    """Stream clusterer that re-clusters a stream on every in-order message."""

    def __init__(self, **stream_settings):
        """Configure the clusterer from keyword stream settings.

        Required keys: 'dimension_update_frequency_in_seconds',
        'cluster_analysis_frequency_in_seconds',
        'cluster_filtering_frequency_in_seconds' and
        'convert_data_to_message_method'.
        Optional keys: 'update_dimensions_method', 'cluster_analysis_method'
        and 'cluster_filtering_method' (each falling back to the matching
        DataStreamMethods attribute) and 'combine_clusters_method' (None
        when absent).
        """
        super(HDStreaminClustering, self).__init__(**stream_settings)
        self.stream_settings = stream_settings
        self.phraseTextToPhraseObjectMap, self.streamIdToStreamObjectMap = {}, {}
        self.dimensionsUpdatingFrequency = stream_settings[
            'dimension_update_frequency_in_seconds']
        self.clustersAnalysisFrequency = stream_settings[
            'cluster_analysis_frequency_in_seconds']
        self.clustersFilteringFrequency = stream_settings[
            'cluster_filtering_frequency_in_seconds']
        self.updateDimensionsMethod = FixedIntervalMethod(
            stream_settings.get('update_dimensions_method',
                                DataStreamMethods.updateDimensions),
            self.dimensionsUpdatingFrequency)
        self.clusterAnalysisMethod = FixedIntervalMethod(
            stream_settings.get('cluster_analysis_method',
                                DataStreamMethods.clusterAnalysisMethod),
            self.clustersAnalysisFrequency)
        self.clusterFilteringMethod = FixedIntervalMethod(
            stream_settings.get('cluster_filtering_method',
                                DataStreamMethods.clusterFilteringMethod),
            self.clustersFilteringFrequency)
        self.combineClustersMethod = stream_settings.get('combine_clusters_method')
        self.convertDataToMessageMethod = stream_settings[
            'convert_data_to_message_method']
        # Class-level attribute on DataStreamMethods: cleared on every construction.
        DataStreamMethods.messageInOrderVariable = None

    def cluster(self, dataIterator):
        """Consume dataIterator and cluster the streams it describes.

        Items are converted to messages with self.convertDataToMessageMethod;
        messages rejected by DataStreamMethods.messageInOrder are skipped.
        For each accepted message the phrase map and the stream object are
        updated, the three periodic methods are offered the message time
        stamp, and the stream is then (re-)clustered.
        """
        for data in dataIterator:
            message = self.convertDataToMessageMethod(data, **self.stream_settings)
            if not DataStreamMethods.messageInOrder(message.timeStamp):
                continue
            UtilityMethods.updatePhraseTextToPhraseObject(
                message.vector, message.timeStamp,
                self.phraseTextToPhraseObjectMap, **self.stream_settings)
            if message.streamId not in self.streamIdToStreamObjectMap:
                self.streamIdToStreamObjectMap[message.streamId] = Stream(
                    message.streamId, message)
            else:
                self.streamIdToStreamObjectMap[message.streamId].updateForMessage(
                    message, VectorUpdateMethods.exponentialDecay,
                    **self.stream_settings)
            streamObject = self.streamIdToStreamObjectMap[message.streamId]
            self.updateDimensionsMethod.call(
                message.timeStamp, hdStreamClusteringObject=self,
                currentMessageTime=message.timeStamp)
            self.clusterFilteringMethod.call(
                message.timeStamp, hdStreamClusteringObject=self,
                currentMessageTime=message.timeStamp)
            self.clusterAnalysisMethod.call(
                message.timeStamp, hdStreamClusteringObject=self,
                currentMessageTime=message.timeStamp)
            self.getClusterAndUpdateExistingClusters(streamObject)

    def getClusterAndUpdateExistingClusters(self, stream):
        """Add stream to its predicted cluster, or start a new cluster for it.

        self.getClusterForDocument(stream) supplies the key of an existing
        cluster in self.clusters, or None. On None a StreamCluster is built
        from the stream, given a signature, added to every signature
        permutation and stored in self.clusters under its clusterId.
        """
        predictedCluster = self.getClusterForDocument(stream)
        # Do not remove this comment. Might need this if StreamCluster is
        # used again in future.
        #     if predictedCluster!=None:
        #         self.clusters[predictedCluster].addStream(stream, **self.stream_settings)
        # Identity test: a falsy but valid cluster key must not be mistaken for None.
        if predictedCluster is not None:
            self.clusters[predictedCluster].addDocument(stream, **self.stream_settings)
        else:
            newCluster = StreamCluster(stream)
            newCluster.setSignatureUsingVectorPermutations(
                self.unitVector, self.vectorPermutations,
                self.phraseTextAndDimensionMap)
            for permutation in self.signaturePermutations:
                permutation.addDocument(newCluster)
            self.clusters[newCluster.clusterId] = newCluster
class HDSkipStreamClustering(StreamingLSHClustering):
    """Stream clusterer that skips re-clustering for streams that barely moved.

    A stream is assigned to a cluster when it is first seen, and again only
    when the Euclidean distance between its vector before and after a
    message update exceeds 10.
    """

    def __init__(self, **stream_settings):
        """Configure the clusterer from keyword stream settings.

        Required keys: 'dimension_update_frequency_in_seconds',
        'cluster_analysis_frequency_in_seconds',
        'cluster_filtering_frequency_in_seconds' and
        'convert_data_to_message_method'.
        Optional keys: 'update_dimensions_method', 'cluster_analysis_method'
        and 'cluster_filtering_method' (each falling back to the matching
        DataStreamMethods attribute) and 'combine_clusters_method' (None
        when absent).
        """
        super(HDSkipStreamClustering, self).__init__(**stream_settings)
        self.stream_settings = stream_settings
        self.phraseTextToPhraseObjectMap, self.streamIdToStreamObjectMap = {}, {}
        self.dimensionsUpdatingFrequency = stream_settings[
            'dimension_update_frequency_in_seconds']
        self.clustersAnalysisFrequency = stream_settings[
            'cluster_analysis_frequency_in_seconds']
        self.clustersFilteringFrequency = stream_settings[
            'cluster_filtering_frequency_in_seconds']
        self.updateDimensionsMethod = FixedIntervalMethod(
            stream_settings.get('update_dimensions_method',
                                DataStreamMethods.updateDimensions),
            self.dimensionsUpdatingFrequency)
        self.clusterAnalysisMethod = FixedIntervalMethod(
            stream_settings.get('cluster_analysis_method',
                                DataStreamMethods.clusterAnalysisMethod),
            self.clustersAnalysisFrequency)
        self.clusterFilteringMethod = FixedIntervalMethod(
            stream_settings.get('cluster_filtering_method',
                                DataStreamMethods.clusterFilteringMethod),
            self.clustersFilteringFrequency)
        self.combineClustersMethod = stream_settings.get('combine_clusters_method')
        self.convertDataToMessageMethod = stream_settings[
            'convert_data_to_message_method']
        # Class-level attribute on DataStreamMethods: cleared on every construction.
        DataStreamMethods.messageInOrderVariable = None

    def cluster(self, dataIterator):
        """Consume dataIterator and cluster the streams it describes.

        Items are converted to messages with self.convertDataToMessageMethod;
        messages rejected by DataStreamMethods.messageInOrder are skipped.

        NOTE(review): the source this was restored from had lost its
        indentation. The periodic method calls and the message counter are
        placed at the per-message level here -- confirm against history.
        """
        i = 1  # 1-based message counter passed on as numberOfMessages
        for data in dataIterator:
            message = self.convertDataToMessageMethod(data, **self.stream_settings)
            if not DataStreamMethods.messageInOrder(message.timeStamp):
                continue
            UtilityMethods.updatePhraseTextToPhraseObject(
                message.vector, message.timeStamp,
                self.phraseTextToPhraseObjectMap, **self.stream_settings)
            if message.streamId not in self.streamIdToStreamObjectMap:
                # First message of this stream: register it and cluster it.
                streamObject = Stream(message.streamId, message)
                self.streamIdToStreamObjectMap[message.streamId] = streamObject
                self.getClusterAndUpdateExistingClusters(streamObject)
            else:
                streamObject = self.streamIdToStreamObjectMap[message.streamId]
                # Taken before the update so the movement can be measured;
                # presumably Vector(...) copies the values -- TODO confirm.
                previousStreamObject = Vector(vectorInitialValues=streamObject)
                streamObject.updateForMessage(
                    message, VectorUpdateMethods.exponentialDecay,
                    **self.stream_settings)
                distance = Vector.euclideanDistance(streamObject, previousStreamObject)
                # Re-cluster only when the stream vector moved far enough.
                if distance > 10:
                    self.getClusterAndUpdateExistingClusters(streamObject)
            self.updateDimensionsMethod.call(
                message.timeStamp, hdStreamClusteringObject=self,
                currentMessageTime=message.timeStamp)
            self.clusterFilteringMethod.call(
                message.timeStamp, hdStreamClusteringObject=self,
                currentMessageTime=message.timeStamp)
            # Unlike the two calls above, this one is offered the wall-clock
            # time (time.time()) rather than the message time stamp.
            self.clusterAnalysisMethod.call(
                time.time(), hdStreamClusteringObject=self,
                currentMessageTime=message.timeStamp, numberOfMessages=i)
            i += 1

    def getClusterAndUpdateExistingClusters(self, stream):
        """Add stream to its predicted cluster, or start a new cluster for it.

        self.getClusterForDocument(stream) supplies the key of an existing
        cluster in self.clusters, or None. On None a StreamCluster is built
        from the stream, given a signature, added to every signature
        permutation and stored in self.clusters under its clusterId.
        """
        predictedCluster = self.getClusterForDocument(stream)
        # Identity test: a falsy but valid cluster key must not be mistaken for None.
        if predictedCluster is not None:
            self.clusters[predictedCluster].addDocument(stream, **self.stream_settings)
        else:
            newCluster = StreamCluster(stream)
            newCluster.setSignatureUsingVectorPermutations(
                self.unitVector, self.vectorPermutations,
                self.phraseTextAndDimensionMap)
            for permutation in self.signaturePermutations:
                permutation.addDocument(newCluster)
            self.clusters[newCluster.clusterId] = newCluster