def generateExperimentData2(self, fixedType):
    global previousTime
    experts_twitter_stream_settings['cluster_analysis_method'] = JustifyDimensionsEstimation.modifiedClusterAnalysisMethod2
    if fixedType:
        experts_twitter_stream_settings['dimensions_performance_type'] = JustifyDimensionsEstimation.first_n_dimension
        # experts_twitter_stream_settings['update_dimensions_method'] = emptyUpdateDimensionsMethod
        experts_twitter_stream_settings['phrase_decay_coefficient'] = 1.0
        experts_twitter_stream_settings['stream_decay_coefficient'] = 1.0
        experts_twitter_stream_settings['stream_cluster_decay_coefficient'] = 1.0
        for dimensions in range(10**4, 21 * 10**4, 10**4):
            experts_twitter_stream_settings['dimensions'] = getLargestPrimeLesserThan(dimensions)
            previousTime = time.time()
            HDStreaminClustering(**experts_twitter_stream_settings).cluster(
                TwitterIterators.iterateTweetsFromExperts(
                    expertsDataStartTime=datetime(2011, 3, 19),
                    expertsDataEndTime=datetime(2011, 3, 20, 5)))
    else:
        experts_twitter_stream_settings['dimensions_performance_type'] = JustifyDimensionsEstimation.top_n_dimension
        previousTime = time.time()
        HDStreaminClustering(**experts_twitter_stream_settings).cluster(
            TwitterIterators.iterateTweetsFromExperts(
                expertsDataStartTime=datetime(2011, 3, 19),
                expertsDataEndTime=datetime(2011, 3, 20, 5)))
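# The sweep above snaps each dimension count to the largest prime below it;
# prime table sizes reduce collision clustering under modular hashing. A
# minimal sketch of what getLargestPrimeLesserThan is assumed to compute
# (the real helper lives elsewhere in this codebase):
def _largestPrimeLesserThan(n):
    def _isPrime(k):
        if k < 2: return False
        for d in range(2, int(k ** 0.5) + 1):
            if k % d == 0: return False
        return True
    # Walk down from n - 1 until a prime is found.
    for candidate in range(n - 1, 1, -1):
        if _isPrime(candidate): return candidate
    return None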
def performanceForCDAAt(noOfTweets, fileName, **stream_settings):
    clustering = HDStreaminClustering(**stream_settings)
    ts = time.time()
    clustering.cluster(TwitterIterators.iterateFromFile(fileName))
    te = time.time()
    documentClusters = [cluster.documentsInCluster.keys()
                        for k, cluster in clustering.clusters.iteritems()
                        if len(cluster.documentsInCluster.keys()) >= stream_settings['cluster_filter_threshold']]
    return Evaluation.getEvaluationMetrics(noOfTweets, documentClusters, te - ts)
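# A hypothetical invocation of performanceForCDAAt: the path is a placeholder,
# and the settings dict is assumed to carry every key HDStreaminClustering
# expects, including the 'cluster_filter_threshold' read above.
def runCDAPerformanceCheck():
    settings = dict(experts_twitter_stream_settings)
    settings.setdefault('cluster_filter_threshold', 1)
    return performanceForCDAAt(100000, '/path/to/100k_tweets_file', **settings)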
def generateExperimentData(self, withOutPruning):
    global previousTime
    if withOutPruning:
        experts_twitter_stream_settings['cluster_filtering_method'] = emptyClusterFilteringMethod
        # 'pruing_type' (sic) is the settings key this codebase uses throughout.
        experts_twitter_stream_settings['pruing_type'] = JustifyMemoryPruning.without_memory_pruning
    else:
        experts_twitter_stream_settings['pruing_type'] = JustifyMemoryPruning.with_memory_pruning
    experts_twitter_stream_settings['cluster_analysis_method'] = JustifyMemoryPruning.modifiedClusterAnalysisMethod
    previousTime = time.time()
    HDStreaminClustering(**experts_twitter_stream_settings).cluster(
        TwitterIterators.iterateTweetsFromExperts(
            expertsDataStartTime=datetime(2011, 3, 19),
            expertsDataEndTime=datetime(2011, 3, 27)))
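# Several experiments in this file disable a pipeline stage by swapping in a
# no-op callback. A minimal sketch of those hooks; the filtering signature
# matches the staticmethod defined in ClusteringParametersEstimation below,
# while the update-dimensions signature is an assumption:
def emptyClusterFilteringMethod(hdStreamClusteringObject, currentMessageTime):
    pass  # skip cluster filtering (i.e. no memory pruning) for this run

def emptyUpdateDimensionsMethod(hdStreamClusteringObject, currentMessageTime):
    pass  # keep the initial dimensions fixed for the whole run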
def generateExperimentData(self, with_vanilla_lsh):
    global previousTime
    if with_vanilla_lsh:
        experts_twitter_stream_settings['lsh_type'] = JustifyNotUsingVanillaLSH.with_vanilla_lsh
        experts_twitter_stream_settings['phrase_decay_coefficient'] = 1.0
        experts_twitter_stream_settings['stream_decay_coefficient'] = 1.0
        experts_twitter_stream_settings['stream_cluster_decay_coefficient'] = 1.0
        experts_twitter_stream_settings['cluster_filtering_method'] = emptyClusterFilteringMethod
        experts_twitter_stream_settings['signature_type'] = 'signature_type_list'
        experts_twitter_stream_settings['dimensions'] = getLargestPrimeLesserThan(10000)
        experts_twitter_stream_settings['update_dimensions_method'] = emptyUpdateDimensionsMethod
    else:
        experts_twitter_stream_settings['lsh_type'] = JustifyNotUsingVanillaLSH.with_modified_lsh
    experts_twitter_stream_settings['cluster_analysis_method'] = JustifyNotUsingVanillaLSH.modifiedClusterAnalysisMethod
    previousTime = time.time()
    HDStreaminClustering(**experts_twitter_stream_settings).cluster(
        TwitterIterators.iterateTweetsFromExperts(
            expertsDataStartTime=datetime(2011, 3, 19),
            expertsDataEndTime=datetime(2011, 3, 27)))
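# A hypothetical driver for the A/B comparison above: one pass emulating
# vanilla LSH (decay, filtering, and dimension updates disabled) and one with
# the modified variant. The class name and wiring are assumptions based on the
# attribute lookups in the method.
def runVanillaLSHComparison():
    experiment = JustifyNotUsingVanillaLSH()
    for with_vanilla_lsh in (True, False):
        experiment.generateExperimentData(with_vanilla_lsh)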
def generateExperimentData(self, withOutDecay):
    global previousTime
    if withOutDecay:
        experts_twitter_stream_settings['decay_type'] = JustifyExponentialDecay.without_decay
        experts_twitter_stream_settings['phrase_decay_coefficient'] = 1.0
        experts_twitter_stream_settings['stream_decay_coefficient'] = 1.0
        experts_twitter_stream_settings['stream_cluster_decay_coefficient'] = 1.0
    else:
        experts_twitter_stream_settings['decay_type'] = JustifyExponentialDecay.with_decay
    experts_twitter_stream_settings['cluster_analysis_method'] = JustifyExponentialDecay.modifiedClusterAnalysisMethod
    previousTime = time.time()
    HDStreaminClustering(**experts_twitter_stream_settings).cluster(
        TwitterIterators.iterateTweetsFromExperts(
            expertsDataStartTime=datetime(2011, 4, 1),
            expertsDataEndTime=datetime(2011, 4, 8)))
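# Setting the three decay coefficients to 1.0 above turns decay off, because
# scores are assumed to be scaled multiplicatively per elapsed time unit. A
# minimal sketch of that weighting (coefficient in (0, 1]):
def decayedScore(score, decay_coefficient, elapsedTimeUnits):
    # 1.0 ** n == 1.0 for any n, so a coefficient of 1.0 means no decay.
    return score * decay_coefficient ** elapsedTimeUnits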
def generateExperimentData(self, withoutTrie):
    global previousTime
    if withoutTrie:
        experts_twitter_stream_settings['trie_type'] = JustifyTrie.with_sorted_list
        experts_twitter_stream_settings['signature_type'] = 'signature_type_list'
    else:
        experts_twitter_stream_settings['trie_type'] = JustifyTrie.with_trie
    experts_twitter_stream_settings['cluster_analysis_method'] = JustifyTrie.modifiedClusterAnalysisMethod
    experts_twitter_stream_settings['cluster_filtering_method'] = emptyClusterFilteringMethod
    previousTime = time.time()
    HDStreaminClustering(**experts_twitter_stream_settings).cluster(
        TwitterIterators.iterateTweetsFromExperts(
            expertsDataStartTime=datetime(2011, 3, 19),
            expertsDataEndTime=datetime(2011, 3, 27)))
def generateExperimentData(self):
    global previousTime
    experts_twitter_stream_settings['dimensions_performance_type'] = JustifyDimensionsEstimation.first_n_dimension
    experts_twitter_stream_settings['update_dimensions_method'] = emptyUpdateDimensionsMethod
    experts_twitter_stream_settings['cluster_analysis_method'] = JustifyDimensionsEstimation.modifiedClusterAnalysisMethod
    for dimensions in range(10**4, 21 * 10**4, 10**4):
        experts_twitter_stream_settings['dimensions'] = getLargestPrimeLesserThan(dimensions)
        previousTime = time.time()
        try:
            HDStreaminClustering(**experts_twitter_stream_settings).cluster(TwitterIterators.iterateTweetsFromExperts())
        except Exception:
            pass  # a failed run is skipped; continue with the next dimension setting
def generateStatsForHDLSHClustering(self):
    print 'HD LSH'
    def _getDocumentFromTuple((user, text)):
        # Build a term-frequency vector, skipping the first token.
        vector, words = Vector(), text.split()
        for word in words[1:]:
            if word not in vector: vector[word] = 1
            else: vector[word] += 1
        return Document(user, vector)
    self.stream_settings['convert_data_to_message_method'] = TwitterCrowdsSpecificMethods.convertTweetJSONToMessage
    self.stream_settings['cluster_analysis_method'] = emptyClusterAnalysisMethod
    # self.stream_settings['cluster_filtering_method'] = emptyClusterFilteringMethod
    self.documents = [tw[1] for tw in list(self._tweetWithTimestampIterator()) if tw[1]['text'].strip() != '']
    # Sort tweets chronologically by the parsed created_at timestamp, element 1
    # of each (tweet, datetime) pair; the original key=itemgetter(0) compared
    # the raw tweet dicts instead, which does not give time order.
    self.documents = [tw[0] for tw in sorted([(t, getDateTimeObjectFromTweetTimestamp(t['created_at'])) for t in self.documents],
                                             key=itemgetter(1))]
    clustering = HDStreaminClustering(**self.stream_settings)
    ts = time.time()
    # for tweet in self.documents: clustering.getClusterAndUpdateExistingClusters(_getDocumentFromTuple(tweet))
    # clustering.cluster([_getDocumentFromTuple(d) for d in self.documents])
    clustering.cluster(self.documents)
    te = time.time()
    documentClusters = [cluster.documentsInCluster.keys()
                        for k, cluster in clustering.clusters.iteritems()
                        if len(cluster.documentsInCluster.keys()) >= self.stream_settings['cluster_filter_threshold']]
    return self.getEvaluationMetrics(documentClusters, te - ts)
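# getDateTimeObjectFromTweetTimestamp is assumed to parse Twitter's created_at
# string; a minimal sketch of that behavior:
from datetime import datetime

def _tweetTimestampToDatetime(created_at):
    # e.g. 'Sat Mar 19 04:55:31 +0000 2011'; '+0000' is matched literally
    # since Python 2 strptime does not support %z.
    return datetime.strptime(created_at, '%a %b %d %H:%M:%S +0000 %Y')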
class ClusteringParametersEstimation():
    clusterLagDistributionId = 'cluster_lag_distribution'

    def __init__(self, **stream_settings):
        stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId] = \
            stream_settings['parameter_estimation_folder'] + ClusteringParametersEstimation.clusterLagDistributionId
        self.stream_settings = stream_settings
        self.hdsClustering = HDStreaminClustering(**self.stream_settings)

    def run(self, iterator):
        self.hdsClustering.cluster(iterator)

    @staticmethod
    def emptyClusterFilteringMethod(hdStreamClusteringObject, currentMessageTime):
        pass

    @staticmethod
    def clusterLagDistributionMethod(hdStreamClusteringObject, currentMessageTime):
        lagDistribution = defaultdict(int)
        for cluster in hdStreamClusteringObject.clusters.values():
            lag = DateTimeAirthematic.getDifferenceInTimeUnits(
                currentMessageTime, cluster.lastStreamAddedTime,
                hdStreamClusteringObject.stream_settings['time_unit_in_seconds'].seconds)
            lagDistribution[str(lag)] += 1
        print currentMessageTime, len(hdStreamClusteringObject.clusters)
        iterationData = {
            'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
            'settings': pprint.pformat(hdStreamClusteringObject.stream_settings),
            ClusteringParametersEstimation.clusterLagDistributionId: lagDistribution,
            'lag_between_streams_added_to_cluster': hdStreamClusteringObject.stream_settings['lag_between_streams_added_to_cluster']
        }
        # print hdStreamClusteringObject.stream_settings['lag_between_streams_added_to_cluster']
        FileIO.writeToFileAsJson(iterationData,
                                 hdStreamClusteringObject.stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId])

    def plotCDFClustersLagDistribution(self, returnAxisValuesOnly=True):
        '''
        This determines the time after which a cluster can be considered decayed and hence removed.
        Experts stream  [ 0.66002386  0.07035227] 0.1 82
        Houston stream  [ 0.73800037  0.05890473] 0.1 29
        458 (# of time units) Experts stream  [ 0.66002386  0.07035227] 0.2 15
        71 (# of time units)  Houston stream  [ 0.73756656  0.05883258] 0.2 3
        '''
        def calculateInActivityTimeFor(params, probabilityOfInactivity):
            return int(CurveFit.inverseOfIncreasingExponentialFunction(params, 1 - probabilityOfInactivity))
        data = list(FileIO.iterateJsonFromFile(
            self.hdsClustering.stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId]))[-1]
        total = float(sum(data['lag_between_streams_added_to_cluster'].values()))
        x = sorted(map(int, data['lag_between_streams_added_to_cluster'].keys()))
        y = getCumulativeDistribution([data['lag_between_streams_added_to_cluster'][str(i)] / total for i in x])
        exponentialCurveParams = CurveFit.getParamsAfterFittingData(x, y, CurveFit.increasingExponentialFunction, [1., 1.])
        print self.stream_settings['plot_label'], exponentialCurveParams, calculateInActivityTimeFor(exponentialCurveParams, 0.2)
        plt.plot(x, y, 'o',
                 label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (exponentialCurveParams[0], exponentialCurveParams[1]),
                 color=self.stream_settings['plot_color'])
        plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, exponentialCurveParams, x),
                 color=self.stream_settings['plot_color'], lw=2)
        plt.ylabel(r'$P\ (\ lag\ \leq\ TU\ )$')
        plt.xlabel(getLatexForString(xlabelTimeUnits))
        plt.title(getLatexForString('CDF for clusters lag distribution.'))
        plt.ylim((0, 1.2))
        plt.legend(loc=4)
        if returnAxisValuesOnly: plt.show()

    def plotPercentageOfClustersWithinALag(self, returnAxisValuesOnly=True):
        '''
        458 Experts stream [ 0.01860266  0.70639136] 15 0.874004297177
        80 Houston stream [ 0.0793181   0.47644004] 3 0.866127308876
        '''
        def calculatePercentageOfDecayedPhrasesFor(params, timeUnit):
            return 1 - CurveFit.increasingExponentialFunction(params, timeUnit)
        dataDistribution = {}
        currentTimeUnit = 0
        # file = '/mnt/chevron/kykamath/data/twitter/lsh_crowds/houston_stream/parameter_estimation/cluster_lag_distribution'
        file = self.hdsClustering.stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId]
        lines = list(FileIO.iterateJsonFromFile(file))
        numberOfTimeUnits = len(lines)
        for data in lines:
            totalClusters = float(sum(data[ClusteringParametersEstimation.clusterLagDistributionId].values()))
            tempArray = []
            for k, v in data[ClusteringParametersEstimation.clusterLagDistributionId].iteritems():
                k = int(k)
                if k not in dataDistribution: dataDistribution[k] = [0] * numberOfTimeUnits
                dataDistribution[k][currentTimeUnit] = v / totalClusters
                tempArray.append(v / totalClusters)
            currentTimeUnit += 1
        x = sorted(dataDistribution)
        print numberOfTimeUnits,
        y = getCumulativeDistribution([np.mean(dataDistribution[k]) for k in x])
        params = CurveFit.getParamsAfterFittingData(x, y, CurveFit.increasingExponentialFunction, [1., 1.])
        print self.stream_settings['plot_label'], params,
        def subPlot(id, timeUnit):
            plt.subplot(id)
            print timeUnit, calculatePercentageOfDecayedPhrasesFor(params, timeUnit)
            plt.plot(x, y, 'o',
                     label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (params[0], params[1]),
                     color=self.stream_settings['plot_color'])
            plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, params, x),
                     color=self.stream_settings['plot_color'], lw=2)
        if self.stream_settings['stream_id'] == 'experts_twitter_stream':
            subPlot(111, 15)
            plt.title(getLatexForString('Percentage of clusters within a lag'))
        else:
            subPlot(111, 3)
            plt.xlabel(getLatexForString(xlabelTimeUnits))
        plt.ylabel(r'$\%\ of\ clusters\ with\ lag\ \leq\ TU$')
        plt.legend(loc=4)
        if returnAxisValuesOnly: plt.show()

    @staticmethod
    def thresholdForDocumentToBeInCluterEstimation(stats_file, **stream_settings):
        '''
        Estimate the threshold for clusters by varying the
        threshold_for_document_to_be_in_cluster value.
        Run this on a document set of size 100K.
        '''
        for length in [i * j for i in (10**3, 10**4, 10**5) for j in range(1, 10)]:
            # for t in range(1, 16):
            for t in range(16, 21):
                stream_settings['threshold_for_document_to_be_in_cluster'] = t * 0.05
                print length, stream_settings['threshold_for_document_to_be_in_cluster']
                stats = {'streaming_lsh': KMeansTweetsFile(length, **stream_settings).generateStatsForStreamingLSHClustering(),
                         'settings': Settings.getSerialzedObject(stream_settings)}
                FileIO.writeToFileAsJson(stats, stats_file)
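# Going by the legend format string ' (%0.2fx^{%0.2f})' used in the plots
# above, the fitted "increasing exponential" is assumed to be the power law
# y = a * x ** b, and calculateInActivityTimeFor inverts it. A minimal sketch
# of the two CurveFit calls the class relies on:
def increasingExponentialFunction(params, x):
    a, b = params
    return a * x ** b

def inverseOfIncreasingExponentialFunction(params, y):
    # Solve y = a * x ** b for x, i.e. x = (y / a) ** (1 / b).
    a, b = params
    return (y / a) ** (1.0 / b)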
def generateClusters(self, iterator):
    self.stream_settings['cluster_analysis_method'] = TwitterStreamAnalysis.writeClusters
    HDStreaminClustering(**self.stream_settings).cluster(iterator)
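# Hypothetical end-to-end use of ClusteringParametersEstimation: wire the
# lag-distribution callback in as the per-iteration analysis method, run the
# stream, then plot the CDF. The settings keys shown are assumptions based on
# the reads in this file ('parameter_estimation_folder' is consumed by
# __init__, the two callbacks by HDStreaminClustering).
def estimateClusterDecayLag(stream_settings, tweetIterator):
    stream_settings['cluster_analysis_method'] = ClusteringParametersEstimation.clusterLagDistributionMethod
    stream_settings['cluster_filtering_method'] = ClusteringParametersEstimation.emptyClusterFilteringMethod
    estimator = ClusteringParametersEstimation(**stream_settings)
    estimator.run(tweetIterator)
    estimator.plotCDFClustersLagDistribution()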