def plotCDFClustersLagDistribution(self, returnAxisValuesOnly=True):
     '''
     This determines the time after which a cluster can be considered 
     decayed and hence removed.
     
     Experts stream [ 0.66002386  0.07035227] 0.1 82
     Houston stream [ 0.73800037  0.05890473] 0.1 29
     
     458 (# of time units) Experts stream [ 0.66002386  0.07035227] 0.2 15
     71 (# of time units) Houston stream [ 0.73756656  0.05883258] 0.2 3
     
     '''
     def calculateInActivityTimeFor(params, probabilityOfInactivity): return int(CurveFit.inverseOfIncreasingExponentialFunction(params, 1 - probabilityOfInactivity))
     data = list(FileIO.iterateJsonFromFile(self.hdsClustering.stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId]))[-1]
     total = float(sum(data['lag_between_streams_added_to_cluster'].values()))
     x = sorted(map(int, data['lag_between_streams_added_to_cluster'].keys()))
     y = getCumulativeDistribution([data['lag_between_streams_added_to_cluster'][str(i)] / total for i in x])
     exponentialCurveParams = CurveFit.getParamsAfterFittingData(x, y, CurveFit.increasingExponentialFunction, [1., 1.])
     print self.stream_settings['plot_label'], exponentialCurveParams, calculateInActivityTimeFor(exponentialCurveParams, 0.2) 
     plt.plot(x, y, 'o', label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (exponentialCurveParams[0], exponentialCurveParams[1]), color=self.stream_settings['plot_color'])
     plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, exponentialCurveParams, x), color=self.stream_settings['plot_color'], lw=2)
     plt.ylabel(r'$P\ (\ lag\ \leq\ TU\ )$'), plt.xlabel(getLatexForString(xlabelTimeUnits)), plt.title(getLatexForString('CDF for clusters lag distribution.'))
     plt.ylim((0, 1.2))
     plt.legend(loc=4)
     if returnAxisValuesOnly: plt.show()
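The inactivity cutoffs quoted in the docstring come from inverting the fitted CDF. A minimal sketch, assuming CurveFit.increasingExponentialFunction has the power-law form F(t) = p0 * t**p1 that the '%0.2fx^{%0.2f}' plot label suggests:

def inactivity_time(params, probability_of_inactivity):
    # Invert F(t) = p0 * t**p1 at F = 1 - P(inactive) to get the lag cutoff.
    p0, p1 = params
    return int(((1 - probability_of_inactivity) / p0) ** (1.0 / p1))

# Reproduces the Experts-stream figures quoted in the docstring above.
print(inactivity_time([0.66002386, 0.07035227], 0.1))  # 82
print(inactivity_time([0.66002386, 0.07035227], 0.2))  # 15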
 def plotCDFDimensionsLagDistribution(self, returnAxisValuesOnly=True):
     '''
     Inactivity time is the time after which there is a high probability that a
     dimension will not appear. Find time_unit that gives this probability. 
     
     Cumulative distribution function (http://en.wikipedia.org/wiki/Cumulative_distribution_function)
     lag = time between two occurrences of a dimension (similar to inactivity_time)
     
     F(time_unit) = P(lag<=time_unit)
     time_unit = F_inv(P(lag<=time_unit))
     
     Given P(inactivity_time>time_unit) determine time_unit as shown:
     P(inactivity_time<=time_unit) = 1 - P(inactivity_time>time_unit)
     inactivity_time = F_inv(P(inactivity_time<=time_unit))
     
     numberOfTimeUnits=10*24*12
     
     Experts stream [ 0.23250341  0.250209  ] 0.25 107
     Houston stream [ 0.16948096  0.30751358] 0.25 126
     
     Experts stream [ 0.23250341  0.250209  ] 0.1, 223
     Houston stream [ 0.16948096  0.30751358] 0.1, 228
     
     Compared to other values, these values are pretty close to each
     other. This is expected: irrespective of the size of the streams,
     the phrases have the same lifetime and hence decay close to each other.
     '''
     def calculateInActivityTimeFor(params, probabilityOfInactivity): return int(CurveFit.inverseOfIncreasingExponentialFunction(params, 1 - probabilityOfInactivity))
     data = list(FileIO.iterateJsonFromFile(self.dimensionInActivityTimeFile))[numberOfTimeUnits]
     total = float(sum(data[ParameterEstimation.dimensionInActivityTimeId].values()))
     x = sorted(map(int, data[ParameterEstimation.dimensionInActivityTimeId].keys()))
     y = getCumulativeDistribution([data[ParameterEstimation.dimensionInActivityTimeId][str(i)] / total for i in x])
     print len(x)
     exponentialCurveParams = CurveFit.getParamsAfterFittingData(x, y, CurveFit.increasingExponentialFunction, [1., 1.])
     print self.stream_settings['plot_label'], exponentialCurveParams, calculateInActivityTimeFor(exponentialCurveParams, 0.1) 
     plt.plot(x, y, 'o', label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (exponentialCurveParams[0], exponentialCurveParams[1]), color=self.stream_settings['plot_color'])
     plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, exponentialCurveParams, x), color=self.stream_settings['plot_color'], lw=2)
     plt.ylabel(r'$P\ (\ lag\ \leq\ TU\ )$'), plt.xlabel(getLatexForString(xlabelTimeUnits)), plt.title(getLatexForString('CDF for dimension lag distribution.'))
     plt.ylim((0, 1.2))
     plt.legend(loc=4)
     if returnAxisValuesOnly: plt.show()
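A minimal sketch of the empirical CDF built above, assuming getCumulativeDistribution is a running sum of the normalized lag histogram (the counts here are hypothetical):

import numpy as np

lag_histogram = {'1': 40, '2': 25, '5': 20, '12': 15}  # hypothetical lag -> count
total = float(sum(lag_histogram.values()))
x = sorted(map(int, lag_histogram))
y = np.cumsum([lag_histogram[str(i)] / total for i in x])  # F(lag <= x)
print(list(zip(x, y)))  # lags 1, 2, 5, 12 -> ~0.40, 0.65, 0.85, 1.00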
 def plotDimensionsEstimation(self, returnAxisValuesOnly=True):
     def calculateDimensionsFor(params, percentageOfNewDimensions): 
         '''
         numberOfTimeUnits=10*24*12
         Experts stream [  1.17707899e+03   1.03794580e+00] 76819
         Houston stream [  2.73913900e+03   1.02758516e+00] 195731
         '''
         print getSmallestPrimeNumberGreaterThan(int(CurveFit.inverseOfDecreasingExponentialFunction(params, percentageOfNewDimensions)))
     dataDistribution = defaultdict(list)
     for line in FileIO.iterateJsonFromFile(self.dimensionsEstimationFile):
         for k, v in line[ParameterEstimation.dimensionsEstimationId].iteritems():
             k = int(k)
             if k not in dataDistribution: dataDistribution[k] = [0., 0.]
             dataDistribution[k][0] += v; dataDistribution[k][1] += 1
     x, y = [], []
     for k in sorted(dataDistribution):
         if k > 1000: x.append(k); y.append((dataDistribution[k][0] / dataDistribution[k][1]) / k)
     x, y = x[:numberOfTimeUnits], y[:numberOfTimeUnits]
     exponentialCurveParams = CurveFit.getParamsAfterFittingData(x, y, CurveFit.decreasingExponentialFunction, [1., 1.])
     print self.stream_settings['plot_label'], exponentialCurveParams, calculateDimensionsFor(exponentialCurveParams, 0.01) 
     plt.ylabel(getLatexForString('\% of decaying dimensions')), plt.xlabel(getLatexForString('\# of dimensions')), plt.title(getLatexForString('Dimension stability with increasing number of dimensions.'))
     plt.semilogy(x, y, 'o', color=self.stream_settings['plot_color'], label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{-%0.2f})') % (exponentialCurveParams[0], exponentialCurveParams[1]), lw=2)
     plt.semilogy(x, CurveFit.getYValues(CurveFit.decreasingExponentialFunction, exponentialCurveParams, x), color=self.stream_settings['plot_color'], lw=2)
     plt.legend()
     if returnAxisValuesOnly: plt.show()
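A sketch of the dimension-count estimate, assuming CurveFit.decreasingExponentialFunction has the form y = p0 * x**(-p1) suggested by the '%0.2fx^{-%0.2f}' label, with sympy.nextprime standing in for getSmallestPrimeNumberGreaterThan:

from sympy import nextprime  # stand-in for getSmallestPrimeNumberGreaterThan

def dimensions_for(params, percentage_of_new_dimensions):
    p0, p1 = params
    # Invert y = p0 * x**(-p1) to find the dimension count where the share of
    # new dimensions drops to the given percentage, then round up to a prime.
    return nextprime(int((p0 / percentage_of_new_dimensions) ** (1.0 / p1)))

print(dimensions_for([1.17707899e+03, 1.03794580e+00], 0.01))  # ~76819, matching the Experts-stream figure above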
 def plotPercentageOfDimensionsWithinALag(self, returnAxisValuesOnly=True):
     '''
     This gives us the percentage of phrases we can lose every time we prune phrases.
     
     Measures the percentage of dimensions having lag less than TU.
     
     So at the end of the 10th day, almost y% of phrases can be removed, with some
     probability that they will not occur again.
     
     numberOfTimeUnits=10*24*12
     With 75% probability.
     Experts stream [ 0.0097055   0.81888514] 107 0.554497397565
     Houston stream [ 0.00943499  0.825918  ] 126 0.487757815615
     With 90% probability.
     Experts stream [ 0.0097055   0.81888514] 223 0.187150798756
     Houston stream [ 0.00943499  0.825918  ] 228 0.164007589276
     '''
     def calculatePercentageOfDecayedPhrasesFor(params, timeUnit): return 1 - CurveFit.increasingExponentialFunction(params, timeUnit)
     dataDistribution = {}
     currentTimeUnit = 0
     for data in list(FileIO.iterateJsonFromFile(self.dimensionInActivityTimeFile))[:numberOfTimeUnits]:
         totalDimensions = float(sum(data['phrases_lag_distribution'].values()))
         tempArray = []
         for k, v in data['phrases_lag_distribution'].iteritems():
             k = int(k)
             if k not in dataDistribution: dataDistribution[k] = [0] * numberOfTimeUnits
             dataDistribution[k][currentTimeUnit] = v / totalDimensions
             tempArray.append(v / totalDimensions)
         currentTimeUnit += 1
     x = sorted(dataDistribution)
     y = getCumulativeDistribution([np.mean(dataDistribution[k]) for k in x])
     params = CurveFit.getParamsAfterFittingData(x, y, CurveFit.increasingExponentialFunction, [1., 1.])
     print self.stream_settings['plot_label'], params,
     def subPlot(id, timeUnit):
         plt.subplot(id)
         print timeUnit, calculatePercentageOfDecayedPhrasesFor(params, timeUnit)
         plt.plot(x, y, 'o', label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (params[0], params[1]), color=self.stream_settings['plot_color'])
         plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, params, x), color=self.stream_settings['plot_color'], lw=2)
     if self.stream_settings['stream_id'] == 'experts_twitter_stream': subPlot(111, 107); plt.title(getLatexForString('Percentage of phrases within a lag'))
     else: subPlot(111, 126); plt.xlabel(getLatexForString(xlabelTimeUnits))
     plt.ylabel(r'$\%\ of\ phrases\ with\ lag\ \leq\ TU$')
     plt.legend(loc=4)
     if returnAxisValuesOnly: plt.show()
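The decayed-phrase percentages in the docstring follow directly from the fitted curve; a quick check under the same F(t) = p0 * t**p1 assumption about CurveFit.increasingExponentialFunction:

def percentage_of_decayed_phrases(params, time_unit):
    p0, p1 = params
    return 1 - p0 * time_unit ** p1  # fraction of phrases with lag > time_unit

print(percentage_of_decayed_phrases([0.0097055, 0.81888514], 107))  # ~0.554 (Experts stream, 75% probability)
print(percentage_of_decayed_phrases([0.0097055, 0.81888514], 223))  # ~0.187 (Experts stream, 90% probability)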
    def plotPercentageOfClustersWithinALag(self, returnAxisValuesOnly=True):
        '''
        458 Experts stream [ 0.01860266  0.70639136] 15 0.874004297177
        80 Houston stream [ 0.0793181   0.47644004] 3 0.866127308876
        '''
        def calculatePercentageOfDecayedPhrasesFor(params, timeUnit): return 1 - CurveFit.increasingExponentialFunction(params, timeUnit)
        dataDistribution = {}
        currentTimeUnit = 0
#        file='/mnt/chevron/kykamath/data/twitter/lsh_crowds/houston_stream/parameter_estimation/cluster_lag_distribution'
        file = self.hdsClustering.stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId]
        lines = list(FileIO.iterateJsonFromFile(file))
        numberOfTimeUnits = len(lines)
        for data in lines:
            totalClusters = float(sum(data[ClusteringParametersEstimation.clusterLagDistributionId].values()))
            tempArray = []
            for k, v in data[ClusteringParametersEstimation.clusterLagDistributionId].iteritems():
                k = int(k)
                if k not in dataDistribution: dataDistribution[k] = [0] * numberOfTimeUnits
                dataDistribution[k][currentTimeUnit] = v / totalClusters
                tempArray.append(v / totalClusters)
            currentTimeUnit += 1
        x = sorted(dataDistribution)
        print numberOfTimeUnits,
        y = getCumulativeDistribution([np.mean(dataDistribution[k]) for k in x])
        params = CurveFit.getParamsAfterFittingData(x, y, CurveFit.increasingExponentialFunction, [1., 1.])
        print self.stream_settings['plot_label'], params,
        def subPlot(id, timeUnit):
            plt.subplot(id)
            print timeUnit, calculatePercentageOfDecayedPhrasesFor(params, timeUnit)
            plt.plot(x, y, 'o', label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (params[0], params[1]), color=self.stream_settings['plot_color'])
            plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, params, x), color=self.stream_settings['plot_color'], lw=2)
        if self.stream_settings['stream_id'] == 'experts_twitter_stream': subPlot(111, 15); plt.title(getLatexForString('Percentage of clusters within a lag'))
        else: subPlot(111, 3); plt.xlabel(getLatexForString(xlabelTimeUnits))
        plt.ylabel(r'$\%\ of\ clusters\ with\ lag\ \leq\ TU$')
        plt.legend(loc=4)
        if returnAxisValuesOnly: plt.show()
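Both lag-percentage plots aggregate one normalized lag histogram per time unit, average each lag's share across time units, and take a running sum to get the CDF that is then fitted. A minimal sketch with hypothetical counts (np.cumsum standing in for getCumulativeDistribution):

import numpy as np

lag_histograms = [{1: 6, 2: 3, 3: 1}, {1: 4, 2: 4, 3: 2}]  # hypothetical: one histogram per time unit
number_of_time_units = len(lag_histograms)
data_distribution = {}
for t, histogram in enumerate(lag_histograms):
    total = float(sum(histogram.values()))
    for lag, count in histogram.items():
        data_distribution.setdefault(lag, [0.0] * number_of_time_units)[t] = count / total
x = sorted(data_distribution)
y = np.cumsum([np.mean(data_distribution[lag]) for lag in x])
print(list(zip(x, y)))  # lags 1, 2, 3 -> ~0.50, 0.85, 1.00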