def plotCDFClustersLagDistribution(self, returnAxisValuesOnly=True):
     '''
     This determines the time after which a cluster can be considered 
     decayed and hence removed.
     
     Experts stream [ 0.66002386  0.07035227] 0.1 82
     Houston stream [ 0.73800037  0.05890473] 0.1 29
     
     458 (# of time units) Experts stream [ 0.66002386  0.07035227] 0.2 15
     71 (# of time units) Houston stream [ 0.73756656  0.05883258] 0.2 3
     
     '''
     def calculateInActivityTimeFor(params, probabilityOfInactivity): return int(CurveFit.inverseOfIncreasingExponentialFunction(params, 1 - probabilityOfInactivity))
     data = list(FileIO.iterateJsonFromFile(self.hdsClustering.stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId]))[-1]
     total = float(sum(data['lag_between_streams_added_to_cluster'].values()))
     x = sorted(map(int, data['lag_between_streams_added_to_cluster'].keys()))
     y = getCumulativeDistribution([data['lag_between_streams_added_to_cluster'][str(i)] / total for i in x])
     exponentialCurveParams = CurveFit.getParamsAfterFittingData(x, y, CurveFit.increasingExponentialFunction, [1., 1.])
     print self.stream_settings['plot_label'], exponentialCurveParams, calculateInActivityTimeFor(exponentialCurveParams, 0.2) 
     plt.plot(x, y, 'o', label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (exponentialCurveParams[0], exponentialCurveParams[1]), color=self.stream_settings['plot_color'])
     plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, exponentialCurveParams, x), color=self.stream_settings['plot_color'], lw=2)
     plt.ylabel(r'$P\ (\ lag\ \leq\ TU\ )$'), plt.xlabel(getLatexForString(xlabelTimeUnits)), plt.title(getLatexForString('CDF for clusters lag distribution.'))
     plt.ylim((0, 1.2))
     plt.legend(loc=4)
     if returnAxisValuesOnly: plt.show()
 def plotCDFClustersLagDistribution(self, returnAxisValuesOnly=True):
     '''
     This determines the time after which a cluster can be considered 
     decayed and hence removed.
     
     Experts stream [ 0.66002386  0.07035227] 0.1 82
     Houston stream [ 0.73800037  0.05890473] 0.1 29
     
     458 (# of time units) Experts stream [ 0.66002386  0.07035227] 0.2 15
     71 (# of time units) Houston stream [ 0.73756656  0.05883258] 0.2 3
     
     '''
     def calculateInActivityTimeFor(params, probabilityOfInactivity): return int(CurveFit.inverseOfIncreasingExponentialFunction(params, 1 - probabilityOfInactivity))
     data = list(FileIO.iterateJsonFromFile(self.hdsClustering.stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId]))[-1]
     total = float(sum(data['lag_between_streams_added_to_cluster'].values()))
     x = sorted(map(int, data['lag_between_streams_added_to_cluster'].keys()))
     y = getCumulativeDistribution([data['lag_between_streams_added_to_cluster'][str(i)] / total for i in x])
     exponentialCurveParams = CurveFit.getParamsAfterFittingData(x, y, CurveFit.increasingExponentialFunction, [1., 1.])
     print self.stream_settings['plot_label'], exponentialCurveParams, calculateInActivityTimeFor(exponentialCurveParams, 0.2) 
     plt.plot(x, y, 'o', label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (exponentialCurveParams[0], exponentialCurveParams[1]), color=self.stream_settings['plot_color'])
     plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, exponentialCurveParams, x), color=self.stream_settings['plot_color'], lw=2)
     plt.ylabel(r'$P\ (\ lag\ \leq\ TU\ )$'), plt.xlabel(getLatexForString(xlabelTimeUnits)), plt.title(getLatexForString('CDF for clusters lag distribution.'))
     plt.ylim((0, 1.2))
     plt.legend(loc=4)
     if returnAxisValuesOnly: plt.show()
Пример #3
0
    def plotDimensionsEstimation(self, returnAxisValuesOnly=True):
        def calculateDimensionsFor(params, percentageOfNewDimensions):
            '''
            numberOfTimeUnits=10*24*12
            Experts stream [  1.17707899e+03   1.03794580e+00] 76819
            Houston stream [  2.73913900e+03   1.02758516e+00] 195731
            '''
            print getSmallestPrimeNumberGreaterThan(
                int(
                    CurveFit.inverseOfDecreasingExponentialFunction(
                        params, percentageOfNewDimensions)))

        dataDistribution = defaultdict(list)
        for line in FileIO.iterateJsonFromFile(self.dimensionsEstimationFile):
            for k, v in line[
                    ParameterEstimation.dimensionsEstimationId].iteritems():
                k = int(k)
                if k not in dataDistribution: dataDistribution[k] = [0., 0.]
                dataDistribution[k][0] += v
                dataDistribution[k][1] += 1
        x, y = [], []
        [(x.append(k),
          y.append((dataDistribution[k][0] / dataDistribution[k][1]) / k))
         for k in sorted(dataDistribution) if k > 1000]
        x, y = x[:numberOfTimeUnits], y[:numberOfTimeUnits]
        exponentialCurveParams = CurveFit.getParamsAfterFittingData(
            x, y, CurveFit.decreasingExponentialFunction, [1., 1.])
        print self.stream_settings[
            'plot_label'], exponentialCurveParams, calculateDimensionsFor(
                exponentialCurveParams, 0.01)
        plt.ylabel(getLatexForString('\% of decaying dimensions')), plt.xlabel(
            getLatexForString('\# of dimensions')
        ), plt.title(
            getLatexForString(
                'Dimension stability with increasing number of dimensions.'))
        plt.semilogy(
            x,
            y,
            'o',
            color=self.stream_settings['plot_color'],
            label=getLatexForString(self.stream_settings['plot_label']) +
            getLatexForString(' (%0.2fx^{-%0.2f})') %
            (exponentialCurveParams[0], exponentialCurveParams[1]),
            lw=2)
        plt.semilogy(x,
                     CurveFit.getYValues(
                         CurveFit.decreasingExponentialFunction,
                         exponentialCurveParams, x),
                     color=self.stream_settings['plot_color'],
                     lw=2)
        plt.legend()
        if returnAxisValuesOnly: plt.show()
Пример #4
0
 def subPlot(id, timeUnit):
     plt.subplot(id)
     print timeUnit, calculatePercentageOfDecayedPhrasesFor(
         params, timeUnit)
     plt.plot(
         x,
         y,
         'o',
         label=getLatexForString(self.stream_settings['plot_label']) +
         getLatexForString(' (%0.2fx^{%0.2f})') %
         (params[0], params[1]),
         color=self.stream_settings['plot_color'])
     plt.plot(x,
              CurveFit.getYValues(
                  CurveFit.increasingExponentialFunction, params, x),
              color=self.stream_settings['plot_color'],
              lw=2)
 def plotCDFDimensionsLagDistribution(self, returnAxisValuesOnly=True):
     '''
     Inactivity time is the time after which there is a high probability that a
     dimension will not appear. Find time_unit that gives this probability. 
     
     Cumulative distribution function (http://en.wikipedia.org/wiki/Cumulative_distribution_function)
     lag = time betweeen occurance of two dimensions (similar to inactivty_time)
     
     F(time_unit) = P(lag<=time_unit)
     time_unit = F_inv(P(lag<=time_unit))
     
     Given P(inactivty_time>time_unit) determine time_unit as shown:
     P(inactivty_time<=time_unit) = 1 - P(inactivty_time>time_unit)
     inactivty_time = F_inv(P(inactivty_time<=time_unit))
     
     numberOfTimeUnits=10*24*12
     
     Experts stream [ 0.23250341  0.250209  ] 0.25 107
     Houston stream [ 0.16948096  0.30751358] 0.25 126
     
     Experts stream [ 0.23250341  0.250209  ] 0.1, 223
     Houston stream [ 0.16948096  0.30751358] 0.1, 228
     
     Compared to other vaues these values are pretty close to each
     other. This is expected. Irrespective of size of the streams,
     the phrases have the same lifetime and hence decay close to each other.
     '''
     def calculateInActivityTimeFor(params, probabilityOfInactivity): return int(CurveFit.inverseOfIncreasingExponentialFunction(params, 1 - probabilityOfInactivity))
     data = list(FileIO.iterateJsonFromFile(self.dimensionInActivityTimeFile))[numberOfTimeUnits]
     total = float(sum(data[ParameterEstimation.dimensionInActivityTimeId].values()))
     x = sorted(map(int, data[ParameterEstimation.dimensionInActivityTimeId].keys()))
     y = getCumulativeDistribution([data[ParameterEstimation.dimensionInActivityTimeId][str(i)] / total for i in x])
     print len(x)
     exponentialCurveParams = CurveFit.getParamsAfterFittingData(x, y, CurveFit.increasingExponentialFunction, [1., 1.])
     print self.stream_settings['plot_label'], exponentialCurveParams, calculateInActivityTimeFor(exponentialCurveParams, 0.1) 
     plt.plot(x, y, 'o', label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (exponentialCurveParams[0], exponentialCurveParams[1]), color=self.stream_settings['plot_color'])
     plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, exponentialCurveParams, x), color=self.stream_settings['plot_color'], lw=2)
     plt.ylabel(r'$P\ (\ lag\ \leq\ TU\ )$'), plt.xlabel(getLatexForString(xlabelTimeUnits)), plt.title(getLatexForString('CDF for dimension lag distribution.'))
     plt.ylim((0, 1.2))
     plt.legend(loc=4)
     if returnAxisValuesOnly: plt.show()
 def plotCDFDimensionsLagDistribution(self, returnAxisValuesOnly=True):
     '''
     Inactivity time is the time after which there is a high probability that a
     dimension will not appear. Find time_unit that gives this probability. 
     
     Cumulative distribution function (http://en.wikipedia.org/wiki/Cumulative_distribution_function)
     lag = time betweeen occurance of two dimensions (similar to inactivty_time)
     
     F(time_unit) = P(lag<=time_unit)
     time_unit = F_inv(P(lag<=time_unit))
     
     Given P(inactivty_time>time_unit) determine time_unit as shown:
     P(inactivty_time<=time_unit) = 1 - P(inactivty_time>time_unit)
     inactivty_time = F_inv(P(inactivty_time<=time_unit))
     
     numberOfTimeUnits=10*24*12
     
     Experts stream [ 0.23250341  0.250209  ] 0.25 107
     Houston stream [ 0.16948096  0.30751358] 0.25 126
     
     Experts stream [ 0.23250341  0.250209  ] 0.1, 223
     Houston stream [ 0.16948096  0.30751358] 0.1, 228
     
     Compared to other vaues these values are pretty close to each
     other. This is expected. Irrespective of size of the streams,
     the phrases have the same lifetime and hence decay close to each other.
     '''
     def calculateInActivityTimeFor(params, probabilityOfInactivity): return int(CurveFit.inverseOfIncreasingExponentialFunction(params, 1 - probabilityOfInactivity))
     data = list(FileIO.iterateJsonFromFile(self.dimensionInActivityTimeFile))[numberOfTimeUnits]
     total = float(sum(data[ParameterEstimation.dimensionInActivityTimeId].values()))
     x = sorted(map(int, data[ParameterEstimation.dimensionInActivityTimeId].keys()))
     y = getCumulativeDistribution([data[ParameterEstimation.dimensionInActivityTimeId][str(i)] / total for i in x])
     print len(x)
     exponentialCurveParams = CurveFit.getParamsAfterFittingData(x, y, CurveFit.increasingExponentialFunction, [1., 1.])
     print self.stream_settings['plot_label'], exponentialCurveParams, calculateInActivityTimeFor(exponentialCurveParams, 0.1) 
     plt.plot(x, y, 'o', label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (exponentialCurveParams[0], exponentialCurveParams[1]), color=self.stream_settings['plot_color'])
     plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, exponentialCurveParams, x), color=self.stream_settings['plot_color'], lw=2)
     plt.ylabel(r'$P\ (\ lag\ \leq\ TU\ )$'), plt.xlabel(getLatexForString(xlabelTimeUnits)), plt.title(getLatexForString('CDF for dimension lag distribution.'))
     plt.ylim((0, 1.2))
     plt.legend(loc=4)
     if returnAxisValuesOnly: plt.show()
 def plotDimensionsEstimation(self, returnAxisValuesOnly=True):
     def calculateDimensionsFor(params, percentageOfNewDimensions): 
         '''
         numberOfTimeUnits=10*24*12
         Experts stream [  1.17707899e+03   1.03794580e+00] 76819
         Houston stream [  2.73913900e+03   1.02758516e+00] 195731
         '''
         print getSmallestPrimeNumberGreaterThan(int(CurveFit.inverseOfDecreasingExponentialFunction(params, percentageOfNewDimensions)))
     dataDistribution = defaultdict(list)
     for line in FileIO.iterateJsonFromFile(self.dimensionsEstimationFile):
         for k, v in line[ParameterEstimation.dimensionsEstimationId].iteritems():
             k = int(k)
             if k not in dataDistribution: dataDistribution[k] = [0., 0.]
             dataDistribution[k][0] += v; dataDistribution[k][1] += 1
     x, y = [], []; [(x.append(k), y.append((dataDistribution[k][0] / dataDistribution[k][1]) / k)) for k in sorted(dataDistribution) if k > 1000]
     x, y = x[:numberOfTimeUnits], y[:numberOfTimeUnits]
     exponentialCurveParams = CurveFit.getParamsAfterFittingData(x, y, CurveFit.decreasingExponentialFunction, [1., 1.])
     print self.stream_settings['plot_label'], exponentialCurveParams, calculateDimensionsFor(exponentialCurveParams, 0.01) 
     plt.ylabel(getLatexForString('\% of decaying dimensions')), plt.xlabel(getLatexForString('\# of dimensions')), plt.title(getLatexForString('Dimension stability with increasing number of dimensions.'))
     plt.semilogy(x, y, 'o', color=self.stream_settings['plot_color'], label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{-%0.2f})') % (exponentialCurveParams[0], exponentialCurveParams[1]), lw=2)
     plt.semilogy(x, CurveFit.getYValues(CurveFit.decreasingExponentialFunction, exponentialCurveParams, x), color=self.stream_settings['plot_color'], lw=2)
     plt.legend()
     if returnAxisValuesOnly: plt.show()
 def subPlot(id, timeUnit):
     plt.subplot(id)
     print timeUnit, calculatePercentageOfDecayedPhrasesFor(params, timeUnit)
     plt.plot(x, y, 'o', label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (params[0], params[1]), color=self.stream_settings['plot_color'])
     plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, params, x), color=self.stream_settings['plot_color'], lw=2)