def plotCDFClustersLagDistribution(self, returnAxisValuesOnly=True): ''' This determines the time after which a cluster can be considered decayed and hence removed. Experts stream [ 0.66002386 0.07035227] 0.1 82 Houston stream [ 0.73800037 0.05890473] 0.1 29 458 (# of time units) Experts stream [ 0.66002386 0.07035227] 0.2 15 71 (# of time units) Houston stream [ 0.73756656 0.05883258] 0.2 3 ''' def calculateInActivityTimeFor(params, probabilityOfInactivity): return int(CurveFit.inverseOfIncreasingExponentialFunction(params, 1 - probabilityOfInactivity)) data = list(FileIO.iterateJsonFromFile(self.hdsClustering.stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId]))[-1] total = float(sum(data['lag_between_streams_added_to_cluster'].values())) x = sorted(map(int, data['lag_between_streams_added_to_cluster'].keys())) y = getCumulativeDistribution([data['lag_between_streams_added_to_cluster'][str(i)] / total for i in x]) exponentialCurveParams = CurveFit.getParamsAfterFittingData(x, y, CurveFit.increasingExponentialFunction, [1., 1.]) print self.stream_settings['plot_label'], exponentialCurveParams, calculateInActivityTimeFor(exponentialCurveParams, 0.2) plt.plot(x, y, 'o', label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (exponentialCurveParams[0], exponentialCurveParams[1]), color=self.stream_settings['plot_color']) plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, exponentialCurveParams, x), color=self.stream_settings['plot_color'], lw=2) plt.ylabel(r'$P\ (\ lag\ \leq\ TU\ )$'), plt.xlabel(getLatexForString(xlabelTimeUnits)), plt.title(getLatexForString('CDF for clusters lag distribution.')) plt.ylim((0, 1.2)) plt.legend(loc=4) if returnAxisValuesOnly: plt.show()
def plotDimensionsEstimation(self, returnAxisValuesOnly=True): def calculateDimensionsFor(params, percentageOfNewDimensions): ''' numberOfTimeUnits=10*24*12 Experts stream [ 1.17707899e+03 1.03794580e+00] 76819 Houston stream [ 2.73913900e+03 1.02758516e+00] 195731 ''' print getSmallestPrimeNumberGreaterThan( int( CurveFit.inverseOfDecreasingExponentialFunction( params, percentageOfNewDimensions))) dataDistribution = defaultdict(list) for line in FileIO.iterateJsonFromFile(self.dimensionsEstimationFile): for k, v in line[ ParameterEstimation.dimensionsEstimationId].iteritems(): k = int(k) if k not in dataDistribution: dataDistribution[k] = [0., 0.] dataDistribution[k][0] += v dataDistribution[k][1] += 1 x, y = [], [] [(x.append(k), y.append((dataDistribution[k][0] / dataDistribution[k][1]) / k)) for k in sorted(dataDistribution) if k > 1000] x, y = x[:numberOfTimeUnits], y[:numberOfTimeUnits] exponentialCurveParams = CurveFit.getParamsAfterFittingData( x, y, CurveFit.decreasingExponentialFunction, [1., 1.]) print self.stream_settings[ 'plot_label'], exponentialCurveParams, calculateDimensionsFor( exponentialCurveParams, 0.01) plt.ylabel(getLatexForString('\% of decaying dimensions')), plt.xlabel( getLatexForString('\# of dimensions') ), plt.title( getLatexForString( 'Dimension stability with increasing number of dimensions.')) plt.semilogy( x, y, 'o', color=self.stream_settings['plot_color'], label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{-%0.2f})') % (exponentialCurveParams[0], exponentialCurveParams[1]), lw=2) plt.semilogy(x, CurveFit.getYValues( CurveFit.decreasingExponentialFunction, exponentialCurveParams, x), color=self.stream_settings['plot_color'], lw=2) plt.legend() if returnAxisValuesOnly: plt.show()
def calculateDimensionsFor(params, percentageOfNewDimensions): ''' numberOfTimeUnits=10*24*12 Experts stream [ 1.17707899e+03 1.03794580e+00] 76819 Houston stream [ 2.73913900e+03 1.02758516e+00] 195731 ''' print getSmallestPrimeNumberGreaterThan(int(CurveFit.inverseOfDecreasingExponentialFunction(params, percentageOfNewDimensions)))
def plotCDFDimensionsLagDistribution(self, returnAxisValuesOnly=True): ''' Inactivity time is the time after which there is a high probability that a dimension will not appear. Find time_unit that gives this probability. Cumulative distribution function (http://en.wikipedia.org/wiki/Cumulative_distribution_function) lag = time betweeen occurance of two dimensions (similar to inactivty_time) F(time_unit) = P(lag<=time_unit) time_unit = F_inv(P(lag<=time_unit)) Given P(inactivty_time>time_unit) determine time_unit as shown: P(inactivty_time<=time_unit) = 1 - P(inactivty_time>time_unit) inactivty_time = F_inv(P(inactivty_time<=time_unit)) numberOfTimeUnits=10*24*12 Experts stream [ 0.23250341 0.250209 ] 0.25 107 Houston stream [ 0.16948096 0.30751358] 0.25 126 Experts stream [ 0.23250341 0.250209 ] 0.1, 223 Houston stream [ 0.16948096 0.30751358] 0.1, 228 Compared to other vaues these values are pretty close to each other. This is expected. Irrespective of size of the streams, the phrases have the same lifetime and hence decay close to each other. 
''' def calculateInActivityTimeFor(params, probabilityOfInactivity): return int(CurveFit.inverseOfIncreasingExponentialFunction(params, 1 - probabilityOfInactivity)) data = list(FileIO.iterateJsonFromFile(self.dimensionInActivityTimeFile))[numberOfTimeUnits] total = float(sum(data[ParameterEstimation.dimensionInActivityTimeId].values())) x = sorted(map(int, data[ParameterEstimation.dimensionInActivityTimeId].keys())) y = getCumulativeDistribution([data[ParameterEstimation.dimensionInActivityTimeId][str(i)] / total for i in x]) print len(x) exponentialCurveParams = CurveFit.getParamsAfterFittingData(x, y, CurveFit.increasingExponentialFunction, [1., 1.]) print self.stream_settings['plot_label'], exponentialCurveParams, calculateInActivityTimeFor(exponentialCurveParams, 0.1) plt.plot(x, y, 'o', label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (exponentialCurveParams[0], exponentialCurveParams[1]), color=self.stream_settings['plot_color']) plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, exponentialCurveParams, x), color=self.stream_settings['plot_color'], lw=2) plt.ylabel(r'$P\ (\ lag\ \leq\ TU\ )$'), plt.xlabel(getLatexForString(xlabelTimeUnits)), plt.title(getLatexForString('CDF for dimension lag distribution.')) plt.ylim((0, 1.2)) plt.legend(loc=4) if returnAxisValuesOnly: plt.show()
def plotDimensionsEstimation(self, returnAxisValuesOnly=True): def calculateDimensionsFor(params, percentageOfNewDimensions): ''' numberOfTimeUnits=10*24*12 Experts stream [ 1.17707899e+03 1.03794580e+00] 76819 Houston stream [ 2.73913900e+03 1.02758516e+00] 195731 ''' print getSmallestPrimeNumberGreaterThan(int(CurveFit.inverseOfDecreasingExponentialFunction(params, percentageOfNewDimensions))) dataDistribution = defaultdict(list) for line in FileIO.iterateJsonFromFile(self.dimensionsEstimationFile): for k, v in line[ParameterEstimation.dimensionsEstimationId].iteritems(): k = int(k) if k not in dataDistribution: dataDistribution[k] = [0., 0.] dataDistribution[k][0] += v; dataDistribution[k][1] += 1 x, y = [], []; [(x.append(k), y.append((dataDistribution[k][0] / dataDistribution[k][1]) / k)) for k in sorted(dataDistribution) if k > 1000] x, y = x[:numberOfTimeUnits], y[:numberOfTimeUnits] exponentialCurveParams = CurveFit.getParamsAfterFittingData(x, y, CurveFit.decreasingExponentialFunction, [1., 1.]) print self.stream_settings['plot_label'], exponentialCurveParams, calculateDimensionsFor(exponentialCurveParams, 0.01) plt.ylabel(getLatexForString('\% of decaying dimensions')), plt.xlabel(getLatexForString('\# of dimensions')), plt.title(getLatexForString('Dimension stability with increasing number of dimensions.')) plt.semilogy(x, y, 'o', color=self.stream_settings['plot_color'], label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{-%0.2f})') % (exponentialCurveParams[0], exponentialCurveParams[1]), lw=2) plt.semilogy(x, CurveFit.getYValues(CurveFit.decreasingExponentialFunction, exponentialCurveParams, x), color=self.stream_settings['plot_color'], lw=2) plt.legend() if returnAxisValuesOnly: plt.show()
def subPlot(id, timeUnit): plt.subplot(id) print timeUnit, calculatePercentageOfDecayedPhrasesFor( params, timeUnit) plt.plot( x, y, 'o', label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (params[0], params[1]), color=self.stream_settings['plot_color']) plt.plot(x, CurveFit.getYValues( CurveFit.increasingExponentialFunction, params, x), color=self.stream_settings['plot_color'], lw=2)
def plotPercentageOfDimensionsWithinALag(self, returnAxisValuesOnly=True): ''' This gives us the percentage of phrases we can loose everytime we prune phrases. Measures the percentage of dimensions having lag less than TU. So at the end of 10th day, almost y% of phrases can be removed. With some probabiity that it will not occure again. numberOfTimeUnits=10*24*12 With 75% probability. Experts stream [ 0.0097055 0.81888514] 107 0.554497397565 Houston stream [ 0.00943499 0.825918 ] 126 0.487757815615 With 90% probability. Experts stream [ 0.0097055 0.81888514] 223 0.187150798756 Houston stream [ 0.00943499 0.825918 ] 228 0.164007589276 ''' def calculatePercentageOfDecayedPhrasesFor(params, timeUnit): return 1 - CurveFit.increasingExponentialFunction(params, timeUnit) dataDistribution = {} currentTimeUnit = 0 for data in list(FileIO.iterateJsonFromFile(self.dimensionInActivityTimeFile))[:numberOfTimeUnits]: totalDimensions = float(sum(data['phrases_lag_distribution'].values())) tempArray = [] for k, v in data['phrases_lag_distribution'].iteritems(): k = int(k) if k not in dataDistribution: dataDistribution[k] = [0] * numberOfTimeUnits dataDistribution[k][currentTimeUnit] = v / totalDimensions tempArray.append(v / totalDimensions) currentTimeUnit += 1 x = sorted(dataDistribution) y = getCumulativeDistribution([np.mean(dataDistribution[k]) for k in x]) params = CurveFit.getParamsAfterFittingData(x, y, CurveFit.increasingExponentialFunction, [1., 1.]) print self.stream_settings['plot_label'], params, def subPlot(id, timeUnit): plt.subplot(id) print timeUnit, calculatePercentageOfDecayedPhrasesFor(params, timeUnit) plt.plot(x, y, 'o', label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (params[0], params[1]), color=self.stream_settings['plot_color']) plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, params, x), color=self.stream_settings['plot_color'], lw=2) if self.stream_settings['stream_id'] == 
'experts_twitter_stream': subPlot(111, 107); plt.title(getLatexForString('Percentage of phrases within a lag')) else: subPlot(111, 126); plt.xlabel(getLatexForString(xlabelTimeUnits)) plt.ylabel(r'$\%\ of\ phrases\ with\ lag\ \leq\ TU$') plt.legend(loc=4) if returnAxisValuesOnly: plt.show()
def plotPercentageOfClustersWithinALag(self, returnAxisValuesOnly=True): ''' 458 Experts stream [ 0.01860266 0.70639136] 15 0.874004297177 80 Houston stream [ 0.0793181 0.47644004] 3 0.866127308876 ''' def calculatePercentageOfDecayedPhrasesFor(params, timeUnit): return 1 - CurveFit.increasingExponentialFunction(params, timeUnit) dataDistribution = {} currentTimeUnit = 0 # file='/mnt/chevron/kykamath/data/twitter/lsh_crowds/houston_stream/parameter_estimation/cluster_lag_distribution' file = self.hdsClustering.stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId] lines = list(FileIO.iterateJsonFromFile(file)) numberOfTimeUnits = len(lines) for data in lines: totalClusters = float(sum(data[ClusteringParametersEstimation.clusterLagDistributionId].values())) tempArray = [] for k, v in data[ClusteringParametersEstimation.clusterLagDistributionId].iteritems(): k = int(k) if k not in dataDistribution: dataDistribution[k] = [0] * numberOfTimeUnits dataDistribution[k][currentTimeUnit] = v / totalClusters tempArray.append(v / totalClusters) currentTimeUnit += 1 x = sorted(dataDistribution) print numberOfTimeUnits, y = getCumulativeDistribution([np.mean(dataDistribution[k]) for k in x]) params = CurveFit.getParamsAfterFittingData(x, y, CurveFit.increasingExponentialFunction, [1., 1.]) print self.stream_settings['plot_label'], params, def subPlot(id, timeUnit): plt.subplot(id) print timeUnit, calculatePercentageOfDecayedPhrasesFor(params, timeUnit) plt.plot(x, y, 'o', label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (params[0], params[1]), color=self.stream_settings['plot_color']) plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, params, x), color=self.stream_settings['plot_color'], lw=2) if self.stream_settings['stream_id'] == 'experts_twitter_stream': subPlot(111, 15); plt.title(getLatexForString('Percentage of clusters within a lag')) else: subPlot(111, 3); 
plt.xlabel(getLatexForString(xlabelTimeUnits)) plt.ylabel(r'$\%\ of\ clusters\ with\ lag\ \leq\ TU$') plt.legend(loc=4) if returnAxisValuesOnly: plt.show()
def subPlot(id, timeUnit): plt.subplot(id) print timeUnit, calculatePercentageOfDecayedPhrasesFor(params, timeUnit) plt.plot(x, y, 'o', label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (params[0], params[1]), color=self.stream_settings['plot_color']) plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, params, x), color=self.stream_settings['plot_color'], lw=2)
def calculatePercentageOfDecayedPhrasesFor(params, timeUnit):
    '''
    Return 1 minus the value of CurveFit.increasingExponentialFunction for
    the given params at timeUnit.
    '''
    fittedValue = CurveFit.increasingExponentialFunction(params, timeUnit)
    return 1 - fittedValue


# Starts empty; filled elsewhere.
dataDistribution = dict()
def calculateInActivityTimeFor(params, probabilityOfInactivity):
    '''
    Return, truncated to an int, the inverse of the fitted increasing curve
    evaluated at 1 - probabilityOfInactivity.
    '''
    probabilityOfActivity = 1 - probabilityOfInactivity
    return int(CurveFit.inverseOfIncreasingExponentialFunction(params, probabilityOfActivity))


# Last record of the cluster lag distribution file.
data = list(
    FileIO.iterateJsonFromFile(
        self.hdsClustering.stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId]
    )
)[-1]
def calculateInActivityTimeFor(params, probabilityOfInactivity):
    '''
    Return, truncated to an int, the inverse of the fitted increasing curve
    evaluated at 1 - probabilityOfInactivity.
    '''
    probabilityOfActivity = 1 - probabilityOfInactivity
    return int(CurveFit.inverseOfIncreasingExponentialFunction(params, probabilityOfActivity))


# Record at index numberOfTimeUnits of the dimension inactivity file.
data = list(
    FileIO.iterateJsonFromFile(self.dimensionInActivityTimeFile)
)[numberOfTimeUnits]
def calculateInActivityTimeFor(params, probabilityOfInactivity):
    '''
    Return, truncated to an int, the inverse of the fitted increasing curve
    evaluated at 1 - probabilityOfInactivity.
    '''
    probabilityOfActivity = 1 - probabilityOfInactivity
    inactivityTime = CurveFit.inverseOfIncreasingExponentialFunction(params, probabilityOfActivity)
    return int(inactivityTime)
def calculatePercentageOfDecayedPhrasesFor(params, timeUnit):
    '''
    Return 1 minus the value of CurveFit.increasingExponentialFunction for
    the given params at timeUnit.
    '''
    withinLagFraction = CurveFit.increasingExponentialFunction(params, timeUnit)
    return 1 - withinLagFraction