def generateExperimentData2(self, fixedType):
    global previousTime
    experts_twitter_stream_settings['cluster_analysis_method'] = JustifyDimensionsEstimation.modifiedClusterAnalysisMethod2
    if fixedType:
        experts_twitter_stream_settings['dimensions_performance_type'] = JustifyDimensionsEstimation.first_n_dimension
        # experts_twitter_stream_settings['update_dimensions_method'] = emptyUpdateDimensionsMethod
        experts_twitter_stream_settings['phrase_decay_coefficient'] = 1.0
        experts_twitter_stream_settings['stream_decay_coefficient'] = 1.0
        experts_twitter_stream_settings['stream_cluster_decay_coefficient'] = 1.0
        for dimensions in range(10**4, 21 * 10**4, 10**4):
            experts_twitter_stream_settings['dimensions'] = getLargestPrimeLesserThan(dimensions)
            previousTime = time.time()
            HDStreaminClustering(**experts_twitter_stream_settings).cluster(
                TwitterIterators.iterateTweetsFromExperts(
                    expertsDataStartTime=datetime(2011, 3, 19),
                    expertsDataEndTime=datetime(2011, 3, 20, 5)))
    else:
        experts_twitter_stream_settings['dimensions_performance_type'] = JustifyDimensionsEstimation.top_n_dimension
        previousTime = time.time()
        HDStreaminClustering(**experts_twitter_stream_settings).cluster(
            TwitterIterators.iterateTweetsFromExperts(
                expertsDataStartTime=datetime(2011, 3, 19),
                expertsDataEndTime=datetime(2011, 3, 20, 5)))
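# The sweep above snaps each dimension count to the largest prime below it;
# prime table sizes reduce collision clustering under modular hashing. A
# minimal sketch of what getLargestPrimeLesserThan is assumed to compute
# (the real helper lives elsewhere in this codebase):
def _largestPrimeLesserThan(n):
    def _isPrime(k):
        if k < 2: return False
        for d in range(2, int(k ** 0.5) + 1):
            if k % d == 0: return False
        return True
    # Walk down from n - 1 until a prime is found.
    for candidate in range(n - 1, 1, -1):
        if _isPrime(candidate): return candidate
    return None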
def performanceForCDAAt(noOfTweets, fileName, **stream_settings):
    clustering = HDStreaminClustering(**stream_settings)
    ts = time.time()
    clustering.cluster(TwitterIterators.iterateFromFile(fileName))
    te = time.time()
    documentClusters = [cluster.documentsInCluster.keys()
                        for k, cluster in clustering.clusters.iteritems()
                        if len(cluster.documentsInCluster.keys()) >= stream_settings['cluster_filter_threshold']]
    return Evaluation.getEvaluationMetrics(noOfTweets, documentClusters, te - ts)
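# A hypothetical invocation of performanceForCDAAt: the path is a placeholder,
# and the settings dict is assumed to carry every key HDStreaminClustering
# expects, including the 'cluster_filter_threshold' read above.
def runCDAPerformanceCheck():
    settings = dict(experts_twitter_stream_settings)
    settings.setdefault('cluster_filter_threshold', 1)
    return performanceForCDAAt(100000, '/path/to/100k_tweets_file', **settings)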
def generateExperimentData(self, withOutPruning):
    global previousTime
    if withOutPruning:
        experts_twitter_stream_settings['cluster_filtering_method'] = emptyClusterFilteringMethod
        # 'pruing_type' (sic) is the settings key this codebase uses throughout.
        experts_twitter_stream_settings['pruing_type'] = JustifyMemoryPruning.without_memory_pruning
    else:
        experts_twitter_stream_settings['pruing_type'] = JustifyMemoryPruning.with_memory_pruning
    experts_twitter_stream_settings['cluster_analysis_method'] = JustifyMemoryPruning.modifiedClusterAnalysisMethod
    previousTime = time.time()
    HDStreaminClustering(**experts_twitter_stream_settings).cluster(
        TwitterIterators.iterateTweetsFromExperts(
            expertsDataStartTime=datetime(2011, 3, 19),
            expertsDataEndTime=datetime(2011, 3, 27)))
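# Several experiments in this file disable a pipeline stage by swapping in a
# no-op callback. A minimal sketch of those hooks; the filtering signature
# matches the staticmethod defined in ClusteringParametersEstimation below,
# while the update-dimensions signature is an assumption:
def emptyClusterFilteringMethod(hdStreamClusteringObject, currentMessageTime):
    pass  # skip cluster filtering (i.e. no memory pruning) for this run

def emptyUpdateDimensionsMethod(hdStreamClusteringObject, currentMessageTime):
    pass  # keep the initial dimensions fixed for the whole run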
def generateExperimentData(self, with_vanilla_lsh):
    global previousTime
    if with_vanilla_lsh:
        experts_twitter_stream_settings['lsh_type'] = JustifyNotUsingVanillaLSH.with_vanilla_lsh
        experts_twitter_stream_settings['phrase_decay_coefficient'] = 1.0
        experts_twitter_stream_settings['stream_decay_coefficient'] = 1.0
        experts_twitter_stream_settings['stream_cluster_decay_coefficient'] = 1.0
        experts_twitter_stream_settings['cluster_filtering_method'] = emptyClusterFilteringMethod
        experts_twitter_stream_settings['signature_type'] = 'signature_type_list'
        experts_twitter_stream_settings['dimensions'] = getLargestPrimeLesserThan(10000)
        experts_twitter_stream_settings['update_dimensions_method'] = emptyUpdateDimensionsMethod
    else:
        experts_twitter_stream_settings['lsh_type'] = JustifyNotUsingVanillaLSH.with_modified_lsh
    experts_twitter_stream_settings['cluster_analysis_method'] = JustifyNotUsingVanillaLSH.modifiedClusterAnalysisMethod
    previousTime = time.time()
    HDStreaminClustering(**experts_twitter_stream_settings).cluster(
        TwitterIterators.iterateTweetsFromExperts(
            expertsDataStartTime=datetime(2011, 3, 19),
            expertsDataEndTime=datetime(2011, 3, 27)))
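# A hypothetical driver for the A/B comparison above: one pass emulating
# vanilla LSH (decay, filtering, and dimension updates disabled) and one with
# the modified variant. The class name and wiring are assumptions based on the
# attribute lookups in the method.
def runVanillaLSHComparison():
    experiment = JustifyNotUsingVanillaLSH()
    for with_vanilla_lsh in (True, False):
        experiment.generateExperimentData(with_vanilla_lsh)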
def generateExperimentData(self, withOutDecay):
    global previousTime
    if withOutDecay:
        experts_twitter_stream_settings['decay_type'] = JustifyExponentialDecay.without_decay
        experts_twitter_stream_settings['phrase_decay_coefficient'] = 1.0
        experts_twitter_stream_settings['stream_decay_coefficient'] = 1.0
        experts_twitter_stream_settings['stream_cluster_decay_coefficient'] = 1.0
    else:
        experts_twitter_stream_settings['decay_type'] = JustifyExponentialDecay.with_decay
    experts_twitter_stream_settings['cluster_analysis_method'] = JustifyExponentialDecay.modifiedClusterAnalysisMethod
    previousTime = time.time()
    HDStreaminClustering(**experts_twitter_stream_settings).cluster(
        TwitterIterators.iterateTweetsFromExperts(
            expertsDataStartTime=datetime(2011, 4, 1),
            expertsDataEndTime=datetime(2011, 4, 8)))
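# Setting the three decay coefficients to 1.0 above turns decay off, because
# scores are assumed to be scaled multiplicatively per elapsed time unit. A
# minimal sketch of that weighting (coefficient in (0, 1]):
def decayedScore(score, decay_coefficient, elapsedTimeUnits):
    # 1.0 ** n == 1.0 for any n, so a coefficient of 1.0 means no decay.
    return score * decay_coefficient ** elapsedTimeUnits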
def generateExperimentData(self, withoutTrie):
    global previousTime
    if withoutTrie:
        experts_twitter_stream_settings['trie_type'] = JustifyTrie.with_sorted_list
        experts_twitter_stream_settings['signature_type'] = 'signature_type_list'
    else:
        experts_twitter_stream_settings['trie_type'] = JustifyTrie.with_trie
    experts_twitter_stream_settings['cluster_analysis_method'] = JustifyTrie.modifiedClusterAnalysisMethod
    experts_twitter_stream_settings['cluster_filtering_method'] = emptyClusterFilteringMethod
    previousTime = time.time()
    HDStreaminClustering(**experts_twitter_stream_settings).cluster(
        TwitterIterators.iterateTweetsFromExperts(
            expertsDataStartTime=datetime(2011, 3, 19),
            expertsDataEndTime=datetime(2011, 3, 27)))
def generateExperimentData(self):
    global previousTime
    experts_twitter_stream_settings['dimensions_performance_type'] = JustifyDimensionsEstimation.first_n_dimension
    experts_twitter_stream_settings['update_dimensions_method'] = emptyUpdateDimensionsMethod
    experts_twitter_stream_settings['cluster_analysis_method'] = JustifyDimensionsEstimation.modifiedClusterAnalysisMethod
    for dimensions in range(10**4, 21 * 10**4, 10**4):
        experts_twitter_stream_settings['dimensions'] = getLargestPrimeLesserThan(dimensions)
        previousTime = time.time()
        try:
            HDStreaminClustering(**experts_twitter_stream_settings).cluster(TwitterIterators.iterateTweetsFromExperts())
        except Exception:
            pass  # a failed run is skipped; continue with the next dimension setting
def generateStatsForHDLSHClustering(self):
    print 'HD LSH'
    def _getDocumentFromTuple((user, text)):
        # Build a term-frequency vector, skipping the first token.
        vector, words = Vector(), text.split()
        for word in words[1:]:
            if word not in vector: vector[word] = 1
            else: vector[word] += 1
        return Document(user, vector)
    self.stream_settings['convert_data_to_message_method'] = TwitterCrowdsSpecificMethods.convertTweetJSONToMessage
    self.stream_settings['cluster_analysis_method'] = emptyClusterAnalysisMethod
    # self.stream_settings['cluster_filtering_method'] = emptyClusterFilteringMethod
    self.documents = [tw[1] for tw in list(self._tweetWithTimestampIterator()) if tw[1]['text'].strip() != '']
    # Sort tweets chronologically by the parsed created_at timestamp, element 1
    # of each (tweet, datetime) pair; the original key=itemgetter(0) compared
    # the raw tweet dicts instead, which does not give time order.
    self.documents = [tw[0] for tw in sorted([(t, getDateTimeObjectFromTweetTimestamp(t['created_at'])) for t in self.documents],
                                             key=itemgetter(1))]
    clustering = HDStreaminClustering(**self.stream_settings)
    ts = time.time()
    # for tweet in self.documents: clustering.getClusterAndUpdateExistingClusters(_getDocumentFromTuple(tweet))
    # clustering.cluster([_getDocumentFromTuple(d) for d in self.documents])
    clustering.cluster(self.documents)
    te = time.time()
    documentClusters = [cluster.documentsInCluster.keys()
                        for k, cluster in clustering.clusters.iteritems()
                        if len(cluster.documentsInCluster.keys()) >= self.stream_settings['cluster_filter_threshold']]
    return self.getEvaluationMetrics(documentClusters, te - ts)
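# getDateTimeObjectFromTweetTimestamp is assumed to parse Twitter's created_at
# string; a minimal sketch of that behavior:
from datetime import datetime

def _tweetTimestampToDatetime(created_at):
    # e.g. 'Sat Mar 19 04:55:31 +0000 2011'; '+0000' is matched literally
    # since Python 2 strptime does not support %z.
    return datetime.strptime(created_at, '%a %b %d %H:%M:%S +0000 %Y')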
class ClusteringParametersEstimation():
    clusterLagDistributionId = 'cluster_lag_distribution'

    def __init__(self, **stream_settings):
        stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId] = \
            stream_settings['parameter_estimation_folder'] + ClusteringParametersEstimation.clusterLagDistributionId
        self.stream_settings = stream_settings
        self.hdsClustering = HDStreaminClustering(**self.stream_settings)

    def run(self, iterator):
        self.hdsClustering.cluster(iterator)

    @staticmethod
    def emptyClusterFilteringMethod(hdStreamClusteringObject, currentMessageTime):
        pass

    @staticmethod
    def clusterLagDistributionMethod(hdStreamClusteringObject, currentMessageTime):
        lagDistribution = defaultdict(int)
        for cluster in hdStreamClusteringObject.clusters.values():
            lag = DateTimeAirthematic.getDifferenceInTimeUnits(
                currentMessageTime, cluster.lastStreamAddedTime,
                hdStreamClusteringObject.stream_settings['time_unit_in_seconds'].seconds)
            lagDistribution[str(lag)] += 1
        print currentMessageTime, len(hdStreamClusteringObject.clusters)
        iterationData = {
            'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
            'settings': pprint.pformat(hdStreamClusteringObject.stream_settings),
            ClusteringParametersEstimation.clusterLagDistributionId: lagDistribution,
            'lag_between_streams_added_to_cluster': hdStreamClusteringObject.stream_settings['lag_between_streams_added_to_cluster']
        }
        # print hdStreamClusteringObject.stream_settings['lag_between_streams_added_to_cluster']
        FileIO.writeToFileAsJson(iterationData,
                                 hdStreamClusteringObject.stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId])

    def plotCDFClustersLagDistribution(self, returnAxisValuesOnly=True):
        '''
        This determines the time after which a cluster can be considered decayed and hence removed.
        Experts stream  [ 0.66002386  0.07035227] 0.1 82
        Houston stream  [ 0.73800037  0.05890473] 0.1 29
        458 (# of time units) Experts stream  [ 0.66002386  0.07035227] 0.2 15
        71 (# of time units)  Houston stream  [ 0.73756656  0.05883258] 0.2 3
        '''
        def calculateInActivityTimeFor(params, probabilityOfInactivity):
            return int(CurveFit.inverseOfIncreasingExponentialFunction(params, 1 - probabilityOfInactivity))
        data = list(FileIO.iterateJsonFromFile(
            self.hdsClustering.stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId]))[-1]
        total = float(sum(data['lag_between_streams_added_to_cluster'].values()))
        x = sorted(map(int, data['lag_between_streams_added_to_cluster'].keys()))
        y = getCumulativeDistribution([data['lag_between_streams_added_to_cluster'][str(i)] / total for i in x])
        exponentialCurveParams = CurveFit.getParamsAfterFittingData(x, y, CurveFit.increasingExponentialFunction, [1., 1.])
        print self.stream_settings['plot_label'], exponentialCurveParams, calculateInActivityTimeFor(exponentialCurveParams, 0.2)
        plt.plot(x, y, 'o',
                 label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (exponentialCurveParams[0], exponentialCurveParams[1]),
                 color=self.stream_settings['plot_color'])
        plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, exponentialCurveParams, x),
                 color=self.stream_settings['plot_color'], lw=2)
        plt.ylabel(r'$P\ (\ lag\ \leq\ TU\ )$')
        plt.xlabel(getLatexForString(xlabelTimeUnits))
        plt.title(getLatexForString('CDF for clusters lag distribution.'))
        plt.ylim((0, 1.2))
        plt.legend(loc=4)
        if returnAxisValuesOnly: plt.show()

    def plotPercentageOfClustersWithinALag(self, returnAxisValuesOnly=True):
        '''
        458 Experts stream [ 0.01860266  0.70639136] 15 0.874004297177
        80 Houston stream [ 0.0793181   0.47644004] 3 0.866127308876
        '''
        def calculatePercentageOfDecayedPhrasesFor(params, timeUnit):
            return 1 - CurveFit.increasingExponentialFunction(params, timeUnit)
        dataDistribution = {}
        currentTimeUnit = 0
        # file = '/mnt/chevron/kykamath/data/twitter/lsh_crowds/houston_stream/parameter_estimation/cluster_lag_distribution'
        file = self.hdsClustering.stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId]
        lines = list(FileIO.iterateJsonFromFile(file))
        numberOfTimeUnits = len(lines)
        for data in lines:
            totalClusters = float(sum(data[ClusteringParametersEstimation.clusterLagDistributionId].values()))
            tempArray = []
            for k, v in data[ClusteringParametersEstimation.clusterLagDistributionId].iteritems():
                k = int(k)
                if k not in dataDistribution: dataDistribution[k] = [0] * numberOfTimeUnits
                dataDistribution[k][currentTimeUnit] = v / totalClusters
                tempArray.append(v / totalClusters)
            currentTimeUnit += 1
        x = sorted(dataDistribution)
        print numberOfTimeUnits,
        y = getCumulativeDistribution([np.mean(dataDistribution[k]) for k in x])
        params = CurveFit.getParamsAfterFittingData(x, y, CurveFit.increasingExponentialFunction, [1., 1.])
        print self.stream_settings['plot_label'], params,
        def subPlot(id, timeUnit):
            plt.subplot(id)
            print timeUnit, calculatePercentageOfDecayedPhrasesFor(params, timeUnit)
            plt.plot(x, y, 'o',
                     label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (params[0], params[1]),
                     color=self.stream_settings['plot_color'])
            plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, params, x),
                     color=self.stream_settings['plot_color'], lw=2)
        if self.stream_settings['stream_id'] == 'experts_twitter_stream':
            subPlot(111, 15)
            plt.title(getLatexForString('Percentage of clusters within a lag'))
        else:
            subPlot(111, 3)
            plt.xlabel(getLatexForString(xlabelTimeUnits))
        plt.ylabel(r'$\%\ of\ clusters\ with\ lag\ \leq\ TU$')
        plt.legend(loc=4)
        if returnAxisValuesOnly: plt.show()

    @staticmethod
    def thresholdForDocumentToBeInCluterEstimation(stats_file, **stream_settings):
        '''
        Estimate the threshold for clusters by varying the
        threshold_for_document_to_be_in_cluster value.
        Run this on a document set of size 100K.
        '''
        for length in [i * j for i in (10**3, 10**4, 10**5) for j in range(1, 10)]:
            # for t in range(1, 16):
            for t in range(16, 21):
                stream_settings['threshold_for_document_to_be_in_cluster'] = t * 0.05
                print length, stream_settings['threshold_for_document_to_be_in_cluster']
                stats = {'streaming_lsh': KMeansTweetsFile(length, **stream_settings).generateStatsForStreamingLSHClustering(),
                         'settings': Settings.getSerialzedObject(stream_settings)}
                FileIO.writeToFileAsJson(stats, stats_file)
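# Going by the legend format string ' (%0.2fx^{%0.2f})' used in the plots
# above, the fitted "increasing exponential" is assumed to be the power law
# y = a * x ** b, and calculateInActivityTimeFor inverts it. A minimal sketch
# of the two CurveFit calls the class relies on:
def increasingExponentialFunction(params, x):
    a, b = params
    return a * x ** b

def inverseOfIncreasingExponentialFunction(params, y):
    # Solve y = a * x ** b for x, i.e. x = (y / a) ** (1 / b).
    a, b = params
    return (y / a) ** (1.0 / b)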
def generateClusters(self, iterator):
    self.stream_settings['cluster_analysis_method'] = TwitterStreamAnalysis.writeClusters
    HDStreaminClustering(**self.stream_settings).cluster(iterator)
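# Hypothetical end-to-end use of ClusteringParametersEstimation: wire the
# lag-distribution callback in as the per-iteration analysis method, run the
# stream, then plot the CDF. The settings keys shown are assumptions based on
# the reads in this file ('parameter_estimation_folder' is consumed by
# __init__, the two callbacks by HDStreaminClustering).
def estimateClusterDecayLag(stream_settings, tweetIterator):
    stream_settings['cluster_analysis_method'] = ClusteringParametersEstimation.clusterLagDistributionMethod
    stream_settings['cluster_filtering_method'] = ClusteringParametersEstimation.emptyClusterFilteringMethod
    estimator = ClusteringParametersEstimation(**stream_settings)
    estimator.run(tweetIterator)
    estimator.plotCDFClustersLagDistribution()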