def generateStatsForMRKMeansClusteringQuality():
    for i in [90000, 100000, 200000, 300000, 400000, 500000]:
        print 'Generating stats for: ', i
        tf = TweetsFile(i, **experts_twitter_stream_settings)
        FileIO.writeToFileAsJson({'mr_k_means': tf.generateStatsForKMeansMRClustering(),
                                  'settings': Settings.getSerialzedObject(tf.stream_settings)},
                                 TweetsFile.mr_stats_file)

def writeClusters(hdStreamClusteringObject, currentMessageTime):
    print '\n\n\nEntering:', currentMessageTime, len(hdStreamClusteringObject.phraseTextAndDimensionMap), len(hdStreamClusteringObject.phraseTextToPhraseObjectMap), len(hdStreamClusteringObject.clusters)
    # Serialize the current clusters, largest first.
    iterationData = {
        'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
        'clusters': map(TwitterCrowdsSpecificMethods.getClusterInMapFormat,
                        [cluster for cluster, _ in sorted(StreamCluster.iterateByAttribute(hdStreamClusteringObject.clusters.values(), 'length'),
                                                          key=itemgetter(1), reverse=True)]),
        'settings': Settings.getSerialzedObject(hdStreamClusteringObject.stream_settings)
    }
    FileIO.writeToFileAsJson(iterationData, hdStreamClusteringObject.stream_settings['lsh_clusters_folder'] + FileIO.getFileByDay(currentMessageTime))
    print 'Leaving: ', currentMessageTime, len(hdStreamClusteringObject.phraseTextAndDimensionMap), len(hdStreamClusteringObject.phraseTextToPhraseObjectMap), len(hdStreamClusteringObject.clusters)

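writeClusters appends one iteration record per call to a per-day file under lsh_clusters_folder. A minimal reader sketch, assuming FileIO.writeToFileAsJson writes one JSON object per line (consistent with it being called repeatedly on the same path); iterateClusterSnapshots is a hypothetical helper, not part of the original code:

import json

def iterateClusterSnapshots(day_file):
    # One JSON object per line is an assumption about FileIO's output format.
    for line in open(day_file):
        data = json.loads(line)
        yield data['time_stamp'], data['clusters']
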
def generateStatsForDefaultStreamSettings():
    for i in [10**3, 10**4, 10**5]:
        for j in range(1, 10):
            print 'Generating stats for: ', i * j
            tf = TweetsFile(i * j, **default_experts_twitter_stream_settings)
            FileIO.writeToFileAsJson({'streaming_lsh': tf.generateStatsForStreamingLSHClustering(),
                                      'settings': Settings.getSerialzedObject(tf.stream_settings)},
                                     TweetsFile.default_stats_file)

def generateStatsForQualityComparisonWithSSA():
    # for length in [i * j for i in (10**3, 10**4, 10**5) for j in range(1, 10)]:
    for length in [1000000]:
        print 'Generating stats for: ', length
        tf = TweetsFile(length, **experts_twitter_stream_settings)
        # stats = {'ssa': tf.getStatsForSSA(), 'ssa_mr': tf.getStatsForSSAMR(), 'streaming_lsh': KMeansTweetsFile(length, **experts_twitter_stream_settings).generateStatsForStreamingLSHClustering(), 'settings': Settings.getSerialzedObject(tf.stream_settings)}
        stats = {'ssa_mr': tf.getStatsForSSAMR(), 'settings': Settings.getSerialzedObject(tf.stream_settings)}
        FileIO.writeToFileAsJson(stats, TweetsFile.stats_file)

def generateStatsForOptimized():
    # for i in [10**3, 10**4, 10**5]:
    # for i in [10**6]:
    #     for j in range(1, 10):
    for length in [1000000, 1100000, 1200000]:
        print 'Generating stats for: ', length
        tf = TweetsFile(length, **experts_twitter_stream_settings)
        FileIO.writeToFileAsJson({'streaming_lsh': tf.generateStatsForHDLSHClustering(),
                                  'settings': Settings.getSerialzedObject(tf.stream_settings)},
                                 hd_clustering_performance_folder + 'cda')

def generateStatsForUnOptimized():
    # for i in [10**3, 10**4, 10**5]:
    # for i in [10**6]:
    #     for j in range(1, 10):
    for length in [1000000, 1100000, 1200000]:
        print 'Generating stats for: ', length
        # default_experts_twitter_stream_settings['cluster_filtering_method'] = emptyClusterFilteringMethod
        tf = TweetsFile(length, **default_experts_twitter_stream_settings)
        performance = tf.generateStatsForHDLSHClustering()
        FileIO.writeToFileAsJson({'streaming_lsh': performance,
                                  'settings': Settings.getSerialzedObject(tf.stream_settings)},
                                 hd_clustering_performance_folder + 'cda_unopt')
        # Drop the bulky cluster list before printing the run summary.
        del performance['clusters']
        print performance

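generateStatsForOptimized and generateStatsForUnOptimized write records of the same shape ({'streaming_lsh': ..., 'settings': ...}) to the sibling files 'cda' and 'cda_unopt'. A rough comparison sketch, again assuming FileIO.writeToFileAsJson appends one JSON object per line; loadRecords is a hypothetical helper:

import json

def loadRecords(path):
    # Hypothetical loader: one JSON object per line is an assumption.
    return [json.loads(line) for line in open(path)]

optimized = loadRecords(hd_clustering_performance_folder + 'cda')
unoptimized = loadRecords(hd_clustering_performance_folder + 'cda_unopt')
for opt, unopt in zip(optimized, unoptimized):
    # Inspect which metrics both runs expose before comparing them.
    print sorted(opt['streaming_lsh'].keys()), sorted(unopt['streaming_lsh'].keys())
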
def thresholdForDocumentToBeInCluterEstimation(stats_file, **stream_settings):
    ''' Estimate the threshold for clusters by varying the
    threshold_for_document_to_be_in_cluster value. Run this on a document set
    of size 100K.
    '''
    for length in [i * j for i in (10**3, 10**4, 10**5) for j in range(1, 10)]:
        # for t in range(1, 16):
        for t in range(16, 21):
            stream_settings['threshold_for_document_to_be_in_cluster'] = t * 0.05
            print length, stream_settings['threshold_for_document_to_be_in_cluster']
            stats = {'streaming_lsh': KMeansTweetsFile(length, **stream_settings).generateStatsForStreamingLSHClustering(),
                     'settings': Settings.getSerialzedObject(stream_settings)}
            FileIO.writeToFileAsJson(stats, stats_file)

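The sweep above covers thresholds 0.80 through 1.00 in steps of 0.05 (t * 0.05 for t in 16..20, up to float rounding); the commented-out range(1, 16) covered the lower band, 0.05 through 0.75. A hypothetical invocation on the experts stream, where 'threshold_estimation_stats' is an assumed output path, not from the original code:

thresholdForDocumentToBeInCluterEstimation('threshold_estimation_stats',
                                           **experts_twitter_stream_settings)
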
def generateStatsForCDA():
    for length, fileName in GenerateStats.lengthAndFileIterator():
        print 'Generating stats for: ', length
        performance = GenerateStats.performanceForCDAAt(length, fileName, **experts_twitter_stream_settings)
        stats = {CDA: performance, 'settings': Settings.getSerialzedObject(experts_twitter_stream_settings)}
        FileIO.writeToFileAsJson(stats, getPerformanceFile(CDA))

# General twitter stream settings.
time_unit_in_seconds = timedelta(seconds=5 * 60)
twitter_stream_settings = Settings(
    stream_id='twitter_stream',  # Unique id to represent the stream.
    dimensions=0,  # Maximum number of dimensions to consider at a time. Make sure this is prime. This is also the number of top phrases that will be considered for crowd discovery.
    min_phrase_length=2,  # Minimum length of phrases. For example, min_phrase_length=1 and max_phrase_length=1 will result in only unigrams as features.
    max_phrase_length=2,  # Maximum length of phrases. For example, min_phrase_length=1 and max_phrase_length=2 will result in both unigrams and bigrams as features.

    phrase_decay_coefficient=0.75,  # The rate at which a phrase decays.
    stream_decay_coefficient=0.75,  # The rate at which the stream decays.
    stream_cluster_decay_coefficient=0.5,  # The rate at which a cluster decays.

    time_unit_in_seconds=time_unit_in_seconds,  # Used to determine the length of unit time intervals.

    # dimension_update_frequency_in_seconds=timedelta(seconds=15*60)
    dimension_update_frequency_in_seconds=None,  # Every these many seconds, old phrases are pruned and new dimensions are created.
    # max_phrase_inactivity_time_in_seconds=timedelta(seconds=30*60)
    max_phrase_inactivity_time_in_seconds=None,  # Time after which a phrase is considered old and need not be tracked.

    cluster_analysis_frequency_in_seconds=time_unit_in_seconds * 3,  # Every these many seconds the current clusters are analyzed.
    cluster_filtering_frequency_in_seconds=time_unit_in_seconds * 3,  # Every these many seconds the current clusters are filtered.
    cluster_inactivity_time_in_seconds=None,  # Clusters that have not added users within this time are removed.
    # Cluster pruning properties.
    cluster_filter_attribute='length',  # The attribute on which stream clusters are pruned: 'length' => size of the cluster; 'score' => streaming cluster score.
    cluster_filter_threshold=0,  # Threshold for the cluster filter; clusters with attribute values below this are pruned.
    cluster_merging_jaccard_distance_threshold=0.75  # Clusters are merged if their Jaccard similarity is above this value.
)
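
Since the snippets above both unpack these settings with ** and mutate them by key, Settings evidently supports dict-style access. A minimal sketch of deriving run-specific settings without mutating the shared object; the deepcopy step is an added precaution, not part of the original code:

import copy

run_settings = copy.deepcopy(twitter_stream_settings)  # assumes Settings is deep-copyable
run_settings['cluster_filter_threshold'] = 5           # prune clusters with fewer than 5 items
run_settings['cluster_merging_jaccard_distance_threshold'] = 0.9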

# Streaming LSH clustering specific settings.
streaming_lsh_settings = Settings(