def generateExperimentData2(self, fixedType):
    global previousTime
    experts_twitter_stream_settings['cluster_analysis_method'] = JustifyDimensionsEstimation.modifiedClusterAnalysisMethod2
    if fixedType:
        experts_twitter_stream_settings['dimensions_performance_type'] = JustifyDimensionsEstimation.first_n_dimension
        # experts_twitter_stream_settings['update_dimensions_method'] = emptyUpdateDimensionsMethod
        # Decay coefficients of 1.0 disable decay for these runs.
        experts_twitter_stream_settings['phrase_decay_coefficient'] = 1.0
        experts_twitter_stream_settings['stream_decay_coefficient'] = 1.0
        experts_twitter_stream_settings['stream_cluster_decay_coefficient'] = 1.0
        # Sweep dimensions from 10K to 200K in 10K steps, snapping each
        # value down to the nearest prime.
        for dimensions in range(10**4, 21 * 10**4, 10**4):
            experts_twitter_stream_settings['dimensions'] = getLargestPrimeLesserThan(dimensions)
            previousTime = time.time()
            HDStreaminClustering(**experts_twitter_stream_settings).cluster(
                TwitterIterators.iterateTweetsFromExperts(
                    expertsDataStartTime=datetime(2011, 3, 19),
                    expertsDataEndTime=datetime(2011, 3, 20, 5)))
    else:
        experts_twitter_stream_settings['dimensions_performance_type'] = JustifyDimensionsEstimation.top_n_dimension
        previousTime = time.time()
        HDStreaminClustering(**experts_twitter_stream_settings).cluster(
            TwitterIterators.iterateTweetsFromExperts(
                expertsDataStartTime=datetime(2011, 3, 19),
                expertsDataEndTime=datetime(2011, 3, 20, 5)))
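# Every dimension setting above is snapped to a prime via
# getLargestPrimeLesserThan, which is defined elsewhere in this codebase
# (prime table sizes are a common choice for hashing-based structures).
# A minimal sketch of what such a helper might look like, assuming plain
# trial division; the library's own version may differ:
def getLargestPrimeLesserThan(n):
    def isPrime(k):
        if k < 2:
            return False
        d = 2
        while d * d <= k:
            if k % d == 0:
                return False
            d += 1
        return True
    # Scan downward from n - 1 and return the first prime found.
    for candidate in range(n - 1, 1, -1):
        if isPrime(candidate):
            return candidate
    return None
# e.g. getLargestPrimeLesserThan(10000) -> 9973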
def generate(self):
    # Dump the first self.length expert tweets to self.fileName as JSON,
    # then gzip the file in place.
    i = 0
    for tweet in TwitterIterators.iterateTweetsFromExperts():
        FileIO.writeToFileAsJson(tweet, self.fileName)
        i += 1
        if i == self.length:
            break
    os.system('gzip %s' % self.fileName)
def generateExperimentData(self, withOutPruning):
    global previousTime
    if withOutPruning:
        # Disable filtering so no cluster is ever pruned from memory.
        experts_twitter_stream_settings['cluster_filtering_method'] = emptyClusterFilteringMethod
        experts_twitter_stream_settings['pruing_type'] = JustifyMemoryPruning.without_memory_pruning
    else:
        experts_twitter_stream_settings['pruing_type'] = JustifyMemoryPruning.with_memory_pruning
    experts_twitter_stream_settings['cluster_analysis_method'] = JustifyMemoryPruning.modifiedClusterAnalysisMethod
    previousTime = time.time()
    HDStreaminClustering(**experts_twitter_stream_settings).cluster(
        TwitterIterators.iterateTweetsFromExperts(
            expertsDataStartTime=datetime(2011, 3, 19), expertsDataEndTime=datetime(2011, 3, 27)))
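# emptyClusterFilteringMethod above is the hook the library uses to turn
# filtering off; its exact signature lives elsewhere in the codebase. A
# plausible no-op stand-in, sketched with a catch-all signature as an
# assumption:
def emptyClusterFilteringMethod(*args, **kwargs):
    # Assumed no-op: with filtering disabled, nothing is ever pruned,
    # which is what the without-pruning arm of the experiment measures.
    pass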
def generateExperimentData(self, with_vanilla_lsh):
    global previousTime
    if with_vanilla_lsh:
        experts_twitter_stream_settings['lsh_type'] = JustifyNotUsingVanillaLSH.with_vanilla_lsh
        # Vanilla-LSH baseline: no decay, no cluster filtering, a fixed
        # prime number of dimensions, and no dimension updates.
        experts_twitter_stream_settings['phrase_decay_coefficient'] = 1.0
        experts_twitter_stream_settings['stream_decay_coefficient'] = 1.0
        experts_twitter_stream_settings['stream_cluster_decay_coefficient'] = 1.0
        experts_twitter_stream_settings['cluster_filtering_method'] = emptyClusterFilteringMethod
        experts_twitter_stream_settings['signature_type'] = 'signature_type_list'
        experts_twitter_stream_settings['dimensions'] = getLargestPrimeLesserThan(10000)
        experts_twitter_stream_settings['update_dimensions_method'] = emptyUpdateDimensionsMethod
    else:
        experts_twitter_stream_settings['lsh_type'] = JustifyNotUsingVanillaLSH.with_modified_lsh
    experts_twitter_stream_settings['cluster_analysis_method'] = JustifyNotUsingVanillaLSH.modifiedClusterAnalysisMethod
    previousTime = time.time()
    HDStreaminClustering(**experts_twitter_stream_settings).cluster(
        TwitterIterators.iterateTweetsFromExperts(
            expertsDataStartTime=datetime(2011, 3, 19),
            expertsDataEndTime=datetime(2011, 3, 27)))
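# The vanilla-LSH arm fixes a plain locality-sensitive hashing scheme. As
# a toy illustration of the random-hyperplane idea only (not this
# library's implementation), each vector gets one signature bit per
# hyperplane, and nearby vectors tend to share bits:
import random

random.seed(0)
dimensions, signature_length = 5, 3
hyperplanes = [[random.uniform(-1, 1) for _ in range(dimensions)]
               for _ in range(signature_length)]

def signature(vector):
    # Bit i records which side of hyperplane i the vector falls on.
    return ''.join('1' if sum(h_i * v_i for h_i, v_i in zip(h, vector)) >= 0 else '0'
                   for h in hyperplanes)

print signature([0.5, 0.0, 1.0, 0.2, 0.0])  # prints a 3-bit signature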
def performanceForCDAAt(noOfTweets, fileName, **stream_settings):
    # Cluster the tweets in fileName, time the run, and evaluate only the
    # clusters whose size passes the configured threshold.
    clustering = HDStreaminClustering(**stream_settings)
    ts = time.time()
    clustering.cluster(TwitterIterators.iterateFromFile(fileName))
    te = time.time()
    documentClusters = [cluster.documentsInCluster.keys()
                        for k, cluster in clustering.clusters.iteritems()
                        if len(cluster.documentsInCluster) >= stream_settings['cluster_filter_threshold']]
    return Evaluation.getEvaluationMetrics(noOfTweets, documentClusters, te - ts)
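# A possible driver for this benchmark; the file layout and tweet counts
# below are illustrative assumptions, not the original configuration:
for noOfTweets in [10**5, 5 * 10**5, 10**6]:
    fileName = clustering_quality_experts_folder + 'data/%s.gz' % noOfTweets
    print performanceForCDAAt(noOfTweets, fileName,
                              **experts_twitter_stream_settings)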
def generateExperimentData(self, withOutDecay):
    global previousTime
    if withOutDecay:
        experts_twitter_stream_settings['decay_type'] = JustifyExponentialDecay.without_decay
        # Coefficients of 1.0 make every decay step a no-op.
        experts_twitter_stream_settings['phrase_decay_coefficient'] = 1.0
        experts_twitter_stream_settings['stream_decay_coefficient'] = 1.0
        experts_twitter_stream_settings['stream_cluster_decay_coefficient'] = 1.0
    else:
        experts_twitter_stream_settings['decay_type'] = JustifyExponentialDecay.with_decay
    experts_twitter_stream_settings['cluster_analysis_method'] = JustifyExponentialDecay.modifiedClusterAnalysisMethod
    previousTime = time.time()
    HDStreaminClustering(**experts_twitter_stream_settings).cluster(
        TwitterIterators.iterateTweetsFromExperts(
            expertsDataStartTime=datetime(2011, 4, 1), expertsDataEndTime=datetime(2011, 4, 8)))
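# Assuming the usual exponential form, where a weight is scaled by the
# coefficient once per elapsed time unit, the arithmetic behind the 1.0
# settings above looks like this (illustration only):
coefficient, weight, elapsed = 0.5, 8.0, 2
decayed = coefficient ** elapsed * weight   # 0.25 * 8.0 = 2.0
unchanged = 1.0 ** elapsed * weight         # 8.0: coefficient 1.0 is a no-op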
def _tweetIterator(self):
    # Collapse each user's tweets into one space-separated phrase document.
    userMap = {}
    for tweet in TwitterIterators.iterateFromFile(self.fileName + '.gz'):
        user = tweet['user']['screen_name']
        phrases = [phrase.replace(' ', unique_string)
                   for phrase in getPhrases(getWordsFromRawEnglishMessage(tweet['text']),
                                            self.stream_settings['min_phrase_length'],
                                            self.stream_settings['max_phrase_length'])]
        if phrases:
            if user not in userMap:
                userMap[user] = ' '.join(phrases)
            else:
                userMap[user] += ' ' + ' '.join(phrases)
    return userMap.iteritems()
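# phrase.replace(' ', unique_string) packs a multi-word phrase into one
# whitespace-free token, so the space-joined user document can later be
# tokenized without splitting phrases apart. A toy illustration with a
# made-up sentinel (the real unique_string is defined in the library):
sentinel = '@#@'
token = 'world cup'.replace(' ', sentinel)              # 'world@#@cup'
document = ' '.join([token, 'football'])
assert document.split() == ['world@#@cup', 'football']  # phrase kept whole
assert token.replace(sentinel, ' ') == 'world cup'      # and recoverable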
def generateExperimentData(self, withoutTrie):
    global previousTime
    if withoutTrie:
        # Replace the trie with a sorted list of signatures.
        experts_twitter_stream_settings['trie_type'] = JustifyTrie.with_sorted_list
        experts_twitter_stream_settings['signature_type'] = 'signature_type_list'
    else:
        experts_twitter_stream_settings['trie_type'] = JustifyTrie.with_trie
    experts_twitter_stream_settings['cluster_analysis_method'] = JustifyTrie.modifiedClusterAnalysisMethod
    experts_twitter_stream_settings['cluster_filtering_method'] = emptyClusterFilteringMethod
    previousTime = time.time()
    HDStreaminClustering(**experts_twitter_stream_settings).cluster(
        TwitterIterators.iterateTweetsFromExperts(
            expertsDataStartTime=datetime(2011, 3, 19), expertsDataEndTime=datetime(2011, 3, 27)))
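# This experiment contrasts a trie against a sorted list for signature
# lookups. A toy version of the two lookup ideas (illustrative only; the
# library's structures are more involved):
import bisect

signatures = sorted(['0010', '0110', '1011'])

def inSortedList(sig):
    # Binary search: O(log n) full-signature comparisons.
    i = bisect.bisect_left(signatures, sig)
    return i < len(signatures) and signatures[i] == sig

trie = {}
for sig in signatures:
    node = trie
    for bit in sig:
        node = node.setdefault(bit, {})
    node['$'] = True  # end-of-signature marker

def inTrie(sig):
    # One node per bit: cost tracks signature length, not collection size.
    node = trie
    for bit in sig:
        if bit not in node:
            return False
        node = node[bit]
    return '$' in node

assert inSortedList('0110') and inTrie('0110')
assert not inSortedList('1111') and not inTrie('1111')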
def generateDocsByLength():
    for length in [1500000]:
        # for length in [150]:  # smaller value for a quick test run
        fileName = clustering_quality_experts_folder + 'data/%s' % str(length)
        print fileName
        i = 0
        for tweet in TwitterIterators.iterateTweetsFromExperts():
            FileIO.writeToFileAsJson(tweet, fileName)
            i += 1
            if i == length:
                break
def _tweetWithTimestampIterator(self):
    # Like _tweetIterator, but also keeps the id and created_at of each
    # user's most recent tweet alongside the accumulated phrase text.
    userMap = defaultdict(dict)
    for tweet in TwitterIterators.iterateFromFile(self.fileName + '.gz'):
        user = tweet['user']['screen_name']
        userMap[user]['user'] = {'screen_name': user}
        userMap[user]['id'] = tweet['id']
        userMap[user]['created_at'] = tweet['created_at']
        if 'text' not in userMap[user]:
            userMap[user]['text'] = ' '
        phrases = [phrase.replace(' ', unique_string)
                   for phrase in getPhrases(getWordsFromRawEnglishMessage(tweet['text']),
                                            self.stream_settings['min_phrase_length'],
                                            self.stream_settings['max_phrase_length'])]
        if phrases:
            userMap[user]['text'] += ' ' + ' '.join(phrases)
    return userMap.iteritems()
def generateExperimentData(self):
    global previousTime
    experts_twitter_stream_settings['dimensions_performance_type'] = JustifyDimensionsEstimation.first_n_dimension
    experts_twitter_stream_settings['update_dimensions_method'] = emptyUpdateDimensionsMethod
    experts_twitter_stream_settings['cluster_analysis_method'] = JustifyDimensionsEstimation.modifiedClusterAnalysisMethod
    for dimensions in range(10**4, 21 * 10**4, 10**4):
        experts_twitter_stream_settings['dimensions'] = getLargestPrimeLesserThan(dimensions)
        previousTime = time.time()
        try:
            HDStreaminClustering(**experts_twitter_stream_settings).cluster(TwitterIterators.iterateTweetsFromExperts())
        except Exception:
            # Continue the sweep even if one dimension setting fails.
            pass
def generateClusters():
    TwitterStreamAnalysis(**experts_twitter_stream_settings).generateClusters(
        TwitterIterators.iterateTweetsFromExperts())