def generateExperimentData2(self, fixedType):
    global previousTime
    experts_twitter_stream_settings['cluster_analysis_method'] = \
        JustifyDimensionsEstimation.modifiedClusterAnalysisMethod2
    if fixedType:
        # Fixed-dimension runs: disable decay so only the dimension
        # count varies across iterations of the sweep below.
        experts_twitter_stream_settings['dimensions_performance_type'] = \
            JustifyDimensionsEstimation.first_n_dimension
#        experts_twitter_stream_settings['update_dimensions_method'] = emptyUpdateDimensionsMethod
        experts_twitter_stream_settings['phrase_decay_coefficient'] = 1.0
        experts_twitter_stream_settings['stream_decay_coefficient'] = 1.0
        experts_twitter_stream_settings['stream_cluster_decay_coefficient'] = 1.0
        for dimensions in range(10 ** 4, 21 * 10 ** 4, 10 ** 4):
            experts_twitter_stream_settings['dimensions'] = getLargestPrimeLesserThan(dimensions)
            previousTime = time.time()
            HDStreaminClustering(**experts_twitter_stream_settings).cluster(
                TwitterIterators.iterateTweetsFromExperts(
                    expertsDataStartTime=datetime(2011, 3, 19),
                    expertsDataEndTime=datetime(2011, 3, 20, 5)))
    else:
        experts_twitter_stream_settings['dimensions_performance_type'] = \
            JustifyDimensionsEstimation.top_n_dimension
        previousTime = time.time()
        HDStreaminClustering(**experts_twitter_stream_settings).cluster(
            TwitterIterators.iterateTweetsFromExperts(
                expertsDataStartTime=datetime(2011, 3, 19),
                expertsDataEndTime=datetime(2011, 3, 20, 5)))
def generate(self):
    i = 0
    for tweet in TwitterIterators.iterateTweetsFromExperts():
        FileIO.writeToFileAsJson(tweet, self.fileName)
        i += 1
        if i == self.length:
            break
    os.system('gzip %s' % self.fileName)
def generateExperimentData(self, withOutPruning):
    global previousTime
    if withOutPruning:
        # Disable cluster filtering so no clusters are ever pruned from memory.
        experts_twitter_stream_settings['cluster_filtering_method'] = emptyClusterFilteringMethod
        # 'pruing_type' (sic) is the settings key this codebase uses.
        experts_twitter_stream_settings['pruing_type'] = JustifyMemoryPruning.without_memory_pruning
    else:
        experts_twitter_stream_settings['pruing_type'] = JustifyMemoryPruning.with_memory_pruning
    experts_twitter_stream_settings['cluster_analysis_method'] = JustifyMemoryPruning.modifiedClusterAnalysisMethod
    previousTime = time.time()
    HDStreaminClustering(**experts_twitter_stream_settings).cluster(
        TwitterIterators.iterateTweetsFromExperts(
            expertsDataStartTime=datetime(2011, 3, 19),
            expertsDataEndTime=datetime(2011, 3, 27)))
def generateExperimentData(self, with_vanilla_lsh):
    global previousTime
    if with_vanilla_lsh:
        # Vanilla-LSH baseline: no decay, no cluster filtering, a fixed
        # prime dimension count, and no dimension updates.
        experts_twitter_stream_settings['lsh_type'] = JustifyNotUsingVanillaLSH.with_vanilla_lsh
        experts_twitter_stream_settings['phrase_decay_coefficient'] = 1.0
        experts_twitter_stream_settings['stream_decay_coefficient'] = 1.0
        experts_twitter_stream_settings['stream_cluster_decay_coefficient'] = 1.0
        experts_twitter_stream_settings['cluster_filtering_method'] = emptyClusterFilteringMethod
        experts_twitter_stream_settings['signature_type'] = 'signature_type_list'
        experts_twitter_stream_settings['dimensions'] = getLargestPrimeLesserThan(10000)
        experts_twitter_stream_settings['update_dimensions_method'] = emptyUpdateDimensionsMethod
    else:
        experts_twitter_stream_settings['lsh_type'] = JustifyNotUsingVanillaLSH.with_modified_lsh
    experts_twitter_stream_settings['cluster_analysis_method'] = JustifyNotUsingVanillaLSH.modifiedClusterAnalysisMethod
    previousTime = time.time()
    HDStreaminClustering(**experts_twitter_stream_settings).cluster(
        TwitterIterators.iterateTweetsFromExperts(
            expertsDataStartTime=datetime(2011, 3, 19),
            expertsDataEndTime=datetime(2011, 3, 27)))
def performanceForCDAAt(noOfTweets, fileName, **stream_settings):
    clustering = HDStreaminClustering(**stream_settings)
    ts = time.time()
    clustering.cluster(TwitterIterators.iterateFromFile(fileName))
    te = time.time()
    # Keep only clusters whose document count meets the filter threshold.
    documentClusters = [cluster.documentsInCluster.keys()
                        for k, cluster in clustering.clusters.iteritems()
                        if len(cluster.documentsInCluster.keys()) >= stream_settings['cluster_filter_threshold']]
    return Evaluation.getEvaluationMetrics(noOfTweets, documentClusters, te - ts)
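# A minimal, hypothetical driver for performanceForCDAAt, sketched here for
# illustration only: the tweet counts, the result path, and the assumption
# that the gzipped input files come from generateDocsByLength are mine, not
# the original codebase's.
def runCDAPerformance():
    for length in [10 ** 5, 5 * 10 ** 5, 10 ** 6]:  # illustrative sizes
        fileName = clustering_quality_experts_folder + 'data/%s.gz' % length
        metrics = performanceForCDAAt(length, fileName, **experts_twitter_stream_settings)
        # Append one JSON line of metrics per run (hypothetical output file).
        FileIO.writeToFileAsJson(metrics, 'results/cda_performance.json')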
def generateExperimentData(self, withOutDecay):
    global previousTime
    if withOutDecay:
        experts_twitter_stream_settings['decay_type'] = JustifyExponentialDecay.without_decay
        # Setting all decay coefficients to 1.0 disables exponential decay.
        experts_twitter_stream_settings['phrase_decay_coefficient'] = 1.0
        experts_twitter_stream_settings['stream_decay_coefficient'] = 1.0
        experts_twitter_stream_settings['stream_cluster_decay_coefficient'] = 1.0
    else:
        experts_twitter_stream_settings['decay_type'] = JustifyExponentialDecay.with_decay
    experts_twitter_stream_settings['cluster_analysis_method'] = JustifyExponentialDecay.modifiedClusterAnalysisMethod
    previousTime = time.time()
    HDStreaminClustering(**experts_twitter_stream_settings).cluster(
        TwitterIterators.iterateTweetsFromExperts(
            expertsDataStartTime=datetime(2011, 4, 1),
            expertsDataEndTime=datetime(2011, 4, 8)))
def _tweetIterator(self):
    # Aggregate each user's phrases into a single space-separated document.
    userMap = {}
    for tweet in TwitterIterators.iterateFromFile(self.fileName + '.gz'):
        user = tweet['user']['screen_name']
        phrases = [phrase.replace(' ', unique_string)
                   for phrase in getPhrases(getWordsFromRawEnglishMessage(tweet['text']),
                                            self.stream_settings['min_phrase_length'],
                                            self.stream_settings['max_phrase_length'])]
        if phrases:
            if user not in userMap:
                userMap[user] = ' '.join(phrases)
            else:
                userMap[user] += ' ' + ' '.join(phrases)
    return userMap.iteritems()
def generateExperimentData(self, withoutTrie):
    global previousTime
    if withoutTrie:
        # Baseline: replace the trie with a sorted list of signatures.
        experts_twitter_stream_settings['trie_type'] = JustifyTrie.with_sorted_list
        experts_twitter_stream_settings['signature_type'] = 'signature_type_list'
    else:
        experts_twitter_stream_settings['trie_type'] = JustifyTrie.with_trie
    experts_twitter_stream_settings['cluster_analysis_method'] = JustifyTrie.modifiedClusterAnalysisMethod
    experts_twitter_stream_settings['cluster_filtering_method'] = emptyClusterFilteringMethod
    previousTime = time.time()
    HDStreaminClustering(**experts_twitter_stream_settings).cluster(
        TwitterIterators.iterateTweetsFromExperts(
            expertsDataStartTime=datetime(2011, 3, 19),
            expertsDataEndTime=datetime(2011, 3, 27)))
def generateDocsByLength():
    for length in [1500000]:
#    for length in [150]:
        fileName = clustering_quality_experts_folder + 'data/%s' % str(length)
        print fileName
        i = 0
        for tweet in TwitterIterators.iterateTweetsFromExperts():
            FileIO.writeToFileAsJson(tweet, fileName)
            i += 1
            if i == length:
                break
def _tweetWithTimestampIterator(self):
    # Like _tweetIterator, but also keeps the id and created_at of the
    # user's latest tweet alongside the aggregated phrase text.
    userMap = defaultdict(dict)
    for tweet in TwitterIterators.iterateFromFile(self.fileName + '.gz'):
        user = tweet['user']['screen_name']
        userMap[user]['user'] = {'screen_name': user}
        userMap[user]['id'] = tweet['id']
        userMap[user]['created_at'] = tweet['created_at']
        if 'text' not in userMap[user]:
            userMap[user]['text'] = ' '
        phrases = [phrase.replace(' ', unique_string)
                   for phrase in getPhrases(getWordsFromRawEnglishMessage(tweet['text']),
                                            self.stream_settings['min_phrase_length'],
                                            self.stream_settings['max_phrase_length'])]
        if phrases:
            userMap[user]['text'] += ' ' + ' '.join(phrases)
    return userMap.iteritems()
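# Tiny illustration of the phrase encoding used by the two iterators above:
# multi-word phrases become single tokens by replacing their internal spaces
# with unique_string, then all of a user's phrases are joined into one
# document. The values below are made up; '@#@' merely stands in for
# unique_string and the list stands in for getPhrases() output.
_demo_phrases = ['new york', 'york city', 'city']
print ' '.join(phrase.replace(' ', '@#@') for phrase in _demo_phrases)
# -> new@#@york york@#@city city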
def generateExperimentData(self):
    global previousTime
    experts_twitter_stream_settings['dimensions_performance_type'] = JustifyDimensionsEstimation.first_n_dimension
    experts_twitter_stream_settings['update_dimensions_method'] = emptyUpdateDimensionsMethod
    experts_twitter_stream_settings['cluster_analysis_method'] = JustifyDimensionsEstimation.modifiedClusterAnalysisMethod
    # Sweep the dimension count from 10K to 200K in steps of 10K.
    for dimensions in range(10 ** 4, 21 * 10 ** 4, 10 ** 4):
        experts_twitter_stream_settings['dimensions'] = getLargestPrimeLesserThan(dimensions)
        previousTime = time.time()
        try:
            HDStreaminClustering(**experts_twitter_stream_settings).cluster(TwitterIterators.iterateTweetsFromExperts())
        except Exception:
            # A run may abort mid-stream; continue with the next dimension setting.
            pass
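# The experiments above rely on getLargestPrimeLesserThan to pick a prime
# dimension count (a prime modulus spreads hashed phrase ids more evenly).
# A minimal sketch of such a helper, assuming plain trial division; the
# original implementation may differ, so this stand-in uses its own name.
def _largestPrimeLesserThan(n):
    def _isPrime(k):
        if k < 2:
            return False
        for d in range(2, int(k ** 0.5) + 1):
            if k % d == 0:
                return False
        return True
    # Scan downward from n - 1 to the first prime found.
    for candidate in range(n - 1, 1, -1):
        if _isPrime(candidate):
            return candidate
    return None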
def generateClusters():
    TwitterStreamAnalysis(**experts_twitter_stream_settings).generateClusters(
        TwitterIterators.iterateTweetsFromExperts())