def performanceForCDAAt(noOfTweets, fileName, **stream_settings):
    """Cluster the tweets stored in ``fileName`` and evaluate the result.

    Only the ``cluster`` call is timed; constructing the clustering object
    and filtering the resulting clusters are outside the measured interval.

    Args:
        noOfTweets: Forwarded unchanged to
            ``Evaluation.getEvaluationMetrics``.
        fileName: Path handed to ``TwitterIterators.iterateFromFile``.
        **stream_settings: Keyword settings forwarded to
            ``HDStreaminClustering``.  ``cluster_filter_threshold`` is also
            read here: the minimum number of documents a cluster must hold
            to be passed on for evaluation.

    Returns:
        The result of ``Evaluation.getEvaluationMetrics`` called with
        ``noOfTweets``, the document keys of every retained cluster and the
        time (``time.time()`` difference) spent in ``cluster``.
    """
    clustering = HDStreaminClustering(**stream_settings)

    startedAt = time.time()
    clustering.cluster(TwitterIterators.iterateFromFile(fileName))
    finishedAt = time.time()

    documentClusters = []
    for _clusterId, cluster in clustering.clusters.iteritems():
        # The threshold is looked up per cluster, so a missing setting only
        # raises when there is at least one cluster to filter.
        if len(cluster.documentsInCluster.keys()) >= stream_settings['cluster_filter_threshold']:
            documentClusters.append(cluster.documentsInCluster.keys())

    return Evaluation.getEvaluationMetrics(
        noOfTweets, documentClusters, finishedAt - startedAt)
def _tweetIterator(self):
    """Group the phrases of every tweet in ``self.fileName + '.gz'`` by user.

    Each tweet's text is passed through ``getWordsFromRawEnglishMessage``
    and ``getPhrases`` (bounded by the ``min_phrase_length`` and
    ``max_phrase_length`` stream settings).  Spaces inside a phrase are
    replaced by ``unique_string`` -- presumably so that every phrase stays a
    single token once the phrases are space-joined; confirm against the
    consumer.  Tweets that yield no phrases are skipped, so a user whose
    tweets never yield a phrase does not appear in the result.

    Returns:
        ``dict.iteritems()`` over ``(screen_name, text)`` pairs, where
        ``text`` is all of the user's phrases joined by single spaces in the
        order the tweets were read.
    """
    userMap = {}
    for tweet in TwitterIterators.iterateFromFile(self.fileName + '.gz'):
        user = tweet['user']['screen_name']
        phrases = [
            phrase.replace(' ', unique_string)
            for phrase in getPhrases(
                getWordsFromRawEnglishMessage(tweet['text']),
                self.stream_settings['min_phrase_length'],
                self.stream_settings['max_phrase_length'])
        ]
        if phrases:
            # Accumulate phrases in a list instead of growing a string with
            # ``+=``: repeated concatenation re-copies the whole text for
            # every tweet of a prolific user.
            if user in userMap:
                userMap[user].extend(phrases)
            else:
                userMap[user] = phrases

    # Join once per user.  Values are replaced in place (no keys are added
    # or removed), so the dict's iteration order is exactly what it would
    # have been with string accumulation.
    for user in userMap:
        userMap[user] = ' '.join(userMap[user])
    return userMap.iteritems()
def _tweetWithTimestampIterator(self):
    """Build one tweet-like record per user from ``self.fileName + '.gz'``.

    Every tweet read updates its author's record:

    * ``'user'``       -> ``{'screen_name': <screen name>}``
    * ``'id'``         -> id of the most recently read tweet of that user
    * ``'created_at'`` -> ``created_at`` of that same tweet
    * ``'text'``       -> a single space followed by every phrase extracted
      from the user's tweets, each phrase preceded by one space (so the
      text is just ``' '`` when no phrase was ever extracted).

    Phrases come from ``getWordsFromRawEnglishMessage`` and ``getPhrases``
    (bounded by the ``min_phrase_length`` / ``max_phrase_length`` stream
    settings), with spaces inside a phrase replaced by ``unique_string``.

    Returns:
        ``iteritems()`` of the mapping ``screen_name -> record``.
    """
    userMap = defaultdict(dict)
    for tweet in TwitterIterators.iterateFromFile(self.fileName + '.gz'):
        user = tweet['user']['screen_name']
        record = userMap[user]
        record['user'] = {'screen_name': user}
        record['id'] = tweet['id']
        record['created_at'] = tweet['created_at']
        if 'text' not in record:
            # Temporarily hold the phrases as a list; the final string is
            # rendered once after the loop.  Growing a string with ``+=``
            # per tweet re-copies the whole text each time.
            record['text'] = []
        record['text'].extend(
            phrase.replace(' ', unique_string)
            for phrase in getPhrases(
                getWordsFromRawEnglishMessage(tweet['text']),
                self.stream_settings['min_phrase_length'],
                self.stream_settings['max_phrase_length'])
        )

    for record in userMap.itervalues():
        # Reproduces the historical format exactly: the text is seeded with
        # one space and every phrase is appended with a leading space.
        record['text'] = ' ' + ''.join(
            ' ' + phrase for phrase in record['text'])
    return userMap.iteritems()
def performanceForCDAAt(noOfTweets, fileName, **stream_settings):
    """Run ``HDStreaminClustering`` over a tweet file and score the clusters.

    The clustering object is built from ``stream_settings``, fed the
    iterator returned by ``TwitterIterators.iterateFromFile(fileName)`` and
    timed around the ``cluster`` call only.  Clusters holding fewer
    documents than ``stream_settings['cluster_filter_threshold']`` are
    dropped before evaluation.

    Args:
        noOfTweets: Passed straight through to
            ``Evaluation.getEvaluationMetrics``.
        fileName: Tweet file to read.
        **stream_settings: Settings for ``HDStreaminClustering``; must
            contain ``cluster_filter_threshold`` whenever at least one
            cluster is produced.

    Returns:
        The value of ``Evaluation.getEvaluationMetrics(noOfTweets,
        documentClusters, <elapsed time>)``.
    """
    clustering = HDStreaminClustering(**stream_settings)
    tweets = TwitterIterators.iterateFromFile(fileName)

    ts = time.time()
    clustering.cluster(tweets)
    te = time.time()

    documentClusters = []
    for _key, cluster in clustering.clusters.iteritems():
        members = cluster.documentsInCluster
        # Size check first, then take the keys -- same order of operations
        # as filtering inside a comprehension.
        if len(members.keys()) >= stream_settings['cluster_filter_threshold']:
            documentClusters.append(members.keys())

    runningTime = te - ts
    return Evaluation.getEvaluationMetrics(
        noOfTweets, documentClusters, runningTime)
def _tweetIterator(self):
    """Yield one ``(screen_name, text)`` pair per user in the tweet file.

    Reads ``self.fileName + '.gz'`` through
    ``TwitterIterators.iterateFromFile``.  For each tweet, phrases are
    extracted with ``getPhrases(getWordsFromRawEnglishMessage(text), min,
    max)`` using the ``min_phrase_length`` / ``max_phrase_length`` stream
    settings, and every space inside a phrase is replaced by
    ``unique_string``.  A tweet without phrases contributes nothing, and a
    user with no phrases at all is absent from the result.

    Returns:
        ``dict.iteritems()`` over ``screen_name -> text``, where ``text`` is
        the space-joined sequence of the user's phrases in reading order.
    """
    userMap = {}
    for tweet in TwitterIterators.iterateFromFile(self.fileName + '.gz'):
        user = tweet['user']['screen_name']
        words = getWordsFromRawEnglishMessage(tweet['text'])
        phrases = [
            phrase.replace(' ', unique_string)
            for phrase in getPhrases(
                words,
                self.stream_settings['min_phrase_length'],
                self.stream_settings['max_phrase_length'])
        ]
        if not phrases:
            continue
        # Buffer phrases per user; concatenating strings with ``+=`` here
        # would copy the user's entire text again for every tweet.
        userMap.setdefault(user, []).extend(phrases)

    # One join per user.  Only values of existing keys are reassigned, which
    # is safe while iterating and leaves the dict's key order untouched.
    for user, collected in userMap.iteritems():
        userMap[user] = ' '.join(collected)
    return userMap.iteritems()
def _tweetWithTimestampIterator(self):
    """Collapse the tweet file into one record per user, keeping metadata.

    Reads ``self.fileName + '.gz'`` through
    ``TwitterIterators.iterateFromFile`` and maintains, per screen name, a
    dict with the keys ``'user'`` (``{'screen_name': ...}``), ``'id'`` and
    ``'created_at'`` (both overwritten by each later tweet of the same
    user) and ``'text'``.

    ``'text'`` starts as a single space; every phrase extracted from the
    user's tweets is appended with one leading space.  Phrases are produced
    by ``getPhrases(getWordsFromRawEnglishMessage(text), min, max)`` with
    the ``min_phrase_length`` / ``max_phrase_length`` stream settings, and
    spaces inside a phrase are replaced by ``unique_string``.

    Returns:
        ``iteritems()`` of the ``screen_name -> record`` mapping.
    """
    userMap = defaultdict(dict)
    # screen_name -> phrases gathered so far.  Kept outside the records so
    # the text is assembled once per user instead of being re-copied by a
    # string ``+=`` for every tweet.
    pendingPhrases = {}
    for tweet in TwitterIterators.iterateFromFile(self.fileName + '.gz'):
        user = tweet['user']['screen_name']
        entry = userMap[user]
        entry['user'] = {'screen_name': user}
        entry['id'] = tweet['id']
        entry['created_at'] = tweet['created_at']
        if 'text' not in entry:
            entry['text'] = ' '
            pendingPhrases[user] = []
        phrases = [
            phrase.replace(' ', unique_string)
            for phrase in getPhrases(
                getWordsFromRawEnglishMessage(tweet['text']),
                self.stream_settings['min_phrase_length'],
                self.stream_settings['max_phrase_length'])
        ]
        if phrases:
            pendingPhrases[user].extend(phrases)

    for user, phrases in pendingPhrases.iteritems():
        # Each phrase carries its own leading space, which yields the same
        # text as appending ``' ' + ' '.join(batch)`` tweet by tweet.
        userMap[user]['text'] += ''.join(' ' + phrase for phrase in phrases)
    return userMap.iteritems()