import random

import segmentation


def main():
    #########
    ## DEBUG: Sample querying
    #########
    source = "data/tweetsprocessedashton.txt"
    tweets = load_tweets(source)

    # Training set is 2/3 the number of tweets
    num_training = (2 * len(tweets)) // 3

    # Split into training and testing tweets
    training = random.sample(tweets, num_training)
    testing = [tweet for tweet in tweets if tweet not in training]

    # Perform k-means on the training set
    K = 50
    clusters = segmentation.kmeans(training, K, 20, 0.8)
    for i in range(len(clusters)):
        print "len: %s, dt:%s" % (len(clusters[i].tweets), clusters[i].dt)

    # Grab a stem -> word mapping from the file
    lut_source = "data/tweetsashton.txt"
    LUT = get_LUT(lut_source)

    # Suggest a hashtag for every testing tweet
    for tweet in testing:
        hashtag = suggest_hashtag(tweet, clusters, LUT)
        print tweet, "#" + hashtag
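# NOTE: The helpers called by these main() functions (load_tweets, get_LUT,
# suggest_hashtag, process_query, get_tweet_string, and the print_* reporting
# helpers) are defined elsewhere in the project and are not shown here. The
# sketch below is an illustrative assumption only, inferred from the inline
# loader in the next main(): load_tweets is assumed to read one processed
# tweet per line. It is not the project's actual implementation.


def load_tweets(path):
    # Read one tweet per line, stripping the trailing newline
    # (mirrors the inline loader in the next main() below).
    tweets = []
    f = open(path)
    for line in f:
        tweets.append(line.replace("\n", ""))
    f.close()
    return tweets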
import random

import segmentation


def main():
    # Open the processed tweet file and read one tweet per line
    f = open("data/tweetsprocessedashton.txt")
    tweets = []
    for t in f:
        tweets.append(t.replace("\n", ""))
    f.close()

    # Cluster a random half of the tweets
    subtweets = random.sample(tweets, len(tweets) / 2)
    clusters = segmentation.kmeans(subtweets, 20, 25, 0.5)
    for i in range(len(clusters)):
        print "len: %s, dt:%s" % (len(clusters[i].tweets), clusters[i].dt)
import cgi
import random

import segmentation


def main():
    # Obtain values from the POST form
    form = cgi.FieldStorage()
    k = int(form.getvalue('K'))             # number of clusters
    cutoff = float(form.getvalue('Thres'))  # threshold
    iteration = int(form.getvalue('Iter'))  # number of iterations
    queryTweet = form.getvalue('Tweet')     # query tweet

    source = "data/tweetsprocessedashton.txt"
    tweets = load_tweets(source)

    # Training set is 2/3 the number of tweets
    num_training = (2 * len(tweets)) // 3
    training = random.sample(tweets, num_training)
    testing = [tweet for tweet in tweets if tweet not in training]

    # Perform k-means on the training set
    clusters = segmentation.kmeans(training, k, iteration, (1.0 - cutoff))

    # Grab a stem -> word mapping from the file
    lut_source = "data/tweetsashton.txt"
    LUT = get_LUT(lut_source)

    # Get the raw, unprocessed tweets
    raw_tweets = load_tweets(lut_source)

    # Report raw tweets for the testing set instead of their processed forms
    raw_testing = []
    for raw in raw_tweets:
        proc = process_query(raw)
        proc_string = get_tweet_string(proc)
        # Skip tweets whose processed form landed in the training set;
        # otherwise keep the *raw* string
        if proc_string not in training:
            raw_testing.append(raw)

    hashtags = [suggest_hashtag(raw, clusters, LUT) for raw in raw_testing]
    # Pair each raw testing tweet with its suggested hashtag
    testing_tweets_hashtags = zip(raw_testing, hashtags)

    # Query tweet suggestion
    hashtag = suggest_hashtag(queryTweet, clusters, LUT)
    print "<h1> Suggested Hashtag Output: </h1>"
    print queryTweet, "<b style='font-size: 18px'>#" + hashtag + "</b>"
    print "<p>Suggested hashtag: #" + hashtag + "</p>"
    print "<hr />"

    # Testing set suggestions
    print "<h1> Testing Set Output:", len(raw_testing), "Testing Tweets", "</h1>"

    # Compute statistics
    print "<h2> Statistics about Testing Set</h2>"
    print_hashtag_frequency(testing_tweets_hashtags)
    print_average_number_words(testing_tweets_hashtags)
    print_cluster_centroids(clusters)
    print "<hr />"

    # List of hashtag suggestions
    print "<h2> Hashtag Suggestions for Testing Set</h2>"
    print "<table style='width: 750px;'>"
    print "<tr><td><b>Tweet</b></td><td><b>Suggested Hashtag</b></td></tr>"
    for raw, hashtag in testing_tweets_hashtags:
        print "<tr>"
        print "<td>", raw, "</td>"
        print "<td>", "#" + hashtag, "</td>"
        print "</tr>"
    print "</table>"
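# USAGE (illustrative only): the CGI main() above reads the fields K, Thres,
# Iter, and Tweet from a POST request, so any form or client posting those
# fields can drive it. The script URL below is hypothetical -- substitute the
# path where the CGI script is actually deployed.

import urllib
import urllib2

params = urllib.urlencode({
    'K': 50,        # number of clusters
    'Thres': 0.2,   # threshold (the script passes 1.0 - Thres to kmeans)
    'Iter': 20,     # number of k-means iterations
    'Tweet': 'just landed in san francisco for the weekend',
})
# Hypothetical deployment URL -- not part of the project source.
response = urllib2.urlopen("http://localhost/cgi-bin/hashtag.cgi", params)
print response.read()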