import random

import segmentation


def main():
    #########
    ## DEBUG: Sample querying
    #########
    source = "data/tweetsprocessedashton.txt"
    tweets = load_tweets(source)

    # Training set is 2/3 the number of tweets
    num_training = (2 * len(tweets)) // 3

    # Split into training and testing tweets
    training = random.sample(tweets, num_training)
    testing = [tweet for tweet in tweets if tweet not in training]

    # Perform k-means on the training set
    K = 50
    clusters = segmentation.kmeans(training, K, 20, 0.8)
    for i in range(len(clusters)):
        print "len: %s, dt:%s" % (len(clusters[i].tweets), clusters[i].dt)

    # Grab a stem -> word mapping from the file
    lut_source = "data/tweetsashton.txt"
    LUT = get_LUT(lut_source)

    # Suggest a hashtag for every testing tweet
    for tweet in testing:
        hashtag = suggest_hashtag(tweet, clusters, LUT)
        print tweet, "#" + hashtag
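# NOTE: The helpers called by these main() functions (load_tweets, get_LUT,
# suggest_hashtag, process_query, get_tweet_string, and the print_* reporting
# helpers) are defined elsewhere in the project and are not shown here. The
# sketch below is an illustrative assumption only, inferred from the inline
# loader in the next main(): load_tweets is assumed to read one processed
# tweet per line. It is not the project's actual implementation.


def load_tweets(path):
    # Read one tweet per line, stripping the trailing newline
    # (mirrors the inline loader in the next main() below).
    tweets = []
    f = open(path)
    for line in f:
        tweets.append(line.replace("\n", ""))
    f.close()
    return tweets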
import random

import segmentation


def main():
    # Open the processed tweet file and read one tweet per line
    f = open("data/tweetsprocessedashton.txt")
    tweets = []
    for t in f:
        tweets.append(t.replace("\n", ""))
    f.close()

    # Cluster a random half of the tweets
    subtweets = random.sample(tweets, len(tweets) / 2)
    clusters = segmentation.kmeans(subtweets, 20, 25, 0.5)
    for i in range(len(clusters)):
        print "len: %s, dt:%s" % (len(clusters[i].tweets), clusters[i].dt)
import cgi
import random

import segmentation


def main():
    # Obtain values from the POST form
    form = cgi.FieldStorage()
    k = int(form.getvalue('K'))             # number of clusters
    cutoff = float(form.getvalue('Thres'))  # threshold
    iteration = int(form.getvalue('Iter'))  # number of iterations
    queryTweet = form.getvalue('Tweet')     # query tweet

    source = "data/tweetsprocessedashton.txt"
    tweets = load_tweets(source)

    # Training set is 2/3 the number of tweets
    num_training = (2 * len(tweets)) // 3
    training = random.sample(tweets, num_training)
    testing = [tweet for tweet in tweets if tweet not in training]

    # Perform k-means on the training set
    clusters = segmentation.kmeans(training, k, iteration, (1.0 - cutoff))

    # Grab a stem -> word mapping from the file
    lut_source = "data/tweetsashton.txt"
    LUT = get_LUT(lut_source)

    # Get the raw, unprocessed tweets
    raw_tweets = load_tweets(lut_source)

    # Report raw tweets for the testing set instead of their processed forms
    raw_testing = []
    for raw in raw_tweets:
        proc = process_query(raw)
        proc_string = get_tweet_string(proc)
        # Skip tweets whose processed form landed in the training set;
        # otherwise keep the *raw* string
        if proc_string not in training:
            raw_testing.append(raw)

    hashtags = [suggest_hashtag(raw, clusters, LUT) for raw in raw_testing]
    # Pair each raw testing tweet with its suggested hashtag
    testing_tweets_hashtags = zip(raw_testing, hashtags)

    # Query tweet suggestion
    hashtag = suggest_hashtag(queryTweet, clusters, LUT)
    print "<h1> Suggested Hashtag Output: </h1>"
    print queryTweet, "<b style='font-size: 18px'>#" + hashtag + "</b>"
    print "<p>Suggested hashtag: #" + hashtag + "</p>"
    print "<hr />"

    # Testing set suggestions
    print "<h1> Testing Set Output:", len(raw_testing), "Testing Tweets", "</h1>"

    # Compute statistics
    print "<h2> Statistics about Testing Set</h2>"
    print_hashtag_frequency(testing_tweets_hashtags)
    print_average_number_words(testing_tweets_hashtags)
    print_cluster_centroids(clusters)
    print "<hr />"

    # List of hashtag suggestions
    print "<h2> Hashtag Suggestions for Testing Set</h2>"
    print "<table style='width: 750px;'>"
    print "<tr><td><b>Tweet</b></td><td><b>Suggested Hashtag</b></td></tr>"
    for raw, hashtag in testing_tweets_hashtags:
        print "<tr>"
        print "<td>", raw, "</td>"
        print "<td>", "#" + hashtag, "</td>"
        print "</tr>"
    print "</table>"
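# USAGE (illustrative only): the CGI main() above reads the fields K, Thres,
# Iter, and Tweet from a POST request, so any form or client posting those
# fields can drive it. The script URL below is hypothetical -- substitute the
# path where the CGI script is actually deployed.

import urllib
import urllib2

params = urllib.urlencode({
    'K': 50,        # number of clusters
    'Thres': 0.2,   # threshold (the script passes 1.0 - Thres to kmeans)
    'Iter': 20,     # number of k-means iterations
    'Tweet': 'just landed in san francisco for the weekend',
})
# Hypothetical deployment URL -- not part of the project source.
response = urllib2.urlopen("http://localhost/cgi-bin/hashtag.cgi", params)
print response.read()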