def keyWordsCluster_KMeansTFIDF(log_directory, channel_name, output_directory,
                                startingDate, startingMonth, endingDate,
                                endingMonth):
    """ Uses `createKeyWords` to form clusters of words post TF IDF (optional).

    Args:
        log_directory (str): Location of the logs (Assumed to be arranged in directory structure as : <year>/<month>/<day>/<log-file-for-channel>.txt)
        channel_name (str): Channel to be perform analysis on
        output_directory (str): Location of output directory
        startingDate (int): Date to start the analysis (in conjunction with startingMonth)
        startingMonth (int): Date to start the analysis (in conjunction with startingDate)
        endingDate (int): Date to end the analysis (in conjunction with endingMonth)
        endingMonth (int): Date to end the analysis (in conjunction with endingDate)

    Returns:
       null 

    """
    do_SVD = False
    words_to_show_per_cluster = 10
    elbow_method_for_finding_K = False
    '''NON ELBOW'''
    number_of_clusters = 11  #elbow for jan-2013 =
    '''ELBOW SETTINGS'''
    check_k_till = 20
    '''
        MANUALLY CREATING A MATRIX
    '''

    # each user's normalised frequency stored in rows
    # all the keywords (unfiltered)
    # keyword_list = []
    # user_list = []

    # keyword_dict_list, user_keyword_freq_dict, user_words_dict, nicks_for_stop_words = CKW.createKeyWords(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth)

    # for dictionary in user_keyword_freq_dict:
    #   # print dictionary['keywords']
    #   keyword_list = list(set(keyword_list + [x[0] for x in  dictionary['keywords']]))
    #   user_list.append(dictionary['nick'])

    # # print "\n \n \n", "KEYWORDS_LIST", keyword_list
    # # print "\n \n \n", "USER_LIST", user_list

    # #GENERATE A MATRIX WITH USERS AS ROWS AND KEYWORDS AS COLUMNS
    # user_keyword_matrix = np.zeros(shape=(len(user_list), len(keyword_list)))
    # # user_keyword_matrix = [[0]*len(keyword_list) for _ in xrange(len(user_list))]

    # for dictionary in user_keyword_freq_dict:
    #   # print dictionary['nick'], user_list.index(dictionary['nick'])
    #   for word_tuple in dictionary['keywords']:
    #     # print word_tuple, keyword_list.index(word_tuple[0])
    #     user_keyword_matrix[user_list.index(dictionary['nick'])][keyword_list.index(word_tuple[0])] += word_tuple[1]

    # print user_keyword_matrix

    # transformer = TfidfTransformer()
    # tfidf = transformer.fit_transform(user_keyword_matrix)
    # tfIDFMatrix = tfidf.toarray()

    # print np.nonzero(tfIDFMatrix)

    # # Each row is normalized to have unit euclidean norm.
    # # The weights of each feature computed by the fit method call are stored in a model attribute:
    # print "Weights of each feature", transformer.idf_
    # for i in xrange(len(transformer.idf_)):
    #   print keyword_list[i], transformer.idf_[i]
    #
    #
    '''
        AUTO TF-IDF FROM JUST SENTENCES
    '''
    #http://scikit-learn.org/stable/auto_examples/text/document_clustering.html
    #BUILDING CORPUS

    keyword_dict_list, user_keyword_freq_dict, user_words_dict_list, nicks_for_stop_words = CKW.createKeyWords(
        log_directory, channel_name, output_directory, startingDate,
        startingMonth, endingDate, endingMonth)

    corpus = []

    for user_words_dict in user_words_dict_list:
        # print "SENDER", user_words_dict['sender']
        # print "WORDS", " ".join(user_words_dict['words'])
        corpus.append(" ".join(map(str, user_words_dict['words'])))

    print "No. of users", len(corpus)

    #TF_IDF
    stop_word_without_apostrophe = []
    for words in common_english_words.words:
        stop_word_without_apostrophe.append(words.replace("'", ""))

    stop_words_extended = text.ENGLISH_STOP_WORDS.union(
        common_english_words.words).union(nicks_for_stop_words).union(
            stop_word_without_apostrophe).union(custom_stop_words.words).union(
                custom_stop_words.slangs)

    vectorizer = TfidfVectorizer(max_df=0.5,
                                 min_df=2,
                                 stop_words=stop_words_extended,
                                 use_idf=True)
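    # max_df=0.5 drops terms that appear in more than half of the user
    # documents; min_df=2 keeps only terms that occur in at least two of them.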
    print "Extracting features from the training dataset using TF-IDF"
    t0 = time()
    tf_idf = vectorizer.fit_transform(corpus)
    print("done in %fs" % (time() - t0))
    print "n_samples: %d, n_features: %d \n" % tf_idf.shape

    # LSA
    if do_SVD:
        print("============USING SVD==========")
        print("Performing dimensionality reduction using LSA")
        t0 = time()
        # Vectorizer results are normalized, which makes KMeans behave as
        # spherical k-means for better results. Since LSA/SVD results are
        # not normalized, we have to redo the normalization.
        svd = TruncatedSVD(100)  # recommended value = 100
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)

        tf_idf = lsa.fit_transform(tf_idf)

        print("done in %fs" % (time() - t0))

        explained_variance = svd.explained_variance_ratio_.sum()
        print("Explained variance of the SVD step: {}%".format(
            int(explained_variance * 100)))

    if not elbow_method_for_finding_K:
        # CLUSTERING
        km = KMeans(n_clusters=number_of_clusters,
                    init='k-means++',
                    random_state=3465,
                    max_iter=100,
                    n_init=8)

        print("Clustering sparse data with %s" % km)
        t0 = time()
        km.fit(tf_idf)
        print("done in %0.3fs" % (time() - t0))

        print("Top terms per cluster:")
        if do_SVD:
            original_space_centroids = svd.inverse_transform(
                km.cluster_centers_)
            order_centroids = original_space_centroids.argsort()[:, ::-1]
        else:
            order_centroids = km.cluster_centers_.argsort()[:, ::-1]
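        # argsort()[:, ::-1] orders each centroid's term indices by descending
        # weight, so the first words_to_show_per_cluster indices per row are
        # the heaviest terms of that cluster.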
        np.set_printoptions(threshold=np.inf)  # np.nan is not accepted by newer numpy

        terms = vectorizer.get_feature_names()
        # when SVD was applied, read the display weights from the centroids
        # mapped back into the original term space
        display_centroids = original_space_centroids if do_SVD else km.cluster_centers_
        for i in range(number_of_clusters):
            print("Cluster %d:" % i)
            for ind in order_centroids[i, :words_to_show_per_cluster]:
                print terms[ind] + "\t" + str(round(display_centroids[i][ind], 2))
            print ""

    else:
        print "============ELBOW METHOD ============="

        sum_squared_errors_list = []
        avg_sum_squared_errors_list = []

        for i in xrange(1, check_k_till + 1):

            print "\n===>> K = ", i

            km = KMeans(n_clusters=i, init='k-means++', max_iter=100, n_init=8)

            t0 = time()
            km.fit(tf_idf)

            if do_SVD:
                original_space_centroids = svd.inverse_transform(
                    km.cluster_centers_)
                order_centroids = original_space_centroids.argsort()[:, ::-1]
            else:
                order_centroids = km.cluster_centers_.argsort()[:, ::-1]

            # cdist needs a dense array; tf_idf is sparse unless SVD was applied
            dense_tf_idf = tf_idf.toarray() if hasattr(tf_idf, 'toarray') else tf_idf
            distance_matrix_all_combination = cdist(dense_tf_idf,
                                                    km.cluster_centers_,
                                                    'euclidean')
            # cIdx = np.argmin(distance_matrix_all_combination,axis=1)
            distance_from_nearest_centroid = np.min(
                distance_matrix_all_combination, axis=1)
            sum_squared_errors = sum(distance_from_nearest_centroid)
            avg_sum_squared_errors = sum_squared_errors / tf_idf.shape[0]
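            # note (added): despite the variable names, these are sums of plain
            # euclidean distances to the nearest centroid rather than squared
            # distances; the resulting curve is still usable for the elbow plot.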

            print "Sum Squared Error =", sum_squared_errors
            print "Avg Sum Squared Error =", avg_sum_squared_errors

            sum_squared_errors_list.append(sum_squared_errors)
            avg_sum_squared_errors_list.append(avg_sum_squared_errors)
            print("Top terms per cluster:")
            terms = vectorizer.get_feature_names()
            for cluster_index in range(i):
                print("Cluster %d:" % cluster_index)
                for ind in order_centroids[cluster_index, :words_to_show_per_cluster]:
                    print(' %s' % terms[ind])
                print ""

        plt.plot(range(1, check_k_till + 1), sum_squared_errors_list, 'b*-')
        # ax.plot(K[kIdx], avgWithinSS[kIdx], marker='o', markersize=12,
        # markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
        plt.grid(True)
        plt.xlabel('Number of clusters')
        plt.ylabel('Average sum of squares')
        plt.title('Elbow for KMeans clustering')

        plt.savefig(output_directory + 'key-words/' + 'elbow_KMeans.png')
        plt.show()

        #NOTE RANDOM OUTPUTS BECAUSE OF RANDOM INITIALISATION
        print "NOTE RANDOM OUTPUTS BECAUSE OF RANDOM INITIALISATION"
Example #2
import createKeyWords as CKW

log_directory = "/home/rohan/parser_files/2013/"
channel_name= "#kubuntu-devel" #channel name
output_directory = "/home/rohan/parser_files/Output/"
startingDate = 1
startingMonth = 1
endingDate = 4
endingMonth = 2


user_list = []
keyword_list = []
keyword_dict_list, user_keyword_freq_dict = CKW.createKeyWords(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth)

for dictionary in keyword_dict_list:
	user_list.append(dictionary['nick'])
	keyword_list = list(set(keyword_list + dictionary['keywords']))

keyword_user_binary_matrix = [[0 for i in xrange(len(user_list))] for x in xrange(len(keyword_list))]

# print user_list
# print keyword_list
# print keyword_user_binary_matrix

for user in user_list:
	key_words_for_users = filter(lambda keywords_user: keywords_user['nick'] == user, keyword_dict_list)[0]['keywords']
	for word in key_words_for_users:
		keyword_user_binary_matrix[keyword_list.index(word)][user_list.index(user)] = 1

print user_list, "\n"
Example #3
def fuzzyCMeans(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth):
	"""[Deprecated]
	Fuzzy C Means clustering on key-words instead of KMeans
	"""
	do_SVD = True
	words_to_show_per_cluster = 20
	number_of_clusters = 8

	keyword_dict_list, user_keyword_freq_dict, user_words_dict_list, nicks_for_stop_words = CKW.createKeyWords(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth)

	corpus = []
	for user_words_dict in user_words_dict_list:
		# print "SENDER", user_words_dict['sender']
		# print "WORDS", " ".join(user_words_dict['words'])
		corpus.append(" ".join(map(str,user_words_dict['words'])))

	print "No. of users", len(corpus)

	#TF_IDF
	stop_word_without_apostrophe=[]
	for words in common_english_words.words:
		stop_word_without_apostrophe.append(words.replace("'",""))
	
	stop_words_extended = text.ENGLISH_STOP_WORDS.union(common_english_words.words).union(nicks_for_stop_words).union(stop_word_without_apostrophe).union(custom_stop_words.words).union(custom_stop_words.slangs)
	
	vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words=stop_words_extended, use_idf=True)
	print "Extracting features from the training dataset using TF-IDF"
	t0 = time()
	tf_idf = vectorizer.fit_transform(corpus)
	print("done in %fs" % (time() - t0))
	print "n_samples: %d, n_features: %d \n" % tf_idf.shape

	# LSA
	if do_SVD:
		print("============USING SVD==========")
		print("Performing dimensionality reduction using LSA")
		t0 = time()
		# Vectorizer results are normalized, which makes KMeans behave as
		# spherical k-means for better results. Since LSA/SVD results are
		# not normalized, we have to redo the normalization.
		svd = TruncatedSVD(100)  # recommended value = 100
		normalizer = Normalizer(copy=False)
		lsa = make_pipeline(svd, normalizer)

		tf_idf = lsa.fit_transform(tf_idf)

		print("done in %fs" % (time() - t0))

		explained_variance = svd.explained_variance_ratio_.sum()
		print("Explained variance of the SVD step: {}%".format(
						int(explained_variance * 100)))

	np.set_printoptions(threshold=np.inf)
	#clusters
	tf_idf_transpose = tf_idf.T #c-means takes the transpose
	centroids, U, U0, d, Jm, p, fpc = fuzz.cluster.cmeans(
			tf_idf_transpose, number_of_clusters, 2., error=0.005, maxiter=1000, init=None)
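	# note (added): U has one row per cluster and one column per user; a hard
	# assignment per user, if needed, is np.argmax(U, axis=0). fpc is the fuzzy
	# partition coefficient (values closer to 1 mean crisper clusters).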

	print "CENTROIDS", centroids

	if do_SVD:
		original_space_centroids = svd.inverse_transform(centroids)
		order_centroids = original_space_centroids.argsort()[:, ::-1]
		print "original_space_centroids", original_space_centroids
	else:
		order_centroids = centroids.argsort()[:, ::-1]

	print "order_centroids", order_centroids

	terms = vectorizer.get_feature_names()
	for i in range(number_of_clusters):
		print("Cluster %d:" % i)
		for ind in order_centroids[i, :words_to_show_per_cluster]:
			print(' %s' % terms[ind])
		print ""
Example #4
def svdOnKeywords(log_directory, channel_name, output_directory, startingDate,
                  startingMonth, endingDate, endingMonth):
    """[Deprecated]
	uses createKeyWords function and then tries to form clusters by extracting more meaningful keywords. Performs a  Singular Value Decomposition(SVD) after doing a Term Frequency–Inverse Document Frequency(TF-IDF).tered)
	"""
    keyword_list = []
    user_list = []

    keyword_dict_list, user_keyword_freq_dict = CKW.createKeyWords(
        log_directory, channel_name, output_directory, startingDate,
        startingMonth, endingDate, endingMonth)

    for dictionary in user_keyword_freq_dict:
        # print dictionary['keywords']
        keyword_list = list(
            set(keyword_list + [x[0] for x in dictionary['keywords']]))

    # print user_keyword_freq_dict #(Format : [<word>, <frequency>, <normalised_score>])'
    user_keyword_normalfreq_matrix = []
    user_keyword_freq_matrix_for_doc_ = []
    keyword_for_user = []

    for user_tuple in user_keyword_freq_dict:
        nick = user_tuple['nick']
        keywords = user_tuple['keywords']
        user_list.append(nick)

        N = 0
        temp = 0
        '''calculate N = (summation of ni**2)**1/2'''
        for keyword in keywords:
            temp += keyword[1]**2

        N = math.sqrt(temp)
        temp = []
        keyword_normal_freq_for_user = [0 for i in range(len(keyword_list))]  # to be used as a column

        for keyword_tuple in keywords:
            keyword = keyword_tuple[0]
            normal_freq = keyword_tuple[1] / N
            keyword_normal_freq_for_user[keyword_list.index(
                keyword)] = normal_freq
            for i in range(0, keyword_tuple[1]):
                temp.append(keyword)

        keyword_for_user.append(temp)
        user_keyword_normalfreq_matrix.append(keyword_normal_freq_for_user)

    # print len(user_list)
    # print len(keyword_list)
    # print keyword_for_user
    # print user_keyword_normalfreq_matrix
    # print len(user_keyword_normalfreq_matrix )
    '''
        TF-IDF
        https://stanford.edu/~rjweiss/public_html/IRiSS2013/text2/notebooks/tfidf.html
    '''
    mydoclist = keyword_for_user
    vocabulary = keyword_list
    doc_term_matrix = []

    def l2_normalizer(vec):
        denom = numpy.sum([el**2 for el in vec])
        return [(el / math.sqrt(denom)) for el in vec]

    def tf(term, document):
        return freq(term, document)

    def freq(term, document):
        return document.count(term)

    for doc in mydoclist:
        print('The doc is "' + ",".join(doc) + '"')
        tf_vector = [tf(word, doc) for word in vocabulary]
        tf_vector_string = ', '.join(format(freq, 'd') for freq in tf_vector)
        print('The tf vector for Document %d is [%s]' %
              ((mydoclist.index(doc) + 1), tf_vector_string))
        doc_term_matrix.append(tf_vector)

    def numDocsContaining(word, doclist):
        doccount = 0
        for doc in doclist:
            if freq(word, doc) > 0:
                doccount += 1
        return doccount

    def idf(word, doclist):
        n_samples = len(doclist)
        df = numDocsContaining(word, doclist)
        # parenthesised so this is n_samples / (1 + df); without the brackets,
        # operator precedence gives (n_samples / 1) + df
        return numpy.log(n_samples / (1.0 + df))

    my_idf_vector = [idf(word, mydoclist) for word in vocabulary]

    # print 'Our vocabulary vector is [' + ', '.join(list(vocabulary)) + ']'
    # print 'The inverse document frequency vector is [' + ', '.join(format(freq, 'f') for freq in my_idf_vector) + ']'

    def build_idf_matrix(idf_vector):
        idf_mat = numpy.zeros((len(idf_vector), len(idf_vector)))
        numpy.fill_diagonal(idf_mat, idf_vector)
        return idf_mat

    my_idf_matrix = build_idf_matrix(my_idf_vector)

    print("idf-matrix", my_idf_matrix)

    # Now we have converted our IDF vector into a matrix of size BxB, where the
    # diagonal is the IDF vector. That means we can now multiply every term
    # frequency vector by the inverse document frequency matrix. Then, to make
    # sure we are also accounting for words that appear too frequently within
    # documents, we normalize each document so that its L2 norm is 1.
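    # A tiny worked example (added for illustration, not from the original):
    # with tf_vector = [1, 2] and an idf diagonal of [0.5, 1.0], the product is
    # [0.5, 2.0]; after L2 normalization it becomes roughly [0.24, 0.97].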
    doc_term_matrix_tfidf = []

    #performing tf-idf matrix multiplication
    for tf_vector in doc_term_matrix:
        doc_term_matrix_tfidf.append(numpy.dot(tf_vector, my_idf_matrix))

    #normalizing
    doc_term_matrix_tfidf_l2 = []
    for tf_vector in doc_term_matrix_tfidf:
        doc_term_matrix_tfidf_l2.append(l2_normalizer(tf_vector))

    print(vocabulary)
    print(doc_term_matrix_tfidf_l2)  # wrap in np.matrix(...) for an easier-to-read display
    '''
		SVD
	'''
    # clusterer = nltk.cluster.util.VectorSpaceClusterer(normalise=False, svd_dimensions=25)#http://www.nltk.org/_modules/nltk/cluster/util.html
    # clusterer.cluster(user_keyword_normalfreq_matrix)

    #borrow cluster code from http://www.nltk.org/_modules/nltk/cluster/util.html

    svd_dimensions = 5
    # vectors = user_keyword_normalfreq_matrix
    # vectors = doc_term_matrix_tfidf_l2
    vectors = doc_term_matrix_tfidf

    if svd_dimensions and svd_dimensions < len(vectors[0]):
        [u, d, vt] = numpy.linalg.svd(numpy.transpose(numpy.array(vectors)))
        S = d[:svd_dimensions] * numpy.identity(svd_dimensions, numpy.float64)
        T = u[:, :svd_dimensions]
        Dt = vt[:svd_dimensions, :]
        vectors = numpy.transpose(numpy.dot(S, Dt))

        print("S", S)
        print("T", T)
        print("Dt", Dt)