def demo():
    """
    Non-interactive demonstration of the clusterers with simple 2-D data.

    Clusters six fixed 2-D points into four groups with ``GAAClusterer``,
    prints the clusterer, the input vectors and the cluster assignments,
    shows the resulting dendrogram, and finally classifies one extra
    vector.  Everything is written to standard output; nothing is returned.
    """
    from nltk.cluster import GAAClusterer

    # use a set of tokens with 2D indices
    points = [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]
    vectors = [numpy.array(f) for f in points]

    # test the GAAC clusterer with 4 clusters
    clusterer = GAAClusterer(4)
    clusters = clusterer.cluster(vectors, True)

    # Single-argument print(...) emits the same text under Python 2 and 3,
    # whereas the bare ``print a, b`` statement is a syntax error on 3.
    print('Clusterer: %s' % (clusterer,))
    print('Clustered: %s' % (vectors,))
    print('As: %s' % (clusters,))
    print('')

    # show the dendrogram
    clusterer.dendrogram().show()

    # classify a new vector
    vector = numpy.array([3, 3])
    label = 'classify(%s):' % (vector,)
    print('%s %s' % (label, clusterer.classify(vector)))
    print('')
def extract_tweets_citedby_graph(df):
    """
    Cluster the texts in column 2 of ``df`` and save the selected rows.

    The function (re)binds the module globals ``stemmer_func``,
    ``stopwords`` and ``words``, which the sibling helpers ``get_words``
    and ``vectorspaced`` are expected to read.  It fits a ``GAAClusterer``
    with 21 clusters on the non-empty texts, classifies every text, and for
    each text whose cluster id is greater than 6 writes one tab-separated
    line to ``Results/clustered_relevant_users.tsv`` holding the column 0
    and column 1 values of all rows sharing that text (per the original
    comment: doc id, then user ids).

    :param df: table indexed by integer column labels 0, 1 and 2
        (presumably a pandas DataFrame - confirm against the caller).
    :return: None
    """
    global stemmer_func, words, stopwords

    out_path = 'Results/clustered_relevant_users.tsv'

    stemmer_func = nltk.stem.snowball.SnowballStemmer("english").stem
    stopwords = set(nltk.corpus.stopwords.words('english'))

    titles = df[2].values
    words = get_words(titles)

    # Agglomerative clustering; empty texts are left out of the fit but are
    # still classified below so ids line up one-to-one with ``titles``.
    cluster = GAAClusterer(21)
    cluster.cluster([vectorspaced(title) for title in titles if title], True)
    classified_examples = [cluster.classify(vectorspaced(title))
                           for title in titles]

    # Display clusters / write to disk
    with open(out_path, 'w') as f:
        for cluster_id, title in sorted(zip(classified_examples, titles)):
            if cluster_id > 6:
                # Build the row mask once instead of once per column.
                same_title = df[2] == title
                # save: docid tab userids
                f.write('{}\t{}\n'.format(df[0].loc[same_title].values,
                                          df[1].loc[same_title].values))

    if os.path.exists(out_path):
        print('file saved: Results/clustered_relevant_users.tsv')
    return
def get_word_clusters(tweets):
    """
    Cluster tweet texts into 5 groups and print ``<cluster id> <text>``.

    The text of each tweet is read from ``tweet[HEADER_DICT["text"]]``.
    A vocabulary is collected with ``get_words``, every text is turned into
    a vector with ``vectorspaced(text, vocabulary)``, a ``GAAClusterer(5)``
    is fitted on those vectors and each text is then classified.  Results
    are printed sorted by cluster id, then by text.

    :param tweets: iterable of tweet records; it is consumed exactly once,
        so a generator is fine.
    :return: None.  Nothing is printed when ``tweets`` is empty.
    """
    # Materialise the texts once: the input may be a one-shot iterator, and
    # every cluster id must be paired with the text it was computed from
    # (the previous code zipped against an unrelated ``job_titles`` name).
    texts = [tweet[HEADER_DICT["text"]] for tweet in tweets]
    if not texts:
        # Nothing to fit a clusterer on.
        return

    all_words = set()
    for text in texts:
        all_words.update(get_words(text))
    all_words = tuple(all_words)

    # Vectorise once and reuse for both fitting and classification.
    vectors = [vectorspaced(text, all_words) for text in texts]

    cluster = GAAClusterer(5)
    cluster.cluster(vectors)
    classified_examples = [cluster.classify(vector) for vector in vectors]

    for cluster_id, text in sorted(zip(classified_examples, texts)):
        print('%s %s' % (cluster_id, text))
def get_word_clusters():
    """
    Cluster the texts of all stored tweets into 5 groups and print them.

    Reads every record returned by the module-level ``tweets.find()``
    (presumably a MongoDB collection - confirm), builds a vocabulary with
    ``get_words``, vectorises each text with
    ``vectorspaced(text, vocabulary)``, fits a ``GAAClusterer(5)`` and
    prints ``<cluster id> <text>`` sorted by cluster id, then by text.

    :return: None.  Nothing is printed when there are no tweets.
    """
    # Query once: repeated ``find()`` calls could return different result
    # sets, leaving the vocabulary, the fit and the classification out of
    # step.  Keeping the texts also lets each cluster id be paired with its
    # own text (the previous code zipped against an unrelated
    # ``job_titles`` name).
    texts = [tweet['text'] for tweet in tweets.find()]
    if not texts:
        # Nothing to fit a clusterer on.
        return

    all_words = set()
    for text in texts:
        all_words.update(get_words(text))
    all_words = tuple(all_words)

    # Vectorise once and reuse for both fitting and classification.
    vectors = [vectorspaced(text, all_words) for text in texts]

    cluster = GAAClusterer(5)
    cluster.cluster(vectors)
    classified_examples = [cluster.classify(vector) for vector in vectors]

    for cluster_id, text in sorted(zip(classified_examples, texts)):
        print('%s %s' % (cluster_id, text))
def get_word_clusters():
    """
    Cluster the texts of all stored tweets into 5 groups and print them.

    Reads every record returned by the module-level ``tweets.find()``
    (presumably a MongoDB collection - confirm), builds a vocabulary with
    ``get_words``, vectorises each text with
    ``vectorspaced(text, vocabulary)``, fits a ``GAAClusterer(5)`` and
    prints ``<cluster id> <text>`` sorted by cluster id, then by text.

    :return: None.  Nothing is printed when there are no tweets.
    """
    # Query once: repeated ``find()`` calls could return different result
    # sets, leaving the vocabulary, the fit and the classification out of
    # step.  Keeping the texts also lets each cluster id be paired with its
    # own text (the previous code zipped against an unrelated
    # ``job_titles`` name).
    texts = [tweet['text'] for tweet in tweets.find()]
    if not texts:
        # Nothing to fit a clusterer on.
        return

    all_words = set()
    for text in texts:
        all_words.update(get_words(text))
    all_words = tuple(all_words)

    # Vectorise once and reuse for both fitting and classification.
    vectors = [vectorspaced(text, all_words) for text in texts]

    cluster = GAAClusterer(5)
    cluster.cluster(vectors)
    classified_examples = [cluster.classify(vector) for vector in vectors]

    for cluster_id, text in sorted(zip(classified_examples, texts)):
        print('%s %s' % (cluster_id, text))
def clusterIt(kwnb, clusternb, keywords):
    """
    Cluster keyword objects by the words of their text.

    Each object's ``.keyword`` string is stripped, stemmed word-by-word via
    the module-level ``stemmer_func`` and turned into a 0/1 vector over the
    vocabulary of all keywords (module-level ``stopwords`` are masked out).
    A ``GAAClusterer(clusternb)`` is fitted on the non-blank texts, and
    every non-blank keyword gets ``assignCluster(cluster_id)`` called on it.

    :param kwnb: unused; kept for interface compatibility.
    :param clusternb: number of clusters handed to ``GAAClusterer``.
    :param keywords: sequence of objects exposing ``.keyword`` (str) and
        ``.assignCluster(cluster_id)``.
    :return: list of the keyword objects with non-blank text, ordered by
        (cluster id, stripped text); each object appears exactly once.
        Empty when ``keywords`` is empty or every text is blank.
    """
    if not keywords:
        return []

    @decorators.memoize
    def normalize_word(word):
        return stemmer_func(word.lower())

    def get_words(titles):
        # Vocabulary = set of stemmed words across the titles passed in.
        vocabulary = set()
        for title in titles:
            for word in title.split():
                vocabulary.add(normalize_word(word))
        return list(vocabulary)

    @decorators.memoize
    def vectorspaced(title):
        # A set makes the per-vocabulary-word membership test O(1).
        title_components = set(normalize_word(word) for word in title.split())
        return numpy.array(
            [word in title_components and word not in stopwords
             for word in words],
            numpy.short)

    job_titles = [entry.keyword.strip() for entry in keywords]
    words = get_words(job_titles)

    # Keep each keyword object next to its stripped title so the cluster id
    # can be assigned directly.  Matching titles back against the raw
    # ``keyword.keyword`` dropped whitespace-padded keywords and appended
    # duplicates several times.
    labelled = [(title, entry)
                for title, entry in zip(job_titles, keywords) if title]
    if not labelled:
        # Only blank keywords: nothing to fit a clusterer on.
        return []

    # cluster = KMeansClusterer(5, euclidean_distance)
    cluster = GAAClusterer(clusternb)
    cluster.cluster([vectorspaced(title) for title, _ in labelled])

    ranked = sorted(
        ((cluster.classify(vectorspaced(title)), title, entry)
         for title, entry in labelled),
        key=lambda item: item[:2])

    ret = []
    for cluster_id, _title, entry in ranked:
        entry.assignCluster(cluster_id)
        ret.append(entry)
    return ret
# NOTE(review): the two statements below are the tail of a function whose
# ``def`` line is outside this view (presumably ``vectorspaced(title)`` -
# confirm); they are reproduced unchanged.
    title_components = [normalize_word(word) for word in title.split()]
    # One entry per word in the global ``words``: truthy when the word occurs
    # in the title and is not a stopword; stored as numpy.short.
    return numpy.array(
        [word in title_components and not word in stopwords for word in words],
        numpy.short)


if __name__ == '__main__':
    # Input file: 'example.txt' unless exactly one command-line argument
    # is given, in which case that argument is used as the path.
    filename = 'example.txt'
    if len(sys.argv) == 2:
        filename = sys.argv[1]

    with open(filename) as title_file:
        # One title per line, surrounding whitespace removed.
        job_titles = [line.strip() for line in title_file.readlines()]

        # Module-level vocabulary; read by the vector-building code above.
        words = get_words(job_titles)

        # cluster = KMeansClusterer(5, euclidean_distance)
        cluster = GAAClusterer(5)
        # Blank titles are excluded from the fit.
        cluster.cluster([vectorspaced(title) for title in job_titles if title])

        # NOTE: This is inefficient, cluster.classify should really just be
        # called when you are classifying previously unseen examples!
        classified_examples = [
            cluster.classify(vectorspaced(title)) for title in job_titles
        ]

        # Print "<cluster id> <title>" ordered by cluster id, then title.
        for cluster_id, title in sorted(zip(classified_examples, job_titles)):
            print cluster_id, title
def vectorspaced(stemmer, title):
    """
    Turn ``title`` into a 0/1 vector over the module-level ``words`` list.

    Each whitespace-separated token of ``title`` is lower-cased and stemmed
    with ``stemmer.stem``.  Position ``i`` of the result is 1 when
    ``words[i]`` is among the stemmed tokens and is not in the module-level
    ``stopwords``, otherwise 0.

    :param stemmer: object exposing ``stem(word) -> str``.
    :param title: text to vectorise.
    :return: ``numpy`` array of type ``numpy.short`` with ``len(words)``
        entries.
    """
    # A set gives O(1) membership per vocabulary word instead of a list scan.
    title_components = set(stemmer.stem(word.lower()) for word in title.split())
    return numpy.array(
        [word in title_components and word not in stopwords for word in words],
        numpy.short)


if __name__ == '__main__':
    # Input file: the default CSV unless exactly one command-line argument
    # is given, in which case that argument is used as the path.
    filename = 'CSV/pridected_true_text_alldata.csv'
    if len(sys.argv) == 2:
        filename = sys.argv[1]

    # The file is only needed to read the titles, so close it right after.
    with open(filename) as title_file:
        job_titles = [line.strip() for line in title_file]

    # Module-level vocabulary; ``vectorspaced`` reads it as a global.
    words = get_words(stemmer, job_titles)

    # cluster = KMeansClusterer(5, euclidean_distance)
    cluster = GAAClusterer(30)
    # Blank lines are excluded from the fit.
    cluster.cluster(
        [vectorspaced(stemmer, title) for title in job_titles if title])

    # NOTE: This is inefficient, cluster.classify should really just be
    # called when you are classifying previously unseen examples!
    classified_examples = [
        cluster.classify(vectorspaced(stemmer, title)) for title in job_titles
    ]

    # Single-argument print(...) emits the same text on Python 2 and 3.
    for cluster_id, title in sorted(zip(classified_examples, job_titles)):
        print('%s %s' % (cluster_id, title))
# NOTE(review): the statement below is the tail of a function whose ``def``
# line and earlier body are outside this view (presumably
# ``vectorspaced(stemmer, title)`` - confirm); it is reproduced unchanged.
    # One entry per word in the global ``words``: truthy when the word is in
    # ``title_components`` and is not a stopword; stored as numpy.short.
    return numpy.array(
        [word in title_components and not word in stopwords for word in words],
        numpy.short)


if __name__ == '__main__':
    # Input file: the default CSV unless exactly one command-line argument
    # is given, in which case that argument is used as the path.
    filename = 'CSV/pridected_true_text_alldata.csv'
    if len(sys.argv) == 2:
        filename = sys.argv[1]

    with open(filename) as title_file:
        # One title per line, surrounding whitespace removed.
        job_titles = [line.strip() for line in title_file.readlines()]

        # Module-level vocabulary; read by the vector-building code above.
        words = get_words(stemmer, job_titles)

        # cluster = KMeansClusterer(5, euclidean_distance)
        cluster = GAAClusterer(30)
        # Blank titles are excluded from the fit.
        cluster.cluster(
            [vectorspaced(stemmer, title) for title in job_titles if title])

        # NOTE: This is inefficient, cluster.classify should really just be
        # called when you are classifying previously unseen examples!
        classified_examples = [
            cluster.classify(vectorspaced(stemmer, title)) for title in job_titles
        ]

        # Print "<cluster id> <title>" ordered by cluster id, then title.
        for cluster_id, title in sorted(zip(classified_examples, job_titles)):
            print cluster_id, title