Пример #1
0
def demo():
    """
    Non-interactive demonstration of the GAAC clusterer with simple 2-D data.

    Clusters six hard-coded 2-D vectors into four groups, prints the
    clusterer, the input vectors and the resulting assignment, shows the
    dendrogram, and finally classifies one additional vector.

    All output goes to stdout; nothing is returned.
    """

    from nltk.cluster import GAAClusterer

    # use a set of tokens with 2D indices
    vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

    # test the GAAC clusterer with 4 clusters
    clusterer = GAAClusterer(4)
    clusters = clusterer.cluster(vectors, True)

    # Single-argument print(...) calls emit the same text on Python 2 and
    # Python 3, whereas the print statement is a syntax error on Python 3.
    print('Clusterer: %s' % (clusterer,))
    print('Clustered: %s' % (vectors,))
    print('As: %s' % (clusters,))
    print('')

    # show the dendrogram
    clusterer.dendrogram().show()

    # classify a new vector
    vector = numpy.array([3, 3])
    print('classify(%s): %s' % (vector, clusterer.classify(vector)))
    print('')
def extract_tweets_citedby_graph(df):
    """Cluster the texts in column 2 of *df* and save rows of selected clusters.

    The texts in ``df[2]`` are vectorised with the module-level
    ``vectorspaced`` helper and grouped into 21 clusters with
    ``GAAClusterer``.  For every text whose cluster id is greater than 6, the
    matching values of ``df[0]`` and ``df[1]`` are written, tab-separated, to
    ``Results/clustered_relevant_users.tsv``.

    Side effects:
        * Rebinds the module globals ``stemmer_func``, ``stopwords`` and
          ``words`` (presumably read by ``get_words`` / ``vectorspaced`` --
          confirm against their definitions).
        * Overwrites the output file and prints a confirmation line.

    :param df: table indexable by the column labels 0, 1 and 2, where each
        column exposes ``.values`` and ``.loc`` (assumed to be a pandas
        DataFrame -- TODO confirm).  Column 2 holds the texts to cluster;
        columns 0 and 1 look like document ids and user ids per the original
        "docid tab userids" note.
    :return: None
    """
    global stemmer_func, words, stopwords

    output_path = 'Results/clustered_relevant_users.tsv'

    stemmer_func = nltk.stem.snowball.SnowballStemmer("english").stem
    stopwords = set(nltk.corpus.stopwords.words('english'))

    titles = df[2].values
    words = get_words(titles)

    # Falsy titles are left out of training but are still classified below,
    # exactly as before.
    cluster = GAAClusterer(21)
    cluster.cluster([vectorspaced(title) for title in titles if title], True)
    classified_examples = [cluster.classify(vectorspaced(title)) for title in titles]

    # Display clusters / write to disk
    with open(output_path, 'w') as f:
        for cluster_id, title in sorted(zip(classified_examples, titles)):
            if cluster_id > 6:
                # Build the row mask once instead of once per column.
                matches = df[2] == title
                # save: docid tab userids
                f.write('{}\t{}\n'.format(df[0].loc[matches].values,
                                          df[1].loc[matches].values))

    if os.path.exists(output_path):
        # Single-argument print(...) works on both Python 2 and Python 3.
        print('file saved: Results/clustered_relevant_users.tsv')
Пример #3
0
def get_word_clusters(tweets):
    """Cluster tweets by their text and print ``<cluster id> <text>`` lines.

    The vocabulary is the set of all words that ``get_words`` extracts from
    the tweet texts.  Each text is turned into a vector with ``vectorspaced``
    and the vectors are grouped into 5 clusters with ``GAAClusterer``.

    :param tweets: iterable of records whose text is stored under the index
        ``HEADER_DICT["text"]``.
    :return: None; the result is printed, sorted by cluster id and then text.
    """
    # Pull the texts out once so that every later step sees the same rows,
    # even when ``tweets`` is a one-shot iterator.
    texts = [tweet[HEADER_DICT["text"]] for tweet in tweets]

    all_words = set()
    for text in texts:
        all_words.update(get_words(text))
    all_words = tuple(all_words)

    # Vectorise each text a single time and reuse it for training and lookup.
    vectors = [vectorspaced(text, all_words) for text in texts]

    cluster = GAAClusterer(5)
    cluster.cluster(vectors)

    classified_examples = [cluster.classify(vector) for vector in vectors]

    # Pair each cluster id with the text it was computed from.  The previous
    # code zipped against ``job_titles``, a name unrelated to these tweets.
    for cluster_id, text in sorted(zip(classified_examples, texts)):
        # Single-argument print(...) works on both Python 2 and Python 3.
        print('%s %s' % (cluster_id, text))
Пример #4
0
def get_word_clusters():
    """Cluster the stored tweets by text and print ``<cluster id> <text>``.

    Reads every document returned by ``tweets.find()``, builds the vocabulary
    with ``get_words``, vectorises each text with ``vectorspaced`` and groups
    the vectors into 5 clusters with ``GAAClusterer``.

    :return: None; the result is printed, sorted by cluster id and then text.
    """
    # Query the collection once: running find() separately for the
    # vocabulary, training and classification passes could observe different
    # result sets if documents are added or removed in between.
    texts = [tweet['text'] for tweet in tweets.find()]

    all_words = set()
    for text in texts:
        all_words.update(get_words(text))
    all_words = tuple(all_words)

    # Vectorise each text a single time and reuse it for training and lookup.
    vectors = [vectorspaced(text, all_words) for text in texts]

    cluster = GAAClusterer(5)
    cluster.cluster(vectors)

    classified_examples = [cluster.classify(vector) for vector in vectors]

    # Pair each cluster id with the text it was computed from.  The previous
    # code zipped against ``job_titles``, a name unrelated to these tweets.
    for cluster_id, text in sorted(zip(classified_examples, texts)):
        # Single-argument print(...) works on both Python 2 and Python 3.
        print('%s %s' % (cluster_id, text))
Пример #5
0
def get_word_clusters():
    """Group the stored tweets into 5 clusters and print the assignment.

    The texts come from ``tweets.find()``.  ``get_words`` supplies the
    vocabulary, ``vectorspaced`` converts each text into a vector over that
    vocabulary, and ``GAAClusterer`` performs the clustering.

    :return: None; one ``<cluster id> <text>`` line is printed per tweet,
        ordered by cluster id and then text.
    """
    # A single find() call guarantees that the vocabulary, the training data
    # and the classified examples are all derived from the same documents.
    texts = [tweet['text'] for tweet in tweets.find()]

    all_words = set()
    for text in texts:
        all_words.update(get_words(text))
    all_words = tuple(all_words)

    # Each text is vectorised once; the vectors serve both cluster() and
    # classify().
    vectors = [vectorspaced(text, all_words) for text in texts]

    cluster = GAAClusterer(5)
    cluster.cluster(vectors)

    classified_examples = [cluster.classify(vector) for vector in vectors]

    # The ids belong to ``texts``; zipping them with the unrelated
    # ``job_titles`` name (as before) mislabels or fails outright.
    for cluster_id, text in sorted(zip(classified_examples, texts)):
        # Single-argument print(...) works on both Python 2 and Python 3.
        print('%s %s' % (cluster_id, text))
Пример #6
0
def clusterIt(kwnb, clusternb, keywords):
    """Assign every keyword to one of *clusternb* clusters based on its words.

    Each keyword's text (``keyword.keyword``, stripped of surrounding
    whitespace) is turned into a binary bag-of-stems vector, the vectors are
    clustered with ``GAAClusterer`` and each keyword object is told its
    cluster through ``keyword.assignCluster(cluster_id)``.

    :param kwnb: unused; kept for interface compatibility.
    :param clusternb: number of clusters passed to ``GAAClusterer``.
    :param keywords: iterable of objects exposing a ``keyword`` string
        attribute and an ``assignCluster(cluster_id)`` method.
    :return: list of the keyword objects with non-blank text, ordered by
        cluster id and then by stripped text.  Each object appears once.
        Empty input (or only blank keywords) yields an empty list.
    """
    keywords = list(keywords)
    if not keywords:
        return []

    @decorators.memoize
    def normalize_word(word):
        return stemmer_func(word.lower())

    def get_words(titles):
        # Collect the distinct normalised words of the titles passed in
        # (the previous version ignored its argument).
        vocabulary = set()
        for title in titles:
            for word in title.split():
                vocabulary.add(normalize_word(word))
        return list(vocabulary)

    @decorators.memoize
    def vectorspaced(title):
        # One flag per vocabulary word: set when the word occurs in the
        # title and is not a stop word.
        title_components = set(normalize_word(word) for word in title.split())
        return numpy.array([
            word in title_components and word not in stopwords
            for word in words], numpy.short)

    # Keep each keyword object paired with its stripped text.  Matching the
    # stripped text back against the raw ``keyword.keyword`` (as before)
    # silently dropped keywords with surrounding whitespace and duplicated
    # keywords that share the same text.
    entries = []
    for position, keyword in enumerate(keywords):
        title = keyword.keyword.strip()
        if title:
            entries.append((title, position, keyword))
    if not entries:
        return []

    words = get_words([title for title, _, _ in entries])

    cluster = GAAClusterer(clusternb)
    cluster.cluster([vectorspaced(title) for title, _, _ in entries])

    labelled = [
        (cluster.classify(vectorspaced(title)), title, position, keyword)
        for title, position, keyword in entries
    ]
    # Same ordering as before (cluster id, then text); the original position
    # breaks ties so keyword objects themselves are never compared.
    labelled.sort(key=lambda item: item[:3])

    ret = []
    for cluster_id, _, _, keyword in labelled:
        keyword.assignCluster(cluster_id)
        ret.append(keyword)
    return ret
Пример #7
0
    title_components = [normalize_word(word) for word in title.split()]
    return numpy.array(
        [word in title_components and not word in stopwords for word in words],
        numpy.short)


if __name__ == '__main__':
    # Cluster the titles listed one per line in a text file and print each
    # title next to the id of the cluster it was assigned to.

    filename = 'example.txt'
    if len(sys.argv) == 2:
        # A single command-line argument overrides the default input file.
        filename = sys.argv[1]

    # Keep the file open only while reading; clustering needs just the lines.
    with open(filename) as title_file:
        job_titles = [line.strip() for line in title_file]

    # Blank lines were never part of the training set, so drop them up front
    # instead of classifying (and printing) empty titles.
    job_titles = [title for title in job_titles if title]

    words = get_words(job_titles)

    cluster = GAAClusterer(5)
    vectors = [vectorspaced(title) for title in job_titles]
    cluster.cluster(vectors)

    # NOTE: This is inefficient, cluster.classify should really just be
    # called when you are classifying previously unseen examples!
    classified_examples = [cluster.classify(vector) for vector in vectors]

    for cluster_id, title in sorted(zip(classified_examples, job_titles)):
        # Single-argument print(...) works on both Python 2 and Python 3.
        print('%s %s' % (cluster_id, title))
Пример #8
0
def vectorspaced(stemmer, title):
    """Encode *title* as a binary vector over the module-level ``words`` list.

    Every whitespace-separated token of *title* is lower-cased and stemmed
    with ``stemmer.stem``.  The result has one entry per item of ``words``:
    truthy when that item is among the stems and is not in ``stopwords``.

    :param stemmer: object providing a ``stem(word)`` method.
    :param title: text to encode.
    :return: ``numpy`` array of dtype ``numpy.short``, one entry per word.
    """
    stems = [stemmer.stem(token.lower()) for token in title.split()]

    flags = []
    for candidate in words:
        # Stop words never count, even when they occur in the title.
        is_feature = candidate in stems and candidate not in stopwords
        flags.append(is_feature)

    return numpy.array(flags, numpy.short)
 
if __name__ == '__main__':
    # Cluster the texts listed one per line in a file and print each text
    # next to the id of the cluster it was assigned to.

    filename = 'CSV/pridected_true_text_alldata.csv'
    if len(sys.argv) == 2:
        # A single command-line argument overrides the default input file.
        filename = sys.argv[1]

    # Keep the file open only while reading; clustering needs just the lines.
    with open(filename) as title_file:
        job_titles = [line.strip() for line in title_file]

    # Blank lines were never part of the training set, so drop them up front
    # instead of classifying (and printing) empty titles.
    job_titles = [title for title in job_titles if title]

    words = get_words(stemmer, job_titles)

    cluster = GAAClusterer(30)
    vectors = [vectorspaced(stemmer, title) for title in job_titles]
    cluster.cluster(vectors)

    # NOTE: This is inefficient, cluster.classify should really just be
    # called when you are classifying previously unseen examples!
    classified_examples = [cluster.classify(vector) for vector in vectors]

    for cluster_id, title in sorted(zip(classified_examples, job_titles)):
        # Single-argument print(...) works on both Python 2 and Python 3.
        print('%s %s' % (cluster_id, title))
Пример #9
0
    return numpy.array(
        [word in title_components and not word in stopwords for word in words],
        numpy.short)


if __name__ == '__main__':
    # Read one text per line, group the texts into 30 clusters and print
    # "<cluster id> <text>" for every non-blank line.

    filename = 'CSV/pridected_true_text_alldata.csv'
    if len(sys.argv) == 2:
        # A single command-line argument overrides the default input file.
        filename = sys.argv[1]

    # The file only needs to stay open for the read itself.
    with open(filename) as title_file:
        job_titles = [line.strip() for line in title_file]

    # Training already skipped blank lines; filter them once here so they are
    # not classified or printed either.
    job_titles = [title for title in job_titles if title]

    words = get_words(stemmer, job_titles)

    cluster = GAAClusterer(30)
    vectors = [vectorspaced(stemmer, title) for title in job_titles]
    cluster.cluster(vectors)

    # NOTE: This is inefficient, cluster.classify should really just be
    # called when you are classifying previously unseen examples!
    classified_examples = [cluster.classify(vector) for vector in vectors]

    for cluster_id, title in sorted(zip(classified_examples, job_titles)):
        # Single-argument print(...) works on both Python 2 and Python 3.
        print('%s %s' % (cluster_id, title))