Python GAAClusterer.classify примеры использования

Язык программирования: Python

Пространство имен/Пакет: nltk.cluster

Класс/Тип: GAAClusterer

Метод/Функция: classify

Примеров на hotexamples.com: 9

Python GAAClusterer.classify - 9 примеров найдено. Это лучшие примеры Python кода для nltk.cluster.GAAClusterer.classify, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать / Скрыть

GAAClusterer(11)

cluster(9)

classify(4)

dendrogram(1)

update_clusters(1)

Пример #1

Показать файл

def demo():
    """
    Non-interactive demonstration of the GAAC clusterer with simple 2-D data.

    Clusters six fixed 2-D points into four groups, prints the clusterer,
    the input vectors and the value returned by ``cluster``, shows the
    dendrogram, and finally classifies one more vector.
    """
    # Imported locally so the demo does not rely on module-level imports.
    import numpy
    from nltk.cluster import GAAClusterer

    # use a set of tokens with 2D indices
    points = [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]
    vectors = [numpy.array(point) for point in points]

    # test the GAAC clusterer with 4 clusters
    clusterer = GAAClusterer(4)
    clusters = clusterer.cluster(vectors, True)

    # A single pre-formatted argument makes print(...) emit the same text
    # under both Python 2 (print statement) and Python 3 (print function).
    print('Clusterer: %s' % (clusterer,))
    print('Clustered: %s' % (vectors,))
    print('As: %s' % (clusters,))
    print('')

    # show the dendrogram
    clusterer.dendrogram().show()

    # classify a new vector
    vector = numpy.array([3, 3])
    print('classify(%s): %s' % (vector, clusterer.classify(vector)))
    print('')

Пример #2

Показать файл

Файл: procjson_clust.py Проект: abitofalchemy/ScientificImpactPrediction

def extract_tweets_citedby_graph(df):
    """
    Cluster the texts in column 2 of ``df`` and write selected rows to disk.

    Side effects: rebinds the module globals ``stemmer_func``, ``stopwords``
    and ``words``, and overwrites ``Results/clustered_relevant_users.tsv``.

    :param df: table addressed by integer column labels 0, 1 and 2; column 2
        holds the texts to cluster.  Looks like a pandas DataFrame
        (uses ``.values`` and ``.loc``) -- TODO confirm.
    :return: None
    """
    global stemmer_func, words, stopwords

    stemmer_func = nltk.stem.snowball.SnowballStemmer("english").stem
    stopwords = set(nltk.corpus.stopwords.words('english'))

    titles = df[2].values
    words = get_words(titles)

    # K-Means alternative that was tried here:
    # KMeansClusterer(7, euclidean_distance, avoid_empty_clusters=True)
    cluster = GAAClusterer(21)

    # Falsy (e.g. empty) titles are excluded from training but are still
    # classified afterwards, so the result lines up with ``titles``.
    cluster.cluster([vectorspaced(title) for title in titles if title], True)
    classified_examples = [cluster.classify(vectorspaced(title)) for title in titles]

    # Display clusters / write to disk
    out_path = 'Results/clustered_relevant_users.tsv'
    with open(out_path, 'w') as f:
        for cluster_id, title in sorted(zip(classified_examples, titles)):
            # Only clusters with an id above 6 are kept.
            if cluster_id > 6:
                # save: docid tab userids
                rows = df[2] == title
                f.write('{}\t{}\n'.format(df[0].loc[rows].values, df[1].loc[rows].values))

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    if os.path.exists(out_path):
        print('file saved: Results/clustered_relevant_users.tsv')

Пример #3

Показать файл

Файл: analyze.py Проект: caar2000/twitter-archive-analysis

def get_word_clusters(tweets):
    """
    Cluster tweet texts into 5 GAAC clusters and print ``cluster_id title`` pairs.

    :param tweets: iterable of records indexable by ``HEADER_DICT["text"]``.
    :return: None; output goes to stdout.
    """
    # Materialise the texts once so a one-shot iterable also works and the
    # vocabulary, training data and classified data all come from the same rows.
    text_key = HEADER_DICT["text"]
    texts = [tweet[text_key] for tweet in tweets]

    # Vocabulary: every distinct word produced by get_words over all texts.
    all_words = tuple({word for text in texts for word in get_words(text)})

    # Build each vector once and reuse it for training and classification.
    vectors = [vectorspaced(text, all_words) for text in texts]

    cluster = GAAClusterer(5)
    cluster.cluster(vectors)
    classified_examples = [cluster.classify(vector) for vector in vectors]

    # NOTE(review): ``job_titles`` is not defined in this function; it must be
    # a module-level name, otherwise this raises NameError. The tweet texts
    # may have been intended here -- confirm before changing.
    for cluster_id, title in sorted(zip(classified_examples, job_titles)):
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print('%s %s' % (cluster_id, title))

Пример #4

Показать файл

Файл: basic_time_stats.py Проект: terratenney/harvesters

def get_word_clusters():
    """
    Cluster the ``'text'`` field of every tweet into 5 GAAC clusters and print
    ``cluster_id title`` pairs.

    Reads the module-level ``tweets`` object through ``tweets.find()``
    (presumably a database collection -- confirm).

    :return: None; output goes to stdout.
    """
    # Query once: separate find() calls could see different data, leaving the
    # vocabulary and the vectors built from it out of step.
    texts = [tweet['text'] for tweet in tweets.find()]

    all_words = set()
    for text in texts:
        all_words.update(get_words(text))
    all_words = tuple(all_words)

    # Each vector is built once and reused for training and classification.
    vectors = [vectorspaced(text, all_words) for text in texts]

    cluster = GAAClusterer(5)
    cluster.cluster(vectors)
    classified_examples = [cluster.classify(vector) for vector in vectors]

    # NOTE(review): ``job_titles`` is not defined in this function; it must be
    # a module-level name, otherwise this raises NameError. The tweet texts
    # may have been intended here -- confirm before changing.
    for cluster_id, title in sorted(zip(classified_examples, job_titles)):
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print('%s %s' % (cluster_id, title))

Пример #5

Показать файл

Файл: basic_time_stats.py Проект: terratenney/harvesters

def get_word_clusters():
    """
    Group tweet texts with a 5-cluster ``GAAClusterer`` and print one
    ``cluster_id title`` line per classified example.

    The tweets come from the module-level ``tweets`` object via
    ``tweets.find()`` (presumably a database collection -- confirm).

    :return: None; results are printed.
    """
    # One query instead of three, so the vocabulary, the training vectors and
    # the classified vectors are all derived from the same set of tweets.
    tweet_texts = [tweet['text'] for tweet in tweets.find()]

    # Distinct words over all texts, frozen into a tuple for vectorspaced().
    all_words = tuple({word for text in tweet_texts for word in get_words(text)})

    feature_vectors = [vectorspaced(text, all_words) for text in tweet_texts]

    cluster = GAAClusterer(5)
    cluster.cluster(feature_vectors)
    classified_examples = [cluster.classify(vec) for vec in feature_vectors]

    # NOTE(review): ``job_titles`` is not bound anywhere in this function, so
    # it has to exist at module level or this loop raises NameError. It may
    # be a leftover from a job-title example -- verify against the caller.
    for cluster_id, title in sorted(zip(classified_examples, job_titles)):
        # Pre-formatted single argument: identical output on Python 2 and 3.
        print('%s %s' % (cluster_id, title))

Пример #6

Показать файл

Файл: clustering.py Проект: E-Conference/KeywordExtactor

def clusterIt(kwnb, clusternb, keywords):
    """
    Assign a GAAC cluster id to keyword objects and return the assigned ones.

    :param kwnb: not used by this function.
    :param clusternb: number of clusters handed to ``GAAClusterer``.
    :param keywords: sized collection of objects exposing a ``keyword`` string
        attribute and an ``assignCluster(cluster_id)`` method.
    :return: list of the keyword objects that had a cluster assigned;
        an empty list when ``keywords`` is empty.
    """
    # Nothing to cluster: return before building any helper.
    if len(keywords) == 0:
        return []

    @decorators.memoize
    def normalize_word(word):
        # Lower-case, then stem with the module-level stemmer.
        return stemmer_func(word.lower())

    def get_words(titles):
        # Vocabulary of normalised words over ``titles`` (the argument is now
        # actually used instead of the enclosing ``job_titles`` variable).
        return list({normalize_word(word) for title in titles for word in title.split()})

    @decorators.memoize
    def vectorspaced(title):
        # One 0/1 entry per vocabulary word: present in the title and not a
        # stopword. A set keeps each membership test constant-time.
        title_components = {normalize_word(word) for word in title.split()}
        return numpy.array(
            [word in title_components and word not in stopwords for word in words],
            numpy.short)

    job_titles = [x.keyword.strip() for x in keywords]
    words = get_words(job_titles)

    # Alternative that was tried: KMeansClusterer(5, euclidean_distance)
    cluster = GAAClusterer(clusternb)
    # Empty titles are excluded from training but still classified, so that
    # ``classified_examples`` stays aligned with ``job_titles``.
    cluster.cluster([vectorspaced(title) for title in job_titles if title])
    classified_examples = [cluster.classify(vectorspaced(title)) for title in job_titles]

    ret = []
    for cluster_id, title in sorted(zip(classified_examples, job_titles)):
        if title == '':
            continue
        # NOTE(review): titles were stripped above but are compared with the
        # unstripped ``keyword.keyword``, and keywords sharing the same text
        # are appended once per duplicate title -- confirm this is intended.
        for keyword in keywords:
            if title == keyword.keyword:
                keyword.assignCluster(cluster_id)
                ret.append(keyword)
    return ret

Пример #7

Показать файл

Файл: pynltk.py Проект: manojkmr63712/Python_Repo

    title_components = [normalize_word(word) for word in title.split()]
    return numpy.array(
        [word in title_components and not word in stopwords for word in words],
        numpy.short)


if __name__ == '__main__':

    # Input file: 'example.txt' unless exactly one command-line argument is given.
    filename = 'example.txt'
    if len(sys.argv) == 2:
        filename = sys.argv[1]

    # Read all titles first so the file is closed before clustering starts.
    with open(filename) as title_file:
        job_titles = [line.strip() for line in title_file]

    # Module-level vocabulary built from the titles.
    words = get_words(job_titles)

    # Alternative that was tried: KMeansClusterer(5, euclidean_distance)
    cluster = GAAClusterer(5)
    # Blank lines are left out of training.
    cluster.cluster([vectorspaced(title) for title in job_titles if title])

    # NOTE: This is inefficient, cluster.classify should really just be
    # called when you are classifying previously unseen examples!
    classified_examples = [
        cluster.classify(vectorspaced(title)) for title in job_titles
    ]

    for cluster_id, title in sorted(zip(classified_examples, job_titles)):
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print('%s %s' % (cluster_id, title))

Пример #8

Показать файл

Файл: Clusterer.py Проект: GBelzoni/iHub

def vectorspaced(stemmer, title):
    """
    Encode ``title`` as a 0/1 vector over the module-level vocabulary ``words``.

    An entry is 1 when the vocabulary word equals the stem of one of the
    lower-cased, whitespace-separated tokens of ``title`` and is not in the
    module-level ``stopwords``; otherwise it is 0.

    :param stemmer: object with a ``stem(word)`` method.
    :param title: text to encode.
    :return: ``numpy`` array of dtype ``numpy.short``, one entry per word in ``words``.
    """
    # A set keeps each per-word membership test constant-time.
    title_components = {stemmer.stem(word.lower()) for word in title.split()}
    return numpy.array(
        [word in title_components and word not in stopwords for word in words],
        numpy.short)
 
if __name__ == '__main__':

    # Default input path; a single command-line argument overrides it.
    filename = 'CSV/pridected_true_text_alldata.csv'
    if len(sys.argv) == 2:
        filename = sys.argv[1]

    # Load every line up front; the handle is released before clustering.
    with open(filename) as title_file:
        job_titles = [line.strip() for line in title_file]

    # Module-level vocabulary built from the titles.
    words = get_words(stemmer, job_titles)

    # Alternative that was tried: KMeansClusterer(5, euclidean_distance)
    cluster = GAAClusterer(30)
    # Blank lines are skipped for training only.
    cluster.cluster([vectorspaced(stemmer, title) for title in job_titles if title])

    # NOTE: This is inefficient, cluster.classify should really just be
    # called when you are classifying previously unseen examples!
    classified_examples = [
        cluster.classify(vectorspaced(stemmer, title)) for title in job_titles
    ]

    for cluster_id, title in sorted(zip(classified_examples, job_titles)):
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print('%s %s' % (cluster_id, title))

Пример #9

Показать файл

Файл: Clusterer.py Проект: GBelzoni/iHub

    return numpy.array(
        [word in title_components and not word in stopwords for word in words],
        numpy.short)


if __name__ == '__main__':

    # Use the bundled CSV unless exactly one path is given on the command line.
    filename = 'CSV/pridected_true_text_alldata.csv'
    if len(sys.argv) == 2:
        filename = sys.argv[1]

    # One stripped title per input line; the file is closed once they are read.
    with open(filename) as title_file:
        job_titles = [line.strip() for line in title_file]

    # Module-level vocabulary built from the titles.
    words = get_words(stemmer, job_titles)

    # Alternative that was tried: KMeansClusterer(5, euclidean_distance)
    cluster = GAAClusterer(30)
    # Only non-empty titles take part in training.
    training_vectors = [
        vectorspaced(stemmer, title) for title in job_titles if title
    ]
    cluster.cluster(training_vectors)

    # NOTE: This is inefficient, cluster.classify should really just be
    # called when you are classifying previously unseen examples!
    classified_examples = [
        cluster.classify(vectorspaced(stemmer, title))
        for title in job_titles
    ]

    for cluster_id, title in sorted(zip(classified_examples, job_titles)):
        # Pre-formatted single argument: identical output on Python 2 and 3.
        print('%s %s' % (cluster_id, title))