示例#1
0
import sys

# text1 = "eat pizza apple sauce food yummy delicious"
# text2 = "tomato pizza food italian delicious eat"
# text3 = "dog cat pet cute kitty puppy love nature"
# text4 = "eat dog cat food puppy love"
# documents = [text1, text2, text3, text4]
# texts = [document.lower().split(" ") for document in documents]

# Wikipedia page titles to model: colours, countries and animals.
listOfPageName = [
    'Red', 'Green', 'Blue', 'Black', 'White', 'Brown', 'Pink', 'Gray',
    'Thailand', 'Japan', 'Qatar', 'India', 'China', 'Vietnam', 'Singapore',
    'Malaysia', 'Indonesia',
    'Dog', 'Cat', 'Pig', 'Cow', 'Bird', 'Lion', 'Elephant', 'Fish', 'Snake',
]

# One cleaned content string per title, in the same order as listOfPageName.
pages = list(map(util.getCleanWikiContent, listOfPageName))

# Lower-case each page, split it on single spaces, then let the project
# helper drop the tokens it considers meaningless.
texts = []
for content in pages:
    tokens = content.lower().split(' ')
    texts.append(util.removeMeaningless(tokens))

# Vocabulary built over every tokenised page.
dictionary = gs.corpora.Dictionary(texts)

# Plain-dict snapshot of the dictionary's (key, value) pairs
# (presumably token id -> token; confirm against the gensim docs).
# NOTE(review): `d` is not used again in the visible part of the script.
d = dict(dictionary.items())

# Convert every document with doc2bow and fit an 8-topic LDA model on the
# result.
corpus = list(map(dictionary.doc2bow, texts))
lda = gs.models.ldamodel.LdaModel(corpus, num_topics=8)

# Terms for topics 0..6; the model has 8 topics, but only these seven are
# fetched in the visible part of the script.
topic0, topic1, topic2, topic3, topic4, topic5, topic6 = (
    lda.get_topic_terms(topic_id) for topic_id in range(7)
)
示例#2
0
# Wikipedia page titles to process.
# NOTE(review): both "google" and "Google" are listed -- confirm the
# duplicate (differing only in case) is intentional.
listOfPageName = [
    "Sushi", "Burrito", "Thailand", "google", "Muslim", "Islam",
    "Pizza", "South Korea", "Andrew Ng", "Barack Obama", "Google",
]

# One cleaned content string per title, in the same order as listOfPageName.
pages = list(map(util.getCleanWikiContent, listOfPageName))

# Tokenise each page (lower-case, split on single spaces), drop the tokens
# the project helper considers meaningless, then apply the frequency filter
# with arguments (0.003, 1).
texts = []
for content in pages:
    tokens = util.removeMeaningless(content.lower().split(" "))
    texts.append(util.freqFilter(tokens, 0.003, 1))

# Alternative ways of building `texts` that are currently disabled:
# texts = [getRelativeCount(name, listOfPageName, 4, 0.35) for name in listOfPageName]
# texts = [util.removeMeaningless(page.lower().split(' ')) for page in pages]
# texts = util.truncatedList(texts, 0.5, 1)

# List of lists: each token list is resampled with the argument 100, then
# the items of every resampled list are counted.
resampledTexts = [util.updatedResample(tokens, 100) for tokens in texts]
countTexts = list(map(collections.Counter, resampledTexts))

# Resampled tokens of the third page ("Thailand"). A de-duplicated variant
# is left disabled below.
# text0 = list(set(resampledTexts[2]))
text0 = resampledTexts[2]