import sys
import collections
import gensim as gs

import util  # local helper module with Wikipedia fetching/cleaning utilities

# Toy corpus used while developing the pipeline:
# text1 = "eat pizza apple sauce food yummy delicious"
# text2 = "tomato pizza food italian delicious eat"
# text3 = "dog cat pet cute kitty puppy love nature"
# text4 = "eat dog cat food puppy love"
# documents = [text1, text2, text3, text4]
# texts = [document.lower().split(" ") for document in documents]

listOfPageName = ['Red', 'Green', 'Blue', 'Black', 'White', 'Brown', 'Pink', 'Gray',
                  'Thailand', 'Japan', 'Qatar', 'India', 'China', 'Vietnam', 'Singapore',
                  'Malaysia', 'Indonesia', 'Dog', 'Cat', 'Pig', 'Cow', 'Bird', 'Lion',
                  'Elephant', 'Fish', 'Snake']

# Fetch and clean each Wikipedia article, then tokenize and drop stop words.
pages = [util.getCleanWikiContent(name) for name in listOfPageName]
texts = [util.removeMeaningless(page.lower().split(' ')) for page in pages]

# Map each token to an integer id and build the bag-of-words corpus.
dictionary = gs.corpora.Dictionary(texts)
d = dict()  # id -> token lookup
for k, v in dictionary.items():
    d[k] = v
corpus = [dictionary.doc2bow(text) for text in texts]

# Train LDA (num_topics=8, though only the first seven topics are inspected here).
lda = gs.models.ldamodel.LdaModel(corpus, num_topics=8)
topic0 = lda.get_topic_terms(0)
topic1 = lda.get_topic_terms(1)
topic2 = lda.get_topic_terms(2)
topic3 = lda.get_topic_terms(3)
topic4 = lda.get_topic_terms(4)
topic5 = lda.get_topic_terms(5)
topic6 = lda.get_topic_terms(6)
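# Sketch (assumption), not part of the original pipeline: get_topic_terms()
# returns (word_id, probability) pairs, so the ids have to be mapped back to
# tokens to be readable. The id -> token dict `d` built above can do that;
# alternatively, constructing LdaModel with id2word=dictionary would let
# gensim's own print_topics() produce labeled topics directly.
for t in range(7):
    terms = lda.get_topic_terms(t)
    print('topic %d: %s' % (t, ', '.join(d[wid] for wid, _prob in terms)))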
listOfPageName = [ "Sushi", "Burrito", "Thailand", "google", "Muslim", "Islam", "Pizza", "South Korea", "Andrew Ng", "Barack Obama", "Google", ] pages = [util.getCleanWikiContent(names) for names in listOfPageName] texts = [util.freqFilter(util.removeMeaningless(page.lower().split(" ")), 0.003, 1) for page in pages] # texts = [getRelativeCount(name, listOfPageName, 4, 0.35) for name in listOfPageName] # texts = [util.removeMeaningless(page.lower().split(' ')) for page in pages] # print texts # texts = util.truncatedList(texts, 0.5, 1) # print texts # print truncatedLists[0] # List of list resampledTexts = [util.updatedResample(text, 100) for text in texts] countTexts = [collections.Counter(text) for text in resampledTexts] # print collections.Counter(resampledTexts[0]) # text0 = list(set(resampledTexts[2])) text0 = resampledTexts[2]