def svgTextFromCnnUrl(cnnurl, N=8, nGram=2, width=740, height=500):
    """Build an SVG image of the most common n-grams in an article's comments.

    Fetches the posts for ``cnnurl`` with ``posts.getPostsFromUrl``, asks
    ``stats.mostCommonNGramsFromPosts`` for the most common n-grams (passing
    ``N`` through), picks the entry at index ``nGram - 1`` of that result,
    converts it with ``stats.nGramsListToTokenStringsList`` and hands the
    token strings to ``svggen.commonTokensListToSVGImage`` together with
    ``width`` and ``height``.

    Two timing lines are printed to stdout: one for the fetch and one for
    the n-gram computation.

    Returns whatever ``svggen.commonTokensListToSVGImage`` returns.
    """
    fetch_started = time()
    fetched_posts = posts.getPostsFromUrl(cnnurl)
    fetch_elapsed = time() - fetch_started
    print(str(fetch_elapsed) + " seconds to fetch " + str(len(fetched_posts)) + " unfiltered posts from disqus")

    parse_started = time()
    # With the default nGram=2 this displays only the most common bigrams.
    ngrams_by_order = stats.mostCommonNGramsFromPosts(fetched_posts, N)
    selected_ngrams = ngrams_by_order[nGram - 1]
    token_strings = stats.nGramsListToTokenStringsList(selected_ngrams)
    parse_elapsed = time() - parse_started
    print(str(parse_elapsed) + " seconds to parse all the comments")

    return svggen.commonTokensListToSVGImage(token_strings, width, height)
def commonNGrams(cnnurl, p, N=30, bn=12):
    """Return token strings for comment n-grams that are not common in the article.

    The article text for ``cnnurl`` is fetched with ``art.articleTextFromURL``
    and its most common n-grams are computed with
    ``stats.mostCommonNGramsFromString``.  The comment n-grams come from
    ``stats.mostCommonNGramsFromPosts(p, N)``; index 0 of both results is
    used as the unigram list and index 1 as the bigram list.

    Args:
        cnnurl: URL of the article.
        p: Posts to analyse.  When ``None`` they are fetched with
            ``posts.getPostsFromUrl(cnnurl)``.
        N: Passed to ``stats.mostCommonNGramsFromPosts`` and also used as
            the total number of entries kept (unigrams plus bigrams).
        bn: Maximum number of bigrams kept; the remaining ``N - bn`` slots
            go to unigrams.

    Returns:
        The result of ``stats.nGramsListToTokenStringsList`` applied to the
        kept unigrams followed by the kept (re-weighted) bigrams.
    """
    article_ngrams = stats.mostCommonNGramsFromString(
        art.articleTextFromURL(cnnurl), 10)
    if p is None:
        p = posts.getPostsFromUrl(cnnurl)
    common_ngrams = stats.mostCommonNGramsFromPosts(p, N)

    article_unigrams = article_ngrams[0]
    article_bigrams = article_ngrams[1]

    # NOTE(review): entry[0] is tested against the *elements* of the article
    # lists.  If those elements are (ngram, count) pairs like the comment
    # entries appear to be, this membership test never matches -- confirm
    # the shape returned by stats.mostCommonNGramsFromString.
    bigrams = [entry for entry in common_ngrams[1]
               if entry[0] not in article_bigrams]
    unigrams = [entry for entry in common_ngrams[0]
                if entry[0] not in article_unigrams]

    # Fake reweighting: multiply the bigram counts by a constant so they are
    # not drowned out by the (more frequent) unigrams.
    bigrams = [(entry[0], entry[1] * 3) for entry in bigrams[0:bn]]

    # Clamp the unigram quota at zero: a negative slice stop (bn > N) would
    # otherwise keep "all but the last few" unigrams instead of none.
    unigram_quota = max(N - bn, 0)
    unigrams = unigrams[0:unigram_quota]

    combined = unigrams + bigrams
    return stats.nGramsListToTokenStringsList(combined)
def getPostsContaining(token, posts):
    """Delegate to ``thread.findPostsContaining`` for the given posts and token.

    The arguments are forwarded in swapped order: ``posts`` first, then
    ``token``.  Inside this function the ``posts`` parameter shadows any
    module-level name ``posts``.

    Returns whatever ``thread.findPostsContaining`` returns.
    """
    matching_posts = thread.findPostsContaining(posts, token)
    return matching_posts