import re

import nltk
from nltk.collocations import BigramCollocationFinder
from unidecode import unidecode
from bs4 import BeautifulSoup  # on BeautifulSoup 3: from BeautifulSoup import BeautifulSoup

# wikiApi, wikiParser and htmltags are assumed to be provided elsewhere in this project.
# bigram_measures is the standard NLTK association-measures object used by the finder below.
bigram_measures = nltk.collocations.BigramAssocMeasures()


def collocations(query, lang):
    """Return the top bigram collocations found in the Wikipedia article(s) for `query`."""
    langMap = {'es': 'Spanish', 'en': 'English'}
    stemmer = nltk.stem.snowball.SnowballStemmer(langMap[lang].lower())
    j = wikiApi.get_article(query, lang)
    wordDict = {}
    corpus = ''
    # Concatenate the plain text of every returned page into one corpus.
    for page in j:
        wikitext = wikiParser(j[page]['content']).text
        bfSoup = ' '.join(BeautifulSoup(wikitext).findAll(text=True))
        corpus = corpus + " " + bfSoup
    tokens = nltk.wordpunct_tokenize(corpus)
    assert tokens
    # Look for bigrams within a 20-token window that occur at least 4 times,
    # skipping stop words, HTML tag names and very short tokens.
    finder = BigramCollocationFinder.from_words(tokens, window_size=20)
    finder.apply_freq_filter(4)
    ignored_words = nltk.corpus.stopwords.words('english')
    ignored_words.extend(htmltags)
    finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
    a = finder.nbest(bigram_measures.likelihood_ratio, 500)
    # Drop duplicate pairs, including reversed ones.
    final = []
    for k in a:
        if k in final or (k[1], k[0]) in final:
            continue
        final.append(k)
    return final
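# Usage note (a sketch, not part of the original module): collocations() returns a
# list of (word, word) tuples ranked by likelihood ratio, with reversed duplicates
# removed, e.g.
#     pairs = collocations('Madrid', 'es')   # 'Madrid' is a hypothetical query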
def main(query, lang):
    """Build a per-stem frequency table from the Wikipedia article(s) for `query`."""
    langMap = {'es': 'Spanish', 'en': 'English'}
    stemmer = nltk.stem.snowball.SnowballStemmer(langMap[lang].lower())
    j = wikiApi.get_article(query, lang)
    wordDict = {}
    for page in j:
        t = wikiParser(j[page]['content'])
        # Count the (stemmed) section headers first; fall back to an ASCII
        # transliteration if the stemmer rejects the original header.
        for header in t.headers:
            try:
                stemmedHeader = stemmer.stem(header)
            except Exception, e:
                print str(e)
                header = unidecode(header)
                stemmedHeader = stemmer.stem(header)
            if stemmedHeader in wordDict:
                wordDict[stemmedHeader]['count'] += 1
            else:
                wordDict[stemmedHeader] = {'count': 1, 'form': stemmedHeader}
        text = t.text
        print type(text)
        # Keep only alphabetic tokens and drop the stop words for the article's language.
        tokens = [k.split('|')[0]
                  for k in nltk.PunktWordTokenizer().tokenize(text)
                  if re.match('[a-zA-Z]', k)]
        words = [w.lower() for w in tokens
                 if w.encode('utf-8').lower()
                 not in nltk.corpus.stopwords.words(langMap[lang].lower())]
        print len(words)
        # Tally each stemmed word, remembering the first surface form seen.
        for w in words:
            try:
                st = stemmer.stem(w)
            except Exception, e:
                st = stemmer.stem(unidecode(w))
                w = unidecode(w)
                continue
            if st in wordDict:
                wordDict[st]['count'] += 1
            else:
                wordDict[st] = {'count': 1, 'form': w}
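# Minimal demo sketch, assuming the module's wikiApi/wikiParser dependencies are
# importable and the MediaWiki API is reachable. The query string 'Madrid' is a
# hypothetical example, not taken from the original code.
if __name__ == '__main__':
    main('Madrid', 'es')
    for pair in collocations('Madrid', 'es')[:20]:
        print ' '.join(pair)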