def test(path="/Users/andrewkittredge/Source/ai/similar_page/corpora/friendly_rentals.txt"):
    """Smoke-test the corpus pipeline on a sample corpus file.

    Reads *path*, cleanses the text, builds word 1-grams, loads them into
    a Corpa model, and computes the variance from the model.

    :param path: corpus file to load (defaults to the original hard-coded
        sample; parameterized so other corpora can be tested).
    """
    # Renamed from `input` — the original shadowed the builtin.
    raw_text = slurp(path)
    cleansed_text = cleanse(raw_text)
    n_grams = word_n_grams(cleansed_text, 1)
    corpa = Corpa()
    corpa.add_n_grams(n_grams)
    corpa.variance_from_model()
def process_url(url):
    """Fetch *url* and extract its cleansed visible text and outbound links.

    :param url: URL to download and parse.
    :returns: ``(page_text, urls)`` where ``page_text`` is the cleansed
        visible text of the page and ``urls`` is a list of stripped
        ``href`` values from its anchor tags (empty if extraction fails).
    """
    page = get_page_contents(url)
    page_soup = BeautifulSoup(page)
    page_text = cleanse(visible_text(page_soup))
    # BUG FIX: the original left `urls` unbound when the extraction below
    # raised, so the `return` line crashed with NameError and masked the
    # real error. Initialize it so failure degrades to "no links found".
    urls = []
    try:
        # NOTE(review): `has_key` works on old-BeautifulSoup Tags but is
        # removed in bs4 — switch to `anchor.has_attr('href')` if upgrading.
        urls = [anchor['href'].strip()
                for anchor in page_soup.findAll('a')
                if anchor.has_key('href')]
    except Exception as e:
        # Deliberate best-effort: log and continue with no links.
        # `print(e)` is valid in both Python 2 and 3, unlike `print e`.
        print(e)
    return page_text, urls