import gethtml
import articletext
import articletrans
import getarticle
import test1
from bs4 import BeautifulSoup
import paraphrase
import googlesearch
import articletext

# Crawl state: links to fetch, keywords collected so far, and a (currently
# unused) list reserved for visited links.
urls = []
topics = []
visited = []
root_topic = "news"

# Seed the work list with the search results for the root topic.
urls.extend(googlesearch.getGoogleLinks(root_topic))

# Fetch each article, collect every keyword not seen before, and echo the text.
# NOTE(review): the original formatting was lost; the print is assumed to run
# once per article (inside the URL loop) -- confirm.
for url in urls:
    mytext = articletext.getArticle(url)
    for keyword in articletext.getKeywords(mytext):
        if keyword in topics:
            continue
        topics.append(keyword)
    print(mytext)

# print paraphrase.getTrans("http://sparkbrowser.com")
# Scrape the <meta name="keywords"> values of the search results for each
# category and store them, one keyword per line, in
# categorieslist/<category_with_underscores>.txt (the file is overwritten on
# every run).
#
# NOTE(review): `mechanize` and `urllib` are used below but are not imported in
# the visible part of this file -- confirm they are imported elsewhere.
import io  # io.open writes UTF-8 text identically on Python 2 and Python 3

keyword_dict = []

br = mechanize.Browser()
br.set_handle_robots(False)
br.addheaders = [('User-agent', 'Firefox')]

categories = ["business", "celebrity", "entertainment", "trading",
              "online broker", "equity trading"]

for cat in categories:
    path = "categorieslist/" + cat.replace(" ", "_") + ".txt"
    # A single "w" open replaces the old truncate-close-reopen-append dance,
    # and `with` guarantees the file is closed even if the search call fails.
    with io.open(path, "w", encoding="utf-8") as myfile:
        linklist = googlesearch.getGoogleLinks(cat)
        for link in linklist:
            # Best effort per link: a page that cannot be fetched or parsed is
            # reported and skipped so the remaining links are still processed.
            try:
                response = urllib.urlopen(link)
                try:
                    htmltext = response.read()
                finally:
                    response.close()
                soup = BeautifulSoup(htmltext)
                res = soup.findAll('meta', attrs={"name": "keywords"})
                for r in res:
                    # Some keyword meta tags carry no content attribute; skip
                    # them instead of aborting the whole page on a KeyError.
                    keylist = r.get('content')
                    if not keylist:
                        continue
                    for key in keylist.split(","):
                        # Written as text so non-ASCII keywords are stored as
                        # UTF-8 rather than failing an implicit ASCII encode.
                        myfile.write(u"%s\n" % key)
            except Exception as exc:
                # Exception (not a bare except) lets Ctrl-C / SystemExit
                # through; %r keeps the message printable on any console.
                print("err: %r: %r" % (link, exc))