예제 #1
0
import gethtml
import articletext
import articletrans
import getarticle
import test1
from bs4 import BeautifulSoup
import paraphrase
import googlesearch
import articletext

# Script state: links to crawl, the de-duplicated keywords gathered from
# them, a `visited` list (not used in this part of the script), and the
# search term used to seed the crawl.
urls = []
topics = []
visited = []
root_topic = "news"

# Seed the crawl list with every link the search helper returns for the
# root topic.
urls.extend(googlesearch.getGoogleLinks(root_topic))

# For each link: fetch the article text, record any keyword not already in
# `topics` (first-seen order is preserved), then print the article text.
for url in urls:
    mytext = articletext.getArticle(url)
    keywords = articletext.getKeywords(mytext)
    for keyword in keywords:
        if keyword in topics:
            continue
        topics.append(keyword)
    print(mytext)

# Disabled debug call kept from the original script:
#print paraphrase.getTrans("http://sparkbrowser.com")
예제 #2
0

# Module-level state kept from the original script. Neither `keyword_dict`
# nor the configured browser `br` is used by the loop below.
keyword_dict = []
br = mechanize.Browser()
br.set_handle_robots(False)
br.addheaders = [('User-agent', 'Firefox')]

categories = ["business","celebrity","entertainment","trading","online broker","equity trading"]

# For every category, look up its search-result links and write each
# comma-separated entry of every <meta name="keywords"> tag found on those
# pages to categorieslist/<category with spaces as underscores>.txt, one
# entry per line.
# NOTE(review): assumes the "categorieslist" directory already exists;
# open() will raise otherwise -- confirm against the deployment layout.
for cat in categories:
    outpath = "categorieslist/"+cat.replace(" ","_")+".txt"
    # A single "w" open truncates any previous output (what the original
    # "w+" / close / "a" sequence achieved), and the `with` block guarantees
    # the handle is closed even if the search or a write raises.
    with open(outpath, "w") as myfile:
        linklist = googlesearch.getGoogleLinks(cat)
        for link in linklist:
            try:
                htmltext = urllib.urlopen(link).read()
                soup = BeautifulSoup(htmltext)
                res = soup.findAll('meta',attrs={"name":"keywords"})
                for r in res:
                    keylist = r['content']
                    for key in keylist.split(","):
                        myfile.write(str(key)+"\n")
            except Exception:
                # Best-effort scrape: a failing page (fetch error, meta tag
                # without "content", unencodable keyword) is reported and
                # skipped. Catching Exception rather than using a bare
                # except lets KeyboardInterrupt/SystemExit stop the run.
                print("err")