def setUp(self):
    """Prepare the fixture for whichever pipeline stage config selects.

    Depending on the config flags this either (re)crawls from the seed
    URLs, rebuilds the URL-similarity matrix, or regenerates the
    user/url-hit clusters and records the resulting user count.
    """
    self.config = config
    self.dbname = self.config.dbname
    # setupcommon(config)
    if self.config.startfromcrawling:
        dbcleanup(self.dbname)
        seed = self.config.seed
        crawled = crawler(self.dbname)
        crawled.createindextables()
        crawled.crawl(seed, depth=self.config.crawlerdepth)
        crawled.calculatepagerank()
        # BUG FIX: the original read `crawler.con` — the crawler CLASS —
        # instead of the `crawled` instance created above; use the
        # instance's connection.
        cur = crawled.con.execute(
            'select * from pagerank,urllist '
            'where pagerank.urlid=urllist.rowid order by score desc')
    elif self.config.startfromurlsimilarity:
        urlsimilarityobj = urlsimilarity(self.dbname)
        urlsimilarityobj.createtables()
        urlsimilarityobj.fillsimilaritymatrix()
    elif self.config.startfromuserurlhits:
        # The first call populates the user/url-hit tables (side effect);
        # the authoritative count is then re-read from the database.
        self.numusers = createuserurlhitsfromsimilarurls(
            self.dbname,
            minclusterlength=self.config.minclusterlength,
            minsimilarity=self.config.minsimilarityforclustering,
            loglevel=self.config.loglevel)
        self.numusers = getuseridcountfromdbase(self.dbname)
def test_crawler(new=False):
    """Crawl one sample page and compute PageRank over the index.

    When *new* is true the index tables are (re)created first.
    """
    engine = crawler("output/search.db")
    if new:
        engine.createindextables()
    seeds = ['https://www.york.ac.uk/teaching/cws/wws/webpage1.html']
    engine.crawl(seeds)
    engine.calculatepagerank()
def test_crawler2(): sys.stderr.write("testing crawler...\n") crawler=searchengine.crawler('searchindex.db') pages= \ ['http://kiwitobes.com/'] #crawler.crawl(pages) print [row for row in crawler.con.execute( 'select rowid from wordlocation where wordid=1')]
def generateDB(): crawler = searchengine.crawler('searchindex.db') crawler.createindextables() pages = ['http://www.chinagrain.cn/'] try: crawler.crawl(pages, maxpages=100) except Exception, e: print Exception, ":", e
def test_se_crawler():
    """Crawl two seed sites to depth 2 and run 20 PageRank iterations."""
    c = se.crawler('crawler.db')
    c.createindextables()
    seeds = ['https://mengyangyang.org/', 'https://docs.python.org/2/']
    c.crawl(seeds, 2)
    c.calculatepagerank(20)
def pageRank(): reload(searchengine) crawler=searchengine.crawler('searchindex.db') e=searchengine.searcher('searchindex.db') #crawler.calculatepagerank( ) cur=crawler.con.execute('select * from pagerank order by score desc') for i in range(3): d=cur.next() print d,e.geturlname(d[0])
def testseparatewords(self):
    """Fetch a forum page, strip its markup, and compare a token slice."""
    craw = searchengine.crawler('searchindex.db')
    url = "http://forum.ubuntu.org.cn/index.php"
    conn = urllib2.urlopen(url)
    soup = BeautifulSoup(conn.read())
    text = craw.gettextonly(soup)
    print(text)
    words = craw.separatewords(text)
    # Expected slice left empty in the original — fill in once the page
    # content is pinned down.
    wordsneedtobe = []
    self.assertEqual(words[100:120], wordsneedtobe[:])
def test_calculate_pagerank(): sys.stderr.write("testing pagerank calculation...\n") crawler=searchengine.crawler('searchindex.db') crawler.calculatepagerank() sys.stderr.write("checking pagerank result...\n") cur=crawler.con.execute('select * from pagerank order by score desc') for i in range(3): print cur.next() sys.stderr.write("checking pagerank top url...\n") e=searchengine.searcher('searchindex.db') urlid=cur.next()[0] print e.geturlname(urlid)
import os
import searchengine

fn = 'searchindex.db'
if False:
    os.unlink(fn)  # execute if we want to "recrawl" the Perl web pages

crawler = searchengine.crawler(fn)
# crawler.createindextables()  # may be skipped once the db already exists

# crawl some pages:
# pagelist = ['https://en.wikipedia.org/wiki/R_(programming_language)']
# crawler.crawl(pagelist)
pagelist = ['http://www.diveintopython.net']
# crawler.crawl(pagelist)  # may be skipped once the db already exists
# [row for row in crawler.con.execute('select rowid from wordlocation where wordid=1')]

import neuralnetwork as nn
mynet = nn.searchnet('nn.db')
# mynet.maketables()

wordstosearch = 'python programming'
e = searchengine.searcher('searchindex.db')
e.getmatchrows(wordstosearch)

# create the needed tables for the page rank algorithm:
crawler.calculatepagerank()
# before adding in the page rank algorithm into the weights that form the
# scoring function
print '\n' if __name__ == '__main__': ''' 2. Boolean operations. Many search engines support Boolean queries, which allow users to construct searches like "python OR perl." An OR search can work by doing the queries separately and combining the results, but what about "python AND (program OR code)"? Modify the query methods to support some basic Boolean operations. 3. Exact matches. Search engines often support "exact match" queries, where the words in the page must match the words in the query in the same order with no additional words in between. Create a new version of getrows that only returns results that are exact matches. (Hint: you can use subtraction in SQL to get the difference between the word locations.) ''' dbname = 'searchindex.db' if True: crawler = se.crawler(dbname) crawler.createindextables() pages = [ 'https://www.zhihu.com/', 'https://github.com/' ] crawler.crawl(pages, depth=2) crawler.calculatepagerank() else: searcher = se.searcher(dbname) q = 'zhihu career' print searcher.query(q)
def test_createindextables():
    """Create the index schema in the search database."""
    sys.stderr.write("testing create index tables...\n")
    c = searchengine.crawler('searchindex.db')
    c.createindextables()
def test_crawler():
    """Run the crawler over the sample site (db name empty, as written)."""
    sys.stderr.write("testing crawler...\n")
    seeds = ['http://kiwitobes.com/']
    c = searchengine.crawler('')
    c.crawl(seeds)
def setUp(self):
    # Fixture: a crawler bound to the local test database file.
    self.c = searchengine.crawler("test.db")
#coding:utf-8 #!/usr/bin/env python __author__ = 'dick' import searchengine craw = searchengine.crawler('searchindex.db') # craw.createindextables() pages = [ # 'http://www.bbc.com/', 'https://www.hao123.com/?1477704964', # 'https://www.baidu.com', ] # craw.crawl(pages) e = searchengine.searcher('searchindex.db') print e.getmatchrows('hao weather yes')
def calculatepagerank(): reload(searchengine) crawler=searchengine.crawler('searchindex.db') crawler.calculatepagerank( ) cur=crawler.con.execute('select * from pagerank order by score desc') for i in range(3): print cur.next( )
import searchengine

# Seed URLs for the crawl.
pagelist = [
    'https://www.guokr.com',
    'http://www.zhihu.com',
    'http://www.douban.com',
    'https://zh.wikipedia.org',
]
crawler = searchengine.crawler('searchindex.db')
crawler.crawl(pagelist)
def intialize():
    """Create the index tables and crawl the single Perl seed page.

    (Function name kept as-is — `intialize` [sic] — callers may rely on it.)
    """
    seeds = ['http://kiwitobes.com/wiki/Perl.html']
    c = searchengine.crawler('searchIndex.db')
    c.createindextables()
    c.crawl(seeds)
#!/usr/bin/python
import searchengine

pages = ['https://www.python.org/']
crawler = searchengine.crawler('index.db')
crawler.createindextables()
crawler.crawl(pages)
# NOTE(review): other scripts spell this `calculatepagerank` — confirm this
# snake_case name matches the searchengine module's actual API.
crawler.calculate_page_rank()
# cur = crawler.con.execute('select * from pagerank order by score desc')
# for i in range(3): print cur.next( )
import searchengine

pages = ['https://news.google.com.tw/']
crawler = searchengine.crawler('test')
crawler.createindextables()  # create tables
crawler.crawl(pages)
# NOTE(review): `caculatepagerank` looks misspelled — confirm it matches
# the method name actually defined in searchengine.
crawler.caculatepagerank()

e = searchengine.searcher('test')
e.query('單場 球季')
def testcrawl(pages=None):
    """Crawl *pages*, defaulting to the categorical programming-language list.

    FIX: the original used a mutable list as the default argument; the
    None-sentinel idiom avoids the shared-default pitfall while keeping the
    same behavior for callers (the list was never mutated).
    """
    if pages is None:
        pages = ['http://kiwitobes.com/wiki/Categorical_list_of_programming_languages.html']
    crawler = searchengine.crawler()
    crawler.crawl(pages)
import searchengine if __name__ == "__main__": pages = ['https://en.wikipedia.org/wiki/Finite_difference'] crawler = searchengine.crawler('searchengine.db') crawler.createindextables() crawler.crawl(pages) print "Added %d pages" % (crawler.totallinks())
def intialize():
    """Set up the index schema and crawl the Perl seed page.

    (Name kept as spelled in the original.)
    """
    crawler = searchengine.crawler('searchIndex.db')
    crawler.createindextables()
    crawler.crawl(['http://kiwitobes.com/wiki/Perl.html'])
import searchengine

# Single-seed crawl; the empty db name is kept as in the original.
pagelist = ['http://www.baidu.com']
craweler = searchengine.crawler('')  # [sic] name kept as written
craweler.crawl(pagelist)
def setUp(self):
    # Fixture: crawler constructed with an empty database name, as the
    # tests in this suite exercise it.
    self.c = searchengine.crawler('')
# c = urllib2.urlopen('http://kiwitobes.com/wiki/Programming_language.html')
# contents = c.read()
# print contents[0:50]

import os
# Work from the chapter-04 project directory so relative db paths resolve.
os.chdir("D:\\Machine-Learning\\trunk\\programming-collective-intelligence\\chapter04-search-and-ranking\\searchengine")

import searchengine
# pagelist = ['http://kiwitobes.com/wiki/Perl.html']
# crawler = searchengine.crawler('mydb.db')
# crawler.crawl(pagelist)
# crawler.createindextables()

crawler = searchengine.crawler("searchindex.db")
# crawler.createindextables()
# pages = ['http://kiwitobes.com/wiki/Categorical_list_of_programming_languages.html']
# crawler.crawl(pages)
# crawler.calculatepagerank()

# search
# e = searchengine.searcher('searchindex.db')
# e.query('function programming')

import nn
mynn = nn.searchnet("nndb.db")
# mynn.maketables()
def crawlerhtml():
    """Construct a crawler for the Linux-article seed page.

    NOTE(review): `pagelist` is assigned but `crawl()` is never invoked —
    confirm whether the crawl call is missing intentionally.
    """
    pagelist = ['http://www.linuxidc.com/Linux/2012-09/70576.htm']
    crawler = searchengine.crawler('searchindex.db')
'''
Created on Feb 10, 2014

@author: ssashita
'''
import searchengine
from pysqlite2 import dbapi2 as sqlite


def dbcleanup(dbname):
    """Drop every crawler table from *dbname* so a crawl starts clean."""
    con = sqlite.connect(dbname)
    # FIX: identity comparison (`is not None`) instead of `!= None`.
    if con is not None:
        con.execute('drop table if exists urllist')
        con.execute('drop table if exists wordlist')
        con.execute('drop table if exists wordlocation')
        con.execute('drop table if exists link')
        con.execute('drop table if exists linkwords')
        con.commit()
        con.close()


if __name__ == '__main__':
    pagelist = ['http://lxml.de/parsing.html', 'http://kiwitobes.com']
    dbcleanup('crawled.db')
    crawler = searchengine.crawler('crawled.db')
    crawler.createindextables()
    crawler.crawl(pagelist)
import os
import searchengine

# pagelist = ['http://kiwitobes.com/wiki/Perl.html']
db_file = "searchindex.db"

# create table
# crawler = searchengine.crawler(db_file)
# crawler.createindextables()

pagelist = ['https://en.wikipedia.org/wiki/Python']
crawler = searchengine.crawler(db_file)

# crawl
# crawler.crawl(pagelist, depth=2)

# calculate pr
crawler.calculatepagerank(iterations=20)

# query
# se = searchengine.searcher('searchindex.db')
# se.getmatchrows('programming language')
# se.query('programming language')
import searchengine

# NOTE(review): both the constructor and crawl() are invoked with no
# arguments — confirm searchengine.crawler provides defaults for the db
# name and the seed page list.
crawler = searchengine.crawler()
crawler.crawl()
import searchengine

# website = {'http://es.wikipedia.org/wiki/Wikipedia:Portada'}
website = {'http://www.univision.com/'}  # a set literal, as in the original
crawler = searchengine.crawler('searchindex5.db')
crawler.createindextables()
crawler.crawl(website)
for test in eval_tests: node = ast.parse(test) print ast.dump(node) # MyVisitor().visit(node) print '\n' if __name__ == '__main__': ''' 2. Boolean operations. Many search engines support Boolean queries, which allow users to construct searches like "python OR perl." An OR search can work by doing the queries separately and combining the results, but what about "python AND (program OR code)"? Modify the query methods to support some basic Boolean operations. 3. Exact matches. Search engines often support "exact match" queries, where the words in the page must match the words in the query in the same order with no additional words in between. Create a new version of getrows that only returns results that are exact matches. (Hint: you can use subtraction in SQL to get the difference between the word locations.) ''' dbname = 'searchindex.db' if True: crawler = se.crawler(dbname) crawler.createindextables() pages = ['https://www.zhihu.com/', 'https://github.com/'] crawler.crawl(pages, depth=2) crawler.calculatepagerank() else: searcher = se.searcher(dbname) q = 'zhihu career' print searcher.query(q)