def __init__(self,dbname):
    """Open the SQLite database *dbname* and a neural-network ranker on it.

    NOTE(review): this is an instance method collapsed onto one line in the
    source; its enclosing class (presumably the searcher) is not visible in
    this view — confirm against the full file.
    """
    self.con = sqlite.connect(dbname)
    self.net = nn.searchnet(dbname)
import urllib
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import urllib.request
import sqlite3 as sqlite
#from pysqlite2 import dbapi2 as sqlite
#from pysqlite3 import dbapi3
import re
import neuralnetwork as nn
import ssl  # sites served over https raise an error if their SSL certificate fails verification

# Module-level neural-network ranker backed by its own SQLite file.
mynet = nn.searchnet('nn.db')

# Create a list of words to ignore
ignorewords = {'the':1,'of':1,'to':1,'and':1,'a':1,'in':1,'is':1,'it':1}


class crawler:
    # Initialize the crawler with the name of database
    def __init__(self, dbname):
        """Open a connection to the SQLite index database *dbname*."""
        self.con=sqlite.connect(dbname)

    def __del__(self):
        """Close the database connection when the crawler is destroyed."""
        self.con.close()

    def dbcommit(self):
        """Commit any pending writes to the index database."""
        self.con.commit()

    # Auxilliary function for getting an entry id and adding
    # it if it's not present
    def getentryid(self, table, field, value, createnew=True):
        """Return the rowid of *value* in *table*.*field*.

        NOTE(review): the SQL is assembled by %-interpolation, so *table*,
        *field* and *value* must never carry untrusted input — prefer
        parameterized queries ("?" placeholders) for *value*.
        NOTE(review): presumably the missing branch inserts the value when
        createnew is True — the method is truncated in this view, so the
        insert path cannot be confirmed here.
        """
        cur = self.con.execute(
            "select rowid from %s where %s='%s'" % (table,field,value))
        res = cur.fetchone()
        # NOTE(review): `res is None` is the idiomatic identity test here.
        if res == None:
import neuralnetwork

# Demo: seed the neural network with one hidden node and dump its tables.
mynet = neuralnetwork.searchnet('nn.db')
# mynet.maketables()

# Hard-coded word ids and url ids used to exercise the network.
wWorld, wRiver, wBank = 101, 102, 103
uWorldBank, uRiver, uEwarth = 201, 202, 203

# Create a hidden node connecting the query words to the candidate urls.
mynet.generatehiddennode([wWorld, wBank], [uWorldBank, uRiver, uEwarth])

# Fix: the original used Python 2 `print c` statements, a syntax error in
# Python 3 (which this file targets — it imports urllib.parse elsewhere).
for c in mynet.con.execute("SELECT * FROM wordhidden"):
    print(c)
print("------------------")
for c in mynet.con.execute("SELECT * FROM hiddenurl"):
    print(c)
crawler = searchengine.crawler(fn) #crawler.createindextables() # 若 db 已建好,可注释此句 # crawl some pages: #pagelist=['https://en.wikipedia.org/wiki/R_(programming_language)'] #crawler.crawl(pagelist) pagelist = ['http://www.diveintopython.net'] #crawler.crawl(pagelist) # 若 db 已建好,可注释下句 #[row for row in crawler.con.execute('select rowid from wordlocation where wordid=1')] import neuralnetwork as nn mynet = nn.searchnet('nn.db') # mynet.maketables() wordstosearch = 'python programming' e = searchengine.searcher('searchindex.db') e.getmatchrows(wordstosearch) # create the needed tables for the page rank algorithm: crawler.calculatepagerank() # before adding in the page rank algorithm into the weights that form the scoring function e.query(wordstosearch) # cur = crawler.con.execute('select * from pagerank order by score desc') #for i in range(3): print(cur.next()) #e.geturlname(17) # word "python" rowid==17
for (u, score) in linkscores.iteritems()]) return normalizedScores def neuralNetwordScore(self, rows, wordids): urlids = [urlid for urlid in set([row[0] for row in rows])] nnres = mynet.getResult(wordids, urlids) scores = dict([(urlids[i], nnres[i]) for i in range(len(urlids))]) return self.normalizeScores(scores) if __name__ == '__main__': pagelist = [] seedpage = raw_input('Enter the website page address: ') if seedpage.find('http://') == -1: seedpage = 'http://' + seedpage pagelist.append(seedpage) dbname = raw_input('Enter the database name: ') if dbname.find('.db') == -1: dbname += '.db' crawler = crawler(dbname) if not os.path.isfile(dbname): crawler.createIndexTables() crawler.crawl(pagelist) crawler.calculatePageRank() searcher = searcher(dbname) mynet = neuralnetwork.searchnet('nn.db') while True: qStr = raw_input('Enter the query string (press Enter to quit): ') if qStr == '' or qStr == '\n': break searcher.query(qStr)