def nn2():
    import nn
    wWorld, wRiver, wBank = 101, 102, 103
    uWorldBank, uRiver, uEarth = 201, 202, 203
    mynet = nn.searchnet("nn.db")
    print(mynet.getresult([wWorld, wBank], [uWorldBank, uRiver, uEarth]))
def __init__(self):
    self.net = nn.searchnet('gridwar.db')
    # self.net.maketables()
    self.oUP = 0
    self.oDown = 1
    self.oLeft = 2
    self.oRight = 3
    self.outputs = [self.oUP, self.oDown, self.oLeft, self.oRight]
def onclick():
    mynet = nn.searchnet('nn.db')
    wWorld, wRiver, wBank = 101, 102, 103
    uWorldBank, uRiver, uEarth = 201, 202, 203
    mynet.generatehiddennode([wWorld, wBank], [uWorldBank, uRiver, uEarth])
    for c in mynet.con.execute('select * from wordhidden'):
        print c
    for c in mynet.con.execute('select * from hiddenurl'):
        print c
    print mynet.getresult([wWorld, wBank], [uWorldBank, uRiver, uEarth])
def train_nn(request, page_alias, selected_result):
    network = nn.searchnet('nn.db')
    words = e.getwordids(page_alias)
    if selected_result.endswith("/"):
        selected_result = selected_result[:-1]
    urlid = e.geturlid(selected_result)
    network.trainquery(words, allurls, urlid)
    context_dict = {'link': selected_result}
    return render_to_response('redirect_page.html', context_dict)
def test_select():
    sys.stderr.write("testing create hiddennodes...\n")
    mynet = nn.searchnet('nn.db')
    mynet.maketables()
    wWorld, wRiver, wBank = 101, 102, 103
    uWorldBank, uRiver, uEarth = 201, 202, 203
    mynet.generatehiddennode([wWorld, wBank], [uWorldBank, uRiver, uEarth])
    sys.stderr.write("testing 'select * from wordhidden'...\n")
    for c in mynet.con.execute('select * from wordhidden'):
        print c
    sys.stderr.write("testing 'select * from hiddenurl'...\n")
    for c in mynet.con.execute('select * from hiddenurl'):
        print c
def test_trainqueries():
    sys.stderr.write("testing training queries...\n")
    mynet = nn.searchnet('nn.db')
    allurls = [uWorldBank, uRiver, uEarth]
    for i in range(30):
        mynet.trainquery([wWorld, wBank], allurls, uWorldBank)
        mynet.trainquery([wRiver, wBank], allurls, uRiver)
        mynet.trainquery([wWorld], allurls, uEarth)
    print mynet.getresult([wWorld, wBank], allurls)
    print mynet.getresult([wRiver, wBank], allurls)
    print mynet.getresult([wBank], allurls)
def result():
    if request.method == 'POST':
        a = nn.searchnet('Truth.db')
        res = a.getresult(request.form['statement'])
        if res[0] > res[1]:
            return render_template('detection.html', truthOrLie='Lie')
        else:
            print(res)
            return render_template('detection.html', truthOrLie='Truth')
    else:
        return render_template("detect.html")
def nn():
    import nn
    mynet = nn.searchnet("nn.db")
    mynet.maketables()
    wWorld, wRiver, wBank = 101, 102, 103
    uWorldBank, uRiver, uEarth = 201, 202, 203
    mynet.generatehiddennode([wWorld, wBank], [uWorldBank, uRiver, uEarth])
    for c in mynet.con.execute("select * from wordhidden"):
        print(c)
    for c in mynet.con.execute("select * from hiddenurl"):
        print(c)
def trainingTest():
    mynet = nn.searchnet('nn.db')
    wWorld, wRiver, wBank = 101, 102, 103
    uWorldBank, uRiver, uEarth = 201, 202, 203
    allurls = [uWorldBank, uRiver, uEarth]
    for i in range(30):
        mynet.trainquery([wWorld, wBank], allurls, uWorldBank)
        mynet.trainquery([wRiver, wBank], allurls, uRiver)
        mynet.trainquery([wWorld], allurls, uEarth)
    print mynet.getresult([wWorld, wBank], allurls)
    print mynet.getresult([wRiver, wBank], allurls)
    print mynet.getresult([wBank], allurls)
def test_nn():
    online, pharmacy = 1, 2
    spam, notspam = 1, 2
    possible = [spam, notspam]
    neuralnet = nn.searchnet('nntest.db')
    neuralnet.maketables()
    neuralnet.trainquery([online], possible, notspam)
    neuralnet.trainquery([online, pharmacy], possible, spam)
    neuralnet.trainquery([pharmacy], possible, notspam)
    neuralnet.getresult([online, pharmacy], possible)
    neuralnet.getresult([online], possible)
    neuralnet.trainquery([online], possible, notspam)
    neuralnet.getresult([online], possible)
    neuralnet.trainquery([online], possible, notspam)
    neuralnet.getresult([online], possible)
    quit()
def getscoredlist(self, rows, wordids):
    # document id -> score
    totalscores = dict([(row[0], 0) for row in rows])
    mynet = nn.searchnet('nn.db')
    # Several scoring methods, combined as a weighted sum
    weights = [(1.0, self.frequencyscore(rows)),
               (1.0, self.locationscore(rows)),
               (1.0, self.distancescore(rows)),
               (1.0, self.inboundlinkscore(rows)),
               (1.0, self.pagerankscore(rows)),
               (1.0, self.linktextscore(rows, wordids)),
               (2, self.nnscore(rows, wordids, mynet))]
    for (weight, scores) in weights:
        for url in totalscores:
            totalscores[url] += weight * scores[url]
    return totalscores
def train_nn(train_path, test_path):
    ham = 0
    spam = 1
    allans = [ham, spam]
    words = {}
    spamnet = nn.searchnet('spam.db')
    spamnet.maketables()
    # First pass: build the word -> id mapping
    for filename in glob.glob(train_path):
        with open(filename, 'r') as f:
            f = f.read()
            for word in nn.getwords(f):
                if words.has_key(word) == False:
                    wordslen = len(words) + 2
                    words[word] = wordslen
    # Second pass: train the network on each message
    cnt = 1
    for filename in glob.glob(train_path):
        print cnt
        cnt = cnt + 1
        with open(filename, 'r') as f:
            f = f.read()
            features = nn.getwords(f)
            wordNum = [words[word] for word in features]
            spamnet.generatehiddennode(wordNum, allans)
            label = filename.split('.')[3]
            if label == 'ham':
                label = 0
            else:
                label = 1
            spamnet.trainquery(wordNum, allans, label)
    print "Train Done!"
def test_trainquery():
    sys.stderr.write("testing training query...\n")
    mynet = nn.searchnet('nn.db')
    mynet.trainquery([wWorld, wBank], [uWorldBank, uRiver, uEarth], uWorldBank)
    print mynet.getresult([wWorld, wBank], [uWorldBank, uRiver, uEarth])
def test_feedforward():
    sys.stderr.write("testing feedforward (without training)...\n")
    mynet = nn.searchnet('nn.db')
    print mynet.getresult([wWorld, wBank], [uWorldBank, uRiver, uEarth])
from urllib.request import urlopen
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import sqlite3 as sql
from sqlite3 import Error
import re
import nn

# Connect to the neural network database
mynet = nn.searchnet('nn.db')

# Create a list of words to ignore
ignorewords = set(['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it'])


# DB connection function
def connect(db_file):
    try:
        conn = sql.connect(db_file)
        return conn
    except Error as e:
        print(e)
        return None


class crawler:
    # Initialize the crawler with the name of database
    def __init__(self, dbname):
        self.con = sql.connect(dbname)
def backpropagation():
    mynet = nn.searchnet('nn.db')
    wWorld, wRiver, wBank = 101, 102, 103
    uWorldBank, uRiver, uEarth = 201, 202, 203
    mynet.trainquery([wWorld, wBank], [uWorldBank, uRiver, uEarth], uWorldBank)
    print mynet.getresult([wWorld, wBank], [uWorldBank, uRiver, uEarth])
# coding: UTF-8
# Author: David
# Email: [email protected]
# Created: 2016-07-30 15:25
# Last modified: 2016-08-02 16:06
# Filename: searchengine.py
# Description:
import urllib2
from BeautifulSoup import *
from urlparse import urljoin
import MySQLdb
import re
import nn

ignore_words = set(['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it'])
mynet = nn.searchnet()


class crawler:
    def __init__(self, db='PCI', host='localhost', user='******',
                 passwd='root', port=3306):
        self.con = MySQLdb.connect(host=host, user=user, passwd=passwd,
                                   port=port, db=db)
        self.cur = self.con.cursor()
get_ipython().magic(u'logstart example.py append')
import nn
online, pharmacy = 1, 2
spam, notspam = 1, 2
possible = [spam, notspam]
neuralnet = nn.searchnet('nntest.db')
neuralnet.maketables()
neuralnet.trainquery([online], possible, notspam)
neuralnet.trainquery([online, pharmacy], possible, spam)
neuralnet.trainquery([pharmacy], possible, notspam)
neuralnet.getresult([online, pharmacy], possible)
neuralnet.getresult([online], possible)
neuralnet.trainquery([online], possible, notspam)
neuralnet.getresult([online], possible)
neuralnet.trainquery([online], possible, notspam)
neuralnet.getresult([online], possible)
quit()
import searchengine as s
import nn
from flask import Flask
from flask import render_template, request
import os

mynet = nn.searchnet("nn.db")
tempate = os.path.join(os.getcwd(), 'templates')
app = Flask(__name__, template_folder=tempate)


def makeindex(key):
    e = s.searcher('searchindex.db')
    result = e.query(key)
    List = []
    size = len(result)
    for i in range(size):
        for j in result[i]:
            List.append(e.geturlname(j))
    return List


@app.route('/', methods=['GET', 'POST'])
def index():
    if request.method == 'POST':
        keyword = request.form['keyword']
        res_list = makeindex(keyword)
        if keyword:
            return render_template('search.html', query=res_list,
class searcher:
    def __init__(self, dbname):
        self.conn = sqlite.connect(dbname)

    def __del__(self):
        self.conn.close()

    def getmatchrows(self, q):
        # Strings to build the query
        fieldlist = 'w0.urlid'
        tablelist = ''
        clauselist = ''
        wordids = []

        # Split the words by spaces
        words = q.split(' ')
        tablenumber = 0

        for word in words:
            # Get the word ID
            wordrow = self.conn.execute(
                "select rowid from wordlist where word='%s'" % word).fetchone()
            if wordrow != None:
                wordid = wordrow[0]
                wordids.append(wordid)
                if tablenumber > 0:
                    tablelist += ','
                    clauselist += ' and '
                    clauselist += 'w%d.urlid=w%d.urlid and ' % (tablenumber - 1, tablenumber)
                fieldlist += ',w%d.location' % tablenumber
                tablelist += 'wordlocation w%d' % tablenumber
                clauselist += 'w%d.wordid=%d' % (tablenumber, wordid)
                tablenumber += 1

        if (len(clauselist.strip()) == 0):
            return None

        # Create the query from the separate parts
        fullquery = 'select %s from %s where %s' % (fieldlist, tablelist, clauselist)
        print(fullquery)
        cur = self.conn.execute(fullquery)
        rows = [row for row in cur]
        return rows, wordids

    def getscoredlist(self, rows, wordids):
        # row format: [urlid, loc1, loc2...]
        totalscores = dict([(row[0], 0) for row in rows])

        # This is where you'll later put the scoring functions
        weights = []
        weights = [(1.0, self.frequencyscore(rows)),
                   (1.5, self.locationscore(rows))]

        for (weight, scores) in weights:
            for url in totalscores:
                totalscores[url] += weight * scores[url]

        return totalscores

    def geturlname(self, id):
        return self.conn.execute("select url from urllist where rowid=%d" % id).fetchone()[0]

    def query(self, q):
        rows, wordids = self.getmatchrows(q)
        scores = self.getscoredlist(rows, wordids)
        # sorted([(s,url),(s,url),...],reverse=1)
        rankedscores = sorted([(score, url) for (url, score) in scores.items()], reverse=1)
        for (score, urlid) in rankedscores[0:10]:
            print '%f\t%s' % (score, self.geturlname(urlid))
        # add after nn
        return wordids, [r[1] for r in rankedscores[0:10]]

    def normalizescores(self, scores, smallIsBetter=0):
        vsmall = 0.00001  # Avoid division by zero errors
        if smallIsBetter:
            minscore = min(scores.values())
            return dict([(u, float(minscore) / max(vsmall, l))
                         for (u, l) in scores.items()])
        else:
            maxscore = max(scores.values())
            if maxscore == 0:
                maxscore = vsmall
            return dict([(u, float(c) / maxscore) for (u, c) in scores.items()])

    def frequencyscore(self, rows):
        # init, (urlid, freq)
        counts = dict([(row[0], 0) for row in rows])
        for row in rows:
            counts[row[0]] += 1
        return self.normalizescores(counts)

    def locationscore(self, rows):
        locations = dict([(row[0], 1000000) for row in rows])
        for row in rows:
            loc = sum(row[1:])
            if loc < locations[row[0]]:
                locations[row[0]] = loc
        return self.normalizescores(locations, smallIsBetter=1)

    def distancescore(self, rows):
        # If there's only one word, everyone wins!
        if len(rows[0]) <= 2:
            return dict([(row[0], 1.0) for row in rows])

        # Initialize the dictionary with large values
        mindistance = dict([(row[0], 1000000) for row in rows])

        for row in rows:
            dist = sum([abs(row[i] - row[i - 1]) for i in range(2, len(row))])
            if dist < mindistance[row[0]]:
                mindistance[row[0]] = dist
        return self.normalizescores(mindistance, smallIsBetter=1)

    def inboundlinkscore(self, rows):
        uniqueurls = set([row[0] for row in rows])
        inboundcount = dict([(u, self.conn.execute(
            'select count(*) from link where toid=%d' % u).fetchone()[0])
            for u in uniqueurls])
        return self.normalizescores(inboundcount)

    def calculatepagerank(self, iterations=20):
        # clear out the current PageRank tables
        self.conn.execute('drop table if exists pagerank')
        self.conn.execute('create table pagerank(urlid primary key,score)')

        # initialize every url with a PageRank of 1
        self.conn.execute('insert into pagerank select rowid, 1.0 from urllist')
        self.dbcommit()

        for i in range(iterations):
            print "Iteration %d" % (i)
            for (urlid, ) in self.conn.execute('select rowid from urllist'):
                pr = 0.15

                # Loop through all the pages that link to this one
                for (linker, ) in self.conn.execute(
                        'select distinct fromid from link where toid=%d' % urlid):
                    # Get the PageRank of the linker
                    linkingpr = self.conn.execute(
                        'select score from pagerank where urlid=%d' % linker).fetchone()[0]

                    # Get the total number of links from the linker
                    linkingcount = self.conn.execute(
                        'select count(*) from link where fromid=%d' % linker).fetchone()[0]
                    pr += 0.85 * (linkingpr / linkingcount)
                self.conn.execute(
                    'update pagerank set score=%f where urlid=%d' % (pr, urlid))
            self.dbcommit()

    def pagerankscore(self, rows):
        pageranks = dict([(row[0], self.conn.execute(
            'select score from pagerank where urlid=%d' % row[0]).fetchone()[0])
            for row in rows])
        maxrank = max(pageranks.values())
        normalizedscores = dict([(u, float(l) / maxrank) for (u, l) in pageranks.items()])
        return normalizedscores

    def linktextscore(self, rows, wordids):
        linkscores = dict([(row[0], 0) for row in rows])
        for wordid in wordids:
            cur = self.conn.execute(
                'select link.fromid,link.toid from linkwords,link where wordid=%d and linkwords.linkid=link.rowid' % wordid)
            for (fromid, toid) in cur:
                if toid in linkscores:
                    pr = self.conn.execute(
                        'select score from pagerank where urlid=%d' % fromid).fetchone()[0]
                    linkscores[toid] += pr
        maxscore = max(linkscores.values())
        normalizedscores = dict([(u, float(l) / maxscore) for (u, l) in linkscores.items()])
        return normalizedscores

    def nnscore(self, rows, wordids):
        # Get unique URL IDs as an ordered list
        urlids = [urlid for urlid in set([row[0] for row in rows])]
        nnres = mynet.getresult(wordids, urlids)
        scores = dict([(urlids[i], nnres[i]) for i in range(len(urlids))])
        return self.normalizescores(scores)


# Module-level neural network used by nnscore
import nn
mynet = nn.searchnet('nn.db')
# -*- coding: utf-8 -*-
import traceback
import urllib2
from BeautifulSoup import *
from urlparse import urljoin
import sqlite3 as sqlite
import nn

mynet = nn.searchnet('output/nn.db')

# Create a list of words to ignore
ignorewords = {'the': 1, 'of': 1, 'to': 1, 'and': 1, 'a': 1, 'in': 1, 'is': 1, 'it': 1}


class crawler:
    # Initialize the crawler with the name of database
    def __init__(self, dbname):
        self.con = sqlite.connect(dbname)

    def __del__(self):
        self.con.close()
def __init__(self, dbname):
    self.con = sqlite.connect(dbname)
    self.mynet = nn.searchnet('nn.db')
from urllib.request import urlopen
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import pymysql
import re
import jieba
import time
import nn

mynet = nn.searchnet('test')


class crawler:
    # Initialize the crawler with the name of database
    def __init__(self, dbname):
        self.con = pymysql.connect(host="*****", port=3306, user="******",
                                   password="******", database=dbname,
                                   charset="utf8", use_unicode=True)
        self.cur = self.con.cursor()

    def __del__(self):
        # self.cur.close()
        self.con.close()

    def dbcommit(self):
        self.con.commit()

    # return the ID of an entry
get_ipython().magic(u"logstart example.py append") import nn online, pharmacy = 1, 2 spam, notspam = 1, 2 possible = [spam, notspam] neuralnet = nn.searchnet("nntest.db") neuralnet.maketables() neuralnet.trainquery([online], possible, notspam) neuralnet.trainquery([online, pharmacy], possible, spam) neuralnet.trainquery([pharmacy], possible, notspam) neuralnet.getresult([online, pharmacy], possible) neuralnet.getresult([online], possible) neuralnet.trainquery([online], possible, notspam) neuralnet.getresult([online], possible) neuralnet.trainquery([online], possible, notspam) neuralnet.getresult([online], possible) quit()
import os
import re
import urllib2
import urlparse
from pysqlite2 import dbapi2 as sqlite
from BeautifulSoup import BeautifulSoup
import nn

net = nn.searchnet('nn.db')  # XXX: somehow train this from user clicks

ignorewords = set(['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it'])

# XXX: the root page (amnoid.de) is indexed twice for some reason (e.g.
#   select * from links where toid = 2;
# shows the link 1->2 two times.


class crawler:
    def __init__(self, dbname):
        self.con = sqlite.connect(dbname)

    def __del__(self):
        self.con.close()

    def dbcommit(self):
        self.con.commit()

    def getentryid(self, table, field, value, createnew=True):
        """Returns an entry id and creates it if it is not present."""
        cur = self.con.execute('select rowid from %s where %s="%s"'
                               % (table, field, value))
#!/Users/kawasakitaku/Documents/python-PVM/ln-python3.4/bin/python3.4
import searchengine as s
import nn
from flask import Flask
from flask import render_template, request
import os

mynet = nn.searchnet("nn.db")
tempate = os.path.join(os.getcwd(), 'templates')
app = Flask(__name__, template_folder=tempate)


def makeindex(key):
    e = s.searcher('searchindex.db')
    result = e.query(key)
    List = []
    size = len(result)
    for i in range(size):
        for j in result[i]:
            List.append(e.geturlname(j))
    return List


@app.route('/', methods=['GET', 'POST'])
def index():
    if request.method == 'POST':
        keyword = request.form['keyword']
        res_list = makeindex(keyword)
        if keyword:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import bottle
import searchengine
import nn
import beaker.middleware
import os

mynet = nn.searchnet()

session_opts = {
    'session.type': 'file',
    'session.data_dir': './session/',
    'session.auto': True,
}
app = beaker.middleware.SessionMiddleware(bottle.app(), session_opts)


@bottle.route('/')
def querypage():
    return bottle.template('query')


@bottle.route('/resume')
def resumepage():
    return bottle.template('resume')


@bottle.route('/query', method='POST')
def queryhandler():
    e = searchengine.searcher()
    q = bottle.request.forms.get("query")
    mywords, myurls = e.query(q)
class searcher:
    def __init__(self, dbname):
        self.con = sqlite.connect(dbname)

    def __del__(self):
        self.con.close()

    def getmatchrows(self, q):
        # Strings to build the query
        fieldlist = 'w0.urlid'
        tablelist = ''
        clauselist = ''
        wordids = []

        # Split the words on spaces
        words = q.split(' ')
        # print words  # debug
        tablenumber = 0

        for word in words:
            # Get the word ID
            wordrow = self.con.execute(
                "select rowid from wordlist where word='%s'" % word).fetchone()
            # print 'wordrow = %s' % wordrow  # debug
            if wordrow != None:
                wordid = wordrow[0]
                wordids.append(wordid)
                if tablenumber > 0:
                    tablelist += ','
                    # clauselist += ' and '
                    clauselist += ' and w%d.urlid=w%d.urlid and ' % (
                        tablenumber - 1, tablenumber)
                fieldlist += ',w%d.location' % tablenumber
                tablelist += 'wordlocation w%d' % tablenumber
                clauselist += 'w%d.wordid=%d' % (tablenumber, wordid)
                tablenumber += 1
            else:
                continue

        if '' == clauselist:
            return None, None

        # Build the query from the separate parts
        # Runtime error seen here: sqlite3.OperationalError: near "where": syntax error
        fullquery = 'select %s from %s where %s' % (fieldlist, tablelist, clauselist)
        cur = self.con.execute(fullquery)
        rows = [row for row in cur]
        # print rows  # debug
        return rows, wordids

    def getscoredlist(self, rows, wordids):
        totalscores = dict((row[0], 0) for row in rows)

        # This is where the scoring functions are plugged in later
        weights = [(1.0, self.frequencyscore(rows)),
                   (1.0, self.locationscore(rows)),
                   (1.0, self.pagerankscore(rows)),
                   (1.0, self.linktextscore(rows, wordids))]

        for (weight, scores) in weights:
            for url in totalscores:
                totalscores[url] += weight * scores[url]

        return totalscores

    def geturlname(self, id):
        return self.con.execute("select url from urllist where rowid=%d" % id).fetchone()[0]

    def query(self, q):
        rows, wordids = self.getmatchrows(q)
        scores = self.getscoredlist(rows, wordids)
        rankedscores = sorted([(score, url) for (url, score) in scores.items()], reverse=1)
        for (score, urlid) in rankedscores[0:10]:
            print '%f\t%s' % (score, self.geturlname(urlid))
        return wordids, [r[1] for r in rankedscores[0:10]]

    # Normalization: 1 means a close match, 0 means a distant one
    def normalizescores(self, scores, smallIsBetter=0):
        vsmall = 0.00001  # avoid division by zero
        if smallIsBetter:
            minscore = min(scores.values())
            return dict([(u, float(minscore) / max(vsmall, l))
                         for (u, l) in scores.items()])
        else:
            maxscore = max(scores.values())
            return dict([(u, float(c) / maxscore) for (u, c) in scores.items()])

    def frequencyscore(self, rows):
        counts = dict([(row[0], 0) for row in rows])
        for row in rows:
            counts[row[0]] += 1
        return self.normalizescores(counts)

    def locationscore(self, rows):
        locations = dict([(row[0], 1000000) for row in rows])
        for row in rows:
            loc = sum(row[1:])
            if loc < locations[row[0]]:
                locations[row[0]] = loc
        return self.normalizescores(locations, smallIsBetter=1)

    def distancescore(self, rows):
        # If there is only one word, every result gets the same score
        if len(rows[0]) <= 2:
            return dict([(row[0], 1.0) for row in rows])

        # Initialize the dictionary with a large value
        mindistance = dict([(row[0], 100000) for row in rows])

        for row in rows:
            dist = sum([abs(row[i] - row[i - 1]) for i in range(2, len(row))])
            if dist < mindistance[row[0]]:
                mindistance[row[0]] = dist
        return self.normalizescores(mindistance, smallIsBetter=1)

    def inboundlinkscore(self, rows):
        uniqueurls = set([row[0] for row in rows])
        inboundcount = dict([(u, self.con.execute(
            'select count(*) from link where toid=%d' % u).fetchone()[0])
            for u in uniqueurls])
        return self.normalizescores(inboundcount)

    def pagerankscore(self, rows):
        pageranks = dict([(row[0], self.con.execute(
            'select score from pagerank where urlid=%d' % row[0]).fetchone()[0])
            for row in rows])
        maxrank = max(pageranks.values())
        normalizescores = dict([(u, float(l) / maxrank) for (u, l) in pageranks.items()])
        return normalizescores

    def linktextscore(self, rows, wordids):
        linkscores = dict([(row[0], 0.00001) for row in rows])
        for wordid in wordids:
            cur = self.con.execute(
                'select link.fromid,link.toid from linkwords,link where wordid=%d and linkwords.linkid=link.rowid' % wordid)
            for (fromid, toid) in cur:
                if toid in linkscores:
                    pr = self.con.execute(
                        'select score from pagerank where urlid=%d' % fromid).fetchone()[0]
                    linkscores[toid] += pr
        maxscore = max(linkscores.values())
        normalizescores = dict([(u, float(l) / maxscore) for (u, l) in linkscores.items()])
        return normalizescores

    def nnscore(self, rows, wordids):
        # Get an ordered list of unique URL IDs
        urlids = [urlid for urlid in set([row[0] for row in rows])]
        nnres = mynet.getresult(wordids, urlids)
        scores = dict([(urlids[i], nnres[i]) for i in range(len(urlids))])
        return self.normalizescores(scores)


# Module-level neural network used by nnscore
mynet = nn.searchnet('nn.db')
import os
import nn

db_file = 'nn.db'
if os.path.exists(db_file):
    os.unlink(db_file)
    print 'delete db file:', db_file

mynet = nn.searchnet(db_file)
mynet.maketables()

w_world, w_river, w_bank = 101, 102, 103
u_worldbank, u_river, u_earth = 201, 202, 203

#mynet.generate_hidden_node([w_world, w_bank], [u_worldbank, u_river, u_earth])
#result = mynet.getresult([w_world, w_bank], [u_worldbank, u_river, u_earth])
#print "result:", result

#print 'show hiddennode:'
#for c in mynet.con.execute('select * from hiddennode'): print c
#print 'show wordhidden:'
#for c in mynet.con.execute('select * from wordhidden'): print c
#print 'show hiddenurl:'
#for c in mynet.con.execute('select * from hiddenurl'): print c

#mynet.train_query([w_world, w_bank], [u_worldbank, u_river, u_earth], u_worldbank)
#result = mynet.getresult([w_world, w_bank], [u_worldbank, u_river, u_earth])
#print "result:", result
def neural_network(self, rows, wordids):
    urlids = [urlid for urlid in set([row[0] for row in rows])]
    network_nel = nn.searchnet('nn.db')
    node_output = network_nel.getresult(wordids, urlids)
    net_score = dict([(urlids[i], node_output[i]) for i in range(len(urlids))])
    return self.normalizescores(net_score)
# crawler.createindextables()
crawler = searchengine.crawler("searchindex.db")
# crawler.createindextables()
# pages=['http://kiwitobes.com/wiki/Categorical_list_of_programming_languages.html']
# crawler.crawl(pages)
# crawler.calculatepagerank()

# search
# e=searchengine.searcher('searchindex.db')
# e.query('function programming')

import nn

mynn = nn.searchnet("nndb.db")
# mynn.maketables()
kaka1 = mynn.getstrength(0, 5, 0)
kaka1 = mynn.getstrength(0, 5, 1)
mynn.setstrength(0, 5, 0, 3)
mynn.setstrength(0, 5, 1, 2)
kaka1 = mynn.getstrength(0, 5, 0)
kaka1 = mynn.getstrength(0, 5, 1)

print("Hello World")
import nn

a = nn.searchnet('Truth.db')
a.train('I did not have sexual relations with that woman, Miss Kravinsky.', 0)
print("Training...")
a.train('I do not believe in god as he is who never was.', 0)
print("Training...")
a.train("I didn't see him go, I promise.", 1)
print("Training...")
a.train('He went in and kissed her ass off.', 1)
print("Training...")
a.train('I am very sorry for your loss, Johnny was like a brother to me.', 1)
print("Training...")
a.train(
    'Did you say something, mister Greene. Because, well I am sorry for saying this but you can go shit your idea.',
    1)
print("Training...")
a.train(
    'In all candor, I did not do that murder. This is my plea, not guilty.',
    0)
print("Training...")
a.train(
    'Honestly, I am very nice and so good and kind. You all should learn a thing or two from me.',
    0)
import urllib2
from BeautifulSoup import *
from urlparse import urljoin
from pysqlite2 import dbapi2 as sqlite
import nn

mynet = nn.searchnet('nn.db')

# Create a list of words to ignore
ignorewords = {'the': 1, 'of': 1, 'to': 1, 'and': 1, 'a': 1, 'in': 1, 'is': 1, 'it': 1}


class crawler:
    # Initialize the crawler with the name of database
    def __init__(self, dbname):
        self.con = sqlite.connect(dbname)

    def __del__(self):
        self.con.close()

    def dbcommit(self):
        self.con.commit()

    # Auxiliary function for getting an entry id and adding
    # it if it's not present
    def getentryid(self, table, field, value, createnew=True):
        cur = self.con.execute(
            "select rowid from %s where %s='%s'" % (table, field, value))
        res = cur.fetchone()
        if res == None:
            cur = self.con.execute(
                "insert into %s (%s) values ('%s')" % (table, field, value))
        pageranks = dict([(row[0], self.curs.execute(
            'select score from pagerank where urlid=%d' % row[0]).fetchall()[0][0])
            for row in rows])
        maxrank = max(pageranks.values())  # find the largest PageRank value
        for urlid in pageranks:
            pageranks[urlid] /= maxrank  # normalize
        return pageranks  # return the normalized PageRank of each URL

    # Scoring function based on the neural network (which learns from user clicks).
    # The network itself is implemented in nn.py.
    # rows is [urlid, wordlocation1, wordlocation2, wordlocation3, ...]
    def nnscore(self, rows, wordids):
        # Get an ordered list of unique URL IDs
        urlids = [urlid for urlid in dict([(row[0], 1) for row in rows])]
        nnres = mynet.getresult(wordids, urlids)
        scores = dict([(urlids[i], nnres[i]) for i in range(len(urlids))])
        return self.normalizescores(scores)


mynet = nn.searchnet('csdn.db')

if __name__ == '__main__':
    mysearcher = searcher('csdn.db')
    searchkey = input("Search keywords> ")
    wordids, urlids = mysearcher.query(searchkey)
    # print(wordids, urlids)
    selurlid = input("Selected link id> ")
    selurlid = int(selurlid)
    mynet.trainquery(wordids, urlids, selurlid)  # train on the link the user selected
#coding:utf-8
import urllib2
from BeautifulSoup import *
from urlparse import urljoin
from sqlite3 import dbapi2 as sqlite
import nn

mynet = nn.searchnet('nn.db')

# Create a list of words to ignore
ignorewords = {'the': 1, 'of': 1, 'to': 1, 'and': 1, 'a': 1, 'in': 1, 'is': 1, 'it': 1}


class crawler:
    # Initialize the crawler with the name of database
    def __init__(self, dbname):
        self.con = sqlite.connect(dbname)

    def __del__(self):
        self.con.close()

    def dbcommit(self):
        self.con.commit()

    # Auxiliary function for getting an entry id and adding
    # it if it's not present
    def getentryid(self, table, field, value, createnew=True):
        cur = self.con.execute(
            "select rowid from %s where %s='%s'" % (table, field, value))
        res = cur.fetchone()
        if res == None:
            cur = self.con.execute(
                "insert into %s (%s) values ('%s')" % (table, field, value))
#coding=utf-8
import nn

mynet = nn.searchnet('nn.db')  # create the database
#mynet.maketables()  # create the tables
wWorld, wRiver, wBank = 101, 102, 103
uWorldBank, uRiver, uEarth = 201, 202, 203
mynet.generatehiddennode([wWorld, wBank], [uWorldBank, uRiver, uEarth])
for c in mynet.con.execute('select * from wordhidden'):
    print c
for c in mynet.con.execute('select * from hiddenurl'):
    print c